1 /* 2 * linux/fs/ext4/inode.c 3 * 4 * Copyright (C) 1992, 1993, 1994, 1995 5 * Remy Card (card@masi.ibp.fr) 6 * Laboratoire MASI - Institut Blaise Pascal 7 * Universite Pierre et Marie Curie (Paris VI) 8 * 9 * from 10 * 11 * linux/fs/minix/inode.c 12 * 13 * Copyright (C) 1991, 1992 Linus Torvalds 14 * 15 * Goal-directed block allocation by Stephen Tweedie 16 * (sct@redhat.com), 1993, 1998 17 * Big-endian to little-endian byte-swapping/bitmaps by 18 * David S. Miller (davem@caip.rutgers.edu), 1995 19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 20 * (jj@sunsite.ms.mff.cuni.cz) 21 * 22 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 23 */ 24 25 #include <linux/module.h> 26 #include <linux/fs.h> 27 #include <linux/time.h> 28 #include <linux/ext4_jbd2.h> 29 #include <linux/jbd2.h> 30 #include <linux/highuid.h> 31 #include <linux/pagemap.h> 32 #include <linux/quotaops.h> 33 #include <linux/string.h> 34 #include <linux/buffer_head.h> 35 #include <linux/writeback.h> 36 #include <linux/mpage.h> 37 #include <linux/uio.h> 38 #include <linux/bio.h> 39 #include "xattr.h" 40 #include "acl.h" 41 42 /* 43 * Test whether an inode is a fast symlink. 44 */ 45 static int ext4_inode_is_fast_symlink(struct inode *inode) 46 { 47 int ea_blocks = EXT4_I(inode)->i_file_acl ? 48 (inode->i_sb->s_blocksize >> 9) : 0; 49 50 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 51 } 52 53 /* 54 * The ext4 forget function must perform a revoke if we are freeing data 55 * which has been journaled. Metadata (eg. indirect blocks) must be 56 * revoked in all cases. 57 * 58 * "bh" may be NULL: a metadata block may have been freed from memory 59 * but there may still be a record of it in the journal, and that record 60 * still needs to be revoked. 
61 */ 62 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 63 struct buffer_head *bh, ext4_fsblk_t blocknr) 64 { 65 int err; 66 67 might_sleep(); 68 69 BUFFER_TRACE(bh, "enter"); 70 71 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 72 "data mode %lx\n", 73 bh, is_metadata, inode->i_mode, 74 test_opt(inode->i_sb, DATA_FLAGS)); 75 76 /* Never use the revoke function if we are doing full data 77 * journaling: there is no need to, and a V1 superblock won't 78 * support it. Otherwise, only skip the revoke on un-journaled 79 * data blocks. */ 80 81 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || 82 (!is_metadata && !ext4_should_journal_data(inode))) { 83 if (bh) { 84 BUFFER_TRACE(bh, "call jbd2_journal_forget"); 85 return ext4_journal_forget(handle, bh); 86 } 87 return 0; 88 } 89 90 /* 91 * data!=journal && (is_metadata || should_journal_data(inode)) 92 */ 93 BUFFER_TRACE(bh, "call ext4_journal_revoke"); 94 err = ext4_journal_revoke(handle, blocknr, bh); 95 if (err) 96 ext4_abort(inode->i_sb, __FUNCTION__, 97 "error %d when attempting revoke", err); 98 BUFFER_TRACE(bh, "exit"); 99 return err; 100 } 101 102 /* 103 * Work out how many blocks we need to proceed with the next chunk of a 104 * truncate transaction. 105 */ 106 static unsigned long blocks_for_truncate(struct inode *inode) 107 { 108 ext4_lblk_t needed; 109 110 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); 111 112 /* Give ourselves just enough room to cope with inodes in which 113 * i_blocks is corrupt: we've seen disk corruptions in the past 114 * which resulted in random data in an inode which looked enough 115 * like a regular file for ext4 to try to delete it. Things 116 * will go a bit crazy if that happens, but at least we should 117 * try not to panic the whole kernel. */ 118 if (needed < 2) 119 needed = 2; 120 121 /* But we need to bound the transaction so we don't overflow the 122 * journal. 
*/ 123 if (needed > EXT4_MAX_TRANS_DATA) 124 needed = EXT4_MAX_TRANS_DATA; 125 126 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; 127 } 128 129 /* 130 * Truncate transactions can be complex and absolutely huge. So we need to 131 * be able to restart the transaction at a conventient checkpoint to make 132 * sure we don't overflow the journal. 133 * 134 * start_transaction gets us a new handle for a truncate transaction, 135 * and extend_transaction tries to extend the existing one a bit. If 136 * extend fails, we need to propagate the failure up and restart the 137 * transaction in the top-level truncate loop. --sct 138 */ 139 static handle_t *start_transaction(struct inode *inode) 140 { 141 handle_t *result; 142 143 result = ext4_journal_start(inode, blocks_for_truncate(inode)); 144 if (!IS_ERR(result)) 145 return result; 146 147 ext4_std_error(inode->i_sb, PTR_ERR(result)); 148 return result; 149 } 150 151 /* 152 * Try to extend this transaction for the purposes of truncation. 153 * 154 * Returns 0 if we managed to create more room. If we can't create more 155 * room, and the transaction must be restarted we return 1. 156 */ 157 static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 158 { 159 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) 160 return 0; 161 if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) 162 return 0; 163 return 1; 164 } 165 166 /* 167 * Restart the transaction associated with *handle. This does a commit, 168 * so before we call here everything must be consistently dirtied against 169 * this transaction. 170 */ 171 static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 172 { 173 jbd_debug(2, "restarting handle %p\n", handle); 174 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 175 } 176 177 /* 178 * Called at the last iput() if i_nlink is zero. 
179 */ 180 void ext4_delete_inode (struct inode * inode) 181 { 182 handle_t *handle; 183 184 truncate_inode_pages(&inode->i_data, 0); 185 186 if (is_bad_inode(inode)) 187 goto no_delete; 188 189 handle = start_transaction(inode); 190 if (IS_ERR(handle)) { 191 /* 192 * If we're going to skip the normal cleanup, we still need to 193 * make sure that the in-core orphan linked list is properly 194 * cleaned up. 195 */ 196 ext4_orphan_del(NULL, inode); 197 goto no_delete; 198 } 199 200 if (IS_SYNC(inode)) 201 handle->h_sync = 1; 202 inode->i_size = 0; 203 if (inode->i_blocks) 204 ext4_truncate(inode); 205 /* 206 * Kill off the orphan record which ext4_truncate created. 207 * AKPM: I think this can be inside the above `if'. 208 * Note that ext4_orphan_del() has to be able to cope with the 209 * deletion of a non-existent orphan - this is because we don't 210 * know if ext4_truncate() actually created an orphan record. 211 * (Well, we could do this if we need to, but heck - it works) 212 */ 213 ext4_orphan_del(handle, inode); 214 EXT4_I(inode)->i_dtime = get_seconds(); 215 216 /* 217 * One subtle ordering requirement: if anything has gone wrong 218 * (transaction abort, IO errors, whatever), then we can still 219 * do these next steps (the fs will already have been marked as 220 * having errors), but we can't free the inode if the mark_dirty 221 * fails. 222 */ 223 if (ext4_mark_inode_dirty(handle, inode)) 224 /* If that failed, just do the required in-core inode clear. */ 225 clear_inode(inode); 226 else 227 ext4_free_inode(handle, inode); 228 ext4_journal_stop(handle); 229 return; 230 no_delete: 231 clear_inode(inode); /* We must guarantee clearing of inode... 
*/ 232 } 233 234 typedef struct { 235 __le32 *p; 236 __le32 key; 237 struct buffer_head *bh; 238 } Indirect; 239 240 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) 241 { 242 p->key = *(p->p = v); 243 p->bh = bh; 244 } 245 246 /** 247 * ext4_block_to_path - parse the block number into array of offsets 248 * @inode: inode in question (we are only interested in its superblock) 249 * @i_block: block number to be parsed 250 * @offsets: array to store the offsets in 251 * @boundary: set this non-zero if the referred-to block is likely to be 252 * followed (on disk) by an indirect block. 253 * 254 * To store the locations of file's data ext4 uses a data structure common 255 * for UNIX filesystems - tree of pointers anchored in the inode, with 256 * data blocks at leaves and indirect blocks in intermediate nodes. 257 * This function translates the block number into path in that tree - 258 * return value is the path length and @offsets[n] is the offset of 259 * pointer to (n+1)th node in the nth one. If @block is out of range 260 * (negative or too large) warning is printed and zero returned. 261 * 262 * Note: function doesn't find node addresses, so no IO is needed. All 263 * we need to know is the capacity of indirect blocks (taken from the 264 * inode->i_sb). 265 */ 266 267 /* 268 * Portability note: the last comparison (check that we fit into triple 269 * indirect block) is spelled differently, because otherwise on an 270 * architecture with 32-bit longs and 8Kb pages we might get into trouble 271 * if our filesystem had 8Kb blocks. We might use long long, but that would 272 * kill us on x86. Oh, well, at least the sign propagation does not matter - 273 * i_block would have to be negative in the very beginning, so we would not 274 * get there at all. 
275 */ 276 277 static int ext4_block_to_path(struct inode *inode, 278 ext4_lblk_t i_block, 279 ext4_lblk_t offsets[4], int *boundary) 280 { 281 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); 282 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); 283 const long direct_blocks = EXT4_NDIR_BLOCKS, 284 indirect_blocks = ptrs, 285 double_blocks = (1 << (ptrs_bits * 2)); 286 int n = 0; 287 int final = 0; 288 289 if (i_block < 0) { 290 ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0"); 291 } else if (i_block < direct_blocks) { 292 offsets[n++] = i_block; 293 final = direct_blocks; 294 } else if ( (i_block -= direct_blocks) < indirect_blocks) { 295 offsets[n++] = EXT4_IND_BLOCK; 296 offsets[n++] = i_block; 297 final = ptrs; 298 } else if ((i_block -= indirect_blocks) < double_blocks) { 299 offsets[n++] = EXT4_DIND_BLOCK; 300 offsets[n++] = i_block >> ptrs_bits; 301 offsets[n++] = i_block & (ptrs - 1); 302 final = ptrs; 303 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { 304 offsets[n++] = EXT4_TIND_BLOCK; 305 offsets[n++] = i_block >> (ptrs_bits * 2); 306 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); 307 offsets[n++] = i_block & (ptrs - 1); 308 final = ptrs; 309 } else { 310 ext4_warning(inode->i_sb, "ext4_block_to_path", 311 "block %lu > max", 312 i_block + direct_blocks + 313 indirect_blocks + double_blocks); 314 } 315 if (boundary) 316 *boundary = final - 1 - (i_block & (ptrs - 1)); 317 return n; 318 } 319 320 /** 321 * ext4_get_branch - read the chain of indirect blocks leading to data 322 * @inode: inode in question 323 * @depth: depth of the chain (1 - direct pointer, etc.) 324 * @offsets: offsets of pointers in inode/indirect blocks 325 * @chain: place to store the result 326 * @err: here we store the error value 327 * 328 * Function fills the array of triples <key, p, bh> and returns %NULL 329 * if everything went OK or the pointer to the last filled triple 330 * (incomplete one) otherwise. 
Upon the return chain[i].key contains 331 * the number of (i+1)-th block in the chain (as it is stored in memory, 332 * i.e. little-endian 32-bit), chain[i].p contains the address of that 333 * number (it points into struct inode for i==0 and into the bh->b_data 334 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect 335 * block for i>0 and NULL for i==0. In other words, it holds the block 336 * numbers of the chain, addresses they were taken from (and where we can 337 * verify that chain did not change) and buffer_heads hosting these 338 * numbers. 339 * 340 * Function stops when it stumbles upon zero pointer (absent block) 341 * (pointer to last triple returned, *@err == 0) 342 * or when it gets an IO error reading an indirect block 343 * (ditto, *@err == -EIO) 344 * or when it reads all @depth-1 indirect blocks successfully and finds 345 * the whole chain, all way to the data (returns %NULL, *err == 0). 346 * 347 * Need to be called with 348 * down_read(&EXT4_I(inode)->i_data_sem) 349 */ 350 static Indirect *ext4_get_branch(struct inode *inode, int depth, 351 ext4_lblk_t *offsets, 352 Indirect chain[4], int *err) 353 { 354 struct super_block *sb = inode->i_sb; 355 Indirect *p = chain; 356 struct buffer_head *bh; 357 358 *err = 0; 359 /* i_data is not going away, no lock needed */ 360 add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets); 361 if (!p->key) 362 goto no_block; 363 while (--depth) { 364 bh = sb_bread(sb, le32_to_cpu(p->key)); 365 if (!bh) 366 goto failure; 367 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); 368 /* Reader: end */ 369 if (!p->key) 370 goto no_block; 371 } 372 return NULL; 373 374 failure: 375 *err = -EIO; 376 no_block: 377 return p; 378 } 379 380 /** 381 * ext4_find_near - find a place for allocation with sufficient locality 382 * @inode: owner 383 * @ind: descriptor of indirect block. 384 * 385 * This function returns the prefered place for block allocation. 
386 * It is used when heuristic for sequential allocation fails. 387 * Rules are: 388 * + if there is a block to the left of our position - allocate near it. 389 * + if pointer will live in indirect block - allocate near that block. 390 * + if pointer will live in inode - allocate in the same 391 * cylinder group. 392 * 393 * In the latter case we colour the starting block by the callers PID to 394 * prevent it from clashing with concurrent allocations for a different inode 395 * in the same block group. The PID is used here so that functionally related 396 * files will be close-by on-disk. 397 * 398 * Caller must make sure that @ind is valid and will stay that way. 399 */ 400 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 401 { 402 struct ext4_inode_info *ei = EXT4_I(inode); 403 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; 404 __le32 *p; 405 ext4_fsblk_t bg_start; 406 ext4_grpblk_t colour; 407 408 /* Try to find previous block */ 409 for (p = ind->p - 1; p >= start; p--) { 410 if (*p) 411 return le32_to_cpu(*p); 412 } 413 414 /* No such thing, so let's try location of indirect block */ 415 if (ind->bh) 416 return ind->bh->b_blocknr; 417 418 /* 419 * It is going to be referred to from the inode itself? OK, just put it 420 * into the same cylinder group then. 421 */ 422 bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); 423 colour = (current->pid % 16) * 424 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 425 return bg_start + colour; 426 } 427 428 /** 429 * ext4_find_goal - find a prefered place for allocation. 430 * @inode: owner 431 * @block: block we want 432 * @chain: chain of indirect blocks 433 * @partial: pointer to the last triple within a chain 434 * @goal: place to store the result. 435 * 436 * Normally this function find the prefered place for block allocation, 437 * stores it in *@goal and returns zero. 
438 */ 439 440 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 441 Indirect chain[4], Indirect *partial) 442 { 443 struct ext4_block_alloc_info *block_i; 444 445 block_i = EXT4_I(inode)->i_block_alloc_info; 446 447 /* 448 * try the heuristic for sequential allocation, 449 * failing that at least try to get decent locality. 450 */ 451 if (block_i && (block == block_i->last_alloc_logical_block + 1) 452 && (block_i->last_alloc_physical_block != 0)) { 453 return block_i->last_alloc_physical_block + 1; 454 } 455 456 return ext4_find_near(inode, partial); 457 } 458 459 /** 460 * ext4_blks_to_allocate: Look up the block map and count the number 461 * of direct blocks need to be allocated for the given branch. 462 * 463 * @branch: chain of indirect blocks 464 * @k: number of blocks need for indirect blocks 465 * @blks: number of data blocks to be mapped. 466 * @blocks_to_boundary: the offset in the indirect block 467 * 468 * return the total number of blocks to be allocate, including the 469 * direct and indirect blocks. 
470 */ 471 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, 472 int blocks_to_boundary) 473 { 474 unsigned long count = 0; 475 476 /* 477 * Simple case, [t,d]Indirect block(s) has not allocated yet 478 * then it's clear blocks on that path have not allocated 479 */ 480 if (k > 0) { 481 /* right now we don't handle cross boundary allocation */ 482 if (blks < blocks_to_boundary + 1) 483 count += blks; 484 else 485 count += blocks_to_boundary + 1; 486 return count; 487 } 488 489 count++; 490 while (count < blks && count <= blocks_to_boundary && 491 le32_to_cpu(*(branch[0].p + count)) == 0) { 492 count++; 493 } 494 return count; 495 } 496 497 /** 498 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 499 * @indirect_blks: the number of blocks need to allocate for indirect 500 * blocks 501 * 502 * @new_blocks: on return it will store the new block numbers for 503 * the indirect blocks(if needed) and the first direct block, 504 * @blks: on return it will store the total number of allocated 505 * direct blocks 506 */ 507 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 508 ext4_fsblk_t goal, int indirect_blks, int blks, 509 ext4_fsblk_t new_blocks[4], int *err) 510 { 511 int target, i; 512 unsigned long count = 0; 513 int index = 0; 514 ext4_fsblk_t current_block = 0; 515 int ret = 0; 516 517 /* 518 * Here we try to allocate the requested multiple blocks at once, 519 * on a best-effort basis. 520 * To build a branch, we should allocate blocks for 521 * the indirect blocks(if not allocated yet), and at least 522 * the first direct block of this branch. 
That's the 523 * minimum number of blocks need to allocate(required) 524 */ 525 target = blks + indirect_blks; 526 527 while (1) { 528 count = target; 529 /* allocating blocks for indirect blocks and direct blocks */ 530 current_block = ext4_new_blocks(handle,inode,goal,&count,err); 531 if (*err) 532 goto failed_out; 533 534 target -= count; 535 /* allocate blocks for indirect blocks */ 536 while (index < indirect_blks && count) { 537 new_blocks[index++] = current_block++; 538 count--; 539 } 540 541 if (count > 0) 542 break; 543 } 544 545 /* save the new block number for the first direct block */ 546 new_blocks[index] = current_block; 547 548 /* total number of blocks allocated for direct blocks */ 549 ret = count; 550 *err = 0; 551 return ret; 552 failed_out: 553 for (i = 0; i <index; i++) 554 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 555 return ret; 556 } 557 558 /** 559 * ext4_alloc_branch - allocate and set up a chain of blocks. 560 * @inode: owner 561 * @indirect_blks: number of allocated indirect blocks 562 * @blks: number of allocated direct blocks 563 * @offsets: offsets (in the blocks) to store the pointers to next. 564 * @branch: place to store the chain in. 565 * 566 * This function allocates blocks, zeroes out all but the last one, 567 * links them into chain and (if we are synchronous) writes them to disk. 568 * In other words, it prepares a branch that can be spliced onto the 569 * inode. It stores the information about that chain in the branch[], in 570 * the same format as ext4_get_branch() would do. We are calling it after 571 * we had read the existing part of chain and partial points to the last 572 * triple of that (one with zero ->key). 
Upon the exit we have the same 573 * picture as after the successful ext4_get_block(), except that in one 574 * place chain is disconnected - *branch->p is still zero (we did not 575 * set the last link), but branch->key contains the number that should 576 * be placed into *branch->p to fill that gap. 577 * 578 * If allocation fails we free all blocks we've allocated (and forget 579 * their buffer_heads) and return the error value the from failed 580 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain 581 * as described above and return 0. 582 */ 583 static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 584 int indirect_blks, int *blks, ext4_fsblk_t goal, 585 ext4_lblk_t *offsets, Indirect *branch) 586 { 587 int blocksize = inode->i_sb->s_blocksize; 588 int i, n = 0; 589 int err = 0; 590 struct buffer_head *bh; 591 int num; 592 ext4_fsblk_t new_blocks[4]; 593 ext4_fsblk_t current_block; 594 595 num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, 596 *blks, new_blocks, &err); 597 if (err) 598 return err; 599 600 branch[0].key = cpu_to_le32(new_blocks[0]); 601 /* 602 * metadata blocks and data blocks are allocated. 603 */ 604 for (n = 1; n <= indirect_blks; n++) { 605 /* 606 * Get buffer_head for parent block, zero it out 607 * and set the pointer to new one, then send 608 * parent to disk. 
609 */ 610 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 611 branch[n].bh = bh; 612 lock_buffer(bh); 613 BUFFER_TRACE(bh, "call get_create_access"); 614 err = ext4_journal_get_create_access(handle, bh); 615 if (err) { 616 unlock_buffer(bh); 617 brelse(bh); 618 goto failed; 619 } 620 621 memset(bh->b_data, 0, blocksize); 622 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 623 branch[n].key = cpu_to_le32(new_blocks[n]); 624 *branch[n].p = branch[n].key; 625 if ( n == indirect_blks) { 626 current_block = new_blocks[n]; 627 /* 628 * End of chain, update the last new metablock of 629 * the chain to point to the new allocated 630 * data blocks numbers 631 */ 632 for (i=1; i < num; i++) 633 *(branch[n].p + i) = cpu_to_le32(++current_block); 634 } 635 BUFFER_TRACE(bh, "marking uptodate"); 636 set_buffer_uptodate(bh); 637 unlock_buffer(bh); 638 639 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 640 err = ext4_journal_dirty_metadata(handle, bh); 641 if (err) 642 goto failed; 643 } 644 *blks = num; 645 return err; 646 failed: 647 /* Allocation failed, free what we already allocated */ 648 for (i = 1; i <= n ; i++) { 649 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 650 ext4_journal_forget(handle, branch[i].bh); 651 } 652 for (i = 0; i <indirect_blks; i++) 653 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 654 655 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 656 657 return err; 658 } 659 660 /** 661 * ext4_splice_branch - splice the allocated branch onto inode. 662 * @inode: owner 663 * @block: (logical) number of block we are adding 664 * @chain: chain of indirect blocks (with a missing link - see 665 * ext4_alloc_branch) 666 * @where: location of missing link 667 * @num: number of indirect blocks we are adding 668 * @blks: number of direct blocks we are adding 669 * 670 * This function fills the missing link and does all housekeeping needed in 671 * inode (->i_blocks, etc.). 
In case of success we end up with the full 672 * chain to new block and return 0. 673 */ 674 static int ext4_splice_branch(handle_t *handle, struct inode *inode, 675 ext4_lblk_t block, Indirect *where, int num, int blks) 676 { 677 int i; 678 int err = 0; 679 struct ext4_block_alloc_info *block_i; 680 ext4_fsblk_t current_block; 681 682 block_i = EXT4_I(inode)->i_block_alloc_info; 683 /* 684 * If we're splicing into a [td]indirect block (as opposed to the 685 * inode) then we need to get write access to the [td]indirect block 686 * before the splice. 687 */ 688 if (where->bh) { 689 BUFFER_TRACE(where->bh, "get_write_access"); 690 err = ext4_journal_get_write_access(handle, where->bh); 691 if (err) 692 goto err_out; 693 } 694 /* That's it */ 695 696 *where->p = where->key; 697 698 /* 699 * Update the host buffer_head or inode to point to more just allocated 700 * direct blocks blocks 701 */ 702 if (num == 0 && blks > 1) { 703 current_block = le32_to_cpu(where->key) + 1; 704 for (i = 1; i < blks; i++) 705 *(where->p + i ) = cpu_to_le32(current_block++); 706 } 707 708 /* 709 * update the most recently allocated logical & physical block 710 * in i_block_alloc_info, to assist find the proper goal block for next 711 * allocation 712 */ 713 if (block_i) { 714 block_i->last_alloc_logical_block = block + blks - 1; 715 block_i->last_alloc_physical_block = 716 le32_to_cpu(where[num].key) + blks - 1; 717 } 718 719 /* We are done with atomic stuff, now do the rest of housekeeping */ 720 721 inode->i_ctime = ext4_current_time(inode); 722 ext4_mark_inode_dirty(handle, inode); 723 724 /* had we spliced it onto indirect block? */ 725 if (where->bh) { 726 /* 727 * If we spliced it onto an indirect block, we haven't 728 * altered the inode. Note however that if it is being spliced 729 * onto an indirect block at the very end of the file (the 730 * file is growing) then we *will* alter the inode to reflect 731 * the new i_size. 
But that is not done here - it is done in 732 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 733 */ 734 jbd_debug(5, "splicing indirect only\n"); 735 BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata"); 736 err = ext4_journal_dirty_metadata(handle, where->bh); 737 if (err) 738 goto err_out; 739 } else { 740 /* 741 * OK, we spliced it into the inode itself on a direct block. 742 * Inode was dirtied above. 743 */ 744 jbd_debug(5, "splicing direct\n"); 745 } 746 return err; 747 748 err_out: 749 for (i = 1; i <= num; i++) { 750 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); 751 ext4_journal_forget(handle, where[i].bh); 752 ext4_free_blocks(handle, inode, 753 le32_to_cpu(where[i-1].key), 1, 0); 754 } 755 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); 756 757 return err; 758 } 759 760 /* 761 * Allocation strategy is simple: if we have to allocate something, we will 762 * have to go the whole way to leaf. So let's do it before attaching anything 763 * to tree, set linkage between the newborn blocks, write them if sync is 764 * required, recheck the path, free and repeat if check fails, otherwise 765 * set the last missing link (that will protect us from any truncate-generated 766 * removals - all blocks on the path are immune now) and possibly force the 767 * write on the parent block. 768 * That has a nice additional property: no special recovery from the failed 769 * allocations is needed - we simply release blocks and do not touch anything 770 * reachable from inode. 771 * 772 * `handle' can be NULL if create == 0. 773 * 774 * The BKL may not be held on entry here. Be sure to take it early. 775 * return > 0, # of blocks mapped or allocated. 776 * return = 0, if plain lookup failed. 777 * return < 0, error case. 778 * 779 * 780 * Need to be called with 781 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 782 * (ie, create is zero). 
Otherwise down_write(&EXT4_I(inode)->i_data_sem) 783 */ 784 int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 785 ext4_lblk_t iblock, unsigned long maxblocks, 786 struct buffer_head *bh_result, 787 int create, int extend_disksize) 788 { 789 int err = -EIO; 790 ext4_lblk_t offsets[4]; 791 Indirect chain[4]; 792 Indirect *partial; 793 ext4_fsblk_t goal; 794 int indirect_blks; 795 int blocks_to_boundary = 0; 796 int depth; 797 struct ext4_inode_info *ei = EXT4_I(inode); 798 int count = 0; 799 ext4_fsblk_t first_block = 0; 800 801 802 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 803 J_ASSERT(handle != NULL || create == 0); 804 depth = ext4_block_to_path(inode, iblock, offsets, 805 &blocks_to_boundary); 806 807 if (depth == 0) 808 goto out; 809 810 partial = ext4_get_branch(inode, depth, offsets, chain, &err); 811 812 /* Simplest case - block found, no allocation needed */ 813 if (!partial) { 814 first_block = le32_to_cpu(chain[depth - 1].key); 815 clear_buffer_new(bh_result); 816 count++; 817 /*map more blocks*/ 818 while (count < maxblocks && count <= blocks_to_boundary) { 819 ext4_fsblk_t blk; 820 821 blk = le32_to_cpu(*(chain[depth-1].p + count)); 822 823 if (blk == first_block + count) 824 count++; 825 else 826 break; 827 } 828 goto got_it; 829 } 830 831 /* Next simple case - plain lookup or failed read of indirect block */ 832 if (!create || err == -EIO) 833 goto cleanup; 834 835 /* 836 * Okay, we need to do block allocation. Lazily initialize the block 837 * allocation info here if necessary 838 */ 839 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) 840 ext4_init_block_alloc_info(inode); 841 842 goal = ext4_find_goal(inode, iblock, chain, partial); 843 844 /* the number of blocks need to allocate for [d,t]indirect blocks */ 845 indirect_blks = (chain + depth) - partial - 1; 846 847 /* 848 * Next look up the indirect map to count the totoal number of 849 * direct blocks to allocate for this branch. 
850 */ 851 count = ext4_blks_to_allocate(partial, indirect_blks, 852 maxblocks, blocks_to_boundary); 853 /* 854 * Block out ext4_truncate while we alter the tree 855 */ 856 err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, 857 offsets + (partial - chain), partial); 858 859 /* 860 * The ext4_splice_branch call will free and forget any buffers 861 * on the new chain if there is a failure, but that risks using 862 * up transaction credits, especially for bitmaps where the 863 * credits cannot be returned. Can we handle this somehow? We 864 * may need to return -EAGAIN upwards in the worst case. --sct 865 */ 866 if (!err) 867 err = ext4_splice_branch(handle, inode, iblock, 868 partial, indirect_blks, count); 869 /* 870 * i_disksize growing is protected by i_data_sem. Don't forget to 871 * protect it if you're about to implement concurrent 872 * ext4_get_block() -bzzz 873 */ 874 if (!err && extend_disksize && inode->i_size > ei->i_disksize) 875 ei->i_disksize = inode->i_size; 876 if (err) 877 goto cleanup; 878 879 set_buffer_new(bh_result); 880 got_it: 881 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 882 if (count > blocks_to_boundary) 883 set_buffer_boundary(bh_result); 884 err = count; 885 /* Clean up and exit */ 886 partial = chain + depth - 1; /* the whole chain */ 887 cleanup: 888 while (partial > chain) { 889 BUFFER_TRACE(partial->bh, "call brelse"); 890 brelse(partial->bh); 891 partial--; 892 } 893 BUFFER_TRACE(bh_result, "returned"); 894 out: 895 return err; 896 } 897 898 #define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32) 899 900 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 901 unsigned long max_blocks, struct buffer_head *bh, 902 int create, int extend_disksize) 903 { 904 int retval; 905 /* 906 * Try to see if we can get the block without requesting 907 * for new file system block. 
908 */ 909 down_read((&EXT4_I(inode)->i_data_sem)); 910 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 911 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 912 bh, 0, 0); 913 } else { 914 retval = ext4_get_blocks_handle(handle, 915 inode, block, max_blocks, bh, 0, 0); 916 } 917 up_read((&EXT4_I(inode)->i_data_sem)); 918 if (!create || (retval > 0)) 919 return retval; 920 921 /* 922 * We need to allocate new blocks which will result 923 * in i_data update 924 */ 925 down_write((&EXT4_I(inode)->i_data_sem)); 926 /* 927 * We need to check for EXT4 here because migrate 928 * could have changed the inode type in between 929 */ 930 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 931 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 932 bh, create, extend_disksize); 933 } else { 934 retval = ext4_get_blocks_handle(handle, inode, block, 935 max_blocks, bh, create, extend_disksize); 936 } 937 up_write((&EXT4_I(inode)->i_data_sem)); 938 return retval; 939 } 940 941 static int ext4_get_block(struct inode *inode, sector_t iblock, 942 struct buffer_head *bh_result, int create) 943 { 944 handle_t *handle = ext4_journal_current_handle(); 945 int ret = 0; 946 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 947 948 if (!create) 949 goto get_block; /* A read */ 950 951 if (max_blocks == 1) 952 goto get_block; /* A single block get */ 953 954 if (handle->h_transaction->t_state == T_LOCKED) { 955 /* 956 * Huge direct-io writes can hold off commits for long 957 * periods of time. Let this commit run. 958 */ 959 ext4_journal_stop(handle); 960 handle = ext4_journal_start(inode, DIO_CREDITS); 961 if (IS_ERR(handle)) 962 ret = PTR_ERR(handle); 963 goto get_block; 964 } 965 966 if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) { 967 /* 968 * Getting low on buffer credits... 969 */ 970 ret = ext4_journal_extend(handle, DIO_CREDITS); 971 if (ret > 0) { 972 /* 973 * Couldn't extend the transaction. Start a new one. 
974 */ 975 ret = ext4_journal_restart(handle, DIO_CREDITS); 976 } 977 } 978 979 get_block: 980 if (ret == 0) { 981 ret = ext4_get_blocks_wrap(handle, inode, iblock, 982 max_blocks, bh_result, create, 0); 983 if (ret > 0) { 984 bh_result->b_size = (ret << inode->i_blkbits); 985 ret = 0; 986 } 987 } 988 return ret; 989 } 990 991 /* 992 * `handle' can be NULL if create is zero 993 */ 994 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 995 ext4_lblk_t block, int create, int *errp) 996 { 997 struct buffer_head dummy; 998 int fatal = 0, err; 999 1000 J_ASSERT(handle != NULL || create == 0); 1001 1002 dummy.b_state = 0; 1003 dummy.b_blocknr = -1000; 1004 buffer_trace_init(&dummy.b_history); 1005 err = ext4_get_blocks_wrap(handle, inode, block, 1, 1006 &dummy, create, 1); 1007 /* 1008 * ext4_get_blocks_handle() returns number of blocks 1009 * mapped. 0 in case of a HOLE. 1010 */ 1011 if (err > 0) { 1012 if (err > 1) 1013 WARN_ON(1); 1014 err = 0; 1015 } 1016 *errp = err; 1017 if (!err && buffer_mapped(&dummy)) { 1018 struct buffer_head *bh; 1019 bh = sb_getblk(inode->i_sb, dummy.b_blocknr); 1020 if (!bh) { 1021 *errp = -EIO; 1022 goto err; 1023 } 1024 if (buffer_new(&dummy)) { 1025 J_ASSERT(create != 0); 1026 J_ASSERT(handle != NULL); 1027 1028 /* 1029 * Now that we do not always journal data, we should 1030 * keep in mind whether this should always journal the 1031 * new buffer as metadata. For now, regular file 1032 * writes use ext4_get_block instead, so it's not a 1033 * problem. 
1034 */ 1035 lock_buffer(bh); 1036 BUFFER_TRACE(bh, "call get_create_access"); 1037 fatal = ext4_journal_get_create_access(handle, bh); 1038 if (!fatal && !buffer_uptodate(bh)) { 1039 memset(bh->b_data,0,inode->i_sb->s_blocksize); 1040 set_buffer_uptodate(bh); 1041 } 1042 unlock_buffer(bh); 1043 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1044 err = ext4_journal_dirty_metadata(handle, bh); 1045 if (!fatal) 1046 fatal = err; 1047 } else { 1048 BUFFER_TRACE(bh, "not a new buffer"); 1049 } 1050 if (fatal) { 1051 *errp = fatal; 1052 brelse(bh); 1053 bh = NULL; 1054 } 1055 return bh; 1056 } 1057 err: 1058 return NULL; 1059 } 1060 1061 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1062 ext4_lblk_t block, int create, int *err) 1063 { 1064 struct buffer_head * bh; 1065 1066 bh = ext4_getblk(handle, inode, block, create, err); 1067 if (!bh) 1068 return bh; 1069 if (buffer_uptodate(bh)) 1070 return bh; 1071 ll_rw_block(READ_META, 1, &bh); 1072 wait_on_buffer(bh); 1073 if (buffer_uptodate(bh)) 1074 return bh; 1075 put_bh(bh); 1076 *err = -EIO; 1077 return NULL; 1078 } 1079 1080 static int walk_page_buffers( handle_t *handle, 1081 struct buffer_head *head, 1082 unsigned from, 1083 unsigned to, 1084 int *partial, 1085 int (*fn)( handle_t *handle, 1086 struct buffer_head *bh)) 1087 { 1088 struct buffer_head *bh; 1089 unsigned block_start, block_end; 1090 unsigned blocksize = head->b_size; 1091 int err, ret = 0; 1092 struct buffer_head *next; 1093 1094 for ( bh = head, block_start = 0; 1095 ret == 0 && (bh != head || !block_start); 1096 block_start = block_end, bh = next) 1097 { 1098 next = bh->b_this_page; 1099 block_end = block_start + blocksize; 1100 if (block_end <= from || block_start >= to) { 1101 if (partial && !buffer_uptodate(bh)) 1102 *partial = 1; 1103 continue; 1104 } 1105 err = (*fn)(handle, bh); 1106 if (!ret) 1107 ret = err; 1108 } 1109 return ret; 1110 } 1111 1112 /* 1113 * To preserve ordering, it is essential that the hole 
instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the commit_write().  So doing the jbd2_journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext4_writepage() ->
 * block_write_full_page(). In that case, we *know* that ext4_writepage()
 * has generated enough buffer credits to do the whole page.  So we won't
 * block on the journal in that case, which is good, because the caller may
 * be PF_MEMALLOC.
 *
 * By accident, ext4 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
1136 */ 1137 static int do_journal_get_write_access(handle_t *handle, 1138 struct buffer_head *bh) 1139 { 1140 if (!buffer_mapped(bh) || buffer_freed(bh)) 1141 return 0; 1142 return ext4_journal_get_write_access(handle, bh); 1143 } 1144 1145 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1146 loff_t pos, unsigned len, unsigned flags, 1147 struct page **pagep, void **fsdata) 1148 { 1149 struct inode *inode = mapping->host; 1150 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 1151 handle_t *handle; 1152 int retries = 0; 1153 struct page *page; 1154 pgoff_t index; 1155 unsigned from, to; 1156 1157 index = pos >> PAGE_CACHE_SHIFT; 1158 from = pos & (PAGE_CACHE_SIZE - 1); 1159 to = from + len; 1160 1161 retry: 1162 page = __grab_cache_page(mapping, index); 1163 if (!page) 1164 return -ENOMEM; 1165 *pagep = page; 1166 1167 handle = ext4_journal_start(inode, needed_blocks); 1168 if (IS_ERR(handle)) { 1169 unlock_page(page); 1170 page_cache_release(page); 1171 ret = PTR_ERR(handle); 1172 goto out; 1173 } 1174 1175 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1176 ext4_get_block); 1177 1178 if (!ret && ext4_should_journal_data(inode)) { 1179 ret = walk_page_buffers(handle, page_buffers(page), 1180 from, to, NULL, do_journal_get_write_access); 1181 } 1182 1183 if (ret) { 1184 ext4_journal_stop(handle); 1185 unlock_page(page); 1186 page_cache_release(page); 1187 } 1188 1189 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1190 goto retry; 1191 out: 1192 return ret; 1193 } 1194 1195 int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) 1196 { 1197 int err = jbd2_journal_dirty_data(handle, bh); 1198 if (err) 1199 ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__, 1200 bh, handle, err); 1201 return err; 1202 } 1203 1204 /* For write_end() in data=journal mode */ 1205 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1206 { 1207 if (!buffer_mapped(bh) || 
buffer_freed(bh)) 1208 return 0; 1209 set_buffer_uptodate(bh); 1210 return ext4_journal_dirty_metadata(handle, bh); 1211 } 1212 1213 /* 1214 * Generic write_end handler for ordered and writeback ext4 journal modes. 1215 * We can't use generic_write_end, because that unlocks the page and we need to 1216 * unlock the page after ext4_journal_stop, but ext4_journal_stop must run 1217 * after block_write_end. 1218 */ 1219 static int ext4_generic_write_end(struct file *file, 1220 struct address_space *mapping, 1221 loff_t pos, unsigned len, unsigned copied, 1222 struct page *page, void *fsdata) 1223 { 1224 struct inode *inode = file->f_mapping->host; 1225 1226 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1227 1228 if (pos+copied > inode->i_size) { 1229 i_size_write(inode, pos+copied); 1230 mark_inode_dirty(inode); 1231 } 1232 1233 return copied; 1234 } 1235 1236 /* 1237 * We need to pick up the new inode size which generic_commit_write gave us 1238 * `file' can be NULL - eg, when called from page_symlink(). 1239 * 1240 * ext4 never places buffers on inode->i_mapping->private_list. metadata 1241 * buffers are managed internally. 1242 */ 1243 static int ext4_ordered_write_end(struct file *file, 1244 struct address_space *mapping, 1245 loff_t pos, unsigned len, unsigned copied, 1246 struct page *page, void *fsdata) 1247 { 1248 handle_t *handle = ext4_journal_current_handle(); 1249 struct inode *inode = file->f_mapping->host; 1250 unsigned from, to; 1251 int ret = 0, ret2; 1252 1253 from = pos & (PAGE_CACHE_SIZE - 1); 1254 to = from + len; 1255 1256 ret = walk_page_buffers(handle, page_buffers(page), 1257 from, to, NULL, ext4_journal_dirty_data); 1258 1259 if (ret == 0) { 1260 /* 1261 * generic_write_end() will run mark_inode_dirty() if i_size 1262 * changes. So let's piggyback the i_disksize mark_inode_dirty 1263 * into that. 
1264 */ 1265 loff_t new_i_size; 1266 1267 new_i_size = pos + copied; 1268 if (new_i_size > EXT4_I(inode)->i_disksize) 1269 EXT4_I(inode)->i_disksize = new_i_size; 1270 copied = ext4_generic_write_end(file, mapping, pos, len, copied, 1271 page, fsdata); 1272 if (copied < 0) 1273 ret = copied; 1274 } 1275 ret2 = ext4_journal_stop(handle); 1276 if (!ret) 1277 ret = ret2; 1278 unlock_page(page); 1279 page_cache_release(page); 1280 1281 return ret ? ret : copied; 1282 } 1283 1284 static int ext4_writeback_write_end(struct file *file, 1285 struct address_space *mapping, 1286 loff_t pos, unsigned len, unsigned copied, 1287 struct page *page, void *fsdata) 1288 { 1289 handle_t *handle = ext4_journal_current_handle(); 1290 struct inode *inode = file->f_mapping->host; 1291 int ret = 0, ret2; 1292 loff_t new_i_size; 1293 1294 new_i_size = pos + copied; 1295 if (new_i_size > EXT4_I(inode)->i_disksize) 1296 EXT4_I(inode)->i_disksize = new_i_size; 1297 1298 copied = ext4_generic_write_end(file, mapping, pos, len, copied, 1299 page, fsdata); 1300 if (copied < 0) 1301 ret = copied; 1302 1303 ret2 = ext4_journal_stop(handle); 1304 if (!ret) 1305 ret = ret2; 1306 unlock_page(page); 1307 page_cache_release(page); 1308 1309 return ret ? 
ret : copied; 1310 } 1311 1312 static int ext4_journalled_write_end(struct file *file, 1313 struct address_space *mapping, 1314 loff_t pos, unsigned len, unsigned copied, 1315 struct page *page, void *fsdata) 1316 { 1317 handle_t *handle = ext4_journal_current_handle(); 1318 struct inode *inode = mapping->host; 1319 int ret = 0, ret2; 1320 int partial = 0; 1321 unsigned from, to; 1322 1323 from = pos & (PAGE_CACHE_SIZE - 1); 1324 to = from + len; 1325 1326 if (copied < len) { 1327 if (!PageUptodate(page)) 1328 copied = 0; 1329 page_zero_new_buffers(page, from+copied, to); 1330 } 1331 1332 ret = walk_page_buffers(handle, page_buffers(page), from, 1333 to, &partial, write_end_fn); 1334 if (!partial) 1335 SetPageUptodate(page); 1336 if (pos+copied > inode->i_size) 1337 i_size_write(inode, pos+copied); 1338 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1339 if (inode->i_size > EXT4_I(inode)->i_disksize) { 1340 EXT4_I(inode)->i_disksize = inode->i_size; 1341 ret2 = ext4_mark_inode_dirty(handle, inode); 1342 if (!ret) 1343 ret = ret2; 1344 } 1345 1346 ret2 = ext4_journal_stop(handle); 1347 if (!ret) 1348 ret = ret2; 1349 unlock_page(page); 1350 page_cache_release(page); 1351 1352 return ret ? ret : copied; 1353 } 1354 1355 /* 1356 * bmap() is special. It gets used by applications such as lilo and by 1357 * the swapper to find the on-disk block of a specific piece of data. 1358 * 1359 * Naturally, this is dangerous if the block concerned is still in the 1360 * journal. If somebody makes a swapfile on an ext4 data-journaling 1361 * filesystem and enables swap, then they may get a nasty shock when the 1362 * data getting swapped to that swapfile suddenly gets overwritten by 1363 * the original zero's written out previously to the journal and 1364 * awaiting writeback in the kernel's buffer cache. 1365 * 1366 * So, if we see any bmap calls here on a modified, data-journaled file, 1367 * take extra steps to flush any blocks which might be in the cache. 
1368 */ 1369 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 1370 { 1371 struct inode *inode = mapping->host; 1372 journal_t *journal; 1373 int err; 1374 1375 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 1376 /* 1377 * This is a REALLY heavyweight approach, but the use of 1378 * bmap on dirty files is expected to be extremely rare: 1379 * only if we run lilo or swapon on a freshly made file 1380 * do we expect this to happen. 1381 * 1382 * (bmap requires CAP_SYS_RAWIO so this does not 1383 * represent an unprivileged user DOS attack --- we'd be 1384 * in trouble if mortal users could trigger this path at 1385 * will.) 1386 * 1387 * NB. EXT4_STATE_JDATA is not set on files other than 1388 * regular files. If somebody wants to bmap a directory 1389 * or symlink and gets confused because the buffer 1390 * hasn't yet been flushed to disk, they deserve 1391 * everything they get. 1392 */ 1393 1394 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; 1395 journal = EXT4_JOURNAL(inode); 1396 jbd2_journal_lock_updates(journal); 1397 err = jbd2_journal_flush(journal); 1398 jbd2_journal_unlock_updates(journal); 1399 1400 if (err) 1401 return 0; 1402 } 1403 1404 return generic_block_bmap(mapping,block,ext4_get_block); 1405 } 1406 1407 static int bget_one(handle_t *handle, struct buffer_head *bh) 1408 { 1409 get_bh(bh); 1410 return 0; 1411 } 1412 1413 static int bput_one(handle_t *handle, struct buffer_head *bh) 1414 { 1415 put_bh(bh); 1416 return 0; 1417 } 1418 1419 static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) 1420 { 1421 if (buffer_mapped(bh)) 1422 return ext4_journal_dirty_data(handle, bh); 1423 return 0; 1424 } 1425 1426 /* 1427 * Note that we always start a transaction even if we're not journalling 1428 * data. 
This is to preserve ordering: any hole instantiation within
 * __block_write_full_page -> ext4_get_block() should be journalled
 * along with the data so we don't crash and then get metadata which
 * refers to old data.
 *
 * In all journalling modes block_write_full_page() will start the I/O.
 *
 * Problem:
 *
 *   ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
 *		ext4_writepage()
 *
 * Similar for:
 *
 *   ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
 *
 * Same applies to ext4_get_block().  We will deadlock on various things like
 * lock_journal and i_data_sem
 *
 * Setting PF_MEMALLOC here doesn't work - too many internal memory
 * allocations fail.
 *
 * 16May01: If we're reentered then journal_current_handle() will be
 *	    non-zero. We simply *return*.
 *
 * 1 July 2001: @@@ FIXME:
 *   In journalled data mode, a data buffer may be metadata against the
 *   current transaction.  But the same file is part of a shared mapping
 *   and someone does a writepage() on it.
 *
 *   We will move the buffer onto the async_data list, but *after* it has
 *   been dirtied. So there's a small window where we have dirty data on
 *   BJ_Metadata.
 *
 *   Note that this only applies to the last partial page in the file.  The
 *   bit which block_write_full_page() uses prepare/commit for.  (That's
 *   broken code anyway: it's wrong for msync()).
 *
 *   It's a rare case: affects the final partial page, for journalled data
 *   where the file is subject to both write() and writepage() in the same
 *   transaction.  To fix it we'll need a custom block_write_full_page().
 *   We'll probably need that anyway for journalling writepage() output.
 *
 * We don't honour synchronous mounts for writepage().  That would be
 * disastrous.  Any write() or metadata operation will sync the fs for
 * us.
1474 * 1475 * AKPM2: if all the page's buffers are mapped to disk and !data=journal, 1476 * we don't need to open a transaction here. 1477 */ 1478 static int ext4_ordered_writepage(struct page *page, 1479 struct writeback_control *wbc) 1480 { 1481 struct inode *inode = page->mapping->host; 1482 struct buffer_head *page_bufs; 1483 handle_t *handle = NULL; 1484 int ret = 0; 1485 int err; 1486 1487 J_ASSERT(PageLocked(page)); 1488 1489 /* 1490 * We give up here if we're reentered, because it might be for a 1491 * different filesystem. 1492 */ 1493 if (ext4_journal_current_handle()) 1494 goto out_fail; 1495 1496 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 1497 1498 if (IS_ERR(handle)) { 1499 ret = PTR_ERR(handle); 1500 goto out_fail; 1501 } 1502 1503 if (!page_has_buffers(page)) { 1504 create_empty_buffers(page, inode->i_sb->s_blocksize, 1505 (1 << BH_Dirty)|(1 << BH_Uptodate)); 1506 } 1507 page_bufs = page_buffers(page); 1508 walk_page_buffers(handle, page_bufs, 0, 1509 PAGE_CACHE_SIZE, NULL, bget_one); 1510 1511 ret = block_write_full_page(page, ext4_get_block, wbc); 1512 1513 /* 1514 * The page can become unlocked at any point now, and 1515 * truncate can then come in and change things. So we 1516 * can't touch *page from now on. But *page_bufs is 1517 * safe due to elevated refcount. 1518 */ 1519 1520 /* 1521 * And attach them to the current transaction. But only if 1522 * block_write_full_page() succeeded. Otherwise they are unmapped, 1523 * and generally junk. 
1524 */ 1525 if (ret == 0) { 1526 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, 1527 NULL, jbd2_journal_dirty_data_fn); 1528 if (!ret) 1529 ret = err; 1530 } 1531 walk_page_buffers(handle, page_bufs, 0, 1532 PAGE_CACHE_SIZE, NULL, bput_one); 1533 err = ext4_journal_stop(handle); 1534 if (!ret) 1535 ret = err; 1536 return ret; 1537 1538 out_fail: 1539 redirty_page_for_writepage(wbc, page); 1540 unlock_page(page); 1541 return ret; 1542 } 1543 1544 static int ext4_writeback_writepage(struct page *page, 1545 struct writeback_control *wbc) 1546 { 1547 struct inode *inode = page->mapping->host; 1548 handle_t *handle = NULL; 1549 int ret = 0; 1550 int err; 1551 1552 if (ext4_journal_current_handle()) 1553 goto out_fail; 1554 1555 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 1556 if (IS_ERR(handle)) { 1557 ret = PTR_ERR(handle); 1558 goto out_fail; 1559 } 1560 1561 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 1562 ret = nobh_writepage(page, ext4_get_block, wbc); 1563 else 1564 ret = block_write_full_page(page, ext4_get_block, wbc); 1565 1566 err = ext4_journal_stop(handle); 1567 if (!ret) 1568 ret = err; 1569 return ret; 1570 1571 out_fail: 1572 redirty_page_for_writepage(wbc, page); 1573 unlock_page(page); 1574 return ret; 1575 } 1576 1577 static int ext4_journalled_writepage(struct page *page, 1578 struct writeback_control *wbc) 1579 { 1580 struct inode *inode = page->mapping->host; 1581 handle_t *handle = NULL; 1582 int ret = 0; 1583 int err; 1584 1585 if (ext4_journal_current_handle()) 1586 goto no_write; 1587 1588 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 1589 if (IS_ERR(handle)) { 1590 ret = PTR_ERR(handle); 1591 goto no_write; 1592 } 1593 1594 if (!page_has_buffers(page) || PageChecked(page)) { 1595 /* 1596 * It's mmapped pagecache. Add buffers and journal it. There 1597 * doesn't seem much point in redirtying the page here. 
1598 */ 1599 ClearPageChecked(page); 1600 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 1601 ext4_get_block); 1602 if (ret != 0) { 1603 ext4_journal_stop(handle); 1604 goto out_unlock; 1605 } 1606 ret = walk_page_buffers(handle, page_buffers(page), 0, 1607 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); 1608 1609 err = walk_page_buffers(handle, page_buffers(page), 0, 1610 PAGE_CACHE_SIZE, NULL, write_end_fn); 1611 if (ret == 0) 1612 ret = err; 1613 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1614 unlock_page(page); 1615 } else { 1616 /* 1617 * It may be a page full of checkpoint-mode buffers. We don't 1618 * really know unless we go poke around in the buffer_heads. 1619 * But block_write_full_page will do the right thing. 1620 */ 1621 ret = block_write_full_page(page, ext4_get_block, wbc); 1622 } 1623 err = ext4_journal_stop(handle); 1624 if (!ret) 1625 ret = err; 1626 out: 1627 return ret; 1628 1629 no_write: 1630 redirty_page_for_writepage(wbc, page); 1631 out_unlock: 1632 unlock_page(page); 1633 goto out; 1634 } 1635 1636 static int ext4_readpage(struct file *file, struct page *page) 1637 { 1638 return mpage_readpage(page, ext4_get_block); 1639 } 1640 1641 static int 1642 ext4_readpages(struct file *file, struct address_space *mapping, 1643 struct list_head *pages, unsigned nr_pages) 1644 { 1645 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 1646 } 1647 1648 static void ext4_invalidatepage(struct page *page, unsigned long offset) 1649 { 1650 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 1651 1652 /* 1653 * If it's a full truncate we just forget about the pending dirtying 1654 */ 1655 if (offset == 0) 1656 ClearPageChecked(page); 1657 1658 jbd2_journal_invalidatepage(journal, page, offset); 1659 } 1660 1661 static int ext4_releasepage(struct page *page, gfp_t wait) 1662 { 1663 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 1664 1665 WARN_ON(PageChecked(page)); 1666 if (!page_has_buffers(page)) 1667 return 0; 1668 
return jbd2_journal_try_to_free_buffers(journal, page, wait); 1669 } 1670 1671 /* 1672 * If the O_DIRECT write will extend the file then add this inode to the 1673 * orphan list. So recovery will truncate it back to the original size 1674 * if the machine crashes during the write. 1675 * 1676 * If the O_DIRECT write is intantiating holes inside i_size and the machine 1677 * crashes then stale disk data _may_ be exposed inside the file. 1678 */ 1679 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 1680 const struct iovec *iov, loff_t offset, 1681 unsigned long nr_segs) 1682 { 1683 struct file *file = iocb->ki_filp; 1684 struct inode *inode = file->f_mapping->host; 1685 struct ext4_inode_info *ei = EXT4_I(inode); 1686 handle_t *handle = NULL; 1687 ssize_t ret; 1688 int orphan = 0; 1689 size_t count = iov_length(iov, nr_segs); 1690 1691 if (rw == WRITE) { 1692 loff_t final_size = offset + count; 1693 1694 handle = ext4_journal_start(inode, DIO_CREDITS); 1695 if (IS_ERR(handle)) { 1696 ret = PTR_ERR(handle); 1697 goto out; 1698 } 1699 if (final_size > inode->i_size) { 1700 ret = ext4_orphan_add(handle, inode); 1701 if (ret) 1702 goto out_stop; 1703 orphan = 1; 1704 ei->i_disksize = inode->i_size; 1705 } 1706 } 1707 1708 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1709 offset, nr_segs, 1710 ext4_get_block, NULL); 1711 1712 /* 1713 * Reacquire the handle: ext4_get_block() can restart the transaction 1714 */ 1715 handle = ext4_journal_current_handle(); 1716 1717 out_stop: 1718 if (handle) { 1719 int err; 1720 1721 if (orphan && inode->i_nlink) 1722 ext4_orphan_del(handle, inode); 1723 if (orphan && ret > 0) { 1724 loff_t end = offset + ret; 1725 if (end > inode->i_size) { 1726 ei->i_disksize = end; 1727 i_size_write(inode, end); 1728 /* 1729 * We're going to return a positive `ret' 1730 * here due to non-zero-length I/O, so there's 1731 * no way of reporting error returns from 1732 * ext4_mark_inode_dirty() to userspace. 
So 1733 * ignore it. 1734 */ 1735 ext4_mark_inode_dirty(handle, inode); 1736 } 1737 } 1738 err = ext4_journal_stop(handle); 1739 if (ret == 0) 1740 ret = err; 1741 } 1742 out: 1743 return ret; 1744 } 1745 1746 /* 1747 * Pages can be marked dirty completely asynchronously from ext4's journalling 1748 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 1749 * much here because ->set_page_dirty is called under VFS locks. The page is 1750 * not necessarily locked. 1751 * 1752 * We cannot just dirty the page and leave attached buffers clean, because the 1753 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 1754 * or jbddirty because all the journalling code will explode. 1755 * 1756 * So what we do is to mark the page "pending dirty" and next time writepage 1757 * is called, propagate that into the buffers appropriately. 1758 */ 1759 static int ext4_journalled_set_page_dirty(struct page *page) 1760 { 1761 SetPageChecked(page); 1762 return __set_page_dirty_nobuffers(page); 1763 } 1764 1765 static const struct address_space_operations ext4_ordered_aops = { 1766 .readpage = ext4_readpage, 1767 .readpages = ext4_readpages, 1768 .writepage = ext4_ordered_writepage, 1769 .sync_page = block_sync_page, 1770 .write_begin = ext4_write_begin, 1771 .write_end = ext4_ordered_write_end, 1772 .bmap = ext4_bmap, 1773 .invalidatepage = ext4_invalidatepage, 1774 .releasepage = ext4_releasepage, 1775 .direct_IO = ext4_direct_IO, 1776 .migratepage = buffer_migrate_page, 1777 }; 1778 1779 static const struct address_space_operations ext4_writeback_aops = { 1780 .readpage = ext4_readpage, 1781 .readpages = ext4_readpages, 1782 .writepage = ext4_writeback_writepage, 1783 .sync_page = block_sync_page, 1784 .write_begin = ext4_write_begin, 1785 .write_end = ext4_writeback_write_end, 1786 .bmap = ext4_bmap, 1787 .invalidatepage = ext4_invalidatepage, 1788 .releasepage = ext4_releasepage, 1789 .direct_IO = ext4_direct_IO, 1790 .migratepage = 
buffer_migrate_page, 1791 }; 1792 1793 static const struct address_space_operations ext4_journalled_aops = { 1794 .readpage = ext4_readpage, 1795 .readpages = ext4_readpages, 1796 .writepage = ext4_journalled_writepage, 1797 .sync_page = block_sync_page, 1798 .write_begin = ext4_write_begin, 1799 .write_end = ext4_journalled_write_end, 1800 .set_page_dirty = ext4_journalled_set_page_dirty, 1801 .bmap = ext4_bmap, 1802 .invalidatepage = ext4_invalidatepage, 1803 .releasepage = ext4_releasepage, 1804 }; 1805 1806 void ext4_set_aops(struct inode *inode) 1807 { 1808 if (ext4_should_order_data(inode)) 1809 inode->i_mapping->a_ops = &ext4_ordered_aops; 1810 else if (ext4_should_writeback_data(inode)) 1811 inode->i_mapping->a_ops = &ext4_writeback_aops; 1812 else 1813 inode->i_mapping->a_ops = &ext4_journalled_aops; 1814 } 1815 1816 /* 1817 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 1818 * up to the end of the block which corresponds to `from'. 1819 * This required during truncate. We need to physically zero the tail end 1820 * of that block so it doesn't yield old data if the file is later grown. 1821 */ 1822 int ext4_block_truncate_page(handle_t *handle, struct page *page, 1823 struct address_space *mapping, loff_t from) 1824 { 1825 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 1826 unsigned offset = from & (PAGE_CACHE_SIZE-1); 1827 unsigned blocksize, length, pos; 1828 ext4_lblk_t iblock; 1829 struct inode *inode = mapping->host; 1830 struct buffer_head *bh; 1831 int err = 0; 1832 1833 blocksize = inode->i_sb->s_blocksize; 1834 length = blocksize - (offset & (blocksize - 1)); 1835 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 1836 1837 /* 1838 * For "nobh" option, we can only work if we don't need to 1839 * read-in the page - otherwise we create buffers to do the IO. 
1840 */ 1841 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && 1842 ext4_should_writeback_data(inode) && PageUptodate(page)) { 1843 zero_user_page(page, offset, length, KM_USER0); 1844 set_page_dirty(page); 1845 goto unlock; 1846 } 1847 1848 if (!page_has_buffers(page)) 1849 create_empty_buffers(page, blocksize, 0); 1850 1851 /* Find the buffer that contains "offset" */ 1852 bh = page_buffers(page); 1853 pos = blocksize; 1854 while (offset >= pos) { 1855 bh = bh->b_this_page; 1856 iblock++; 1857 pos += blocksize; 1858 } 1859 1860 err = 0; 1861 if (buffer_freed(bh)) { 1862 BUFFER_TRACE(bh, "freed: skip"); 1863 goto unlock; 1864 } 1865 1866 if (!buffer_mapped(bh)) { 1867 BUFFER_TRACE(bh, "unmapped"); 1868 ext4_get_block(inode, iblock, bh, 0); 1869 /* unmapped? It's a hole - nothing to do */ 1870 if (!buffer_mapped(bh)) { 1871 BUFFER_TRACE(bh, "still unmapped"); 1872 goto unlock; 1873 } 1874 } 1875 1876 /* Ok, it's mapped. Make sure it's up-to-date */ 1877 if (PageUptodate(page)) 1878 set_buffer_uptodate(bh); 1879 1880 if (!buffer_uptodate(bh)) { 1881 err = -EIO; 1882 ll_rw_block(READ, 1, &bh); 1883 wait_on_buffer(bh); 1884 /* Uhhuh. Read error. Complain and punt. */ 1885 if (!buffer_uptodate(bh)) 1886 goto unlock; 1887 } 1888 1889 if (ext4_should_journal_data(inode)) { 1890 BUFFER_TRACE(bh, "get write access"); 1891 err = ext4_journal_get_write_access(handle, bh); 1892 if (err) 1893 goto unlock; 1894 } 1895 1896 zero_user_page(page, offset, length, KM_USER0); 1897 1898 BUFFER_TRACE(bh, "zeroed end of block"); 1899 1900 err = 0; 1901 if (ext4_should_journal_data(inode)) { 1902 err = ext4_journal_dirty_metadata(handle, bh); 1903 } else { 1904 if (ext4_should_order_data(inode)) 1905 err = ext4_journal_dirty_data(handle, bh); 1906 mark_buffer_dirty(bh); 1907 } 1908 1909 unlock: 1910 unlock_page(page); 1911 page_cache_release(page); 1912 return err; 1913 } 1914 1915 /* 1916 * Probably it should be a library function... 
search for first non-zero word
 * or memcmp with zero_page, whatever is better for particular architecture.
 * Linus?
 */
static inline int all_zeroes(__le32 *p, __le32 *q)
{
	__le32 *cur;

	/* 1 when every word in [p, q) is zero (including an empty range) */
	for (cur = p; cur < q; cur++) {
		if (*cur)
			return 0;
	}
	return 1;
}

/**
 *	ext4_find_shared - find the indirect blocks for partial truncation.
 *	@inode:	  inode in question
 *	@depth:	  depth of the affected branch
 *	@offsets: offsets of pointers in that branch (see ext4_block_to_path)
 *	@chain:	  place to store the pointers to partial indirect blocks
 *	@top:	  place to the (detached) top of branch
 *
 *	This is a helper function used by ext4_truncate().
 *
 *	When we do truncate() we may have to clean the ends of several
 *	indirect blocks but leave the blocks themselves alive. Block is
 *	partially truncated if some data below the new i_size is referred
 *	from it (and it is on the path to the first completely truncated
 *	data block, indeed).  We have to free the top of that path along
 *	with everything to the right of the path. Since no allocation
 *	past the truncation point is possible until ext4_truncate()
 *	finishes, we may safely do the latter, but top of branch may
 *	require special attention - pageout below the truncation point
 *	might try to populate it.
 *
 *	We atomically detach the top of branch from the tree, store the
 *	block number of its root in *@top, pointers to buffer_heads of
 *	partially truncated blocks - in @chain[].bh and pointers to
 *	their last elements that should not be removed - in
 *	@chain[].p. Return value is the pointer to last filled element
 *	of @chain.
 *
 *	The work left to caller to do the actual freeing of subtrees:
 *		a) free the subtree starting from *@top
 *		b) free the subtrees whose roots are stored in
 *			(@chain[i].p+1 ..
end of @chain[i].bh->b_data) 1960 * c) free the subtrees growing from the inode past the @chain[0]. 1961 * (no partially truncated stuff there). */ 1962 1963 static Indirect *ext4_find_shared(struct inode *inode, int depth, 1964 ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) 1965 { 1966 Indirect *partial, *p; 1967 int k, err; 1968 1969 *top = 0; 1970 /* Make k index the deepest non-null offest + 1 */ 1971 for (k = depth; k > 1 && !offsets[k-1]; k--) 1972 ; 1973 partial = ext4_get_branch(inode, k, offsets, chain, &err); 1974 /* Writer: pointers */ 1975 if (!partial) 1976 partial = chain + k-1; 1977 /* 1978 * If the branch acquired continuation since we've looked at it - 1979 * fine, it should all survive and (new) top doesn't belong to us. 1980 */ 1981 if (!partial->key && *partial->p) 1982 /* Writer: end */ 1983 goto no_top; 1984 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) 1985 ; 1986 /* 1987 * OK, we've found the last block that must survive. The rest of our 1988 * branch should be detached before unlocking. However, if that rest 1989 * of branch is all ours and does not grow immediately from the inode 1990 * it's easier to cheat and just decrement partial->p. 1991 */ 1992 if (p == chain + k - 1 && p > chain) { 1993 p->p--; 1994 } else { 1995 *top = *p->p; 1996 /* Nope, don't do this in ext4. Must leave the tree intact */ 1997 #if 0 1998 *p->p = 0; 1999 #endif 2000 } 2001 /* Writer: end */ 2002 2003 while(partial > p) { 2004 brelse(partial->bh); 2005 partial--; 2006 } 2007 no_top: 2008 return partial; 2009 } 2010 2011 /* 2012 * Zero a number of block pointers in either an inode or an indirect block. 2013 * If we restart the transaction we must again get write access to the 2014 * indirect block for further modification. 2015 * 2016 * We release `count' blocks on disk, but (last - first) may be greater 2017 * than `count' because there can be holes in there. 
2018 */ 2019 static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 2020 struct buffer_head *bh, ext4_fsblk_t block_to_free, 2021 unsigned long count, __le32 *first, __le32 *last) 2022 { 2023 __le32 *p; 2024 if (try_to_extend_transaction(handle, inode)) { 2025 if (bh) { 2026 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 2027 ext4_journal_dirty_metadata(handle, bh); 2028 } 2029 ext4_mark_inode_dirty(handle, inode); 2030 ext4_journal_test_restart(handle, inode); 2031 if (bh) { 2032 BUFFER_TRACE(bh, "retaking write access"); 2033 ext4_journal_get_write_access(handle, bh); 2034 } 2035 } 2036 2037 /* 2038 * Any buffers which are on the journal will be in memory. We find 2039 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget() 2040 * on them. We've already detached each block from the file, so 2041 * bforget() in jbd2_journal_forget() should be safe. 2042 * 2043 * AKPM: turn on bforget in jbd2_journal_forget()!!! 2044 */ 2045 for (p = first; p < last; p++) { 2046 u32 nr = le32_to_cpu(*p); 2047 if (nr) { 2048 struct buffer_head *tbh; 2049 2050 *p = 0; 2051 tbh = sb_find_get_block(inode->i_sb, nr); 2052 ext4_forget(handle, 0, inode, tbh, nr); 2053 } 2054 } 2055 2056 ext4_free_blocks(handle, inode, block_to_free, count, 0); 2057 } 2058 2059 /** 2060 * ext4_free_data - free a list of data blocks 2061 * @handle: handle for this transaction 2062 * @inode: inode we are dealing with 2063 * @this_bh: indirect buffer_head which contains *@first and *@last 2064 * @first: array of block numbers 2065 * @last: points immediately past the end of array 2066 * 2067 * We are freeing all blocks refered from that array (numbers are stored as 2068 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 2069 * 2070 * We accumulate contiguous runs of blocks to free. 
Conveniently, if these 2071 * blocks are contiguous then releasing them at one time will only affect one 2072 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 2073 * actually use a lot of journal space. 2074 * 2075 * @this_bh will be %NULL if @first and @last point into the inode's direct 2076 * block pointers. 2077 */ 2078 static void ext4_free_data(handle_t *handle, struct inode *inode, 2079 struct buffer_head *this_bh, 2080 __le32 *first, __le32 *last) 2081 { 2082 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ 2083 unsigned long count = 0; /* Number of blocks in the run */ 2084 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 2085 corresponding to 2086 block_to_free */ 2087 ext4_fsblk_t nr; /* Current block # */ 2088 __le32 *p; /* Pointer into inode/ind 2089 for current block */ 2090 int err; 2091 2092 if (this_bh) { /* For indirect block */ 2093 BUFFER_TRACE(this_bh, "get_write_access"); 2094 err = ext4_journal_get_write_access(handle, this_bh); 2095 /* Important: if we can't update the indirect pointers 2096 * to the blocks, we can't free them. 
*/ 2097 if (err) 2098 return; 2099 } 2100 2101 for (p = first; p < last; p++) { 2102 nr = le32_to_cpu(*p); 2103 if (nr) { 2104 /* accumulate blocks to free if they're contiguous */ 2105 if (count == 0) { 2106 block_to_free = nr; 2107 block_to_free_p = p; 2108 count = 1; 2109 } else if (nr == block_to_free + count) { 2110 count++; 2111 } else { 2112 ext4_clear_blocks(handle, inode, this_bh, 2113 block_to_free, 2114 count, block_to_free_p, p); 2115 block_to_free = nr; 2116 block_to_free_p = p; 2117 count = 1; 2118 } 2119 } 2120 } 2121 2122 if (count > 0) 2123 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 2124 count, block_to_free_p, p); 2125 2126 if (this_bh) { 2127 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); 2128 ext4_journal_dirty_metadata(handle, this_bh); 2129 } 2130 } 2131 2132 /** 2133 * ext4_free_branches - free an array of branches 2134 * @handle: JBD handle for this transaction 2135 * @inode: inode we are dealing with 2136 * @parent_bh: the buffer_head which contains *@first and *@last 2137 * @first: array of block numbers 2138 * @last: pointer immediately past the end of array 2139 * @depth: depth of the branches to free 2140 * 2141 * We are freeing all blocks refered from these branches (numbers are 2142 * stored as little-endian 32-bit) and updating @inode->i_blocks 2143 * appropriately. 2144 */ 2145 static void ext4_free_branches(handle_t *handle, struct inode *inode, 2146 struct buffer_head *parent_bh, 2147 __le32 *first, __le32 *last, int depth) 2148 { 2149 ext4_fsblk_t nr; 2150 __le32 *p; 2151 2152 if (is_handle_aborted(handle)) 2153 return; 2154 2155 if (depth--) { 2156 struct buffer_head *bh; 2157 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 2158 p = last; 2159 while (--p >= first) { 2160 nr = le32_to_cpu(*p); 2161 if (!nr) 2162 continue; /* A hole */ 2163 2164 /* Go read the buffer for the next level down */ 2165 bh = sb_bread(inode->i_sb, nr); 2166 2167 /* 2168 * A read failure? 
Report error and clear slot 2169 * (should be rare). 2170 */ 2171 if (!bh) { 2172 ext4_error(inode->i_sb, "ext4_free_branches", 2173 "Read failure, inode=%lu, block=%llu", 2174 inode->i_ino, nr); 2175 continue; 2176 } 2177 2178 /* This zaps the entire block. Bottom up. */ 2179 BUFFER_TRACE(bh, "free child branches"); 2180 ext4_free_branches(handle, inode, bh, 2181 (__le32*)bh->b_data, 2182 (__le32*)bh->b_data + addr_per_block, 2183 depth); 2184 2185 /* 2186 * We've probably journalled the indirect block several 2187 * times during the truncate. But it's no longer 2188 * needed and we now drop it from the transaction via 2189 * jbd2_journal_revoke(). 2190 * 2191 * That's easy if it's exclusively part of this 2192 * transaction. But if it's part of the committing 2193 * transaction then jbd2_journal_forget() will simply 2194 * brelse() it. That means that if the underlying 2195 * block is reallocated in ext4_get_block(), 2196 * unmap_underlying_metadata() will find this block 2197 * and will try to get rid of it. damn, damn. 2198 * 2199 * If this block has already been committed to the 2200 * journal, a revoke record will be written. And 2201 * revoke records must be emitted *before* clearing 2202 * this block's bit in the bitmaps. 2203 */ 2204 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 2205 2206 /* 2207 * Everything below this this pointer has been 2208 * released. Now let this top-of-subtree go. 2209 * 2210 * We want the freeing of this indirect block to be 2211 * atomic in the journal with the updating of the 2212 * bitmap block which owns it. So make some room in 2213 * the journal. 2214 * 2215 * We zero the parent pointer *after* freeing its 2216 * pointee in the bitmaps, so if extend_transaction() 2217 * for some reason fails to put the bitmap changes and 2218 * the release into the same transaction, recovery 2219 * will merely complain about releasing a free block, 2220 * rather than leaking blocks. 
2221 */ 2222 if (is_handle_aborted(handle)) 2223 return; 2224 if (try_to_extend_transaction(handle, inode)) { 2225 ext4_mark_inode_dirty(handle, inode); 2226 ext4_journal_test_restart(handle, inode); 2227 } 2228 2229 ext4_free_blocks(handle, inode, nr, 1, 1); 2230 2231 if (parent_bh) { 2232 /* 2233 * The block which we have just freed is 2234 * pointed to by an indirect block: journal it 2235 */ 2236 BUFFER_TRACE(parent_bh, "get_write_access"); 2237 if (!ext4_journal_get_write_access(handle, 2238 parent_bh)){ 2239 *p = 0; 2240 BUFFER_TRACE(parent_bh, 2241 "call ext4_journal_dirty_metadata"); 2242 ext4_journal_dirty_metadata(handle, 2243 parent_bh); 2244 } 2245 } 2246 } 2247 } else { 2248 /* We have reached the bottom of the tree. */ 2249 BUFFER_TRACE(parent_bh, "free data blocks"); 2250 ext4_free_data(handle, inode, parent_bh, first, last); 2251 } 2252 } 2253 2254 /* 2255 * ext4_truncate() 2256 * 2257 * We block out ext4_get_block() block instantiations across the entire 2258 * transaction, and VFS/VM ensures that ext4_truncate() cannot run 2259 * simultaneously on behalf of the same inode. 2260 * 2261 * As we work through the truncate and commmit bits of it to the journal there 2262 * is one core, guiding principle: the file's tree must always be consistent on 2263 * disk. We must be able to restart the truncate after a crash. 2264 * 2265 * The file's tree may be transiently inconsistent in memory (although it 2266 * probably isn't), but whenever we close off and commit a journal transaction, 2267 * the contents of (the filesystem + the journal) must be consistent and 2268 * restartable. It's pretty simple, really: bottom up, right to left (although 2269 * left-to-right works OK too). 2270 * 2271 * Note that at recovery time, journal replay occurs *before* the restart of 2272 * truncate against the orphan inode list. 2273 * 2274 * The committed inode has the new, desired i_size (which is the same as 2275 * i_disksize in this case). 
After a crash, ext4_orphan_cleanup() will see 2276 * that this inode's truncate did not complete and it will again call 2277 * ext4_truncate() to have another go. So there will be instantiated blocks 2278 * to the right of the truncation point in a crashed ext4 filesystem. But 2279 * that's fine - as long as they are linked from the inode, the post-crash 2280 * ext4_truncate() run will find them and release them. 2281 */ 2282 void ext4_truncate(struct inode *inode) 2283 { 2284 handle_t *handle; 2285 struct ext4_inode_info *ei = EXT4_I(inode); 2286 __le32 *i_data = ei->i_data; 2287 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 2288 struct address_space *mapping = inode->i_mapping; 2289 ext4_lblk_t offsets[4]; 2290 Indirect chain[4]; 2291 Indirect *partial; 2292 __le32 nr = 0; 2293 int n; 2294 ext4_lblk_t last_block; 2295 unsigned blocksize = inode->i_sb->s_blocksize; 2296 struct page *page; 2297 2298 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 2299 S_ISLNK(inode->i_mode))) 2300 return; 2301 if (ext4_inode_is_fast_symlink(inode)) 2302 return; 2303 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 2304 return; 2305 2306 /* 2307 * We have to lock the EOF page here, because lock_page() nests 2308 * outside jbd2_journal_start(). 2309 */ 2310 if ((inode->i_size & (blocksize - 1)) == 0) { 2311 /* Block boundary? Nothing to do */ 2312 page = NULL; 2313 } else { 2314 page = grab_cache_page(mapping, 2315 inode->i_size >> PAGE_CACHE_SHIFT); 2316 if (!page) 2317 return; 2318 } 2319 2320 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 2321 ext4_ext_truncate(inode, page); 2322 return; 2323 } 2324 2325 handle = start_transaction(inode); 2326 if (IS_ERR(handle)) { 2327 if (page) { 2328 clear_highpage(page); 2329 flush_dcache_page(page); 2330 unlock_page(page); 2331 page_cache_release(page); 2332 } 2333 return; /* AKPM: return what? 
*/ 2334 } 2335 2336 last_block = (inode->i_size + blocksize-1) 2337 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 2338 2339 if (page) 2340 ext4_block_truncate_page(handle, page, mapping, inode->i_size); 2341 2342 n = ext4_block_to_path(inode, last_block, offsets, NULL); 2343 if (n == 0) 2344 goto out_stop; /* error */ 2345 2346 /* 2347 * OK. This truncate is going to happen. We add the inode to the 2348 * orphan list, so that if this truncate spans multiple transactions, 2349 * and we crash, we will resume the truncate when the filesystem 2350 * recovers. It also marks the inode dirty, to catch the new size. 2351 * 2352 * Implication: the file must always be in a sane, consistent 2353 * truncatable state while each transaction commits. 2354 */ 2355 if (ext4_orphan_add(handle, inode)) 2356 goto out_stop; 2357 2358 /* 2359 * The orphan list entry will now protect us from any crash which 2360 * occurs before the truncate completes, so it is now safe to propagate 2361 * the new, shorter inode size (held for now in i_size) into the 2362 * on-disk inode. We do this via i_disksize, which is the value which 2363 * ext4 *really* writes onto the disk inode. 2364 */ 2365 ei->i_disksize = inode->i_size; 2366 2367 /* 2368 * From here we block out all ext4_get_block() callers who want to 2369 * modify the block allocation tree. 2370 */ 2371 down_write(&ei->i_data_sem); 2372 2373 if (n == 1) { /* direct blocks */ 2374 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 2375 i_data + EXT4_NDIR_BLOCKS); 2376 goto do_indirects; 2377 } 2378 2379 partial = ext4_find_shared(inode, n, offsets, chain, &nr); 2380 /* Kill the top of shared branch (not detached) */ 2381 if (nr) { 2382 if (partial == chain) { 2383 /* Shared branch grows from the inode */ 2384 ext4_free_branches(handle, inode, NULL, 2385 &nr, &nr+1, (chain+n-1) - partial); 2386 *partial->p = 0; 2387 /* 2388 * We mark the inode dirty prior to restart, 2389 * and prior to stop. No need for it here. 
2390 */ 2391 } else { 2392 /* Shared branch grows from an indirect block */ 2393 BUFFER_TRACE(partial->bh, "get_write_access"); 2394 ext4_free_branches(handle, inode, partial->bh, 2395 partial->p, 2396 partial->p+1, (chain+n-1) - partial); 2397 } 2398 } 2399 /* Clear the ends of indirect blocks on the shared branch */ 2400 while (partial > chain) { 2401 ext4_free_branches(handle, inode, partial->bh, partial->p + 1, 2402 (__le32*)partial->bh->b_data+addr_per_block, 2403 (chain+n-1) - partial); 2404 BUFFER_TRACE(partial->bh, "call brelse"); 2405 brelse (partial->bh); 2406 partial--; 2407 } 2408 do_indirects: 2409 /* Kill the remaining (whole) subtrees */ 2410 switch (offsets[0]) { 2411 default: 2412 nr = i_data[EXT4_IND_BLOCK]; 2413 if (nr) { 2414 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 2415 i_data[EXT4_IND_BLOCK] = 0; 2416 } 2417 case EXT4_IND_BLOCK: 2418 nr = i_data[EXT4_DIND_BLOCK]; 2419 if (nr) { 2420 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 2421 i_data[EXT4_DIND_BLOCK] = 0; 2422 } 2423 case EXT4_DIND_BLOCK: 2424 nr = i_data[EXT4_TIND_BLOCK]; 2425 if (nr) { 2426 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 2427 i_data[EXT4_TIND_BLOCK] = 0; 2428 } 2429 case EXT4_TIND_BLOCK: 2430 ; 2431 } 2432 2433 ext4_discard_reservation(inode); 2434 2435 up_write(&ei->i_data_sem); 2436 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 2437 ext4_mark_inode_dirty(handle, inode); 2438 2439 /* 2440 * In a multi-transaction truncate, we only make the final transaction 2441 * synchronous 2442 */ 2443 if (IS_SYNC(inode)) 2444 handle->h_sync = 1; 2445 out_stop: 2446 /* 2447 * If this was a simple ftruncate(), and the file will remain alive 2448 * then we need to clear up the orphan record which we created above. 2449 * However, if this was a real unlink then we were called by 2450 * ext4_delete_inode(), and we allow that function to clean up the 2451 * orphan info for us. 
2452 */ 2453 if (inode->i_nlink) 2454 ext4_orphan_del(handle, inode); 2455 2456 ext4_journal_stop(handle); 2457 } 2458 2459 static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, 2460 unsigned long ino, struct ext4_iloc *iloc) 2461 { 2462 unsigned long desc, group_desc; 2463 ext4_group_t block_group; 2464 unsigned long offset; 2465 ext4_fsblk_t block; 2466 struct buffer_head *bh; 2467 struct ext4_group_desc * gdp; 2468 2469 if (!ext4_valid_inum(sb, ino)) { 2470 /* 2471 * This error is already checked for in namei.c unless we are 2472 * looking at an NFS filehandle, in which case no error 2473 * report is needed 2474 */ 2475 return 0; 2476 } 2477 2478 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 2479 if (block_group >= EXT4_SB(sb)->s_groups_count) { 2480 ext4_error(sb,"ext4_get_inode_block","group >= groups count"); 2481 return 0; 2482 } 2483 smp_rmb(); 2484 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); 2485 desc = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2486 bh = EXT4_SB(sb)->s_group_desc[group_desc]; 2487 if (!bh) { 2488 ext4_error (sb, "ext4_get_inode_block", 2489 "Descriptor not loaded"); 2490 return 0; 2491 } 2492 2493 gdp = (struct ext4_group_desc *)((__u8 *)bh->b_data + 2494 desc * EXT4_DESC_SIZE(sb)); 2495 /* 2496 * Figure out the offset within the block group inode table 2497 */ 2498 offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) * 2499 EXT4_INODE_SIZE(sb); 2500 block = ext4_inode_table(sb, gdp) + 2501 (offset >> EXT4_BLOCK_SIZE_BITS(sb)); 2502 2503 iloc->block_group = block_group; 2504 iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1); 2505 return block; 2506 } 2507 2508 /* 2509 * ext4_get_inode_loc returns with an extra refcount against the inode's 2510 * underlying buffer_head on success. If 'in_mem' is true, we have all 2511 * data in memory that is needed to recreate the on-disk version of this 2512 * inode. 
2513 */ 2514 static int __ext4_get_inode_loc(struct inode *inode, 2515 struct ext4_iloc *iloc, int in_mem) 2516 { 2517 ext4_fsblk_t block; 2518 struct buffer_head *bh; 2519 2520 block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc); 2521 if (!block) 2522 return -EIO; 2523 2524 bh = sb_getblk(inode->i_sb, block); 2525 if (!bh) { 2526 ext4_error (inode->i_sb, "ext4_get_inode_loc", 2527 "unable to read inode block - " 2528 "inode=%lu, block=%llu", 2529 inode->i_ino, block); 2530 return -EIO; 2531 } 2532 if (!buffer_uptodate(bh)) { 2533 lock_buffer(bh); 2534 if (buffer_uptodate(bh)) { 2535 /* someone brought it uptodate while we waited */ 2536 unlock_buffer(bh); 2537 goto has_buffer; 2538 } 2539 2540 /* 2541 * If we have all information of the inode in memory and this 2542 * is the only valid inode in the block, we need not read the 2543 * block. 2544 */ 2545 if (in_mem) { 2546 struct buffer_head *bitmap_bh; 2547 struct ext4_group_desc *desc; 2548 int inodes_per_buffer; 2549 int inode_offset, i; 2550 ext4_group_t block_group; 2551 int start; 2552 2553 block_group = (inode->i_ino - 1) / 2554 EXT4_INODES_PER_GROUP(inode->i_sb); 2555 inodes_per_buffer = bh->b_size / 2556 EXT4_INODE_SIZE(inode->i_sb); 2557 inode_offset = ((inode->i_ino - 1) % 2558 EXT4_INODES_PER_GROUP(inode->i_sb)); 2559 start = inode_offset & ~(inodes_per_buffer - 1); 2560 2561 /* Is the inode bitmap in cache? */ 2562 desc = ext4_get_group_desc(inode->i_sb, 2563 block_group, NULL); 2564 if (!desc) 2565 goto make_io; 2566 2567 bitmap_bh = sb_getblk(inode->i_sb, 2568 ext4_inode_bitmap(inode->i_sb, desc)); 2569 if (!bitmap_bh) 2570 goto make_io; 2571 2572 /* 2573 * If the inode bitmap isn't in cache then the 2574 * optimisation may end up performing two reads instead 2575 * of one, so skip it. 
2576 */ 2577 if (!buffer_uptodate(bitmap_bh)) { 2578 brelse(bitmap_bh); 2579 goto make_io; 2580 } 2581 for (i = start; i < start + inodes_per_buffer; i++) { 2582 if (i == inode_offset) 2583 continue; 2584 if (ext4_test_bit(i, bitmap_bh->b_data)) 2585 break; 2586 } 2587 brelse(bitmap_bh); 2588 if (i == start + inodes_per_buffer) { 2589 /* all other inodes are free, so skip I/O */ 2590 memset(bh->b_data, 0, bh->b_size); 2591 set_buffer_uptodate(bh); 2592 unlock_buffer(bh); 2593 goto has_buffer; 2594 } 2595 } 2596 2597 make_io: 2598 /* 2599 * There are other valid inodes in the buffer, this inode 2600 * has in-inode xattrs, or we don't have this inode in memory. 2601 * Read the block from disk. 2602 */ 2603 get_bh(bh); 2604 bh->b_end_io = end_buffer_read_sync; 2605 submit_bh(READ_META, bh); 2606 wait_on_buffer(bh); 2607 if (!buffer_uptodate(bh)) { 2608 ext4_error(inode->i_sb, "ext4_get_inode_loc", 2609 "unable to read inode block - " 2610 "inode=%lu, block=%llu", 2611 inode->i_ino, block); 2612 brelse(bh); 2613 return -EIO; 2614 } 2615 } 2616 has_buffer: 2617 iloc->bh = bh; 2618 return 0; 2619 } 2620 2621 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 2622 { 2623 /* We have all inode data except xattrs in memory here. 
*/ 2624 return __ext4_get_inode_loc(inode, iloc, 2625 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 2626 } 2627 2628 void ext4_set_inode_flags(struct inode *inode) 2629 { 2630 unsigned int flags = EXT4_I(inode)->i_flags; 2631 2632 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 2633 if (flags & EXT4_SYNC_FL) 2634 inode->i_flags |= S_SYNC; 2635 if (flags & EXT4_APPEND_FL) 2636 inode->i_flags |= S_APPEND; 2637 if (flags & EXT4_IMMUTABLE_FL) 2638 inode->i_flags |= S_IMMUTABLE; 2639 if (flags & EXT4_NOATIME_FL) 2640 inode->i_flags |= S_NOATIME; 2641 if (flags & EXT4_DIRSYNC_FL) 2642 inode->i_flags |= S_DIRSYNC; 2643 } 2644 2645 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 2646 void ext4_get_inode_flags(struct ext4_inode_info *ei) 2647 { 2648 unsigned int flags = ei->vfs_inode.i_flags; 2649 2650 ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 2651 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); 2652 if (flags & S_SYNC) 2653 ei->i_flags |= EXT4_SYNC_FL; 2654 if (flags & S_APPEND) 2655 ei->i_flags |= EXT4_APPEND_FL; 2656 if (flags & S_IMMUTABLE) 2657 ei->i_flags |= EXT4_IMMUTABLE_FL; 2658 if (flags & S_NOATIME) 2659 ei->i_flags |= EXT4_NOATIME_FL; 2660 if (flags & S_DIRSYNC) 2661 ei->i_flags |= EXT4_DIRSYNC_FL; 2662 } 2663 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 2664 struct ext4_inode_info *ei) 2665 { 2666 blkcnt_t i_blocks ; 2667 struct inode *inode = &(ei->vfs_inode); 2668 struct super_block *sb = inode->i_sb; 2669 2670 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 2671 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { 2672 /* we are using combined 48 bit field */ 2673 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 2674 le32_to_cpu(raw_inode->i_blocks_lo); 2675 if (ei->i_flags & EXT4_HUGE_FILE_FL) { 2676 /* i_blocks represent file system block size */ 2677 return i_blocks << (inode->i_blkbits - 9); 2678 } else { 2679 return i_blocks; 2680 } 2681 } else { 2682 return le32_to_cpu(raw_inode->i_blocks_lo); 2683 } 
2684 } 2685 2686 void ext4_read_inode(struct inode * inode) 2687 { 2688 struct ext4_iloc iloc; 2689 struct ext4_inode *raw_inode; 2690 struct ext4_inode_info *ei = EXT4_I(inode); 2691 struct buffer_head *bh; 2692 int block; 2693 2694 #ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 2695 ei->i_acl = EXT4_ACL_NOT_CACHED; 2696 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 2697 #endif 2698 ei->i_block_alloc_info = NULL; 2699 2700 if (__ext4_get_inode_loc(inode, &iloc, 0)) 2701 goto bad_inode; 2702 bh = iloc.bh; 2703 raw_inode = ext4_raw_inode(&iloc); 2704 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 2705 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 2706 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 2707 if(!(test_opt (inode->i_sb, NO_UID32))) { 2708 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 2709 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 2710 } 2711 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 2712 2713 ei->i_state = 0; 2714 ei->i_dir_start_lookup = 0; 2715 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 2716 /* We now have enough fields to check if the inode was active or not. 2717 * This is needed because nfsd might try to access dead inodes 2718 * the test is that same one that e2fsck uses 2719 * NeilBrown 1999oct15 2720 */ 2721 if (inode->i_nlink == 0) { 2722 if (inode->i_mode == 0 || 2723 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 2724 /* this inode is deleted */ 2725 brelse (bh); 2726 goto bad_inode; 2727 } 2728 /* The only unlinked inodes we let through here have 2729 * valid i_mode and are being read by the orphan 2730 * recovery code: that's fine, we're about to complete 2731 * the process of deleting those. 
*/ 2732 } 2733 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 2734 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 2735 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 2736 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 2737 cpu_to_le32(EXT4_OS_HURD)) { 2738 ei->i_file_acl |= 2739 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 2740 } 2741 inode->i_size = ext4_isize(raw_inode); 2742 ei->i_disksize = inode->i_size; 2743 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 2744 ei->i_block_group = iloc.block_group; 2745 /* 2746 * NOTE! The in-memory inode i_data array is in little-endian order 2747 * even on big-endian machines: we do NOT byteswap the block numbers! 2748 */ 2749 for (block = 0; block < EXT4_N_BLOCKS; block++) 2750 ei->i_data[block] = raw_inode->i_block[block]; 2751 INIT_LIST_HEAD(&ei->i_orphan); 2752 2753 if (inode->i_ino >= EXT4_FIRST_INO(inode->i_sb) + 1 && 2754 EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 2755 /* 2756 * When mke2fs creates big inodes it does not zero out 2757 * the unused bytes above EXT4_GOOD_OLD_INODE_SIZE, 2758 * so ignore those first few inodes. 2759 */ 2760 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 2761 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 2762 EXT4_INODE_SIZE(inode->i_sb)) { 2763 brelse (bh); 2764 goto bad_inode; 2765 } 2766 if (ei->i_extra_isize == 0) { 2767 /* The extra space is currently unused. Use it. 
*/ 2768 ei->i_extra_isize = sizeof(struct ext4_inode) - 2769 EXT4_GOOD_OLD_INODE_SIZE; 2770 } else { 2771 __le32 *magic = (void *)raw_inode + 2772 EXT4_GOOD_OLD_INODE_SIZE + 2773 ei->i_extra_isize; 2774 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 2775 ei->i_state |= EXT4_STATE_XATTR; 2776 } 2777 } else 2778 ei->i_extra_isize = 0; 2779 2780 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 2781 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 2782 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 2783 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 2784 2785 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 2786 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 2787 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 2788 inode->i_version |= 2789 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 2790 } 2791 2792 if (S_ISREG(inode->i_mode)) { 2793 inode->i_op = &ext4_file_inode_operations; 2794 inode->i_fop = &ext4_file_operations; 2795 ext4_set_aops(inode); 2796 } else if (S_ISDIR(inode->i_mode)) { 2797 inode->i_op = &ext4_dir_inode_operations; 2798 inode->i_fop = &ext4_dir_operations; 2799 } else if (S_ISLNK(inode->i_mode)) { 2800 if (ext4_inode_is_fast_symlink(inode)) 2801 inode->i_op = &ext4_fast_symlink_inode_operations; 2802 else { 2803 inode->i_op = &ext4_symlink_inode_operations; 2804 ext4_set_aops(inode); 2805 } 2806 } else { 2807 inode->i_op = &ext4_special_inode_operations; 2808 if (raw_inode->i_block[0]) 2809 init_special_inode(inode, inode->i_mode, 2810 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 2811 else 2812 init_special_inode(inode, inode->i_mode, 2813 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 2814 } 2815 brelse (iloc.bh); 2816 ext4_set_inode_flags(inode); 2817 return; 2818 2819 bad_inode: 2820 make_bad_inode(inode); 2821 return; 2822 } 2823 2824 static int ext4_inode_blocks_set(handle_t *handle, 2825 struct ext4_inode *raw_inode, 2826 struct ext4_inode_info *ei) 2827 { 2828 struct inode *inode = 
&(ei->vfs_inode); 2829 u64 i_blocks = inode->i_blocks; 2830 struct super_block *sb = inode->i_sb; 2831 int err = 0; 2832 2833 if (i_blocks <= ~0U) { 2834 /* 2835 * i_blocks can be represnted in a 32 bit variable 2836 * as multiple of 512 bytes 2837 */ 2838 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 2839 raw_inode->i_blocks_high = 0; 2840 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 2841 } else if (i_blocks <= 0xffffffffffffULL) { 2842 /* 2843 * i_blocks can be represented in a 48 bit variable 2844 * as multiple of 512 bytes 2845 */ 2846 err = ext4_update_rocompat_feature(handle, sb, 2847 EXT4_FEATURE_RO_COMPAT_HUGE_FILE); 2848 if (err) 2849 goto err_out; 2850 /* i_block is stored in the split 48 bit fields */ 2851 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 2852 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 2853 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 2854 } else { 2855 /* 2856 * i_blocks should be represented in a 48 bit variable 2857 * as multiple of file system block size 2858 */ 2859 err = ext4_update_rocompat_feature(handle, sb, 2860 EXT4_FEATURE_RO_COMPAT_HUGE_FILE); 2861 if (err) 2862 goto err_out; 2863 ei->i_flags |= EXT4_HUGE_FILE_FL; 2864 /* i_block is stored in file system block size */ 2865 i_blocks = i_blocks >> (inode->i_blkbits - 9); 2866 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 2867 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 2868 } 2869 err_out: 2870 return err; 2871 } 2872 2873 /* 2874 * Post the struct inode info into an on-disk inode location in the 2875 * buffer-cache. This gobbles the caller's reference to the 2876 * buffer_head in the inode location struct. 2877 * 2878 * The caller must have write access to iloc->bh. 
2879 */ 2880 static int ext4_do_update_inode(handle_t *handle, 2881 struct inode *inode, 2882 struct ext4_iloc *iloc) 2883 { 2884 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 2885 struct ext4_inode_info *ei = EXT4_I(inode); 2886 struct buffer_head *bh = iloc->bh; 2887 int err = 0, rc, block; 2888 2889 /* For fields not not tracking in the in-memory inode, 2890 * initialise them to zero for new inodes. */ 2891 if (ei->i_state & EXT4_STATE_NEW) 2892 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 2893 2894 ext4_get_inode_flags(ei); 2895 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 2896 if(!(test_opt(inode->i_sb, NO_UID32))) { 2897 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 2898 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 2899 /* 2900 * Fix up interoperability with old kernels. Otherwise, old inodes get 2901 * re-used with the upper 16 bits of the uid/gid intact 2902 */ 2903 if(!ei->i_dtime) { 2904 raw_inode->i_uid_high = 2905 cpu_to_le16(high_16_bits(inode->i_uid)); 2906 raw_inode->i_gid_high = 2907 cpu_to_le16(high_16_bits(inode->i_gid)); 2908 } else { 2909 raw_inode->i_uid_high = 0; 2910 raw_inode->i_gid_high = 0; 2911 } 2912 } else { 2913 raw_inode->i_uid_low = 2914 cpu_to_le16(fs_high2lowuid(inode->i_uid)); 2915 raw_inode->i_gid_low = 2916 cpu_to_le16(fs_high2lowgid(inode->i_gid)); 2917 raw_inode->i_uid_high = 0; 2918 raw_inode->i_gid_high = 0; 2919 } 2920 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 2921 2922 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 2923 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 2924 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 2925 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 2926 2927 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 2928 goto out_brelse; 2929 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 2930 raw_inode->i_flags = cpu_to_le32(ei->i_flags); 2931 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 2932 cpu_to_le32(EXT4_OS_HURD)) 2933 
raw_inode->i_file_acl_high = 2934 cpu_to_le16(ei->i_file_acl >> 32); 2935 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 2936 ext4_isize_set(raw_inode, ei->i_disksize); 2937 if (ei->i_disksize > 0x7fffffffULL) { 2938 struct super_block *sb = inode->i_sb; 2939 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 2940 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || 2941 EXT4_SB(sb)->s_es->s_rev_level == 2942 cpu_to_le32(EXT4_GOOD_OLD_REV)) { 2943 /* If this is the first large file 2944 * created, add a flag to the superblock. 2945 */ 2946 err = ext4_journal_get_write_access(handle, 2947 EXT4_SB(sb)->s_sbh); 2948 if (err) 2949 goto out_brelse; 2950 ext4_update_dynamic_rev(sb); 2951 EXT4_SET_RO_COMPAT_FEATURE(sb, 2952 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 2953 sb->s_dirt = 1; 2954 handle->h_sync = 1; 2955 err = ext4_journal_dirty_metadata(handle, 2956 EXT4_SB(sb)->s_sbh); 2957 } 2958 } 2959 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 2960 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 2961 if (old_valid_dev(inode->i_rdev)) { 2962 raw_inode->i_block[0] = 2963 cpu_to_le32(old_encode_dev(inode->i_rdev)); 2964 raw_inode->i_block[1] = 0; 2965 } else { 2966 raw_inode->i_block[0] = 0; 2967 raw_inode->i_block[1] = 2968 cpu_to_le32(new_encode_dev(inode->i_rdev)); 2969 raw_inode->i_block[2] = 0; 2970 } 2971 } else for (block = 0; block < EXT4_N_BLOCKS; block++) 2972 raw_inode->i_block[block] = ei->i_data[block]; 2973 2974 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 2975 if (ei->i_extra_isize) { 2976 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 2977 raw_inode->i_version_hi = 2978 cpu_to_le32(inode->i_version >> 32); 2979 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 2980 } 2981 2982 2983 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 2984 rc = ext4_journal_dirty_metadata(handle, bh); 2985 if (!err) 2986 err = rc; 2987 ei->i_state &= ~EXT4_STATE_NEW; 2988 2989 out_brelse: 2990 brelse (bh); 2991 ext4_std_error(inode->i_sb, err); 2992 
return err; 2993 } 2994 2995 /* 2996 * ext4_write_inode() 2997 * 2998 * We are called from a few places: 2999 * 3000 * - Within generic_file_write() for O_SYNC files. 3001 * Here, there will be no transaction running. We wait for any running 3002 * trasnaction to commit. 3003 * 3004 * - Within sys_sync(), kupdate and such. 3005 * We wait on commit, if tol to. 3006 * 3007 * - Within prune_icache() (PF_MEMALLOC == true) 3008 * Here we simply return. We can't afford to block kswapd on the 3009 * journal commit. 3010 * 3011 * In all cases it is actually safe for us to return without doing anything, 3012 * because the inode has been copied into a raw inode buffer in 3013 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 3014 * knfsd. 3015 * 3016 * Note that we are absolutely dependent upon all inode dirtiers doing the 3017 * right thing: they *must* call mark_inode_dirty() after dirtying info in 3018 * which we are interested. 3019 * 3020 * It would be a bug for them to not do this. The code: 3021 * 3022 * mark_inode_dirty(inode) 3023 * stuff(); 3024 * inode->i_size = expr; 3025 * 3026 * is in error because a kswapd-driven write_inode() could occur while 3027 * `stuff()' is running, and the new i_size will be lost. Plus the inode 3028 * will no longer be on the superblock's dirty inode list. 3029 */ 3030 int ext4_write_inode(struct inode *inode, int wait) 3031 { 3032 if (current->flags & PF_MEMALLOC) 3033 return 0; 3034 3035 if (ext4_journal_current_handle()) { 3036 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 3037 dump_stack(); 3038 return -EIO; 3039 } 3040 3041 if (!wait) 3042 return 0; 3043 3044 return ext4_force_commit(inode->i_sb); 3045 } 3046 3047 /* 3048 * ext4_setattr() 3049 * 3050 * Called from notify_change. 3051 * 3052 * We want to trap VFS attempts to truncate the file as soon as 3053 * possible. 
In particular, we want to make sure that when the VFS 3054 * shrinks i_size, we put the inode on the orphan list and modify 3055 * i_disksize immediately, so that during the subsequent flushing of 3056 * dirty pages and freeing of disk blocks, we can guarantee that any 3057 * commit will leave the blocks being flushed in an unused state on 3058 * disk. (On recovery, the inode will get truncated and the blocks will 3059 * be freed, so we have a strong guarantee that no future commit will 3060 * leave these blocks visible to the user.) 3061 * 3062 * Called with inode->sem down. 3063 */ 3064 int ext4_setattr(struct dentry *dentry, struct iattr *attr) 3065 { 3066 struct inode *inode = dentry->d_inode; 3067 int error, rc = 0; 3068 const unsigned int ia_valid = attr->ia_valid; 3069 3070 error = inode_change_ok(inode, attr); 3071 if (error) 3072 return error; 3073 3074 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3075 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3076 handle_t *handle; 3077 3078 /* (user+group)*(old+new) structure, inode write (sb, 3079 * inode block, ? - but truncate inode update has it) */ 3080 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ 3081 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 3082 if (IS_ERR(handle)) { 3083 error = PTR_ERR(handle); 3084 goto err_out; 3085 } 3086 error = DQUOT_TRANSFER(inode, attr) ? 
-EDQUOT : 0; 3087 if (error) { 3088 ext4_journal_stop(handle); 3089 return error; 3090 } 3091 /* Update corresponding info in inode so that everything is in 3092 * one transaction */ 3093 if (attr->ia_valid & ATTR_UID) 3094 inode->i_uid = attr->ia_uid; 3095 if (attr->ia_valid & ATTR_GID) 3096 inode->i_gid = attr->ia_gid; 3097 error = ext4_mark_inode_dirty(handle, inode); 3098 ext4_journal_stop(handle); 3099 } 3100 3101 if (attr->ia_valid & ATTR_SIZE) { 3102 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 3103 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3104 3105 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 3106 error = -EFBIG; 3107 goto err_out; 3108 } 3109 } 3110 } 3111 3112 if (S_ISREG(inode->i_mode) && 3113 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 3114 handle_t *handle; 3115 3116 handle = ext4_journal_start(inode, 3); 3117 if (IS_ERR(handle)) { 3118 error = PTR_ERR(handle); 3119 goto err_out; 3120 } 3121 3122 error = ext4_orphan_add(handle, inode); 3123 EXT4_I(inode)->i_disksize = attr->ia_size; 3124 rc = ext4_mark_inode_dirty(handle, inode); 3125 if (!error) 3126 error = rc; 3127 ext4_journal_stop(handle); 3128 } 3129 3130 rc = inode_setattr(inode, attr); 3131 3132 /* If inode_setattr's call to ext4_truncate failed to get a 3133 * transaction handle at all, we need to clean up the in-core 3134 * orphan list manually. */ 3135 if (inode->i_nlink) 3136 ext4_orphan_del(NULL, inode); 3137 3138 if (!rc && (ia_valid & ATTR_MODE)) 3139 rc = ext4_acl_chmod(inode); 3140 3141 err_out: 3142 ext4_std_error(inode->i_sb, error); 3143 if (!error) 3144 error = rc; 3145 return error; 3146 } 3147 3148 3149 /* 3150 * How many blocks doth make a writepage()? 3151 * 3152 * With N blocks per page, it may be: 3153 * N data blocks 3154 * 2 indirect block 3155 * 2 dindirect 3156 * 1 tindirect 3157 * N+5 bitmap blocks (from the above) 3158 * N+5 group descriptor summary blocks 3159 * 1 inode block 3160 * 1 superblock. 
3161 * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files 3162 * 3163 * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS 3164 * 3165 * With ordered or writeback data it's the same, less the N data blocks. 3166 * 3167 * If the inode's direct blocks can hold an integral number of pages then a 3168 * page cannot straddle two indirect blocks, and we can only touch one indirect 3169 * and dindirect block, and the "5" above becomes "3". 3170 * 3171 * This still overestimates under most circumstances. If we were to pass the 3172 * start and end offsets in here as well we could do block_to_path() on each 3173 * block and work out the exact number of indirects which are touched. Pah. 3174 */ 3175 3176 int ext4_writepage_trans_blocks(struct inode *inode) 3177 { 3178 int bpp = ext4_journal_blocks_per_page(inode); 3179 int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3; 3180 int ret; 3181 3182 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 3183 return ext4_ext_writepage_trans_blocks(inode, bpp); 3184 3185 if (ext4_should_journal_data(inode)) 3186 ret = 3 * (bpp + indirects) + 2; 3187 else 3188 ret = 2 * (bpp + indirects) + 2; 3189 3190 #ifdef CONFIG_QUOTA 3191 /* We know that structure was already allocated during DQUOT_INIT so 3192 * we will be updating only the data blocks + inodes */ 3193 ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 3194 #endif 3195 3196 return ret; 3197 } 3198 3199 /* 3200 * The caller must have previously called ext4_reserve_inode_write(). 3201 * Give this, we know that the caller already has write access to iloc->bh. 
3202 */ 3203 int ext4_mark_iloc_dirty(handle_t *handle, 3204 struct inode *inode, struct ext4_iloc *iloc) 3205 { 3206 int err = 0; 3207 3208 if (test_opt(inode->i_sb, I_VERSION)) 3209 inode_inc_iversion(inode); 3210 3211 /* the do_update_inode consumes one bh->b_count */ 3212 get_bh(iloc->bh); 3213 3214 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 3215 err = ext4_do_update_inode(handle, inode, iloc); 3216 put_bh(iloc->bh); 3217 return err; 3218 } 3219 3220 /* 3221 * On success, We end up with an outstanding reference count against 3222 * iloc->bh. This _must_ be cleaned up later. 3223 */ 3224 3225 int 3226 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 3227 struct ext4_iloc *iloc) 3228 { 3229 int err = 0; 3230 if (handle) { 3231 err = ext4_get_inode_loc(inode, iloc); 3232 if (!err) { 3233 BUFFER_TRACE(iloc->bh, "get_write_access"); 3234 err = ext4_journal_get_write_access(handle, iloc->bh); 3235 if (err) { 3236 brelse(iloc->bh); 3237 iloc->bh = NULL; 3238 } 3239 } 3240 } 3241 ext4_std_error(inode->i_sb, err); 3242 return err; 3243 } 3244 3245 /* 3246 * Expand an inode by new_extra_isize bytes. 3247 * Returns 0 on success or negative error number on failure. 
3248 */ 3249 static int ext4_expand_extra_isize(struct inode *inode, 3250 unsigned int new_extra_isize, 3251 struct ext4_iloc iloc, 3252 handle_t *handle) 3253 { 3254 struct ext4_inode *raw_inode; 3255 struct ext4_xattr_ibody_header *header; 3256 struct ext4_xattr_entry *entry; 3257 3258 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 3259 return 0; 3260 3261 raw_inode = ext4_raw_inode(&iloc); 3262 3263 header = IHDR(inode, raw_inode); 3264 entry = IFIRST(header); 3265 3266 /* No extended attributes present */ 3267 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || 3268 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 3269 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 3270 new_extra_isize); 3271 EXT4_I(inode)->i_extra_isize = new_extra_isize; 3272 return 0; 3273 } 3274 3275 /* try to expand with EAs present */ 3276 return ext4_expand_extra_isize_ea(inode, new_extra_isize, 3277 raw_inode, handle); 3278 } 3279 3280 /* 3281 * What we do here is to mark the in-core inode as clean with respect to inode 3282 * dirtiness (it may still be data-dirty). 3283 * This means that the in-core inode may be reaped by prune_icache 3284 * without having to perform any I/O. This is a very good thing, 3285 * because *any* task may call prune_icache - even ones which 3286 * have a transaction open against a different journal. 3287 * 3288 * Is this cheating? Not really. Sure, we haven't written the 3289 * inode out, but prune_icache isn't a user-visible syncing function. 3290 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 3291 * we start and wait on commits. 3292 * 3293 * Is this efficient/effective? Well, we're being nice to the system 3294 * by cleaning up our inodes proactively so they can be reaped 3295 * without I/O. But we are potentially leaving up to five seconds' 3296 * worth of inodes floating about which prune_icache wants us to 3297 * write out. 
One way to fix that would be to get prune_icache() 3298 * to do a write_super() to free up some memory. It has the desired 3299 * effect. 3300 */ 3301 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 3302 { 3303 struct ext4_iloc iloc; 3304 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3305 static unsigned int mnt_count; 3306 int err, ret; 3307 3308 might_sleep(); 3309 err = ext4_reserve_inode_write(handle, inode, &iloc); 3310 if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 3311 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 3312 /* 3313 * We need extra buffer credits since we may write into EA block 3314 * with this same handle. If journal_extend fails, then it will 3315 * only result in a minor loss of functionality for that inode. 3316 * If this is felt to be critical, then e2fsck should be run to 3317 * force a large enough s_min_extra_isize. 3318 */ 3319 if ((jbd2_journal_extend(handle, 3320 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { 3321 ret = ext4_expand_extra_isize(inode, 3322 sbi->s_want_extra_isize, 3323 iloc, handle); 3324 if (ret) { 3325 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 3326 if (mnt_count != 3327 le16_to_cpu(sbi->s_es->s_mnt_count)) { 3328 ext4_warning(inode->i_sb, __FUNCTION__, 3329 "Unable to expand inode %lu. Delete" 3330 " some EAs or run e2fsck.", 3331 inode->i_ino); 3332 mnt_count = 3333 le16_to_cpu(sbi->s_es->s_mnt_count); 3334 } 3335 } 3336 } 3337 } 3338 if (!err) 3339 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 3340 return err; 3341 } 3342 3343 /* 3344 * ext4_dirty_inode() is called from __mark_inode_dirty() 3345 * 3346 * We're really interested in the case where a file is being extended. 3347 * i_size has been changed by generic_commit_write() and we thus need 3348 * to include the updated inode in the current transaction. 3349 * 3350 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks 3351 * are allocated to the file. 
3352 * 3353 * If the inode is marked synchronous, we don't honour that here - doing 3354 * so would cause a commit on atime updates, which we don't bother doing. 3355 * We handle synchronous inodes at the highest possible level. 3356 */ 3357 void ext4_dirty_inode(struct inode *inode) 3358 { 3359 handle_t *current_handle = ext4_journal_current_handle(); 3360 handle_t *handle; 3361 3362 handle = ext4_journal_start(inode, 2); 3363 if (IS_ERR(handle)) 3364 goto out; 3365 if (current_handle && 3366 current_handle->h_transaction != handle->h_transaction) { 3367 /* This task has a transaction open against a different fs */ 3368 printk(KERN_EMERG "%s: transactions do not match!\n", 3369 __FUNCTION__); 3370 } else { 3371 jbd_debug(5, "marking dirty. outer handle=%p\n", 3372 current_handle); 3373 ext4_mark_inode_dirty(handle, inode); 3374 } 3375 ext4_journal_stop(handle); 3376 out: 3377 return; 3378 } 3379 3380 #if 0 3381 /* 3382 * Bind an inode's backing buffer_head into this transaction, to prevent 3383 * it from being flushed to disk early. Unlike 3384 * ext4_reserve_inode_write, this leaves behind no bh reference and 3385 * returns no iloc structure, so the caller needs to repeat the iloc 3386 * lookup to mark the inode dirty later. 
3387 */ 3388 static int ext4_pin_inode(handle_t *handle, struct inode *inode) 3389 { 3390 struct ext4_iloc iloc; 3391 3392 int err = 0; 3393 if (handle) { 3394 err = ext4_get_inode_loc(inode, &iloc); 3395 if (!err) { 3396 BUFFER_TRACE(iloc.bh, "get_write_access"); 3397 err = jbd2_journal_get_write_access(handle, iloc.bh); 3398 if (!err) 3399 err = ext4_journal_dirty_metadata(handle, 3400 iloc.bh); 3401 brelse(iloc.bh); 3402 } 3403 } 3404 ext4_std_error(inode->i_sb, err); 3405 return err; 3406 } 3407 #endif 3408 3409 int ext4_change_inode_journal_flag(struct inode *inode, int val) 3410 { 3411 journal_t *journal; 3412 handle_t *handle; 3413 int err; 3414 3415 /* 3416 * We have to be very careful here: changing a data block's 3417 * journaling status dynamically is dangerous. If we write a 3418 * data block to the journal, change the status and then delete 3419 * that block, we risk forgetting to revoke the old log record 3420 * from the journal and so a subsequent replay can corrupt data. 3421 * So, first we make sure that the journal is empty and that 3422 * nobody is changing anything. 3423 */ 3424 3425 journal = EXT4_JOURNAL(inode); 3426 if (is_journal_aborted(journal)) 3427 return -EROFS; 3428 3429 jbd2_journal_lock_updates(journal); 3430 jbd2_journal_flush(journal); 3431 3432 /* 3433 * OK, there are no updates running now, and all cached data is 3434 * synced to disk. We are now in a completely consistent state 3435 * which doesn't have anything in the journal, and we know that 3436 * no filesystem updates are running, so it is safe to modify 3437 * the inode's in-core data-journaling state flag now. 3438 */ 3439 3440 if (val) 3441 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 3442 else 3443 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 3444 ext4_set_aops(inode); 3445 3446 jbd2_journal_unlock_updates(journal); 3447 3448 /* Finally we can mark the inode as dirty. 
*/ 3449 3450 handle = ext4_journal_start(inode, 1); 3451 if (IS_ERR(handle)) 3452 return PTR_ERR(handle); 3453 3454 err = ext4_mark_inode_dirty(handle, inode); 3455 handle->h_sync = 1; 3456 ext4_journal_stop(handle); 3457 ext4_std_error(inode->i_sb, err); 3458 3459 return err; 3460 } 3461