1 /* 2 * linux/fs/ext4/inode.c 3 * 4 * Copyright (C) 1992, 1993, 1994, 1995 5 * Remy Card (card@masi.ibp.fr) 6 * Laboratoire MASI - Institut Blaise Pascal 7 * Universite Pierre et Marie Curie (Paris VI) 8 * 9 * from 10 * 11 * linux/fs/minix/inode.c 12 * 13 * Copyright (C) 1991, 1992 Linus Torvalds 14 * 15 * 64-bit file support on 64-bit platforms by Jakub Jelinek 16 * (jj@sunsite.ms.mff.cuni.cz) 17 * 18 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 19 */ 20 21 #include <linux/fs.h> 22 #include <linux/time.h> 23 #include <linux/highuid.h> 24 #include <linux/pagemap.h> 25 #include <linux/dax.h> 26 #include <linux/quotaops.h> 27 #include <linux/string.h> 28 #include <linux/buffer_head.h> 29 #include <linux/writeback.h> 30 #include <linux/pagevec.h> 31 #include <linux/mpage.h> 32 #include <linux/namei.h> 33 #include <linux/uio.h> 34 #include <linux/bio.h> 35 #include <linux/workqueue.h> 36 #include <linux/kernel.h> 37 #include <linux/printk.h> 38 #include <linux/slab.h> 39 #include <linux/bitops.h> 40 41 #include "ext4_jbd2.h" 42 #include "xattr.h" 43 #include "acl.h" 44 #include "truncate.h" 45 46 #include <trace/events/ext4.h> 47 48 #define MPAGE_DA_EXTENT_TAIL 0x01 49 50 static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, 51 struct ext4_inode_info *ei) 52 { 53 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 54 __u16 csum_lo; 55 __u16 csum_hi = 0; 56 __u32 csum; 57 58 csum_lo = le16_to_cpu(raw->i_checksum_lo); 59 raw->i_checksum_lo = 0; 60 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 61 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { 62 csum_hi = le16_to_cpu(raw->i_checksum_hi); 63 raw->i_checksum_hi = 0; 64 } 65 66 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, 67 EXT4_INODE_SIZE(inode->i_sb)); 68 69 raw->i_checksum_lo = cpu_to_le16(csum_lo); 70 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 71 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) 72 raw->i_checksum_hi = cpu_to_le16(csum_hi); 
73 74 return csum; 75 } 76 77 static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, 78 struct ext4_inode_info *ei) 79 { 80 __u32 provided, calculated; 81 82 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 83 cpu_to_le32(EXT4_OS_LINUX) || 84 !ext4_has_metadata_csum(inode->i_sb)) 85 return 1; 86 87 provided = le16_to_cpu(raw->i_checksum_lo); 88 calculated = ext4_inode_csum(inode, raw, ei); 89 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 90 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) 91 provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; 92 else 93 calculated &= 0xFFFF; 94 95 return provided == calculated; 96 } 97 98 static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, 99 struct ext4_inode_info *ei) 100 { 101 __u32 csum; 102 103 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 104 cpu_to_le32(EXT4_OS_LINUX) || 105 !ext4_has_metadata_csum(inode->i_sb)) 106 return; 107 108 csum = ext4_inode_csum(inode, raw, ei); 109 raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); 110 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 111 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) 112 raw->i_checksum_hi = cpu_to_le16(csum >> 16); 113 } 114 115 static inline int ext4_begin_ordered_truncate(struct inode *inode, 116 loff_t new_size) 117 { 118 trace_ext4_begin_ordered_truncate(inode, new_size); 119 /* 120 * If jinode is zero, then we never opened the file for 121 * writing, so there's no need to call 122 * jbd2_journal_begin_ordered_truncate() since there's no 123 * outstanding writes we need to flush. 
124 */ 125 if (!EXT4_I(inode)->jinode) 126 return 0; 127 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), 128 EXT4_I(inode)->jinode, 129 new_size); 130 } 131 132 static void ext4_invalidatepage(struct page *page, unsigned int offset, 133 unsigned int length); 134 static int __ext4_journalled_writepage(struct page *page, unsigned int len); 135 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); 136 static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, 137 int pextents); 138 139 /* 140 * Test whether an inode is a fast symlink. 141 */ 142 int ext4_inode_is_fast_symlink(struct inode *inode) 143 { 144 int ea_blocks = EXT4_I(inode)->i_file_acl ? 145 EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; 146 147 if (ext4_has_inline_data(inode)) 148 return 0; 149 150 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 151 } 152 153 /* 154 * Restart the transaction associated with *handle. This does a commit, 155 * so before we call here everything must be consistently dirtied against 156 * this transaction. 157 */ 158 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, 159 int nblocks) 160 { 161 int ret; 162 163 /* 164 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this 165 * moment, get_block can be called only for blocks inside i_size since 166 * page cache has been already dropped and writes are blocked by 167 * i_mutex. So we can safely drop the i_data_sem here. 168 */ 169 BUG_ON(EXT4_JOURNAL(inode) == NULL); 170 jbd_debug(2, "restarting handle %p\n", handle); 171 up_write(&EXT4_I(inode)->i_data_sem); 172 ret = ext4_journal_restart(handle, nblocks); 173 down_write(&EXT4_I(inode)->i_data_sem); 174 ext4_discard_preallocations(inode); 175 176 return ret; 177 } 178 179 /* 180 * Called at the last iput() if i_nlink is zero. 
*/
/*
 * ext4_evict_inode() - tear down an in-core inode.
 *
 * Two paths are visible below:
 *  - i_nlink != 0: the inode stays on disk.  For data-journalled regular
 *    files and symlinks the transaction recorded in i_datasync_tid is
 *    completed and the mapping written out first; then the page cache is
 *    truncated and only the in-core inode is cleared.
 *  - i_nlink == 0: the inode is deleted.  Its blocks are truncated inside
 *    a journal handle, it is taken off the orphan list, i_dtime is set
 *    and the on-disk inode is freed.
 *
 * Every failure path ends at no_delete, which still clears the in-core
 * inode.
 */
void ext4_evict_inode(struct inode *inode)
{
	handle_t *handle;
	int err;

	trace_ext4_evict_inode(inode);

	if (inode->i_nlink) {
		/*
		 * When journalling data dirty buffers are tracked only in the
		 * journal. So although mm thinks everything is clean and
		 * ready for reaping the inode might still have some pages to
		 * write in the running transaction or waiting to be
		 * checkpointed. Thus calling jbd2_journal_invalidatepage()
		 * (via truncate_inode_pages()) to discard these buffers can
		 * cause data loss. Also even if we did not discard these
		 * buffers, we would have no way to find them after the inode
		 * is reaped and thus user could see stale data if he tries to
		 * read them before the transaction is checkpointed. So be
		 * careful and force everything to disk here... We use
		 * ei->i_datasync_tid to store the newest transaction
		 * containing inode's data.
		 *
		 * Note that directories do not have this problem because they
		 * don't use page cache.
		 */
		if (ext4_should_journal_data(inode) &&
		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
		    inode->i_ino != EXT4_JOURNAL_INO) {
			journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
			tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;

			jbd2_complete_transaction(journal, commit_tid);
			filemap_write_and_wait(&inode->i_data);
		}
		truncate_inode_pages_final(&inode->i_data);

		goto no_delete;
	}

	if (is_bad_inode(inode))
		goto no_delete;
	dquot_initialize(inode);

	if (ext4_should_order_data(inode))
		ext4_begin_ordered_truncate(inode, 0);
	truncate_inode_pages_final(&inode->i_data);

	/*
	 * Protect us against freezing - iput() caller didn't have to have any
	 * protection against it
	 */
	sb_start_intwrite(inode->i_sb);
	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
				    ext4_blocks_for_truncate(inode)+3);
	if (IS_ERR(handle)) {
		ext4_std_error(inode->i_sb, PTR_ERR(handle));
		/*
		 * If we're going to skip the normal cleanup, we still need to
		 * make sure that the in-core orphan linked list is properly
		 * cleaned up.
		 */
		ext4_orphan_del(NULL, inode);
		sb_end_intwrite(inode->i_sb);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		ext4_handle_sync(handle);
	inode->i_size = 0;
	err = ext4_mark_inode_dirty(handle, inode);
	if (err) {
		ext4_warning(inode->i_sb,
			     "couldn't mark inode dirty (err %d)", err);
		/* shares the cleanup inside the credit-extension branch below */
		goto stop_handle;
	}
	if (inode->i_blocks)
		ext4_truncate(inode);

	/*
	 * ext4_ext_truncate() doesn't reserve any slop when it
	 * restarts journal transactions; therefore there may not be
	 * enough credits left in the handle to remove the inode from
	 * the orphan list and set the dtime field.
	 */
	if (!ext4_handle_has_enough_credits(handle, 3)) {
		err = ext4_journal_extend(handle, 3);
		/* a positive return from extend is answered with a restart */
		if (err > 0)
			err = ext4_journal_restart(handle, 3);
		if (err != 0) {
			ext4_warning(inode->i_sb,
				     "couldn't extend journal (err %d)", err);
		stop_handle:
			ext4_journal_stop(handle);
			ext4_orphan_del(NULL, inode);
			sb_end_intwrite(inode->i_sb);
			goto no_delete;
		}
	}

	/*
	 * Kill off the orphan record which ext4_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext4_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext4_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext4_orphan_del(handle, inode);
	EXT4_I(inode)->i_dtime	= get_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext4_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		ext4_clear_inode(inode);
	else
		ext4_free_inode(handle, inode);
	ext4_journal_stop(handle);
	sb_end_intwrite(inode->i_sb);
	return;
no_delete:
	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
}

#ifdef CONFIG_QUOTA
/* Return a pointer to the inode's i_reserved_quota counter. */
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
	return &EXT4_I(inode)->i_reserved_quota;
}
#endif

/*
 * Called with i_data_sem down, which is important since we can call
 * ext4_discard_preallocations() from here.
322 */ 323 void ext4_da_update_reserve_space(struct inode *inode, 324 int used, int quota_claim) 325 { 326 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 327 struct ext4_inode_info *ei = EXT4_I(inode); 328 329 spin_lock(&ei->i_block_reservation_lock); 330 trace_ext4_da_update_reserve_space(inode, used, quota_claim); 331 if (unlikely(used > ei->i_reserved_data_blocks)) { 332 ext4_warning(inode->i_sb, "%s: ino %lu, used %d " 333 "with only %d reserved data blocks", 334 __func__, inode->i_ino, used, 335 ei->i_reserved_data_blocks); 336 WARN_ON(1); 337 used = ei->i_reserved_data_blocks; 338 } 339 340 /* Update per-inode reservations */ 341 ei->i_reserved_data_blocks -= used; 342 percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); 343 344 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 345 346 /* Update quota subsystem for data blocks */ 347 if (quota_claim) 348 dquot_claim_block(inode, EXT4_C2B(sbi, used)); 349 else { 350 /* 351 * We did fallocate with an offset that is already delayed 352 * allocated. So on delayed allocated writeback we should 353 * not re-claim the quota for fallocated blocks. 354 */ 355 dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); 356 } 357 358 /* 359 * If we have done all the pending block allocations and if 360 * there aren't any writers on the inode, we can discard the 361 * inode's preallocations. 
362 */ 363 if ((ei->i_reserved_data_blocks == 0) && 364 (atomic_read(&inode->i_writecount) == 0)) 365 ext4_discard_preallocations(inode); 366 } 367 368 static int __check_block_validity(struct inode *inode, const char *func, 369 unsigned int line, 370 struct ext4_map_blocks *map) 371 { 372 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, 373 map->m_len)) { 374 ext4_error_inode(inode, func, line, map->m_pblk, 375 "lblock %lu mapped to illegal pblock " 376 "(length %d)", (unsigned long) map->m_lblk, 377 map->m_len); 378 return -EFSCORRUPTED; 379 } 380 return 0; 381 } 382 383 int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, 384 ext4_lblk_t len) 385 { 386 int ret; 387 388 if (ext4_encrypted_inode(inode)) 389 return ext4_encrypted_zeroout(inode, lblk, pblk, len); 390 391 ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); 392 if (ret > 0) 393 ret = 0; 394 395 return ret; 396 } 397 398 #define check_block_validity(inode, map) \ 399 __check_block_validity((inode), __func__, __LINE__, (map)) 400 401 #ifdef ES_AGGRESSIVE_TEST 402 static void ext4_map_blocks_es_recheck(handle_t *handle, 403 struct inode *inode, 404 struct ext4_map_blocks *es_map, 405 struct ext4_map_blocks *map, 406 int flags) 407 { 408 int retval; 409 410 map->m_flags = 0; 411 /* 412 * There is a race window that the result is not the same. 413 * e.g. xfstests #223 when dioread_nolock enables. The reason 414 * is that we lookup a block mapping in extent status tree with 415 * out taking i_data_sem. So at the time the unwritten extent 416 * could be converted. 
417 */ 418 down_read(&EXT4_I(inode)->i_data_sem); 419 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 420 retval = ext4_ext_map_blocks(handle, inode, map, flags & 421 EXT4_GET_BLOCKS_KEEP_SIZE); 422 } else { 423 retval = ext4_ind_map_blocks(handle, inode, map, flags & 424 EXT4_GET_BLOCKS_KEEP_SIZE); 425 } 426 up_read((&EXT4_I(inode)->i_data_sem)); 427 428 /* 429 * We don't check m_len because extent will be collpased in status 430 * tree. So the m_len might not equal. 431 */ 432 if (es_map->m_lblk != map->m_lblk || 433 es_map->m_flags != map->m_flags || 434 es_map->m_pblk != map->m_pblk) { 435 printk("ES cache assertion failed for inode: %lu " 436 "es_cached ex [%d/%d/%llu/%x] != " 437 "found ex [%d/%d/%llu/%x] retval %d flags %x\n", 438 inode->i_ino, es_map->m_lblk, es_map->m_len, 439 es_map->m_pblk, es_map->m_flags, map->m_lblk, 440 map->m_len, map->m_pblk, map->m_flags, 441 retval, flags); 442 } 443 } 444 #endif /* ES_AGGRESSIVE_TEST */ 445 446 /* 447 * The ext4_map_blocks() function tries to look up the requested blocks, 448 * and returns if the blocks are already mapped. 449 * 450 * Otherwise it takes the write lock of the i_data_sem and allocate blocks 451 * and store the allocated blocks in the result buffer head and mark it 452 * mapped. 453 * 454 * If file type is extents based, it will call ext4_ext_map_blocks(), 455 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping 456 * based files 457 * 458 * On success, it returns the number of blocks being mapped or allocated. if 459 * create==0 and the blocks are pre-allocated and unwritten, the resulting @map 460 * is marked as unwritten. If the create == 1, it will mark @map as mapped. 461 * 462 * It returns 0 if plain look up failed (blocks have not been allocated), in 463 * that case, @map is returned as unmapped but we still do fill map->m_len to 464 * indicate the length of a hole starting at map->m_lblk. 465 * 466 * It returns the error in case of allocation failure. 
*/
/*
 * Outline of the function body below:
 *  1. clamp m_len to INT_MAX and reject m_lblk >= EXT_MAX_BLOCKS;
 *  2. try the extent status tree; on a hit skip straight to "found";
 *  3. otherwise look the range up under the read side of i_data_sem and
 *     cache a positive result in the extent status tree;
 *  4. if EXT4_GET_BLOCKS_CREATE is not set, or the range is already
 *     mapped and no unwritten conversion was requested, return;
 *  5. otherwise redo the mapping with the caller's full flags under the
 *     write side of i_data_sem, optionally zero new blocks, and update
 *     the extent status tree.
 */
int ext4_map_blocks(handle_t *handle, struct inode *inode,
		    struct ext4_map_blocks *map, int flags)
{
	struct extent_status es;
	int retval;
	int ret = 0;
#ifdef ES_AGGRESSIVE_TEST
	struct ext4_map_blocks orig_map;

	/* keep the caller's request so the ES result can be rechecked */
	memcpy(&orig_map, map, sizeof(*map));
#endif

	map->m_flags = 0;
	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
		  (unsigned long) map->m_lblk);

	/*
	 * ext4_map_blocks returns an int, and m_len is an unsigned int
	 */
	if (unlikely(map->m_len > INT_MAX))
		map->m_len = INT_MAX;

	/* We can handle the block number less than EXT_MAX_BLOCKS */
	if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
		return -EFSCORRUPTED;

	/* Lookup extent status tree firstly */
	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
			map->m_pblk = ext4_es_pblock(&es) +
					map->m_lblk - es.es_lblk;
			map->m_flags |= ext4_es_is_written(&es) ?
					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
			/* blocks left in the cached extent, capped at the request */
			retval = es.es_len - (map->m_lblk - es.es_lblk);
			if (retval > map->m_len)
				retval = map->m_len;
			map->m_len = retval;
		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
			map->m_pblk = 0;
			retval = es.es_len - (map->m_lblk - es.es_lblk);
			if (retval > map->m_len)
				retval = map->m_len;
			/* m_len reports the length of the unmapped range */
			map->m_len = retval;
			retval = 0;
		} else {
			BUG_ON(1);
		}
#ifdef ES_AGGRESSIVE_TEST
		ext4_map_blocks_es_recheck(handle, inode, map,
					   &orig_map, flags);
#endif
		goto found;
	}

	/*
	 * Try to see if we can get the block without requesting a new
	 * file system block.
	 */
	down_read(&EXT4_I(inode)->i_data_sem);
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, flags &
					     EXT4_GET_BLOCKS_KEEP_SIZE);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, flags &
					     EXT4_GET_BLOCKS_KEEP_SIZE);
	}
	if (retval > 0) {
		unsigned int status;

		/* NOTE(review): m_len is unsigned but printed with %d below */
		if (unlikely(retval != map->m_len)) {
			ext4_warning(inode->i_sb,
				     "ES len assertion failed for inode "
				     "%lu: retval %d != map->m_len %d",
				     inode->i_ino, retval, map->m_len);
			WARN_ON(1);
		}

		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
		    !(status & EXTENT_STATUS_WRITTEN) &&
		    ext4_find_delalloc_range(inode, map->m_lblk,
					     map->m_lblk + map->m_len - 1))
			status |= EXTENT_STATUS_DELAYED;
		ret = ext4_es_insert_extent(inode, map->m_lblk,
					    map->m_len, map->m_pblk, status);
		if (ret < 0)
			retval = ret;
	}
	up_read((&EXT4_I(inode)->i_data_sem));

found:
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		ret = check_block_validity(inode, map);
		if (ret != 0)
			return ret;
	}

	/* If it is only a block(s) look up */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
		return retval;

	/*
	 * Return if the blocks have already been allocated.
	 *
	 * Note that if blocks have been preallocated
	 * ext4_ext_get_block() returns the create = 0
	 * with buffer head unmapped.
	 */
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
		/*
		 * If we need to convert extent to unwritten
		 * we continue and do the actual work in
		 * ext4_ext_map_blocks()
		 */
		if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
			return retval;

	/*
	 * Here we clear m_flags because after allocating a new extent,
	 * it will be set again.
	 */
	map->m_flags &= ~EXT4_MAP_FLAGS;

	/*
	 * Allocating new blocks and/or writing to an unwritten extent
	 * will possibly result in updating i_data, so we take
	 * the write lock of i_data_sem, and call get_block()
	 * with create == 1 flag.
	 */
	down_write(&EXT4_I(inode)->i_data_sem);

	/*
	 * We need to check for EXT4 here because migrate
	 * could have changed the inode type in between
	 */
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, flags);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, flags);

		if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
			/*
			 * We allocated new blocks which will result in
			 * i_data's format changing.  Force the migrate
			 * to fail by clearing migrate flags
			 */
			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
		}

		/*
		 * Update reserved blocks/metadata blocks after successful
		 * block allocation which had been deferred till now. We don't
		 * support fallocate for non extent files. So we can update
		 * reserve space here.
		 */
		if ((retval > 0) &&
			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
			ext4_da_update_reserve_space(inode, retval, 1);
	}

	if (retval > 0) {
		unsigned int status;

		if (unlikely(retval != map->m_len)) {
			ext4_warning(inode->i_sb,
				     "ES len assertion failed for inode "
				     "%lu: retval %d != map->m_len %d",
				     inode->i_ino, retval, map->m_len);
			WARN_ON(1);
		}

		/*
		 * We have to zeroout blocks before inserting them into extent
		 * status tree. Otherwise someone could look them up there and
		 * use them before they are really zeroed.
		 */
		if (flags & EXT4_GET_BLOCKS_ZERO &&
		    map->m_flags & EXT4_MAP_MAPPED &&
		    map->m_flags & EXT4_MAP_NEW) {
			ret = ext4_issue_zeroout(inode, map->m_lblk,
						 map->m_pblk, map->m_len);
			if (ret) {
				retval = ret;
				goto out_sem;
			}
		}

		/*
		 * If the extent has been zeroed out, we don't need to update
		 * extent status tree.
		 */
		if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
		    ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
			if (ext4_es_is_written(&es))
				goto out_sem;
		}
		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
		    !(status & EXTENT_STATUS_WRITTEN) &&
		    ext4_find_delalloc_range(inode, map->m_lblk,
					     map->m_lblk + map->m_len - 1))
			status |= EXTENT_STATUS_DELAYED;
		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
					    map->m_pblk, status);
		if (ret < 0) {
			retval = ret;
			goto out_sem;
		}
	}

out_sem:
	up_write((&EXT4_I(inode)->i_data_sem));
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		ret = check_block_validity(inode, map);
		if (ret != 0)
			return ret;
	}
	return retval;
}

/*
 * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
 * we have to be careful as someone else may be manipulating b_state as well.
 *
 * Only the bits of @flags inside EXT4_MAP_FLAGS are used; all other bits
 * of b_state are preserved.
 */
static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
{
	unsigned long old_state;
	unsigned long new_state;

	flags &= EXT4_MAP_FLAGS;

	/* Dummy buffer_head? Set non-atomically. */
	if (!bh->b_page) {
		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
		return;
	}
	/*
	 * Someone else may be modifying b_state. Be careful! This is ugly but
	 * once we get rid of using bh as a container for mapping information
	 * to pass to / from get_block functions, this can go away.
	 */
	do {
		old_state = READ_ONCE(bh->b_state);
		new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
		/* retry if b_state changed between the read and the cmpxchg */
	} while (unlikely(
		 cmpxchg(&bh->b_state, old_state, new_state) != old_state));
}

/*
 * Common get_block helper: map the range that starts at @iblock and
 * spans bh->b_size bytes via ext4_map_blocks() with @flags.
 *
 * On a positive result @bh is mapped to the physical block, its mapping
 * flags are updated, b_size is set to the mapped length in bytes and 0
 * is returned.  Otherwise the (zero or negative) result of
 * ext4_map_blocks() is returned with @bh untouched.  Inodes with inline
 * data are refused with -ERANGE.
 */
static int _ext4_get_block(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh, int flags)
{
	struct ext4_map_blocks map;
	int ret = 0;

	if (ext4_has_inline_data(inode))
		return -ERANGE;

	map.m_lblk = iblock;
	map.m_len = bh->b_size >> inode->i_blkbits;

	ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
			      flags);
	if (ret > 0) {
		map_bh(bh, inode->i_sb, map.m_pblk);
		ext4_update_bh_state(bh, map.m_flags);
		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
		ret = 0;
	}
	return ret;
}

/*
 * Standard get_block callback: plain lookup when @create is zero,
 * EXT4_GET_BLOCKS_CREATE otherwise.
 */
int ext4_get_block(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh, int create)
{
	return _ext4_get_block(inode, iblock, bh,
			       create ? EXT4_GET_BLOCKS_CREATE : 0);
}

/*
 * Get block function used when preparing for buffered write if we require
 * creating an unwritten extent if blocks haven't been allocated.  The extent
 * will be converted to written after the IO is complete.
 *
 * Note that @create is only logged; the mapping always uses
 * EXT4_GET_BLOCKS_IO_CREATE_EXT.
 */
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create)
{
	ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
		   inode->i_ino, create);
	return _ext4_get_block(inode, iblock, bh_result,
			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
}

/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096

/*
 * Get blocks function for the cases that need to start a transaction -
 * generally different cases of direct IO and DAX IO. It also handles retries
 * in case of ENOSPC.
770 */ 771 static int ext4_get_block_trans(struct inode *inode, sector_t iblock, 772 struct buffer_head *bh_result, int flags) 773 { 774 int dio_credits; 775 handle_t *handle; 776 int retries = 0; 777 int ret; 778 779 /* Trim mapping request to maximum we can map at once for DIO */ 780 if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS) 781 bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits; 782 dio_credits = ext4_chunk_trans_blocks(inode, 783 bh_result->b_size >> inode->i_blkbits); 784 retry: 785 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); 786 if (IS_ERR(handle)) 787 return PTR_ERR(handle); 788 789 ret = _ext4_get_block(inode, iblock, bh_result, flags); 790 ext4_journal_stop(handle); 791 792 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 793 goto retry; 794 return ret; 795 } 796 797 /* Get block function for DIO reads and writes to inodes without extents */ 798 int ext4_dio_get_block(struct inode *inode, sector_t iblock, 799 struct buffer_head *bh, int create) 800 { 801 /* We don't expect handle for direct IO */ 802 WARN_ON_ONCE(ext4_journal_current_handle()); 803 804 if (!create) 805 return _ext4_get_block(inode, iblock, bh, 0); 806 return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE); 807 } 808 809 /* 810 * Get block function for AIO DIO writes when we create unwritten extent if 811 * blocks are not allocated yet. The extent will be converted to written 812 * after IO is complete. 813 */ 814 static int ext4_dio_get_block_unwritten_async(struct inode *inode, 815 sector_t iblock, struct buffer_head *bh_result, int create) 816 { 817 int ret; 818 819 /* We don't expect handle for direct IO */ 820 WARN_ON_ONCE(ext4_journal_current_handle()); 821 822 ret = ext4_get_block_trans(inode, iblock, bh_result, 823 EXT4_GET_BLOCKS_IO_CREATE_EXT); 824 825 /* 826 * When doing DIO using unwritten extents, we need io_end to convert 827 * unwritten extents to written on IO completion. 
We allocate io_end 828 * once we spot unwritten extent and store it in b_private. Generic 829 * DIO code keeps b_private set and furthermore passes the value to 830 * our completion callback in 'private' argument. 831 */ 832 if (!ret && buffer_unwritten(bh_result)) { 833 if (!bh_result->b_private) { 834 ext4_io_end_t *io_end; 835 836 io_end = ext4_init_io_end(inode, GFP_KERNEL); 837 if (!io_end) 838 return -ENOMEM; 839 bh_result->b_private = io_end; 840 ext4_set_io_unwritten_flag(inode, io_end); 841 } 842 set_buffer_defer_completion(bh_result); 843 } 844 845 return ret; 846 } 847 848 /* 849 * Get block function for non-AIO DIO writes when we create unwritten extent if 850 * blocks are not allocated yet. The extent will be converted to written 851 * after IO is complete from ext4_ext_direct_IO() function. 852 */ 853 static int ext4_dio_get_block_unwritten_sync(struct inode *inode, 854 sector_t iblock, struct buffer_head *bh_result, int create) 855 { 856 int ret; 857 858 /* We don't expect handle for direct IO */ 859 WARN_ON_ONCE(ext4_journal_current_handle()); 860 861 ret = ext4_get_block_trans(inode, iblock, bh_result, 862 EXT4_GET_BLOCKS_IO_CREATE_EXT); 863 864 /* 865 * Mark inode as having pending DIO writes to unwritten extents. 866 * ext4_ext_direct_IO() checks this flag and converts extents to 867 * written. 868 */ 869 if (!ret && buffer_unwritten(bh_result)) 870 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 871 872 return ret; 873 } 874 875 static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, 876 struct buffer_head *bh_result, int create) 877 { 878 int ret; 879 880 ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", 881 inode->i_ino, create); 882 /* We don't expect handle for direct IO */ 883 WARN_ON_ONCE(ext4_journal_current_handle()); 884 885 ret = _ext4_get_block(inode, iblock, bh_result, 0); 886 /* 887 * Blocks should have been preallocated! ext4_file_write_iter() checks 888 * that. 
889 */ 890 WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result)); 891 892 return ret; 893 } 894 895 896 /* 897 * `handle' can be NULL if create is zero 898 */ 899 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 900 ext4_lblk_t block, int map_flags) 901 { 902 struct ext4_map_blocks map; 903 struct buffer_head *bh; 904 int create = map_flags & EXT4_GET_BLOCKS_CREATE; 905 int err; 906 907 J_ASSERT(handle != NULL || create == 0); 908 909 map.m_lblk = block; 910 map.m_len = 1; 911 err = ext4_map_blocks(handle, inode, &map, map_flags); 912 913 if (err == 0) 914 return create ? ERR_PTR(-ENOSPC) : NULL; 915 if (err < 0) 916 return ERR_PTR(err); 917 918 bh = sb_getblk(inode->i_sb, map.m_pblk); 919 if (unlikely(!bh)) 920 return ERR_PTR(-ENOMEM); 921 if (map.m_flags & EXT4_MAP_NEW) { 922 J_ASSERT(create != 0); 923 J_ASSERT(handle != NULL); 924 925 /* 926 * Now that we do not always journal data, we should 927 * keep in mind whether this should always journal the 928 * new buffer as metadata. For now, regular file 929 * writes use ext4_get_block instead, so it's not a 930 * problem. 
931 */ 932 lock_buffer(bh); 933 BUFFER_TRACE(bh, "call get_create_access"); 934 err = ext4_journal_get_create_access(handle, bh); 935 if (unlikely(err)) { 936 unlock_buffer(bh); 937 goto errout; 938 } 939 if (!buffer_uptodate(bh)) { 940 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 941 set_buffer_uptodate(bh); 942 } 943 unlock_buffer(bh); 944 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 945 err = ext4_handle_dirty_metadata(handle, inode, bh); 946 if (unlikely(err)) 947 goto errout; 948 } else 949 BUFFER_TRACE(bh, "not a new buffer"); 950 return bh; 951 errout: 952 brelse(bh); 953 return ERR_PTR(err); 954 } 955 956 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 957 ext4_lblk_t block, int map_flags) 958 { 959 struct buffer_head *bh; 960 961 bh = ext4_getblk(handle, inode, block, map_flags); 962 if (IS_ERR(bh)) 963 return bh; 964 if (!bh || buffer_uptodate(bh)) 965 return bh; 966 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); 967 wait_on_buffer(bh); 968 if (buffer_uptodate(bh)) 969 return bh; 970 put_bh(bh); 971 return ERR_PTR(-EIO); 972 } 973 974 int ext4_walk_page_buffers(handle_t *handle, 975 struct buffer_head *head, 976 unsigned from, 977 unsigned to, 978 int *partial, 979 int (*fn)(handle_t *handle, 980 struct buffer_head *bh)) 981 { 982 struct buffer_head *bh; 983 unsigned block_start, block_end; 984 unsigned blocksize = head->b_size; 985 int err, ret = 0; 986 struct buffer_head *next; 987 988 for (bh = head, block_start = 0; 989 ret == 0 && (bh != head || !block_start); 990 block_start = block_end, bh = next) { 991 next = bh->b_this_page; 992 block_end = block_start + blocksize; 993 if (block_end <= from || block_start >= to) { 994 if (partial && !buffer_uptodate(bh)) 995 *partial = 1; 996 continue; 997 } 998 err = (*fn)(handle, bh); 999 if (!ret) 1000 ret = err; 1001 } 1002 return ret; 1003 } 1004 1005 /* 1006 * To preserve ordering, it is essential that the hole instantiation and 1007 * the data write be encapsulated in a 
single transaction.  We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the commit_write().  So doing the jbd2_journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext4_writepage().  In that case, we
 * *know* that ext4_writepage() has generated enough buffer credits to do the
 * whole page.  So we won't block on the journal in that case, which is good,
 * because the caller may be PF_MEMALLOC.
 *
 * By accident, ext4 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 *
 * do_journal_get_write_access() itself gets journal write access to a
 * single buffer.  Unmapped or freed buffers are skipped and 0 is
 * returned.  A buffer that was dirty on entry has its dirty bit cleared
 * first and is re-dirtied as journalled metadata once access has been
 * granted.
 */
int do_journal_get_write_access(handle_t *handle,
				struct buffer_head *bh)
{
	/* sample the dirty bit before it is cleared below */
	int dirty = buffer_dirty(bh);
	int ret;

	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	/*
	 * __block_write_begin() could have dirtied some buffers. Clean
	 * the dirty bit as jbd2_journal_get_write_access() could complain
	 * otherwise about fs integrity issues. Setting of the dirty bit
	 * by __block_write_begin() isn't a real problem here as we clear
	 * the bit before releasing a page lock and thus writeback cannot
	 * ever write the buffer.
	 */
	if (dirty)
		clear_buffer_dirty(bh);
	BUFFER_TRACE(bh, "get write access");
	ret = ext4_journal_get_write_access(handle, bh);
	if (!ret && dirty)
		ret = ext4_handle_dirty_metadata(handle, NULL, bh);
	return ret;
}

#ifdef CONFIG_EXT4_FS_ENCRYPTION
/*
 * Prepare the buffers of a locked @page for a write of @len bytes at
 * file position @pos, mapping unmapped buffers through @get_block.
 *
 * Buffers outside the written range are only marked uptodate when the
 * page already is.  Newly allocated buffers get the part outside the
 * write zeroed.  Buffers the write covers only partially are read in,
 * and if any read was issued for an encrypted regular file the page is
 * decrypted afterwards.
 *
 * Returns 0 on success or a negative error; on error new buffers in the
 * range are zeroed via page_zero_new_buffers().
 */
static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
				  get_block_t *get_block)
{
	unsigned from = pos & (PAGE_SIZE - 1);
	unsigned to = from + len;
	struct inode *inode = page->mapping->host;
	unsigned block_start, block_end;
	sector_t block;
	int err = 0;
	unsigned blocksize = inode->i_sb->s_blocksize;
	unsigned bbits;
	/* at most the first and the last buffer of the range are read */
	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
	bool decrypt = false;

	BUG_ON(!PageLocked(page));
	BUG_ON(from > PAGE_SIZE);
	BUG_ON(to > PAGE_SIZE);
	BUG_ON(from > to);

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	head = page_buffers(page);
	bbits = ilog2(blocksize);
	/* logical block number of the first buffer in the page */
	block = (sector_t)page->index << (PAGE_SHIFT - bbits);

	for (bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start = block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		/* buffer lies completely outside the write */
		if (block_end <= from || block_start >= to) {
			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				break;
			if (buffer_new(bh)) {
				unmap_underlying_metadata(bh->b_bdev,
							  bh->b_blocknr);
				if (PageUptodate(page)) {
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
					continue;
				}
				/* zero the parts of a new block the write skips */
				if (block_end > to || block_start < from)
					zero_user_segments(page, to, block_end,
							   block_start, from);
				continue;
			}
		}
		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue;
		}
		/* partially overwritten buffer with stale contents: read it */
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
		    (block_start < from || block_end > to)) {
			ll_rw_block(READ, 1, &bh);
			*wait_bh++ = bh;
			decrypt = ext4_encrypted_inode(inode) &&
				S_ISREG(inode->i_mode);
		}
	}
	/*
	 * If we issued read requests, let them complete.
	 */
	while (wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
	}
	if (unlikely(err))
		page_zero_new_buffers(page, from, to);
	else if (decrypt)
		err = ext4_decrypt(page);
	return err;
}
#endif

static int ext4_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	int ret, needed_blocks;
	handle_t *handle;
	int retries = 0;
	struct page *page;
	pgoff_t index;
	unsigned from, to;

	trace_ext4_write_begin(inode, pos, len, flags);
	/*
	 * Reserve one block more for addition to orphan list in case
	 * we allocate blocks but write fails for some reason
	 */
	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
	index = pos >> PAGE_SHIFT;
	from = pos & (PAGE_SIZE - 1);
	to = from + len;

	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
		ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
						    flags, pagep);
		if (ret < 0)
			return ret;
		/* a return of 1 is reported to the caller as success */
		if (ret == 1)
			return 0;
	}

	/*
	 * grab_cache_page_write_begin() can take a long time if the
	 * system is thrashing due to memory pressure, or if the page
	 * is being written back.  So grab it first before we start
	 * the transaction handle.
This also allows us to allocate 1178 * the page (if needed) without using GFP_NOFS. 1179 */ 1180 retry_grab: 1181 page = grab_cache_page_write_begin(mapping, index, flags); 1182 if (!page) 1183 return -ENOMEM; 1184 unlock_page(page); 1185 1186 retry_journal: 1187 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); 1188 if (IS_ERR(handle)) { 1189 put_page(page); 1190 return PTR_ERR(handle); 1191 } 1192 1193 lock_page(page); 1194 if (page->mapping != mapping) { 1195 /* The page got truncated from under us */ 1196 unlock_page(page); 1197 put_page(page); 1198 ext4_journal_stop(handle); 1199 goto retry_grab; 1200 } 1201 /* In case writeback began while the page was unlocked */ 1202 wait_for_stable_page(page); 1203 1204 #ifdef CONFIG_EXT4_FS_ENCRYPTION 1205 if (ext4_should_dioread_nolock(inode)) 1206 ret = ext4_block_write_begin(page, pos, len, 1207 ext4_get_block_unwritten); 1208 else 1209 ret = ext4_block_write_begin(page, pos, len, 1210 ext4_get_block); 1211 #else 1212 if (ext4_should_dioread_nolock(inode)) 1213 ret = __block_write_begin(page, pos, len, 1214 ext4_get_block_unwritten); 1215 else 1216 ret = __block_write_begin(page, pos, len, ext4_get_block); 1217 #endif 1218 if (!ret && ext4_should_journal_data(inode)) { 1219 ret = ext4_walk_page_buffers(handle, page_buffers(page), 1220 from, to, NULL, 1221 do_journal_get_write_access); 1222 } 1223 1224 if (ret) { 1225 unlock_page(page); 1226 /* 1227 * __block_write_begin may have instantiated a few blocks 1228 * outside i_size. Trim these off again. Don't need 1229 * i_size_read because we hold i_mutex. 
1230 * 1231 * Add inode to orphan list in case we crash before 1232 * truncate finishes 1233 */ 1234 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1235 ext4_orphan_add(handle, inode); 1236 1237 ext4_journal_stop(handle); 1238 if (pos + len > inode->i_size) { 1239 ext4_truncate_failed_write(inode); 1240 /* 1241 * If truncate failed early the inode might 1242 * still be on the orphan list; we need to 1243 * make sure the inode is removed from the 1244 * orphan list in that case. 1245 */ 1246 if (inode->i_nlink) 1247 ext4_orphan_del(NULL, inode); 1248 } 1249 1250 if (ret == -ENOSPC && 1251 ext4_should_retry_alloc(inode->i_sb, &retries)) 1252 goto retry_journal; 1253 put_page(page); 1254 return ret; 1255 } 1256 *pagep = page; 1257 return ret; 1258 } 1259 1260 /* For write_end() in data=journal mode */ 1261 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1262 { 1263 int ret; 1264 if (!buffer_mapped(bh) || buffer_freed(bh)) 1265 return 0; 1266 set_buffer_uptodate(bh); 1267 ret = ext4_handle_dirty_metadata(handle, NULL, bh); 1268 clear_buffer_meta(bh); 1269 clear_buffer_prio(bh); 1270 return ret; 1271 } 1272 1273 /* 1274 * We need to pick up the new inode size which generic_commit_write gave us 1275 * `file' can be NULL - eg, when called from page_symlink(). 1276 * 1277 * ext4 never places buffers on inode->i_mapping->private_list. metadata 1278 * buffers are managed internally. 
1279 */ 1280 static int ext4_write_end(struct file *file, 1281 struct address_space *mapping, 1282 loff_t pos, unsigned len, unsigned copied, 1283 struct page *page, void *fsdata) 1284 { 1285 handle_t *handle = ext4_journal_current_handle(); 1286 struct inode *inode = mapping->host; 1287 loff_t old_size = inode->i_size; 1288 int ret = 0, ret2; 1289 int i_size_changed = 0; 1290 1291 trace_ext4_write_end(inode, pos, len, copied); 1292 if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { 1293 ret = ext4_jbd2_file_inode(handle, inode); 1294 if (ret) { 1295 unlock_page(page); 1296 put_page(page); 1297 goto errout; 1298 } 1299 } 1300 1301 if (ext4_has_inline_data(inode)) { 1302 ret = ext4_write_inline_data_end(inode, pos, len, 1303 copied, page); 1304 if (ret < 0) 1305 goto errout; 1306 copied = ret; 1307 } else 1308 copied = block_write_end(file, mapping, pos, 1309 len, copied, page, fsdata); 1310 /* 1311 * it's important to update i_size while still holding page lock: 1312 * page writeout could otherwise come in and zero beyond i_size. 1313 */ 1314 i_size_changed = ext4_update_inode_size(inode, pos + copied); 1315 unlock_page(page); 1316 put_page(page); 1317 1318 if (old_size < pos) 1319 pagecache_isize_extended(inode, old_size, pos); 1320 /* 1321 * Don't mark the inode dirty under page lock. First, it unnecessarily 1322 * makes the holding time of page lock longer. Second, it forces lock 1323 * ordering of page lock and transaction start for journaling 1324 * filesystems. 1325 */ 1326 if (i_size_changed) 1327 ext4_mark_inode_dirty(handle, inode); 1328 1329 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1330 /* if we have allocated more blocks and copied 1331 * less. We will have blocks allocated outside 1332 * inode->i_size. 
So truncate them 1333 */ 1334 ext4_orphan_add(handle, inode); 1335 errout: 1336 ret2 = ext4_journal_stop(handle); 1337 if (!ret) 1338 ret = ret2; 1339 1340 if (pos + len > inode->i_size) { 1341 ext4_truncate_failed_write(inode); 1342 /* 1343 * If truncate failed early the inode might still be 1344 * on the orphan list; we need to make sure the inode 1345 * is removed from the orphan list in that case. 1346 */ 1347 if (inode->i_nlink) 1348 ext4_orphan_del(NULL, inode); 1349 } 1350 1351 return ret ? ret : copied; 1352 } 1353 1354 /* 1355 * This is a private version of page_zero_new_buffers() which doesn't 1356 * set the buffer to be dirty, since in data=journalled mode we need 1357 * to call ext4_handle_dirty_metadata() instead. 1358 */ 1359 static void zero_new_buffers(struct page *page, unsigned from, unsigned to) 1360 { 1361 unsigned int block_start = 0, block_end; 1362 struct buffer_head *head, *bh; 1363 1364 bh = head = page_buffers(page); 1365 do { 1366 block_end = block_start + bh->b_size; 1367 if (buffer_new(bh)) { 1368 if (block_end > from && block_start < to) { 1369 if (!PageUptodate(page)) { 1370 unsigned start, size; 1371 1372 start = max(from, block_start); 1373 size = min(to, block_end) - start; 1374 1375 zero_user(page, start, size); 1376 set_buffer_uptodate(bh); 1377 } 1378 clear_buffer_new(bh); 1379 } 1380 } 1381 block_start = block_end; 1382 bh = bh->b_this_page; 1383 } while (bh != head); 1384 } 1385 1386 static int ext4_journalled_write_end(struct file *file, 1387 struct address_space *mapping, 1388 loff_t pos, unsigned len, unsigned copied, 1389 struct page *page, void *fsdata) 1390 { 1391 handle_t *handle = ext4_journal_current_handle(); 1392 struct inode *inode = mapping->host; 1393 loff_t old_size = inode->i_size; 1394 int ret = 0, ret2; 1395 int partial = 0; 1396 unsigned from, to; 1397 int size_changed = 0; 1398 1399 trace_ext4_journalled_write_end(inode, pos, len, copied); 1400 from = pos & (PAGE_SIZE - 1); 1401 to = from + len; 1402 1403 
BUG_ON(!ext4_handle_valid(handle)); 1404 1405 if (ext4_has_inline_data(inode)) 1406 copied = ext4_write_inline_data_end(inode, pos, len, 1407 copied, page); 1408 else { 1409 if (copied < len) { 1410 if (!PageUptodate(page)) 1411 copied = 0; 1412 zero_new_buffers(page, from+copied, to); 1413 } 1414 1415 ret = ext4_walk_page_buffers(handle, page_buffers(page), from, 1416 to, &partial, write_end_fn); 1417 if (!partial) 1418 SetPageUptodate(page); 1419 } 1420 size_changed = ext4_update_inode_size(inode, pos + copied); 1421 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1422 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1423 unlock_page(page); 1424 put_page(page); 1425 1426 if (old_size < pos) 1427 pagecache_isize_extended(inode, old_size, pos); 1428 1429 if (size_changed) { 1430 ret2 = ext4_mark_inode_dirty(handle, inode); 1431 if (!ret) 1432 ret = ret2; 1433 } 1434 1435 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1436 /* if we have allocated more blocks and copied 1437 * less. We will have blocks allocated outside 1438 * inode->i_size. So truncate them 1439 */ 1440 ext4_orphan_add(handle, inode); 1441 1442 ret2 = ext4_journal_stop(handle); 1443 if (!ret) 1444 ret = ret2; 1445 if (pos + len > inode->i_size) { 1446 ext4_truncate_failed_write(inode); 1447 /* 1448 * If truncate failed early the inode might still be 1449 * on the orphan list; we need to make sure the inode 1450 * is removed from the orphan list in that case. 1451 */ 1452 if (inode->i_nlink) 1453 ext4_orphan_del(NULL, inode); 1454 } 1455 1456 return ret ? 
ret : copied; 1457 } 1458 1459 /* 1460 * Reserve space for a single cluster 1461 */ 1462 static int ext4_da_reserve_space(struct inode *inode) 1463 { 1464 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1465 struct ext4_inode_info *ei = EXT4_I(inode); 1466 int ret; 1467 1468 /* 1469 * We will charge metadata quota at writeout time; this saves 1470 * us from metadata over-estimation, though we may go over by 1471 * a small amount in the end. Here we just reserve for data. 1472 */ 1473 ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); 1474 if (ret) 1475 return ret; 1476 1477 spin_lock(&ei->i_block_reservation_lock); 1478 if (ext4_claim_free_clusters(sbi, 1, 0)) { 1479 spin_unlock(&ei->i_block_reservation_lock); 1480 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); 1481 return -ENOSPC; 1482 } 1483 ei->i_reserved_data_blocks++; 1484 trace_ext4_da_reserve_space(inode); 1485 spin_unlock(&ei->i_block_reservation_lock); 1486 1487 return 0; /* success */ 1488 } 1489 1490 static void ext4_da_release_space(struct inode *inode, int to_free) 1491 { 1492 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1493 struct ext4_inode_info *ei = EXT4_I(inode); 1494 1495 if (!to_free) 1496 return; /* Nothing to release, exit */ 1497 1498 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1499 1500 trace_ext4_da_release_space(inode, to_free); 1501 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1502 /* 1503 * if there aren't enough reserved blocks, then the 1504 * counter is messed up somewhere. Since this 1505 * function is called from invalidate page, it's 1506 * harmless to return without any action. 
1507 */ 1508 ext4_warning(inode->i_sb, "ext4_da_release_space: " 1509 "ino %lu, to_free %d with only %d reserved " 1510 "data blocks", inode->i_ino, to_free, 1511 ei->i_reserved_data_blocks); 1512 WARN_ON(1); 1513 to_free = ei->i_reserved_data_blocks; 1514 } 1515 ei->i_reserved_data_blocks -= to_free; 1516 1517 /* update fs dirty data blocks counter */ 1518 percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); 1519 1520 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1521 1522 dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); 1523 } 1524 1525 static void ext4_da_page_release_reservation(struct page *page, 1526 unsigned int offset, 1527 unsigned int length) 1528 { 1529 int to_release = 0, contiguous_blks = 0; 1530 struct buffer_head *head, *bh; 1531 unsigned int curr_off = 0; 1532 struct inode *inode = page->mapping->host; 1533 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1534 unsigned int stop = offset + length; 1535 int num_clusters; 1536 ext4_fsblk_t lblk; 1537 1538 BUG_ON(stop > PAGE_SIZE || stop < length); 1539 1540 head = page_buffers(page); 1541 bh = head; 1542 do { 1543 unsigned int next_off = curr_off + bh->b_size; 1544 1545 if (next_off > stop) 1546 break; 1547 1548 if ((offset <= curr_off) && (buffer_delay(bh))) { 1549 to_release++; 1550 contiguous_blks++; 1551 clear_buffer_delay(bh); 1552 } else if (contiguous_blks) { 1553 lblk = page->index << 1554 (PAGE_SHIFT - inode->i_blkbits); 1555 lblk += (curr_off >> inode->i_blkbits) - 1556 contiguous_blks; 1557 ext4_es_remove_extent(inode, lblk, contiguous_blks); 1558 contiguous_blks = 0; 1559 } 1560 curr_off = next_off; 1561 } while ((bh = bh->b_this_page) != head); 1562 1563 if (contiguous_blks) { 1564 lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); 1565 lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; 1566 ext4_es_remove_extent(inode, lblk, contiguous_blks); 1567 } 1568 1569 /* If we have released all the blocks belonging to a cluster, then we 1570 * need to 
release the reserved space for that cluster. */ 1571 num_clusters = EXT4_NUM_B2C(sbi, to_release); 1572 while (num_clusters > 0) { 1573 lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) + 1574 ((num_clusters - 1) << sbi->s_cluster_bits); 1575 if (sbi->s_cluster_ratio == 1 || 1576 !ext4_find_delalloc_cluster(inode, lblk)) 1577 ext4_da_release_space(inode, 1); 1578 1579 num_clusters--; 1580 } 1581 } 1582 1583 /* 1584 * Delayed allocation stuff 1585 */ 1586 1587 struct mpage_da_data { 1588 struct inode *inode; 1589 struct writeback_control *wbc; 1590 1591 pgoff_t first_page; /* The first page to write */ 1592 pgoff_t next_page; /* Current page to examine */ 1593 pgoff_t last_page; /* Last page to examine */ 1594 /* 1595 * Extent to map - this can be after first_page because that can be 1596 * fully mapped. We somewhat abuse m_flags to store whether the extent 1597 * is delalloc or unwritten. 1598 */ 1599 struct ext4_map_blocks map; 1600 struct ext4_io_submit io_submit; /* IO submission data */ 1601 }; 1602 1603 static void mpage_release_unused_pages(struct mpage_da_data *mpd, 1604 bool invalidate) 1605 { 1606 int nr_pages, i; 1607 pgoff_t index, end; 1608 struct pagevec pvec; 1609 struct inode *inode = mpd->inode; 1610 struct address_space *mapping = inode->i_mapping; 1611 1612 /* This is necessary when next_page == 0. 
*/ 1613 if (mpd->first_page >= mpd->next_page) 1614 return; 1615 1616 index = mpd->first_page; 1617 end = mpd->next_page - 1; 1618 if (invalidate) { 1619 ext4_lblk_t start, last; 1620 start = index << (PAGE_SHIFT - inode->i_blkbits); 1621 last = end << (PAGE_SHIFT - inode->i_blkbits); 1622 ext4_es_remove_extent(inode, start, last - start + 1); 1623 } 1624 1625 pagevec_init(&pvec, 0); 1626 while (index <= end) { 1627 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1628 if (nr_pages == 0) 1629 break; 1630 for (i = 0; i < nr_pages; i++) { 1631 struct page *page = pvec.pages[i]; 1632 if (page->index > end) 1633 break; 1634 BUG_ON(!PageLocked(page)); 1635 BUG_ON(PageWriteback(page)); 1636 if (invalidate) { 1637 block_invalidatepage(page, 0, PAGE_SIZE); 1638 ClearPageUptodate(page); 1639 } 1640 unlock_page(page); 1641 } 1642 index = pvec.pages[nr_pages - 1]->index + 1; 1643 pagevec_release(&pvec); 1644 } 1645 } 1646 1647 static void ext4_print_free_blocks(struct inode *inode) 1648 { 1649 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1650 struct super_block *sb = inode->i_sb; 1651 struct ext4_inode_info *ei = EXT4_I(inode); 1652 1653 ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", 1654 EXT4_C2B(EXT4_SB(inode->i_sb), 1655 ext4_count_free_clusters(sb))); 1656 ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); 1657 ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", 1658 (long long) EXT4_C2B(EXT4_SB(sb), 1659 percpu_counter_sum(&sbi->s_freeclusters_counter))); 1660 ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", 1661 (long long) EXT4_C2B(EXT4_SB(sb), 1662 percpu_counter_sum(&sbi->s_dirtyclusters_counter))); 1663 ext4_msg(sb, KERN_CRIT, "Block reservation details"); 1664 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", 1665 ei->i_reserved_data_blocks); 1666 return; 1667 } 1668 1669 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) 1670 { 1671 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); 1672 } 1673 
1674 /* 1675 * This function is grabs code from the very beginning of 1676 * ext4_map_blocks, but assumes that the caller is from delayed write 1677 * time. This function looks up the requested blocks and sets the 1678 * buffer delay bit under the protection of i_data_sem. 1679 */ 1680 static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, 1681 struct ext4_map_blocks *map, 1682 struct buffer_head *bh) 1683 { 1684 struct extent_status es; 1685 int retval; 1686 sector_t invalid_block = ~((sector_t) 0xffff); 1687 #ifdef ES_AGGRESSIVE_TEST 1688 struct ext4_map_blocks orig_map; 1689 1690 memcpy(&orig_map, map, sizeof(*map)); 1691 #endif 1692 1693 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) 1694 invalid_block = ~0; 1695 1696 map->m_flags = 0; 1697 ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," 1698 "logical block %lu\n", inode->i_ino, map->m_len, 1699 (unsigned long) map->m_lblk); 1700 1701 /* Lookup extent status tree firstly */ 1702 if (ext4_es_lookup_extent(inode, iblock, &es)) { 1703 if (ext4_es_is_hole(&es)) { 1704 retval = 0; 1705 down_read(&EXT4_I(inode)->i_data_sem); 1706 goto add_delayed; 1707 } 1708 1709 /* 1710 * Delayed extent could be allocated by fallocate. 1711 * So we need to check it. 
1712 */ 1713 if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { 1714 map_bh(bh, inode->i_sb, invalid_block); 1715 set_buffer_new(bh); 1716 set_buffer_delay(bh); 1717 return 0; 1718 } 1719 1720 map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; 1721 retval = es.es_len - (iblock - es.es_lblk); 1722 if (retval > map->m_len) 1723 retval = map->m_len; 1724 map->m_len = retval; 1725 if (ext4_es_is_written(&es)) 1726 map->m_flags |= EXT4_MAP_MAPPED; 1727 else if (ext4_es_is_unwritten(&es)) 1728 map->m_flags |= EXT4_MAP_UNWRITTEN; 1729 else 1730 BUG_ON(1); 1731 1732 #ifdef ES_AGGRESSIVE_TEST 1733 ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); 1734 #endif 1735 return retval; 1736 } 1737 1738 /* 1739 * Try to see if we can get the block without requesting a new 1740 * file system block. 1741 */ 1742 down_read(&EXT4_I(inode)->i_data_sem); 1743 if (ext4_has_inline_data(inode)) 1744 retval = 0; 1745 else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1746 retval = ext4_ext_map_blocks(NULL, inode, map, 0); 1747 else 1748 retval = ext4_ind_map_blocks(NULL, inode, map, 0); 1749 1750 add_delayed: 1751 if (retval == 0) { 1752 int ret; 1753 /* 1754 * XXX: __block_prepare_write() unmaps passed block, 1755 * is it OK? 1756 */ 1757 /* 1758 * If the block was allocated from previously allocated cluster, 1759 * then we don't need to reserve it again. However we still need 1760 * to reserve metadata for every block we're going to write. 
1761 */ 1762 if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || 1763 !ext4_find_delalloc_cluster(inode, map->m_lblk)) { 1764 ret = ext4_da_reserve_space(inode); 1765 if (ret) { 1766 /* not enough space to reserve */ 1767 retval = ret; 1768 goto out_unlock; 1769 } 1770 } 1771 1772 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 1773 ~0, EXTENT_STATUS_DELAYED); 1774 if (ret) { 1775 retval = ret; 1776 goto out_unlock; 1777 } 1778 1779 map_bh(bh, inode->i_sb, invalid_block); 1780 set_buffer_new(bh); 1781 set_buffer_delay(bh); 1782 } else if (retval > 0) { 1783 int ret; 1784 unsigned int status; 1785 1786 if (unlikely(retval != map->m_len)) { 1787 ext4_warning(inode->i_sb, 1788 "ES len assertion failed for inode " 1789 "%lu: retval %d != map->m_len %d", 1790 inode->i_ino, retval, map->m_len); 1791 WARN_ON(1); 1792 } 1793 1794 status = map->m_flags & EXT4_MAP_UNWRITTEN ? 1795 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 1796 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 1797 map->m_pblk, status); 1798 if (ret != 0) 1799 retval = ret; 1800 } 1801 1802 out_unlock: 1803 up_read((&EXT4_I(inode)->i_data_sem)); 1804 1805 return retval; 1806 } 1807 1808 /* 1809 * This is a special get_block_t callback which is used by 1810 * ext4_da_write_begin(). It will either return mapped block or 1811 * reserve space for a single block. 1812 * 1813 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. 1814 * We also have b_blocknr = -1 and b_bdev initialized properly 1815 * 1816 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. 1817 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev 1818 * initialized properly. 
1819 */ 1820 int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 1821 struct buffer_head *bh, int create) 1822 { 1823 struct ext4_map_blocks map; 1824 int ret = 0; 1825 1826 BUG_ON(create == 0); 1827 BUG_ON(bh->b_size != inode->i_sb->s_blocksize); 1828 1829 map.m_lblk = iblock; 1830 map.m_len = 1; 1831 1832 /* 1833 * first, we need to know whether the block is allocated already 1834 * preallocated blocks are unmapped but should treated 1835 * the same as allocated blocks. 1836 */ 1837 ret = ext4_da_map_blocks(inode, iblock, &map, bh); 1838 if (ret <= 0) 1839 return ret; 1840 1841 map_bh(bh, inode->i_sb, map.m_pblk); 1842 ext4_update_bh_state(bh, map.m_flags); 1843 1844 if (buffer_unwritten(bh)) { 1845 /* A delayed write to unwritten bh should be marked 1846 * new and mapped. Mapped ensures that we don't do 1847 * get_block multiple times when we write to the same 1848 * offset and new ensures that we do proper zero out 1849 * for partial write. 1850 */ 1851 set_buffer_new(bh); 1852 set_buffer_mapped(bh); 1853 } 1854 return 0; 1855 } 1856 1857 static int bget_one(handle_t *handle, struct buffer_head *bh) 1858 { 1859 get_bh(bh); 1860 return 0; 1861 } 1862 1863 static int bput_one(handle_t *handle, struct buffer_head *bh) 1864 { 1865 put_bh(bh); 1866 return 0; 1867 } 1868 1869 static int __ext4_journalled_writepage(struct page *page, 1870 unsigned int len) 1871 { 1872 struct address_space *mapping = page->mapping; 1873 struct inode *inode = mapping->host; 1874 struct buffer_head *page_bufs = NULL; 1875 handle_t *handle = NULL; 1876 int ret = 0, err = 0; 1877 int inline_data = ext4_has_inline_data(inode); 1878 struct buffer_head *inode_bh = NULL; 1879 1880 ClearPageChecked(page); 1881 1882 if (inline_data) { 1883 BUG_ON(page->index != 0); 1884 BUG_ON(len > ext4_get_max_inline_size(inode)); 1885 inode_bh = ext4_journalled_write_inline_data(inode, len, page); 1886 if (inode_bh == NULL) 1887 goto out; 1888 } else { 1889 page_bufs = page_buffers(page); 1890 if 
(!page_bufs) { 1891 BUG(); 1892 goto out; 1893 } 1894 ext4_walk_page_buffers(handle, page_bufs, 0, len, 1895 NULL, bget_one); 1896 } 1897 /* 1898 * We need to release the page lock before we start the 1899 * journal, so grab a reference so the page won't disappear 1900 * out from under us. 1901 */ 1902 get_page(page); 1903 unlock_page(page); 1904 1905 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1906 ext4_writepage_trans_blocks(inode)); 1907 if (IS_ERR(handle)) { 1908 ret = PTR_ERR(handle); 1909 put_page(page); 1910 goto out_no_pagelock; 1911 } 1912 BUG_ON(!ext4_handle_valid(handle)); 1913 1914 lock_page(page); 1915 put_page(page); 1916 if (page->mapping != mapping) { 1917 /* The page got truncated from under us */ 1918 ext4_journal_stop(handle); 1919 ret = 0; 1920 goto out; 1921 } 1922 1923 if (inline_data) { 1924 BUFFER_TRACE(inode_bh, "get write access"); 1925 ret = ext4_journal_get_write_access(handle, inode_bh); 1926 1927 err = ext4_handle_dirty_metadata(handle, inode, inode_bh); 1928 1929 } else { 1930 ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, 1931 do_journal_get_write_access); 1932 1933 err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, 1934 write_end_fn); 1935 } 1936 if (ret == 0) 1937 ret = err; 1938 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1939 err = ext4_journal_stop(handle); 1940 if (!ret) 1941 ret = err; 1942 1943 if (!ext4_has_inline_data(inode)) 1944 ext4_walk_page_buffers(NULL, page_bufs, 0, len, 1945 NULL, bput_one); 1946 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1947 out: 1948 unlock_page(page); 1949 out_no_pagelock: 1950 brelse(inode_bh); 1951 return ret; 1952 } 1953 1954 /* 1955 * Note that we don't need to start a transaction unless we're journaling data 1956 * because we should have holes filled from ext4_page_mkwrite(). 
We don't even
 * need to file the inode to the transaction's list in ordered mode because if
 * we are writing back data added by write(), the inode is already there and if
 * we are writing back data modified via mmap(), no one guarantees in which
 * transaction the data will hit the disk. In case we are journaling data, we
 * cannot start a transaction directly because transaction start ranks above
 * page lock so we have to do some magic.
 *
 * This function can get called via...
 *   - ext4_writepages after taking page lock (have journal handle)
 *   - journal_submit_inode_data_buffers (no journal handle)
 *   - shrink_page_list via the kswapd/direct reclaim (no journal handle)
 *   - grab_page_cache when doing write_begin (have journal handle)
 *
 * We don't do any block allocation in this function. If we have a page with
 * multiple blocks we need to write those buffer_heads that are mapped. This
 * is important for mmapped based writes. So if we do with blocksize 1K
 * truncate(f, 1024);
 * a = mmap(f, 0, 4096);
 * a[0] = 'a';
 * truncate(f, 4096);
 * we have in the page the first buffer_head mapped via the page_mkwrite
 * callback but other buffer_heads would be unmapped but dirty (dirty done via
 * the do_wp_page). So writepage should write the first block. If we modify
 * the mmap area beyond 1024 we will again get a page_fault and the
 * page_mkwrite callback will do the block allocation and mark the
 * buffer_heads mapped.
 *
 * We redirty the page if we have any buffer_heads that are either delay or
 * unwritten in the page.
 *
 * We can get recursively called as shown below.
 *
 *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
 *		ext4_writepage()
 *
 * But since we don't do any block allocation we should not deadlock.
 * The page also has the dirty flag cleared so we don't get recursive page_lock.
1994 */ 1995 static int ext4_writepage(struct page *page, 1996 struct writeback_control *wbc) 1997 { 1998 int ret = 0; 1999 loff_t size; 2000 unsigned int len; 2001 struct buffer_head *page_bufs = NULL; 2002 struct inode *inode = page->mapping->host; 2003 struct ext4_io_submit io_submit; 2004 bool keep_towrite = false; 2005 2006 trace_ext4_writepage(page); 2007 size = i_size_read(inode); 2008 if (page->index == size >> PAGE_SHIFT) 2009 len = size & ~PAGE_MASK; 2010 else 2011 len = PAGE_SIZE; 2012 2013 page_bufs = page_buffers(page); 2014 /* 2015 * We cannot do block allocation or other extent handling in this 2016 * function. If there are buffers needing that, we have to redirty 2017 * the page. But we may reach here when we do a journal commit via 2018 * journal_submit_inode_data_buffers() and in that case we must write 2019 * allocated buffers to achieve data=ordered mode guarantees. 2020 * 2021 * Also, if there is only one buffer per page (the fs block 2022 * size == the page size), if one buffer needs block 2023 * allocation or needs to modify the extent tree to clear the 2024 * unwritten flag, we know that the page can't be written at 2025 * all, so we might as well refuse the write immediately. 2026 * Unfortunately if the block size != page size, we can't as 2027 * easily detect this case using ext4_walk_page_buffers(), but 2028 * for the extremely common case, this is an optimization that 2029 * skips a useless round trip through ext4_bio_write_page(). 2030 */ 2031 if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2032 ext4_bh_delay_or_unwritten)) { 2033 redirty_page_for_writepage(wbc, page); 2034 if ((current->flags & PF_MEMALLOC) || 2035 (inode->i_sb->s_blocksize == PAGE_SIZE)) { 2036 /* 2037 * For memory cleaning there's no point in writing only 2038 * some buffers. So just bail out. Warn if we came here 2039 * from direct reclaim. 
2040 */ 2041 WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) 2042 == PF_MEMALLOC); 2043 unlock_page(page); 2044 return 0; 2045 } 2046 keep_towrite = true; 2047 } 2048 2049 if (PageChecked(page) && ext4_should_journal_data(inode)) 2050 /* 2051 * It's mmapped pagecache. Add buffers and journal it. There 2052 * doesn't seem much point in redirtying the page here. 2053 */ 2054 return __ext4_journalled_writepage(page, len); 2055 2056 ext4_io_submit_init(&io_submit, wbc); 2057 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); 2058 if (!io_submit.io_end) { 2059 redirty_page_for_writepage(wbc, page); 2060 unlock_page(page); 2061 return -ENOMEM; 2062 } 2063 ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); 2064 ext4_io_submit(&io_submit); 2065 /* Drop io_end reference we got from init */ 2066 ext4_put_io_end_defer(io_submit.io_end); 2067 return ret; 2068 } 2069 2070 static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) 2071 { 2072 int len; 2073 loff_t size = i_size_read(mpd->inode); 2074 int err; 2075 2076 BUG_ON(page->index != mpd->first_page); 2077 if (page->index == size >> PAGE_SHIFT) 2078 len = size & ~PAGE_MASK; 2079 else 2080 len = PAGE_SIZE; 2081 clear_page_dirty_for_io(page); 2082 err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); 2083 if (!err) 2084 mpd->wbc->nr_to_write--; 2085 mpd->first_page++; 2086 2087 return err; 2088 } 2089 2090 #define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) 2091 2092 /* 2093 * mballoc gives us at most this number of blocks... 2094 * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). 2095 * The rest of mballoc seems to handle chunks up to full group size. 
2096 */ 2097 #define MAX_WRITEPAGES_EXTENT_LEN 2048 2098 2099 /* 2100 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map 2101 * 2102 * @mpd - extent of blocks 2103 * @lblk - logical number of the block in the file 2104 * @bh - buffer head we want to add to the extent 2105 * 2106 * The function is used to collect contig. blocks in the same state. If the 2107 * buffer doesn't require mapping for writeback and we haven't started the 2108 * extent of buffers to map yet, the function returns 'true' immediately - the 2109 * caller can write the buffer right away. Otherwise the function returns true 2110 * if the block has been added to the extent, false if the block couldn't be 2111 * added. 2112 */ 2113 static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, 2114 struct buffer_head *bh) 2115 { 2116 struct ext4_map_blocks *map = &mpd->map; 2117 2118 /* Buffer that doesn't need mapping for writeback? */ 2119 if (!buffer_dirty(bh) || !buffer_mapped(bh) || 2120 (!buffer_delay(bh) && !buffer_unwritten(bh))) { 2121 /* So far no extent to map => we write the buffer right away */ 2122 if (map->m_len == 0) 2123 return true; /* An extent is already being collected, so this buffer ends it */ 2124 return false; 2125 } 2126 2127 /* First block in the extent? */ 2128 if (map->m_len == 0) { 2129 map->m_lblk = lblk; 2130 map->m_len = 1; 2131 map->m_flags = bh->b_state & BH_FLAGS; 2132 return true; 2133 } 2134 2135 /* Don't go larger than mballoc is willing to allocate */ 2136 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) 2137 return false; 2138 2139 /* Can we merge the block to our big extent? 
*/ 2140 if (lblk == map->m_lblk + map->m_len && 2141 (bh->b_state & BH_FLAGS) == map->m_flags) { 2142 map->m_len++; 2143 return true; 2144 } 2145 return false; 2146 } 2147 2148 /* 2149 * mpage_process_page_bufs - submit page buffers for IO or add them to extent 2150 * 2151 * @mpd - extent of blocks for mapping 2152 * @head - the first buffer in the page 2153 * @bh - buffer we should start processing from 2154 * @lblk - logical number of the block in the file corresponding to @bh 2155 * 2156 * Walk through page buffers from @bh up to @head (exclusive) and either submit 2157 * the page for IO if all buffers in this page were mapped and there's no 2158 * accumulated extent of buffers to map or add buffers in the page to the 2159 * extent of buffers to map. The function returns 1 if the caller can continue 2160 * by processing the next page, 0 if it should stop adding buffers to the 2161 * extent to map because we cannot extend it anymore. It can also return value 2162 * < 0 in case of error during IO submission. 2163 */ 2164 static int mpage_process_page_bufs(struct mpage_da_data *mpd, 2165 struct buffer_head *head, 2166 struct buffer_head *bh, 2167 ext4_lblk_t lblk) 2168 { 2169 struct inode *inode = mpd->inode; 2170 int err; /* blocks: number of filesystem blocks covering i_size, rounded up */ 2171 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) 2172 >> inode->i_blkbits; 2173 2174 do { 2175 BUG_ON(buffer_locked(bh)); 2176 2177 if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { 2178 /* Found extent to map? */ 2179 if (mpd->map.m_len) 2180 return 0; 2181 /* Everything mapped so far and we hit EOF */ 2182 break; 2183 } 2184 } while (lblk++, (bh = bh->b_this_page) != head); 2185 /* So far everything mapped? Submit the page for IO. 
*/ 2186 if (mpd->map.m_len == 0) { 2187 err = mpage_submit_page(mpd, head->b_page); 2188 if (err < 0) 2189 return err; 2190 } /* Tell the caller to go on to the next page only while we are still below the end of the file */ 2191 return lblk < blocks; 2192 } 2193 2194 /* 2195 * mpage_map_and_submit_buffers - update buffers corresponding to changed 2196 * extent and submit fully mapped pages for IO 2197 * 2198 * @mpd - description of extent to map, on return next extent to map 2199 * 2200 * Scan buffers corresponding to changed extent (we expect corresponding pages 2201 * to be already locked) and update buffer state according to new extent state. 2202 * We map delalloc buffers to their physical location, clear unwritten bits, 2203 * and mark buffers as uninit when we perform writes to unwritten extents 2204 * and do extent conversion after IO is finished. If the last page is not fully 2205 * mapped, we update @map to the next extent in the last page that needs 2206 * mapping. Otherwise we submit the page for IO. Returns 0 on success or a negative error from page submission. 2207 */ 2208 static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) 2209 { 2210 struct pagevec pvec; 2211 int nr_pages, i; 2212 struct inode *inode = mpd->inode; 2213 struct buffer_head *head, *bh; 2214 int bpp_bits = PAGE_SHIFT - inode->i_blkbits; 2215 pgoff_t start, end; 2216 ext4_lblk_t lblk; 2217 sector_t pblock; 2218 int err; 2219 2220 start = mpd->map.m_lblk >> bpp_bits; 2221 end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; 2222 lblk = start << bpp_bits; 2223 pblock = mpd->map.m_pblk; 2224 2225 pagevec_init(&pvec, 0); 2226 while (start <= end) { 2227 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, 2228 PAGEVEC_SIZE); 2229 if (nr_pages == 0) 2230 break; 2231 for (i = 0; i < nr_pages; i++) { 2232 struct page *page = pvec.pages[i]; 2233 2234 if (page->index > end) 2235 break; 2236 /* Up to 'end' pages must be contiguous */ 2237 BUG_ON(page->index != start); 2238 bh = head = page_buffers(page); 2239 do { /* Buffer lies before the mapped extent: continue jumps to the loop condition, which still advances lblk and bh */ 2240 if (lblk < mpd->map.m_lblk) 2241 continue; 2242 if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { 2243 /* 2244 * Buffer after 
end of mapped extent. 2245 * Find next buffer in the page to map. 2246 */ 2247 mpd->map.m_len = 0; 2248 mpd->map.m_flags = 0; 2249 /* 2250 * FIXME: If dioread_nolock supports 2251 * blocksize < pagesize, we need to make 2252 * sure we add size mapped so far to 2253 * io_end->size as the following call 2254 * can submit the page for IO. 2255 */ 2256 err = mpage_process_page_bufs(mpd, head, 2257 bh, lblk); 2258 pagevec_release(&pvec); /* A positive result only means that the next page could be processed; report success */ 2259 if (err > 0) 2260 err = 0; 2261 return err; 2262 } 2263 if (buffer_delay(bh)) { 2264 clear_buffer_delay(bh); 2265 bh->b_blocknr = pblock++; 2266 } 2267 clear_buffer_unwritten(bh); 2268 } while (lblk++, (bh = bh->b_this_page) != head); 2269 2270 /* 2271 * FIXME: This is going to break if dioread_nolock 2272 * supports blocksize < pagesize as we will try to 2273 * convert potentially unmapped parts of inode. 2274 */ 2275 mpd->io_submit.io_end->size += PAGE_SIZE; 2276 /* Page fully mapped - let IO run! */ 2277 err = mpage_submit_page(mpd, page); 2278 if (err < 0) { 2279 pagevec_release(&pvec); 2280 return err; 2281 } 2282 start++; 2283 } 2284 pagevec_release(&pvec); 2285 } 2286 /* Extent fully mapped and matches with page boundary. We are done. */ 2287 mpd->map.m_len = 0; 2288 mpd->map.m_flags = 0; 2289 return 0; 2290 } 2291 /* mpage_map_one_extent - map the extent described by mpd->map with ext4_map_blocks(). @handle - journal handle passed to ext4_map_blocks(). @mpd - holds the extent to map and the io_end in use. Returns 0 on success or the negative error from ext4_map_blocks(). */ 2292 static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) 2293 { 2294 struct inode *inode = mpd->inode; 2295 struct ext4_map_blocks *map = &mpd->map; 2296 int get_blocks_flags; 2297 int err, dioread_nolock; 2298 2299 trace_ext4_da_write_pages_extent(inode, map); 2300 /* 2301 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or 2302 * to convert an unwritten extent to be initialized (in the case 2303 * where we have written into one or more preallocated blocks). It is 2304 * possible that we're going to need more metadata blocks than 2305 * previously reserved. 
However we must not fail because we're in 2306 * writeback and there is nothing we can do about it so it might result 2307 * in data loss. So use reserved blocks to allocate metadata if 2308 * possible. 2309 * 2310 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if 2311 * the blocks in question are delalloc blocks. This indicates 2312 * that the blocks and quotas have already been checked when 2313 * the data was copied into the page cache. 2314 */ 2315 get_blocks_flags = EXT4_GET_BLOCKS_CREATE | 2316 EXT4_GET_BLOCKS_METADATA_NOFAIL; 2317 dioread_nolock = ext4_should_dioread_nolock(inode); 2318 if (dioread_nolock) 2319 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2320 if (map->m_flags & (1 << BH_Delay)) 2321 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2322 2323 err = ext4_map_blocks(handle, inode, map, get_blocks_flags); 2324 if (err < 0) 2325 return err; /* Still unwritten under dioread_nolock: move the reserved handle to the io_end (unless it already has one) and flag the io_end as unwritten */ 2326 if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { 2327 if (!mpd->io_submit.io_end->handle && 2328 ext4_handle_valid(handle)) { 2329 mpd->io_submit.io_end->handle = handle->h_rsv_handle; 2330 handle->h_rsv_handle = NULL; 2331 } 2332 ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); 2333 } 2334 2335 BUG_ON(map->m_len == 0); 2336 if (map->m_flags & EXT4_MAP_NEW) { 2337 struct block_device *bdev = inode->i_sb->s_bdev; 2338 int i; 2339 2340 for (i = 0; i < map->m_len; i++) 2341 unmap_underlying_metadata(bdev, map->m_pblk + i); 2342 } 2343 return 0; 2344 } 2345 2346 /* 2347 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length 2348 * mpd->len and submit pages underlying it for IO 2349 * 2350 * @handle - handle for journal operations 2351 * @mpd - extent to map 2352 * @give_up_on_write - we set this to true iff there is a fatal error and there 2353 * is no hope of writing the data. The caller should discard 2354 * dirty pages to avoid infinite loops. 2355 * 2356 * The function maps extent starting at mpd->lblk of length mpd->len. 
If it is 2357 * delayed, blocks are allocated, if it is unwritten, we may need to convert 2358 * them to initialized or split the described range from larger unwritten 2359 * extent. Note that we need not map all the described range since allocation 2360 * can return less blocks or the range is covered by more unwritten extents. We 2361 * cannot map more because we are limited by reserved transaction credits. On 2362 * the other hand we always make sure that the last touched page is fully 2363 * mapped so that it can be written out (and thus forward progress is 2364 * guaranteed). After mapping we submit all mapped pages for IO. 2365 */ 2366 static int mpage_map_and_submit_extent(handle_t *handle, 2367 struct mpage_da_data *mpd, 2368 bool *give_up_on_write) 2369 { 2370 struct inode *inode = mpd->inode; 2371 struct ext4_map_blocks *map = &mpd->map; 2372 int err; 2373 loff_t disksize; 2374 int progress = 0; /* progress becomes 1 once an extent has been mapped; a later transient failure then still goes through the i_disksize update instead of returning directly */ 2375 2376 mpd->io_submit.io_end->offset = 2377 ((loff_t)map->m_lblk) << inode->i_blkbits; 2378 do { 2379 err = mpage_map_one_extent(handle, mpd); 2380 if (err < 0) { 2381 struct super_block *sb = inode->i_sb; 2382 2383 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) 2384 goto invalidate_dirty_pages; 2385 /* 2386 * Let the upper layers retry transient errors. 2387 * In the case of ENOSPC, if ext4_count_free_clusters() 2388 * is non-zero, a commit should free up blocks. 2389 */ 2390 if ((err == -ENOMEM) || 2391 (err == -ENOSPC && ext4_count_free_clusters(sb))) { 2392 if (progress) 2393 goto update_disksize; 2394 return err; 2395 } 2396 ext4_msg(sb, KERN_CRIT, 2397 "Delayed block allocation failed for " 2398 "inode %lu at logical offset %llu with" 2399 " max blocks %u with error %d", 2400 inode->i_ino, 2401 (unsigned long long)map->m_lblk, 2402 (unsigned)map->m_len, -err); 2403 ext4_msg(sb, KERN_CRIT, 2404 "This should not happen!! 
Data will " 2405 "be lost\n"); 2406 if (err == -ENOSPC) 2407 ext4_print_free_blocks(inode); 2408 invalidate_dirty_pages: 2409 *give_up_on_write = true; 2410 return err; 2411 } 2412 progress = 1; 2413 /* 2414 * Update buffer state, submit mapped pages, and get us new 2415 * extent to map 2416 */ 2417 err = mpage_map_and_submit_buffers(mpd); 2418 if (err < 0) 2419 goto update_disksize; 2420 } while (map->m_len); 2421 2422 update_disksize: 2423 /* 2424 * Update on-disk size after IO is submitted. Races with 2425 * truncate are avoided by checking i_size under i_data_sem. 2426 */ 2427 disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; 2428 if (disksize > EXT4_I(inode)->i_disksize) { 2429 int err2; 2430 loff_t i_size; 2431 2432 down_write(&EXT4_I(inode)->i_data_sem); 2433 i_size = i_size_read(inode); 2434 if (disksize > i_size) 2435 disksize = i_size; 2436 if (disksize > EXT4_I(inode)->i_disksize) 2437 EXT4_I(inode)->i_disksize = disksize; 2438 err2 = ext4_mark_inode_dirty(handle, inode); 2439 up_write(&EXT4_I(inode)->i_data_sem); 2440 if (err2) 2441 ext4_error(inode->i_sb, 2442 "Failed to mark inode %lu dirty", 2443 inode->i_ino); 2444 if (!err) 2445 err = err2; 2446 } 2447 return err; 2448 } 2449 2450 /* 2451 * Calculate the total number of credits to reserve for one writepages 2452 * iteration. This is called from ext4_writepages(). We map an extent of 2453 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping 2454 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + 2455 * bpp - 1 blocks in bpp different extents. 
2456 */ 2457 static int ext4_da_writepages_trans_blocks(struct inode *inode) 2458 { 2459 int bpp = ext4_journal_blocks_per_page(inode); /* Ask ext4_meta_trans_blocks() about MAX_WRITEPAGES_EXTENT_LEN + bpp - 1 blocks that may be spread over up to bpp extents, as described in the comment before this function */ 2460 2461 return ext4_meta_trans_blocks(inode, 2462 MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); 2463 } 2464 2465 /* 2466 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages 2467 * and underlying extent to map 2468 * 2469 * @mpd - where to look for pages 2470 * 2471 * Walk dirty pages in the mapping. If they are fully mapped, submit them for 2472 * IO immediately. When we find a page which isn't mapped we start accumulating 2473 * extent of buffers underlying these pages that needs mapping (formed by 2474 * either delayed or unwritten buffers). We also lock the pages containing 2475 * these buffers. The extent found is returned in @mpd structure (starting at 2476 * mpd->lblk with length mpd->len blocks). 2477 * 2478 * Note that this function can attach bios to one io_end structure which are 2479 * neither logically nor physically contiguous. Although it may seem as an 2480 * unnecessary complication, it is actually inevitable in blocksize < pagesize 2481 * case as we need to track IO to all buffers underlying a page in one io_end. 
2482 */ 2483 static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) 2484 { 2485 struct address_space *mapping = mpd->inode->i_mapping; 2486 struct pagevec pvec; 2487 unsigned int nr_pages; 2488 long left = mpd->wbc->nr_to_write; 2489 pgoff_t index = mpd->first_page; 2490 pgoff_t end = mpd->last_page; 2491 int tag; 2492 int i, err = 0; 2493 int blkbits = mpd->inode->i_blkbits; 2494 ext4_lblk_t lblk; 2495 struct buffer_head *head; 2496 /* Integrity and tagged writeback only look at pages tagged TOWRITE; otherwise all DIRTY-tagged pages are candidates */ 2497 if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) 2498 tag = PAGECACHE_TAG_TOWRITE; 2499 else 2500 tag = PAGECACHE_TAG_DIRTY; 2501 2502 pagevec_init(&pvec, 0); 2503 mpd->map.m_len = 0; 2504 mpd->next_page = index; 2505 while (index <= end) { 2506 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2507 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2508 if (nr_pages == 0) 2509 goto out; 2510 2511 for (i = 0; i < nr_pages; i++) { 2512 struct page *page = pvec.pages[i]; 2513 2514 /* 2515 * At this point, the page may be truncated or 2516 * invalidated (changing page->mapping to NULL), or 2517 * even swizzled back from swapper_space to tmpfs file 2518 * mapping. However, page->index will not change 2519 * because we have a reference on the page. 2520 */ 2521 if (page->index > end) 2522 goto out; 2523 2524 /* 2525 * Accumulated enough dirty pages? This doesn't apply 2526 * to WB_SYNC_ALL mode. For integrity sync we have to 2527 * keep going because someone may be concurrently 2528 * dirtying pages, and we might have synced a lot of 2529 * newly appeared dirty pages, but have not synced all 2530 * of the old dirty pages. 2531 */ 2532 if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) 2533 goto out; 2534 2535 /* If we can't merge this page, we are done. 
*/ 2536 if (mpd->map.m_len > 0 && mpd->next_page != page->index) 2537 goto out; 2538 2539 lock_page(page); 2540 /* 2541 * If the page is no longer dirty, or its mapping no 2542 * longer corresponds to inode we are writing (which 2543 * means it has been truncated or invalidated), or the 2544 * page is already under writeback and we are not doing 2545 * a data integrity writeback, skip the page 2546 */ 2547 if (!PageDirty(page) || 2548 (PageWriteback(page) && 2549 (mpd->wbc->sync_mode == WB_SYNC_NONE)) || 2550 unlikely(page->mapping != mapping)) { 2551 unlock_page(page); 2552 continue; 2553 } 2554 2555 wait_on_page_writeback(page); 2556 BUG_ON(PageWriteback(page)); 2557 2558 if (mpd->map.m_len == 0) 2559 mpd->first_page = page->index; 2560 mpd->next_page = page->index + 1; 2561 /* Add all dirty buffers to mpd */ 2562 lblk = ((ext4_lblk_t)page->index) << 2563 (PAGE_SHIFT - blkbits); 2564 head = page_buffers(page); 2565 err = mpage_process_page_bufs(mpd, head, head, lblk); /* 0: the extent cannot be extended any further, < 0: IO submission failed */ 2566 if (err <= 0) 2567 goto out; 2568 err = 0; 2569 left--; 2570 } 2571 pagevec_release(&pvec); 2572 cond_resched(); 2573 } 2574 return 0; 2575 out: 2576 pagevec_release(&pvec); 2577 return err; 2578 } 2579 /* __writepage - write_cache_pages() callback used by ext4_writepages() for data-journalled inodes: write @page with ext4_writepage() and record the result on the mapping passed in @data via mapping_set_error() */ 2580 static int __writepage(struct page *page, struct writeback_control *wbc, 2581 void *data) 2582 { 2583 struct address_space *mapping = data; 2584 int ret = ext4_writepage(page, wbc); 2585 mapping_set_error(mapping, ret); 2586 return ret; 2587 } 2588 /* ext4_writepages - write back dirty pages of @mapping as directed by @wbc. DAX mappings are handed to dax_writeback_mapping_range() and data-journalled inodes go page by page through write_cache_pages(). Otherwise each loop iteration allocates an io_end, starts a journal handle, collects an extent with mpage_prepare_extent_to_map() and maps and submits it with mpage_map_and_submit_extent(). Returns 0 or a negative error. */ 2589 static int ext4_writepages(struct address_space *mapping, 2590 struct writeback_control *wbc) 2591 { 2592 pgoff_t writeback_index = 0; 2593 long nr_to_write = wbc->nr_to_write; 2594 int range_whole = 0; 2595 int cycled = 1; 2596 handle_t *handle = NULL; 2597 struct mpage_da_data mpd; 2598 struct inode *inode = mapping->host; 2599 int needed_blocks, rsv_blocks = 0, ret = 0; 2600 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2601 bool done; 2602 struct blk_plug plug; 2603 bool give_up_on_write = false; 2604 2605 
trace_ext4_writepages(inode, wbc); 2606 2607 if (dax_mapping(mapping)) 2608 return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, 2609 wbc); 2610 2611 /* 2612 * No pages to write? This is mainly a kludge to avoid starting 2613 * a transaction for special inodes like journal inode on last iput() 2614 * because that could violate lock ordering on umount 2615 */ 2616 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2617 goto out_writepages; 2618 2619 if (ext4_should_journal_data(inode)) { /* NOTE(review): this plug shadows the function-scope plug declared above */ 2620 struct blk_plug plug; 2621 2622 blk_start_plug(&plug); 2623 ret = write_cache_pages(mapping, wbc, __writepage, mapping); 2624 blk_finish_plug(&plug); 2625 goto out_writepages; 2626 } 2627 2628 /* 2629 * If the filesystem has aborted, it is read-only, so return 2630 * right away instead of dumping stack traces later on that 2631 * will obscure the real source of the problem. We test 2632 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because 2633 * the latter could be true if the filesystem is mounted 2634 * read-only, and in that case, ext4_writepages should 2635 * *never* be called, so if that ever happens, we would want 2636 * the stack trace. 2637 */ 2638 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { 2639 ret = -EROFS; 2640 goto out_writepages; 2641 } 2642 2643 if (ext4_should_dioread_nolock(inode)) { 2644 /* 2645 * We may need to convert up to one extent per block in 2646 * the page and we may dirty the inode. 2647 */ 2648 rsv_blocks = 1 + (PAGE_SIZE >> inode->i_blkbits); 2649 } 2650 2651 /* 2652 * If we have inline data and arrive here, it means that 2653 * we will soon create the block for the 1st page, so 2654 * we'd better clear the inline data here. 2655 */ 2656 if (ext4_has_inline_data(inode)) { 2657 /* Just inode will be modified... 
*/ 2658 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 2659 if (IS_ERR(handle)) { 2660 ret = PTR_ERR(handle); 2661 goto out_writepages; 2662 } 2663 BUG_ON(ext4_test_inode_state(inode, 2664 EXT4_STATE_MAY_INLINE_DATA)); 2665 ext4_destroy_inline_data(handle, inode); 2666 ext4_journal_stop(handle); 2667 } 2668 2669 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2670 range_whole = 1; 2671 /* Cyclic writeback starts at mapping->writeback_index; if that is non-zero, cycled = 0 requests a second pass over pages 0 .. writeback_index - 1 (see the retry jump below) */ 2672 if (wbc->range_cyclic) { 2673 writeback_index = mapping->writeback_index; 2674 if (writeback_index) 2675 cycled = 0; 2676 mpd.first_page = writeback_index; 2677 mpd.last_page = -1; 2678 } else { 2679 mpd.first_page = wbc->range_start >> PAGE_SHIFT; 2680 mpd.last_page = wbc->range_end >> PAGE_SHIFT; 2681 } 2682 2683 mpd.inode = inode; 2684 mpd.wbc = wbc; 2685 ext4_io_submit_init(&mpd.io_submit, wbc); 2686 retry: 2687 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2688 tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); 2689 done = false; 2690 blk_start_plug(&plug); 2691 while (!done && mpd.first_page <= mpd.last_page) { 2692 /* For each extent of pages we use new io_end */ 2693 mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); 2694 if (!mpd.io_submit.io_end) { 2695 ret = -ENOMEM; 2696 break; 2697 } 2698 2699 /* 2700 * We have two constraints: We find one extent to map and we 2701 * must always write out whole page (makes a difference when 2702 * blocksize < pagesize) so that we don't block on IO when we 2703 * try to write out the rest of the page. Journalled mode is 2704 * not supported by delalloc. 
2705 */ 2706 BUG_ON(ext4_should_journal_data(inode)); 2707 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2708 2709 /* start a new transaction */ 2710 handle = ext4_journal_start_with_reserve(inode, 2711 EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); 2712 if (IS_ERR(handle)) { 2713 ret = PTR_ERR(handle); 2714 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2715 "%ld pages, ino %lu; err %d", __func__, 2716 wbc->nr_to_write, inode->i_ino, ret); 2717 /* Release allocated io_end */ 2718 ext4_put_io_end(mpd.io_submit.io_end); 2719 break; 2720 } 2721 2722 trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); 2723 ret = mpage_prepare_extent_to_map(&mpd); 2724 if (!ret) { 2725 if (mpd.map.m_len) 2726 ret = mpage_map_and_submit_extent(handle, &mpd, 2727 &give_up_on_write); 2728 else { 2729 /* 2730 * We scanned the whole range (or exhausted 2731 * nr_to_write), submitted what was mapped and 2732 * didn't find anything needing mapping. We are 2733 * done. 2734 */ 2735 done = true; 2736 } 2737 } 2738 ext4_journal_stop(handle); 2739 /* Submit prepared bio */ 2740 ext4_io_submit(&mpd.io_submit); 2741 /* Unlock pages we didn't use */ 2742 mpage_release_unused_pages(&mpd, give_up_on_write); 2743 /* Drop our io_end reference we got from init */ 2744 ext4_put_io_end(mpd.io_submit.io_end); 2745 2746 if (ret == -ENOSPC && sbi->s_journal) { 2747 /* 2748 * Commit the transaction which would 2749 * free blocks released in the transaction 2750 * and try again 2751 */ 2752 jbd2_journal_force_commit_nested(sbi->s_journal); 2753 ret = 0; 2754 continue; 2755 } 2756 /* Fatal error - ENOMEM, EIO... 
*/ 2757 if (ret) 2758 break; 2759 } 2760 blk_finish_plug(&plug); 2761 if (!ret && !cycled && wbc->nr_to_write > 0) { 2762 cycled = 1; 2763 mpd.last_page = writeback_index - 1; 2764 mpd.first_page = 0; 2765 goto retry; 2766 } 2767 2768 /* Update index */ 2769 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2770 /* 2771 * Set the writeback_index so that range_cyclic 2772 * mode will write it back later 2773 */ 2774 mapping->writeback_index = mpd.first_page; 2775 2776 out_writepages: 2777 trace_ext4_writepages_result(inode, wbc, ret, 2778 nr_to_write - wbc->nr_to_write); 2779 return ret; 2780 } 2781 /* ext4_nonda_switch - decide whether a write should bypass delayed allocation. Returns 1 when free clusters are low relative to dirty clusters, in which case ext4_da_write_begin() falls back to ext4_write_begin(); returns 0 otherwise. May also kick off writeback of the superblock's inodes. */ 2782 static int ext4_nonda_switch(struct super_block *sb) 2783 { 2784 s64 free_clusters, dirty_clusters; 2785 struct ext4_sb_info *sbi = EXT4_SB(sb); 2786 2787 /* 2788 * switch to non delalloc mode if we are running low 2789 * on free blocks. The free block accounting via percpu 2790 * counters can get slightly wrong with percpu_counter_batch getting 2791 * accumulated on each CPU without updating global counters. 2792 * Delalloc needs accurate free block accounting. So switch 2793 * to non delalloc when we are near to error range. 2794 */ 2795 free_clusters = 2796 percpu_counter_read_positive(&sbi->s_freeclusters_counter); 2797 dirty_clusters = 2798 percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); 2799 /* 2800 * Start pushing delalloc when 1/2 of free blocks are dirty. 
2801 */ 2802 if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) 2803 try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); 2804 2805 if (2 * free_clusters < 3 * dirty_clusters || 2806 free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { 2807 /* 2808 * free block count is less than 150% of dirty blocks 2809 * or free blocks is less than watermark 2810 */ 2811 return 1; 2812 } 2813 return 0; 2814 } 2815 2816 /* We always reserve for an inode update; the superblock could be there too. Returns 1, or 2 when the write may end beyond 0x7fffffff on a filesystem without the large_file feature. */ 2817 static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) 2818 { 2819 if (likely(ext4_has_feature_large_file(inode->i_sb))) 2820 return 1; 2821 2822 if (pos + len <= 0x7fffffffULL) 2823 return 1; 2824 2825 /* We might need to update the superblock to set LARGE_FILE */ 2826 return 2; 2827 } 2828 /* ext4_da_write_begin - write_begin for delayed allocation. Falls back to ext4_write_begin() (marking *fsdata with FALL_BACK_TO_NONDELALLOC) when ext4_nonda_switch() says so, tries the inline-data path when the inode may hold inline data, and otherwise grabs the page, starts a handle and prepares the buffers with ext4_da_get_block_prep. On success the locked page is returned in *pagep. */ 2829 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2830 loff_t pos, unsigned len, unsigned flags, 2831 struct page **pagep, void **fsdata) 2832 { 2833 int ret, retries = 0; 2834 struct page *page; 2835 pgoff_t index; 2836 struct inode *inode = mapping->host; 2837 handle_t *handle; 2838 2839 index = pos >> PAGE_SHIFT; 2840 2841 if (ext4_nonda_switch(inode->i_sb)) { 2842 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 2843 return ext4_write_begin(file, mapping, pos, 2844 len, flags, pagep, fsdata); 2845 } 2846 *fsdata = (void *)0; 2847 trace_ext4_da_write_begin(inode, pos, len, flags); 2848 2849 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { 2850 ret = ext4_da_write_inline_data_begin(mapping, inode, 2851 pos, len, flags, 2852 pagep, fsdata); 2853 if (ret < 0) 2854 return ret; 2855 if (ret == 1) 2856 return 0; 2857 } 2858 2859 /* 2860 * grab_cache_page_write_begin() can take a long time if the 2861 * system is thrashing due to memory pressure, or if the page 2862 * is being written back. So grab it first before we start 2863 * the transaction handle. 
This also allows us to allocate 2864 * the page (if needed) without using GFP_NOFS. 2865 */ 2866 retry_grab: 2867 page = grab_cache_page_write_begin(mapping, index, flags); 2868 if (!page) 2869 return -ENOMEM; 2870 unlock_page(page); 2871 2872 /* 2873 * With delayed allocation, we don't log the i_disksize update 2874 * if there is delayed block allocation. But we still need 2875 * to journal the i_disksize update if we write to the end 2876 * of a file which has an already mapped buffer. 2877 */ 2878 retry_journal: 2879 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 2880 ext4_da_write_credits(inode, pos, len)); 2881 if (IS_ERR(handle)) { 2882 put_page(page); 2883 return PTR_ERR(handle); 2884 } 2885 2886 lock_page(page); 2887 if (page->mapping != mapping) { 2888 /* The page got truncated from under us */ 2889 unlock_page(page); 2890 put_page(page); 2891 ext4_journal_stop(handle); 2892 goto retry_grab; 2893 } 2894 /* In case writeback began while the page was unlocked */ 2895 wait_for_stable_page(page); 2896 2897 #ifdef CONFIG_EXT4_FS_ENCRYPTION 2898 ret = ext4_block_write_begin(page, pos, len, 2899 ext4_da_get_block_prep); 2900 #else 2901 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); 2902 #endif 2903 if (ret < 0) { 2904 unlock_page(page); 2905 ext4_journal_stop(handle); 2906 /* 2907 * block_write_begin may have instantiated a few blocks 2908 * outside i_size. Trim these off again. Don't need 2909 * i_size_read because we hold i_mutex. 
2910 */ 2911 if (pos + len > inode->i_size) 2912 ext4_truncate_failed_write(inode); 2913 2914 if (ret == -ENOSPC && 2915 ext4_should_retry_alloc(inode->i_sb, &retries)) 2916 goto retry_journal; 2917 2918 put_page(page); 2919 return ret; 2920 } 2921 2922 *pagep = page; 2923 return ret; 2924 } 2925 2926 /* 2927 * Check whether i_disksize should be updated for a write that reaches 2928 * the end of the file but does not require block allocation. Returns 1 if the buffer covering @offset in @page is mapped and neither delayed nor unwritten, 0 otherwise. 2929 */ 2930 static int ext4_da_should_update_i_disksize(struct page *page, 2931 unsigned long offset) 2932 { 2933 struct buffer_head *bh; 2934 struct inode *inode = page->mapping->host; 2935 unsigned int idx; 2936 int i; 2937 2938 bh = page_buffers(page); 2939 idx = offset >> inode->i_blkbits; 2940 2941 for (i = 0; i < idx; i++) 2942 bh = bh->b_this_page; 2943 2944 if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) 2945 return 0; 2946 return 1; 2947 } 2948 /* ext4_da_write_end - write_end counterpart of ext4_da_write_begin(). @fsdata carries the write mode chosen by write_begin; FALL_BACK_TO_NONDELALLOC is handed to ext4_write_end(). Otherwise i_disksize is updated when the write ends beyond it and needs no allocation, the data is committed through the inline-data or generic write_end path, and the handle is stopped. Returns the number of bytes copied or a negative error. */ 2949 static int ext4_da_write_end(struct file *file, 2950 struct address_space *mapping, 2951 loff_t pos, unsigned len, unsigned copied, 2952 struct page *page, void *fsdata) 2953 { 2954 struct inode *inode = mapping->host; 2955 int ret = 0, ret2; 2956 handle_t *handle = ext4_journal_current_handle(); 2957 loff_t new_i_size; 2958 unsigned long start, end; 2959 int write_mode = (int)(unsigned long)fsdata; 2960 2961 if (write_mode == FALL_BACK_TO_NONDELALLOC) 2962 return ext4_write_end(file, mapping, pos, 2963 len, copied, page, fsdata); 2964 2965 trace_ext4_da_write_end(inode, pos, len, copied); 2966 start = pos & (PAGE_SIZE - 1); /* end may wrap when copied is 0; it is only used below when copied is non-zero */ 2967 end = start + copied - 1; 2968 2969 /* 2970 * generic_write_end() will run mark_inode_dirty() if i_size 2971 * changes. So let's piggyback the i_disksize mark_inode_dirty 2972 * into that. 
2973 */ 2974 new_i_size = pos + copied; 2975 if (copied && new_i_size > EXT4_I(inode)->i_disksize) { 2976 if (ext4_has_inline_data(inode) || 2977 ext4_da_should_update_i_disksize(page, end)) { 2978 ext4_update_i_disksize(inode, new_i_size); 2979 /* We need to mark inode dirty even if 2980 * new_i_size is less than inode->i_size 2981 * but greater than i_disksize. (hint delalloc) 2982 */ 2983 ext4_mark_inode_dirty(handle, inode); 2984 } 2985 } 2986 2987 if (write_mode != CONVERT_INLINE_DATA && 2988 ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && 2989 ext4_has_inline_data(inode)) 2990 ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, 2991 page); 2992 else 2993 ret2 = generic_write_end(file, mapping, pos, len, copied, 2994 page, fsdata); 2995 2996 copied = ret2; 2997 if (ret2 < 0) 2998 ret = ret2; 2999 ret2 = ext4_journal_stop(handle); 3000 if (!ret) 3001 ret = ret2; 3002 3003 return ret ? ret : copied; 3004 } 3005 3006 static void ext4_da_invalidatepage(struct page *page, unsigned int offset, 3007 unsigned int length) 3008 { 3009 /* 3010 * Drop reserved blocks 3011 */ 3012 BUG_ON(!PageLocked(page)); 3013 if (!page_has_buffers(page)) 3014 goto out; 3015 3016 ext4_da_page_release_reservation(page, offset, length); 3017 3018 out: 3019 ext4_invalidatepage(page, offset, length); 3020 3021 return; 3022 } 3023 3024 /* 3025 * Force all delayed allocation blocks to be allocated for a given inode. 3026 */ 3027 int ext4_alloc_da_blocks(struct inode *inode) 3028 { 3029 trace_ext4_alloc_da_blocks(inode); 3030 3031 if (!EXT4_I(inode)->i_reserved_data_blocks) 3032 return 0; 3033 3034 /* 3035 * We do something simple for now. The filemap_flush() will 3036 * also start triggering a write of the data blocks, which is 3037 * not strictly speaking necessary (and for users of 3038 * laptop_mode, not even desirable). 
However, to do otherwise 3039 * would require replicating code paths in: 3040 * 3041 * ext4_writepages() -> 3042 * write_cache_pages() ---> (via passed in callback function) 3043 * __mpage_da_writepage() --> 3044 * mpage_add_bh_to_extent() 3045 * mpage_da_map_blocks() 3046 * 3047 * The problem is that write_cache_pages(), located in 3048 * mm/page-writeback.c, marks pages clean in preparation for 3049 * doing I/O, which is not desirable if we're not planning on 3050 * doing I/O at all. 3051 * 3052 * We could call write_cache_pages(), and then redirty all of 3053 * the pages by calling redirty_page_for_writepage() but that 3054 * would be ugly in the extreme. So instead we would need to 3055 * replicate parts of the code in the above functions, 3056 * simplifying them because we wouldn't actually intend to 3057 * write out the pages, but rather only collect contiguous 3058 * logical block extents, call the multi-block allocator, and 3059 * then update the buffer heads with the block allocations. 3060 * 3061 * For now, though, we'll cheat by calling filemap_flush(), 3062 * which will map the blocks, and start the I/O, but not 3063 * actually wait for the I/O to complete. 3064 */ 3065 return filemap_flush(inode->i_mapping); 3066 } 3067 3068 /* 3069 * bmap() is special. It gets used by applications such as lilo and by 3070 * the swapper to find the on-disk block of a specific piece of data. 3071 * 3072 * Naturally, this is dangerous if the block concerned is still in the 3073 * journal. If somebody makes a swapfile on an ext4 data-journaling 3074 * filesystem and enables swap, then they may get a nasty shock when the 3075 * data getting swapped to that swapfile suddenly gets overwritten by 3076 * the original zero's written out previously to the journal and 3077 * awaiting writeback in the kernel's buffer cache. 3078 * 3079 * So, if we see any bmap calls here on a modified, data-journaled file, 3080 * take extra steps to flush any blocks which might be in the cache. 
3081 */ 3082 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 3083 { 3084 struct inode *inode = mapping->host; 3085 journal_t *journal; 3086 int err; 3087 3088 /* 3089 * We can get here for an inline file via the FIBMAP ioctl 3090 */ 3091 if (ext4_has_inline_data(inode)) 3092 return 0; 3093 3094 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 3095 test_opt(inode->i_sb, DELALLOC)) { 3096 /* 3097 * With delalloc we want to sync the file 3098 * so that we can make sure we allocate 3099 * blocks for file 3100 */ 3101 filemap_write_and_wait(mapping); 3102 } 3103 3104 if (EXT4_JOURNAL(inode) && 3105 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { 3106 /* 3107 * This is a REALLY heavyweight approach, but the use of 3108 * bmap on dirty files is expected to be extremely rare: 3109 * only if we run lilo or swapon on a freshly made file 3110 * do we expect this to happen. 3111 * 3112 * (bmap requires CAP_SYS_RAWIO so this does not 3113 * represent an unprivileged user DOS attack --- we'd be 3114 * in trouble if mortal users could trigger this path at 3115 * will.) 3116 * 3117 * NB. EXT4_STATE_JDATA is not set on files other than 3118 * regular files. If somebody wants to bmap a directory 3119 * or symlink and gets confused because the buffer 3120 * hasn't yet been flushed to disk, they deserve 3121 * everything they get. 
3122 */ 3123 3124 ext4_clear_inode_state(inode, EXT4_STATE_JDATA); 3125 journal = EXT4_JOURNAL(inode); 3126 jbd2_journal_lock_updates(journal); 3127 err = jbd2_journal_flush(journal); 3128 jbd2_journal_unlock_updates(journal); 3129 3130 if (err) 3131 return 0; 3132 } 3133 3134 return generic_block_bmap(mapping, block, ext4_get_block); 3135 } 3136 3137 static int ext4_readpage(struct file *file, struct page *page) 3138 { 3139 int ret = -EAGAIN; 3140 struct inode *inode = page->mapping->host; 3141 3142 trace_ext4_readpage(page); 3143 3144 if (ext4_has_inline_data(inode)) 3145 ret = ext4_readpage_inline(inode, page); 3146 3147 if (ret == -EAGAIN) 3148 return ext4_mpage_readpages(page->mapping, NULL, page, 1); 3149 3150 return ret; 3151 } 3152 3153 static int 3154 ext4_readpages(struct file *file, struct address_space *mapping, 3155 struct list_head *pages, unsigned nr_pages) 3156 { 3157 struct inode *inode = mapping->host; 3158 3159 /* If the file has inline data, no need to do readpages. */ 3160 if (ext4_has_inline_data(inode)) 3161 return 0; 3162 3163 return ext4_mpage_readpages(mapping, pages, NULL, nr_pages); 3164 } 3165 3166 static void ext4_invalidatepage(struct page *page, unsigned int offset, 3167 unsigned int length) 3168 { 3169 trace_ext4_invalidatepage(page, offset, length); 3170 3171 /* No journalling happens on data buffers when this function is used */ 3172 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); 3173 3174 block_invalidatepage(page, offset, length); 3175 } 3176 3177 static int __ext4_journalled_invalidatepage(struct page *page, 3178 unsigned int offset, 3179 unsigned int length) 3180 { 3181 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3182 3183 trace_ext4_journalled_invalidatepage(page, offset, length); 3184 3185 /* 3186 * If it's a full truncate we just forget about the pending dirtying 3187 */ 3188 if (offset == 0 && length == PAGE_SIZE) 3189 ClearPageChecked(page); 3190 3191 return 
jbd2_journal_invalidatepage(journal, page, offset, length); 3192 } 3193 3194 /* Wrapper for aops... */ 3195 static void ext4_journalled_invalidatepage(struct page *page, 3196 unsigned int offset, 3197 unsigned int length) 3198 { 3199 WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); 3200 } 3201 3202 static int ext4_releasepage(struct page *page, gfp_t wait) 3203 { 3204 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3205 3206 trace_ext4_releasepage(page); 3207 3208 /* Page has dirty journalled data -> cannot release */ 3209 if (PageChecked(page)) 3210 return 0; 3211 if (journal) 3212 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3213 else 3214 return try_to_free_buffers(page); 3215 } 3216 3217 #ifdef CONFIG_FS_DAX 3218 int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 3219 struct buffer_head *bh_result, int create) 3220 { 3221 int ret, err; 3222 int credits; 3223 struct ext4_map_blocks map; 3224 handle_t *handle = NULL; 3225 int flags = 0; 3226 3227 ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", 3228 inode->i_ino, create); 3229 map.m_lblk = iblock; 3230 map.m_len = bh_result->b_size >> inode->i_blkbits; 3231 credits = ext4_chunk_trans_blocks(inode, map.m_len); 3232 if (create) { 3233 flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO; 3234 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); 3235 if (IS_ERR(handle)) { 3236 ret = PTR_ERR(handle); 3237 return ret; 3238 } 3239 } 3240 3241 ret = ext4_map_blocks(handle, inode, &map, flags); 3242 if (create) { 3243 err = ext4_journal_stop(handle); 3244 if (ret >= 0 && err < 0) 3245 ret = err; 3246 } 3247 if (ret <= 0) 3248 goto out; 3249 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 3250 int err2; 3251 3252 /* 3253 * We are protected by i_mmap_sem so we know block cannot go 3254 * away from under us even though we dropped i_data_sem. 3255 * Convert extent to written and write zeros there. 
3256 * 3257 * Note: We may get here even when create == 0. 3258 */ 3259 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); 3260 if (IS_ERR(handle)) { 3261 ret = PTR_ERR(handle); 3262 goto out; 3263 } 3264 3265 err = ext4_map_blocks(handle, inode, &map, 3266 EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO); 3267 if (err < 0) 3268 ret = err; 3269 err2 = ext4_journal_stop(handle); 3270 if (err2 < 0 && ret > 0) 3271 ret = err2; 3272 } 3273 out: 3274 WARN_ON_ONCE(ret == 0 && create); 3275 if (ret > 0) { 3276 map_bh(bh_result, inode->i_sb, map.m_pblk); 3277 /* 3278 * At least for now we have to clear BH_New so that DAX code 3279 * doesn't attempt to zero blocks again in a racy way. 3280 */ 3281 map.m_flags &= ~EXT4_MAP_NEW; 3282 ext4_update_bh_state(bh_result, map.m_flags); 3283 bh_result->b_size = map.m_len << inode->i_blkbits; 3284 ret = 0; 3285 } 3286 return ret; 3287 } 3288 #endif 3289 3290 static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3291 ssize_t size, void *private) 3292 { 3293 ext4_io_end_t *io_end = private; 3294 3295 /* if not async direct IO just return */ 3296 if (!io_end) 3297 return 0; 3298 3299 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3300 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", 3301 io_end, io_end->inode->i_ino, iocb, offset, size); 3302 3303 /* 3304 * Error during AIO DIO. We cannot convert unwritten extents as the 3305 * data was not written. Just clear the unwritten flag and drop io_end. 3306 */ 3307 if (size <= 0) { 3308 ext4_clear_io_unwritten_flag(io_end); 3309 size = 0; 3310 } 3311 io_end->offset = offset; 3312 io_end->size = size; 3313 ext4_put_io_end(io_end); 3314 3315 return 0; 3316 } 3317 3318 /* 3319 * For ext4 extent files, ext4 will do direct-io write to holes, 3320 * preallocated extents, and those write extend the file, no need to 3321 * fall back to buffered IO. 
3322 * 3323 * For holes, we fallocate those blocks, mark them as unwritten 3324 * If those blocks were preallocated, we mark sure they are split, but 3325 * still keep the range to write as unwritten. 3326 * 3327 * The unwritten extents will be converted to written when DIO is completed. 3328 * For async direct IO, since the IO may still pending when return, we 3329 * set up an end_io call back function, which will do the conversion 3330 * when async direct IO completed. 3331 * 3332 * If the O_DIRECT write will extend the file then add this inode to the 3333 * orphan list. So recovery will truncate it back to the original size 3334 * if the machine crashes during the write. 3335 * 3336 */ 3337 static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, 3338 loff_t offset) 3339 { 3340 struct file *file = iocb->ki_filp; 3341 struct inode *inode = file->f_mapping->host; 3342 ssize_t ret; 3343 size_t count = iov_iter_count(iter); 3344 int overwrite = 0; 3345 get_block_t *get_block_func = NULL; 3346 int dio_flags = 0; 3347 loff_t final_size = offset + count; 3348 3349 /* Use the old path for reads and writes beyond i_size. */ 3350 if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) 3351 return ext4_ind_direct_IO(iocb, iter, offset); 3352 3353 BUG_ON(iocb->private == NULL); 3354 3355 /* 3356 * Make all waiters for direct IO properly wait also for extent 3357 * conversion. This also disallows race between truncate() and 3358 * overwrite DIO as i_dio_count needs to be incremented under i_mutex. 3359 */ 3360 if (iov_iter_rw(iter) == WRITE) 3361 inode_dio_begin(inode); 3362 3363 /* If we do a overwrite dio, i_mutex locking can be released */ 3364 overwrite = *((int *)iocb->private); 3365 3366 if (overwrite) 3367 inode_unlock(inode); 3368 3369 /* 3370 * We could direct write to holes and fallocate. 
3371 * 3372 * Allocated blocks to fill the hole are marked as unwritten to prevent 3373 * parallel buffered read to expose the stale data before DIO complete 3374 * the data IO. 3375 * 3376 * As to previously fallocated extents, ext4 get_block will just simply 3377 * mark the buffer mapped but still keep the extents unwritten. 3378 * 3379 * For non AIO case, we will convert those unwritten extents to written 3380 * after return back from blockdev_direct_IO. That way we save us from 3381 * allocating io_end structure and also the overhead of offloading 3382 * the extent convertion to a workqueue. 3383 * 3384 * For async DIO, the conversion needs to be deferred when the 3385 * IO is completed. The ext4 end_io callback function will be 3386 * called to take care of the conversion work. Here for async 3387 * case, we allocate an io_end structure to hook to the iocb. 3388 */ 3389 iocb->private = NULL; 3390 if (overwrite) 3391 get_block_func = ext4_dio_get_block_overwrite; 3392 else if (is_sync_kiocb(iocb)) { 3393 get_block_func = ext4_dio_get_block_unwritten_sync; 3394 dio_flags = DIO_LOCKING; 3395 } else { 3396 get_block_func = ext4_dio_get_block_unwritten_async; 3397 dio_flags = DIO_LOCKING; 3398 } 3399 #ifdef CONFIG_EXT4_FS_ENCRYPTION 3400 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); 3401 #endif 3402 if (IS_DAX(inode)) 3403 ret = dax_do_io(iocb, inode, iter, offset, get_block_func, 3404 ext4_end_io_dio, dio_flags); 3405 else 3406 ret = __blockdev_direct_IO(iocb, inode, 3407 inode->i_sb->s_bdev, iter, offset, 3408 get_block_func, 3409 ext4_end_io_dio, NULL, dio_flags); 3410 3411 if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3412 EXT4_STATE_DIO_UNWRITTEN)) { 3413 int err; 3414 /* 3415 * for non AIO case, since the IO is already 3416 * completed, we could do the conversion right here 3417 */ 3418 err = ext4_convert_unwritten_extents(NULL, inode, 3419 offset, ret); 3420 if (err < 0) 3421 ret = err; 3422 ext4_clear_inode_state(inode, 
EXT4_STATE_DIO_UNWRITTEN); 3423 } 3424 3425 if (iov_iter_rw(iter) == WRITE) 3426 inode_dio_end(inode); 3427 /* take i_mutex locking again if we do a ovewrite dio */ 3428 if (overwrite) 3429 inode_lock(inode); 3430 3431 return ret; 3432 } 3433 3434 static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, 3435 loff_t offset) 3436 { 3437 struct file *file = iocb->ki_filp; 3438 struct inode *inode = file->f_mapping->host; 3439 size_t count = iov_iter_count(iter); 3440 ssize_t ret; 3441 3442 #ifdef CONFIG_EXT4_FS_ENCRYPTION 3443 if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) 3444 return 0; 3445 #endif 3446 3447 /* 3448 * If we are doing data journalling we don't support O_DIRECT 3449 */ 3450 if (ext4_should_journal_data(inode)) 3451 return 0; 3452 3453 /* Let buffer I/O handle the inline data case. */ 3454 if (ext4_has_inline_data(inode)) 3455 return 0; 3456 3457 trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); 3458 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3459 ret = ext4_ext_direct_IO(iocb, iter, offset); 3460 else 3461 ret = ext4_ind_direct_IO(iocb, iter, offset); 3462 trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); 3463 return ret; 3464 } 3465 3466 /* 3467 * Pages can be marked dirty completely asynchronously from ext4's journalling 3468 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3469 * much here because ->set_page_dirty is called under VFS locks. The page is 3470 * not necessarily locked. 3471 * 3472 * We cannot just dirty the page and leave attached buffers clean, because the 3473 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 3474 * or jbddirty because all the journalling code will explode. 3475 * 3476 * So what we do is to mark the page "pending dirty" and next time writepage 3477 * is called, propagate that into the buffers appropriately. 
3478 */ 3479 static int ext4_journalled_set_page_dirty(struct page *page) 3480 { 3481 SetPageChecked(page); 3482 return __set_page_dirty_nobuffers(page); 3483 } 3484 3485 static const struct address_space_operations ext4_aops = { 3486 .readpage = ext4_readpage, 3487 .readpages = ext4_readpages, 3488 .writepage = ext4_writepage, 3489 .writepages = ext4_writepages, 3490 .write_begin = ext4_write_begin, 3491 .write_end = ext4_write_end, 3492 .bmap = ext4_bmap, 3493 .invalidatepage = ext4_invalidatepage, 3494 .releasepage = ext4_releasepage, 3495 .direct_IO = ext4_direct_IO, 3496 .migratepage = buffer_migrate_page, 3497 .is_partially_uptodate = block_is_partially_uptodate, 3498 .error_remove_page = generic_error_remove_page, 3499 }; 3500 3501 static const struct address_space_operations ext4_journalled_aops = { 3502 .readpage = ext4_readpage, 3503 .readpages = ext4_readpages, 3504 .writepage = ext4_writepage, 3505 .writepages = ext4_writepages, 3506 .write_begin = ext4_write_begin, 3507 .write_end = ext4_journalled_write_end, 3508 .set_page_dirty = ext4_journalled_set_page_dirty, 3509 .bmap = ext4_bmap, 3510 .invalidatepage = ext4_journalled_invalidatepage, 3511 .releasepage = ext4_releasepage, 3512 .direct_IO = ext4_direct_IO, 3513 .is_partially_uptodate = block_is_partially_uptodate, 3514 .error_remove_page = generic_error_remove_page, 3515 }; 3516 3517 static const struct address_space_operations ext4_da_aops = { 3518 .readpage = ext4_readpage, 3519 .readpages = ext4_readpages, 3520 .writepage = ext4_writepage, 3521 .writepages = ext4_writepages, 3522 .write_begin = ext4_da_write_begin, 3523 .write_end = ext4_da_write_end, 3524 .bmap = ext4_bmap, 3525 .invalidatepage = ext4_da_invalidatepage, 3526 .releasepage = ext4_releasepage, 3527 .direct_IO = ext4_direct_IO, 3528 .migratepage = buffer_migrate_page, 3529 .is_partially_uptodate = block_is_partially_uptodate, 3530 .error_remove_page = generic_error_remove_page, 3531 }; 3532 3533 void ext4_set_aops(struct inode 
*inode) 3534 { 3535 switch (ext4_inode_journal_mode(inode)) { 3536 case EXT4_INODE_ORDERED_DATA_MODE: 3537 ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); 3538 break; 3539 case EXT4_INODE_WRITEBACK_DATA_MODE: 3540 ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); 3541 break; 3542 case EXT4_INODE_JOURNAL_DATA_MODE: 3543 inode->i_mapping->a_ops = &ext4_journalled_aops; 3544 return; 3545 default: 3546 BUG(); 3547 } 3548 if (test_opt(inode->i_sb, DELALLOC)) 3549 inode->i_mapping->a_ops = &ext4_da_aops; 3550 else 3551 inode->i_mapping->a_ops = &ext4_aops; 3552 } 3553 3554 static int __ext4_block_zero_page_range(handle_t *handle, 3555 struct address_space *mapping, loff_t from, loff_t length) 3556 { 3557 ext4_fsblk_t index = from >> PAGE_SHIFT; 3558 unsigned offset = from & (PAGE_SIZE-1); 3559 unsigned blocksize, pos; 3560 ext4_lblk_t iblock; 3561 struct inode *inode = mapping->host; 3562 struct buffer_head *bh; 3563 struct page *page; 3564 int err = 0; 3565 3566 page = find_or_create_page(mapping, from >> PAGE_SHIFT, 3567 mapping_gfp_constraint(mapping, ~__GFP_FS)); 3568 if (!page) 3569 return -ENOMEM; 3570 3571 blocksize = inode->i_sb->s_blocksize; 3572 3573 iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); 3574 3575 if (!page_has_buffers(page)) 3576 create_empty_buffers(page, blocksize, 0); 3577 3578 /* Find the buffer that contains "offset" */ 3579 bh = page_buffers(page); 3580 pos = blocksize; 3581 while (offset >= pos) { 3582 bh = bh->b_this_page; 3583 iblock++; 3584 pos += blocksize; 3585 } 3586 if (buffer_freed(bh)) { 3587 BUFFER_TRACE(bh, "freed: skip"); 3588 goto unlock; 3589 } 3590 if (!buffer_mapped(bh)) { 3591 BUFFER_TRACE(bh, "unmapped"); 3592 ext4_get_block(inode, iblock, bh, 0); 3593 /* unmapped? It's a hole - nothing to do */ 3594 if (!buffer_mapped(bh)) { 3595 BUFFER_TRACE(bh, "still unmapped"); 3596 goto unlock; 3597 } 3598 } 3599 3600 /* Ok, it's mapped. 
Make sure it's up-to-date */ 3601 if (PageUptodate(page)) 3602 set_buffer_uptodate(bh); 3603 3604 if (!buffer_uptodate(bh)) { 3605 err = -EIO; 3606 ll_rw_block(READ, 1, &bh); 3607 wait_on_buffer(bh); 3608 /* Uhhuh. Read error. Complain and punt. */ 3609 if (!buffer_uptodate(bh)) 3610 goto unlock; 3611 if (S_ISREG(inode->i_mode) && 3612 ext4_encrypted_inode(inode)) { 3613 /* We expect the key to be set. */ 3614 BUG_ON(!ext4_has_encryption_key(inode)); 3615 BUG_ON(blocksize != PAGE_SIZE); 3616 WARN_ON_ONCE(ext4_decrypt(page)); 3617 } 3618 } 3619 if (ext4_should_journal_data(inode)) { 3620 BUFFER_TRACE(bh, "get write access"); 3621 err = ext4_journal_get_write_access(handle, bh); 3622 if (err) 3623 goto unlock; 3624 } 3625 zero_user(page, offset, length); 3626 BUFFER_TRACE(bh, "zeroed end of block"); 3627 3628 if (ext4_should_journal_data(inode)) { 3629 err = ext4_handle_dirty_metadata(handle, inode, bh); 3630 } else { 3631 err = 0; 3632 mark_buffer_dirty(bh); 3633 if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) 3634 err = ext4_jbd2_file_inode(handle, inode); 3635 } 3636 3637 unlock: 3638 unlock_page(page); 3639 put_page(page); 3640 return err; 3641 } 3642 3643 /* 3644 * ext4_block_zero_page_range() zeros out a mapping of length 'length' 3645 * starting from file offset 'from'. The range to be zero'd must 3646 * be contained with in one block. 
If the specified range exceeds 3647 * the end of the block it will be shortened to end of the block 3648 * that cooresponds to 'from' 3649 */ 3650 static int ext4_block_zero_page_range(handle_t *handle, 3651 struct address_space *mapping, loff_t from, loff_t length) 3652 { 3653 struct inode *inode = mapping->host; 3654 unsigned offset = from & (PAGE_SIZE-1); 3655 unsigned blocksize = inode->i_sb->s_blocksize; 3656 unsigned max = blocksize - (offset & (blocksize - 1)); 3657 3658 /* 3659 * correct length if it does not fall between 3660 * 'from' and the end of the block 3661 */ 3662 if (length > max || length < 0) 3663 length = max; 3664 3665 if (IS_DAX(inode)) 3666 return dax_zero_page_range(inode, from, length, ext4_get_block); 3667 return __ext4_block_zero_page_range(handle, mapping, from, length); 3668 } 3669 3670 /* 3671 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3672 * up to the end of the block which corresponds to `from'. 3673 * This required during truncate. We need to physically zero the tail end 3674 * of that block so it doesn't yield old data if the file is later grown. 
3675 */ 3676 static int ext4_block_truncate_page(handle_t *handle, 3677 struct address_space *mapping, loff_t from) 3678 { 3679 unsigned offset = from & (PAGE_SIZE-1); 3680 unsigned length; 3681 unsigned blocksize; 3682 struct inode *inode = mapping->host; 3683 3684 blocksize = inode->i_sb->s_blocksize; 3685 length = blocksize - (offset & (blocksize - 1)); 3686 3687 return ext4_block_zero_page_range(handle, mapping, from, length); 3688 } 3689 3690 int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 3691 loff_t lstart, loff_t length) 3692 { 3693 struct super_block *sb = inode->i_sb; 3694 struct address_space *mapping = inode->i_mapping; 3695 unsigned partial_start, partial_end; 3696 ext4_fsblk_t start, end; 3697 loff_t byte_end = (lstart + length - 1); 3698 int err = 0; 3699 3700 partial_start = lstart & (sb->s_blocksize - 1); 3701 partial_end = byte_end & (sb->s_blocksize - 1); 3702 3703 start = lstart >> sb->s_blocksize_bits; 3704 end = byte_end >> sb->s_blocksize_bits; 3705 3706 /* Handle partial zero within the single block */ 3707 if (start == end && 3708 (partial_start || (partial_end != sb->s_blocksize - 1))) { 3709 err = ext4_block_zero_page_range(handle, mapping, 3710 lstart, length); 3711 return err; 3712 } 3713 /* Handle partial zero out on the start of the range */ 3714 if (partial_start) { 3715 err = ext4_block_zero_page_range(handle, mapping, 3716 lstart, sb->s_blocksize); 3717 if (err) 3718 return err; 3719 } 3720 /* Handle partial zero out on the end of the range */ 3721 if (partial_end != sb->s_blocksize - 1) 3722 err = ext4_block_zero_page_range(handle, mapping, 3723 byte_end - partial_end, 3724 partial_end + 1); 3725 return err; 3726 } 3727 3728 int ext4_can_truncate(struct inode *inode) 3729 { 3730 if (S_ISREG(inode->i_mode)) 3731 return 1; 3732 if (S_ISDIR(inode->i_mode)) 3733 return 1; 3734 if (S_ISLNK(inode->i_mode)) 3735 return !ext4_inode_is_fast_symlink(inode); 3736 return 0; 3737 } 3738 3739 /* 3740 * We have to make sure 
i_disksize gets properly updated before we truncate 3741 * page cache due to hole punching or zero range. Otherwise i_disksize update 3742 * can get lost as it may have been postponed to submission of writeback but 3743 * that will never happen after we truncate page cache. 3744 */ 3745 int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, 3746 loff_t len) 3747 { 3748 handle_t *handle; 3749 loff_t size = i_size_read(inode); 3750 3751 WARN_ON(!inode_is_locked(inode)); 3752 if (offset > size || offset + len < size) 3753 return 0; 3754 3755 if (EXT4_I(inode)->i_disksize >= size) 3756 return 0; 3757 3758 handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); 3759 if (IS_ERR(handle)) 3760 return PTR_ERR(handle); 3761 ext4_update_i_disksize(inode, size); 3762 ext4_mark_inode_dirty(handle, inode); 3763 ext4_journal_stop(handle); 3764 3765 return 0; 3766 } 3767 3768 /* 3769 * ext4_punch_hole: punches a hole in a file by releaseing the blocks 3770 * associated with the given offset and length 3771 * 3772 * @inode: File inode 3773 * @offset: The offset where the hole will begin 3774 * @len: The length of the hole 3775 * 3776 * Returns: 0 on success or negative on failure 3777 */ 3778 3779 int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) 3780 { 3781 struct super_block *sb = inode->i_sb; 3782 ext4_lblk_t first_block, stop_block; 3783 struct address_space *mapping = inode->i_mapping; 3784 loff_t first_block_offset, last_block_offset; 3785 handle_t *handle; 3786 unsigned int credits; 3787 int ret = 0; 3788 3789 if (!S_ISREG(inode->i_mode)) 3790 return -EOPNOTSUPP; 3791 3792 trace_ext4_punch_hole(inode, offset, length, 0); 3793 3794 /* 3795 * Write out all dirty pages to avoid race conditions 3796 * Then release them. 
3797 */ 3798 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 3799 ret = filemap_write_and_wait_range(mapping, offset, 3800 offset + length - 1); 3801 if (ret) 3802 return ret; 3803 } 3804 3805 inode_lock(inode); 3806 3807 /* No need to punch hole beyond i_size */ 3808 if (offset >= inode->i_size) 3809 goto out_mutex; 3810 3811 /* 3812 * If the hole extends beyond i_size, set the hole 3813 * to end after the page that contains i_size 3814 */ 3815 if (offset + length > inode->i_size) { 3816 length = inode->i_size + 3817 PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - 3818 offset; 3819 } 3820 3821 if (offset & (sb->s_blocksize - 1) || 3822 (offset + length) & (sb->s_blocksize - 1)) { 3823 /* 3824 * Attach jinode to inode for jbd2 if we do any zeroing of 3825 * partial block 3826 */ 3827 ret = ext4_inode_attach_jinode(inode); 3828 if (ret < 0) 3829 goto out_mutex; 3830 3831 } 3832 3833 /* Wait all existing dio workers, newcomers will block on i_mutex */ 3834 ext4_inode_block_unlocked_dio(inode); 3835 inode_dio_wait(inode); 3836 3837 /* 3838 * Prevent page faults from reinstantiating pages we have released from 3839 * page cache. 
3840 */ 3841 down_write(&EXT4_I(inode)->i_mmap_sem); 3842 first_block_offset = round_up(offset, sb->s_blocksize); 3843 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; 3844 3845 /* Now release the pages and zero block aligned part of pages*/ 3846 if (last_block_offset > first_block_offset) { 3847 ret = ext4_update_disksize_before_punch(inode, offset, length); 3848 if (ret) 3849 goto out_dio; 3850 truncate_pagecache_range(inode, first_block_offset, 3851 last_block_offset); 3852 } 3853 3854 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3855 credits = ext4_writepage_trans_blocks(inode); 3856 else 3857 credits = ext4_blocks_for_truncate(inode); 3858 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 3859 if (IS_ERR(handle)) { 3860 ret = PTR_ERR(handle); 3861 ext4_std_error(sb, ret); 3862 goto out_dio; 3863 } 3864 3865 ret = ext4_zero_partial_blocks(handle, inode, offset, 3866 length); 3867 if (ret) 3868 goto out_stop; 3869 3870 first_block = (offset + sb->s_blocksize - 1) >> 3871 EXT4_BLOCK_SIZE_BITS(sb); 3872 stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); 3873 3874 /* If there are no blocks to remove, return now */ 3875 if (first_block >= stop_block) 3876 goto out_stop; 3877 3878 down_write(&EXT4_I(inode)->i_data_sem); 3879 ext4_discard_preallocations(inode); 3880 3881 ret = ext4_es_remove_extent(inode, first_block, 3882 stop_block - first_block); 3883 if (ret) { 3884 up_write(&EXT4_I(inode)->i_data_sem); 3885 goto out_stop; 3886 } 3887 3888 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3889 ret = ext4_ext_remove_space(inode, first_block, 3890 stop_block - 1); 3891 else 3892 ret = ext4_ind_remove_space(handle, inode, first_block, 3893 stop_block); 3894 3895 up_write(&EXT4_I(inode)->i_data_sem); 3896 if (IS_SYNC(inode)) 3897 ext4_handle_sync(handle); 3898 3899 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3900 ext4_mark_inode_dirty(handle, inode); 3901 out_stop: 3902 ext4_journal_stop(handle); 
3903 out_dio: 3904 up_write(&EXT4_I(inode)->i_mmap_sem); 3905 ext4_inode_resume_unlocked_dio(inode); 3906 out_mutex: 3907 inode_unlock(inode); 3908 return ret; 3909 } 3910 3911 int ext4_inode_attach_jinode(struct inode *inode) 3912 { 3913 struct ext4_inode_info *ei = EXT4_I(inode); 3914 struct jbd2_inode *jinode; 3915 3916 if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) 3917 return 0; 3918 3919 jinode = jbd2_alloc_inode(GFP_KERNEL); 3920 spin_lock(&inode->i_lock); 3921 if (!ei->jinode) { 3922 if (!jinode) { 3923 spin_unlock(&inode->i_lock); 3924 return -ENOMEM; 3925 } 3926 ei->jinode = jinode; 3927 jbd2_journal_init_jbd_inode(ei->jinode, inode); 3928 jinode = NULL; 3929 } 3930 spin_unlock(&inode->i_lock); 3931 if (unlikely(jinode != NULL)) 3932 jbd2_free_inode(jinode); 3933 return 0; 3934 } 3935 3936 /* 3937 * ext4_truncate() 3938 * 3939 * We block out ext4_get_block() block instantiations across the entire 3940 * transaction, and VFS/VM ensures that ext4_truncate() cannot run 3941 * simultaneously on behalf of the same inode. 3942 * 3943 * As we work through the truncate and commit bits of it to the journal there 3944 * is one core, guiding principle: the file's tree must always be consistent on 3945 * disk. We must be able to restart the truncate after a crash. 3946 * 3947 * The file's tree may be transiently inconsistent in memory (although it 3948 * probably isn't), but whenever we close off and commit a journal transaction, 3949 * the contents of (the filesystem + the journal) must be consistent and 3950 * restartable. It's pretty simple, really: bottom up, right to left (although 3951 * left-to-right works OK too). 3952 * 3953 * Note that at recovery time, journal replay occurs *before* the restart of 3954 * truncate against the orphan inode list. 3955 * 3956 * The committed inode has the new, desired i_size (which is the same as 3957 * i_disksize in this case). 
After a crash, ext4_orphan_cleanup() will see 3958 * that this inode's truncate did not complete and it will again call 3959 * ext4_truncate() to have another go. So there will be instantiated blocks 3960 * to the right of the truncation point in a crashed ext4 filesystem. But 3961 * that's fine - as long as they are linked from the inode, the post-crash 3962 * ext4_truncate() run will find them and release them. 3963 */ 3964 void ext4_truncate(struct inode *inode) 3965 { 3966 struct ext4_inode_info *ei = EXT4_I(inode); 3967 unsigned int credits; 3968 handle_t *handle; 3969 struct address_space *mapping = inode->i_mapping; 3970 3971 /* 3972 * There is a possibility that we're either freeing the inode 3973 * or it's a completely new inode. In those cases we might not 3974 * have i_mutex locked because it's not necessary. 3975 */ 3976 if (!(inode->i_state & (I_NEW|I_FREEING))) 3977 WARN_ON(!inode_is_locked(inode)); 3978 trace_ext4_truncate_enter(inode); 3979 3980 if (!ext4_can_truncate(inode)) 3981 return; 3982 3983 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3984 3985 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 3986 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 3987 3988 if (ext4_has_inline_data(inode)) { 3989 int has_inline = 1; 3990 3991 ext4_inline_data_truncate(inode, &has_inline); 3992 if (has_inline) 3993 return; 3994 } 3995 3996 /* If we zero-out tail of the page, we have to create jinode for jbd2 */ 3997 if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { 3998 if (ext4_inode_attach_jinode(inode) < 0) 3999 return; 4000 } 4001 4002 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4003 credits = ext4_writepage_trans_blocks(inode); 4004 else 4005 credits = ext4_blocks_for_truncate(inode); 4006 4007 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 4008 if (IS_ERR(handle)) { 4009 ext4_std_error(inode->i_sb, PTR_ERR(handle)); 4010 return; 4011 } 4012 4013 if (inode->i_size & (inode->i_sb->s_blocksize - 
1)) 4014 ext4_block_truncate_page(handle, mapping, inode->i_size); 4015 4016 /* 4017 * We add the inode to the orphan list, so that if this 4018 * truncate spans multiple transactions, and we crash, we will 4019 * resume the truncate when the filesystem recovers. It also 4020 * marks the inode dirty, to catch the new size. 4021 * 4022 * Implication: the file must always be in a sane, consistent 4023 * truncatable state while each transaction commits. 4024 */ 4025 if (ext4_orphan_add(handle, inode)) 4026 goto out_stop; 4027 4028 down_write(&EXT4_I(inode)->i_data_sem); 4029 4030 ext4_discard_preallocations(inode); 4031 4032 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4033 ext4_ext_truncate(handle, inode); 4034 else 4035 ext4_ind_truncate(handle, inode); 4036 4037 up_write(&ei->i_data_sem); 4038 4039 if (IS_SYNC(inode)) 4040 ext4_handle_sync(handle); 4041 4042 out_stop: 4043 /* 4044 * If this was a simple ftruncate() and the file will remain alive, 4045 * then we need to clear up the orphan record which we created above. 4046 * However, if this was a real unlink then we were called by 4047 * ext4_evict_inode(), and we allow that function to clean up the 4048 * orphan info for us. 4049 */ 4050 if (inode->i_nlink) 4051 ext4_orphan_del(handle, inode); 4052 4053 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4054 ext4_mark_inode_dirty(handle, inode); 4055 ext4_journal_stop(handle); 4056 4057 trace_ext4_truncate_exit(inode); 4058 } 4059 4060 /* 4061 * ext4_get_inode_loc returns with an extra refcount against the inode's 4062 * underlying buffer_head on success. If 'in_mem' is true, we have all 4063 * data in memory that is needed to recreate the on-disk version of this 4064 * inode. 
4065 */ 4066 static int __ext4_get_inode_loc(struct inode *inode, 4067 struct ext4_iloc *iloc, int in_mem) 4068 { 4069 struct ext4_group_desc *gdp; 4070 struct buffer_head *bh; 4071 struct super_block *sb = inode->i_sb; 4072 ext4_fsblk_t block; 4073 int inodes_per_block, inode_offset; 4074 4075 iloc->bh = NULL; 4076 if (!ext4_valid_inum(sb, inode->i_ino)) 4077 return -EFSCORRUPTED; 4078 4079 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); 4080 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); 4081 if (!gdp) 4082 return -EIO; 4083 4084 /* 4085 * Figure out the offset within the block group inode table 4086 */ 4087 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; 4088 inode_offset = ((inode->i_ino - 1) % 4089 EXT4_INODES_PER_GROUP(sb)); 4090 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4091 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); 4092 4093 bh = sb_getblk(sb, block); 4094 if (unlikely(!bh)) 4095 return -ENOMEM; 4096 if (!buffer_uptodate(bh)) { 4097 lock_buffer(bh); 4098 4099 /* 4100 * If the buffer has the write error flag, we have failed 4101 * to write out another inode in the same block. In this 4102 * case, we don't have to read the block because we may 4103 * read the old inode data successfully. 4104 */ 4105 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 4106 set_buffer_uptodate(bh); 4107 4108 if (buffer_uptodate(bh)) { 4109 /* someone brought it uptodate while we waited */ 4110 unlock_buffer(bh); 4111 goto has_buffer; 4112 } 4113 4114 /* 4115 * If we have all information of the inode in memory and this 4116 * is the only valid inode in the block, we need not read the 4117 * block. 4118 */ 4119 if (in_mem) { 4120 struct buffer_head *bitmap_bh; 4121 int i, start; 4122 4123 start = inode_offset & ~(inodes_per_block - 1); 4124 4125 /* Is the inode bitmap in cache? 
*/ 4126 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); 4127 if (unlikely(!bitmap_bh)) 4128 goto make_io; 4129 4130 /* 4131 * If the inode bitmap isn't in cache then the 4132 * optimisation may end up performing two reads instead 4133 * of one, so skip it. 4134 */ 4135 if (!buffer_uptodate(bitmap_bh)) { 4136 brelse(bitmap_bh); 4137 goto make_io; 4138 } 4139 for (i = start; i < start + inodes_per_block; i++) { 4140 if (i == inode_offset) 4141 continue; 4142 if (ext4_test_bit(i, bitmap_bh->b_data)) 4143 break; 4144 } 4145 brelse(bitmap_bh); 4146 if (i == start + inodes_per_block) { 4147 /* all other inodes are free, so skip I/O */ 4148 memset(bh->b_data, 0, bh->b_size); 4149 set_buffer_uptodate(bh); 4150 unlock_buffer(bh); 4151 goto has_buffer; 4152 } 4153 } 4154 4155 make_io: 4156 /* 4157 * If we need to do any I/O, try to pre-readahead extra 4158 * blocks from the inode table. 4159 */ 4160 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4161 ext4_fsblk_t b, end, table; 4162 unsigned num; 4163 __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; 4164 4165 table = ext4_inode_table(sb, gdp); 4166 /* s_inode_readahead_blks is always a power of 2 */ 4167 b = block & ~((ext4_fsblk_t) ra_blks - 1); 4168 if (table > b) 4169 b = table; 4170 end = b + ra_blks; 4171 num = EXT4_INODES_PER_GROUP(sb); 4172 if (ext4_has_group_desc_csum(sb)) 4173 num -= ext4_itable_unused_count(sb, gdp); 4174 table += num / inodes_per_block; 4175 if (end > table) 4176 end = table; 4177 while (b <= end) 4178 sb_breadahead(sb, b++); 4179 } 4180 4181 /* 4182 * There are other valid inodes in the buffer, this inode 4183 * has in-inode xattrs, or we don't have this inode in memory. 4184 * Read the block from disk. 
4185 */ 4186 trace_ext4_load_inode(inode); 4187 get_bh(bh); 4188 bh->b_end_io = end_buffer_read_sync; 4189 submit_bh(READ | REQ_META | REQ_PRIO, bh); 4190 wait_on_buffer(bh); 4191 if (!buffer_uptodate(bh)) { 4192 EXT4_ERROR_INODE_BLOCK(inode, block, 4193 "unable to read itable block"); 4194 brelse(bh); 4195 return -EIO; 4196 } 4197 } 4198 has_buffer: 4199 iloc->bh = bh; 4200 return 0; 4201 } 4202 4203 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 4204 { 4205 /* We have all inode data except xattrs in memory here. */ 4206 return __ext4_get_inode_loc(inode, iloc, 4207 !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); 4208 } 4209 4210 void ext4_set_inode_flags(struct inode *inode) 4211 { 4212 unsigned int flags = EXT4_I(inode)->i_flags; 4213 unsigned int new_fl = 0; 4214 4215 if (flags & EXT4_SYNC_FL) 4216 new_fl |= S_SYNC; 4217 if (flags & EXT4_APPEND_FL) 4218 new_fl |= S_APPEND; 4219 if (flags & EXT4_IMMUTABLE_FL) 4220 new_fl |= S_IMMUTABLE; 4221 if (flags & EXT4_NOATIME_FL) 4222 new_fl |= S_NOATIME; 4223 if (flags & EXT4_DIRSYNC_FL) 4224 new_fl |= S_DIRSYNC; 4225 if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) 4226 new_fl |= S_DAX; 4227 inode_set_flags(inode, new_fl, 4228 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); 4229 } 4230 4231 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4232 void ext4_get_inode_flags(struct ext4_inode_info *ei) 4233 { 4234 unsigned int vfs_fl; 4235 unsigned long old_fl, new_fl; 4236 4237 do { 4238 vfs_fl = ei->vfs_inode.i_flags; 4239 old_fl = ei->i_flags; 4240 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4241 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| 4242 EXT4_DIRSYNC_FL); 4243 if (vfs_fl & S_SYNC) 4244 new_fl |= EXT4_SYNC_FL; 4245 if (vfs_fl & S_APPEND) 4246 new_fl |= EXT4_APPEND_FL; 4247 if (vfs_fl & S_IMMUTABLE) 4248 new_fl |= EXT4_IMMUTABLE_FL; 4249 if (vfs_fl & S_NOATIME) 4250 new_fl |= EXT4_NOATIME_FL; 4251 if (vfs_fl & S_DIRSYNC) 4252 new_fl |= EXT4_DIRSYNC_FL; 4253 } while 
(cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); 4254 } 4255 4256 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4257 struct ext4_inode_info *ei) 4258 { 4259 blkcnt_t i_blocks ; 4260 struct inode *inode = &(ei->vfs_inode); 4261 struct super_block *sb = inode->i_sb; 4262 4263 if (ext4_has_feature_huge_file(sb)) { 4264 /* we are using combined 48 bit field */ 4265 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4266 le32_to_cpu(raw_inode->i_blocks_lo); 4267 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { 4268 /* i_blocks represent file system block size */ 4269 return i_blocks << (inode->i_blkbits - 9); 4270 } else { 4271 return i_blocks; 4272 } 4273 } else { 4274 return le32_to_cpu(raw_inode->i_blocks_lo); 4275 } 4276 } 4277 4278 static inline void ext4_iget_extra_inode(struct inode *inode, 4279 struct ext4_inode *raw_inode, 4280 struct ext4_inode_info *ei) 4281 { 4282 __le32 *magic = (void *)raw_inode + 4283 EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; 4284 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { 4285 ext4_set_inode_state(inode, EXT4_STATE_XATTR); 4286 ext4_find_inline_data_nolock(inode); 4287 } else 4288 EXT4_I(inode)->i_inline_off = 0; 4289 } 4290 4291 int ext4_get_projid(struct inode *inode, kprojid_t *projid) 4292 { 4293 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) 4294 return -EOPNOTSUPP; 4295 *projid = EXT4_I(inode)->i_projid; 4296 return 0; 4297 } 4298 4299 struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 4300 { 4301 struct ext4_iloc iloc; 4302 struct ext4_inode *raw_inode; 4303 struct ext4_inode_info *ei; 4304 struct inode *inode; 4305 journal_t *journal = EXT4_SB(sb)->s_journal; 4306 long ret; 4307 int block; 4308 uid_t i_uid; 4309 gid_t i_gid; 4310 projid_t i_projid; 4311 4312 inode = iget_locked(sb, ino); 4313 if (!inode) 4314 return ERR_PTR(-ENOMEM); 4315 if (!(inode->i_state & I_NEW)) 4316 return inode; 4317 4318 ei = EXT4_I(inode); 4319 iloc.bh = 
NULL; 4320 4321 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4322 if (ret < 0) 4323 goto bad_inode; 4324 raw_inode = ext4_raw_inode(&iloc); 4325 4326 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4327 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4328 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4329 EXT4_INODE_SIZE(inode->i_sb)) { 4330 EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", 4331 EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, 4332 EXT4_INODE_SIZE(inode->i_sb)); 4333 ret = -EFSCORRUPTED; 4334 goto bad_inode; 4335 } 4336 } else 4337 ei->i_extra_isize = 0; 4338 4339 /* Precompute checksum seed for inode metadata */ 4340 if (ext4_has_metadata_csum(sb)) { 4341 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4342 __u32 csum; 4343 __le32 inum = cpu_to_le32(inode->i_ino); 4344 __le32 gen = raw_inode->i_generation; 4345 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, 4346 sizeof(inum)); 4347 ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, 4348 sizeof(gen)); 4349 } 4350 4351 if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { 4352 EXT4_ERROR_INODE(inode, "checksum invalid"); 4353 ret = -EFSBADCRC; 4354 goto bad_inode; 4355 } 4356 4357 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4358 i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4359 i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4360 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && 4361 EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && 4362 EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) 4363 i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); 4364 else 4365 i_projid = EXT4_DEF_PROJID; 4366 4367 if (!(test_opt(inode->i_sb, NO_UID32))) { 4368 i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4369 i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4370 } 4371 i_uid_write(inode, i_uid); 4372 i_gid_write(inode, i_gid); 4373 ei->i_projid = make_kprojid(&init_user_ns, i_projid); 4374 set_nlink(inode, 
le16_to_cpu(raw_inode->i_links_count)); 4375 4376 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 4377 ei->i_inline_off = 0; 4378 ei->i_dir_start_lookup = 0; 4379 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4380 /* We now have enough fields to check if the inode was active or not. 4381 * This is needed because nfsd might try to access dead inodes 4382 * the test is that same one that e2fsck uses 4383 * NeilBrown 1999oct15 4384 */ 4385 if (inode->i_nlink == 0) { 4386 if ((inode->i_mode == 0 || 4387 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && 4388 ino != EXT4_BOOT_LOADER_INO) { 4389 /* this inode is deleted */ 4390 ret = -ESTALE; 4391 goto bad_inode; 4392 } 4393 /* The only unlinked inodes we let through here have 4394 * valid i_mode and are being read by the orphan 4395 * recovery code: that's fine, we're about to complete 4396 * the process of deleting those. 4397 * OR it is the EXT4_BOOT_LOADER_INO which is 4398 * not initialized on a new filesystem. */ 4399 } 4400 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4401 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4402 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4403 if (ext4_has_feature_64bit(sb)) 4404 ei->i_file_acl |= 4405 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4406 inode->i_size = ext4_isize(raw_inode); 4407 ei->i_disksize = inode->i_size; 4408 #ifdef CONFIG_QUOTA 4409 ei->i_reserved_quota = 0; 4410 #endif 4411 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4412 ei->i_block_group = iloc.block_group; 4413 ei->i_last_alloc_group = ~0; 4414 /* 4415 * NOTE! The in-memory inode i_data array is in little-endian order 4416 * even on big-endian machines: we do NOT byteswap the block numbers! 
4417 */ 4418 for (block = 0; block < EXT4_N_BLOCKS; block++) 4419 ei->i_data[block] = raw_inode->i_block[block]; 4420 INIT_LIST_HEAD(&ei->i_orphan); 4421 4422 /* 4423 * Set transaction id's of transactions that have to be committed 4424 * to finish f[data]sync. We set them to currently running transaction 4425 * as we cannot be sure that the inode or some of its metadata isn't 4426 * part of the transaction - the inode could have been reclaimed and 4427 * now it is reread from disk. 4428 */ 4429 if (journal) { 4430 transaction_t *transaction; 4431 tid_t tid; 4432 4433 read_lock(&journal->j_state_lock); 4434 if (journal->j_running_transaction) 4435 transaction = journal->j_running_transaction; 4436 else 4437 transaction = journal->j_committing_transaction; 4438 if (transaction) 4439 tid = transaction->t_tid; 4440 else 4441 tid = journal->j_commit_sequence; 4442 read_unlock(&journal->j_state_lock); 4443 ei->i_sync_tid = tid; 4444 ei->i_datasync_tid = tid; 4445 } 4446 4447 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4448 if (ei->i_extra_isize == 0) { 4449 /* The extra space is currently unused. Use it. 
*/ 4450 ei->i_extra_isize = sizeof(struct ext4_inode) - 4451 EXT4_GOOD_OLD_INODE_SIZE; 4452 } else { 4453 ext4_iget_extra_inode(inode, raw_inode, ei); 4454 } 4455 } 4456 4457 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 4458 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 4459 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4460 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4461 4462 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { 4463 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4464 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4465 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4466 inode->i_version |= 4467 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4468 } 4469 } 4470 4471 ret = 0; 4472 if (ei->i_file_acl && 4473 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 4474 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", 4475 ei->i_file_acl); 4476 ret = -EFSCORRUPTED; 4477 goto bad_inode; 4478 } else if (!ext4_has_inline_data(inode)) { 4479 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4480 if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4481 (S_ISLNK(inode->i_mode) && 4482 !ext4_inode_is_fast_symlink(inode)))) 4483 /* Validate extent which is part of inode */ 4484 ret = ext4_ext_check_inode(inode); 4485 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4486 (S_ISLNK(inode->i_mode) && 4487 !ext4_inode_is_fast_symlink(inode))) { 4488 /* Validate block references which are part of inode */ 4489 ret = ext4_ind_check_inode(inode); 4490 } 4491 } 4492 if (ret) 4493 goto bad_inode; 4494 4495 if (S_ISREG(inode->i_mode)) { 4496 inode->i_op = &ext4_file_inode_operations; 4497 inode->i_fop = &ext4_file_operations; 4498 ext4_set_aops(inode); 4499 } else if (S_ISDIR(inode->i_mode)) { 4500 inode->i_op = &ext4_dir_inode_operations; 4501 inode->i_fop = &ext4_dir_operations; 4502 } else if (S_ISLNK(inode->i_mode)) { 4503 if (ext4_encrypted_inode(inode)) { 4504 inode->i_op = 
&ext4_encrypted_symlink_inode_operations; 4505 ext4_set_aops(inode); 4506 } else if (ext4_inode_is_fast_symlink(inode)) { 4507 inode->i_link = (char *)ei->i_data; 4508 inode->i_op = &ext4_fast_symlink_inode_operations; 4509 nd_terminate_link(ei->i_data, inode->i_size, 4510 sizeof(ei->i_data) - 1); 4511 } else { 4512 inode->i_op = &ext4_symlink_inode_operations; 4513 ext4_set_aops(inode); 4514 } 4515 inode_nohighmem(inode); 4516 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 4517 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 4518 inode->i_op = &ext4_special_inode_operations; 4519 if (raw_inode->i_block[0]) 4520 init_special_inode(inode, inode->i_mode, 4521 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 4522 else 4523 init_special_inode(inode, inode->i_mode, 4524 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4525 } else if (ino == EXT4_BOOT_LOADER_INO) { 4526 make_bad_inode(inode); 4527 } else { 4528 ret = -EFSCORRUPTED; 4529 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); 4530 goto bad_inode; 4531 } 4532 brelse(iloc.bh); 4533 ext4_set_inode_flags(inode); 4534 unlock_new_inode(inode); 4535 return inode; 4536 4537 bad_inode: 4538 brelse(iloc.bh); 4539 iget_failed(inode); 4540 return ERR_PTR(ret); 4541 } 4542 4543 struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) 4544 { 4545 if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) 4546 return ERR_PTR(-EFSCORRUPTED); 4547 return ext4_iget(sb, ino); 4548 } 4549 4550 static int ext4_inode_blocks_set(handle_t *handle, 4551 struct ext4_inode *raw_inode, 4552 struct ext4_inode_info *ei) 4553 { 4554 struct inode *inode = &(ei->vfs_inode); 4555 u64 i_blocks = inode->i_blocks; 4556 struct super_block *sb = inode->i_sb; 4557 4558 if (i_blocks <= ~0U) { 4559 /* 4560 * i_blocks can be represented in a 32 bit variable 4561 * as multiple of 512 bytes 4562 */ 4563 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4564 raw_inode->i_blocks_high = 0; 4565 
ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 4566 return 0; 4567 } 4568 if (!ext4_has_feature_huge_file(sb)) 4569 return -EFBIG; 4570 4571 if (i_blocks <= 0xffffffffffffULL) { 4572 /* 4573 * i_blocks can be represented in a 48 bit variable 4574 * as multiple of 512 bytes 4575 */ 4576 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4577 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4578 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 4579 } else { 4580 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); 4581 /* i_block is stored in file system block size */ 4582 i_blocks = i_blocks >> (inode->i_blkbits - 9); 4583 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4584 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4585 } 4586 return 0; 4587 } 4588 4589 struct other_inode { 4590 unsigned long orig_ino; 4591 struct ext4_inode *raw_inode; 4592 }; 4593 4594 static int other_inode_match(struct inode * inode, unsigned long ino, 4595 void *data) 4596 { 4597 struct other_inode *oi = (struct other_inode *) data; 4598 4599 if ((inode->i_ino != ino) || 4600 (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | 4601 I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || 4602 ((inode->i_state & I_DIRTY_TIME) == 0)) 4603 return 0; 4604 spin_lock(&inode->i_lock); 4605 if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | 4606 I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) && 4607 (inode->i_state & I_DIRTY_TIME)) { 4608 struct ext4_inode_info *ei = EXT4_I(inode); 4609 4610 inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); 4611 spin_unlock(&inode->i_lock); 4612 4613 spin_lock(&ei->i_raw_lock); 4614 EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode); 4615 EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode); 4616 EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode); 4617 ext4_inode_csum_set(inode, oi->raw_inode, ei); 4618 spin_unlock(&ei->i_raw_lock); 4619 trace_ext4_other_inode_update_time(inode, oi->orig_ino); 4620 return -1; 4621 } 4622 spin_unlock(&inode->i_lock); 4623 
return -1; 4624 } 4625 4626 /* 4627 * Opportunistically update the other time fields for other inodes in 4628 * the same inode table block. 4629 */ 4630 static void ext4_update_other_inodes_time(struct super_block *sb, 4631 unsigned long orig_ino, char *buf) 4632 { 4633 struct other_inode oi; 4634 unsigned long ino; 4635 int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; 4636 int inode_size = EXT4_INODE_SIZE(sb); 4637 4638 oi.orig_ino = orig_ino; 4639 /* 4640 * Calculate the first inode in the inode table block. Inode 4641 * numbers are one-based. That is, the first inode in a block 4642 * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). 4643 */ 4644 ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; 4645 for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { 4646 if (ino == orig_ino) 4647 continue; 4648 oi.raw_inode = (struct ext4_inode *) buf; 4649 (void) find_inode_nowait(sb, ino, other_inode_match, &oi); 4650 } 4651 } 4652 4653 /* 4654 * Post the struct inode info into an on-disk inode location in the 4655 * buffer-cache. This gobbles the caller's reference to the 4656 * buffer_head in the inode location struct. 4657 * 4658 * The caller must have write access to iloc->bh. 4659 */ 4660 static int ext4_do_update_inode(handle_t *handle, 4661 struct inode *inode, 4662 struct ext4_iloc *iloc) 4663 { 4664 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 4665 struct ext4_inode_info *ei = EXT4_I(inode); 4666 struct buffer_head *bh = iloc->bh; 4667 struct super_block *sb = inode->i_sb; 4668 int err = 0, rc, block; 4669 int need_datasync = 0, set_large_file = 0; 4670 uid_t i_uid; 4671 gid_t i_gid; 4672 projid_t i_projid; 4673 4674 spin_lock(&ei->i_raw_lock); 4675 4676 /* For fields not tracked in the in-memory inode, 4677 * initialise them to zero for new inodes. 
*/ 4678 if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) 4679 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 4680 4681 ext4_get_inode_flags(ei); 4682 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4683 i_uid = i_uid_read(inode); 4684 i_gid = i_gid_read(inode); 4685 i_projid = from_kprojid(&init_user_ns, ei->i_projid); 4686 if (!(test_opt(inode->i_sb, NO_UID32))) { 4687 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); 4688 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); 4689 /* 4690 * Fix up interoperability with old kernels. Otherwise, old inodes get 4691 * re-used with the upper 16 bits of the uid/gid intact 4692 */ 4693 if (!ei->i_dtime) { 4694 raw_inode->i_uid_high = 4695 cpu_to_le16(high_16_bits(i_uid)); 4696 raw_inode->i_gid_high = 4697 cpu_to_le16(high_16_bits(i_gid)); 4698 } else { 4699 raw_inode->i_uid_high = 0; 4700 raw_inode->i_gid_high = 0; 4701 } 4702 } else { 4703 raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); 4704 raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); 4705 raw_inode->i_uid_high = 0; 4706 raw_inode->i_gid_high = 0; 4707 } 4708 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 4709 4710 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 4711 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 4712 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 4713 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 4714 4715 err = ext4_inode_blocks_set(handle, raw_inode, ei); 4716 if (err) { 4717 spin_unlock(&ei->i_raw_lock); 4718 goto out_brelse; 4719 } 4720 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4721 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); 4722 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) 4723 raw_inode->i_file_acl_high = 4724 cpu_to_le16(ei->i_file_acl >> 32); 4725 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 4726 if (ei->i_disksize != ext4_isize(raw_inode)) { 4727 ext4_isize_set(raw_inode, ei->i_disksize); 4728 need_datasync = 1; 4729 } 4730 if (ei->i_disksize > 
0x7fffffffULL) { 4731 if (!ext4_has_feature_large_file(sb) || 4732 EXT4_SB(sb)->s_es->s_rev_level == 4733 cpu_to_le32(EXT4_GOOD_OLD_REV)) 4734 set_large_file = 1; 4735 } 4736 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 4737 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 4738 if (old_valid_dev(inode->i_rdev)) { 4739 raw_inode->i_block[0] = 4740 cpu_to_le32(old_encode_dev(inode->i_rdev)); 4741 raw_inode->i_block[1] = 0; 4742 } else { 4743 raw_inode->i_block[0] = 0; 4744 raw_inode->i_block[1] = 4745 cpu_to_le32(new_encode_dev(inode->i_rdev)); 4746 raw_inode->i_block[2] = 0; 4747 } 4748 } else if (!ext4_has_inline_data(inode)) { 4749 for (block = 0; block < EXT4_N_BLOCKS; block++) 4750 raw_inode->i_block[block] = ei->i_data[block]; 4751 } 4752 4753 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { 4754 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4755 if (ei->i_extra_isize) { 4756 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4757 raw_inode->i_version_hi = 4758 cpu_to_le32(inode->i_version >> 32); 4759 raw_inode->i_extra_isize = 4760 cpu_to_le16(ei->i_extra_isize); 4761 } 4762 } 4763 4764 BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 4765 EXT4_FEATURE_RO_COMPAT_PROJECT) && 4766 i_projid != EXT4_DEF_PROJID); 4767 4768 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 4769 EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) 4770 raw_inode->i_projid = cpu_to_le32(i_projid); 4771 4772 ext4_inode_csum_set(inode, raw_inode, ei); 4773 spin_unlock(&ei->i_raw_lock); 4774 if (inode->i_sb->s_flags & MS_LAZYTIME) 4775 ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, 4776 bh->b_data); 4777 4778 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4779 rc = ext4_handle_dirty_metadata(handle, NULL, bh); 4780 if (!err) 4781 err = rc; 4782 ext4_clear_inode_state(inode, EXT4_STATE_NEW); 4783 if (set_large_file) { 4784 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); 4785 err = ext4_journal_get_write_access(handle, 
EXT4_SB(sb)->s_sbh); 4786 if (err) 4787 goto out_brelse; 4788 ext4_update_dynamic_rev(sb); 4789 ext4_set_feature_large_file(sb); 4790 ext4_handle_sync(handle); 4791 err = ext4_handle_dirty_super(handle, sb); 4792 } 4793 ext4_update_inode_fsync_trans(handle, inode, need_datasync); 4794 out_brelse: 4795 brelse(bh); 4796 ext4_std_error(inode->i_sb, err); 4797 return err; 4798 } 4799 4800 /* 4801 * ext4_write_inode() 4802 * 4803 * We are called from a few places: 4804 * 4805 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. 4806 * Here, there will be no transaction running. We wait for any running 4807 * transaction to commit. 4808 * 4809 * - Within flush work (sys_sync(), kupdate and such). 4810 * We wait on commit, if told to. 4811 * 4812 * - Within iput_final() -> write_inode_now() 4813 * We wait on commit, if told to. 4814 * 4815 * In all cases it is actually safe for us to return without doing anything, 4816 * because the inode has been copied into a raw inode buffer in 4817 * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL 4818 * writeback. 4819 * 4820 * Note that we are absolutely dependent upon all inode dirtiers doing the 4821 * right thing: they *must* call mark_inode_dirty() after dirtying info in 4822 * which we are interested. 4823 * 4824 * It would be a bug for them to not do this. The code: 4825 * 4826 * mark_inode_dirty(inode) 4827 * stuff(); 4828 * inode->i_size = expr; 4829 * 4830 * is in error because write_inode() could occur while `stuff()' is running, 4831 * and the new i_size will be lost. Plus the inode will no longer be on the 4832 * superblock's dirty inode list. 
4833 */ 4834 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) 4835 { 4836 int err; 4837 4838 if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) 4839 return 0; 4840 4841 if (EXT4_SB(inode->i_sb)->s_journal) { 4842 if (ext4_journal_current_handle()) { 4843 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 4844 dump_stack(); 4845 return -EIO; 4846 } 4847 4848 /* 4849 * No need to force transaction in WB_SYNC_NONE mode. Also 4850 * ext4_sync_fs() will force the commit after everything is 4851 * written. 4852 */ 4853 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) 4854 return 0; 4855 4856 err = ext4_force_commit(inode->i_sb); 4857 } else { 4858 struct ext4_iloc iloc; 4859 4860 err = __ext4_get_inode_loc(inode, &iloc, 0); 4861 if (err) 4862 return err; 4863 /* 4864 * sync(2) will flush the whole buffer cache. No need to do 4865 * it here separately for each inode. 4866 */ 4867 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) 4868 sync_dirty_buffer(iloc.bh); 4869 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 4870 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, 4871 "IO error syncing inode"); 4872 err = -EIO; 4873 } 4874 brelse(iloc.bh); 4875 } 4876 return err; 4877 } 4878 4879 /* 4880 * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate 4881 * buffers that are attached to a page stradding i_size and are undergoing 4882 * commit. In that case we have to wait for commit to finish and try again. 4883 */ 4884 static void ext4_wait_for_tail_page_commit(struct inode *inode) 4885 { 4886 struct page *page; 4887 unsigned offset; 4888 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 4889 tid_t commit_tid = 0; 4890 int ret; 4891 4892 offset = inode->i_size & (PAGE_SIZE - 1); 4893 /* 4894 * All buffers in the last page remain valid? Then there's nothing to 4895 * do. 
We do the check mainly to optimize the common PAGE_SIZE == 4896 * blocksize case 4897 */ 4898 if (offset > PAGE_SIZE - (1 << inode->i_blkbits)) 4899 return; 4900 while (1) { 4901 page = find_lock_page(inode->i_mapping, 4902 inode->i_size >> PAGE_SHIFT); 4903 if (!page) 4904 return; 4905 ret = __ext4_journalled_invalidatepage(page, offset, 4906 PAGE_SIZE - offset); 4907 unlock_page(page); 4908 put_page(page); 4909 if (ret != -EBUSY) 4910 return; 4911 commit_tid = 0; 4912 read_lock(&journal->j_state_lock); 4913 if (journal->j_committing_transaction) 4914 commit_tid = journal->j_committing_transaction->t_tid; 4915 read_unlock(&journal->j_state_lock); 4916 if (commit_tid) 4917 jbd2_log_wait_commit(journal, commit_tid); 4918 } 4919 } 4920 4921 /* 4922 * ext4_setattr() 4923 * 4924 * Called from notify_change. 4925 * 4926 * We want to trap VFS attempts to truncate the file as soon as 4927 * possible. In particular, we want to make sure that when the VFS 4928 * shrinks i_size, we put the inode on the orphan list and modify 4929 * i_disksize immediately, so that during the subsequent flushing of 4930 * dirty pages and freeing of disk blocks, we can guarantee that any 4931 * commit will leave the blocks being flushed in an unused state on 4932 * disk. (On recovery, the inode will get truncated and the blocks will 4933 * be freed, so we have a strong guarantee that no future commit will 4934 * leave these blocks visible to the user.) 4935 * 4936 * Another thing we have to assure is that if we are in ordered mode 4937 * and inode is still attached to the committing transaction, we must 4938 * we start writeout of all the dirty pages which are being truncated. 4939 * This way we are sure that all the data written in the previous 4940 * transaction are already on disk (truncate waits for pages under 4941 * writeback). 4942 * 4943 * Called with inode->i_mutex down. 
4944 */ 4945 int ext4_setattr(struct dentry *dentry, struct iattr *attr) 4946 { 4947 struct inode *inode = d_inode(dentry); 4948 int error, rc = 0; 4949 int orphan = 0; 4950 const unsigned int ia_valid = attr->ia_valid; 4951 4952 error = inode_change_ok(inode, attr); 4953 if (error) 4954 return error; 4955 4956 if (is_quota_modification(inode, attr)) { 4957 error = dquot_initialize(inode); 4958 if (error) 4959 return error; 4960 } 4961 if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || 4962 (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { 4963 handle_t *handle; 4964 4965 /* (user+group)*(old+new) structure, inode write (sb, 4966 * inode block, ? - but truncate inode update has it) */ 4967 handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 4968 (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + 4969 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); 4970 if (IS_ERR(handle)) { 4971 error = PTR_ERR(handle); 4972 goto err_out; 4973 } 4974 error = dquot_transfer(inode, attr); 4975 if (error) { 4976 ext4_journal_stop(handle); 4977 return error; 4978 } 4979 /* Update corresponding info in inode so that everything is in 4980 * one transaction */ 4981 if (attr->ia_valid & ATTR_UID) 4982 inode->i_uid = attr->ia_uid; 4983 if (attr->ia_valid & ATTR_GID) 4984 inode->i_gid = attr->ia_gid; 4985 error = ext4_mark_inode_dirty(handle, inode); 4986 ext4_journal_stop(handle); 4987 } 4988 4989 if (attr->ia_valid & ATTR_SIZE) { 4990 handle_t *handle; 4991 loff_t oldsize = inode->i_size; 4992 int shrink = (attr->ia_size <= inode->i_size); 4993 4994 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 4995 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4996 4997 if (attr->ia_size > sbi->s_bitmap_maxbytes) 4998 return -EFBIG; 4999 } 5000 if (!S_ISREG(inode->i_mode)) 5001 return -EINVAL; 5002 5003 if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) 5004 inode_inc_iversion(inode); 5005 5006 if (ext4_should_order_data(inode) && 5007 (attr->ia_size < 
inode->i_size)) { 5008 error = ext4_begin_ordered_truncate(inode, 5009 attr->ia_size); 5010 if (error) 5011 goto err_out; 5012 } 5013 if (attr->ia_size != inode->i_size) { 5014 handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); 5015 if (IS_ERR(handle)) { 5016 error = PTR_ERR(handle); 5017 goto err_out; 5018 } 5019 if (ext4_handle_valid(handle) && shrink) { 5020 error = ext4_orphan_add(handle, inode); 5021 orphan = 1; 5022 } 5023 /* 5024 * Update c/mtime on truncate up, ext4_truncate() will 5025 * update c/mtime in shrink case below 5026 */ 5027 if (!shrink) { 5028 inode->i_mtime = ext4_current_time(inode); 5029 inode->i_ctime = inode->i_mtime; 5030 } 5031 down_write(&EXT4_I(inode)->i_data_sem); 5032 EXT4_I(inode)->i_disksize = attr->ia_size; 5033 rc = ext4_mark_inode_dirty(handle, inode); 5034 if (!error) 5035 error = rc; 5036 /* 5037 * We have to update i_size under i_data_sem together 5038 * with i_disksize to avoid races with writeback code 5039 * running ext4_wb_update_i_disksize(). 5040 */ 5041 if (!error) 5042 i_size_write(inode, attr->ia_size); 5043 up_write(&EXT4_I(inode)->i_data_sem); 5044 ext4_journal_stop(handle); 5045 if (error) { 5046 if (orphan) 5047 ext4_orphan_del(NULL, inode); 5048 goto err_out; 5049 } 5050 } 5051 if (!shrink) 5052 pagecache_isize_extended(inode, oldsize, inode->i_size); 5053 5054 /* 5055 * Blocks are going to be removed from the inode. Wait 5056 * for dio in flight. Temporarily disable 5057 * dioread_nolock to prevent livelock. 5058 */ 5059 if (orphan) { 5060 if (!ext4_should_journal_data(inode)) { 5061 ext4_inode_block_unlocked_dio(inode); 5062 inode_dio_wait(inode); 5063 ext4_inode_resume_unlocked_dio(inode); 5064 } else 5065 ext4_wait_for_tail_page_commit(inode); 5066 } 5067 down_write(&EXT4_I(inode)->i_mmap_sem); 5068 /* 5069 * Truncate pagecache after we've waited for commit 5070 * in data=journal mode to make pages freeable. 
5071 */ 5072 truncate_pagecache(inode, inode->i_size); 5073 if (shrink) 5074 ext4_truncate(inode); 5075 up_write(&EXT4_I(inode)->i_mmap_sem); 5076 } 5077 5078 if (!rc) { 5079 setattr_copy(inode, attr); 5080 mark_inode_dirty(inode); 5081 } 5082 5083 /* 5084 * If the call to ext4_truncate failed to get a transaction handle at 5085 * all, we need to clean up the in-core orphan list manually. 5086 */ 5087 if (orphan && inode->i_nlink) 5088 ext4_orphan_del(NULL, inode); 5089 5090 if (!rc && (ia_valid & ATTR_MODE)) 5091 rc = posix_acl_chmod(inode, inode->i_mode); 5092 5093 err_out: 5094 ext4_std_error(inode->i_sb, error); 5095 if (!error) 5096 error = rc; 5097 return error; 5098 } 5099 5100 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 5101 struct kstat *stat) 5102 { 5103 struct inode *inode; 5104 unsigned long long delalloc_blocks; 5105 5106 inode = d_inode(dentry); 5107 generic_fillattr(inode, stat); 5108 5109 /* 5110 * If there is inline data in the inode, the inode will normally not 5111 * have data blocks allocated (it may have an external xattr block). 5112 * Report at least one sector for such files, so tools like tar, rsync, 5113 * others doen't incorrectly think the file is completely sparse. 5114 */ 5115 if (unlikely(ext4_has_inline_data(inode))) 5116 stat->blocks += (stat->size + 511) >> 9; 5117 5118 /* 5119 * We can't update i_blocks if the block allocation is delayed 5120 * otherwise in the case of system crash before the real block 5121 * allocation is done, we will have i_blocks inconsistent with 5122 * on-disk file blocks. 5123 * We always keep i_blocks updated together with real 5124 * allocation. But to not confuse with user, stat 5125 * will return the blocks that include the delayed allocation 5126 * blocks for this file. 
5127 */ 5128 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), 5129 EXT4_I(inode)->i_reserved_data_blocks); 5130 stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); 5131 return 0; 5132 } 5133 5134 static int ext4_index_trans_blocks(struct inode *inode, int lblocks, 5135 int pextents) 5136 { 5137 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 5138 return ext4_ind_trans_blocks(inode, lblocks); 5139 return ext4_ext_index_trans_blocks(inode, pextents); 5140 } 5141 5142 /* 5143 * Account for index blocks, block groups bitmaps and block group 5144 * descriptor blocks if modify datablocks and index blocks 5145 * worse case, the indexs blocks spread over different block groups 5146 * 5147 * If datablocks are discontiguous, they are possible to spread over 5148 * different block groups too. If they are contiguous, with flexbg, 5149 * they could still across block group boundary. 5150 * 5151 * Also account for superblock, inode, quota and xattr blocks 5152 */ 5153 static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, 5154 int pextents) 5155 { 5156 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5157 int gdpblocks; 5158 int idxblocks; 5159 int ret = 0; 5160 5161 /* 5162 * How many index blocks need to touch to map @lblocks logical blocks 5163 * to @pextents physical extents? 
5164 */ 5165 idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); 5166 5167 ret = idxblocks; 5168 5169 /* 5170 * Now let's see how many group bitmaps and group descriptors need 5171 * to account 5172 */ 5173 groups = idxblocks + pextents; 5174 gdpblocks = groups; 5175 if (groups > ngroups) 5176 groups = ngroups; 5177 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 5178 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 5179 5180 /* bitmaps and block group descriptor blocks */ 5181 ret += groups + gdpblocks; 5182 5183 /* Blocks for super block, inode, quota and xattr blocks */ 5184 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); 5185 5186 return ret; 5187 } 5188 5189 /* 5190 * Calculate the total number of credits to reserve to fit 5191 * the modification of a single pages into a single transaction, 5192 * which may include multiple chunks of block allocations. 5193 * 5194 * This could be called via ext4_write_begin() 5195 * 5196 * We need to consider the worse case, when 5197 * one new block per extent. 5198 */ 5199 int ext4_writepage_trans_blocks(struct inode *inode) 5200 { 5201 int bpp = ext4_journal_blocks_per_page(inode); 5202 int ret; 5203 5204 ret = ext4_meta_trans_blocks(inode, bpp, bpp); 5205 5206 /* Account for data blocks for journalled mode */ 5207 if (ext4_should_journal_data(inode)) 5208 ret += bpp; 5209 return ret; 5210 } 5211 5212 /* 5213 * Calculate the journal credits for a chunk of data modification. 5214 * 5215 * This is called from DIO, fallocate or whoever calling 5216 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. 5217 * 5218 * journal buffers for data blocks are not included here, as DIO 5219 * and fallocate do no need to journal data buffers. 5220 */ 5221 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) 5222 { 5223 return ext4_meta_trans_blocks(inode, nrblocks, 1); 5224 } 5225 5226 /* 5227 * The caller must have previously called ext4_reserve_inode_write(). 
5228 * Give this, we know that the caller already has write access to iloc->bh. 5229 */ 5230 int ext4_mark_iloc_dirty(handle_t *handle, 5231 struct inode *inode, struct ext4_iloc *iloc) 5232 { 5233 int err = 0; 5234 5235 if (IS_I_VERSION(inode)) 5236 inode_inc_iversion(inode); 5237 5238 /* the do_update_inode consumes one bh->b_count */ 5239 get_bh(iloc->bh); 5240 5241 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5242 err = ext4_do_update_inode(handle, inode, iloc); 5243 put_bh(iloc->bh); 5244 return err; 5245 } 5246 5247 /* 5248 * On success, We end up with an outstanding reference count against 5249 * iloc->bh. This _must_ be cleaned up later. 5250 */ 5251 5252 int 5253 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 5254 struct ext4_iloc *iloc) 5255 { 5256 int err; 5257 5258 err = ext4_get_inode_loc(inode, iloc); 5259 if (!err) { 5260 BUFFER_TRACE(iloc->bh, "get_write_access"); 5261 err = ext4_journal_get_write_access(handle, iloc->bh); 5262 if (err) { 5263 brelse(iloc->bh); 5264 iloc->bh = NULL; 5265 } 5266 } 5267 ext4_std_error(inode->i_sb, err); 5268 return err; 5269 } 5270 5271 /* 5272 * Expand an inode by new_extra_isize bytes. 5273 * Returns 0 on success or negative error number on failure. 
5274 */ 5275 static int ext4_expand_extra_isize(struct inode *inode, 5276 unsigned int new_extra_isize, 5277 struct ext4_iloc iloc, 5278 handle_t *handle) 5279 { 5280 struct ext4_inode *raw_inode; 5281 struct ext4_xattr_ibody_header *header; 5282 5283 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 5284 return 0; 5285 5286 raw_inode = ext4_raw_inode(&iloc); 5287 5288 header = IHDR(inode, raw_inode); 5289 5290 /* No extended attributes present */ 5291 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || 5292 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5293 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5294 new_extra_isize); 5295 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5296 return 0; 5297 } 5298 5299 /* try to expand with EAs present */ 5300 return ext4_expand_extra_isize_ea(inode, new_extra_isize, 5301 raw_inode, handle); 5302 } 5303 5304 /* 5305 * What we do here is to mark the in-core inode as clean with respect to inode 5306 * dirtiness (it may still be data-dirty). 5307 * This means that the in-core inode may be reaped by prune_icache 5308 * without having to perform any I/O. This is a very good thing, 5309 * because *any* task may call prune_icache - even ones which 5310 * have a transaction open against a different journal. 5311 * 5312 * Is this cheating? Not really. Sure, we haven't written the 5313 * inode out, but prune_icache isn't a user-visible syncing function. 5314 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 5315 * we start and wait on commits. 
5316 */ 5317 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 5318 { 5319 struct ext4_iloc iloc; 5320 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5321 static unsigned int mnt_count; 5322 int err, ret; 5323 5324 might_sleep(); 5325 trace_ext4_mark_inode_dirty(inode, _RET_IP_); 5326 err = ext4_reserve_inode_write(handle, inode, &iloc); 5327 if (err) 5328 return err; 5329 if (ext4_handle_valid(handle) && 5330 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5331 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { 5332 /* 5333 * We need extra buffer credits since we may write into EA block 5334 * with this same handle. If journal_extend fails, then it will 5335 * only result in a minor loss of functionality for that inode. 5336 * If this is felt to be critical, then e2fsck should be run to 5337 * force a large enough s_min_extra_isize. 5338 */ 5339 if ((jbd2_journal_extend(handle, 5340 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { 5341 ret = ext4_expand_extra_isize(inode, 5342 sbi->s_want_extra_isize, 5343 iloc, handle); 5344 if (ret) { 5345 ext4_set_inode_state(inode, 5346 EXT4_STATE_NO_EXPAND); 5347 if (mnt_count != 5348 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5349 ext4_warning(inode->i_sb, 5350 "Unable to expand inode %lu. Delete" 5351 " some EAs or run e2fsck.", 5352 inode->i_ino); 5353 mnt_count = 5354 le16_to_cpu(sbi->s_es->s_mnt_count); 5355 } 5356 } 5357 } 5358 } 5359 return ext4_mark_iloc_dirty(handle, inode, &iloc); 5360 } 5361 5362 /* 5363 * ext4_dirty_inode() is called from __mark_inode_dirty() 5364 * 5365 * We're really interested in the case where a file is being extended. 5366 * i_size has been changed by generic_commit_write() and we thus need 5367 * to include the updated inode in the current transaction. 5368 * 5369 * Also, dquot_alloc_block() will always dirty the inode when blocks 5370 * are allocated to the file. 
5371 * 5372 * If the inode is marked synchronous, we don't honour that here - doing 5373 * so would cause a commit on atime updates, which we don't bother doing. 5374 * We handle synchronous inodes at the highest possible level. 5375 * 5376 * If only the I_DIRTY_TIME flag is set, we can skip everything. If 5377 * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need 5378 * to copy into the on-disk inode structure are the timestamp files. 5379 */ 5380 void ext4_dirty_inode(struct inode *inode, int flags) 5381 { 5382 handle_t *handle; 5383 5384 if (flags == I_DIRTY_TIME) 5385 return; 5386 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 5387 if (IS_ERR(handle)) 5388 goto out; 5389 5390 ext4_mark_inode_dirty(handle, inode); 5391 5392 ext4_journal_stop(handle); 5393 out: 5394 return; 5395 } 5396 5397 #if 0 5398 /* 5399 * Bind an inode's backing buffer_head into this transaction, to prevent 5400 * it from being flushed to disk early. Unlike 5401 * ext4_reserve_inode_write, this leaves behind no bh reference and 5402 * returns no iloc structure, so the caller needs to repeat the iloc 5403 * lookup to mark the inode dirty later. 5404 */ 5405 static int ext4_pin_inode(handle_t *handle, struct inode *inode) 5406 { 5407 struct ext4_iloc iloc; 5408 5409 int err = 0; 5410 if (handle) { 5411 err = ext4_get_inode_loc(inode, &iloc); 5412 if (!err) { 5413 BUFFER_TRACE(iloc.bh, "get_write_access"); 5414 err = jbd2_journal_get_write_access(handle, iloc.bh); 5415 if (!err) 5416 err = ext4_handle_dirty_metadata(handle, 5417 NULL, 5418 iloc.bh); 5419 brelse(iloc.bh); 5420 } 5421 } 5422 ext4_std_error(inode->i_sb, err); 5423 return err; 5424 } 5425 #endif 5426 5427 int ext4_change_inode_journal_flag(struct inode *inode, int val) 5428 { 5429 journal_t *journal; 5430 handle_t *handle; 5431 int err; 5432 5433 /* 5434 * We have to be very careful here: changing a data block's 5435 * journaling status dynamically is dangerous. 
If we write a 5436 * data block to the journal, change the status and then delete 5437 * that block, we risk forgetting to revoke the old log record 5438 * from the journal and so a subsequent replay can corrupt data. 5439 * So, first we make sure that the journal is empty and that 5440 * nobody is changing anything. 5441 */ 5442 5443 journal = EXT4_JOURNAL(inode); 5444 if (!journal) 5445 return 0; 5446 if (is_journal_aborted(journal)) 5447 return -EROFS; 5448 /* We have to allocate physical blocks for delalloc blocks 5449 * before flushing journal. otherwise delalloc blocks can not 5450 * be allocated any more. even more truncate on delalloc blocks 5451 * could trigger BUG by flushing delalloc blocks in journal. 5452 * There is no delalloc block in non-journal data mode. 5453 */ 5454 if (val && test_opt(inode->i_sb, DELALLOC)) { 5455 err = ext4_alloc_da_blocks(inode); 5456 if (err < 0) 5457 return err; 5458 } 5459 5460 /* Wait for all existing dio workers */ 5461 ext4_inode_block_unlocked_dio(inode); 5462 inode_dio_wait(inode); 5463 5464 jbd2_journal_lock_updates(journal); 5465 5466 /* 5467 * OK, there are no updates running now, and all cached data is 5468 * synced to disk. We are now in a completely consistent state 5469 * which doesn't have anything in the journal, and we know that 5470 * no filesystem updates are running, so it is safe to modify 5471 * the inode's in-core data-journaling state flag now. 5472 */ 5473 5474 if (val) 5475 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 5476 else { 5477 err = jbd2_journal_flush(journal); 5478 if (err < 0) { 5479 jbd2_journal_unlock_updates(journal); 5480 ext4_inode_resume_unlocked_dio(inode); 5481 return err; 5482 } 5483 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 5484 } 5485 ext4_set_aops(inode); 5486 5487 jbd2_journal_unlock_updates(journal); 5488 ext4_inode_resume_unlocked_dio(inode); 5489 5490 /* Finally we can mark the inode as dirty. 
*/ 5491 5492 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 5493 if (IS_ERR(handle)) 5494 return PTR_ERR(handle); 5495 5496 err = ext4_mark_inode_dirty(handle, inode); 5497 ext4_handle_sync(handle); 5498 ext4_journal_stop(handle); 5499 ext4_std_error(inode->i_sb, err); 5500 5501 return err; 5502 } 5503 5504 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) 5505 { 5506 return !buffer_mapped(bh); 5507 } 5508 5509 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 5510 { 5511 struct page *page = vmf->page; 5512 loff_t size; 5513 unsigned long len; 5514 int ret; 5515 struct file *file = vma->vm_file; 5516 struct inode *inode = file_inode(file); 5517 struct address_space *mapping = inode->i_mapping; 5518 handle_t *handle; 5519 get_block_t *get_block; 5520 int retries = 0; 5521 5522 sb_start_pagefault(inode->i_sb); 5523 file_update_time(vma->vm_file); 5524 5525 down_read(&EXT4_I(inode)->i_mmap_sem); 5526 /* Delalloc case is easy... */ 5527 if (test_opt(inode->i_sb, DELALLOC) && 5528 !ext4_should_journal_data(inode) && 5529 !ext4_nonda_switch(inode->i_sb)) { 5530 do { 5531 ret = block_page_mkwrite(vma, vmf, 5532 ext4_da_get_block_prep); 5533 } while (ret == -ENOSPC && 5534 ext4_should_retry_alloc(inode->i_sb, &retries)); 5535 goto out_ret; 5536 } 5537 5538 lock_page(page); 5539 size = i_size_read(inode); 5540 /* Page got truncated from under us? */ 5541 if (page->mapping != mapping || page_offset(page) > size) { 5542 unlock_page(page); 5543 ret = VM_FAULT_NOPAGE; 5544 goto out; 5545 } 5546 5547 if (page->index == size >> PAGE_SHIFT) 5548 len = size & ~PAGE_MASK; 5549 else 5550 len = PAGE_SIZE; 5551 /* 5552 * Return if we have all the buffers mapped. 
This avoids the need to do 5553 * journal_start/journal_stop which can block and take a long time 5554 */ 5555 if (page_has_buffers(page)) { 5556 if (!ext4_walk_page_buffers(NULL, page_buffers(page), 5557 0, len, NULL, 5558 ext4_bh_unmapped)) { 5559 /* Wait so that we don't change page under IO */ 5560 wait_for_stable_page(page); 5561 ret = VM_FAULT_LOCKED; 5562 goto out; 5563 } 5564 } 5565 unlock_page(page); 5566 /* OK, we need to fill the hole... */ 5567 if (ext4_should_dioread_nolock(inode)) 5568 get_block = ext4_get_block_unwritten; 5569 else 5570 get_block = ext4_get_block; 5571 retry_alloc: 5572 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 5573 ext4_writepage_trans_blocks(inode)); 5574 if (IS_ERR(handle)) { 5575 ret = VM_FAULT_SIGBUS; 5576 goto out; 5577 } 5578 ret = block_page_mkwrite(vma, vmf, get_block); 5579 if (!ret && ext4_should_journal_data(inode)) { 5580 if (ext4_walk_page_buffers(handle, page_buffers(page), 0, 5581 PAGE_SIZE, NULL, do_journal_get_write_access)) { 5582 unlock_page(page); 5583 ret = VM_FAULT_SIGBUS; 5584 ext4_journal_stop(handle); 5585 goto out; 5586 } 5587 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 5588 } 5589 ext4_journal_stop(handle); 5590 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 5591 goto retry_alloc; 5592 out_ret: 5593 ret = block_page_mkwrite_return(ret); 5594 out: 5595 up_read(&EXT4_I(inode)->i_mmap_sem); 5596 sb_end_pagefault(inode->i_sb); 5597 return ret; 5598 } 5599 5600 int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 5601 { 5602 struct inode *inode = file_inode(vma->vm_file); 5603 int err; 5604 5605 down_read(&EXT4_I(inode)->i_mmap_sem); 5606 err = filemap_fault(vma, vmf); 5607 up_read(&EXT4_I(inode)->i_mmap_sem); 5608 5609 return err; 5610 } 5611 5612 /* 5613 * Find the first extent at or after @lblk in an inode that is not a hole. 5614 * Search for @map_len blocks at most. The extent is returned in @result. 
5615 * 5616 * The function returns 1 if we found an extent. The function returns 0 in 5617 * case there is no extent at or after @lblk and in that case also sets 5618 * @result->es_len to 0. In case of error, the error code is returned. 5619 */ 5620 int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, 5621 unsigned int map_len, struct extent_status *result) 5622 { 5623 struct ext4_map_blocks map; 5624 struct extent_status es = {}; 5625 int ret; 5626 5627 map.m_lblk = lblk; 5628 map.m_len = map_len; 5629 5630 /* 5631 * For non-extent based files this loop may iterate several times since 5632 * we do not determine full hole size. 5633 */ 5634 while (map.m_len > 0) { 5635 ret = ext4_map_blocks(NULL, inode, &map, 0); 5636 if (ret < 0) 5637 return ret; 5638 /* There's extent covering m_lblk? Just return it. */ 5639 if (ret > 0) { 5640 int status; 5641 5642 ext4_es_store_pblock(result, map.m_pblk); 5643 result->es_lblk = map.m_lblk; 5644 result->es_len = map.m_len; 5645 if (map.m_flags & EXT4_MAP_UNWRITTEN) 5646 status = EXTENT_STATUS_UNWRITTEN; 5647 else 5648 status = EXTENT_STATUS_WRITTEN; 5649 ext4_es_store_status(result, status); 5650 return 1; 5651 } 5652 ext4_es_find_delayed_extent_range(inode, map.m_lblk, 5653 map.m_lblk + map.m_len - 1, 5654 &es); 5655 /* Is delalloc data before next block in extent tree? */ 5656 if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) { 5657 ext4_lblk_t offset = 0; 5658 5659 if (es.es_lblk < lblk) 5660 offset = lblk - es.es_lblk; 5661 result->es_lblk = es.es_lblk + offset; 5662 ext4_es_store_pblock(result, 5663 ext4_es_pblock(&es) + offset); 5664 result->es_len = es.es_len - offset; 5665 ext4_es_store_status(result, ext4_es_status(&es)); 5666 5667 return 1; 5668 } 5669 /* There's a hole at m_lblk, advance us after it */ 5670 map.m_lblk += map.m_len; 5671 map_len -= map.m_len; 5672 map.m_len = map_len; 5673 cond_resched(); 5674 } 5675 result->es_len = 0; 5676 return 0; 5677 } 5678