1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/fs/ext4/inode.c 4 * 5 * Copyright (C) 1992, 1993, 1994, 1995 6 * Remy Card (card@masi.ibp.fr) 7 * Laboratoire MASI - Institut Blaise Pascal 8 * Universite Pierre et Marie Curie (Paris VI) 9 * 10 * from 11 * 12 * linux/fs/minix/inode.c 13 * 14 * Copyright (C) 1991, 1992 Linus Torvalds 15 * 16 * 64-bit file support on 64-bit platforms by Jakub Jelinek 17 * (jj@sunsite.ms.mff.cuni.cz) 18 * 19 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 20 */ 21 22 #include <linux/fs.h> 23 #include <linux/mount.h> 24 #include <linux/time.h> 25 #include <linux/highuid.h> 26 #include <linux/pagemap.h> 27 #include <linux/dax.h> 28 #include <linux/quotaops.h> 29 #include <linux/string.h> 30 #include <linux/buffer_head.h> 31 #include <linux/writeback.h> 32 #include <linux/pagevec.h> 33 #include <linux/mpage.h> 34 #include <linux/namei.h> 35 #include <linux/uio.h> 36 #include <linux/bio.h> 37 #include <linux/workqueue.h> 38 #include <linux/kernel.h> 39 #include <linux/printk.h> 40 #include <linux/slab.h> 41 #include <linux/bitops.h> 42 #include <linux/iomap.h> 43 #include <linux/iversion.h> 44 45 #include "ext4_jbd2.h" 46 #include "xattr.h" 47 #include "acl.h" 48 #include "truncate.h" 49 50 #include <trace/events/ext4.h> 51 52 static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, 53 struct ext4_inode_info *ei) 54 { 55 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 56 __u32 csum; 57 __u16 dummy_csum = 0; 58 int offset = offsetof(struct ext4_inode, i_checksum_lo); 59 unsigned int csum_size = sizeof(dummy_csum); 60 61 csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); 62 csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); 63 offset += csum_size; 64 csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, 65 EXT4_GOOD_OLD_INODE_SIZE - offset); 66 67 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 68 offset = offsetof(struct ext4_inode, i_checksum_hi); 69 csum = ext4_chksum(sbi, csum, (__u8 *)raw + 70 EXT4_GOOD_OLD_INODE_SIZE, 71 offset - EXT4_GOOD_OLD_INODE_SIZE); 72 if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { 73 csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, 74 csum_size); 75 offset += csum_size; 76 } 77 csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, 78 EXT4_INODE_SIZE(inode->i_sb) - offset); 79 } 80 81 return csum; 82 } 83 84 static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, 85 struct ext4_inode_info *ei) 86 { 87 __u32 provided, calculated; 88 89 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 90 cpu_to_le32(EXT4_OS_LINUX) || 91 !ext4_has_metadata_csum(inode->i_sb)) 92 return 1; 93 94 provided = le16_to_cpu(raw->i_checksum_lo); 95 calculated = ext4_inode_csum(inode, raw, ei); 96 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 97 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) 98 provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; 99 else 100 calculated &= 0xFFFF; 101 102 return provided == calculated; 103 } 104 105 void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, 106 struct ext4_inode_info *ei) 107 { 108 __u32 csum; 109 110 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 111 cpu_to_le32(EXT4_OS_LINUX) || 112 !ext4_has_metadata_csum(inode->i_sb)) 113 return; 114 115 csum = ext4_inode_csum(inode, raw, ei); 116 raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); 117 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 118 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) 119 raw->i_checksum_hi = 
cpu_to_le16(csum >> 16); 120 } 121 122 static inline int ext4_begin_ordered_truncate(struct inode *inode, 123 loff_t new_size) 124 { 125 trace_ext4_begin_ordered_truncate(inode, new_size); 126 /* 127 * If jinode is zero, then we never opened the file for 128 * writing, so there's no need to call 129 * jbd2_journal_begin_ordered_truncate() since there's no 130 * outstanding writes we need to flush. 131 */ 132 if (!EXT4_I(inode)->jinode) 133 return 0; 134 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), 135 EXT4_I(inode)->jinode, 136 new_size); 137 } 138 139 static int __ext4_journalled_writepage(struct page *page, unsigned int len); 140 static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, 141 int pextents); 142 143 /* 144 * Test whether an inode is a fast symlink. 145 * A fast symlink has its symlink data stored in ext4_inode_info->i_data. 146 */ 147 int ext4_inode_is_fast_symlink(struct inode *inode) 148 { 149 if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { 150 int ea_blocks = EXT4_I(inode)->i_file_acl ? 151 EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; 152 153 if (ext4_has_inline_data(inode)) 154 return 0; 155 156 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 157 } 158 return S_ISLNK(inode->i_mode) && inode->i_size && 159 (inode->i_size < EXT4_N_BLOCKS * 4); 160 } 161 162 /* 163 * Called at the last iput() if i_nlink is zero. 164 */ 165 void ext4_evict_inode(struct inode *inode) 166 { 167 handle_t *handle; 168 int err; 169 /* 170 * Credits for final inode cleanup and freeing: 171 * sb + inode (ext4_orphan_del()), block bitmap, group descriptor 172 * (xattr block freeing), bitmap, group descriptor (inode freeing) 173 */ 174 int extra_credits = 6; 175 struct ext4_xattr_inode_array *ea_inode_array = NULL; 176 bool freeze_protected = false; 177 178 trace_ext4_evict_inode(inode); 179 180 if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) 181 ext4_evict_ea_inode(inode); 182 if (inode->i_nlink) { 183 /* 184 * When journalling data dirty buffers are tracked only in the 185 * journal. So although mm thinks everything is clean and 186 * ready for reaping the inode might still have some pages to 187 * write in the running transaction or waiting to be 188 * checkpointed. Thus calling jbd2_journal_invalidate_folio() 189 * (via truncate_inode_pages()) to discard these buffers can 190 * cause data loss. Also even if we did not discard these 191 * buffers, we would have no way to find them after the inode 192 * is reaped and thus user could see stale data if he tries to 193 * read them before the transaction is checkpointed. So be 194 * careful and force everything to disk here... We use 195 * ei->i_datasync_tid to store the newest transaction 196 * containing inode's data. 197 * 198 * Note that directories do not have this problem because they 199 * don't use page cache. 
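		 * Hence, below we wait for the transaction recorded in
		 * i_datasync_tid to commit and then write out any remaining
		 * dirty pages before truncating them.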
200 */ 201 if (inode->i_ino != EXT4_JOURNAL_INO && 202 ext4_should_journal_data(inode) && 203 S_ISREG(inode->i_mode) && inode->i_data.nrpages) { 204 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 205 tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; 206 207 jbd2_complete_transaction(journal, commit_tid); 208 filemap_write_and_wait(&inode->i_data); 209 } 210 truncate_inode_pages_final(&inode->i_data); 211 212 goto no_delete; 213 } 214 215 if (is_bad_inode(inode)) 216 goto no_delete; 217 dquot_initialize(inode); 218 219 if (ext4_should_order_data(inode)) 220 ext4_begin_ordered_truncate(inode, 0); 221 truncate_inode_pages_final(&inode->i_data); 222 223 /* 224 * For inodes with journalled data, transaction commit could have 225 * dirtied the inode. And for inodes with dioread_nolock, unwritten 226 * extents converting worker could merge extents and also have dirtied 227 * the inode. Flush worker is ignoring it because of I_FREEING flag but 228 * we still need to remove the inode from the writeback lists. 229 */ 230 if (!list_empty_careful(&inode->i_io_list)) 231 inode_io_list_del(inode); 232 233 /* 234 * Protect us against freezing - iput() caller didn't have to have any 235 * protection against it. When we are in a running transaction though, 236 * we are already protected against freezing and we cannot grab further 237 * protection due to lock ordering constraints. 238 */ 239 if (!ext4_journal_current_handle()) { 240 sb_start_intwrite(inode->i_sb); 241 freeze_protected = true; 242 } 243 244 if (!IS_NOQUOTA(inode)) 245 extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); 246 247 /* 248 * Block bitmap, group descriptor, and inode are accounted in both 249 * ext4_blocks_for_truncate() and extra_credits. So subtract 3. 250 */ 251 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, 252 ext4_blocks_for_truncate(inode) + extra_credits - 3); 253 if (IS_ERR(handle)) { 254 ext4_std_error(inode->i_sb, PTR_ERR(handle)); 255 /* 256 * If we're going to skip the normal cleanup, we still need to 257 * make sure that the in-core orphan linked list is properly 258 * cleaned up. 259 */ 260 ext4_orphan_del(NULL, inode); 261 if (freeze_protected) 262 sb_end_intwrite(inode->i_sb); 263 goto no_delete; 264 } 265 266 if (IS_SYNC(inode)) 267 ext4_handle_sync(handle); 268 269 /* 270 * Set inode->i_size to 0 before calling ext4_truncate(). We need 271 * special handling of symlinks here because i_size is used to 272 * determine whether ext4_inode_info->i_data contains symlink data or 273 * block mappings. Setting i_size to 0 will remove its fast symlink 274 * status. Erase i_data so that it becomes a valid empty block map. 275 */ 276 if (ext4_inode_is_fast_symlink(inode)) 277 memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); 278 inode->i_size = 0; 279 err = ext4_mark_inode_dirty(handle, inode); 280 if (err) { 281 ext4_warning(inode->i_sb, 282 "couldn't mark inode dirty (err %d)", err); 283 goto stop_handle; 284 } 285 if (inode->i_blocks) { 286 err = ext4_truncate(inode); 287 if (err) { 288 ext4_error_err(inode->i_sb, -err, 289 "couldn't truncate inode %lu (err %d)", 290 inode->i_ino, err); 291 goto stop_handle; 292 } 293 } 294 295 /* Remove xattr references. 
*/
	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
				      extra_credits);
	if (err) {
		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
stop_handle:
		ext4_journal_stop(handle);
		ext4_orphan_del(NULL, inode);
		if (freeze_protected)
			sb_end_intwrite(inode->i_sb);
		ext4_xattr_inode_array_free(ea_inode_array);
		goto no_delete;
	}

	/*
	 * Kill off the orphan record which ext4_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext4_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext4_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext4_orphan_del(handle, inode);
	EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext4_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		ext4_clear_inode(inode);
	else
		ext4_free_inode(handle, inode);
	ext4_journal_stop(handle);
	if (freeze_protected)
		sb_end_intwrite(inode->i_sb);
	ext4_xattr_inode_array_free(ea_inode_array);
	return;
no_delete:
	/*
	 * Check whether anything else accidentally dirtied the evicting
	 * inode, which could cause inode use-after-free issues later.
	 */
	WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));

	if (!list_empty(&EXT4_I(inode)->i_fc_list))
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
}

#ifdef CONFIG_QUOTA
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
	return &EXT4_I(inode)->i_reserved_quota;
}
#endif

/*
 * Called with i_data_sem down, which is important since we can call
 * ext4_discard_preallocations() from here.
 */
void ext4_da_update_reserve_space(struct inode *inode,
					int used, int quota_claim)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);

	spin_lock(&ei->i_block_reservation_lock);
	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
	if (unlikely(used > ei->i_reserved_data_blocks)) {
		ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
			 "with only %d reserved data blocks",
			 __func__, inode->i_ino, used,
			 ei->i_reserved_data_blocks);
		WARN_ON(1);
		used = ei->i_reserved_data_blocks;
	}

	/* Update per-inode reservations */
	ei->i_reserved_data_blocks -= used;
	percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);

	spin_unlock(&ei->i_block_reservation_lock);

	/* Update quota subsystem for data blocks */
	if (quota_claim)
		dquot_claim_block(inode, EXT4_C2B(sbi, used));
	else {
		/*
		 * We did fallocate with an offset that is already delayed
		 * allocated. So on delayed allocated writeback we should
		 * not re-claim the quota for fallocated blocks.
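		 * The quota for those blocks was already claimed when
		 * fallocate allocated them, so only the outstanding
		 * reservation is released here.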
		 */
		dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
	}

	/*
	 * If we have done all the pending block allocations and if
	 * there aren't any writers on the inode, we can discard the
	 * inode's preallocations.
	 */
	if ((ei->i_reserved_data_blocks == 0) &&
	    !inode_is_open_for_write(inode))
		ext4_discard_preallocations(inode, 0);
}

static int __check_block_validity(struct inode *inode, const char *func,
				unsigned int line,
				struct ext4_map_blocks *map)
{
	if (ext4_has_feature_journal(inode->i_sb) &&
	    (inode->i_ino ==
	     le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
		return 0;
	if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
		ext4_error_inode(inode, func, line, map->m_pblk,
				 "lblock %lu mapped to illegal pblock %llu "
				 "(length %d)", (unsigned long) map->m_lblk,
				 map->m_pblk, map->m_len);
		return -EFSCORRUPTED;
	}
	return 0;
}

int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
		       ext4_lblk_t len)
{
	int ret;

	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
		return fscrypt_zeroout_range(inode, lblk, pblk, len);

	ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
	if (ret > 0)
		ret = 0;

	return ret;
}

#define check_block_validity(inode, map)	\
	__check_block_validity((inode), __func__, __LINE__, (map))

#ifdef ES_AGGRESSIVE_TEST
static void ext4_map_blocks_es_recheck(handle_t *handle,
				       struct inode *inode,
				       struct ext4_map_blocks *es_map,
				       struct ext4_map_blocks *map,
				       int flags)
{
	int retval;

	map->m_flags = 0;
	/*
	 * There is a race window in which the result may not be the same,
	 * e.g. xfstests #223 when dioread_nolock is enabled.  The reason
	 * is that we look up the block mapping in the extent status tree
	 * without taking i_data_sem, so the unwritten extent may have been
	 * converted in the meantime.
	 */
	down_read(&EXT4_I(inode)->i_data_sem);
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, 0);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, 0);
	}
	up_read((&EXT4_I(inode)->i_data_sem));

	/*
	 * We don't check m_len because extents may have been collapsed in
	 * the extent status tree, so the lengths might not be equal.
	 */
	if (es_map->m_lblk != map->m_lblk ||
	    es_map->m_flags != map->m_flags ||
	    es_map->m_pblk != map->m_pblk) {
		printk("ES cache assertion failed for inode: %lu "
		       "es_cached ex [%d/%d/%llu/%x] != "
		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
		       inode->i_ino, es_map->m_lblk, es_map->m_len,
		       es_map->m_pblk, es_map->m_flags, map->m_lblk,
		       map->m_len, map->m_pblk, map->m_flags,
		       retval, flags);
	}
}
#endif /* ES_AGGRESSIVE_TEST */

/*
 * The ext4_map_blocks() function tries to look up the requested blocks,
 * and returns them if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of i_data_sem, allocates blocks,
 * stores the allocated blocks in the result buffer head and marks it
 * mapped.
 *
 * For extent-based files it calls ext4_ext_map_blocks(); otherwise it
 * calls ext4_ind_map_blocks() to handle indirect (block-mapped) files.
 *
 * On success, it returns the number of blocks being mapped or allocated.
if 497 * create==0 and the blocks are pre-allocated and unwritten, the resulting @map 498 * is marked as unwritten. If the create == 1, it will mark @map as mapped. 499 * 500 * It returns 0 if plain look up failed (blocks have not been allocated), in 501 * that case, @map is returned as unmapped but we still do fill map->m_len to 502 * indicate the length of a hole starting at map->m_lblk. 503 * 504 * It returns the error in case of allocation failure. 505 */ 506 int ext4_map_blocks(handle_t *handle, struct inode *inode, 507 struct ext4_map_blocks *map, int flags) 508 { 509 struct extent_status es; 510 int retval; 511 int ret = 0; 512 #ifdef ES_AGGRESSIVE_TEST 513 struct ext4_map_blocks orig_map; 514 515 memcpy(&orig_map, map, sizeof(*map)); 516 #endif 517 518 map->m_flags = 0; 519 ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", 520 flags, map->m_len, (unsigned long) map->m_lblk); 521 522 /* 523 * ext4_map_blocks returns an int, and m_len is an unsigned int 524 */ 525 if (unlikely(map->m_len > INT_MAX)) 526 map->m_len = INT_MAX; 527 528 /* We can handle the block number less than EXT_MAX_BLOCKS */ 529 if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) 530 return -EFSCORRUPTED; 531 532 /* Lookup extent status tree firstly */ 533 if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && 534 ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { 535 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 536 map->m_pblk = ext4_es_pblock(&es) + 537 map->m_lblk - es.es_lblk; 538 map->m_flags |= ext4_es_is_written(&es) ? 539 EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; 540 retval = es.es_len - (map->m_lblk - es.es_lblk); 541 if (retval > map->m_len) 542 retval = map->m_len; 543 map->m_len = retval; 544 } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { 545 map->m_pblk = 0; 546 retval = es.es_len - (map->m_lblk - es.es_lblk); 547 if (retval > map->m_len) 548 retval = map->m_len; 549 map->m_len = retval; 550 retval = 0; 551 } else { 552 BUG(); 553 } 554 555 if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) 556 return retval; 557 #ifdef ES_AGGRESSIVE_TEST 558 ext4_map_blocks_es_recheck(handle, inode, map, 559 &orig_map, flags); 560 #endif 561 goto found; 562 } 563 /* 564 * In the query cache no-wait mode, nothing we can do more if we 565 * cannot find extent in the cache. 566 */ 567 if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) 568 return 0; 569 570 /* 571 * Try to see if we can get the block without requesting a new 572 * file system block. 573 */ 574 down_read(&EXT4_I(inode)->i_data_sem); 575 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 576 retval = ext4_ext_map_blocks(handle, inode, map, 0); 577 } else { 578 retval = ext4_ind_map_blocks(handle, inode, map, 0); 579 } 580 if (retval > 0) { 581 unsigned int status; 582 583 if (unlikely(retval != map->m_len)) { 584 ext4_warning(inode->i_sb, 585 "ES len assertion failed for inode " 586 "%lu: retval %d != map->m_len %d", 587 inode->i_ino, retval, map->m_len); 588 WARN_ON(1); 589 } 590 591 status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
592 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 593 if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && 594 !(status & EXTENT_STATUS_WRITTEN) && 595 ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, 596 map->m_lblk + map->m_len - 1)) 597 status |= EXTENT_STATUS_DELAYED; 598 ret = ext4_es_insert_extent(inode, map->m_lblk, 599 map->m_len, map->m_pblk, status); 600 if (ret < 0) 601 retval = ret; 602 } 603 up_read((&EXT4_I(inode)->i_data_sem)); 604 605 found: 606 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 607 ret = check_block_validity(inode, map); 608 if (ret != 0) 609 return ret; 610 } 611 612 /* If it is only a block(s) look up */ 613 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) 614 return retval; 615 616 /* 617 * Returns if the blocks have already allocated 618 * 619 * Note that if blocks have been preallocated 620 * ext4_ext_get_block() returns the create = 0 621 * with buffer head unmapped. 622 */ 623 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 624 /* 625 * If we need to convert extent to unwritten 626 * we continue and do the actual work in 627 * ext4_ext_map_blocks() 628 */ 629 if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) 630 return retval; 631 632 /* 633 * Here we clear m_flags because after allocating an new extent, 634 * it will be set again. 635 */ 636 map->m_flags &= ~EXT4_MAP_FLAGS; 637 638 /* 639 * New blocks allocate and/or writing to unwritten extent 640 * will possibly result in updating i_data, so we take 641 * the write lock of i_data_sem, and call get_block() 642 * with create == 1 flag. 643 */ 644 down_write(&EXT4_I(inode)->i_data_sem); 645 646 /* 647 * We need to check for EXT4 here because migrate 648 * could have changed the inode type in between 649 */ 650 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 651 retval = ext4_ext_map_blocks(handle, inode, map, flags); 652 } else { 653 retval = ext4_ind_map_blocks(handle, inode, map, flags); 654 655 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { 656 /* 657 * We allocated new blocks which will result in 658 * i_data's format changing. Force the migrate 659 * to fail by clearing migrate flags 660 */ 661 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); 662 } 663 664 /* 665 * Update reserved blocks/metadata blocks after successful 666 * block allocation which had been deferred till now. We don't 667 * support fallocate for non extent files. So we can update 668 * reserve space here. 669 */ 670 if ((retval > 0) && 671 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) 672 ext4_da_update_reserve_space(inode, retval, 1); 673 } 674 675 if (retval > 0) { 676 unsigned int status; 677 678 if (unlikely(retval != map->m_len)) { 679 ext4_warning(inode->i_sb, 680 "ES len assertion failed for inode " 681 "%lu: retval %d != map->m_len %d", 682 inode->i_ino, retval, map->m_len); 683 WARN_ON(1); 684 } 685 686 /* 687 * We have to zeroout blocks before inserting them into extent 688 * status tree. Otherwise someone could look them up there and 689 * use them before they are really zeroed. We also have to 690 * unmap metadata before zeroing as otherwise writeback can 691 * overwrite zeros with stale data from block device. 692 */ 693 if (flags & EXT4_GET_BLOCKS_ZERO && 694 map->m_flags & EXT4_MAP_MAPPED && 695 map->m_flags & EXT4_MAP_NEW) { 696 ret = ext4_issue_zeroout(inode, map->m_lblk, 697 map->m_pblk, map->m_len); 698 if (ret) { 699 retval = ret; 700 goto out_sem; 701 } 702 } 703 704 /* 705 * If the extent has been zeroed out, we don't need to update 706 * extent status tree. 
707 */ 708 if ((flags & EXT4_GET_BLOCKS_PRE_IO) && 709 ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { 710 if (ext4_es_is_written(&es)) 711 goto out_sem; 712 } 713 status = map->m_flags & EXT4_MAP_UNWRITTEN ? 714 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 715 if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && 716 !(status & EXTENT_STATUS_WRITTEN) && 717 ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, 718 map->m_lblk + map->m_len - 1)) 719 status |= EXTENT_STATUS_DELAYED; 720 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 721 map->m_pblk, status); 722 if (ret < 0) { 723 retval = ret; 724 goto out_sem; 725 } 726 } 727 728 out_sem: 729 up_write((&EXT4_I(inode)->i_data_sem)); 730 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 731 ret = check_block_validity(inode, map); 732 if (ret != 0) 733 return ret; 734 735 /* 736 * Inodes with freshly allocated blocks where contents will be 737 * visible after transaction commit must be on transaction's 738 * ordered data list. 739 */ 740 if (map->m_flags & EXT4_MAP_NEW && 741 !(map->m_flags & EXT4_MAP_UNWRITTEN) && 742 !(flags & EXT4_GET_BLOCKS_ZERO) && 743 !ext4_is_quota_file(inode) && 744 ext4_should_order_data(inode)) { 745 loff_t start_byte = 746 (loff_t)map->m_lblk << inode->i_blkbits; 747 loff_t length = (loff_t)map->m_len << inode->i_blkbits; 748 749 if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) 750 ret = ext4_jbd2_inode_add_wait(handle, inode, 751 start_byte, length); 752 else 753 ret = ext4_jbd2_inode_add_write(handle, inode, 754 start_byte, length); 755 if (ret) 756 return ret; 757 } 758 } 759 if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || 760 map->m_flags & EXT4_MAP_MAPPED)) 761 ext4_fc_track_range(handle, inode, map->m_lblk, 762 map->m_lblk + map->m_len - 1); 763 if (retval < 0) 764 ext_debug(inode, "failed with err %d\n", retval); 765 return retval; 766 } 767 768 /* 769 * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages 770 * we have to be careful as someone else may be manipulating b_state as well. 771 */ 772 static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) 773 { 774 unsigned long old_state; 775 unsigned long new_state; 776 777 flags &= EXT4_MAP_FLAGS; 778 779 /* Dummy buffer_head? Set non-atomically. */ 780 if (!bh->b_page) { 781 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; 782 return; 783 } 784 /* 785 * Someone else may be modifying b_state. Be careful! This is ugly but 786 * once we get rid of using bh as a container for mapping information 787 * to pass to / from get_block functions, this can go away. 
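	 * The cmpxchg() loop below makes sure that only the EXT4_MAP_FLAGS
	 * bits are replaced, even if other b_state bits change concurrently.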
788 */ 789 do { 790 old_state = READ_ONCE(bh->b_state); 791 new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; 792 } while (unlikely( 793 cmpxchg(&bh->b_state, old_state, new_state) != old_state)); 794 } 795 796 static int _ext4_get_block(struct inode *inode, sector_t iblock, 797 struct buffer_head *bh, int flags) 798 { 799 struct ext4_map_blocks map; 800 int ret = 0; 801 802 if (ext4_has_inline_data(inode)) 803 return -ERANGE; 804 805 map.m_lblk = iblock; 806 map.m_len = bh->b_size >> inode->i_blkbits; 807 808 ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, 809 flags); 810 if (ret > 0) { 811 map_bh(bh, inode->i_sb, map.m_pblk); 812 ext4_update_bh_state(bh, map.m_flags); 813 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 814 ret = 0; 815 } else if (ret == 0) { 816 /* hole case, need to fill in bh->b_size */ 817 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 818 } 819 return ret; 820 } 821 822 int ext4_get_block(struct inode *inode, sector_t iblock, 823 struct buffer_head *bh, int create) 824 { 825 return _ext4_get_block(inode, iblock, bh, 826 create ? EXT4_GET_BLOCKS_CREATE : 0); 827 } 828 829 /* 830 * Get block function used when preparing for buffered write if we require 831 * creating an unwritten extent if blocks haven't been allocated. The extent 832 * will be converted to written after the IO is complete. 833 */ 834 int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, 835 struct buffer_head *bh_result, int create) 836 { 837 ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", 838 inode->i_ino, create); 839 return _ext4_get_block(inode, iblock, bh_result, 840 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); 841 } 842 843 /* Maximum number of blocks we map for direct IO at once. */ 844 #define DIO_MAX_BLOCKS 4096 845 846 /* 847 * `handle' can be NULL if create is zero 848 */ 849 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 850 ext4_lblk_t block, int map_flags) 851 { 852 struct ext4_map_blocks map; 853 struct buffer_head *bh; 854 int create = map_flags & EXT4_GET_BLOCKS_CREATE; 855 bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT; 856 int err; 857 858 ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 859 || handle != NULL || create == 0); 860 ASSERT(create == 0 || !nowait); 861 862 map.m_lblk = block; 863 map.m_len = 1; 864 err = ext4_map_blocks(handle, inode, &map, map_flags); 865 866 if (err == 0) 867 return create ? ERR_PTR(-ENOSPC) : NULL; 868 if (err < 0) 869 return ERR_PTR(err); 870 871 if (nowait) 872 return sb_find_get_block(inode->i_sb, map.m_pblk); 873 874 bh = sb_getblk(inode->i_sb, map.m_pblk); 875 if (unlikely(!bh)) 876 return ERR_PTR(-ENOMEM); 877 if (map.m_flags & EXT4_MAP_NEW) { 878 ASSERT(create != 0); 879 ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 880 || (handle != NULL)); 881 882 /* 883 * Now that we do not always journal data, we should 884 * keep in mind whether this should always journal the 885 * new buffer as metadata. For now, regular file 886 * writes use ext4_get_block instead, so it's not a 887 * problem. 
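		 * Below, the new buffer is zeroed under the buffer lock if it
		 * is not already uptodate, and then journaled via
		 * ext4_handle_dirty_metadata().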
888 */ 889 lock_buffer(bh); 890 BUFFER_TRACE(bh, "call get_create_access"); 891 err = ext4_journal_get_create_access(handle, inode->i_sb, bh, 892 EXT4_JTR_NONE); 893 if (unlikely(err)) { 894 unlock_buffer(bh); 895 goto errout; 896 } 897 if (!buffer_uptodate(bh)) { 898 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 899 set_buffer_uptodate(bh); 900 } 901 unlock_buffer(bh); 902 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 903 err = ext4_handle_dirty_metadata(handle, inode, bh); 904 if (unlikely(err)) 905 goto errout; 906 } else 907 BUFFER_TRACE(bh, "not a new buffer"); 908 return bh; 909 errout: 910 brelse(bh); 911 return ERR_PTR(err); 912 } 913 914 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 915 ext4_lblk_t block, int map_flags) 916 { 917 struct buffer_head *bh; 918 int ret; 919 920 bh = ext4_getblk(handle, inode, block, map_flags); 921 if (IS_ERR(bh)) 922 return bh; 923 if (!bh || ext4_buffer_uptodate(bh)) 924 return bh; 925 926 ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true); 927 if (ret) { 928 put_bh(bh); 929 return ERR_PTR(ret); 930 } 931 return bh; 932 } 933 934 /* Read a contiguous batch of blocks. */ 935 int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, 936 bool wait, struct buffer_head **bhs) 937 { 938 int i, err; 939 940 for (i = 0; i < bh_count; i++) { 941 bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); 942 if (IS_ERR(bhs[i])) { 943 err = PTR_ERR(bhs[i]); 944 bh_count = i; 945 goto out_brelse; 946 } 947 } 948 949 for (i = 0; i < bh_count; i++) 950 /* Note that NULL bhs[i] is valid because of holes. */ 951 if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) 952 ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false); 953 954 if (!wait) 955 return 0; 956 957 for (i = 0; i < bh_count; i++) 958 if (bhs[i]) 959 wait_on_buffer(bhs[i]); 960 961 for (i = 0; i < bh_count; i++) { 962 if (bhs[i] && !buffer_uptodate(bhs[i])) { 963 err = -EIO; 964 goto out_brelse; 965 } 966 } 967 return 0; 968 969 out_brelse: 970 for (i = 0; i < bh_count; i++) { 971 brelse(bhs[i]); 972 bhs[i] = NULL; 973 } 974 return err; 975 } 976 977 int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, 978 struct buffer_head *head, 979 unsigned from, 980 unsigned to, 981 int *partial, 982 int (*fn)(handle_t *handle, struct inode *inode, 983 struct buffer_head *bh)) 984 { 985 struct buffer_head *bh; 986 unsigned block_start, block_end; 987 unsigned blocksize = head->b_size; 988 int err, ret = 0; 989 struct buffer_head *next; 990 991 for (bh = head, block_start = 0; 992 ret == 0 && (bh != head || !block_start); 993 block_start = block_end, bh = next) { 994 next = bh->b_this_page; 995 block_end = block_start + blocksize; 996 if (block_end <= from || block_start >= to) { 997 if (partial && !buffer_uptodate(bh)) 998 *partial = 1; 999 continue; 1000 } 1001 err = (*fn)(handle, inode, bh); 1002 if (!ret) 1003 ret = err; 1004 } 1005 return ret; 1006 } 1007 1008 /* 1009 * To preserve ordering, it is essential that the hole instantiation and 1010 * the data write be encapsulated in a single transaction. We cannot 1011 * close off a transaction and start a new one between the ext4_get_block() 1012 * and the commit_write(). So doing the jbd2_journal_start at the start of 1013 * prepare_write() is the right place. 1014 * 1015 * Also, this function can nest inside ext4_writepage(). In that case, we 1016 * *know* that ext4_writepage() has generated enough buffer credits to do the 1017 * whole page. 
So we won't block on the journal in that case, which is good, 1018 * because the caller may be PF_MEMALLOC. 1019 * 1020 * By accident, ext4 can be reentered when a transaction is open via 1021 * quota file writes. If we were to commit the transaction while thus 1022 * reentered, there can be a deadlock - we would be holding a quota 1023 * lock, and the commit would never complete if another thread had a 1024 * transaction open and was blocking on the quota lock - a ranking 1025 * violation. 1026 * 1027 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start 1028 * will _not_ run commit under these circumstances because handle->h_ref 1029 * is elevated. We'll still have enough credits for the tiny quotafile 1030 * write. 1031 */ 1032 int do_journal_get_write_access(handle_t *handle, struct inode *inode, 1033 struct buffer_head *bh) 1034 { 1035 int dirty = buffer_dirty(bh); 1036 int ret; 1037 1038 if (!buffer_mapped(bh) || buffer_freed(bh)) 1039 return 0; 1040 /* 1041 * __block_write_begin() could have dirtied some buffers. Clean 1042 * the dirty bit as jbd2_journal_get_write_access() could complain 1043 * otherwise about fs integrity issues. Setting of the dirty bit 1044 * by __block_write_begin() isn't a real problem here as we clear 1045 * the bit before releasing a page lock and thus writeback cannot 1046 * ever write the buffer. 1047 */ 1048 if (dirty) 1049 clear_buffer_dirty(bh); 1050 BUFFER_TRACE(bh, "get write access"); 1051 ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, 1052 EXT4_JTR_NONE); 1053 if (!ret && dirty) 1054 ret = ext4_handle_dirty_metadata(handle, NULL, bh); 1055 return ret; 1056 } 1057 1058 #ifdef CONFIG_FS_ENCRYPTION 1059 static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, 1060 get_block_t *get_block) 1061 { 1062 unsigned from = pos & (PAGE_SIZE - 1); 1063 unsigned to = from + len; 1064 struct inode *inode = page->mapping->host; 1065 unsigned block_start, block_end; 1066 sector_t block; 1067 int err = 0; 1068 unsigned blocksize = inode->i_sb->s_blocksize; 1069 unsigned bbits; 1070 struct buffer_head *bh, *head, *wait[2]; 1071 int nr_wait = 0; 1072 int i; 1073 1074 BUG_ON(!PageLocked(page)); 1075 BUG_ON(from > PAGE_SIZE); 1076 BUG_ON(to > PAGE_SIZE); 1077 BUG_ON(from > to); 1078 1079 if (!page_has_buffers(page)) 1080 create_empty_buffers(page, blocksize, 0); 1081 head = page_buffers(page); 1082 bbits = ilog2(blocksize); 1083 block = (sector_t)page->index << (PAGE_SHIFT - bbits); 1084 1085 for (bh = head, block_start = 0; bh != head || !block_start; 1086 block++, block_start = block_end, bh = bh->b_this_page) { 1087 block_end = block_start + blocksize; 1088 if (block_end <= from || block_start >= to) { 1089 if (PageUptodate(page)) { 1090 set_buffer_uptodate(bh); 1091 } 1092 continue; 1093 } 1094 if (buffer_new(bh)) 1095 clear_buffer_new(bh); 1096 if (!buffer_mapped(bh)) { 1097 WARN_ON(bh->b_size != blocksize); 1098 err = get_block(inode, block, bh, 1); 1099 if (err) 1100 break; 1101 if (buffer_new(bh)) { 1102 if (PageUptodate(page)) { 1103 clear_buffer_new(bh); 1104 set_buffer_uptodate(bh); 1105 mark_buffer_dirty(bh); 1106 continue; 1107 } 1108 if (block_end > to || block_start < from) 1109 zero_user_segments(page, to, block_end, 1110 block_start, from); 1111 continue; 1112 } 1113 } 1114 if (PageUptodate(page)) { 1115 set_buffer_uptodate(bh); 1116 continue; 1117 } 1118 if (!buffer_uptodate(bh) && !buffer_delay(bh) && 1119 !buffer_unwritten(bh) && 1120 (block_start < from || block_end > to)) { 1121 
ext4_read_bh_lock(bh, 0, false); 1122 wait[nr_wait++] = bh; 1123 } 1124 } 1125 /* 1126 * If we issued read requests, let them complete. 1127 */ 1128 for (i = 0; i < nr_wait; i++) { 1129 wait_on_buffer(wait[i]); 1130 if (!buffer_uptodate(wait[i])) 1131 err = -EIO; 1132 } 1133 if (unlikely(err)) { 1134 page_zero_new_buffers(page, from, to); 1135 } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { 1136 for (i = 0; i < nr_wait; i++) { 1137 int err2; 1138 1139 err2 = fscrypt_decrypt_pagecache_blocks(page_folio(page), 1140 blocksize, 1141 bh_offset(wait[i])); 1142 if (err2) { 1143 clear_buffer_uptodate(wait[i]); 1144 err = err2; 1145 } 1146 } 1147 } 1148 1149 return err; 1150 } 1151 #endif 1152 1153 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1154 loff_t pos, unsigned len, 1155 struct page **pagep, void **fsdata) 1156 { 1157 struct inode *inode = mapping->host; 1158 int ret, needed_blocks; 1159 handle_t *handle; 1160 int retries = 0; 1161 struct page *page; 1162 pgoff_t index; 1163 unsigned from, to; 1164 1165 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 1166 return -EIO; 1167 1168 trace_ext4_write_begin(inode, pos, len); 1169 /* 1170 * Reserve one block more for addition to orphan list in case 1171 * we allocate blocks but write fails for some reason 1172 */ 1173 needed_blocks = ext4_writepage_trans_blocks(inode) + 1; 1174 index = pos >> PAGE_SHIFT; 1175 from = pos & (PAGE_SIZE - 1); 1176 to = from + len; 1177 1178 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { 1179 ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, 1180 pagep); 1181 if (ret < 0) 1182 return ret; 1183 if (ret == 1) 1184 return 0; 1185 } 1186 1187 /* 1188 * grab_cache_page_write_begin() can take a long time if the 1189 * system is thrashing due to memory pressure, or if the page 1190 * is being written back. So grab it first before we start 1191 * the transaction handle. This also allows us to allocate 1192 * the page (if needed) without using GFP_NOFS. 1193 */ 1194 retry_grab: 1195 page = grab_cache_page_write_begin(mapping, index); 1196 if (!page) 1197 return -ENOMEM; 1198 /* 1199 * The same as page allocation, we prealloc buffer heads before 1200 * starting the handle. 
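	 * As with the page above, this avoids having to allocate them with
	 * GFP_NOFS while the transaction handle is held.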
1201 */ 1202 if (!page_has_buffers(page)) 1203 create_empty_buffers(page, inode->i_sb->s_blocksize, 0); 1204 1205 unlock_page(page); 1206 1207 retry_journal: 1208 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); 1209 if (IS_ERR(handle)) { 1210 put_page(page); 1211 return PTR_ERR(handle); 1212 } 1213 1214 lock_page(page); 1215 if (page->mapping != mapping) { 1216 /* The page got truncated from under us */ 1217 unlock_page(page); 1218 put_page(page); 1219 ext4_journal_stop(handle); 1220 goto retry_grab; 1221 } 1222 /* In case writeback began while the page was unlocked */ 1223 wait_for_stable_page(page); 1224 1225 #ifdef CONFIG_FS_ENCRYPTION 1226 if (ext4_should_dioread_nolock(inode)) 1227 ret = ext4_block_write_begin(page, pos, len, 1228 ext4_get_block_unwritten); 1229 else 1230 ret = ext4_block_write_begin(page, pos, len, 1231 ext4_get_block); 1232 #else 1233 if (ext4_should_dioread_nolock(inode)) 1234 ret = __block_write_begin(page, pos, len, 1235 ext4_get_block_unwritten); 1236 else 1237 ret = __block_write_begin(page, pos, len, ext4_get_block); 1238 #endif 1239 if (!ret && ext4_should_journal_data(inode)) { 1240 ret = ext4_walk_page_buffers(handle, inode, 1241 page_buffers(page), from, to, NULL, 1242 do_journal_get_write_access); 1243 } 1244 1245 if (ret) { 1246 bool extended = (pos + len > inode->i_size) && 1247 !ext4_verity_in_progress(inode); 1248 1249 unlock_page(page); 1250 /* 1251 * __block_write_begin may have instantiated a few blocks 1252 * outside i_size. Trim these off again. Don't need 1253 * i_size_read because we hold i_rwsem. 1254 * 1255 * Add inode to orphan list in case we crash before 1256 * truncate finishes 1257 */ 1258 if (extended && ext4_can_truncate(inode)) 1259 ext4_orphan_add(handle, inode); 1260 1261 ext4_journal_stop(handle); 1262 if (extended) { 1263 ext4_truncate_failed_write(inode); 1264 /* 1265 * If truncate failed early the inode might 1266 * still be on the orphan list; we need to 1267 * make sure the inode is removed from the 1268 * orphan list in that case. 1269 */ 1270 if (inode->i_nlink) 1271 ext4_orphan_del(NULL, inode); 1272 } 1273 1274 if (ret == -ENOSPC && 1275 ext4_should_retry_alloc(inode->i_sb, &retries)) 1276 goto retry_journal; 1277 put_page(page); 1278 return ret; 1279 } 1280 *pagep = page; 1281 return ret; 1282 } 1283 1284 /* For write_end() in data=journal mode */ 1285 static int write_end_fn(handle_t *handle, struct inode *inode, 1286 struct buffer_head *bh) 1287 { 1288 int ret; 1289 if (!buffer_mapped(bh) || buffer_freed(bh)) 1290 return 0; 1291 set_buffer_uptodate(bh); 1292 ret = ext4_handle_dirty_metadata(handle, NULL, bh); 1293 clear_buffer_meta(bh); 1294 clear_buffer_prio(bh); 1295 return ret; 1296 } 1297 1298 /* 1299 * We need to pick up the new inode size which generic_commit_write gave us 1300 * `file' can be NULL - eg, when called from page_symlink(). 1301 * 1302 * ext4 never places buffers on inode->i_mapping->private_list. metadata 1303 * buffers are managed internally. 
1304 */ 1305 static int ext4_write_end(struct file *file, 1306 struct address_space *mapping, 1307 loff_t pos, unsigned len, unsigned copied, 1308 struct page *page, void *fsdata) 1309 { 1310 handle_t *handle = ext4_journal_current_handle(); 1311 struct inode *inode = mapping->host; 1312 loff_t old_size = inode->i_size; 1313 int ret = 0, ret2; 1314 int i_size_changed = 0; 1315 bool verity = ext4_verity_in_progress(inode); 1316 1317 trace_ext4_write_end(inode, pos, len, copied); 1318 1319 if (ext4_has_inline_data(inode) && 1320 ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) 1321 return ext4_write_inline_data_end(inode, pos, len, copied, page); 1322 1323 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1324 /* 1325 * it's important to update i_size while still holding page lock: 1326 * page writeout could otherwise come in and zero beyond i_size. 1327 * 1328 * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree 1329 * blocks are being written past EOF, so skip the i_size update. 1330 */ 1331 if (!verity) 1332 i_size_changed = ext4_update_inode_size(inode, pos + copied); 1333 unlock_page(page); 1334 put_page(page); 1335 1336 if (old_size < pos && !verity) 1337 pagecache_isize_extended(inode, old_size, pos); 1338 /* 1339 * Don't mark the inode dirty under page lock. First, it unnecessarily 1340 * makes the holding time of page lock longer. Second, it forces lock 1341 * ordering of page lock and transaction start for journaling 1342 * filesystems. 1343 */ 1344 if (i_size_changed) 1345 ret = ext4_mark_inode_dirty(handle, inode); 1346 1347 if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) 1348 /* if we have allocated more blocks and copied 1349 * less. We will have blocks allocated outside 1350 * inode->i_size. So truncate them 1351 */ 1352 ext4_orphan_add(handle, inode); 1353 1354 ret2 = ext4_journal_stop(handle); 1355 if (!ret) 1356 ret = ret2; 1357 1358 if (pos + len > inode->i_size && !verity) { 1359 ext4_truncate_failed_write(inode); 1360 /* 1361 * If truncate failed early the inode might still be 1362 * on the orphan list; we need to make sure the inode 1363 * is removed from the orphan list in that case. 1364 */ 1365 if (inode->i_nlink) 1366 ext4_orphan_del(NULL, inode); 1367 } 1368 1369 return ret ? ret : copied; 1370 } 1371 1372 /* 1373 * This is a private version of page_zero_new_buffers() which doesn't 1374 * set the buffer to be dirty, since in data=journalled mode we need 1375 * to call ext4_handle_dirty_metadata() instead. 
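 * The zeroed buffers are passed to write_end_fn(), which takes care of
 * that.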
1376 */ 1377 static void ext4_journalled_zero_new_buffers(handle_t *handle, 1378 struct inode *inode, 1379 struct page *page, 1380 unsigned from, unsigned to) 1381 { 1382 unsigned int block_start = 0, block_end; 1383 struct buffer_head *head, *bh; 1384 1385 bh = head = page_buffers(page); 1386 do { 1387 block_end = block_start + bh->b_size; 1388 if (buffer_new(bh)) { 1389 if (block_end > from && block_start < to) { 1390 if (!PageUptodate(page)) { 1391 unsigned start, size; 1392 1393 start = max(from, block_start); 1394 size = min(to, block_end) - start; 1395 1396 zero_user(page, start, size); 1397 write_end_fn(handle, inode, bh); 1398 } 1399 clear_buffer_new(bh); 1400 } 1401 } 1402 block_start = block_end; 1403 bh = bh->b_this_page; 1404 } while (bh != head); 1405 } 1406 1407 static int ext4_journalled_write_end(struct file *file, 1408 struct address_space *mapping, 1409 loff_t pos, unsigned len, unsigned copied, 1410 struct page *page, void *fsdata) 1411 { 1412 handle_t *handle = ext4_journal_current_handle(); 1413 struct inode *inode = mapping->host; 1414 loff_t old_size = inode->i_size; 1415 int ret = 0, ret2; 1416 int partial = 0; 1417 unsigned from, to; 1418 int size_changed = 0; 1419 bool verity = ext4_verity_in_progress(inode); 1420 1421 trace_ext4_journalled_write_end(inode, pos, len, copied); 1422 from = pos & (PAGE_SIZE - 1); 1423 to = from + len; 1424 1425 BUG_ON(!ext4_handle_valid(handle)); 1426 1427 if (ext4_has_inline_data(inode)) 1428 return ext4_write_inline_data_end(inode, pos, len, copied, page); 1429 1430 if (unlikely(copied < len) && !PageUptodate(page)) { 1431 copied = 0; 1432 ext4_journalled_zero_new_buffers(handle, inode, page, from, to); 1433 } else { 1434 if (unlikely(copied < len)) 1435 ext4_journalled_zero_new_buffers(handle, inode, page, 1436 from + copied, to); 1437 ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), 1438 from, from + copied, &partial, 1439 write_end_fn); 1440 if (!partial) 1441 SetPageUptodate(page); 1442 } 1443 if (!verity) 1444 size_changed = ext4_update_inode_size(inode, pos + copied); 1445 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1446 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1447 unlock_page(page); 1448 put_page(page); 1449 1450 if (old_size < pos && !verity) 1451 pagecache_isize_extended(inode, old_size, pos); 1452 1453 if (size_changed) { 1454 ret2 = ext4_mark_inode_dirty(handle, inode); 1455 if (!ret) 1456 ret = ret2; 1457 } 1458 1459 if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) 1460 /* if we have allocated more blocks and copied 1461 * less. We will have blocks allocated outside 1462 * inode->i_size. So truncate them 1463 */ 1464 ext4_orphan_add(handle, inode); 1465 1466 ret2 = ext4_journal_stop(handle); 1467 if (!ret) 1468 ret = ret2; 1469 if (pos + len > inode->i_size && !verity) { 1470 ext4_truncate_failed_write(inode); 1471 /* 1472 * If truncate failed early the inode might still be 1473 * on the orphan list; we need to make sure the inode 1474 * is removed from the orphan list in that case. 1475 */ 1476 if (inode->i_nlink) 1477 ext4_orphan_del(NULL, inode); 1478 } 1479 1480 return ret ? 
ret : copied; 1481 } 1482 1483 /* 1484 * Reserve space for a single cluster 1485 */ 1486 static int ext4_da_reserve_space(struct inode *inode) 1487 { 1488 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1489 struct ext4_inode_info *ei = EXT4_I(inode); 1490 int ret; 1491 1492 /* 1493 * We will charge metadata quota at writeout time; this saves 1494 * us from metadata over-estimation, though we may go over by 1495 * a small amount in the end. Here we just reserve for data. 1496 */ 1497 ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); 1498 if (ret) 1499 return ret; 1500 1501 spin_lock(&ei->i_block_reservation_lock); 1502 if (ext4_claim_free_clusters(sbi, 1, 0)) { 1503 spin_unlock(&ei->i_block_reservation_lock); 1504 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); 1505 return -ENOSPC; 1506 } 1507 ei->i_reserved_data_blocks++; 1508 trace_ext4_da_reserve_space(inode); 1509 spin_unlock(&ei->i_block_reservation_lock); 1510 1511 return 0; /* success */ 1512 } 1513 1514 void ext4_da_release_space(struct inode *inode, int to_free) 1515 { 1516 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1517 struct ext4_inode_info *ei = EXT4_I(inode); 1518 1519 if (!to_free) 1520 return; /* Nothing to release, exit */ 1521 1522 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1523 1524 trace_ext4_da_release_space(inode, to_free); 1525 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1526 /* 1527 * if there aren't enough reserved blocks, then the 1528 * counter is messed up somewhere. Since this 1529 * function is called from invalidate page, it's 1530 * harmless to return without any action. 1531 */ 1532 ext4_warning(inode->i_sb, "ext4_da_release_space: " 1533 "ino %lu, to_free %d with only %d reserved " 1534 "data blocks", inode->i_ino, to_free, 1535 ei->i_reserved_data_blocks); 1536 WARN_ON(1); 1537 to_free = ei->i_reserved_data_blocks; 1538 } 1539 ei->i_reserved_data_blocks -= to_free; 1540 1541 /* update fs dirty data blocks counter */ 1542 percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); 1543 1544 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1545 1546 dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); 1547 } 1548 1549 /* 1550 * Delayed allocation stuff 1551 */ 1552 1553 struct mpage_da_data { 1554 /* These are input fields for ext4_do_writepages() */ 1555 struct inode *inode; 1556 struct writeback_control *wbc; 1557 unsigned int can_map:1; /* Can writepages call map blocks? */ 1558 1559 /* These are internal state of ext4_do_writepages() */ 1560 pgoff_t first_page; /* The first page to write */ 1561 pgoff_t next_page; /* Current page to examine */ 1562 pgoff_t last_page; /* Last page to examine */ 1563 /* 1564 * Extent to map - this can be after first_page because that can be 1565 * fully mapped. We somewhat abuse m_flags to store whether the extent 1566 * is delalloc or unwritten. 1567 */ 1568 struct ext4_map_blocks map; 1569 struct ext4_io_submit io_submit; /* IO submission data */ 1570 unsigned int do_map:1; 1571 unsigned int scanned_until_end:1; 1572 }; 1573 1574 static void mpage_release_unused_pages(struct mpage_da_data *mpd, 1575 bool invalidate) 1576 { 1577 unsigned nr, i; 1578 pgoff_t index, end; 1579 struct folio_batch fbatch; 1580 struct inode *inode = mpd->inode; 1581 struct address_space *mapping = inode->i_mapping; 1582 1583 /* This is necessary when next_page == 0. 
*/ 1584 if (mpd->first_page >= mpd->next_page) 1585 return; 1586 1587 mpd->scanned_until_end = 0; 1588 index = mpd->first_page; 1589 end = mpd->next_page - 1; 1590 if (invalidate) { 1591 ext4_lblk_t start, last; 1592 start = index << (PAGE_SHIFT - inode->i_blkbits); 1593 last = end << (PAGE_SHIFT - inode->i_blkbits); 1594 1595 /* 1596 * avoid racing with extent status tree scans made by 1597 * ext4_insert_delayed_block() 1598 */ 1599 down_write(&EXT4_I(inode)->i_data_sem); 1600 ext4_es_remove_extent(inode, start, last - start + 1); 1601 up_write(&EXT4_I(inode)->i_data_sem); 1602 } 1603 1604 folio_batch_init(&fbatch); 1605 while (index <= end) { 1606 nr = filemap_get_folios(mapping, &index, end, &fbatch); 1607 if (nr == 0) 1608 break; 1609 for (i = 0; i < nr; i++) { 1610 struct folio *folio = fbatch.folios[i]; 1611 1612 if (folio->index < mpd->first_page) 1613 continue; 1614 if (folio->index + folio_nr_pages(folio) - 1 > end) 1615 continue; 1616 BUG_ON(!folio_test_locked(folio)); 1617 BUG_ON(folio_test_writeback(folio)); 1618 if (invalidate) { 1619 if (folio_mapped(folio)) 1620 folio_clear_dirty_for_io(folio); 1621 block_invalidate_folio(folio, 0, 1622 folio_size(folio)); 1623 folio_clear_uptodate(folio); 1624 } 1625 folio_unlock(folio); 1626 } 1627 folio_batch_release(&fbatch); 1628 } 1629 } 1630 1631 static void ext4_print_free_blocks(struct inode *inode) 1632 { 1633 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1634 struct super_block *sb = inode->i_sb; 1635 struct ext4_inode_info *ei = EXT4_I(inode); 1636 1637 ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", 1638 EXT4_C2B(EXT4_SB(inode->i_sb), 1639 ext4_count_free_clusters(sb))); 1640 ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); 1641 ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", 1642 (long long) EXT4_C2B(EXT4_SB(sb), 1643 percpu_counter_sum(&sbi->s_freeclusters_counter))); 1644 ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", 1645 (long long) EXT4_C2B(EXT4_SB(sb), 1646 percpu_counter_sum(&sbi->s_dirtyclusters_counter))); 1647 ext4_msg(sb, KERN_CRIT, "Block reservation details"); 1648 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", 1649 ei->i_reserved_data_blocks); 1650 return; 1651 } 1652 1653 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, 1654 struct buffer_head *bh) 1655 { 1656 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); 1657 } 1658 1659 /* 1660 * ext4_insert_delayed_block - adds a delayed block to the extents status 1661 * tree, incrementing the reserved cluster/block 1662 * count or making a pending reservation 1663 * where needed 1664 * 1665 * @inode - file containing the newly added block 1666 * @lblk - logical block to be added 1667 * 1668 * Returns 0 on success, negative error code on failure. 1669 */ 1670 static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) 1671 { 1672 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1673 int ret; 1674 bool allocated = false; 1675 bool reserved = false; 1676 1677 /* 1678 * If the cluster containing lblk is shared with a delayed, 1679 * written, or unwritten extent in a bigalloc file system, it's 1680 * already been accounted for and does not need to be reserved. 1681 * A pending reservation must be made for the cluster if it's 1682 * shared with a written or unwritten extent and doesn't already 1683 * have one. 
Written and unwritten extents can be purged from the 1684 * extents status tree if the system is under memory pressure, so 1685 * it's necessary to examine the extent tree if a search of the 1686 * extents status tree doesn't get a match. 1687 */ 1688 if (sbi->s_cluster_ratio == 1) { 1689 ret = ext4_da_reserve_space(inode); 1690 if (ret != 0) /* ENOSPC */ 1691 goto errout; 1692 reserved = true; 1693 } else { /* bigalloc */ 1694 if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { 1695 if (!ext4_es_scan_clu(inode, 1696 &ext4_es_is_mapped, lblk)) { 1697 ret = ext4_clu_mapped(inode, 1698 EXT4_B2C(sbi, lblk)); 1699 if (ret < 0) 1700 goto errout; 1701 if (ret == 0) { 1702 ret = ext4_da_reserve_space(inode); 1703 if (ret != 0) /* ENOSPC */ 1704 goto errout; 1705 reserved = true; 1706 } else { 1707 allocated = true; 1708 } 1709 } else { 1710 allocated = true; 1711 } 1712 } 1713 } 1714 1715 ret = ext4_es_insert_delayed_block(inode, lblk, allocated); 1716 if (ret && reserved) 1717 ext4_da_release_space(inode, 1); 1718 1719 errout: 1720 return ret; 1721 } 1722 1723 /* 1724 * This function is grabs code from the very beginning of 1725 * ext4_map_blocks, but assumes that the caller is from delayed write 1726 * time. This function looks up the requested blocks and sets the 1727 * buffer delay bit under the protection of i_data_sem. 1728 */ 1729 static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, 1730 struct ext4_map_blocks *map, 1731 struct buffer_head *bh) 1732 { 1733 struct extent_status es; 1734 int retval; 1735 sector_t invalid_block = ~((sector_t) 0xffff); 1736 #ifdef ES_AGGRESSIVE_TEST 1737 struct ext4_map_blocks orig_map; 1738 1739 memcpy(&orig_map, map, sizeof(*map)); 1740 #endif 1741 1742 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) 1743 invalid_block = ~0; 1744 1745 map->m_flags = 0; 1746 ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, 1747 (unsigned long) map->m_lblk); 1748 1749 /* Lookup extent status tree firstly */ 1750 if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { 1751 if (ext4_es_is_hole(&es)) { 1752 retval = 0; 1753 down_read(&EXT4_I(inode)->i_data_sem); 1754 goto add_delayed; 1755 } 1756 1757 /* 1758 * Delayed extent could be allocated by fallocate. 1759 * So we need to check it. 1760 */ 1761 if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { 1762 map_bh(bh, inode->i_sb, invalid_block); 1763 set_buffer_new(bh); 1764 set_buffer_delay(bh); 1765 return 0; 1766 } 1767 1768 map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; 1769 retval = es.es_len - (iblock - es.es_lblk); 1770 if (retval > map->m_len) 1771 retval = map->m_len; 1772 map->m_len = retval; 1773 if (ext4_es_is_written(&es)) 1774 map->m_flags |= EXT4_MAP_MAPPED; 1775 else if (ext4_es_is_unwritten(&es)) 1776 map->m_flags |= EXT4_MAP_UNWRITTEN; 1777 else 1778 BUG(); 1779 1780 #ifdef ES_AGGRESSIVE_TEST 1781 ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); 1782 #endif 1783 return retval; 1784 } 1785 1786 /* 1787 * Try to see if we can get the block without requesting a new 1788 * file system block. 1789 */ 1790 down_read(&EXT4_I(inode)->i_data_sem); 1791 if (ext4_has_inline_data(inode)) 1792 retval = 0; 1793 else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1794 retval = ext4_ext_map_blocks(NULL, inode, map, 0); 1795 else 1796 retval = ext4_ind_map_blocks(NULL, inode, map, 0); 1797 1798 add_delayed: 1799 if (retval == 0) { 1800 int ret; 1801 1802 /* 1803 * XXX: __block_prepare_write() unmaps passed block, 1804 * is it OK? 
1805 */ 1806 1807 ret = ext4_insert_delayed_block(inode, map->m_lblk); 1808 if (ret != 0) { 1809 retval = ret; 1810 goto out_unlock; 1811 } 1812 1813 map_bh(bh, inode->i_sb, invalid_block); 1814 set_buffer_new(bh); 1815 set_buffer_delay(bh); 1816 } else if (retval > 0) { 1817 int ret; 1818 unsigned int status; 1819 1820 if (unlikely(retval != map->m_len)) { 1821 ext4_warning(inode->i_sb, 1822 "ES len assertion failed for inode " 1823 "%lu: retval %d != map->m_len %d", 1824 inode->i_ino, retval, map->m_len); 1825 WARN_ON(1); 1826 } 1827 1828 status = map->m_flags & EXT4_MAP_UNWRITTEN ? 1829 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 1830 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 1831 map->m_pblk, status); 1832 if (ret != 0) 1833 retval = ret; 1834 } 1835 1836 out_unlock: 1837 up_read((&EXT4_I(inode)->i_data_sem)); 1838 1839 return retval; 1840 } 1841 1842 /* 1843 * This is a special get_block_t callback which is used by 1844 * ext4_da_write_begin(). It will either return mapped block or 1845 * reserve space for a single block. 1846 * 1847 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. 1848 * We also have b_blocknr = -1 and b_bdev initialized properly 1849 * 1850 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. 1851 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev 1852 * initialized properly. 1853 */ 1854 int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 1855 struct buffer_head *bh, int create) 1856 { 1857 struct ext4_map_blocks map; 1858 int ret = 0; 1859 1860 BUG_ON(create == 0); 1861 BUG_ON(bh->b_size != inode->i_sb->s_blocksize); 1862 1863 map.m_lblk = iblock; 1864 map.m_len = 1; 1865 1866 /* 1867 * first, we need to know whether the block is allocated already 1868 * preallocated blocks are unmapped but should treated 1869 * the same as allocated blocks. 1870 */ 1871 ret = ext4_da_map_blocks(inode, iblock, &map, bh); 1872 if (ret <= 0) 1873 return ret; 1874 1875 map_bh(bh, inode->i_sb, map.m_pblk); 1876 ext4_update_bh_state(bh, map.m_flags); 1877 1878 if (buffer_unwritten(bh)) { 1879 /* A delayed write to unwritten bh should be marked 1880 * new and mapped. Mapped ensures that we don't do 1881 * get_block multiple times when we write to the same 1882 * offset and new ensures that we do proper zero out 1883 * for partial write. 1884 */ 1885 set_buffer_new(bh); 1886 set_buffer_mapped(bh); 1887 } 1888 return 0; 1889 } 1890 1891 static int __ext4_journalled_writepage(struct page *page, 1892 unsigned int len) 1893 { 1894 struct address_space *mapping = page->mapping; 1895 struct inode *inode = mapping->host; 1896 handle_t *handle = NULL; 1897 int ret = 0, err = 0; 1898 int inline_data = ext4_has_inline_data(inode); 1899 struct buffer_head *inode_bh = NULL; 1900 loff_t size; 1901 1902 ClearPageChecked(page); 1903 1904 if (inline_data) { 1905 BUG_ON(page->index != 0); 1906 BUG_ON(len > ext4_get_max_inline_size(inode)); 1907 inode_bh = ext4_journalled_write_inline_data(inode, len, page); 1908 if (inode_bh == NULL) 1909 goto out; 1910 } 1911 /* 1912 * We need to release the page lock before we start the 1913 * journal, so grab a reference so the page won't disappear 1914 * out from under us. 
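 * Once the handle has been started we re-take the page lock, drop the extra
 * reference, and re-check page->mapping and i_size, since the page may have
 * been truncated while it was unlocked.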
1915 */ 1916 get_page(page); 1917 unlock_page(page); 1918 1919 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1920 ext4_writepage_trans_blocks(inode)); 1921 if (IS_ERR(handle)) { 1922 ret = PTR_ERR(handle); 1923 put_page(page); 1924 goto out_no_pagelock; 1925 } 1926 BUG_ON(!ext4_handle_valid(handle)); 1927 1928 lock_page(page); 1929 put_page(page); 1930 size = i_size_read(inode); 1931 if (page->mapping != mapping || page_offset(page) > size) { 1932 /* The page got truncated from under us */ 1933 ext4_journal_stop(handle); 1934 ret = 0; 1935 goto out; 1936 } 1937 1938 if (inline_data) { 1939 ret = ext4_mark_inode_dirty(handle, inode); 1940 } else { 1941 struct buffer_head *page_bufs = page_buffers(page); 1942 1943 if (page->index == size >> PAGE_SHIFT) 1944 len = size & ~PAGE_MASK; 1945 else 1946 len = PAGE_SIZE; 1947 1948 ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, 1949 NULL, do_journal_get_write_access); 1950 1951 err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, 1952 NULL, write_end_fn); 1953 } 1954 if (ret == 0) 1955 ret = err; 1956 err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); 1957 if (ret == 0) 1958 ret = err; 1959 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1960 err = ext4_journal_stop(handle); 1961 if (!ret) 1962 ret = err; 1963 1964 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1965 out: 1966 unlock_page(page); 1967 out_no_pagelock: 1968 brelse(inode_bh); 1969 return ret; 1970 } 1971 1972 /* 1973 * Note that we don't need to start a transaction unless we're journaling data 1974 * because we should have holes filled from ext4_page_mkwrite(). We even don't 1975 * need to file the inode to the transaction's list in ordered mode because if 1976 * we are writing back data added by write(), the inode is already there and if 1977 * we are writing back data modified via mmap(), no one guarantees in which 1978 * transaction the data will hit the disk. In case we are journaling data, we 1979 * cannot start transaction directly because transaction start ranks above page 1980 * lock so we have to do some magic. 1981 * 1982 * This function can get called via... 1983 * - ext4_writepages after taking page lock (have journal handle) 1984 * - journal_submit_inode_data_buffers (no journal handle) 1985 * - shrink_page_list via the kswapd/direct reclaim (no journal handle) 1986 * - grab_page_cache when doing write_begin (have journal handle) 1987 * 1988 * We don't do any block allocation in this function. If we have page with 1989 * multiple blocks we need to write those buffer_heads that are mapped. This 1990 * is important for mmaped based write. So if we do with blocksize 1K 1991 * truncate(f, 1024); 1992 * a = mmap(f, 0, 4096); 1993 * a[0] = 'a'; 1994 * truncate(f, 4096); 1995 * we have in the page first buffer_head mapped via page_mkwrite call back 1996 * but other buffer_heads would be unmapped but dirty (dirty done via the 1997 * do_wp_page). So writepage should write the first block. If we modify 1998 * the mmap area beyond 1024 we will again get a page_fault and the 1999 * page_mkwrite callback will do the block allocation and mark the 2000 * buffer_heads mapped. 2001 * 2002 * We redirty the page if we have any buffer_heads that is either delay or 2003 * unwritten in the page. 2004 * 2005 * We can get recursively called as show below. 
2006 * 2007 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 2008 * ext4_writepage() 2009 * 2010 * But since we don't do any block allocation we should not deadlock. 2011 * Page also have the dirty flag cleared so we don't get recurive page_lock. 2012 */ 2013 static int ext4_writepage(struct page *page, 2014 struct writeback_control *wbc) 2015 { 2016 struct folio *folio = page_folio(page); 2017 int ret = 0; 2018 loff_t size; 2019 unsigned int len; 2020 struct buffer_head *page_bufs = NULL; 2021 struct inode *inode = page->mapping->host; 2022 struct ext4_io_submit io_submit; 2023 2024 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { 2025 folio_invalidate(folio, 0, folio_size(folio)); 2026 folio_unlock(folio); 2027 return -EIO; 2028 } 2029 2030 trace_ext4_writepage(page); 2031 size = i_size_read(inode); 2032 if (page->index == size >> PAGE_SHIFT && 2033 !ext4_verity_in_progress(inode)) 2034 len = size & ~PAGE_MASK; 2035 else 2036 len = PAGE_SIZE; 2037 2038 /* Should never happen but for bugs in other kernel subsystems */ 2039 if (!page_has_buffers(page)) { 2040 ext4_warning_inode(inode, 2041 "page %lu does not have buffers attached", page->index); 2042 ClearPageDirty(page); 2043 unlock_page(page); 2044 return 0; 2045 } 2046 2047 page_bufs = page_buffers(page); 2048 /* 2049 * We cannot do block allocation or other extent handling in this 2050 * function. If there are buffers needing that, we have to redirty 2051 * the page. But we may reach here when we do a journal commit via 2052 * journal_submit_inode_data_buffers() and in that case we must write 2053 * allocated buffers to achieve data=ordered mode guarantees. 2054 * 2055 * Also, if there is only one buffer per page (the fs block 2056 * size == the page size), if one buffer needs block 2057 * allocation or needs to modify the extent tree to clear the 2058 * unwritten flag, we know that the page can't be written at 2059 * all, so we might as well refuse the write immediately. 2060 * Unfortunately if the block size != page size, we can't as 2061 * easily detect this case using ext4_walk_page_buffers(), but 2062 * for the extremely common case, this is an optimization that 2063 * skips a useless round trip through ext4_bio_write_page(). 2064 */ 2065 if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, 2066 ext4_bh_delay_or_unwritten)) { 2067 redirty_page_for_writepage(wbc, page); 2068 if ((current->flags & PF_MEMALLOC) || 2069 (inode->i_sb->s_blocksize == PAGE_SIZE)) { 2070 /* 2071 * For memory cleaning there's no point in writing only 2072 * some buffers. So just bail out. Warn if we came here 2073 * from direct reclaim. 2074 */ 2075 WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) 2076 == PF_MEMALLOC); 2077 unlock_page(page); 2078 return 0; 2079 } 2080 } 2081 2082 if (PageChecked(page) && ext4_should_journal_data(inode)) 2083 /* 2084 * It's mmapped pagecache. Add buffers and journal it. There 2085 * doesn't seem much point in redirtying the page here. 
2086 */ 2087 return __ext4_journalled_writepage(page, len); 2088 2089 ext4_io_submit_init(&io_submit, wbc); 2090 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); 2091 if (!io_submit.io_end) { 2092 redirty_page_for_writepage(wbc, page); 2093 unlock_page(page); 2094 return -ENOMEM; 2095 } 2096 ret = ext4_bio_write_page(&io_submit, page, len); 2097 ext4_io_submit(&io_submit); 2098 /* Drop io_end reference we got from init */ 2099 ext4_put_io_end_defer(io_submit.io_end); 2100 return ret; 2101 } 2102 2103 static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) 2104 { 2105 int len; 2106 loff_t size; 2107 int err; 2108 2109 BUG_ON(page->index != mpd->first_page); 2110 clear_page_dirty_for_io(page); 2111 /* 2112 * We have to be very careful here! Nothing protects writeback path 2113 * against i_size changes and the page can be writeably mapped into 2114 * page tables. So an application can be growing i_size and writing 2115 * data through mmap while writeback runs. clear_page_dirty_for_io() 2116 * write-protects our page in page tables and the page cannot get 2117 * written to again until we release page lock. So only after 2118 * clear_page_dirty_for_io() we are safe to sample i_size for 2119 * ext4_bio_write_page() to zero-out tail of the written page. We rely 2120 * on the barrier provided by TestClearPageDirty in 2121 * clear_page_dirty_for_io() to make sure i_size is really sampled only 2122 * after page tables are updated. 2123 */ 2124 size = i_size_read(mpd->inode); 2125 if (page->index == size >> PAGE_SHIFT && 2126 !ext4_verity_in_progress(mpd->inode)) 2127 len = size & ~PAGE_MASK; 2128 else 2129 len = PAGE_SIZE; 2130 err = ext4_bio_write_page(&mpd->io_submit, page, len); 2131 if (!err) 2132 mpd->wbc->nr_to_write--; 2133 mpd->first_page++; 2134 2135 return err; 2136 } 2137 2138 #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) 2139 2140 /* 2141 * mballoc gives us at most this number of blocks... 2142 * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). 2143 * The rest of mballoc seems to handle chunks up to full group size. 2144 */ 2145 #define MAX_WRITEPAGES_EXTENT_LEN 2048 2146 2147 /* 2148 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map 2149 * 2150 * @mpd - extent of blocks 2151 * @lblk - logical number of the block in the file 2152 * @bh - buffer head we want to add to the extent 2153 * 2154 * The function is used to collect contig. blocks in the same state. If the 2155 * buffer doesn't require mapping for writeback and we haven't started the 2156 * extent of buffers to map yet, the function returns 'true' immediately - the 2157 * caller can write the buffer right away. Otherwise the function returns true 2158 * if the block has been added to the extent, false if the block couldn't be 2159 * added. 2160 */ 2161 static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, 2162 struct buffer_head *bh) 2163 { 2164 struct ext4_map_blocks *map = &mpd->map; 2165 2166 /* Buffer that doesn't need mapping for writeback? */ 2167 if (!buffer_dirty(bh) || !buffer_mapped(bh) || 2168 (!buffer_delay(bh) && !buffer_unwritten(bh))) { 2169 /* So far no extent to map => we write the buffer right away */ 2170 if (map->m_len == 0) 2171 return true; 2172 return false; 2173 } 2174 2175 /* First block in the extent? */ 2176 if (map->m_len == 0) { 2177 /* We cannot map unless handle is started... 
*/ 2178 if (!mpd->do_map) 2179 return false; 2180 map->m_lblk = lblk; 2181 map->m_len = 1; 2182 map->m_flags = bh->b_state & BH_FLAGS; 2183 return true; 2184 } 2185 2186 /* Don't go larger than mballoc is willing to allocate */ 2187 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) 2188 return false; 2189 2190 /* Can we merge the block to our big extent? */ 2191 if (lblk == map->m_lblk + map->m_len && 2192 (bh->b_state & BH_FLAGS) == map->m_flags) { 2193 map->m_len++; 2194 return true; 2195 } 2196 return false; 2197 } 2198 2199 /* 2200 * mpage_process_page_bufs - submit page buffers for IO or add them to extent 2201 * 2202 * @mpd - extent of blocks for mapping 2203 * @head - the first buffer in the page 2204 * @bh - buffer we should start processing from 2205 * @lblk - logical number of the block in the file corresponding to @bh 2206 * 2207 * Walk through page buffers from @bh upto @head (exclusive) and either submit 2208 * the page for IO if all buffers in this page were mapped and there's no 2209 * accumulated extent of buffers to map or add buffers in the page to the 2210 * extent of buffers to map. The function returns 1 if the caller can continue 2211 * by processing the next page, 0 if it should stop adding buffers to the 2212 * extent to map because we cannot extend it anymore. It can also return value 2213 * < 0 in case of error during IO submission. 2214 */ 2215 static int mpage_process_page_bufs(struct mpage_da_data *mpd, 2216 struct buffer_head *head, 2217 struct buffer_head *bh, 2218 ext4_lblk_t lblk) 2219 { 2220 struct inode *inode = mpd->inode; 2221 int err; 2222 ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) 2223 >> inode->i_blkbits; 2224 2225 if (ext4_verity_in_progress(inode)) 2226 blocks = EXT_MAX_BLOCKS; 2227 2228 do { 2229 BUG_ON(buffer_locked(bh)); 2230 2231 if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { 2232 /* Found extent to map? */ 2233 if (mpd->map.m_len) 2234 return 0; 2235 /* Buffer needs mapping and handle is not started? */ 2236 if (!mpd->do_map) 2237 return 0; 2238 /* Everything mapped so far and we hit EOF */ 2239 break; 2240 } 2241 } while (lblk++, (bh = bh->b_this_page) != head); 2242 /* So far everything mapped? Submit the page for IO. */ 2243 if (mpd->map.m_len == 0) { 2244 err = mpage_submit_page(mpd, head->b_page); 2245 if (err < 0) 2246 return err; 2247 } 2248 if (lblk >= blocks) { 2249 mpd->scanned_until_end = 1; 2250 return 0; 2251 } 2252 return 1; 2253 } 2254 2255 /* 2256 * mpage_process_page - update page buffers corresponding to changed extent and 2257 * may submit fully mapped page for IO 2258 * 2259 * @mpd - description of extent to map, on return next extent to map 2260 * @m_lblk - logical block mapping. 2261 * @m_pblk - corresponding physical mapping. 2262 * @map_bh - determines on return whether this page requires any further 2263 * mapping or not. 2264 * Scan given page buffers corresponding to changed extent and update buffer 2265 * state according to new extent state. 2266 * We map delalloc buffers to their physical location, clear unwritten bits. 2267 * If the given page is not fully mapped, we update @map to the next extent in 2268 * the given page that needs mapping & return @map_bh as true. 
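 * The caller (mpage_map_and_submit_buffers()) walks all pages covered by the
 * current extent and uses @map_bh to decide whether another round of extent
 * mapping is needed for the last page.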
2269 */ 2270 static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, 2271 ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, 2272 bool *map_bh) 2273 { 2274 struct buffer_head *head, *bh; 2275 ext4_io_end_t *io_end = mpd->io_submit.io_end; 2276 ext4_lblk_t lblk = *m_lblk; 2277 ext4_fsblk_t pblock = *m_pblk; 2278 int err = 0; 2279 int blkbits = mpd->inode->i_blkbits; 2280 ssize_t io_end_size = 0; 2281 struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); 2282 2283 bh = head = page_buffers(page); 2284 do { 2285 if (lblk < mpd->map.m_lblk) 2286 continue; 2287 if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { 2288 /* 2289 * Buffer after end of mapped extent. 2290 * Find next buffer in the page to map. 2291 */ 2292 mpd->map.m_len = 0; 2293 mpd->map.m_flags = 0; 2294 io_end_vec->size += io_end_size; 2295 2296 err = mpage_process_page_bufs(mpd, head, bh, lblk); 2297 if (err > 0) 2298 err = 0; 2299 if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { 2300 io_end_vec = ext4_alloc_io_end_vec(io_end); 2301 if (IS_ERR(io_end_vec)) { 2302 err = PTR_ERR(io_end_vec); 2303 goto out; 2304 } 2305 io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; 2306 } 2307 *map_bh = true; 2308 goto out; 2309 } 2310 if (buffer_delay(bh)) { 2311 clear_buffer_delay(bh); 2312 bh->b_blocknr = pblock++; 2313 } 2314 clear_buffer_unwritten(bh); 2315 io_end_size += (1 << blkbits); 2316 } while (lblk++, (bh = bh->b_this_page) != head); 2317 2318 io_end_vec->size += io_end_size; 2319 *map_bh = false; 2320 out: 2321 *m_lblk = lblk; 2322 *m_pblk = pblock; 2323 return err; 2324 } 2325 2326 /* 2327 * mpage_map_buffers - update buffers corresponding to changed extent and 2328 * submit fully mapped pages for IO 2329 * 2330 * @mpd - description of extent to map, on return next extent to map 2331 * 2332 * Scan buffers corresponding to changed extent (we expect corresponding pages 2333 * to be already locked) and update buffer state according to new extent state. 2334 * We map delalloc buffers to their physical location, clear unwritten bits, 2335 * and mark buffers as uninit when we perform writes to unwritten extents 2336 * and do extent conversion after IO is finished. If the last page is not fully 2337 * mapped, we update @map to the next extent in the last page that needs 2338 * mapping. Otherwise we submit the page for IO. 2339 */ 2340 static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) 2341 { 2342 struct folio_batch fbatch; 2343 unsigned nr, i; 2344 struct inode *inode = mpd->inode; 2345 int bpp_bits = PAGE_SHIFT - inode->i_blkbits; 2346 pgoff_t start, end; 2347 ext4_lblk_t lblk; 2348 ext4_fsblk_t pblock; 2349 int err; 2350 bool map_bh = false; 2351 2352 start = mpd->map.m_lblk >> bpp_bits; 2353 end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; 2354 lblk = start << bpp_bits; 2355 pblock = mpd->map.m_pblk; 2356 2357 folio_batch_init(&fbatch); 2358 while (start <= end) { 2359 nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch); 2360 if (nr == 0) 2361 break; 2362 for (i = 0; i < nr; i++) { 2363 struct page *page = &fbatch.folios[i]->page; 2364 2365 err = mpage_process_page(mpd, page, &lblk, &pblock, 2366 &map_bh); 2367 /* 2368 * If map_bh is true, means page may require further bh 2369 * mapping, or maybe the page was submitted for IO. 2370 * So we return to call further extent mapping. 2371 */ 2372 if (err < 0 || map_bh) 2373 goto out; 2374 /* Page fully mapped - let IO run! 
*/ 2375 err = mpage_submit_page(mpd, page); 2376 if (err < 0) 2377 goto out; 2378 } 2379 folio_batch_release(&fbatch); 2380 } 2381 /* Extent fully mapped and matches with page boundary. We are done. */ 2382 mpd->map.m_len = 0; 2383 mpd->map.m_flags = 0; 2384 return 0; 2385 out: 2386 folio_batch_release(&fbatch); 2387 return err; 2388 } 2389 2390 static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) 2391 { 2392 struct inode *inode = mpd->inode; 2393 struct ext4_map_blocks *map = &mpd->map; 2394 int get_blocks_flags; 2395 int err, dioread_nolock; 2396 2397 trace_ext4_da_write_pages_extent(inode, map); 2398 /* 2399 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or 2400 * to convert an unwritten extent to be initialized (in the case 2401 * where we have written into one or more preallocated blocks). It is 2402 * possible that we're going to need more metadata blocks than 2403 * previously reserved. However we must not fail because we're in 2404 * writeback and there is nothing we can do about it so it might result 2405 * in data loss. So use reserved blocks to allocate metadata if 2406 * possible. 2407 * 2408 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if 2409 * the blocks in question are delalloc blocks. This indicates 2410 * that the blocks and quotas has already been checked when 2411 * the data was copied into the page cache. 2412 */ 2413 get_blocks_flags = EXT4_GET_BLOCKS_CREATE | 2414 EXT4_GET_BLOCKS_METADATA_NOFAIL | 2415 EXT4_GET_BLOCKS_IO_SUBMIT; 2416 dioread_nolock = ext4_should_dioread_nolock(inode); 2417 if (dioread_nolock) 2418 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2419 if (map->m_flags & BIT(BH_Delay)) 2420 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2421 2422 err = ext4_map_blocks(handle, inode, map, get_blocks_flags); 2423 if (err < 0) 2424 return err; 2425 if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { 2426 if (!mpd->io_submit.io_end->handle && 2427 ext4_handle_valid(handle)) { 2428 mpd->io_submit.io_end->handle = handle->h_rsv_handle; 2429 handle->h_rsv_handle = NULL; 2430 } 2431 ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); 2432 } 2433 2434 BUG_ON(map->m_len == 0); 2435 return 0; 2436 } 2437 2438 /* 2439 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length 2440 * mpd->len and submit pages underlying it for IO 2441 * 2442 * @handle - handle for journal operations 2443 * @mpd - extent to map 2444 * @give_up_on_write - we set this to true iff there is a fatal error and there 2445 * is no hope of writing the data. The caller should discard 2446 * dirty pages to avoid infinite loops. 2447 * 2448 * The function maps extent starting at mpd->lblk of length mpd->len. If it is 2449 * delayed, blocks are allocated, if it is unwritten, we may need to convert 2450 * them to initialized or split the described range from larger unwritten 2451 * extent. Note that we need not map all the described range since allocation 2452 * can return less blocks or the range is covered by more unwritten extents. We 2453 * cannot map more because we are limited by reserved transaction credits. On 2454 * the other hand we always make sure that the last touched page is fully 2455 * mapped so that it can be written out (and thus forward progress is 2456 * guaranteed). After mapping we submit all mapped pages for IO. 
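 * The mapping is therefore done in a loop: mpage_map_one_extent() maps one
 * chunk of the extent, mpage_map_and_submit_buffers() writes out the pages it
 * covers, and we repeat until map->m_len drops to zero.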
2457 */ 2458 static int mpage_map_and_submit_extent(handle_t *handle, 2459 struct mpage_da_data *mpd, 2460 bool *give_up_on_write) 2461 { 2462 struct inode *inode = mpd->inode; 2463 struct ext4_map_blocks *map = &mpd->map; 2464 int err; 2465 loff_t disksize; 2466 int progress = 0; 2467 ext4_io_end_t *io_end = mpd->io_submit.io_end; 2468 struct ext4_io_end_vec *io_end_vec; 2469 2470 io_end_vec = ext4_alloc_io_end_vec(io_end); 2471 if (IS_ERR(io_end_vec)) 2472 return PTR_ERR(io_end_vec); 2473 io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; 2474 do { 2475 err = mpage_map_one_extent(handle, mpd); 2476 if (err < 0) { 2477 struct super_block *sb = inode->i_sb; 2478 2479 if (ext4_forced_shutdown(EXT4_SB(sb)) || 2480 ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) 2481 goto invalidate_dirty_pages; 2482 /* 2483 * Let the uper layers retry transient errors. 2484 * In the case of ENOSPC, if ext4_count_free_blocks() 2485 * is non-zero, a commit should free up blocks. 2486 */ 2487 if ((err == -ENOMEM) || 2488 (err == -ENOSPC && ext4_count_free_clusters(sb))) { 2489 if (progress) 2490 goto update_disksize; 2491 return err; 2492 } 2493 ext4_msg(sb, KERN_CRIT, 2494 "Delayed block allocation failed for " 2495 "inode %lu at logical offset %llu with" 2496 " max blocks %u with error %d", 2497 inode->i_ino, 2498 (unsigned long long)map->m_lblk, 2499 (unsigned)map->m_len, -err); 2500 ext4_msg(sb, KERN_CRIT, 2501 "This should not happen!! Data will " 2502 "be lost\n"); 2503 if (err == -ENOSPC) 2504 ext4_print_free_blocks(inode); 2505 invalidate_dirty_pages: 2506 *give_up_on_write = true; 2507 return err; 2508 } 2509 progress = 1; 2510 /* 2511 * Update buffer state, submit mapped pages, and get us new 2512 * extent to map 2513 */ 2514 err = mpage_map_and_submit_buffers(mpd); 2515 if (err < 0) 2516 goto update_disksize; 2517 } while (map->m_len); 2518 2519 update_disksize: 2520 /* 2521 * Update on-disk size after IO is submitted. Races with 2522 * truncate are avoided by checking i_size under i_data_sem. 2523 */ 2524 disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; 2525 if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { 2526 int err2; 2527 loff_t i_size; 2528 2529 down_write(&EXT4_I(inode)->i_data_sem); 2530 i_size = i_size_read(inode); 2531 if (disksize > i_size) 2532 disksize = i_size; 2533 if (disksize > EXT4_I(inode)->i_disksize) 2534 EXT4_I(inode)->i_disksize = disksize; 2535 up_write(&EXT4_I(inode)->i_data_sem); 2536 err2 = ext4_mark_inode_dirty(handle, inode); 2537 if (err2) { 2538 ext4_error_err(inode->i_sb, -err2, 2539 "Failed to mark inode %lu dirty", 2540 inode->i_ino); 2541 } 2542 if (!err) 2543 err = err2; 2544 } 2545 return err; 2546 } 2547 2548 /* 2549 * Calculate the total number of credits to reserve for one writepages 2550 * iteration. This is called from ext4_writepages(). We map an extent of 2551 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping 2552 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + 2553 * bpp - 1 blocks in bpp different extents. 
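 * For example, with a 4K page size and a 1K block size bpp is 4, so one
 * iteration reserves credits for mapping up to MAX_WRITEPAGES_EXTENT_LEN + 3
 * blocks spread over at most 4 discontiguous extents.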
2554 */ 2555 static int ext4_da_writepages_trans_blocks(struct inode *inode) 2556 { 2557 int bpp = ext4_journal_blocks_per_page(inode); 2558 2559 return ext4_meta_trans_blocks(inode, 2560 MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); 2561 } 2562 2563 /* Return true if the page needs to be written as part of transaction commit */ 2564 static bool ext4_page_nomap_can_writeout(struct page *page) 2565 { 2566 struct buffer_head *bh, *head; 2567 2568 bh = head = page_buffers(page); 2569 do { 2570 if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh)) 2571 return true; 2572 } while ((bh = bh->b_this_page) != head); 2573 return false; 2574 } 2575 2576 /* 2577 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages 2578 * needing mapping, submit mapped pages 2579 * 2580 * @mpd - where to look for pages 2581 * 2582 * Walk dirty pages in the mapping. If they are fully mapped, submit them for 2583 * IO immediately. If we cannot map blocks, we submit just already mapped 2584 * buffers in the page for IO and keep page dirty. When we can map blocks and 2585 * we find a page which isn't mapped we start accumulating extent of buffers 2586 * underlying these pages that needs mapping (formed by either delayed or 2587 * unwritten buffers). We also lock the pages containing these buffers. The 2588 * extent found is returned in @mpd structure (starting at mpd->lblk with 2589 * length mpd->len blocks). 2590 * 2591 * Note that this function can attach bios to one io_end structure which are 2592 * neither logically nor physically contiguous. Although it may seem as an 2593 * unnecessary complication, it is actually inevitable in blocksize < pagesize 2594 * case as we need to track IO to all buffers underlying a page in one io_end. 2595 */ 2596 static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) 2597 { 2598 struct address_space *mapping = mpd->inode->i_mapping; 2599 struct pagevec pvec; 2600 unsigned int nr_pages; 2601 long left = mpd->wbc->nr_to_write; 2602 pgoff_t index = mpd->first_page; 2603 pgoff_t end = mpd->last_page; 2604 xa_mark_t tag; 2605 int i, err = 0; 2606 int blkbits = mpd->inode->i_blkbits; 2607 ext4_lblk_t lblk; 2608 struct buffer_head *head; 2609 2610 if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) 2611 tag = PAGECACHE_TAG_TOWRITE; 2612 else 2613 tag = PAGECACHE_TAG_DIRTY; 2614 2615 pagevec_init(&pvec); 2616 mpd->map.m_len = 0; 2617 mpd->next_page = index; 2618 while (index <= end) { 2619 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, 2620 tag); 2621 if (nr_pages == 0) 2622 break; 2623 2624 for (i = 0; i < nr_pages; i++) { 2625 struct page *page = pvec.pages[i]; 2626 2627 /* 2628 * Accumulated enough dirty pages? This doesn't apply 2629 * to WB_SYNC_ALL mode. For integrity sync we have to 2630 * keep going because someone may be concurrently 2631 * dirtying pages, and we might have synced a lot of 2632 * newly appeared dirty pages, but have not synced all 2633 * of the old dirty pages. 2634 */ 2635 if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) 2636 goto out; 2637 2638 /* If we can't merge this page, we are done. 
*/ 2639 if (mpd->map.m_len > 0 && mpd->next_page != page->index) 2640 goto out; 2641 2642 lock_page(page); 2643 /* 2644 * If the page is no longer dirty, or its mapping no 2645 * longer corresponds to inode we are writing (which 2646 * means it has been truncated or invalidated), or the 2647 * page is already under writeback and we are not doing 2648 * a data integrity writeback, skip the page 2649 */ 2650 if (!PageDirty(page) || 2651 (PageWriteback(page) && 2652 (mpd->wbc->sync_mode == WB_SYNC_NONE)) || 2653 unlikely(page->mapping != mapping)) { 2654 unlock_page(page); 2655 continue; 2656 } 2657 2658 wait_on_page_writeback(page); 2659 BUG_ON(PageWriteback(page)); 2660 2661 /* 2662 * Should never happen but for buggy code in 2663 * other subsystems that call 2664 * set_page_dirty() without properly warning 2665 * the file system first. See [1] for more 2666 * information. 2667 * 2668 * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz 2669 */ 2670 if (!page_has_buffers(page)) { 2671 ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index); 2672 ClearPageDirty(page); 2673 unlock_page(page); 2674 continue; 2675 } 2676 2677 if (mpd->map.m_len == 0) 2678 mpd->first_page = page->index; 2679 mpd->next_page = page->index + 1; 2680 /* 2681 * Writeout for transaction commit where we cannot 2682 * modify metadata is simple. Just submit the page. 2683 */ 2684 if (!mpd->can_map) { 2685 if (ext4_page_nomap_can_writeout(page)) { 2686 err = mpage_submit_page(mpd, page); 2687 if (err < 0) 2688 goto out; 2689 } else { 2690 unlock_page(page); 2691 mpd->first_page++; 2692 } 2693 } else { 2694 /* Add all dirty buffers to mpd */ 2695 lblk = ((ext4_lblk_t)page->index) << 2696 (PAGE_SHIFT - blkbits); 2697 head = page_buffers(page); 2698 err = mpage_process_page_bufs(mpd, head, head, 2699 lblk); 2700 if (err <= 0) 2701 goto out; 2702 err = 0; 2703 } 2704 left--; 2705 } 2706 pagevec_release(&pvec); 2707 cond_resched(); 2708 } 2709 mpd->scanned_until_end = 1; 2710 return 0; 2711 out: 2712 pagevec_release(&pvec); 2713 return err; 2714 } 2715 2716 static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, 2717 void *data) 2718 { 2719 return ext4_writepage(page, wbc); 2720 } 2721 2722 static int ext4_do_writepages(struct mpage_da_data *mpd) 2723 { 2724 struct writeback_control *wbc = mpd->wbc; 2725 pgoff_t writeback_index = 0; 2726 long nr_to_write = wbc->nr_to_write; 2727 int range_whole = 0; 2728 int cycled = 1; 2729 handle_t *handle = NULL; 2730 struct inode *inode = mpd->inode; 2731 struct address_space *mapping = inode->i_mapping; 2732 int needed_blocks, rsv_blocks = 0, ret = 0; 2733 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2734 struct blk_plug plug; 2735 bool give_up_on_write = false; 2736 2737 trace_ext4_writepages(inode, wbc); 2738 2739 /* 2740 * No pages to write? 
This is mainly a kludge to avoid starting 2741 * a transaction for special inodes like journal inode on last iput() 2742 * because that could violate lock ordering on umount 2743 */ 2744 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2745 goto out_writepages; 2746 2747 if (ext4_should_journal_data(inode)) { 2748 blk_start_plug(&plug); 2749 ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); 2750 blk_finish_plug(&plug); 2751 goto out_writepages; 2752 } 2753 2754 /* 2755 * If the filesystem has aborted, it is read-only, so return 2756 * right away instead of dumping stack traces later on that 2757 * will obscure the real source of the problem. We test 2758 * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because 2759 * the latter could be true if the filesystem is mounted 2760 * read-only, and in that case, ext4_writepages should 2761 * *never* be called, so if that ever happens, we would want 2762 * the stack trace. 2763 */ 2764 if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || 2765 ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { 2766 ret = -EROFS; 2767 goto out_writepages; 2768 } 2769 2770 /* 2771 * If we have inline data and arrive here, it means that 2772 * we will soon create the block for the 1st page, so 2773 * we'd better clear the inline data here. 2774 */ 2775 if (ext4_has_inline_data(inode)) { 2776 /* Just inode will be modified... */ 2777 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 2778 if (IS_ERR(handle)) { 2779 ret = PTR_ERR(handle); 2780 goto out_writepages; 2781 } 2782 BUG_ON(ext4_test_inode_state(inode, 2783 EXT4_STATE_MAY_INLINE_DATA)); 2784 ext4_destroy_inline_data(handle, inode); 2785 ext4_journal_stop(handle); 2786 } 2787 2788 if (ext4_should_dioread_nolock(inode)) { 2789 /* 2790 * We may need to convert up to one extent per block in 2791 * the page and we may dirty the inode. 2792 */ 2793 rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, 2794 PAGE_SIZE >> inode->i_blkbits); 2795 } 2796 2797 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2798 range_whole = 1; 2799 2800 if (wbc->range_cyclic) { 2801 writeback_index = mapping->writeback_index; 2802 if (writeback_index) 2803 cycled = 0; 2804 mpd->first_page = writeback_index; 2805 mpd->last_page = -1; 2806 } else { 2807 mpd->first_page = wbc->range_start >> PAGE_SHIFT; 2808 mpd->last_page = wbc->range_end >> PAGE_SHIFT; 2809 } 2810 2811 ext4_io_submit_init(&mpd->io_submit, wbc); 2812 retry: 2813 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2814 tag_pages_for_writeback(mapping, mpd->first_page, 2815 mpd->last_page); 2816 blk_start_plug(&plug); 2817 2818 /* 2819 * First writeback pages that don't need mapping - we can avoid 2820 * starting a transaction unnecessarily and also avoid being blocked 2821 * in the block layer on device congestion while having transaction 2822 * started. 
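 * In this pass mpd->do_map is 0, so no block allocation is attempted: dirty
 * pages whose buffers are already mapped are written out directly, and
 * anything that needs allocation is left to the transaction-backed loop
 * below.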
2823 */ 2824 mpd->do_map = 0; 2825 mpd->scanned_until_end = 0; 2826 mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); 2827 if (!mpd->io_submit.io_end) { 2828 ret = -ENOMEM; 2829 goto unplug; 2830 } 2831 ret = mpage_prepare_extent_to_map(mpd); 2832 /* Unlock pages we didn't use */ 2833 mpage_release_unused_pages(mpd, false); 2834 /* Submit prepared bio */ 2835 ext4_io_submit(&mpd->io_submit); 2836 ext4_put_io_end_defer(mpd->io_submit.io_end); 2837 mpd->io_submit.io_end = NULL; 2838 if (ret < 0) 2839 goto unplug; 2840 2841 while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { 2842 /* For each extent of pages we use new io_end */ 2843 mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); 2844 if (!mpd->io_submit.io_end) { 2845 ret = -ENOMEM; 2846 break; 2847 } 2848 2849 WARN_ON_ONCE(!mpd->can_map); 2850 /* 2851 * We have two constraints: We find one extent to map and we 2852 * must always write out whole page (makes a difference when 2853 * blocksize < pagesize) so that we don't block on IO when we 2854 * try to write out the rest of the page. Journalled mode is 2855 * not supported by delalloc. 2856 */ 2857 BUG_ON(ext4_should_journal_data(inode)); 2858 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2859 2860 /* start a new transaction */ 2861 handle = ext4_journal_start_with_reserve(inode, 2862 EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); 2863 if (IS_ERR(handle)) { 2864 ret = PTR_ERR(handle); 2865 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2866 "%ld pages, ino %lu; err %d", __func__, 2867 wbc->nr_to_write, inode->i_ino, ret); 2868 /* Release allocated io_end */ 2869 ext4_put_io_end(mpd->io_submit.io_end); 2870 mpd->io_submit.io_end = NULL; 2871 break; 2872 } 2873 mpd->do_map = 1; 2874 2875 trace_ext4_da_write_pages(inode, mpd->first_page, wbc); 2876 ret = mpage_prepare_extent_to_map(mpd); 2877 if (!ret && mpd->map.m_len) 2878 ret = mpage_map_and_submit_extent(handle, mpd, 2879 &give_up_on_write); 2880 /* 2881 * Caution: If the handle is synchronous, 2882 * ext4_journal_stop() can wait for transaction commit 2883 * to finish which may depend on writeback of pages to 2884 * complete or on page lock to be released. In that 2885 * case, we have to wait until after we have 2886 * submitted all the IO, released page locks we hold, 2887 * and dropped io_end reference (for extent conversion 2888 * to be able to complete) before stopping the handle. 2889 */ 2890 if (!ext4_handle_valid(handle) || handle->h_sync == 0) { 2891 ext4_journal_stop(handle); 2892 handle = NULL; 2893 mpd->do_map = 0; 2894 } 2895 /* Unlock pages we didn't use */ 2896 mpage_release_unused_pages(mpd, give_up_on_write); 2897 /* Submit prepared bio */ 2898 ext4_io_submit(&mpd->io_submit); 2899 2900 /* 2901 * Drop our io_end reference we got from init. We have 2902 * to be careful and use deferred io_end finishing if 2903 * we are still holding the transaction as we can 2904 * release the last reference to io_end which may end 2905 * up doing unwritten extent conversion. 2906 */ 2907 if (handle) { 2908 ext4_put_io_end_defer(mpd->io_submit.io_end); 2909 ext4_journal_stop(handle); 2910 } else 2911 ext4_put_io_end(mpd->io_submit.io_end); 2912 mpd->io_submit.io_end = NULL; 2913 2914 if (ret == -ENOSPC && sbi->s_journal) { 2915 /* 2916 * Commit the transaction which would 2917 * free blocks released in the transaction 2918 * and try again 2919 */ 2920 jbd2_journal_force_commit_nested(sbi->s_journal); 2921 ret = 0; 2922 continue; 2923 } 2924 /* Fatal error - ENOMEM, EIO... 
*/ 2925 if (ret) 2926 break; 2927 } 2928 unplug: 2929 blk_finish_plug(&plug); 2930 if (!ret && !cycled && wbc->nr_to_write > 0) { 2931 cycled = 1; 2932 mpd->last_page = writeback_index - 1; 2933 mpd->first_page = 0; 2934 goto retry; 2935 } 2936 2937 /* Update index */ 2938 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2939 /* 2940 * Set the writeback_index so that range_cyclic 2941 * mode will write it back later 2942 */ 2943 mapping->writeback_index = mpd->first_page; 2944 2945 out_writepages: 2946 trace_ext4_writepages_result(inode, wbc, ret, 2947 nr_to_write - wbc->nr_to_write); 2948 return ret; 2949 } 2950 2951 static int ext4_writepages(struct address_space *mapping, 2952 struct writeback_control *wbc) 2953 { 2954 struct super_block *sb = mapping->host->i_sb; 2955 struct mpage_da_data mpd = { 2956 .inode = mapping->host, 2957 .wbc = wbc, 2958 .can_map = 1, 2959 }; 2960 int ret; 2961 2962 if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) 2963 return -EIO; 2964 2965 percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); 2966 ret = ext4_do_writepages(&mpd); 2967 percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); 2968 2969 return ret; 2970 } 2971 2972 int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) 2973 { 2974 struct writeback_control wbc = { 2975 .sync_mode = WB_SYNC_ALL, 2976 .nr_to_write = LONG_MAX, 2977 .range_start = jinode->i_dirty_start, 2978 .range_end = jinode->i_dirty_end, 2979 }; 2980 struct mpage_da_data mpd = { 2981 .inode = jinode->i_vfs_inode, 2982 .wbc = &wbc, 2983 .can_map = 0, 2984 }; 2985 return ext4_do_writepages(&mpd); 2986 } 2987 2988 static int ext4_dax_writepages(struct address_space *mapping, 2989 struct writeback_control *wbc) 2990 { 2991 int ret; 2992 long nr_to_write = wbc->nr_to_write; 2993 struct inode *inode = mapping->host; 2994 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2995 2996 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 2997 return -EIO; 2998 2999 percpu_down_read(&sbi->s_writepages_rwsem); 3000 trace_ext4_writepages(inode, wbc); 3001 3002 ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); 3003 trace_ext4_writepages_result(inode, wbc, ret, 3004 nr_to_write - wbc->nr_to_write); 3005 percpu_up_read(&sbi->s_writepages_rwsem); 3006 return ret; 3007 } 3008 3009 static int ext4_nonda_switch(struct super_block *sb) 3010 { 3011 s64 free_clusters, dirty_clusters; 3012 struct ext4_sb_info *sbi = EXT4_SB(sb); 3013 3014 /* 3015 * switch to non delalloc mode if we are running low 3016 * on free block. The free block accounting via percpu 3017 * counters can get slightly wrong with percpu_counter_batch getting 3018 * accumulated on each CPU without updating global counters 3019 * Delalloc need an accurate free block accounting. So switch 3020 * to non delalloc when we are near to error range. 3021 */ 3022 free_clusters = 3023 percpu_counter_read_positive(&sbi->s_freeclusters_counter); 3024 dirty_clusters = 3025 percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); 3026 /* 3027 * Start pushing delalloc when 1/2 of free blocks are dirty. 
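 * For example, with 100 dirty clusters, background writeback is kicked once
 * free clusters fall below 200, and the caller is told to fall back to
 * non-delalloc allocation once free clusters fall below 150 (or below
 * dirty + EXT4_FREECLUSTERS_WATERMARK).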
3028 */ 3029 if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) 3030 try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); 3031 3032 if (2 * free_clusters < 3 * dirty_clusters || 3033 free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { 3034 /* 3035 * free block count is less than 150% of dirty blocks 3036 * or free blocks is less than watermark 3037 */ 3038 return 1; 3039 } 3040 return 0; 3041 } 3042 3043 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 3044 loff_t pos, unsigned len, 3045 struct page **pagep, void **fsdata) 3046 { 3047 int ret, retries = 0; 3048 struct page *page; 3049 pgoff_t index; 3050 struct inode *inode = mapping->host; 3051 3052 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 3053 return -EIO; 3054 3055 index = pos >> PAGE_SHIFT; 3056 3057 if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { 3058 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 3059 return ext4_write_begin(file, mapping, pos, 3060 len, pagep, fsdata); 3061 } 3062 *fsdata = (void *)0; 3063 trace_ext4_da_write_begin(inode, pos, len); 3064 3065 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { 3066 ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, 3067 pagep, fsdata); 3068 if (ret < 0) 3069 return ret; 3070 if (ret == 1) 3071 return 0; 3072 } 3073 3074 retry: 3075 page = grab_cache_page_write_begin(mapping, index); 3076 if (!page) 3077 return -ENOMEM; 3078 3079 /* In case writeback began while the page was unlocked */ 3080 wait_for_stable_page(page); 3081 3082 #ifdef CONFIG_FS_ENCRYPTION 3083 ret = ext4_block_write_begin(page, pos, len, 3084 ext4_da_get_block_prep); 3085 #else 3086 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); 3087 #endif 3088 if (ret < 0) { 3089 unlock_page(page); 3090 put_page(page); 3091 /* 3092 * block_write_begin may have instantiated a few blocks 3093 * outside i_size. Trim these off again. Don't need 3094 * i_size_read because we hold inode lock. 
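 * ext4_truncate_failed_write() below trims those blocks off again, and an
 * ENOSPC failure is retried as long as ext4_should_retry_alloc() allows it.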
3095 */ 3096 if (pos + len > inode->i_size) 3097 ext4_truncate_failed_write(inode); 3098 3099 if (ret == -ENOSPC && 3100 ext4_should_retry_alloc(inode->i_sb, &retries)) 3101 goto retry; 3102 return ret; 3103 } 3104 3105 *pagep = page; 3106 return ret; 3107 } 3108 3109 /* 3110 * Check if we should update i_disksize 3111 * when write to the end of file but not require block allocation 3112 */ 3113 static int ext4_da_should_update_i_disksize(struct page *page, 3114 unsigned long offset) 3115 { 3116 struct buffer_head *bh; 3117 struct inode *inode = page->mapping->host; 3118 unsigned int idx; 3119 int i; 3120 3121 bh = page_buffers(page); 3122 idx = offset >> inode->i_blkbits; 3123 3124 for (i = 0; i < idx; i++) 3125 bh = bh->b_this_page; 3126 3127 if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) 3128 return 0; 3129 return 1; 3130 } 3131 3132 static int ext4_da_write_end(struct file *file, 3133 struct address_space *mapping, 3134 loff_t pos, unsigned len, unsigned copied, 3135 struct page *page, void *fsdata) 3136 { 3137 struct inode *inode = mapping->host; 3138 loff_t new_i_size; 3139 unsigned long start, end; 3140 int write_mode = (int)(unsigned long)fsdata; 3141 3142 if (write_mode == FALL_BACK_TO_NONDELALLOC) 3143 return ext4_write_end(file, mapping, pos, 3144 len, copied, page, fsdata); 3145 3146 trace_ext4_da_write_end(inode, pos, len, copied); 3147 3148 if (write_mode != CONVERT_INLINE_DATA && 3149 ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && 3150 ext4_has_inline_data(inode)) 3151 return ext4_write_inline_data_end(inode, pos, len, copied, page); 3152 3153 start = pos & (PAGE_SIZE - 1); 3154 end = start + copied - 1; 3155 3156 /* 3157 * Since we are holding inode lock, we are sure i_disksize <= 3158 * i_size. We also know that if i_disksize < i_size, there are 3159 * delalloc writes pending in the range upto i_size. If the end of 3160 * the current write is <= i_size, there's no need to touch 3161 * i_disksize since writeback will push i_disksize upto i_size 3162 * eventually. If the end of the current write is > i_size and 3163 * inside an allocated block (ext4_da_should_update_i_disksize() 3164 * check), we need to update i_disksize here as neither 3165 * ext4_writepage() nor certain ext4_writepages() paths not 3166 * allocating blocks update i_disksize. 3167 * 3168 * Note that we defer inode dirtying to generic_write_end() / 3169 * ext4_da_write_inline_data_end(). 3170 */ 3171 new_i_size = pos + copied; 3172 if (copied && new_i_size > inode->i_size && 3173 ext4_da_should_update_i_disksize(page, end)) 3174 ext4_update_i_disksize(inode, new_i_size); 3175 3176 return generic_write_end(file, mapping, pos, len, copied, page, fsdata); 3177 } 3178 3179 /* 3180 * Force all delayed allocation blocks to be allocated for a given inode. 3181 */ 3182 int ext4_alloc_da_blocks(struct inode *inode) 3183 { 3184 trace_ext4_alloc_da_blocks(inode); 3185 3186 if (!EXT4_I(inode)->i_reserved_data_blocks) 3187 return 0; 3188 3189 /* 3190 * We do something simple for now. The filemap_flush() will 3191 * also start triggering a write of the data blocks, which is 3192 * not strictly speaking necessary (and for users of 3193 * laptop_mode, not even desirable). 
However, to do otherwise 3194 * would require replicating code paths in: 3195 * 3196 * ext4_writepages() -> 3197 * write_cache_pages() ---> (via passed in callback function) 3198 * __mpage_da_writepage() --> 3199 * mpage_add_bh_to_extent() 3200 * mpage_da_map_blocks() 3201 * 3202 * The problem is that write_cache_pages(), located in 3203 * mm/page-writeback.c, marks pages clean in preparation for 3204 * doing I/O, which is not desirable if we're not planning on 3205 * doing I/O at all. 3206 * 3207 * We could call write_cache_pages(), and then redirty all of 3208 * the pages by calling redirty_page_for_writepage() but that 3209 * would be ugly in the extreme. So instead we would need to 3210 * replicate parts of the code in the above functions, 3211 * simplifying them because we wouldn't actually intend to 3212 * write out the pages, but rather only collect contiguous 3213 * logical block extents, call the multi-block allocator, and 3214 * then update the buffer heads with the block allocations. 3215 * 3216 * For now, though, we'll cheat by calling filemap_flush(), 3217 * which will map the blocks, and start the I/O, but not 3218 * actually wait for the I/O to complete. 3219 */ 3220 return filemap_flush(inode->i_mapping); 3221 } 3222 3223 /* 3224 * bmap() is special. It gets used by applications such as lilo and by 3225 * the swapper to find the on-disk block of a specific piece of data. 3226 * 3227 * Naturally, this is dangerous if the block concerned is still in the 3228 * journal. If somebody makes a swapfile on an ext4 data-journaling 3229 * filesystem and enables swap, then they may get a nasty shock when the 3230 * data getting swapped to that swapfile suddenly gets overwritten by 3231 * the original zero's written out previously to the journal and 3232 * awaiting writeback in the kernel's buffer cache. 3233 * 3234 * So, if we see any bmap calls here on a modified, data-journaled file, 3235 * take extra steps to flush any blocks which might be in the cache. 3236 */ 3237 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 3238 { 3239 struct inode *inode = mapping->host; 3240 journal_t *journal; 3241 sector_t ret = 0; 3242 int err; 3243 3244 inode_lock_shared(inode); 3245 /* 3246 * We can get here for an inline file via the FIBMAP ioctl 3247 */ 3248 if (ext4_has_inline_data(inode)) 3249 goto out; 3250 3251 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 3252 test_opt(inode->i_sb, DELALLOC)) { 3253 /* 3254 * With delalloc we want to sync the file 3255 * so that we can make sure we allocate 3256 * blocks for file 3257 */ 3258 filemap_write_and_wait(mapping); 3259 } 3260 3261 if (EXT4_JOURNAL(inode) && 3262 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { 3263 /* 3264 * This is a REALLY heavyweight approach, but the use of 3265 * bmap on dirty files is expected to be extremely rare: 3266 * only if we run lilo or swapon on a freshly made file 3267 * do we expect this to happen. 3268 * 3269 * (bmap requires CAP_SYS_RAWIO so this does not 3270 * represent an unprivileged user DOS attack --- we'd be 3271 * in trouble if mortal users could trigger this path at 3272 * will.) 3273 * 3274 * NB. EXT4_STATE_JDATA is not set on files other than 3275 * regular files. If somebody wants to bmap a directory 3276 * or symlink and gets confused because the buffer 3277 * hasn't yet been flushed to disk, they deserve 3278 * everything they get. 
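 * Flushing the journal below forces any journalled data for this inode out
 * to its final on-disk location, so the block number reported by
 * iomap_bmap() afterwards is actually valid.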
3279 */ 3280 3281 ext4_clear_inode_state(inode, EXT4_STATE_JDATA); 3282 journal = EXT4_JOURNAL(inode); 3283 jbd2_journal_lock_updates(journal); 3284 err = jbd2_journal_flush(journal, 0); 3285 jbd2_journal_unlock_updates(journal); 3286 3287 if (err) 3288 goto out; 3289 } 3290 3291 ret = iomap_bmap(mapping, block, &ext4_iomap_ops); 3292 3293 out: 3294 inode_unlock_shared(inode); 3295 return ret; 3296 } 3297 3298 static int ext4_read_folio(struct file *file, struct folio *folio) 3299 { 3300 struct page *page = &folio->page; 3301 int ret = -EAGAIN; 3302 struct inode *inode = page->mapping->host; 3303 3304 trace_ext4_readpage(page); 3305 3306 if (ext4_has_inline_data(inode)) 3307 ret = ext4_readpage_inline(inode, page); 3308 3309 if (ret == -EAGAIN) 3310 return ext4_mpage_readpages(inode, NULL, page); 3311 3312 return ret; 3313 } 3314 3315 static void ext4_readahead(struct readahead_control *rac) 3316 { 3317 struct inode *inode = rac->mapping->host; 3318 3319 /* If the file has inline data, no need to do readahead. */ 3320 if (ext4_has_inline_data(inode)) 3321 return; 3322 3323 ext4_mpage_readpages(inode, rac, NULL); 3324 } 3325 3326 static void ext4_invalidate_folio(struct folio *folio, size_t offset, 3327 size_t length) 3328 { 3329 trace_ext4_invalidate_folio(folio, offset, length); 3330 3331 /* No journalling happens on data buffers when this function is used */ 3332 WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio))); 3333 3334 block_invalidate_folio(folio, offset, length); 3335 } 3336 3337 static int __ext4_journalled_invalidate_folio(struct folio *folio, 3338 size_t offset, size_t length) 3339 { 3340 journal_t *journal = EXT4_JOURNAL(folio->mapping->host); 3341 3342 trace_ext4_journalled_invalidate_folio(folio, offset, length); 3343 3344 /* 3345 * If it's a full truncate we just forget about the pending dirtying 3346 */ 3347 if (offset == 0 && length == folio_size(folio)) 3348 folio_clear_checked(folio); 3349 3350 return jbd2_journal_invalidate_folio(journal, folio, offset, length); 3351 } 3352 3353 /* Wrapper for aops... */ 3354 static void ext4_journalled_invalidate_folio(struct folio *folio, 3355 size_t offset, 3356 size_t length) 3357 { 3358 WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0); 3359 } 3360 3361 static bool ext4_release_folio(struct folio *folio, gfp_t wait) 3362 { 3363 journal_t *journal = EXT4_JOURNAL(folio->mapping->host); 3364 3365 trace_ext4_releasepage(&folio->page); 3366 3367 /* Page has dirty journalled data -> cannot release */ 3368 if (folio_test_checked(folio)) 3369 return false; 3370 if (journal) 3371 return jbd2_journal_try_to_free_buffers(journal, folio); 3372 else 3373 return try_to_free_buffers(folio); 3374 } 3375 3376 static bool ext4_inode_datasync_dirty(struct inode *inode) 3377 { 3378 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 3379 3380 if (journal) { 3381 if (jbd2_transaction_committed(journal, 3382 EXT4_I(inode)->i_datasync_tid)) 3383 return false; 3384 if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) 3385 return !list_empty(&EXT4_I(inode)->i_fc_list); 3386 return true; 3387 } 3388 3389 /* Any metadata buffers to write? 
*/ 3390 if (!list_empty(&inode->i_mapping->private_list)) 3391 return true; 3392 return inode->i_state & I_DIRTY_DATASYNC; 3393 } 3394 3395 static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, 3396 struct ext4_map_blocks *map, loff_t offset, 3397 loff_t length, unsigned int flags) 3398 { 3399 u8 blkbits = inode->i_blkbits; 3400 3401 /* 3402 * Writes that span EOF might trigger an I/O size update on completion, 3403 * so consider them to be dirty for the purpose of O_DSYNC, even if 3404 * there is no other metadata changes being made or are pending. 3405 */ 3406 iomap->flags = 0; 3407 if (ext4_inode_datasync_dirty(inode) || 3408 offset + length > i_size_read(inode)) 3409 iomap->flags |= IOMAP_F_DIRTY; 3410 3411 if (map->m_flags & EXT4_MAP_NEW) 3412 iomap->flags |= IOMAP_F_NEW; 3413 3414 if (flags & IOMAP_DAX) 3415 iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; 3416 else 3417 iomap->bdev = inode->i_sb->s_bdev; 3418 iomap->offset = (u64) map->m_lblk << blkbits; 3419 iomap->length = (u64) map->m_len << blkbits; 3420 3421 if ((map->m_flags & EXT4_MAP_MAPPED) && 3422 !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3423 iomap->flags |= IOMAP_F_MERGED; 3424 3425 /* 3426 * Flags passed to ext4_map_blocks() for direct I/O writes can result 3427 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits 3428 * set. In order for any allocated unwritten extents to be converted 3429 * into written extents correctly within the ->end_io() handler, we 3430 * need to ensure that the iomap->type is set appropriately. Hence, the 3431 * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has 3432 * been set first. 3433 */ 3434 if (map->m_flags & EXT4_MAP_UNWRITTEN) { 3435 iomap->type = IOMAP_UNWRITTEN; 3436 iomap->addr = (u64) map->m_pblk << blkbits; 3437 if (flags & IOMAP_DAX) 3438 iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; 3439 } else if (map->m_flags & EXT4_MAP_MAPPED) { 3440 iomap->type = IOMAP_MAPPED; 3441 iomap->addr = (u64) map->m_pblk << blkbits; 3442 if (flags & IOMAP_DAX) 3443 iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; 3444 } else { 3445 iomap->type = IOMAP_HOLE; 3446 iomap->addr = IOMAP_NULL_ADDR; 3447 } 3448 } 3449 3450 static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, 3451 unsigned int flags) 3452 { 3453 handle_t *handle; 3454 u8 blkbits = inode->i_blkbits; 3455 int ret, dio_credits, m_flags = 0, retries = 0; 3456 3457 /* 3458 * Trim the mapping request to the maximum value that we can map at 3459 * once for direct I/O. 3460 */ 3461 if (map->m_len > DIO_MAX_BLOCKS) 3462 map->m_len = DIO_MAX_BLOCKS; 3463 dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); 3464 3465 retry: 3466 /* 3467 * Either we allocate blocks and then don't get an unwritten extent, so 3468 * in that case we have reserved enough credits. Or, the blocks are 3469 * already allocated and unwritten. In that case, the extent conversion 3470 * fits into the credits as well. 3471 */ 3472 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); 3473 if (IS_ERR(handle)) 3474 return PTR_ERR(handle); 3475 3476 /* 3477 * DAX and direct I/O are the only two operations that are currently 3478 * supported with IOMAP_WRITE. 3479 */ 3480 WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); 3481 if (flags & IOMAP_DAX) 3482 m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; 3483 /* 3484 * We use i_size instead of i_disksize here because delalloc writeback 3485 * can complete at any point during the I/O and subsequently push the 3486 * i_disksize out to i_size. 
This could be beyond where direct I/O is 3487 * happening and thus expose allocated blocks to direct I/O reads. 3488 */ 3489 else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) 3490 m_flags = EXT4_GET_BLOCKS_CREATE; 3491 else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3492 m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; 3493 3494 ret = ext4_map_blocks(handle, inode, map, m_flags); 3495 3496 /* 3497 * We cannot fill holes in indirect tree based inodes as that could 3498 * expose stale data in the case of a crash. Use the magic error code 3499 * to fallback to buffered I/O. 3500 */ 3501 if (!m_flags && !ret) 3502 ret = -ENOTBLK; 3503 3504 ext4_journal_stop(handle); 3505 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3506 goto retry; 3507 3508 return ret; 3509 } 3510 3511 3512 static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 3513 unsigned flags, struct iomap *iomap, struct iomap *srcmap) 3514 { 3515 int ret; 3516 struct ext4_map_blocks map; 3517 u8 blkbits = inode->i_blkbits; 3518 3519 if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) 3520 return -EINVAL; 3521 3522 if (WARN_ON_ONCE(ext4_has_inline_data(inode))) 3523 return -ERANGE; 3524 3525 /* 3526 * Calculate the first and last logical blocks respectively. 3527 */ 3528 map.m_lblk = offset >> blkbits; 3529 map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, 3530 EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; 3531 3532 if (flags & IOMAP_WRITE) { 3533 /* 3534 * We check here if the blocks are already allocated, then we 3535 * don't need to start a journal txn and we can directly return 3536 * the mapping information. This could boost performance 3537 * especially in multi-threaded overwrite requests. 3538 */ 3539 if (offset + length <= i_size_read(inode)) { 3540 ret = ext4_map_blocks(NULL, inode, &map, 0); 3541 if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) 3542 goto out; 3543 } 3544 ret = ext4_iomap_alloc(inode, &map, flags); 3545 } else { 3546 ret = ext4_map_blocks(NULL, inode, &map, 0); 3547 } 3548 3549 if (ret < 0) 3550 return ret; 3551 out: 3552 /* 3553 * When inline encryption is enabled, sometimes I/O to an encrypted file 3554 * has to be broken up to guarantee DUN contiguity. Handle this by 3555 * limiting the length of the mapping returned. 3556 */ 3557 map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); 3558 3559 ext4_set_iomap(inode, iomap, &map, offset, length, flags); 3560 3561 return 0; 3562 } 3563 3564 static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, 3565 loff_t length, unsigned flags, struct iomap *iomap, 3566 struct iomap *srcmap) 3567 { 3568 int ret; 3569 3570 /* 3571 * Even for writes we don't need to allocate blocks, so just pretend 3572 * we are reading to save overhead of starting a transaction. 3573 */ 3574 flags &= ~IOMAP_WRITE; 3575 ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); 3576 WARN_ON_ONCE(iomap->type != IOMAP_MAPPED); 3577 return ret; 3578 } 3579 3580 static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, 3581 ssize_t written, unsigned flags, struct iomap *iomap) 3582 { 3583 /* 3584 * Check to see whether an error occurred while writing out the data to 3585 * the allocated blocks. If so, return the magic error code so that we 3586 * fallback to buffered I/O and attempt to complete the remainder of 3587 * the I/O. Any blocks that may have been allocated in preparation for 3588 * the direct I/O will be reused during buffered I/O. 
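 * As a rough, illustrative sketch of the caller side (simplified from
 * ext4_dio_write_iter() in fs/ext4/file.c, not a verbatim copy), the
 * direct write path treats the magic value like this:
 *
 *	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, ...);
 *	if (ret == -ENOTBLK)
 *		ret = 0;	/* complete the rest via buffered I/O */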
3589 */ 3590 if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) 3591 return -ENOTBLK; 3592 3593 return 0; 3594 } 3595 3596 const struct iomap_ops ext4_iomap_ops = { 3597 .iomap_begin = ext4_iomap_begin, 3598 .iomap_end = ext4_iomap_end, 3599 }; 3600 3601 const struct iomap_ops ext4_iomap_overwrite_ops = { 3602 .iomap_begin = ext4_iomap_overwrite_begin, 3603 .iomap_end = ext4_iomap_end, 3604 }; 3605 3606 static bool ext4_iomap_is_delalloc(struct inode *inode, 3607 struct ext4_map_blocks *map) 3608 { 3609 struct extent_status es; 3610 ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; 3611 3612 ext4_es_find_extent_range(inode, &ext4_es_is_delayed, 3613 map->m_lblk, end, &es); 3614 3615 if (!es.es_len || es.es_lblk > end) 3616 return false; 3617 3618 if (es.es_lblk > map->m_lblk) { 3619 map->m_len = es.es_lblk - map->m_lblk; 3620 return false; 3621 } 3622 3623 offset = map->m_lblk - es.es_lblk; 3624 map->m_len = es.es_len - offset; 3625 3626 return true; 3627 } 3628 3629 static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, 3630 loff_t length, unsigned int flags, 3631 struct iomap *iomap, struct iomap *srcmap) 3632 { 3633 int ret; 3634 bool delalloc = false; 3635 struct ext4_map_blocks map; 3636 u8 blkbits = inode->i_blkbits; 3637 3638 if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) 3639 return -EINVAL; 3640 3641 if (ext4_has_inline_data(inode)) { 3642 ret = ext4_inline_data_iomap(inode, iomap); 3643 if (ret != -EAGAIN) { 3644 if (ret == 0 && offset >= iomap->length) 3645 ret = -ENOENT; 3646 return ret; 3647 } 3648 } 3649 3650 /* 3651 * Calculate the first and last logical block respectively. 3652 */ 3653 map.m_lblk = offset >> blkbits; 3654 map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, 3655 EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; 3656 3657 /* 3658 * Fiemap callers may call for offset beyond s_bitmap_maxbytes. 3659 * So handle it here itself instead of querying ext4_map_blocks(). 3660 * Since ext4_map_blocks() will warn about it and will return 3661 * -EIO error. 3662 */ 3663 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 3664 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3665 3666 if (offset >= sbi->s_bitmap_maxbytes) { 3667 map.m_flags = 0; 3668 goto set_iomap; 3669 } 3670 } 3671 3672 ret = ext4_map_blocks(NULL, inode, &map, 0); 3673 if (ret < 0) 3674 return ret; 3675 if (ret == 0) 3676 delalloc = ext4_iomap_is_delalloc(inode, &map); 3677 3678 set_iomap: 3679 ext4_set_iomap(inode, iomap, &map, offset, length, flags); 3680 if (delalloc && iomap->type == IOMAP_HOLE) 3681 iomap->type = IOMAP_DELALLOC; 3682 3683 return 0; 3684 } 3685 3686 const struct iomap_ops ext4_iomap_report_ops = { 3687 .iomap_begin = ext4_iomap_begin_report, 3688 }; 3689 3690 /* 3691 * Whenever the folio is being dirtied, corresponding buffers should already 3692 * be attached to the transaction (we take care of this in ext4_page_mkwrite() 3693 * and ext4_write_begin()). However we cannot move buffers to dirty transaction 3694 * lists here because ->dirty_folio is called under VFS locks and the folio 3695 * is not necessarily locked. 3696 * 3697 * We cannot just dirty the folio and leave attached buffers clean, because the 3698 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 3699 * or jbddirty because all the journalling code will explode. 3700 * 3701 * So what we do is to mark the folio "pending dirty" and next time writepage 3702 * is called, propagate that into the buffers appropriately. 
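 * ("Pending dirty" is recorded with folio_set_checked() in
 * ext4_journalled_dirty_folio() below; the data=journal writeback path
 * is then expected to notice the checked flag and journal the attached
 * buffers, e.g. via __ext4_journalled_writepage().)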
3703 */ 3704 static bool ext4_journalled_dirty_folio(struct address_space *mapping, 3705 struct folio *folio) 3706 { 3707 WARN_ON_ONCE(!folio_buffers(folio)); 3708 folio_set_checked(folio); 3709 return filemap_dirty_folio(mapping, folio); 3710 } 3711 3712 static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) 3713 { 3714 WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio)); 3715 WARN_ON_ONCE(!folio_buffers(folio)); 3716 return block_dirty_folio(mapping, folio); 3717 } 3718 3719 static int ext4_iomap_swap_activate(struct swap_info_struct *sis, 3720 struct file *file, sector_t *span) 3721 { 3722 return iomap_swapfile_activate(sis, file, span, 3723 &ext4_iomap_report_ops); 3724 } 3725 3726 static const struct address_space_operations ext4_aops = { 3727 .read_folio = ext4_read_folio, 3728 .readahead = ext4_readahead, 3729 .writepages = ext4_writepages, 3730 .write_begin = ext4_write_begin, 3731 .write_end = ext4_write_end, 3732 .dirty_folio = ext4_dirty_folio, 3733 .bmap = ext4_bmap, 3734 .invalidate_folio = ext4_invalidate_folio, 3735 .release_folio = ext4_release_folio, 3736 .direct_IO = noop_direct_IO, 3737 .migrate_folio = buffer_migrate_folio, 3738 .is_partially_uptodate = block_is_partially_uptodate, 3739 .error_remove_page = generic_error_remove_page, 3740 .swap_activate = ext4_iomap_swap_activate, 3741 }; 3742 3743 static const struct address_space_operations ext4_journalled_aops = { 3744 .read_folio = ext4_read_folio, 3745 .readahead = ext4_readahead, 3746 .writepages = ext4_writepages, 3747 .write_begin = ext4_write_begin, 3748 .write_end = ext4_journalled_write_end, 3749 .dirty_folio = ext4_journalled_dirty_folio, 3750 .bmap = ext4_bmap, 3751 .invalidate_folio = ext4_journalled_invalidate_folio, 3752 .release_folio = ext4_release_folio, 3753 .direct_IO = noop_direct_IO, 3754 .migrate_folio = buffer_migrate_folio_norefs, 3755 .is_partially_uptodate = block_is_partially_uptodate, 3756 .error_remove_page = generic_error_remove_page, 3757 .swap_activate = ext4_iomap_swap_activate, 3758 }; 3759 3760 static const struct address_space_operations ext4_da_aops = { 3761 .read_folio = ext4_read_folio, 3762 .readahead = ext4_readahead, 3763 .writepages = ext4_writepages, 3764 .write_begin = ext4_da_write_begin, 3765 .write_end = ext4_da_write_end, 3766 .dirty_folio = ext4_dirty_folio, 3767 .bmap = ext4_bmap, 3768 .invalidate_folio = ext4_invalidate_folio, 3769 .release_folio = ext4_release_folio, 3770 .direct_IO = noop_direct_IO, 3771 .migrate_folio = buffer_migrate_folio, 3772 .is_partially_uptodate = block_is_partially_uptodate, 3773 .error_remove_page = generic_error_remove_page, 3774 .swap_activate = ext4_iomap_swap_activate, 3775 }; 3776 3777 static const struct address_space_operations ext4_dax_aops = { 3778 .writepages = ext4_dax_writepages, 3779 .direct_IO = noop_direct_IO, 3780 .dirty_folio = noop_dirty_folio, 3781 .bmap = ext4_bmap, 3782 .swap_activate = ext4_iomap_swap_activate, 3783 }; 3784 3785 void ext4_set_aops(struct inode *inode) 3786 { 3787 switch (ext4_inode_journal_mode(inode)) { 3788 case EXT4_INODE_ORDERED_DATA_MODE: 3789 case EXT4_INODE_WRITEBACK_DATA_MODE: 3790 break; 3791 case EXT4_INODE_JOURNAL_DATA_MODE: 3792 inode->i_mapping->a_ops = &ext4_journalled_aops; 3793 return; 3794 default: 3795 BUG(); 3796 } 3797 if (IS_DAX(inode)) 3798 inode->i_mapping->a_ops = &ext4_dax_aops; 3799 else if (test_opt(inode->i_sb, DELALLOC)) 3800 inode->i_mapping->a_ops = &ext4_da_aops; 3801 else 3802 inode->i_mapping->a_ops = &ext4_aops; 3803 } 3804 
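/*
 * __ext4_block_zero_page_range() zeroes 'length' bytes of the block that
 * contains file offset 'from'; the caller guarantees the range lies
 * within a single block. The page is read in (and decrypted when
 * fs-layer encryption is in use) if it is not already uptodate, the
 * requested bytes are zeroed in the page cache, and the buffer is then
 * either journalled (data=journal) or marked dirty and, for
 * data=ordered, tracked in the journal's inode list so the zeroed data
 * reaches disk before the transaction commits.
 */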
3805 static int __ext4_block_zero_page_range(handle_t *handle, 3806 struct address_space *mapping, loff_t from, loff_t length) 3807 { 3808 ext4_fsblk_t index = from >> PAGE_SHIFT; 3809 unsigned offset = from & (PAGE_SIZE-1); 3810 unsigned blocksize, pos; 3811 ext4_lblk_t iblock; 3812 struct inode *inode = mapping->host; 3813 struct buffer_head *bh; 3814 struct page *page; 3815 int err = 0; 3816 3817 page = find_or_create_page(mapping, from >> PAGE_SHIFT, 3818 mapping_gfp_constraint(mapping, ~__GFP_FS)); 3819 if (!page) 3820 return -ENOMEM; 3821 3822 blocksize = inode->i_sb->s_blocksize; 3823 3824 iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); 3825 3826 if (!page_has_buffers(page)) 3827 create_empty_buffers(page, blocksize, 0); 3828 3829 /* Find the buffer that contains "offset" */ 3830 bh = page_buffers(page); 3831 pos = blocksize; 3832 while (offset >= pos) { 3833 bh = bh->b_this_page; 3834 iblock++; 3835 pos += blocksize; 3836 } 3837 if (buffer_freed(bh)) { 3838 BUFFER_TRACE(bh, "freed: skip"); 3839 goto unlock; 3840 } 3841 if (!buffer_mapped(bh)) { 3842 BUFFER_TRACE(bh, "unmapped"); 3843 ext4_get_block(inode, iblock, bh, 0); 3844 /* unmapped? It's a hole - nothing to do */ 3845 if (!buffer_mapped(bh)) { 3846 BUFFER_TRACE(bh, "still unmapped"); 3847 goto unlock; 3848 } 3849 } 3850 3851 /* Ok, it's mapped. Make sure it's up-to-date */ 3852 if (PageUptodate(page)) 3853 set_buffer_uptodate(bh); 3854 3855 if (!buffer_uptodate(bh)) { 3856 err = ext4_read_bh_lock(bh, 0, true); 3857 if (err) 3858 goto unlock; 3859 if (fscrypt_inode_uses_fs_layer_crypto(inode)) { 3860 /* We expect the key to be set. */ 3861 BUG_ON(!fscrypt_has_encryption_key(inode)); 3862 err = fscrypt_decrypt_pagecache_blocks(page_folio(page), 3863 blocksize, 3864 bh_offset(bh)); 3865 if (err) { 3866 clear_buffer_uptodate(bh); 3867 goto unlock; 3868 } 3869 } 3870 } 3871 if (ext4_should_journal_data(inode)) { 3872 BUFFER_TRACE(bh, "get write access"); 3873 err = ext4_journal_get_write_access(handle, inode->i_sb, bh, 3874 EXT4_JTR_NONE); 3875 if (err) 3876 goto unlock; 3877 } 3878 zero_user(page, offset, length); 3879 BUFFER_TRACE(bh, "zeroed end of block"); 3880 3881 if (ext4_should_journal_data(inode)) { 3882 err = ext4_handle_dirty_metadata(handle, inode, bh); 3883 } else { 3884 err = 0; 3885 mark_buffer_dirty(bh); 3886 if (ext4_should_order_data(inode)) 3887 err = ext4_jbd2_inode_add_write(handle, inode, from, 3888 length); 3889 } 3890 3891 unlock: 3892 unlock_page(page); 3893 put_page(page); 3894 return err; 3895 } 3896 3897 /* 3898 * ext4_block_zero_page_range() zeros out a mapping of length 'length' 3899 * starting from file offset 'from'. The range to be zero'd must 3900 * be contained with in one block. 
If the specified range exceeds 3901 * the end of the block it will be shortened to end of the block 3902 * that corresponds to 'from' 3903 */ 3904 static int ext4_block_zero_page_range(handle_t *handle, 3905 struct address_space *mapping, loff_t from, loff_t length) 3906 { 3907 struct inode *inode = mapping->host; 3908 unsigned offset = from & (PAGE_SIZE-1); 3909 unsigned blocksize = inode->i_sb->s_blocksize; 3910 unsigned max = blocksize - (offset & (blocksize - 1)); 3911 3912 /* 3913 * correct length if it does not fall between 3914 * 'from' and the end of the block 3915 */ 3916 if (length > max || length < 0) 3917 length = max; 3918 3919 if (IS_DAX(inode)) { 3920 return dax_zero_range(inode, from, length, NULL, 3921 &ext4_iomap_ops); 3922 } 3923 return __ext4_block_zero_page_range(handle, mapping, from, length); 3924 } 3925 3926 /* 3927 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3928 * up to the end of the block which corresponds to `from'. 3929 * This required during truncate. We need to physically zero the tail end 3930 * of that block so it doesn't yield old data if the file is later grown. 3931 */ 3932 static int ext4_block_truncate_page(handle_t *handle, 3933 struct address_space *mapping, loff_t from) 3934 { 3935 unsigned offset = from & (PAGE_SIZE-1); 3936 unsigned length; 3937 unsigned blocksize; 3938 struct inode *inode = mapping->host; 3939 3940 /* If we are processing an encrypted inode during orphan list handling */ 3941 if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) 3942 return 0; 3943 3944 blocksize = inode->i_sb->s_blocksize; 3945 length = blocksize - (offset & (blocksize - 1)); 3946 3947 return ext4_block_zero_page_range(handle, mapping, from, length); 3948 } 3949 3950 int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 3951 loff_t lstart, loff_t length) 3952 { 3953 struct super_block *sb = inode->i_sb; 3954 struct address_space *mapping = inode->i_mapping; 3955 unsigned partial_start, partial_end; 3956 ext4_fsblk_t start, end; 3957 loff_t byte_end = (lstart + length - 1); 3958 int err = 0; 3959 3960 partial_start = lstart & (sb->s_blocksize - 1); 3961 partial_end = byte_end & (sb->s_blocksize - 1); 3962 3963 start = lstart >> sb->s_blocksize_bits; 3964 end = byte_end >> sb->s_blocksize_bits; 3965 3966 /* Handle partial zero within the single block */ 3967 if (start == end && 3968 (partial_start || (partial_end != sb->s_blocksize - 1))) { 3969 err = ext4_block_zero_page_range(handle, mapping, 3970 lstart, length); 3971 return err; 3972 } 3973 /* Handle partial zero out on the start of the range */ 3974 if (partial_start) { 3975 err = ext4_block_zero_page_range(handle, mapping, 3976 lstart, sb->s_blocksize); 3977 if (err) 3978 return err; 3979 } 3980 /* Handle partial zero out on the end of the range */ 3981 if (partial_end != sb->s_blocksize - 1) 3982 err = ext4_block_zero_page_range(handle, mapping, 3983 byte_end - partial_end, 3984 partial_end + 1); 3985 return err; 3986 } 3987 3988 int ext4_can_truncate(struct inode *inode) 3989 { 3990 if (S_ISREG(inode->i_mode)) 3991 return 1; 3992 if (S_ISDIR(inode->i_mode)) 3993 return 1; 3994 if (S_ISLNK(inode->i_mode)) 3995 return !ext4_inode_is_fast_symlink(inode); 3996 return 0; 3997 } 3998 3999 /* 4000 * We have to make sure i_disksize gets properly updated before we truncate 4001 * page cache due to hole punching or zero range. 
Otherwise i_disksize update 4002 * can get lost as it may have been postponed to submission of writeback but 4003 * that will never happen after we truncate page cache. 4004 */ 4005 int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, 4006 loff_t len) 4007 { 4008 handle_t *handle; 4009 int ret; 4010 4011 loff_t size = i_size_read(inode); 4012 4013 WARN_ON(!inode_is_locked(inode)); 4014 if (offset > size || offset + len < size) 4015 return 0; 4016 4017 if (EXT4_I(inode)->i_disksize >= size) 4018 return 0; 4019 4020 handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); 4021 if (IS_ERR(handle)) 4022 return PTR_ERR(handle); 4023 ext4_update_i_disksize(inode, size); 4024 ret = ext4_mark_inode_dirty(handle, inode); 4025 ext4_journal_stop(handle); 4026 4027 return ret; 4028 } 4029 4030 static void ext4_wait_dax_page(struct inode *inode) 4031 { 4032 filemap_invalidate_unlock(inode->i_mapping); 4033 schedule(); 4034 filemap_invalidate_lock(inode->i_mapping); 4035 } 4036 4037 int ext4_break_layouts(struct inode *inode) 4038 { 4039 struct page *page; 4040 int error; 4041 4042 if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) 4043 return -EINVAL; 4044 4045 do { 4046 page = dax_layout_busy_page(inode->i_mapping); 4047 if (!page) 4048 return 0; 4049 4050 error = ___wait_var_event(&page->_refcount, 4051 atomic_read(&page->_refcount) == 1, 4052 TASK_INTERRUPTIBLE, 0, 0, 4053 ext4_wait_dax_page(inode)); 4054 } while (error == 0); 4055 4056 return error; 4057 } 4058 4059 /* 4060 * ext4_punch_hole: punches a hole in a file by releasing the blocks 4061 * associated with the given offset and length 4062 * 4063 * @inode: File inode 4064 * @offset: The offset where the hole will begin 4065 * @len: The length of the hole 4066 * 4067 * Returns: 0 on success or negative on failure 4068 */ 4069 4070 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 4071 { 4072 struct inode *inode = file_inode(file); 4073 struct super_block *sb = inode->i_sb; 4074 ext4_lblk_t first_block, stop_block; 4075 struct address_space *mapping = inode->i_mapping; 4076 loff_t first_block_offset, last_block_offset, max_length; 4077 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4078 handle_t *handle; 4079 unsigned int credits; 4080 int ret = 0, ret2 = 0; 4081 4082 trace_ext4_punch_hole(inode, offset, length, 0); 4083 4084 /* 4085 * Write out all dirty pages to avoid race conditions 4086 * Then release them. 4087 */ 4088 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 4089 ret = filemap_write_and_wait_range(mapping, offset, 4090 offset + length - 1); 4091 if (ret) 4092 return ret; 4093 } 4094 4095 inode_lock(inode); 4096 4097 /* No need to punch hole beyond i_size */ 4098 if (offset >= inode->i_size) 4099 goto out_mutex; 4100 4101 /* 4102 * If the hole extends beyond i_size, set the hole 4103 * to end after the page that contains i_size 4104 */ 4105 if (offset + length > inode->i_size) { 4106 length = inode->i_size + 4107 PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - 4108 offset; 4109 } 4110 4111 /* 4112 * For punch hole the length + offset needs to be within one block 4113 * before last range. Adjust the length if it goes beyond that limit. 
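 * As an illustrative example (sizes assumed, not read from a real
 * superblock): with 4KiB blocks and s_bitmap_maxbytes of 16GiB,
 * max_length below is 16GiB - 4KiB, so a punch request whose
 * offset + length reaches past that point is clamped to
 * length = max_length - offset.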
4114 */ 4115 max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; 4116 if (offset + length > max_length) 4117 length = max_length - offset; 4118 4119 if (offset & (sb->s_blocksize - 1) || 4120 (offset + length) & (sb->s_blocksize - 1)) { 4121 /* 4122 * Attach jinode to inode for jbd2 if we do any zeroing of 4123 * partial block 4124 */ 4125 ret = ext4_inode_attach_jinode(inode); 4126 if (ret < 0) 4127 goto out_mutex; 4128 4129 } 4130 4131 /* Wait all existing dio workers, newcomers will block on i_rwsem */ 4132 inode_dio_wait(inode); 4133 4134 ret = file_modified(file); 4135 if (ret) 4136 goto out_mutex; 4137 4138 /* 4139 * Prevent page faults from reinstantiating pages we have released from 4140 * page cache. 4141 */ 4142 filemap_invalidate_lock(mapping); 4143 4144 ret = ext4_break_layouts(inode); 4145 if (ret) 4146 goto out_dio; 4147 4148 first_block_offset = round_up(offset, sb->s_blocksize); 4149 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; 4150 4151 /* Now release the pages and zero block aligned part of pages*/ 4152 if (last_block_offset > first_block_offset) { 4153 ret = ext4_update_disksize_before_punch(inode, offset, length); 4154 if (ret) 4155 goto out_dio; 4156 truncate_pagecache_range(inode, first_block_offset, 4157 last_block_offset); 4158 } 4159 4160 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4161 credits = ext4_writepage_trans_blocks(inode); 4162 else 4163 credits = ext4_blocks_for_truncate(inode); 4164 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 4165 if (IS_ERR(handle)) { 4166 ret = PTR_ERR(handle); 4167 ext4_std_error(sb, ret); 4168 goto out_dio; 4169 } 4170 4171 ret = ext4_zero_partial_blocks(handle, inode, offset, 4172 length); 4173 if (ret) 4174 goto out_stop; 4175 4176 first_block = (offset + sb->s_blocksize - 1) >> 4177 EXT4_BLOCK_SIZE_BITS(sb); 4178 stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); 4179 4180 /* If there are blocks to remove, do it */ 4181 if (stop_block > first_block) { 4182 4183 down_write(&EXT4_I(inode)->i_data_sem); 4184 ext4_discard_preallocations(inode, 0); 4185 4186 ret = ext4_es_remove_extent(inode, first_block, 4187 stop_block - first_block); 4188 if (ret) { 4189 up_write(&EXT4_I(inode)->i_data_sem); 4190 goto out_stop; 4191 } 4192 4193 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4194 ret = ext4_ext_remove_space(inode, first_block, 4195 stop_block - 1); 4196 else 4197 ret = ext4_ind_remove_space(handle, inode, first_block, 4198 stop_block); 4199 4200 up_write(&EXT4_I(inode)->i_data_sem); 4201 } 4202 ext4_fc_track_range(handle, inode, first_block, stop_block); 4203 if (IS_SYNC(inode)) 4204 ext4_handle_sync(handle); 4205 4206 inode->i_mtime = inode->i_ctime = current_time(inode); 4207 ret2 = ext4_mark_inode_dirty(handle, inode); 4208 if (unlikely(ret2)) 4209 ret = ret2; 4210 if (ret >= 0) 4211 ext4_update_inode_fsync_trans(handle, inode, 1); 4212 out_stop: 4213 ext4_journal_stop(handle); 4214 out_dio: 4215 filemap_invalidate_unlock(mapping); 4216 out_mutex: 4217 inode_unlock(inode); 4218 return ret; 4219 } 4220 4221 int ext4_inode_attach_jinode(struct inode *inode) 4222 { 4223 struct ext4_inode_info *ei = EXT4_I(inode); 4224 struct jbd2_inode *jinode; 4225 4226 if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) 4227 return 0; 4228 4229 jinode = jbd2_alloc_inode(GFP_KERNEL); 4230 spin_lock(&inode->i_lock); 4231 if (!ei->jinode) { 4232 if (!jinode) { 4233 spin_unlock(&inode->i_lock); 4234 return -ENOMEM; 4235 } 4236 ei->jinode = jinode; 4237 
jbd2_journal_init_jbd_inode(ei->jinode, inode); 4238 jinode = NULL; 4239 } 4240 spin_unlock(&inode->i_lock); 4241 if (unlikely(jinode != NULL)) 4242 jbd2_free_inode(jinode); 4243 return 0; 4244 } 4245 4246 /* 4247 * ext4_truncate() 4248 * 4249 * We block out ext4_get_block() block instantiations across the entire 4250 * transaction, and VFS/VM ensures that ext4_truncate() cannot run 4251 * simultaneously on behalf of the same inode. 4252 * 4253 * As we work through the truncate and commit bits of it to the journal there 4254 * is one core, guiding principle: the file's tree must always be consistent on 4255 * disk. We must be able to restart the truncate after a crash. 4256 * 4257 * The file's tree may be transiently inconsistent in memory (although it 4258 * probably isn't), but whenever we close off and commit a journal transaction, 4259 * the contents of (the filesystem + the journal) must be consistent and 4260 * restartable. It's pretty simple, really: bottom up, right to left (although 4261 * left-to-right works OK too). 4262 * 4263 * Note that at recovery time, journal replay occurs *before* the restart of 4264 * truncate against the orphan inode list. 4265 * 4266 * The committed inode has the new, desired i_size (which is the same as 4267 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see 4268 * that this inode's truncate did not complete and it will again call 4269 * ext4_truncate() to have another go. So there will be instantiated blocks 4270 * to the right of the truncation point in a crashed ext4 filesystem. But 4271 * that's fine - as long as they are linked from the inode, the post-crash 4272 * ext4_truncate() run will find them and release them. 4273 */ 4274 int ext4_truncate(struct inode *inode) 4275 { 4276 struct ext4_inode_info *ei = EXT4_I(inode); 4277 unsigned int credits; 4278 int err = 0, err2; 4279 handle_t *handle; 4280 struct address_space *mapping = inode->i_mapping; 4281 4282 /* 4283 * There is a possibility that we're either freeing the inode 4284 * or it's a completely new inode. In those cases we might not 4285 * have i_rwsem locked because it's not necessary. 4286 */ 4287 if (!(inode->i_state & (I_NEW|I_FREEING))) 4288 WARN_ON(!inode_is_locked(inode)); 4289 trace_ext4_truncate_enter(inode); 4290 4291 if (!ext4_can_truncate(inode)) 4292 goto out_trace; 4293 4294 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4295 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4296 4297 if (ext4_has_inline_data(inode)) { 4298 int has_inline = 1; 4299 4300 err = ext4_inline_data_truncate(inode, &has_inline); 4301 if (err || has_inline) 4302 goto out_trace; 4303 } 4304 4305 /* If we zero-out tail of the page, we have to create jinode for jbd2 */ 4306 if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { 4307 err = ext4_inode_attach_jinode(inode); 4308 if (err) 4309 goto out_trace; 4310 } 4311 4312 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4313 credits = ext4_writepage_trans_blocks(inode); 4314 else 4315 credits = ext4_blocks_for_truncate(inode); 4316 4317 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 4318 if (IS_ERR(handle)) { 4319 err = PTR_ERR(handle); 4320 goto out_trace; 4321 } 4322 4323 if (inode->i_size & (inode->i_sb->s_blocksize - 1)) 4324 ext4_block_truncate_page(handle, mapping, inode->i_size); 4325 4326 /* 4327 * We add the inode to the orphan list, so that if this 4328 * truncate spans multiple transactions, and we crash, we will 4329 * resume the truncate when the filesystem recovers. 
It also 4330 * marks the inode dirty, to catch the new size. 4331 * 4332 * Implication: the file must always be in a sane, consistent 4333 * truncatable state while each transaction commits. 4334 */ 4335 err = ext4_orphan_add(handle, inode); 4336 if (err) 4337 goto out_stop; 4338 4339 down_write(&EXT4_I(inode)->i_data_sem); 4340 4341 ext4_discard_preallocations(inode, 0); 4342 4343 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4344 err = ext4_ext_truncate(handle, inode); 4345 else 4346 ext4_ind_truncate(handle, inode); 4347 4348 up_write(&ei->i_data_sem); 4349 if (err) 4350 goto out_stop; 4351 4352 if (IS_SYNC(inode)) 4353 ext4_handle_sync(handle); 4354 4355 out_stop: 4356 /* 4357 * If this was a simple ftruncate() and the file will remain alive, 4358 * then we need to clear up the orphan record which we created above. 4359 * However, if this was a real unlink then we were called by 4360 * ext4_evict_inode(), and we allow that function to clean up the 4361 * orphan info for us. 4362 */ 4363 if (inode->i_nlink) 4364 ext4_orphan_del(handle, inode); 4365 4366 inode->i_mtime = inode->i_ctime = current_time(inode); 4367 err2 = ext4_mark_inode_dirty(handle, inode); 4368 if (unlikely(err2 && !err)) 4369 err = err2; 4370 ext4_journal_stop(handle); 4371 4372 out_trace: 4373 trace_ext4_truncate_exit(inode); 4374 return err; 4375 } 4376 4377 static inline u64 ext4_inode_peek_iversion(const struct inode *inode) 4378 { 4379 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) 4380 return inode_peek_iversion_raw(inode); 4381 else 4382 return inode_peek_iversion(inode); 4383 } 4384 4385 static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, 4386 struct ext4_inode_info *ei) 4387 { 4388 struct inode *inode = &(ei->vfs_inode); 4389 u64 i_blocks = READ_ONCE(inode->i_blocks); 4390 struct super_block *sb = inode->i_sb; 4391 4392 if (i_blocks <= ~0U) { 4393 /* 4394 * i_blocks can be represented in a 32 bit variable 4395 * as multiple of 512 bytes 4396 */ 4397 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4398 raw_inode->i_blocks_high = 0; 4399 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 4400 return 0; 4401 } 4402 4403 /* 4404 * This should never happen since sb->s_maxbytes should not have 4405 * allowed this, sb->s_maxbytes was set according to the huge_file 4406 * feature in ext4_fill_super(). 
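 * For a rough sense of scale (back-of-the-envelope, assuming 4KiB
 * filesystem blocks): a 32-bit i_blocks_lo in 512-byte units tops out
 * at 2^32 * 512 = 2TiB, the combined 48-bit lo/high field at
 * 2^48 * 512 = 128PiB, and with the HUGE_FILE inode flag the same 48
 * bits count filesystem blocks instead, i.e. up to 2^48 * 4096 bytes.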
4407 */ 4408 if (!ext4_has_feature_huge_file(sb)) 4409 return -EFSCORRUPTED; 4410 4411 if (i_blocks <= 0xffffffffffffULL) { 4412 /* 4413 * i_blocks can be represented in a 48 bit variable 4414 * as multiple of 512 bytes 4415 */ 4416 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4417 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4418 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 4419 } else { 4420 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); 4421 /* i_block is stored in file system block size */ 4422 i_blocks = i_blocks >> (inode->i_blkbits - 9); 4423 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4424 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4425 } 4426 return 0; 4427 } 4428 4429 static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) 4430 { 4431 struct ext4_inode_info *ei = EXT4_I(inode); 4432 uid_t i_uid; 4433 gid_t i_gid; 4434 projid_t i_projid; 4435 int block; 4436 int err; 4437 4438 err = ext4_inode_blocks_set(raw_inode, ei); 4439 4440 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4441 i_uid = i_uid_read(inode); 4442 i_gid = i_gid_read(inode); 4443 i_projid = from_kprojid(&init_user_ns, ei->i_projid); 4444 if (!(test_opt(inode->i_sb, NO_UID32))) { 4445 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); 4446 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); 4447 /* 4448 * Fix up interoperability with old kernels. Otherwise, 4449 * old inodes get re-used with the upper 16 bits of the 4450 * uid/gid intact. 4451 */ 4452 if (ei->i_dtime && list_empty(&ei->i_orphan)) { 4453 raw_inode->i_uid_high = 0; 4454 raw_inode->i_gid_high = 0; 4455 } else { 4456 raw_inode->i_uid_high = 4457 cpu_to_le16(high_16_bits(i_uid)); 4458 raw_inode->i_gid_high = 4459 cpu_to_le16(high_16_bits(i_gid)); 4460 } 4461 } else { 4462 raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); 4463 raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); 4464 raw_inode->i_uid_high = 0; 4465 raw_inode->i_gid_high = 0; 4466 } 4467 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 4468 4469 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 4470 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 4471 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 4472 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 4473 4474 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4475 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); 4476 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) 4477 raw_inode->i_file_acl_high = 4478 cpu_to_le16(ei->i_file_acl >> 32); 4479 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 4480 ext4_isize_set(raw_inode, ei->i_disksize); 4481 4482 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 4483 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 4484 if (old_valid_dev(inode->i_rdev)) { 4485 raw_inode->i_block[0] = 4486 cpu_to_le32(old_encode_dev(inode->i_rdev)); 4487 raw_inode->i_block[1] = 0; 4488 } else { 4489 raw_inode->i_block[0] = 0; 4490 raw_inode->i_block[1] = 4491 cpu_to_le32(new_encode_dev(inode->i_rdev)); 4492 raw_inode->i_block[2] = 0; 4493 } 4494 } else if (!ext4_has_inline_data(inode)) { 4495 for (block = 0; block < EXT4_N_BLOCKS; block++) 4496 raw_inode->i_block[block] = ei->i_data[block]; 4497 } 4498 4499 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { 4500 u64 ivers = ext4_inode_peek_iversion(inode); 4501 4502 raw_inode->i_disk_version = cpu_to_le32(ivers); 4503 if (ei->i_extra_isize) { 4504 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4505 raw_inode->i_version_hi = 4506 
cpu_to_le32(ivers >> 32); 4507 raw_inode->i_extra_isize = 4508 cpu_to_le16(ei->i_extra_isize); 4509 } 4510 } 4511 4512 if (i_projid != EXT4_DEF_PROJID && 4513 !ext4_has_feature_project(inode->i_sb)) 4514 err = err ?: -EFSCORRUPTED; 4515 4516 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 4517 EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) 4518 raw_inode->i_projid = cpu_to_le32(i_projid); 4519 4520 ext4_inode_csum_set(inode, raw_inode, ei); 4521 return err; 4522 } 4523 4524 /* 4525 * ext4_get_inode_loc returns with an extra refcount against the inode's 4526 * underlying buffer_head on success. If we pass 'inode' and it does not 4527 * have in-inode xattr, we have all inode data in memory that is needed 4528 * to recreate the on-disk version of this inode. 4529 */ 4530 static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, 4531 struct inode *inode, struct ext4_iloc *iloc, 4532 ext4_fsblk_t *ret_block) 4533 { 4534 struct ext4_group_desc *gdp; 4535 struct buffer_head *bh; 4536 ext4_fsblk_t block; 4537 struct blk_plug plug; 4538 int inodes_per_block, inode_offset; 4539 4540 iloc->bh = NULL; 4541 if (ino < EXT4_ROOT_INO || 4542 ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) 4543 return -EFSCORRUPTED; 4544 4545 iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 4546 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); 4547 if (!gdp) 4548 return -EIO; 4549 4550 /* 4551 * Figure out the offset within the block group inode table 4552 */ 4553 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; 4554 inode_offset = ((ino - 1) % 4555 EXT4_INODES_PER_GROUP(sb)); 4556 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); 4557 4558 block = ext4_inode_table(sb, gdp); 4559 if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || 4560 (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { 4561 ext4_error(sb, "Invalid inode table block %llu in " 4562 "block_group %u", block, iloc->block_group); 4563 return -EFSCORRUPTED; 4564 } 4565 block += (inode_offset / inodes_per_block); 4566 4567 bh = sb_getblk(sb, block); 4568 if (unlikely(!bh)) 4569 return -ENOMEM; 4570 if (ext4_buffer_uptodate(bh)) 4571 goto has_buffer; 4572 4573 lock_buffer(bh); 4574 if (ext4_buffer_uptodate(bh)) { 4575 /* Someone brought it uptodate while we waited */ 4576 unlock_buffer(bh); 4577 goto has_buffer; 4578 } 4579 4580 /* 4581 * If we have all information of the inode in memory and this 4582 * is the only valid inode in the block, we need not read the 4583 * block. 4584 */ 4585 if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { 4586 struct buffer_head *bitmap_bh; 4587 int i, start; 4588 4589 start = inode_offset & ~(inodes_per_block - 1); 4590 4591 /* Is the inode bitmap in cache? */ 4592 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); 4593 if (unlikely(!bitmap_bh)) 4594 goto make_io; 4595 4596 /* 4597 * If the inode bitmap isn't in cache then the 4598 * optimisation may end up performing two reads instead 4599 * of one, so skip it. 
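 * (Two reads here means one for the inode bitmap plus the inode table
 * block read we were trying to avoid in the first place.)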
4600 */ 4601 if (!buffer_uptodate(bitmap_bh)) { 4602 brelse(bitmap_bh); 4603 goto make_io; 4604 } 4605 for (i = start; i < start + inodes_per_block; i++) { 4606 if (i == inode_offset) 4607 continue; 4608 if (ext4_test_bit(i, bitmap_bh->b_data)) 4609 break; 4610 } 4611 brelse(bitmap_bh); 4612 if (i == start + inodes_per_block) { 4613 struct ext4_inode *raw_inode = 4614 (struct ext4_inode *) (bh->b_data + iloc->offset); 4615 4616 /* all other inodes are free, so skip I/O */ 4617 memset(bh->b_data, 0, bh->b_size); 4618 if (!ext4_test_inode_state(inode, EXT4_STATE_NEW)) 4619 ext4_fill_raw_inode(inode, raw_inode); 4620 set_buffer_uptodate(bh); 4621 unlock_buffer(bh); 4622 goto has_buffer; 4623 } 4624 } 4625 4626 make_io: 4627 /* 4628 * If we need to do any I/O, try to pre-readahead extra 4629 * blocks from the inode table. 4630 */ 4631 blk_start_plug(&plug); 4632 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4633 ext4_fsblk_t b, end, table; 4634 unsigned num; 4635 __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; 4636 4637 table = ext4_inode_table(sb, gdp); 4638 /* s_inode_readahead_blks is always a power of 2 */ 4639 b = block & ~((ext4_fsblk_t) ra_blks - 1); 4640 if (table > b) 4641 b = table; 4642 end = b + ra_blks; 4643 num = EXT4_INODES_PER_GROUP(sb); 4644 if (ext4_has_group_desc_csum(sb)) 4645 num -= ext4_itable_unused_count(sb, gdp); 4646 table += num / inodes_per_block; 4647 if (end > table) 4648 end = table; 4649 while (b <= end) 4650 ext4_sb_breadahead_unmovable(sb, b++); 4651 } 4652 4653 /* 4654 * There are other valid inodes in the buffer, this inode 4655 * has in-inode xattrs, or we don't have this inode in memory. 4656 * Read the block from disk. 4657 */ 4658 trace_ext4_load_inode(sb, ino); 4659 ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); 4660 blk_finish_plug(&plug); 4661 wait_on_buffer(bh); 4662 ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); 4663 if (!buffer_uptodate(bh)) { 4664 if (ret_block) 4665 *ret_block = block; 4666 brelse(bh); 4667 return -EIO; 4668 } 4669 has_buffer: 4670 iloc->bh = bh; 4671 return 0; 4672 } 4673 4674 static int __ext4_get_inode_loc_noinmem(struct inode *inode, 4675 struct ext4_iloc *iloc) 4676 { 4677 ext4_fsblk_t err_blk = 0; 4678 int ret; 4679 4680 ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, 4681 &err_blk); 4682 4683 if (ret == -EIO) 4684 ext4_error_inode_block(inode, err_blk, EIO, 4685 "unable to read itable block"); 4686 4687 return ret; 4688 } 4689 4690 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 4691 { 4692 ext4_fsblk_t err_blk = 0; 4693 int ret; 4694 4695 ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, 4696 &err_blk); 4697 4698 if (ret == -EIO) 4699 ext4_error_inode_block(inode, err_blk, EIO, 4700 "unable to read itable block"); 4701 4702 return ret; 4703 } 4704 4705 4706 int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, 4707 struct ext4_iloc *iloc) 4708 { 4709 return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL); 4710 } 4711 4712 static bool ext4_should_enable_dax(struct inode *inode) 4713 { 4714 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4715 4716 if (test_opt2(inode->i_sb, DAX_NEVER)) 4717 return false; 4718 if (!S_ISREG(inode->i_mode)) 4719 return false; 4720 if (ext4_should_journal_data(inode)) 4721 return false; 4722 if (ext4_has_inline_data(inode)) 4723 return false; 4724 if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) 4725 return false; 4726 if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) 4727 return false; 4728 if 
(!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) 4729 return false; 4730 if (test_opt(inode->i_sb, DAX_ALWAYS)) 4731 return true; 4732 4733 return ext4_test_inode_flag(inode, EXT4_INODE_DAX); 4734 } 4735 4736 void ext4_set_inode_flags(struct inode *inode, bool init) 4737 { 4738 unsigned int flags = EXT4_I(inode)->i_flags; 4739 unsigned int new_fl = 0; 4740 4741 WARN_ON_ONCE(IS_DAX(inode) && init); 4742 4743 if (flags & EXT4_SYNC_FL) 4744 new_fl |= S_SYNC; 4745 if (flags & EXT4_APPEND_FL) 4746 new_fl |= S_APPEND; 4747 if (flags & EXT4_IMMUTABLE_FL) 4748 new_fl |= S_IMMUTABLE; 4749 if (flags & EXT4_NOATIME_FL) 4750 new_fl |= S_NOATIME; 4751 if (flags & EXT4_DIRSYNC_FL) 4752 new_fl |= S_DIRSYNC; 4753 4754 /* Because of the way inode_set_flags() works we must preserve S_DAX 4755 * here if already set. */ 4756 new_fl |= (inode->i_flags & S_DAX); 4757 if (init && ext4_should_enable_dax(inode)) 4758 new_fl |= S_DAX; 4759 4760 if (flags & EXT4_ENCRYPT_FL) 4761 new_fl |= S_ENCRYPTED; 4762 if (flags & EXT4_CASEFOLD_FL) 4763 new_fl |= S_CASEFOLD; 4764 if (flags & EXT4_VERITY_FL) 4765 new_fl |= S_VERITY; 4766 inode_set_flags(inode, new_fl, 4767 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| 4768 S_ENCRYPTED|S_CASEFOLD|S_VERITY); 4769 } 4770 4771 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4772 struct ext4_inode_info *ei) 4773 { 4774 blkcnt_t i_blocks ; 4775 struct inode *inode = &(ei->vfs_inode); 4776 struct super_block *sb = inode->i_sb; 4777 4778 if (ext4_has_feature_huge_file(sb)) { 4779 /* we are using combined 48 bit field */ 4780 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4781 le32_to_cpu(raw_inode->i_blocks_lo); 4782 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { 4783 /* i_blocks represent file system block size */ 4784 return i_blocks << (inode->i_blkbits - 9); 4785 } else { 4786 return i_blocks; 4787 } 4788 } else { 4789 return le32_to_cpu(raw_inode->i_blocks_lo); 4790 } 4791 } 4792 4793 static inline int ext4_iget_extra_inode(struct inode *inode, 4794 struct ext4_inode *raw_inode, 4795 struct ext4_inode_info *ei) 4796 { 4797 __le32 *magic = (void *)raw_inode + 4798 EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; 4799 4800 if (EXT4_INODE_HAS_XATTR_SPACE(inode) && 4801 *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { 4802 ext4_set_inode_state(inode, EXT4_STATE_XATTR); 4803 return ext4_find_inline_data_nolock(inode); 4804 } else 4805 EXT4_I(inode)->i_inline_off = 0; 4806 return 0; 4807 } 4808 4809 int ext4_get_projid(struct inode *inode, kprojid_t *projid) 4810 { 4811 if (!ext4_has_feature_project(inode->i_sb)) 4812 return -EOPNOTSUPP; 4813 *projid = EXT4_I(inode)->i_projid; 4814 return 0; 4815 } 4816 4817 /* 4818 * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of 4819 * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag 4820 * set. 
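 * The read side applies the same rule; see ext4_inode_peek_iversion()
 * above, which returns the raw value for EA inodes.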
4821 */ 4822 static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) 4823 { 4824 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) 4825 inode_set_iversion_raw(inode, val); 4826 else 4827 inode_set_iversion_queried(inode, val); 4828 } 4829 4830 struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, 4831 ext4_iget_flags flags, const char *function, 4832 unsigned int line) 4833 { 4834 struct ext4_iloc iloc; 4835 struct ext4_inode *raw_inode; 4836 struct ext4_inode_info *ei; 4837 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 4838 struct inode *inode; 4839 journal_t *journal = EXT4_SB(sb)->s_journal; 4840 long ret; 4841 loff_t size; 4842 int block; 4843 uid_t i_uid; 4844 gid_t i_gid; 4845 projid_t i_projid; 4846 4847 if ((!(flags & EXT4_IGET_SPECIAL) && 4848 ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || 4849 ino == le32_to_cpu(es->s_usr_quota_inum) || 4850 ino == le32_to_cpu(es->s_grp_quota_inum) || 4851 ino == le32_to_cpu(es->s_prj_quota_inum) || 4852 ino == le32_to_cpu(es->s_orphan_file_inum))) || 4853 (ino < EXT4_ROOT_INO) || 4854 (ino > le32_to_cpu(es->s_inodes_count))) { 4855 if (flags & EXT4_IGET_HANDLE) 4856 return ERR_PTR(-ESTALE); 4857 __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, 4858 "inode #%lu: comm %s: iget: illegal inode #", 4859 ino, current->comm); 4860 return ERR_PTR(-EFSCORRUPTED); 4861 } 4862 4863 inode = iget_locked(sb, ino); 4864 if (!inode) 4865 return ERR_PTR(-ENOMEM); 4866 if (!(inode->i_state & I_NEW)) 4867 return inode; 4868 4869 ei = EXT4_I(inode); 4870 iloc.bh = NULL; 4871 4872 ret = __ext4_get_inode_loc_noinmem(inode, &iloc); 4873 if (ret < 0) 4874 goto bad_inode; 4875 raw_inode = ext4_raw_inode(&iloc); 4876 4877 if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { 4878 ext4_error_inode(inode, function, line, 0, 4879 "iget: root inode unallocated"); 4880 ret = -EFSCORRUPTED; 4881 goto bad_inode; 4882 } 4883 4884 if ((flags & EXT4_IGET_HANDLE) && 4885 (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { 4886 ret = -ESTALE; 4887 goto bad_inode; 4888 } 4889 4890 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4891 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4892 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4893 EXT4_INODE_SIZE(inode->i_sb) || 4894 (ei->i_extra_isize & 3)) { 4895 ext4_error_inode(inode, function, line, 0, 4896 "iget: bad extra_isize %u " 4897 "(inode size %u)", 4898 ei->i_extra_isize, 4899 EXT4_INODE_SIZE(inode->i_sb)); 4900 ret = -EFSCORRUPTED; 4901 goto bad_inode; 4902 } 4903 } else 4904 ei->i_extra_isize = 0; 4905 4906 /* Precompute checksum seed for inode metadata */ 4907 if (ext4_has_metadata_csum(sb)) { 4908 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4909 __u32 csum; 4910 __le32 inum = cpu_to_le32(inode->i_ino); 4911 __le32 gen = raw_inode->i_generation; 4912 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, 4913 sizeof(inum)); 4914 ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, 4915 sizeof(gen)); 4916 } 4917 4918 if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || 4919 ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) && 4920 (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) { 4921 ext4_error_inode_err(inode, function, line, 0, 4922 EFSBADCRC, "iget: checksum invalid"); 4923 ret = -EFSBADCRC; 4924 goto bad_inode; 4925 } 4926 4927 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4928 i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4929 i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4930 if 
(ext4_has_feature_project(sb) && 4931 EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && 4932 EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) 4933 i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); 4934 else 4935 i_projid = EXT4_DEF_PROJID; 4936 4937 if (!(test_opt(inode->i_sb, NO_UID32))) { 4938 i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4939 i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4940 } 4941 i_uid_write(inode, i_uid); 4942 i_gid_write(inode, i_gid); 4943 ei->i_projid = make_kprojid(&init_user_ns, i_projid); 4944 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); 4945 4946 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 4947 ei->i_inline_off = 0; 4948 ei->i_dir_start_lookup = 0; 4949 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4950 /* We now have enough fields to check if the inode was active or not. 4951 * This is needed because nfsd might try to access dead inodes 4952 * the test is that same one that e2fsck uses 4953 * NeilBrown 1999oct15 4954 */ 4955 if (inode->i_nlink == 0) { 4956 if ((inode->i_mode == 0 || 4957 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && 4958 ino != EXT4_BOOT_LOADER_INO) { 4959 /* this inode is deleted */ 4960 ret = -ESTALE; 4961 goto bad_inode; 4962 } 4963 /* The only unlinked inodes we let through here have 4964 * valid i_mode and are being read by the orphan 4965 * recovery code: that's fine, we're about to complete 4966 * the process of deleting those. 4967 * OR it is the EXT4_BOOT_LOADER_INO which is 4968 * not initialized on a new filesystem. */ 4969 } 4970 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4971 ext4_set_inode_flags(inode, true); 4972 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4973 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4974 if (ext4_has_feature_64bit(sb)) 4975 ei->i_file_acl |= 4976 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4977 inode->i_size = ext4_isize(sb, raw_inode); 4978 if ((size = i_size_read(inode)) < 0) { 4979 ext4_error_inode(inode, function, line, 0, 4980 "iget: bad i_size value: %lld", size); 4981 ret = -EFSCORRUPTED; 4982 goto bad_inode; 4983 } 4984 /* 4985 * If dir_index is not enabled but there's dir with INDEX flag set, 4986 * we'd normally treat htree data as empty space. But with metadata 4987 * checksumming that corrupts checksums so forbid that. 4988 */ 4989 if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && 4990 ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { 4991 ext4_error_inode(inode, function, line, 0, 4992 "iget: Dir with htree data on filesystem without dir_index feature."); 4993 ret = -EFSCORRUPTED; 4994 goto bad_inode; 4995 } 4996 ei->i_disksize = inode->i_size; 4997 #ifdef CONFIG_QUOTA 4998 ei->i_reserved_quota = 0; 4999 #endif 5000 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 5001 ei->i_block_group = iloc.block_group; 5002 ei->i_last_alloc_group = ~0; 5003 /* 5004 * NOTE! The in-memory inode i_data array is in little-endian order 5005 * even on big-endian machines: we do NOT byteswap the block numbers! 5006 */ 5007 for (block = 0; block < EXT4_N_BLOCKS; block++) 5008 ei->i_data[block] = raw_inode->i_block[block]; 5009 INIT_LIST_HEAD(&ei->i_orphan); 5010 ext4_fc_init_inode(&ei->vfs_inode); 5011 5012 /* 5013 * Set transaction id's of transactions that have to be committed 5014 * to finish f[data]sync. 
We set them to currently running transaction 5015 * as we cannot be sure that the inode or some of its metadata isn't 5016 * part of the transaction - the inode could have been reclaimed and 5017 * now it is reread from disk. 5018 */ 5019 if (journal) { 5020 transaction_t *transaction; 5021 tid_t tid; 5022 5023 read_lock(&journal->j_state_lock); 5024 if (journal->j_running_transaction) 5025 transaction = journal->j_running_transaction; 5026 else 5027 transaction = journal->j_committing_transaction; 5028 if (transaction) 5029 tid = transaction->t_tid; 5030 else 5031 tid = journal->j_commit_sequence; 5032 read_unlock(&journal->j_state_lock); 5033 ei->i_sync_tid = tid; 5034 ei->i_datasync_tid = tid; 5035 } 5036 5037 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 5038 if (ei->i_extra_isize == 0) { 5039 /* The extra space is currently unused. Use it. */ 5040 BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); 5041 ei->i_extra_isize = sizeof(struct ext4_inode) - 5042 EXT4_GOOD_OLD_INODE_SIZE; 5043 } else { 5044 ret = ext4_iget_extra_inode(inode, raw_inode, ei); 5045 if (ret) 5046 goto bad_inode; 5047 } 5048 } 5049 5050 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 5051 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 5052 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 5053 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 5054 5055 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { 5056 u64 ivers = le32_to_cpu(raw_inode->i_disk_version); 5057 5058 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 5059 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 5060 ivers |= 5061 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 5062 } 5063 ext4_inode_set_iversion_queried(inode, ivers); 5064 } 5065 5066 ret = 0; 5067 if (ei->i_file_acl && 5068 !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { 5069 ext4_error_inode(inode, function, line, 0, 5070 "iget: bad extended attribute block %llu", 5071 ei->i_file_acl); 5072 ret = -EFSCORRUPTED; 5073 goto bad_inode; 5074 } else if (!ext4_has_inline_data(inode)) { 5075 /* validate the block references in the inode */ 5076 if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && 5077 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 5078 (S_ISLNK(inode->i_mode) && 5079 !ext4_inode_is_fast_symlink(inode)))) { 5080 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 5081 ret = ext4_ext_check_inode(inode); 5082 else 5083 ret = ext4_ind_check_inode(inode); 5084 } 5085 } 5086 if (ret) 5087 goto bad_inode; 5088 5089 if (S_ISREG(inode->i_mode)) { 5090 inode->i_op = &ext4_file_inode_operations; 5091 inode->i_fop = &ext4_file_operations; 5092 ext4_set_aops(inode); 5093 } else if (S_ISDIR(inode->i_mode)) { 5094 inode->i_op = &ext4_dir_inode_operations; 5095 inode->i_fop = &ext4_dir_operations; 5096 } else if (S_ISLNK(inode->i_mode)) { 5097 /* VFS does not allow setting these so must be corruption */ 5098 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { 5099 ext4_error_inode(inode, function, line, 0, 5100 "iget: immutable or append flags " 5101 "not allowed on symlinks"); 5102 ret = -EFSCORRUPTED; 5103 goto bad_inode; 5104 } 5105 if (IS_ENCRYPTED(inode)) { 5106 inode->i_op = &ext4_encrypted_symlink_inode_operations; 5107 } else if (ext4_inode_is_fast_symlink(inode)) { 5108 inode->i_link = (char *)ei->i_data; 5109 inode->i_op = &ext4_fast_symlink_inode_operations; 5110 nd_terminate_link(ei->i_data, inode->i_size, 5111 sizeof(ei->i_data) - 1); 5112 } else { 5113 inode->i_op = &ext4_symlink_inode_operations; 5114 } 5115 } else if 
(S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 5116 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 5117 inode->i_op = &ext4_special_inode_operations; 5118 if (raw_inode->i_block[0]) 5119 init_special_inode(inode, inode->i_mode, 5120 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 5121 else 5122 init_special_inode(inode, inode->i_mode, 5123 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5124 } else if (ino == EXT4_BOOT_LOADER_INO) { 5125 make_bad_inode(inode); 5126 } else { 5127 ret = -EFSCORRUPTED; 5128 ext4_error_inode(inode, function, line, 0, 5129 "iget: bogus i_mode (%o)", inode->i_mode); 5130 goto bad_inode; 5131 } 5132 if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) 5133 ext4_error_inode(inode, function, line, 0, 5134 "casefold flag without casefold feature"); 5135 if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { 5136 ext4_error_inode(inode, function, line, 0, 5137 "bad inode without EXT4_IGET_BAD flag"); 5138 ret = -EUCLEAN; 5139 goto bad_inode; 5140 } 5141 5142 brelse(iloc.bh); 5143 unlock_new_inode(inode); 5144 return inode; 5145 5146 bad_inode: 5147 brelse(iloc.bh); 5148 iget_failed(inode); 5149 return ERR_PTR(ret); 5150 } 5151 5152 static void __ext4_update_other_inode_time(struct super_block *sb, 5153 unsigned long orig_ino, 5154 unsigned long ino, 5155 struct ext4_inode *raw_inode) 5156 { 5157 struct inode *inode; 5158 5159 inode = find_inode_by_ino_rcu(sb, ino); 5160 if (!inode) 5161 return; 5162 5163 if (!inode_is_dirtytime_only(inode)) 5164 return; 5165 5166 spin_lock(&inode->i_lock); 5167 if (inode_is_dirtytime_only(inode)) { 5168 struct ext4_inode_info *ei = EXT4_I(inode); 5169 5170 inode->i_state &= ~I_DIRTY_TIME; 5171 spin_unlock(&inode->i_lock); 5172 5173 spin_lock(&ei->i_raw_lock); 5174 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 5175 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 5176 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 5177 ext4_inode_csum_set(inode, raw_inode, ei); 5178 spin_unlock(&ei->i_raw_lock); 5179 trace_ext4_other_inode_update_time(inode, orig_ino); 5180 return; 5181 } 5182 spin_unlock(&inode->i_lock); 5183 } 5184 5185 /* 5186 * Opportunistically update the other time fields for other inodes in 5187 * the same inode table block. 5188 */ 5189 static void ext4_update_other_inodes_time(struct super_block *sb, 5190 unsigned long orig_ino, char *buf) 5191 { 5192 unsigned long ino; 5193 int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; 5194 int inode_size = EXT4_INODE_SIZE(sb); 5195 5196 /* 5197 * Calculate the first inode in the inode table block. Inode 5198 * numbers are one-based. That is, the first inode in a block 5199 * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). 5200 */ 5201 ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; 5202 rcu_read_lock(); 5203 for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { 5204 if (ino == orig_ino) 5205 continue; 5206 __ext4_update_other_inode_time(sb, orig_ino, ino, 5207 (struct ext4_inode *)buf); 5208 } 5209 rcu_read_unlock(); 5210 } 5211 5212 /* 5213 * Post the struct inode info into an on-disk inode location in the 5214 * buffer-cache. This gobbles the caller's reference to the 5215 * buffer_head in the inode location struct. 5216 * 5217 * The caller must have write access to iloc->bh. 
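 * In practice "write access" means ext4_journal_get_write_access() has
 * already been done on iloc->bh, typically via
 * ext4_reserve_inode_write().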
5218 */ 5219 static int ext4_do_update_inode(handle_t *handle, 5220 struct inode *inode, 5221 struct ext4_iloc *iloc) 5222 { 5223 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 5224 struct ext4_inode_info *ei = EXT4_I(inode); 5225 struct buffer_head *bh = iloc->bh; 5226 struct super_block *sb = inode->i_sb; 5227 int err; 5228 int need_datasync = 0, set_large_file = 0; 5229 5230 spin_lock(&ei->i_raw_lock); 5231 5232 /* 5233 * For fields not tracked in the in-memory inode, initialise them 5234 * to zero for new inodes. 5235 */ 5236 if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) 5237 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 5238 5239 if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) 5240 need_datasync = 1; 5241 if (ei->i_disksize > 0x7fffffffULL) { 5242 if (!ext4_has_feature_large_file(sb) || 5243 EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) 5244 set_large_file = 1; 5245 } 5246 5247 err = ext4_fill_raw_inode(inode, raw_inode); 5248 spin_unlock(&ei->i_raw_lock); 5249 if (err) { 5250 EXT4_ERROR_INODE(inode, "corrupted inode contents"); 5251 goto out_brelse; 5252 } 5253 5254 if (inode->i_sb->s_flags & SB_LAZYTIME) 5255 ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, 5256 bh->b_data); 5257 5258 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5259 err = ext4_handle_dirty_metadata(handle, NULL, bh); 5260 if (err) 5261 goto out_error; 5262 ext4_clear_inode_state(inode, EXT4_STATE_NEW); 5263 if (set_large_file) { 5264 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); 5265 err = ext4_journal_get_write_access(handle, sb, 5266 EXT4_SB(sb)->s_sbh, 5267 EXT4_JTR_NONE); 5268 if (err) 5269 goto out_error; 5270 lock_buffer(EXT4_SB(sb)->s_sbh); 5271 ext4_set_feature_large_file(sb); 5272 ext4_superblock_csum_set(sb); 5273 unlock_buffer(EXT4_SB(sb)->s_sbh); 5274 ext4_handle_sync(handle); 5275 err = ext4_handle_dirty_metadata(handle, NULL, 5276 EXT4_SB(sb)->s_sbh); 5277 } 5278 ext4_update_inode_fsync_trans(handle, inode, need_datasync); 5279 out_error: 5280 ext4_std_error(inode->i_sb, err); 5281 out_brelse: 5282 brelse(bh); 5283 return err; 5284 } 5285 5286 /* 5287 * ext4_write_inode() 5288 * 5289 * We are called from a few places: 5290 * 5291 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. 5292 * Here, there will be no transaction running. We wait for any running 5293 * transaction to commit. 5294 * 5295 * - Within flush work (sys_sync(), kupdate and such). 5296 * We wait on commit, if told to. 5297 * 5298 * - Within iput_final() -> write_inode_now() 5299 * We wait on commit, if told to. 5300 * 5301 * In all cases it is actually safe for us to return without doing anything, 5302 * because the inode has been copied into a raw inode buffer in 5303 * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL 5304 * writeback. 5305 * 5306 * Note that we are absolutely dependent upon all inode dirtiers doing the 5307 * right thing: they *must* call mark_inode_dirty() after dirtying info in 5308 * which we are interested. 5309 * 5310 * It would be a bug for them to not do this. The code: 5311 * 5312 * mark_inode_dirty(inode) 5313 * stuff(); 5314 * inode->i_size = expr; 5315 * 5316 * is in error because write_inode() could occur while `stuff()' is running, 5317 * and the new i_size will be lost. Plus the inode will no longer be on the 5318 * superblock's dirty inode list. 
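 * The correct pattern, for illustration, is simply:
 *
 *	stuff();
 *	inode->i_size = expr;
 *	mark_inode_dirty(inode);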
5319 */ 5320 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) 5321 { 5322 int err; 5323 5324 if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || 5325 sb_rdonly(inode->i_sb)) 5326 return 0; 5327 5328 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 5329 return -EIO; 5330 5331 if (EXT4_SB(inode->i_sb)->s_journal) { 5332 if (ext4_journal_current_handle()) { 5333 ext4_debug("called recursively, non-PF_MEMALLOC!\n"); 5334 dump_stack(); 5335 return -EIO; 5336 } 5337 5338 /* 5339 * No need to force transaction in WB_SYNC_NONE mode. Also 5340 * ext4_sync_fs() will force the commit after everything is 5341 * written. 5342 */ 5343 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) 5344 return 0; 5345 5346 err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, 5347 EXT4_I(inode)->i_sync_tid); 5348 } else { 5349 struct ext4_iloc iloc; 5350 5351 err = __ext4_get_inode_loc_noinmem(inode, &iloc); 5352 if (err) 5353 return err; 5354 /* 5355 * sync(2) will flush the whole buffer cache. No need to do 5356 * it here separately for each inode. 5357 */ 5358 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) 5359 sync_dirty_buffer(iloc.bh); 5360 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5361 ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, 5362 "IO error syncing inode"); 5363 err = -EIO; 5364 } 5365 brelse(iloc.bh); 5366 } 5367 return err; 5368 } 5369 5370 /* 5371 * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate 5372 * buffers that are attached to a folio straddling i_size and are undergoing 5373 * commit. In that case we have to wait for commit to finish and try again. 5374 */ 5375 static void ext4_wait_for_tail_page_commit(struct inode *inode) 5376 { 5377 unsigned offset; 5378 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 5379 tid_t commit_tid = 0; 5380 int ret; 5381 5382 offset = inode->i_size & (PAGE_SIZE - 1); 5383 /* 5384 * If the folio is fully truncated, we don't need to wait for any commit 5385 * (and we even should not as __ext4_journalled_invalidate_folio() may 5386 * strip all buffers from the folio but keep the folio dirty which can then 5387 * confuse e.g. concurrent ext4_writepage() seeing dirty folio without 5388 * buffers). Also we don't need to wait for any commit if all buffers in 5389 * the folio remain valid. This is most beneficial for the common case of 5390 * blocksize == PAGESIZE. 5391 */ 5392 if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) 5393 return; 5394 while (1) { 5395 struct folio *folio = filemap_lock_folio(inode->i_mapping, 5396 inode->i_size >> PAGE_SHIFT); 5397 if (!folio) 5398 return; 5399 ret = __ext4_journalled_invalidate_folio(folio, offset, 5400 folio_size(folio) - offset); 5401 folio_unlock(folio); 5402 folio_put(folio); 5403 if (ret != -EBUSY) 5404 return; 5405 commit_tid = 0; 5406 read_lock(&journal->j_state_lock); 5407 if (journal->j_committing_transaction) 5408 commit_tid = journal->j_committing_transaction->t_tid; 5409 read_unlock(&journal->j_state_lock); 5410 if (commit_tid) 5411 jbd2_log_wait_commit(journal, commit_tid); 5412 } 5413 } 5414 5415 /* 5416 * ext4_setattr() 5417 * 5418 * Called from notify_change. 5419 * 5420 * We want to trap VFS attempts to truncate the file as soon as 5421 * possible. 
In particular, we want to make sure that when the VFS 5422 * shrinks i_size, we put the inode on the orphan list and modify 5423 * i_disksize immediately, so that during the subsequent flushing of 5424 * dirty pages and freeing of disk blocks, we can guarantee that any 5425 * commit will leave the blocks being flushed in an unused state on 5426 * disk. (On recovery, the inode will get truncated and the blocks will 5427 * be freed, so we have a strong guarantee that no future commit will 5428 * leave these blocks visible to the user.) 5429 * 5430 * Another thing we have to assure is that if we are in ordered mode 5431 * and inode is still attached to the committing transaction, we must 5432 * we start writeout of all the dirty pages which are being truncated. 5433 * This way we are sure that all the data written in the previous 5434 * transaction are already on disk (truncate waits for pages under 5435 * writeback). 5436 * 5437 * Called with inode->i_rwsem down. 5438 */ 5439 int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 5440 struct iattr *attr) 5441 { 5442 struct inode *inode = d_inode(dentry); 5443 int error, rc = 0; 5444 int orphan = 0; 5445 const unsigned int ia_valid = attr->ia_valid; 5446 bool inc_ivers = true; 5447 5448 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 5449 return -EIO; 5450 5451 if (unlikely(IS_IMMUTABLE(inode))) 5452 return -EPERM; 5453 5454 if (unlikely(IS_APPEND(inode) && 5455 (ia_valid & (ATTR_MODE | ATTR_UID | 5456 ATTR_GID | ATTR_TIMES_SET)))) 5457 return -EPERM; 5458 5459 error = setattr_prepare(idmap, dentry, attr); 5460 if (error) 5461 return error; 5462 5463 error = fscrypt_prepare_setattr(dentry, attr); 5464 if (error) 5465 return error; 5466 5467 error = fsverity_prepare_setattr(dentry, attr); 5468 if (error) 5469 return error; 5470 5471 if (is_quota_modification(idmap, inode, attr)) { 5472 error = dquot_initialize(inode); 5473 if (error) 5474 return error; 5475 } 5476 5477 if (i_uid_needs_update(idmap, attr, inode) || 5478 i_gid_needs_update(idmap, attr, inode)) { 5479 handle_t *handle; 5480 5481 /* (user+group)*(old+new) structure, inode write (sb, 5482 * inode block, ? - but truncate inode update has it) */ 5483 handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 5484 (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + 5485 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); 5486 if (IS_ERR(handle)) { 5487 error = PTR_ERR(handle); 5488 goto err_out; 5489 } 5490 5491 /* dquot_transfer() calls back ext4_get_inode_usage() which 5492 * counts xattr inode references. 
5493 */ 5494 down_read(&EXT4_I(inode)->xattr_sem); 5495 error = dquot_transfer(idmap, inode, attr); 5496 up_read(&EXT4_I(inode)->xattr_sem); 5497 5498 if (error) { 5499 ext4_journal_stop(handle); 5500 return error; 5501 } 5502 /* Update corresponding info in inode so that everything is in 5503 * one transaction */ 5504 i_uid_update(idmap, attr, inode); 5505 i_gid_update(idmap, attr, inode); 5506 error = ext4_mark_inode_dirty(handle, inode); 5507 ext4_journal_stop(handle); 5508 if (unlikely(error)) { 5509 return error; 5510 } 5511 } 5512 5513 if (attr->ia_valid & ATTR_SIZE) { 5514 handle_t *handle; 5515 loff_t oldsize = inode->i_size; 5516 loff_t old_disksize; 5517 int shrink = (attr->ia_size < inode->i_size); 5518 5519 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 5520 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5521 5522 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 5523 return -EFBIG; 5524 } 5525 } 5526 if (!S_ISREG(inode->i_mode)) { 5527 return -EINVAL; 5528 } 5529 5530 if (attr->ia_size == inode->i_size) 5531 inc_ivers = false; 5532 5533 if (shrink) { 5534 if (ext4_should_order_data(inode)) { 5535 error = ext4_begin_ordered_truncate(inode, 5536 attr->ia_size); 5537 if (error) 5538 goto err_out; 5539 } 5540 /* 5541 * Blocks are going to be removed from the inode. Wait 5542 * for dio in flight. 5543 */ 5544 inode_dio_wait(inode); 5545 } 5546 5547 filemap_invalidate_lock(inode->i_mapping); 5548 5549 rc = ext4_break_layouts(inode); 5550 if (rc) { 5551 filemap_invalidate_unlock(inode->i_mapping); 5552 goto err_out; 5553 } 5554 5555 if (attr->ia_size != inode->i_size) { 5556 handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); 5557 if (IS_ERR(handle)) { 5558 error = PTR_ERR(handle); 5559 goto out_mmap_sem; 5560 } 5561 if (ext4_handle_valid(handle) && shrink) { 5562 error = ext4_orphan_add(handle, inode); 5563 orphan = 1; 5564 } 5565 /* 5566 * Update c/mtime on truncate up, ext4_truncate() will 5567 * update c/mtime in shrink case below 5568 */ 5569 if (!shrink) { 5570 inode->i_mtime = current_time(inode); 5571 inode->i_ctime = inode->i_mtime; 5572 } 5573 5574 if (shrink) 5575 ext4_fc_track_range(handle, inode, 5576 (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> 5577 inode->i_sb->s_blocksize_bits, 5578 EXT_MAX_BLOCKS - 1); 5579 else 5580 ext4_fc_track_range( 5581 handle, inode, 5582 (oldsize > 0 ? oldsize - 1 : oldsize) >> 5583 inode->i_sb->s_blocksize_bits, 5584 (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> 5585 inode->i_sb->s_blocksize_bits); 5586 5587 down_write(&EXT4_I(inode)->i_data_sem); 5588 old_disksize = EXT4_I(inode)->i_disksize; 5589 EXT4_I(inode)->i_disksize = attr->ia_size; 5590 rc = ext4_mark_inode_dirty(handle, inode); 5591 if (!error) 5592 error = rc; 5593 /* 5594 * We have to update i_size under i_data_sem together 5595 * with i_disksize to avoid races with writeback code 5596 * running ext4_wb_update_i_disksize(). 5597 */ 5598 if (!error) 5599 i_size_write(inode, attr->ia_size); 5600 else 5601 EXT4_I(inode)->i_disksize = old_disksize; 5602 up_write(&EXT4_I(inode)->i_data_sem); 5603 ext4_journal_stop(handle); 5604 if (error) 5605 goto out_mmap_sem; 5606 if (!shrink) { 5607 pagecache_isize_extended(inode, oldsize, 5608 inode->i_size); 5609 } else if (ext4_should_journal_data(inode)) { 5610 ext4_wait_for_tail_page_commit(inode); 5611 } 5612 } 5613 5614 /* 5615 * Truncate pagecache after we've waited for commit 5616 * in data=journal mode to make pages freeable. 
5617 */ 5618 truncate_pagecache(inode, inode->i_size); 5619 /* 5620 * Call ext4_truncate() even if i_size didn't change to 5621 * truncate possible preallocated blocks. 5622 */ 5623 if (attr->ia_size <= oldsize) { 5624 rc = ext4_truncate(inode); 5625 if (rc) 5626 error = rc; 5627 } 5628 out_mmap_sem: 5629 filemap_invalidate_unlock(inode->i_mapping); 5630 } 5631 5632 if (!error) { 5633 if (inc_ivers) 5634 inode_inc_iversion(inode); 5635 setattr_copy(idmap, inode, attr); 5636 mark_inode_dirty(inode); 5637 } 5638 5639 /* 5640 * If the call to ext4_truncate failed to get a transaction handle at 5641 * all, we need to clean up the in-core orphan list manually. 5642 */ 5643 if (orphan && inode->i_nlink) 5644 ext4_orphan_del(NULL, inode); 5645 5646 if (!error && (ia_valid & ATTR_MODE)) 5647 rc = posix_acl_chmod(idmap, dentry, inode->i_mode); 5648 5649 err_out: 5650 if (error) 5651 ext4_std_error(inode->i_sb, error); 5652 if (!error) 5653 error = rc; 5654 return error; 5655 } 5656 5657 u32 ext4_dio_alignment(struct inode *inode) 5658 { 5659 if (fsverity_active(inode)) 5660 return 0; 5661 if (ext4_should_journal_data(inode)) 5662 return 0; 5663 if (ext4_has_inline_data(inode)) 5664 return 0; 5665 if (IS_ENCRYPTED(inode)) { 5666 if (!fscrypt_dio_supported(inode)) 5667 return 0; 5668 return i_blocksize(inode); 5669 } 5670 return 1; /* use the iomap defaults */ 5671 } 5672 5673 int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, 5674 struct kstat *stat, u32 request_mask, unsigned int query_flags) 5675 { 5676 struct inode *inode = d_inode(path->dentry); 5677 struct ext4_inode *raw_inode; 5678 struct ext4_inode_info *ei = EXT4_I(inode); 5679 unsigned int flags; 5680 5681 if ((request_mask & STATX_BTIME) && 5682 EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { 5683 stat->result_mask |= STATX_BTIME; 5684 stat->btime.tv_sec = ei->i_crtime.tv_sec; 5685 stat->btime.tv_nsec = ei->i_crtime.tv_nsec; 5686 } 5687 5688 /* 5689 * Return the DIO alignment restrictions if requested. We only return 5690 * this information when requested, since on encrypted files it might 5691 * take a fair bit of work to get if the file wasn't opened recently. 
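 *
 * As a note on the helper used below: ext4_dio_alignment() returns 0 when
 * direct I/O is not supported at all (verity, data=journal, inline data,
 * or unsupported encryption), 1 when the generic iomap/bdev limits apply,
 * and otherwise the filesystem block size (the supported-encryption case);
 * the dio_mem_align / dio_offset_align values reported below follow that
 * convention.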
 */
        if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
                u32 dio_align = ext4_dio_alignment(inode);

                stat->result_mask |= STATX_DIOALIGN;
                if (dio_align == 1) {
                        struct block_device *bdev = inode->i_sb->s_bdev;

                        /* iomap defaults */
                        stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
                        stat->dio_offset_align = bdev_logical_block_size(bdev);
                } else {
                        stat->dio_mem_align = dio_align;
                        stat->dio_offset_align = dio_align;
                }
        }

        flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
        if (flags & EXT4_APPEND_FL)
                stat->attributes |= STATX_ATTR_APPEND;
        if (flags & EXT4_COMPR_FL)
                stat->attributes |= STATX_ATTR_COMPRESSED;
        if (flags & EXT4_ENCRYPT_FL)
                stat->attributes |= STATX_ATTR_ENCRYPTED;
        if (flags & EXT4_IMMUTABLE_FL)
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        if (flags & EXT4_NODUMP_FL)
                stat->attributes |= STATX_ATTR_NODUMP;
        if (flags & EXT4_VERITY_FL)
                stat->attributes |= STATX_ATTR_VERITY;

        stat->attributes_mask |= (STATX_ATTR_APPEND |
                                  STATX_ATTR_COMPRESSED |
                                  STATX_ATTR_ENCRYPTED |
                                  STATX_ATTR_IMMUTABLE |
                                  STATX_ATTR_NODUMP |
                                  STATX_ATTR_VERITY);

        generic_fillattr(idmap, inode, stat);
        return 0;
}

int ext4_file_getattr(struct mnt_idmap *idmap,
                      const struct path *path, struct kstat *stat,
                      u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        u64 delalloc_blocks;

        ext4_getattr(idmap, path, stat, request_mask, query_flags);

        /*
         * If there is inline data in the inode, the inode will normally not
         * have data blocks allocated (it may have an external xattr block).
         * Report at least one sector for such files, so tools like tar,
         * rsync, and others don't incorrectly think the file is completely
         * sparse.
         */
        if (unlikely(ext4_has_inline_data(inode)))
                stat->blocks += (stat->size + 511) >> 9;

        /*
         * We can't update i_blocks while the block allocation is delayed;
         * otherwise, if the system crashed before the real block allocation
         * was done, i_blocks would be inconsistent with the on-disk file
         * blocks. We always keep i_blocks updated together with the real
         * allocation, but to avoid confusing userspace, stat reports blocks
         * that include the delayed allocation blocks for this file.
         */
        delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
                                   EXT4_I(inode)->i_reserved_data_blocks);
        stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
        return 0;
}

static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
                                   int pextents)
{
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return ext4_ind_trans_blocks(inode, lblocks);
        return ext4_ext_index_trans_blocks(inode, pextents);
}

/*
 * Account for index blocks, block group bitmaps and block group descriptor
 * blocks if we modify both data blocks and index blocks. In the worst case,
 * the index blocks are spread over different block groups.
 *
 * If data blocks are discontiguous, they may spread over different block
 * groups too. Even if they are contiguous, with flexbg they could still
 * cross a block group boundary.
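 *
 * As a rough illustration (an assumed example, not taken from any
 * particular filesystem): with lblocks = pextents = 4 on an extent-mapped
 * file, if ext4_index_trans_blocks() returns 4, then groups = 4 + 4 = 8,
 * so up to 8 bitmap blocks and up to 8 group descriptor blocks (each
 * capped by the group count and the GDT block count) are charged on top
 * of the 4 index blocks, before the fixed EXT4_META_TRANS_BLOCKS()
 * overhead below.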
 *
 * Also account for superblock, inode, quota and xattr blocks.
 */
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
                                  int pextents)
{
        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
        int gdpblocks;
        int idxblocks;
        int ret = 0;

        /*
         * How many index blocks do we need to touch to map @lblocks logical
         * blocks to @pextents physical extents?
         */
        idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);

        ret = idxblocks;

        /*
         * Now let's see how many group bitmaps and group descriptors we
         * need to account for.
         */
        groups = idxblocks + pextents;
        gdpblocks = groups;
        if (groups > ngroups)
                groups = ngroups;
        if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
                gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;

        /* bitmaps and block group descriptor blocks */
        ret += groups + gdpblocks;

        /* Blocks for super block, inode, quota and xattr blocks */
        ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);

        return ret;
}

/*
 * Calculate the total number of credits to reserve to fit
 * the modification of a single page into a single transaction,
 * which may include multiple chunks of block allocations.
 *
 * This could be called via ext4_write_begin().
 *
 * We need to consider the worst case, when we allocate
 * one new block per extent.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
        int bpp = ext4_journal_blocks_per_page(inode);
        int ret;

        ret = ext4_meta_trans_blocks(inode, bpp, bpp);

        /* Account for data blocks for journalled mode */
        if (ext4_should_journal_data(inode))
                ret += bpp;
        return ret;
}

/*
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or any other caller of
 * ext4_map_blocks() that maps/allocates a chunk of contiguous disk blocks.
 *
 * Journal buffers for data blocks are not included here, as DIO
 * and fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
        return ext4_meta_trans_blocks(inode, nrblocks, 1);
}

/*
 * The caller must have previously called ext4_reserve_inode_write().
 * Given this, we know that the caller already has write access to iloc->bh.
 */
int ext4_mark_iloc_dirty(handle_t *handle,
                         struct inode *inode, struct ext4_iloc *iloc)
{
        int err = 0;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
                put_bh(iloc->bh);
                return -EIO;
        }
        ext4_fc_track_inode(handle, inode);

        /* ext4_do_update_inode() consumes one bh->b_count reference */
        get_bh(iloc->bh);

        /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
        err = ext4_do_update_inode(handle, inode, iloc);
        put_bh(iloc->bh);
        return err;
}

/*
 * On success, we end up with an outstanding reference count against
 * iloc->bh.  This _must_ be cleaned up later.
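 *
 * The typical calling pattern (see __ext4_mark_inode_dirty() below) is:
 *
 *      err = ext4_reserve_inode_write(handle, inode, &iloc);
 *      if (!err) {
 *              ... update the in-core inode ...
 *              err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 *      }
 *
 * where ext4_mark_iloc_dirty() consumes the reference to iloc->bh that
 * this function took.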
5887 */ 5888 5889 int 5890 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 5891 struct ext4_iloc *iloc) 5892 { 5893 int err; 5894 5895 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 5896 return -EIO; 5897 5898 err = ext4_get_inode_loc(inode, iloc); 5899 if (!err) { 5900 BUFFER_TRACE(iloc->bh, "get_write_access"); 5901 err = ext4_journal_get_write_access(handle, inode->i_sb, 5902 iloc->bh, EXT4_JTR_NONE); 5903 if (err) { 5904 brelse(iloc->bh); 5905 iloc->bh = NULL; 5906 } 5907 } 5908 ext4_std_error(inode->i_sb, err); 5909 return err; 5910 } 5911 5912 static int __ext4_expand_extra_isize(struct inode *inode, 5913 unsigned int new_extra_isize, 5914 struct ext4_iloc *iloc, 5915 handle_t *handle, int *no_expand) 5916 { 5917 struct ext4_inode *raw_inode; 5918 struct ext4_xattr_ibody_header *header; 5919 unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); 5920 struct ext4_inode_info *ei = EXT4_I(inode); 5921 int error; 5922 5923 /* this was checked at iget time, but double check for good measure */ 5924 if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || 5925 (ei->i_extra_isize & 3)) { 5926 EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", 5927 ei->i_extra_isize, 5928 EXT4_INODE_SIZE(inode->i_sb)); 5929 return -EFSCORRUPTED; 5930 } 5931 if ((new_extra_isize < ei->i_extra_isize) || 5932 (new_extra_isize < 4) || 5933 (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) 5934 return -EINVAL; /* Should never happen */ 5935 5936 raw_inode = ext4_raw_inode(iloc); 5937 5938 header = IHDR(inode, raw_inode); 5939 5940 /* No extended attributes present */ 5941 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || 5942 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5943 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + 5944 EXT4_I(inode)->i_extra_isize, 0, 5945 new_extra_isize - EXT4_I(inode)->i_extra_isize); 5946 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5947 return 0; 5948 } 5949 5950 /* 5951 * We may need to allocate external xattr block so we need quotas 5952 * initialized. Here we can be called with various locks held so we 5953 * cannot affort to initialize quotas ourselves. So just bail. 5954 */ 5955 if (dquot_initialize_needed(inode)) 5956 return -EAGAIN; 5957 5958 /* try to expand with EAs present */ 5959 error = ext4_expand_extra_isize_ea(inode, new_extra_isize, 5960 raw_inode, handle); 5961 if (error) { 5962 /* 5963 * Inode size expansion failed; don't try again 5964 */ 5965 *no_expand = 1; 5966 } 5967 5968 return error; 5969 } 5970 5971 /* 5972 * Expand an inode by new_extra_isize bytes. 5973 * Returns 0 on success or negative error number on failure. 5974 */ 5975 static int ext4_try_to_expand_extra_isize(struct inode *inode, 5976 unsigned int new_extra_isize, 5977 struct ext4_iloc iloc, 5978 handle_t *handle) 5979 { 5980 int no_expand; 5981 int error; 5982 5983 if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) 5984 return -EOVERFLOW; 5985 5986 /* 5987 * In nojournal mode, we can immediately attempt to expand 5988 * the inode. When journaled, we first need to obtain extra 5989 * buffer credits since we may write into the EA block 5990 * with this same handle. If journal_extend fails, then it will 5991 * only result in a minor loss of functionality for that inode. 5992 * If this is felt to be critical, then e2fsck should be run to 5993 * force a large enough s_min_extra_isize. 
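 *
 * Failure here is intentionally non-fatal: __ext4_mark_inode_dirty()
 * ignores the return value and simply marks the inode dirty with the
 * current, unexpanded i_extra_isize.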
5994 */ 5995 if (ext4_journal_extend(handle, 5996 EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) 5997 return -ENOSPC; 5998 5999 if (ext4_write_trylock_xattr(inode, &no_expand) == 0) 6000 return -EBUSY; 6001 6002 error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, 6003 handle, &no_expand); 6004 ext4_write_unlock_xattr(inode, &no_expand); 6005 6006 return error; 6007 } 6008 6009 int ext4_expand_extra_isize(struct inode *inode, 6010 unsigned int new_extra_isize, 6011 struct ext4_iloc *iloc) 6012 { 6013 handle_t *handle; 6014 int no_expand; 6015 int error, rc; 6016 6017 if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { 6018 brelse(iloc->bh); 6019 return -EOVERFLOW; 6020 } 6021 6022 handle = ext4_journal_start(inode, EXT4_HT_INODE, 6023 EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); 6024 if (IS_ERR(handle)) { 6025 error = PTR_ERR(handle); 6026 brelse(iloc->bh); 6027 return error; 6028 } 6029 6030 ext4_write_lock_xattr(inode, &no_expand); 6031 6032 BUFFER_TRACE(iloc->bh, "get_write_access"); 6033 error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, 6034 EXT4_JTR_NONE); 6035 if (error) { 6036 brelse(iloc->bh); 6037 goto out_unlock; 6038 } 6039 6040 error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, 6041 handle, &no_expand); 6042 6043 rc = ext4_mark_iloc_dirty(handle, inode, iloc); 6044 if (!error) 6045 error = rc; 6046 6047 out_unlock: 6048 ext4_write_unlock_xattr(inode, &no_expand); 6049 ext4_journal_stop(handle); 6050 return error; 6051 } 6052 6053 /* 6054 * What we do here is to mark the in-core inode as clean with respect to inode 6055 * dirtiness (it may still be data-dirty). 6056 * This means that the in-core inode may be reaped by prune_icache 6057 * without having to perform any I/O. This is a very good thing, 6058 * because *any* task may call prune_icache - even ones which 6059 * have a transaction open against a different journal. 6060 * 6061 * Is this cheating? Not really. Sure, we haven't written the 6062 * inode out, but prune_icache isn't a user-visible syncing function. 6063 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 6064 * we start and wait on commits. 6065 */ 6066 int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, 6067 const char *func, unsigned int line) 6068 { 6069 struct ext4_iloc iloc; 6070 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 6071 int err; 6072 6073 might_sleep(); 6074 trace_ext4_mark_inode_dirty(inode, _RET_IP_); 6075 err = ext4_reserve_inode_write(handle, inode, &iloc); 6076 if (err) 6077 goto out; 6078 6079 if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) 6080 ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, 6081 iloc, handle); 6082 6083 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 6084 out: 6085 if (unlikely(err)) 6086 ext4_error_inode_err(inode, func, line, 0, err, 6087 "mark_inode_dirty error"); 6088 return err; 6089 } 6090 6091 /* 6092 * ext4_dirty_inode() is called from __mark_inode_dirty() 6093 * 6094 * We're really interested in the case where a file is being extended. 6095 * i_size has been changed by generic_commit_write() and we thus need 6096 * to include the updated inode in the current transaction. 6097 * 6098 * Also, dquot_alloc_block() will always dirty the inode when blocks 6099 * are allocated to the file. 6100 * 6101 * If the inode is marked synchronous, we don't honour that here - doing 6102 * so would cause a commit on atime updates, which we don't bother doing. 
6103 * We handle synchronous inodes at the highest possible level. 6104 */ 6105 void ext4_dirty_inode(struct inode *inode, int flags) 6106 { 6107 handle_t *handle; 6108 6109 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 6110 if (IS_ERR(handle)) 6111 return; 6112 ext4_mark_inode_dirty(handle, inode); 6113 ext4_journal_stop(handle); 6114 } 6115 6116 int ext4_change_inode_journal_flag(struct inode *inode, int val) 6117 { 6118 journal_t *journal; 6119 handle_t *handle; 6120 int err; 6121 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 6122 6123 /* 6124 * We have to be very careful here: changing a data block's 6125 * journaling status dynamically is dangerous. If we write a 6126 * data block to the journal, change the status and then delete 6127 * that block, we risk forgetting to revoke the old log record 6128 * from the journal and so a subsequent replay can corrupt data. 6129 * So, first we make sure that the journal is empty and that 6130 * nobody is changing anything. 6131 */ 6132 6133 journal = EXT4_JOURNAL(inode); 6134 if (!journal) 6135 return 0; 6136 if (is_journal_aborted(journal)) 6137 return -EROFS; 6138 6139 /* Wait for all existing dio workers */ 6140 inode_dio_wait(inode); 6141 6142 /* 6143 * Before flushing the journal and switching inode's aops, we have 6144 * to flush all dirty data the inode has. There can be outstanding 6145 * delayed allocations, there can be unwritten extents created by 6146 * fallocate or buffered writes in dioread_nolock mode covered by 6147 * dirty data which can be converted only after flushing the dirty 6148 * data (and journalled aops don't know how to handle these cases). 6149 */ 6150 if (val) { 6151 filemap_invalidate_lock(inode->i_mapping); 6152 err = filemap_write_and_wait(inode->i_mapping); 6153 if (err < 0) { 6154 filemap_invalidate_unlock(inode->i_mapping); 6155 return err; 6156 } 6157 } 6158 6159 percpu_down_write(&sbi->s_writepages_rwsem); 6160 jbd2_journal_lock_updates(journal); 6161 6162 /* 6163 * OK, there are no updates running now, and all cached data is 6164 * synced to disk. We are now in a completely consistent state 6165 * which doesn't have anything in the journal, and we know that 6166 * no filesystem updates are running, so it is safe to modify 6167 * the inode's in-core data-journaling state flag now. 6168 */ 6169 6170 if (val) 6171 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 6172 else { 6173 err = jbd2_journal_flush(journal, 0); 6174 if (err < 0) { 6175 jbd2_journal_unlock_updates(journal); 6176 percpu_up_write(&sbi->s_writepages_rwsem); 6177 return err; 6178 } 6179 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 6180 } 6181 ext4_set_aops(inode); 6182 6183 jbd2_journal_unlock_updates(journal); 6184 percpu_up_write(&sbi->s_writepages_rwsem); 6185 6186 if (val) 6187 filemap_invalidate_unlock(inode->i_mapping); 6188 6189 /* Finally we can mark the inode as dirty. 
*/ 6190 6191 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 6192 if (IS_ERR(handle)) 6193 return PTR_ERR(handle); 6194 6195 ext4_fc_mark_ineligible(inode->i_sb, 6196 EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); 6197 err = ext4_mark_inode_dirty(handle, inode); 6198 ext4_handle_sync(handle); 6199 ext4_journal_stop(handle); 6200 ext4_std_error(inode->i_sb, err); 6201 6202 return err; 6203 } 6204 6205 static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, 6206 struct buffer_head *bh) 6207 { 6208 return !buffer_mapped(bh); 6209 } 6210 6211 vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) 6212 { 6213 struct vm_area_struct *vma = vmf->vma; 6214 struct page *page = vmf->page; 6215 loff_t size; 6216 unsigned long len; 6217 int err; 6218 vm_fault_t ret; 6219 struct file *file = vma->vm_file; 6220 struct inode *inode = file_inode(file); 6221 struct address_space *mapping = inode->i_mapping; 6222 handle_t *handle; 6223 get_block_t *get_block; 6224 int retries = 0; 6225 6226 if (unlikely(IS_IMMUTABLE(inode))) 6227 return VM_FAULT_SIGBUS; 6228 6229 sb_start_pagefault(inode->i_sb); 6230 file_update_time(vma->vm_file); 6231 6232 filemap_invalidate_lock_shared(mapping); 6233 6234 err = ext4_convert_inline_data(inode); 6235 if (err) 6236 goto out_ret; 6237 6238 /* 6239 * On data journalling we skip straight to the transaction handle: 6240 * there's no delalloc; page truncated will be checked later; the 6241 * early return w/ all buffers mapped (calculates size/len) can't 6242 * be used; and there's no dioread_nolock, so only ext4_get_block. 6243 */ 6244 if (ext4_should_journal_data(inode)) 6245 goto retry_alloc; 6246 6247 /* Delalloc case is easy... */ 6248 if (test_opt(inode->i_sb, DELALLOC) && 6249 !ext4_nonda_switch(inode->i_sb)) { 6250 do { 6251 err = block_page_mkwrite(vma, vmf, 6252 ext4_da_get_block_prep); 6253 } while (err == -ENOSPC && 6254 ext4_should_retry_alloc(inode->i_sb, &retries)); 6255 goto out_ret; 6256 } 6257 6258 lock_page(page); 6259 size = i_size_read(inode); 6260 /* Page got truncated from under us? */ 6261 if (page->mapping != mapping || page_offset(page) > size) { 6262 unlock_page(page); 6263 ret = VM_FAULT_NOPAGE; 6264 goto out; 6265 } 6266 6267 if (page->index == size >> PAGE_SHIFT) 6268 len = size & ~PAGE_MASK; 6269 else 6270 len = PAGE_SIZE; 6271 /* 6272 * Return if we have all the buffers mapped. This avoids the need to do 6273 * journal_start/journal_stop which can block and take a long time 6274 * 6275 * This cannot be done for data journalling, as we have to add the 6276 * inode to the transaction's list to writeprotect pages on commit. 6277 */ 6278 if (page_has_buffers(page)) { 6279 if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page), 6280 0, len, NULL, 6281 ext4_bh_unmapped)) { 6282 /* Wait so that we don't change page under IO */ 6283 wait_for_stable_page(page); 6284 ret = VM_FAULT_LOCKED; 6285 goto out; 6286 } 6287 } 6288 unlock_page(page); 6289 /* OK, we need to fill the hole... */ 6290 if (ext4_should_dioread_nolock(inode)) 6291 get_block = ext4_get_block_unwritten; 6292 else 6293 get_block = ext4_get_block; 6294 retry_alloc: 6295 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 6296 ext4_writepage_trans_blocks(inode)); 6297 if (IS_ERR(handle)) { 6298 ret = VM_FAULT_SIGBUS; 6299 goto out; 6300 } 6301 /* 6302 * Data journalling can't use block_page_mkwrite() because it 6303 * will set_buffer_dirty() before do_journal_get_write_access() 6304 * thus might hit warning messages for dirty metadata buffers. 
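 *
 * Instead, for data=journal we prepare the buffers with
 * __block_write_begin() and then walk them twice: once to get journal
 * write access via do_journal_get_write_access() and once to dirty them
 * via write_end_fn(), finally adding the inode to the running transaction
 * with ext4_jbd2_inode_add_write().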
6305 */ 6306 if (!ext4_should_journal_data(inode)) { 6307 err = block_page_mkwrite(vma, vmf, get_block); 6308 } else { 6309 lock_page(page); 6310 size = i_size_read(inode); 6311 /* Page got truncated from under us? */ 6312 if (page->mapping != mapping || page_offset(page) > size) { 6313 ret = VM_FAULT_NOPAGE; 6314 goto out_error; 6315 } 6316 6317 if (page->index == size >> PAGE_SHIFT) 6318 len = size & ~PAGE_MASK; 6319 else 6320 len = PAGE_SIZE; 6321 6322 err = __block_write_begin(page, 0, len, ext4_get_block); 6323 if (!err) { 6324 ret = VM_FAULT_SIGBUS; 6325 if (ext4_walk_page_buffers(handle, inode, 6326 page_buffers(page), 0, len, NULL, 6327 do_journal_get_write_access)) 6328 goto out_error; 6329 if (ext4_walk_page_buffers(handle, inode, 6330 page_buffers(page), 0, len, NULL, 6331 write_end_fn)) 6332 goto out_error; 6333 if (ext4_jbd2_inode_add_write(handle, inode, 6334 page_offset(page), len)) 6335 goto out_error; 6336 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 6337 } else { 6338 unlock_page(page); 6339 } 6340 } 6341 ext4_journal_stop(handle); 6342 if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 6343 goto retry_alloc; 6344 out_ret: 6345 ret = block_page_mkwrite_return(err); 6346 out: 6347 filemap_invalidate_unlock_shared(mapping); 6348 sb_end_pagefault(inode->i_sb); 6349 return ret; 6350 out_error: 6351 unlock_page(page); 6352 ext4_journal_stop(handle); 6353 goto out; 6354 } 6355