1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/fs/ext4/inode.c 4 * 5 * Copyright (C) 1992, 1993, 1994, 1995 6 * Remy Card (card@masi.ibp.fr) 7 * Laboratoire MASI - Institut Blaise Pascal 8 * Universite Pierre et Marie Curie (Paris VI) 9 * 10 * from 11 * 12 * linux/fs/minix/inode.c 13 * 14 * Copyright (C) 1991, 1992 Linus Torvalds 15 * 16 * 64-bit file support on 64-bit platforms by Jakub Jelinek 17 * (jj@sunsite.ms.mff.cuni.cz) 18 * 19 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 20 */ 21 22 #include <linux/fs.h> 23 #include <linux/mount.h> 24 #include <linux/time.h> 25 #include <linux/highuid.h> 26 #include <linux/pagemap.h> 27 #include <linux/dax.h> 28 #include <linux/quotaops.h> 29 #include <linux/string.h> 30 #include <linux/buffer_head.h> 31 #include <linux/writeback.h> 32 #include <linux/pagevec.h> 33 #include <linux/mpage.h> 34 #include <linux/rmap.h> 35 #include <linux/namei.h> 36 #include <linux/uio.h> 37 #include <linux/bio.h> 38 #include <linux/workqueue.h> 39 #include <linux/kernel.h> 40 #include <linux/printk.h> 41 #include <linux/slab.h> 42 #include <linux/bitops.h> 43 #include <linux/iomap.h> 44 #include <linux/iversion.h> 45 46 #include "ext4_jbd2.h" 47 #include "xattr.h" 48 #include "acl.h" 49 #include "truncate.h" 50 51 #include <kunit/static_stub.h> 52 53 #include <trace/events/ext4.h> 54 55 static void ext4_journalled_zero_new_buffers(handle_t *handle, 56 struct inode *inode, 57 struct folio *folio, 58 unsigned from, unsigned to); 59 60 static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, 61 struct ext4_inode_info *ei) 62 { 63 __u32 csum; 64 __u16 dummy_csum = 0; 65 int offset = offsetof(struct ext4_inode, i_checksum_lo); 66 unsigned int csum_size = sizeof(dummy_csum); 67 68 csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset); 69 csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); 70 offset += csum_size; 71 csum = ext4_chksum(csum, (__u8 *)raw + offset, 72 EXT4_GOOD_OLD_INODE_SIZE - 
offset); 73 74 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 75 offset = offsetof(struct ext4_inode, i_checksum_hi); 76 csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, 77 offset - EXT4_GOOD_OLD_INODE_SIZE); 78 if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { 79 csum = ext4_chksum(csum, (__u8 *)&dummy_csum, 80 csum_size); 81 offset += csum_size; 82 } 83 csum = ext4_chksum(csum, (__u8 *)raw + offset, 84 EXT4_INODE_SIZE(inode->i_sb) - offset); 85 } 86 87 return csum; 88 } 89 90 static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, 91 struct ext4_inode_info *ei) 92 { 93 __u32 provided, calculated; 94 95 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 96 cpu_to_le32(EXT4_OS_LINUX) || 97 !ext4_has_feature_metadata_csum(inode->i_sb)) 98 return 1; 99 100 provided = le16_to_cpu(raw->i_checksum_lo); 101 calculated = ext4_inode_csum(inode, raw, ei); 102 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 103 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) 104 provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; 105 else 106 calculated &= 0xFFFF; 107 108 return provided == calculated; 109 } 110 111 void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, 112 struct ext4_inode_info *ei) 113 { 114 __u32 csum; 115 116 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 117 cpu_to_le32(EXT4_OS_LINUX) || 118 !ext4_has_feature_metadata_csum(inode->i_sb)) 119 return; 120 121 csum = ext4_inode_csum(inode, raw, ei); 122 raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); 123 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 124 EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) 125 raw->i_checksum_hi = cpu_to_le16(csum >> 16); 126 } 127 128 static inline int ext4_begin_ordered_truncate(struct inode *inode, 129 loff_t new_size) 130 { 131 struct jbd2_inode *jinode = READ_ONCE(EXT4_I(inode)->jinode); 132 133 trace_ext4_begin_ordered_truncate(inode, new_size); 134 /* 135 * If jinode is zero, then we never 
opened the file for 136 * writing, so there's no need to call 137 * jbd2_journal_begin_ordered_truncate() since there's no 138 * outstanding writes we need to flush. 139 */ 140 if (!jinode) 141 return 0; 142 return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), 143 jinode, 144 new_size); 145 } 146 147 /* 148 * Test whether an inode is a fast symlink. 149 * A fast symlink has its symlink data stored in ext4_inode_info->i_data. 150 */ 151 int ext4_inode_is_fast_symlink(struct inode *inode) 152 { 153 if (!ext4_has_feature_ea_inode(inode->i_sb)) { 154 int ea_blocks = EXT4_I(inode)->i_file_acl ? 155 EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; 156 157 if (ext4_has_inline_data(inode)) 158 return 0; 159 160 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 161 } 162 return S_ISLNK(inode->i_mode) && inode->i_size && 163 (inode->i_size < EXT4_N_BLOCKS * 4); 164 } 165 166 /* 167 * Called at the last iput() if i_nlink is zero. 168 */ 169 void ext4_evict_inode(struct inode *inode) 170 { 171 handle_t *handle; 172 int err; 173 /* 174 * Credits for final inode cleanup and freeing: 175 * sb + inode (ext4_orphan_del()), block bitmap, group descriptor 176 * (xattr block freeing), bitmap, group descriptor (inode freeing) 177 */ 178 int extra_credits = 6; 179 struct ext4_xattr_inode_array *ea_inode_array = NULL; 180 bool freeze_protected = false; 181 182 trace_ext4_evict_inode(inode); 183 184 dax_break_layout_final(inode); 185 186 if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) 187 ext4_evict_ea_inode(inode); 188 if (inode->i_nlink) { 189 /* 190 * If there's dirty page will lead to data loss, user 191 * could see stale data. 
192 */ 193 if (unlikely(!ext4_emergency_state(inode->i_sb) && 194 mapping_tagged(&inode->i_data, PAGECACHE_TAG_DIRTY))) 195 ext4_warning_inode(inode, "data will be lost"); 196 197 truncate_inode_pages_final(&inode->i_data); 198 199 goto no_delete; 200 } 201 202 if (is_bad_inode(inode)) 203 goto no_delete; 204 dquot_initialize(inode); 205 206 if (ext4_should_order_data(inode)) 207 ext4_begin_ordered_truncate(inode, 0); 208 truncate_inode_pages_final(&inode->i_data); 209 210 /* 211 * For inodes with journalled data, transaction commit could have 212 * dirtied the inode. And for inodes with dioread_nolock, unwritten 213 * extents converting worker could merge extents and also have dirtied 214 * the inode. Flush worker is ignoring it because of I_FREEING flag but 215 * we still need to remove the inode from the writeback lists. 216 */ 217 inode_io_list_del(inode); 218 219 /* 220 * Protect us against freezing - iput() caller didn't have to have any 221 * protection against it. When we are in a running transaction though, 222 * we are already protected against freezing and we cannot grab further 223 * protection due to lock ordering constraints. 224 */ 225 if (!ext4_journal_current_handle()) { 226 sb_start_intwrite(inode->i_sb); 227 freeze_protected = true; 228 } 229 230 if (!IS_NOQUOTA(inode)) 231 extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); 232 233 /* 234 * Block bitmap, group descriptor, and inode are accounted in both 235 * ext4_blocks_for_truncate() and extra_credits. So subtract 3. 236 */ 237 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, 238 ext4_blocks_for_truncate(inode) + extra_credits - 3); 239 if (IS_ERR(handle)) { 240 ext4_std_error(inode->i_sb, PTR_ERR(handle)); 241 /* 242 * If we're going to skip the normal cleanup, we still need to 243 * make sure that the in-core orphan linked list is properly 244 * cleaned up. 
245 */ 246 ext4_orphan_del(NULL, inode); 247 if (freeze_protected) 248 sb_end_intwrite(inode->i_sb); 249 goto no_delete; 250 } 251 252 if (IS_SYNC(inode)) 253 ext4_handle_sync(handle); 254 255 /* 256 * Set inode->i_size to 0 before calling ext4_truncate(). We need 257 * special handling of symlinks here because i_size is used to 258 * determine whether ext4_inode_info->i_data contains symlink data or 259 * block mappings. Setting i_size to 0 will remove its fast symlink 260 * status. Erase i_data so that it becomes a valid empty block map. 261 */ 262 if (ext4_inode_is_fast_symlink(inode)) 263 memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); 264 inode->i_size = 0; 265 err = ext4_mark_inode_dirty(handle, inode); 266 if (err) { 267 ext4_warning(inode->i_sb, 268 "couldn't mark inode dirty (err %d)", err); 269 goto stop_handle; 270 } 271 if (inode->i_blocks) { 272 err = ext4_truncate(inode); 273 if (err) { 274 ext4_error_err(inode->i_sb, -err, 275 "couldn't truncate inode %lu (err %d)", 276 inode->i_ino, err); 277 goto stop_handle; 278 } 279 } 280 281 /* Remove xattr references. */ 282 err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, 283 extra_credits); 284 if (err) { 285 ext4_warning(inode->i_sb, "xattr delete (err %d)", err); 286 stop_handle: 287 ext4_journal_stop(handle); 288 ext4_orphan_del(NULL, inode); 289 if (freeze_protected) 290 sb_end_intwrite(inode->i_sb); 291 ext4_xattr_inode_array_free(ea_inode_array); 292 goto no_delete; 293 } 294 295 /* 296 * Kill off the orphan record which ext4_truncate created. 297 * AKPM: I think this can be inside the above `if'. 298 * Note that ext4_orphan_del() has to be able to cope with the 299 * deletion of a non-existent orphan - this is because we don't 300 * know if ext4_truncate() actually created an orphan record. 
301 * (Well, we could do this if we need to, but heck - it works) 302 */ 303 ext4_orphan_del(handle, inode); 304 EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds(); 305 306 /* 307 * One subtle ordering requirement: if anything has gone wrong 308 * (transaction abort, IO errors, whatever), then we can still 309 * do these next steps (the fs will already have been marked as 310 * having errors), but we can't free the inode if the mark_dirty 311 * fails. 312 */ 313 if (ext4_mark_inode_dirty(handle, inode)) 314 /* If that failed, just do the required in-core inode clear. */ 315 ext4_clear_inode(inode); 316 else 317 ext4_free_inode(handle, inode); 318 ext4_journal_stop(handle); 319 if (freeze_protected) 320 sb_end_intwrite(inode->i_sb); 321 ext4_xattr_inode_array_free(ea_inode_array); 322 return; 323 no_delete: 324 /* 325 * Check out some where else accidentally dirty the evicting inode, 326 * which may probably cause inode use-after-free issues later. 327 */ 328 WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); 329 330 if (!list_empty(&EXT4_I(inode)->i_fc_list)) 331 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); 332 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ 333 } 334 335 #ifdef CONFIG_QUOTA 336 qsize_t *ext4_get_reserved_space(struct inode *inode) 337 { 338 return &EXT4_I(inode)->i_reserved_quota; 339 } 340 #endif 341 342 /* 343 * Called with i_data_sem down, which is important since we can call 344 * ext4_discard_preallocations() from here. 
345 */ 346 void ext4_da_update_reserve_space(struct inode *inode, 347 int used, int quota_claim) 348 { 349 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 350 struct ext4_inode_info *ei = EXT4_I(inode); 351 352 spin_lock(&ei->i_block_reservation_lock); 353 trace_ext4_da_update_reserve_space(inode, used, quota_claim); 354 if (unlikely(used > ei->i_reserved_data_blocks)) { 355 ext4_warning(inode->i_sb, "%s: ino %lu, used %d " 356 "with only %d reserved data blocks", 357 __func__, inode->i_ino, used, 358 ei->i_reserved_data_blocks); 359 WARN_ON(1); 360 used = ei->i_reserved_data_blocks; 361 } 362 363 /* Update per-inode reservations */ 364 ei->i_reserved_data_blocks -= used; 365 percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); 366 367 spin_unlock(&ei->i_block_reservation_lock); 368 369 /* Update quota subsystem for data blocks */ 370 if (quota_claim) 371 dquot_claim_block(inode, EXT4_C2B(sbi, used)); 372 else { 373 /* 374 * We did fallocate with an offset that is already delayed 375 * allocated. So on delayed allocated writeback we should 376 * not re-claim the quota for fallocated blocks. 377 */ 378 dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); 379 } 380 381 /* 382 * If we have done all the pending block allocations and if 383 * there aren't any writers on the inode, we can discard the 384 * inode's preallocations. 
385 */ 386 if ((ei->i_reserved_data_blocks == 0) && 387 !inode_is_open_for_write(inode)) 388 ext4_discard_preallocations(inode); 389 } 390 391 static int __check_block_validity(struct inode *inode, const char *func, 392 unsigned int line, 393 struct ext4_map_blocks *map) 394 { 395 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 396 397 if (journal && inode == journal->j_inode) 398 return 0; 399 400 if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { 401 ext4_error_inode(inode, func, line, map->m_pblk, 402 "lblock %lu mapped to illegal pblock %llu " 403 "(length %d)", (unsigned long) map->m_lblk, 404 map->m_pblk, map->m_len); 405 return -EFSCORRUPTED; 406 } 407 return 0; 408 } 409 410 int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, 411 ext4_lblk_t len) 412 { 413 int ret; 414 415 KUNIT_STATIC_STUB_REDIRECT(ext4_issue_zeroout, inode, lblk, pblk, len); 416 417 if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) 418 return fscrypt_zeroout_range(inode, lblk, pblk, len); 419 420 ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); 421 if (ret > 0) 422 ret = 0; 423 424 return ret; 425 } 426 427 /* 428 * For generic regular files, when updating the extent tree, Ext4 should 429 * hold the i_rwsem and invalidate_lock exclusively. This ensures 430 * exclusion against concurrent page faults, as well as reads and writes. 
431 */ 432 #ifdef CONFIG_EXT4_DEBUG 433 void ext4_check_map_extents_env(struct inode *inode) 434 { 435 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 436 return; 437 438 if (!S_ISREG(inode->i_mode) || 439 IS_NOQUOTA(inode) || IS_VERITY(inode) || 440 is_special_ino(inode->i_sb, inode->i_ino) || 441 (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || 442 ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || 443 ext4_verity_in_progress(inode)) 444 return; 445 446 WARN_ON_ONCE(!inode_is_locked(inode) && 447 !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); 448 } 449 #else 450 void ext4_check_map_extents_env(struct inode *inode) {} 451 #endif 452 453 #define check_block_validity(inode, map) \ 454 __check_block_validity((inode), __func__, __LINE__, (map)) 455 456 #ifdef ES_AGGRESSIVE_TEST 457 static void ext4_map_blocks_es_recheck(handle_t *handle, 458 struct inode *inode, 459 struct ext4_map_blocks *es_map, 460 struct ext4_map_blocks *map, 461 int flags) 462 { 463 int retval; 464 465 map->m_flags = 0; 466 /* 467 * There is a race window that the result is not the same. 468 * e.g. xfstests #223 when dioread_nolock enables. The reason 469 * is that we lookup a block mapping in extent status tree with 470 * out taking i_data_sem. So at the time the unwritten extent 471 * could be converted. 472 */ 473 down_read(&EXT4_I(inode)->i_data_sem); 474 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 475 retval = ext4_ext_map_blocks(handle, inode, map, 0); 476 } else { 477 retval = ext4_ind_map_blocks(handle, inode, map, 0); 478 } 479 up_read((&EXT4_I(inode)->i_data_sem)); 480 481 /* 482 * We don't check m_len because extent will be collpased in status 483 * tree. So the m_len might not equal. 
484 */ 485 if (es_map->m_lblk != map->m_lblk || 486 es_map->m_flags != map->m_flags || 487 es_map->m_pblk != map->m_pblk) { 488 printk("ES cache assertion failed for inode: %lu " 489 "es_cached ex [%d/%d/%llu/%x] != " 490 "found ex [%d/%d/%llu/%x] retval %d flags %x\n", 491 inode->i_ino, es_map->m_lblk, es_map->m_len, 492 es_map->m_pblk, es_map->m_flags, map->m_lblk, 493 map->m_len, map->m_pblk, map->m_flags, 494 retval, flags); 495 } 496 } 497 #endif /* ES_AGGRESSIVE_TEST */ 498 499 static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, 500 struct inode *inode, struct ext4_map_blocks *map, 501 unsigned int orig_mlen) 502 { 503 struct ext4_map_blocks map2; 504 unsigned int status, status2; 505 int retval; 506 507 status = map->m_flags & EXT4_MAP_UNWRITTEN ? 508 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 509 510 WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF)); 511 WARN_ON_ONCE(orig_mlen <= map->m_len); 512 513 /* Prepare map2 for lookup in next leaf block */ 514 map2.m_lblk = map->m_lblk + map->m_len; 515 map2.m_len = orig_mlen - map->m_len; 516 map2.m_flags = 0; 517 retval = ext4_ext_map_blocks(handle, inode, &map2, 0); 518 519 if (retval <= 0) { 520 ext4_es_cache_extent(inode, map->m_lblk, map->m_len, 521 map->m_pblk, status); 522 return map->m_len; 523 } 524 525 if (unlikely(retval != map2.m_len)) { 526 ext4_warning(inode->i_sb, 527 "ES len assertion failed for inode " 528 "%lu: retval %d != map->m_len %d", 529 inode->i_ino, retval, map2.m_len); 530 WARN_ON(1); 531 } 532 533 status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ? 534 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 535 536 /* 537 * If map2 is contiguous with map, then let's insert it as a single 538 * extent in es cache and return the combined length of both the maps. 
539 */ 540 if (map->m_pblk + map->m_len == map2.m_pblk && 541 status == status2) { 542 ext4_es_cache_extent(inode, map->m_lblk, 543 map->m_len + map2.m_len, map->m_pblk, 544 status); 545 map->m_len += map2.m_len; 546 } else { 547 ext4_es_cache_extent(inode, map->m_lblk, map->m_len, 548 map->m_pblk, status); 549 } 550 551 return map->m_len; 552 } 553 554 int ext4_map_query_blocks(handle_t *handle, struct inode *inode, 555 struct ext4_map_blocks *map, int flags) 556 { 557 unsigned int status; 558 int retval; 559 unsigned int orig_mlen = map->m_len; 560 561 flags &= EXT4_EX_QUERY_FILTER; 562 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 563 retval = ext4_ext_map_blocks(handle, inode, map, flags); 564 else 565 retval = ext4_ind_map_blocks(handle, inode, map, flags); 566 if (retval < 0) 567 return retval; 568 569 /* A hole? */ 570 if (retval == 0) 571 goto out; 572 573 if (unlikely(retval != map->m_len)) { 574 ext4_warning(inode->i_sb, 575 "ES len assertion failed for inode " 576 "%lu: retval %d != map->m_len %d", 577 inode->i_ino, retval, map->m_len); 578 WARN_ON(1); 579 } 580 581 /* 582 * No need to query next in leaf: 583 * - if returned extent is not last in leaf or 584 * - if the last in leaf is the full requested range 585 */ 586 if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || 587 map->m_len == orig_mlen) { 588 status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
589 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 590 ext4_es_cache_extent(inode, map->m_lblk, map->m_len, 591 map->m_pblk, status); 592 } else { 593 retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map, 594 orig_mlen); 595 } 596 out: 597 map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); 598 return retval; 599 } 600 601 int ext4_map_create_blocks(handle_t *handle, struct inode *inode, 602 struct ext4_map_blocks *map, int flags) 603 { 604 unsigned int status; 605 int err, retval = 0; 606 607 /* 608 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE 609 * indicates that the blocks and quotas has already been 610 * checked when the data was copied into the page cache. 611 */ 612 if (map->m_flags & EXT4_MAP_DELAYED) 613 flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 614 615 /* 616 * Here we clear m_flags because after allocating an new extent, 617 * it will be set again. 618 */ 619 map->m_flags &= ~EXT4_MAP_FLAGS; 620 621 /* 622 * We need to check for EXT4 here because migrate could have 623 * changed the inode type in between. 624 */ 625 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 626 retval = ext4_ext_map_blocks(handle, inode, map, flags); 627 } else { 628 retval = ext4_ind_map_blocks(handle, inode, map, flags); 629 630 /* 631 * We allocated new blocks which will result in i_data's 632 * format changing. Force the migrate to fail by clearing 633 * migrate flags. 634 */ 635 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) 636 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); 637 } 638 if (retval <= 0) 639 return retval; 640 641 if (unlikely(retval != map->m_len)) { 642 ext4_warning(inode->i_sb, 643 "ES len assertion failed for inode %lu: " 644 "retval %d != map->m_len %d", 645 inode->i_ino, retval, map->m_len); 646 WARN_ON(1); 647 } 648 649 /* 650 * We have to zeroout blocks before inserting them into extent 651 * status tree. Otherwise someone could look them up there and 652 * use them before they are really zeroed. 
We also have to 653 * unmap metadata before zeroing as otherwise writeback can 654 * overwrite zeros with stale data from block device. 655 */ 656 if (flags & EXT4_GET_BLOCKS_ZERO && 657 map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { 658 err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, 659 map->m_len); 660 if (err) 661 return err; 662 } 663 664 status = map->m_flags & EXT4_MAP_UNWRITTEN ? 665 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 666 ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, 667 status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); 668 map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); 669 670 return retval; 671 } 672 673 /* 674 * The ext4_map_blocks() function tries to look up the requested blocks, 675 * and returns if the blocks are already mapped. 676 * 677 * Otherwise it takes the write lock of the i_data_sem and allocate blocks 678 * and store the allocated blocks in the result buffer head and mark it 679 * mapped. 680 * 681 * If file type is extents based, it will call ext4_ext_map_blocks(), 682 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping 683 * based files 684 * 685 * On success, it returns the number of blocks being mapped or allocated. 686 * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are 687 * pre-allocated and unwritten, the resulting @map is marked as unwritten. 688 * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped. 689 * 690 * It returns 0 if plain look up failed (blocks have not been allocated), in 691 * that case, @map is returned as unmapped but we still do fill map->m_len to 692 * indicate the length of a hole starting at map->m_lblk. 693 * 694 * It returns the error in case of allocation failure. 
695 */ 696 int ext4_map_blocks(handle_t *handle, struct inode *inode, 697 struct ext4_map_blocks *map, int flags) 698 { 699 struct extent_status es; 700 int retval; 701 int ret = 0; 702 unsigned int orig_mlen = map->m_len; 703 #ifdef ES_AGGRESSIVE_TEST 704 struct ext4_map_blocks orig_map; 705 706 memcpy(&orig_map, map, sizeof(*map)); 707 #endif 708 709 map->m_flags = 0; 710 ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", 711 flags, map->m_len, (unsigned long) map->m_lblk); 712 713 /* 714 * ext4_map_blocks returns an int, and m_len is an unsigned int 715 */ 716 if (unlikely(map->m_len > INT_MAX)) 717 map->m_len = INT_MAX; 718 719 /* We can handle the block number less than EXT_MAX_BLOCKS */ 720 if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) 721 return -EFSCORRUPTED; 722 723 /* 724 * Callers from the context of data submission are the only exceptions 725 * for regular files that do not hold the i_rwsem or invalidate_lock. 726 * However, caching unrelated ranges is not permitted. 727 */ 728 if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) 729 WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); 730 else 731 ext4_check_map_extents_env(inode); 732 733 /* Lookup extent status tree firstly */ 734 if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { 735 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 736 map->m_pblk = ext4_es_pblock(&es) + 737 map->m_lblk - es.es_lblk; 738 map->m_flags |= ext4_es_is_written(&es) ? 739 EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; 740 retval = es.es_len - (map->m_lblk - es.es_lblk); 741 if (retval > map->m_len) 742 retval = map->m_len; 743 map->m_len = retval; 744 } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { 745 map->m_pblk = 0; 746 map->m_flags |= ext4_es_is_delayed(&es) ? 
747 EXT4_MAP_DELAYED : 0; 748 retval = es.es_len - (map->m_lblk - es.es_lblk); 749 if (retval > map->m_len) 750 retval = map->m_len; 751 map->m_len = retval; 752 retval = 0; 753 } else { 754 BUG(); 755 } 756 757 if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) 758 return retval; 759 #ifdef ES_AGGRESSIVE_TEST 760 ext4_map_blocks_es_recheck(handle, inode, map, 761 &orig_map, flags); 762 #endif 763 if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) || 764 orig_mlen == map->m_len) 765 goto found; 766 767 map->m_len = orig_mlen; 768 } 769 /* 770 * In the query cache no-wait mode, nothing we can do more if we 771 * cannot find extent in the cache. 772 */ 773 if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) 774 return 0; 775 776 /* 777 * Try to see if we can get the block without requesting a new 778 * file system block. 779 */ 780 down_read(&EXT4_I(inode)->i_data_sem); 781 retval = ext4_map_query_blocks(handle, inode, map, flags); 782 up_read((&EXT4_I(inode)->i_data_sem)); 783 784 found: 785 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 786 ret = check_block_validity(inode, map); 787 if (ret != 0) 788 return ret; 789 } 790 791 /* If it is only a block(s) look up */ 792 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) 793 return retval; 794 795 /* 796 * Returns if the blocks have already allocated 797 * 798 * Note that if blocks have been preallocated 799 * ext4_ext_map_blocks() returns with buffer head unmapped 800 */ 801 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 802 /* 803 * If we need to convert extent to unwritten 804 * we continue and do the actual work in 805 * ext4_ext_map_blocks() 806 */ 807 if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) 808 return retval; 809 810 811 ext4_fc_track_inode(handle, inode); 812 /* 813 * New blocks allocate and/or writing to unwritten extent 814 * will possibly result in updating i_data, so we take 815 * the write lock of i_data_sem, and call get_block() 816 * with create == 1 flag. 
817 */ 818 down_write(&EXT4_I(inode)->i_data_sem); 819 retval = ext4_map_create_blocks(handle, inode, map, flags); 820 up_write((&EXT4_I(inode)->i_data_sem)); 821 822 if (retval < 0) 823 ext_debug(inode, "failed with err %d\n", retval); 824 if (retval <= 0) 825 return retval; 826 827 if (map->m_flags & EXT4_MAP_MAPPED) { 828 ret = check_block_validity(inode, map); 829 if (ret != 0) 830 return ret; 831 832 /* 833 * Inodes with freshly allocated blocks where contents will be 834 * visible after transaction commit must be on transaction's 835 * ordered data list. 836 */ 837 if (map->m_flags & EXT4_MAP_NEW && 838 !(map->m_flags & EXT4_MAP_UNWRITTEN) && 839 !(flags & EXT4_GET_BLOCKS_ZERO) && 840 !ext4_is_quota_file(inode) && 841 ext4_should_order_data(inode)) { 842 loff_t start_byte = EXT4_LBLK_TO_B(inode, map->m_lblk); 843 loff_t length = EXT4_LBLK_TO_B(inode, map->m_len); 844 845 if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) 846 ret = ext4_jbd2_inode_add_wait(handle, inode, 847 start_byte, length); 848 else 849 ret = ext4_jbd2_inode_add_write(handle, inode, 850 start_byte, length); 851 if (ret) 852 return ret; 853 } 854 } 855 ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + 856 map->m_len - 1); 857 return retval; 858 } 859 860 /* 861 * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages 862 * we have to be careful as someone else may be manipulating b_state as well. 863 */ 864 static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) 865 { 866 unsigned long old_state; 867 unsigned long new_state; 868 869 flags &= EXT4_MAP_FLAGS; 870 871 /* Dummy buffer_head? Set non-atomically. */ 872 if (!bh->b_folio) { 873 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; 874 return; 875 } 876 /* 877 * Someone else may be modifying b_state. Be careful! This is ugly but 878 * once we get rid of using bh as a container for mapping information 879 * to pass to / from get_block functions, this can go away. 
880 */ 881 old_state = READ_ONCE(bh->b_state); 882 do { 883 new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; 884 } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); 885 } 886 887 /* 888 * Make sure that the current journal transaction has enough credits to map 889 * one extent. Return -EAGAIN if it cannot extend the current running 890 * transaction. 891 */ 892 static inline int ext4_journal_ensure_extent_credits(handle_t *handle, 893 struct inode *inode) 894 { 895 int credits; 896 int ret; 897 898 /* Called from ext4_da_write_begin() which has no handle started? */ 899 if (!handle) 900 return 0; 901 902 credits = ext4_chunk_trans_blocks(inode, 1); 903 ret = __ext4_journal_ensure_credits(handle, credits, credits, 0); 904 return ret <= 0 ? ret : -EAGAIN; 905 } 906 907 static int _ext4_get_block(struct inode *inode, sector_t iblock, 908 struct buffer_head *bh, int flags) 909 { 910 struct ext4_map_blocks map; 911 int ret = 0; 912 913 if (ext4_has_inline_data(inode)) 914 return -ERANGE; 915 916 map.m_lblk = iblock; 917 map.m_len = bh->b_size >> inode->i_blkbits; 918 919 ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, 920 flags); 921 if (ret > 0) { 922 map_bh(bh, inode->i_sb, map.m_pblk); 923 ext4_update_bh_state(bh, map.m_flags); 924 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 925 ret = 0; 926 } else if (ret == 0) { 927 /* hole case, need to fill in bh->b_size */ 928 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 929 } 930 return ret; 931 } 932 933 int ext4_get_block(struct inode *inode, sector_t iblock, 934 struct buffer_head *bh, int create) 935 { 936 return _ext4_get_block(inode, iblock, bh, 937 create ? EXT4_GET_BLOCKS_CREATE : 0); 938 } 939 940 /* 941 * Get block function used when preparing for buffered write if we require 942 * creating an unwritten extent if blocks haven't been allocated. The extent 943 * will be converted to written after the IO is complete. 
944 */ 945 int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, 946 struct buffer_head *bh_result, int create) 947 { 948 int ret = 0; 949 950 ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", 951 inode->i_ino, create); 952 ret = _ext4_get_block(inode, iblock, bh_result, 953 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); 954 955 /* 956 * If the buffer is marked unwritten, mark it as new to make sure it is 957 * zeroed out correctly in case of partial writes. Otherwise, there is 958 * a chance of stale data getting exposed. 959 */ 960 if (ret == 0 && buffer_unwritten(bh_result)) 961 set_buffer_new(bh_result); 962 963 return ret; 964 } 965 966 /* Maximum number of blocks we map for direct IO at once. */ 967 #define DIO_MAX_BLOCKS 4096 968 969 /* 970 * `handle' can be NULL if create is zero 971 */ 972 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 973 ext4_lblk_t block, int map_flags) 974 { 975 struct ext4_map_blocks map; 976 struct buffer_head *bh; 977 int create = map_flags & EXT4_GET_BLOCKS_CREATE; 978 bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT; 979 int err; 980 981 ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 982 || handle != NULL || create == 0); 983 ASSERT(create == 0 || !nowait); 984 985 map.m_lblk = block; 986 map.m_len = 1; 987 err = ext4_map_blocks(handle, inode, &map, map_flags); 988 989 if (err == 0) 990 return create ? ERR_PTR(-ENOSPC) : NULL; 991 if (err < 0) 992 return ERR_PTR(err); 993 994 if (nowait) 995 return sb_find_get_block(inode->i_sb, map.m_pblk); 996 997 /* 998 * Since bh could introduce extra ref count such as referred by 999 * journal_head etc. Try to avoid using __GFP_MOVABLE here 1000 * as it may fail the migration when journal_head remains. 
1001 */ 1002 bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk, 1003 inode->i_sb->s_blocksize); 1004 1005 if (unlikely(!bh)) 1006 return ERR_PTR(-ENOMEM); 1007 if (map.m_flags & EXT4_MAP_NEW) { 1008 ASSERT(create != 0); 1009 ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 1010 || (handle != NULL)); 1011 1012 /* 1013 * Now that we do not always journal data, we should 1014 * keep in mind whether this should always journal the 1015 * new buffer as metadata. For now, regular file 1016 * writes use ext4_get_block instead, so it's not a 1017 * problem. 1018 */ 1019 lock_buffer(bh); 1020 BUFFER_TRACE(bh, "call get_create_access"); 1021 err = ext4_journal_get_create_access(handle, inode->i_sb, bh, 1022 EXT4_JTR_NONE); 1023 if (unlikely(err)) { 1024 unlock_buffer(bh); 1025 goto errout; 1026 } 1027 if (!buffer_uptodate(bh)) { 1028 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1029 set_buffer_uptodate(bh); 1030 } 1031 unlock_buffer(bh); 1032 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1033 err = ext4_handle_dirty_metadata(handle, inode, bh); 1034 if (unlikely(err)) 1035 goto errout; 1036 } else 1037 BUFFER_TRACE(bh, "not a new buffer"); 1038 return bh; 1039 errout: 1040 brelse(bh); 1041 return ERR_PTR(err); 1042 } 1043 1044 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1045 ext4_lblk_t block, int map_flags) 1046 { 1047 struct buffer_head *bh; 1048 int ret; 1049 1050 bh = ext4_getblk(handle, inode, block, map_flags); 1051 if (IS_ERR(bh)) 1052 return bh; 1053 if (!bh || ext4_buffer_uptodate(bh)) 1054 return bh; 1055 1056 ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true); 1057 if (ret) { 1058 put_bh(bh); 1059 return ERR_PTR(ret); 1060 } 1061 return bh; 1062 } 1063 1064 /* Read a contiguous batch of blocks. 
*/ 1065 int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, 1066 bool wait, struct buffer_head **bhs) 1067 { 1068 int i, err; 1069 1070 for (i = 0; i < bh_count; i++) { 1071 bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); 1072 if (IS_ERR(bhs[i])) { 1073 err = PTR_ERR(bhs[i]); 1074 bh_count = i; 1075 goto out_brelse; 1076 } 1077 } 1078 1079 for (i = 0; i < bh_count; i++) 1080 /* Note that NULL bhs[i] is valid because of holes. */ 1081 if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) 1082 ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false); 1083 1084 if (!wait) 1085 return 0; 1086 1087 for (i = 0; i < bh_count; i++) 1088 if (bhs[i]) 1089 wait_on_buffer(bhs[i]); 1090 1091 for (i = 0; i < bh_count; i++) { 1092 if (bhs[i] && !buffer_uptodate(bhs[i])) { 1093 err = -EIO; 1094 goto out_brelse; 1095 } 1096 } 1097 return 0; 1098 1099 out_brelse: 1100 for (i = 0; i < bh_count; i++) { 1101 brelse(bhs[i]); 1102 bhs[i] = NULL; 1103 } 1104 return err; 1105 } 1106 1107 int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, 1108 struct buffer_head *head, 1109 unsigned from, 1110 unsigned to, 1111 int *partial, 1112 int (*fn)(handle_t *handle, struct inode *inode, 1113 struct buffer_head *bh)) 1114 { 1115 struct buffer_head *bh; 1116 unsigned block_start, block_end; 1117 unsigned blocksize = head->b_size; 1118 int err, ret = 0; 1119 struct buffer_head *next; 1120 1121 for (bh = head, block_start = 0; 1122 ret == 0 && (bh != head || !block_start); 1123 block_start = block_end, bh = next) { 1124 next = bh->b_this_page; 1125 block_end = block_start + blocksize; 1126 if (block_end <= from || block_start >= to) { 1127 if (partial && !buffer_uptodate(bh)) 1128 *partial = 1; 1129 continue; 1130 } 1131 err = (*fn)(handle, inode, bh); 1132 if (!ret) 1133 ret = err; 1134 } 1135 return ret; 1136 } 1137 1138 /* 1139 * Helper for handling dirtying of journalled data. 
We also mark the folio as 1140 * dirty so that writeback code knows about this page (and inode) contains 1141 * dirty data. ext4_writepages() then commits appropriate transaction to 1142 * make data stable. 1143 */ 1144 static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) 1145 { 1146 struct folio *folio = bh->b_folio; 1147 struct inode *inode = folio->mapping->host; 1148 1149 /* only regular files have a_ops */ 1150 if (S_ISREG(inode->i_mode)) 1151 folio_mark_dirty(folio); 1152 return ext4_handle_dirty_metadata(handle, NULL, bh); 1153 } 1154 1155 int do_journal_get_write_access(handle_t *handle, struct inode *inode, 1156 struct buffer_head *bh) 1157 { 1158 if (!buffer_mapped(bh) || buffer_freed(bh)) 1159 return 0; 1160 BUFFER_TRACE(bh, "get write access"); 1161 return ext4_journal_get_write_access(handle, inode->i_sb, bh, 1162 EXT4_JTR_NONE); 1163 } 1164 1165 int ext4_block_write_begin(handle_t *handle, struct folio *folio, 1166 loff_t pos, unsigned len, 1167 get_block_t *get_block) 1168 { 1169 unsigned int from = offset_in_folio(folio, pos); 1170 unsigned to = from + len; 1171 struct inode *inode = folio->mapping->host; 1172 unsigned block_start, block_end; 1173 sector_t block; 1174 int err = 0; 1175 unsigned int blocksize = i_blocksize(inode); 1176 struct buffer_head *bh, *head, *wait[2]; 1177 int nr_wait = 0; 1178 int i; 1179 bool should_journal_data = ext4_should_journal_data(inode); 1180 1181 BUG_ON(!folio_test_locked(folio)); 1182 BUG_ON(to > folio_size(folio)); 1183 BUG_ON(from > to); 1184 WARN_ON_ONCE(blocksize > folio_size(folio)); 1185 1186 head = folio_buffers(folio); 1187 if (!head) 1188 head = create_empty_buffers(folio, blocksize, 0); 1189 block = EXT4_PG_TO_LBLK(inode, folio->index); 1190 1191 for (bh = head, block_start = 0; bh != head || !block_start; 1192 block++, block_start = block_end, bh = bh->b_this_page) { 1193 block_end = block_start + blocksize; 1194 if (block_end <= from || block_start >= to) { 1195 if 
(folio_test_uptodate(folio)) { 1196 set_buffer_uptodate(bh); 1197 } 1198 continue; 1199 } 1200 if (WARN_ON_ONCE(buffer_new(bh))) 1201 clear_buffer_new(bh); 1202 if (!buffer_mapped(bh)) { 1203 WARN_ON(bh->b_size != blocksize); 1204 err = ext4_journal_ensure_extent_credits(handle, inode); 1205 if (!err) 1206 err = get_block(inode, block, bh, 1); 1207 if (err) 1208 break; 1209 if (buffer_new(bh)) { 1210 /* 1211 * We may be zeroing partial buffers or all new 1212 * buffers in case of failure. Prepare JBD2 for 1213 * that. 1214 */ 1215 if (should_journal_data) 1216 do_journal_get_write_access(handle, 1217 inode, bh); 1218 if (folio_test_uptodate(folio)) { 1219 /* 1220 * Unlike __block_write_begin() we leave 1221 * dirtying of new uptodate buffers to 1222 * ->write_end() time or 1223 * folio_zero_new_buffers(). 1224 */ 1225 set_buffer_uptodate(bh); 1226 continue; 1227 } 1228 if (block_end > to || block_start < from) 1229 folio_zero_segments(folio, to, 1230 block_end, 1231 block_start, from); 1232 continue; 1233 } 1234 } 1235 if (folio_test_uptodate(folio)) { 1236 set_buffer_uptodate(bh); 1237 continue; 1238 } 1239 if (!buffer_uptodate(bh) && !buffer_delay(bh) && 1240 !buffer_unwritten(bh) && 1241 (block_start < from || block_end > to)) { 1242 ext4_read_bh_lock(bh, 0, false); 1243 wait[nr_wait++] = bh; 1244 } 1245 } 1246 /* 1247 * If we issued read requests, let them complete. 
1248 */ 1249 for (i = 0; i < nr_wait; i++) { 1250 wait_on_buffer(wait[i]); 1251 if (!buffer_uptodate(wait[i])) 1252 err = -EIO; 1253 } 1254 if (unlikely(err)) { 1255 if (should_journal_data) 1256 ext4_journalled_zero_new_buffers(handle, inode, folio, 1257 from, to); 1258 else 1259 folio_zero_new_buffers(folio, from, to); 1260 } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { 1261 for (i = 0; i < nr_wait; i++) { 1262 int err2; 1263 1264 err2 = fscrypt_decrypt_pagecache_blocks(folio, 1265 blocksize, bh_offset(wait[i])); 1266 if (err2) { 1267 clear_buffer_uptodate(wait[i]); 1268 err = err2; 1269 } 1270 } 1271 } 1272 1273 return err; 1274 } 1275 1276 /* 1277 * To preserve ordering, it is essential that the hole instantiation and 1278 * the data write be encapsulated in a single transaction. We cannot 1279 * close off a transaction and start a new one between the ext4_get_block() 1280 * and the ext4_write_end(). So doing the jbd2_journal_start at the start of 1281 * ext4_write_begin() is the right place. 
1282 */ 1283 static int ext4_write_begin(const struct kiocb *iocb, 1284 struct address_space *mapping, 1285 loff_t pos, unsigned len, 1286 struct folio **foliop, void **fsdata) 1287 { 1288 struct inode *inode = mapping->host; 1289 int ret, needed_blocks; 1290 handle_t *handle; 1291 int retries = 0; 1292 struct folio *folio; 1293 pgoff_t index; 1294 unsigned from, to; 1295 1296 ret = ext4_emergency_state(inode->i_sb); 1297 if (unlikely(ret)) 1298 return ret; 1299 1300 trace_ext4_write_begin(inode, pos, len); 1301 /* 1302 * Reserve one block more for addition to orphan list in case 1303 * we allocate blocks but write fails for some reason 1304 */ 1305 needed_blocks = ext4_chunk_trans_extent(inode, 1306 ext4_journal_blocks_per_folio(inode)) + 1; 1307 index = pos >> PAGE_SHIFT; 1308 1309 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { 1310 ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, 1311 foliop); 1312 if (ret < 0) 1313 return ret; 1314 if (ret == 1) 1315 return 0; 1316 } 1317 1318 /* 1319 * write_begin_get_folio() can take a long time if the 1320 * system is thrashing due to memory pressure, or if the folio 1321 * is being written back. So grab it first before we start 1322 * the transaction handle. This also allows us to allocate 1323 * the folio (if needed) without using GFP_NOFS. 1324 */ 1325 retry_grab: 1326 folio = write_begin_get_folio(iocb, mapping, index, len); 1327 if (IS_ERR(folio)) 1328 return PTR_ERR(folio); 1329 1330 if (len > folio_next_pos(folio) - pos) 1331 len = folio_next_pos(folio) - pos; 1332 1333 from = offset_in_folio(folio, pos); 1334 to = from + len; 1335 1336 /* 1337 * The same as page allocation, we prealloc buffer heads before 1338 * starting the handle. 
1339 */ 1340 if (!folio_buffers(folio)) 1341 create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); 1342 1343 folio_unlock(folio); 1344 1345 retry_journal: 1346 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); 1347 if (IS_ERR(handle)) { 1348 folio_put(folio); 1349 return PTR_ERR(handle); 1350 } 1351 1352 folio_lock(folio); 1353 if (folio->mapping != mapping) { 1354 /* The folio got truncated from under us */ 1355 folio_unlock(folio); 1356 folio_put(folio); 1357 ext4_journal_stop(handle); 1358 goto retry_grab; 1359 } 1360 /* In case writeback began while the folio was unlocked */ 1361 folio_wait_stable(folio); 1362 1363 if (ext4_should_dioread_nolock(inode)) 1364 ret = ext4_block_write_begin(handle, folio, pos, len, 1365 ext4_get_block_unwritten); 1366 else 1367 ret = ext4_block_write_begin(handle, folio, pos, len, 1368 ext4_get_block); 1369 if (!ret && ext4_should_journal_data(inode)) { 1370 ret = ext4_walk_page_buffers(handle, inode, 1371 folio_buffers(folio), from, to, 1372 NULL, do_journal_get_write_access); 1373 } 1374 1375 if (ret) { 1376 bool extended = (pos + len > inode->i_size) && 1377 !ext4_verity_in_progress(inode); 1378 1379 folio_unlock(folio); 1380 /* 1381 * ext4_block_write_begin may have instantiated a few blocks 1382 * outside i_size. Trim these off again. Don't need 1383 * i_size_read because we hold i_rwsem. 1384 * 1385 * Add inode to orphan list in case we crash before 1386 * truncate finishes 1387 */ 1388 if (extended && ext4_can_truncate(inode)) 1389 ext4_orphan_add(handle, inode); 1390 1391 ext4_journal_stop(handle); 1392 if (extended) { 1393 ext4_truncate_failed_write(inode); 1394 /* 1395 * If truncate failed early the inode might 1396 * still be on the orphan list; we need to 1397 * make sure the inode is removed from the 1398 * orphan list in that case. 
1399 */ 1400 if (inode->i_nlink) 1401 ext4_orphan_del(NULL, inode); 1402 } 1403 1404 if (ret == -EAGAIN || 1405 (ret == -ENOSPC && 1406 ext4_should_retry_alloc(inode->i_sb, &retries))) 1407 goto retry_journal; 1408 folio_put(folio); 1409 return ret; 1410 } 1411 *foliop = folio; 1412 return ret; 1413 } 1414 1415 /* For write_end() in data=journal mode */ 1416 static int write_end_fn(handle_t *handle, struct inode *inode, 1417 struct buffer_head *bh) 1418 { 1419 int ret; 1420 if (!buffer_mapped(bh) || buffer_freed(bh)) 1421 return 0; 1422 set_buffer_uptodate(bh); 1423 ret = ext4_dirty_journalled_data(handle, bh); 1424 clear_buffer_meta(bh); 1425 clear_buffer_prio(bh); 1426 clear_buffer_new(bh); 1427 return ret; 1428 } 1429 1430 /* 1431 * We need to pick up the new inode size which generic_commit_write gave us 1432 * `iocb` can be NULL - eg, when called from page_symlink(). 1433 * 1434 * ext4 never places buffers on inode->i_mapping->i_private_list. metadata 1435 * buffers are managed internally. 1436 */ 1437 static int ext4_write_end(const struct kiocb *iocb, 1438 struct address_space *mapping, 1439 loff_t pos, unsigned len, unsigned copied, 1440 struct folio *folio, void *fsdata) 1441 { 1442 handle_t *handle = ext4_journal_current_handle(); 1443 struct inode *inode = mapping->host; 1444 loff_t old_size = inode->i_size; 1445 int ret = 0, ret2; 1446 int i_size_changed = 0; 1447 bool verity = ext4_verity_in_progress(inode); 1448 1449 trace_ext4_write_end(inode, pos, len, copied); 1450 1451 if (ext4_has_inline_data(inode) && 1452 ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) 1453 return ext4_write_inline_data_end(inode, pos, len, copied, 1454 folio); 1455 1456 copied = block_write_end(pos, len, copied, folio); 1457 /* 1458 * it's important to update i_size while still holding folio lock: 1459 * page writeout could otherwise come in and zero beyond i_size. 
1460 * 1461 * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree 1462 * blocks are being written past EOF, so skip the i_size update. 1463 */ 1464 if (!verity) 1465 i_size_changed = ext4_update_inode_size(inode, pos + copied); 1466 folio_unlock(folio); 1467 folio_put(folio); 1468 1469 if (old_size < pos && !verity) { 1470 pagecache_isize_extended(inode, old_size, pos); 1471 ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); 1472 } 1473 /* 1474 * Don't mark the inode dirty under folio lock. First, it unnecessarily 1475 * makes the holding time of folio lock longer. Second, it forces lock 1476 * ordering of folio lock and transaction start for journaling 1477 * filesystems. 1478 */ 1479 if (i_size_changed) 1480 ret = ext4_mark_inode_dirty(handle, inode); 1481 1482 if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) 1483 /* if we have allocated more blocks and copied 1484 * less. We will have blocks allocated outside 1485 * inode->i_size. So truncate them 1486 */ 1487 ext4_orphan_add(handle, inode); 1488 1489 ret2 = ext4_journal_stop(handle); 1490 if (!ret) 1491 ret = ret2; 1492 1493 if (pos + len > inode->i_size && !verity) { 1494 ext4_truncate_failed_write(inode); 1495 /* 1496 * If truncate failed early the inode might still be 1497 * on the orphan list; we need to make sure the inode 1498 * is removed from the orphan list in that case. 1499 */ 1500 if (inode->i_nlink) 1501 ext4_orphan_del(NULL, inode); 1502 } 1503 1504 return ret ? ret : copied; 1505 } 1506 1507 /* 1508 * This is a private version of folio_zero_new_buffers() which doesn't 1509 * set the buffer to be dirty, since in data=journalled mode we need 1510 * to call ext4_dirty_journalled_data() instead. 
1511 */ 1512 static void ext4_journalled_zero_new_buffers(handle_t *handle, 1513 struct inode *inode, 1514 struct folio *folio, 1515 unsigned from, unsigned to) 1516 { 1517 unsigned int block_start = 0, block_end; 1518 struct buffer_head *head, *bh; 1519 1520 bh = head = folio_buffers(folio); 1521 do { 1522 block_end = block_start + bh->b_size; 1523 if (buffer_new(bh)) { 1524 if (block_end > from && block_start < to) { 1525 if (!folio_test_uptodate(folio)) { 1526 unsigned start, size; 1527 1528 start = max(from, block_start); 1529 size = min(to, block_end) - start; 1530 1531 folio_zero_range(folio, start, size); 1532 } 1533 clear_buffer_new(bh); 1534 write_end_fn(handle, inode, bh); 1535 } 1536 } 1537 block_start = block_end; 1538 bh = bh->b_this_page; 1539 } while (bh != head); 1540 } 1541 1542 static int ext4_journalled_write_end(const struct kiocb *iocb, 1543 struct address_space *mapping, 1544 loff_t pos, unsigned len, unsigned copied, 1545 struct folio *folio, void *fsdata) 1546 { 1547 handle_t *handle = ext4_journal_current_handle(); 1548 struct inode *inode = mapping->host; 1549 loff_t old_size = inode->i_size; 1550 int ret = 0, ret2; 1551 int partial = 0; 1552 unsigned from, to; 1553 int size_changed = 0; 1554 bool verity = ext4_verity_in_progress(inode); 1555 1556 trace_ext4_journalled_write_end(inode, pos, len, copied); 1557 from = pos & (PAGE_SIZE - 1); 1558 to = from + len; 1559 1560 BUG_ON(!ext4_handle_valid(handle)); 1561 1562 if (ext4_has_inline_data(inode)) 1563 return ext4_write_inline_data_end(inode, pos, len, copied, 1564 folio); 1565 1566 if (unlikely(copied < len) && !folio_test_uptodate(folio)) { 1567 copied = 0; 1568 ext4_journalled_zero_new_buffers(handle, inode, folio, 1569 from, to); 1570 } else { 1571 if (unlikely(copied < len)) 1572 ext4_journalled_zero_new_buffers(handle, inode, folio, 1573 from + copied, to); 1574 ret = ext4_walk_page_buffers(handle, inode, 1575 folio_buffers(folio), 1576 from, from + copied, &partial, 1577 
write_end_fn); 1578 if (!partial) 1579 folio_mark_uptodate(folio); 1580 } 1581 if (!verity) 1582 size_changed = ext4_update_inode_size(inode, pos + copied); 1583 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1584 folio_unlock(folio); 1585 folio_put(folio); 1586 1587 if (old_size < pos && !verity) { 1588 pagecache_isize_extended(inode, old_size, pos); 1589 ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); 1590 } 1591 1592 if (size_changed) { 1593 ret2 = ext4_mark_inode_dirty(handle, inode); 1594 if (!ret) 1595 ret = ret2; 1596 } 1597 1598 if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) 1599 /* if we have allocated more blocks and copied 1600 * less. We will have blocks allocated outside 1601 * inode->i_size. So truncate them 1602 */ 1603 ext4_orphan_add(handle, inode); 1604 1605 ret2 = ext4_journal_stop(handle); 1606 if (!ret) 1607 ret = ret2; 1608 if (pos + len > inode->i_size && !verity) { 1609 ext4_truncate_failed_write(inode); 1610 /* 1611 * If truncate failed early the inode might still be 1612 * on the orphan list; we need to make sure the inode 1613 * is removed from the orphan list in that case. 1614 */ 1615 if (inode->i_nlink) 1616 ext4_orphan_del(NULL, inode); 1617 } 1618 1619 return ret ? ret : copied; 1620 } 1621 1622 /* 1623 * Reserve space for 'nr_resv' clusters 1624 */ 1625 static int ext4_da_reserve_space(struct inode *inode, int nr_resv) 1626 { 1627 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1628 struct ext4_inode_info *ei = EXT4_I(inode); 1629 int ret; 1630 1631 /* 1632 * We will charge metadata quota at writeout time; this saves 1633 * us from metadata over-estimation, though we may go over by 1634 * a small amount in the end. Here we just reserve for data. 
1635 */ 1636 ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); 1637 if (ret) 1638 return ret; 1639 1640 spin_lock(&ei->i_block_reservation_lock); 1641 if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { 1642 spin_unlock(&ei->i_block_reservation_lock); 1643 dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); 1644 return -ENOSPC; 1645 } 1646 ei->i_reserved_data_blocks += nr_resv; 1647 trace_ext4_da_reserve_space(inode, nr_resv); 1648 spin_unlock(&ei->i_block_reservation_lock); 1649 1650 return 0; /* success */ 1651 } 1652 1653 void ext4_da_release_space(struct inode *inode, int to_free) 1654 { 1655 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1656 struct ext4_inode_info *ei = EXT4_I(inode); 1657 1658 if (!to_free) 1659 return; /* Nothing to release, exit */ 1660 1661 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1662 1663 trace_ext4_da_release_space(inode, to_free); 1664 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1665 /* 1666 * if there aren't enough reserved blocks, then the 1667 * counter is messed up somewhere. Since this 1668 * function is called from invalidate page, it's 1669 * harmless to return without any action. 
1670 */ 1671 ext4_warning(inode->i_sb, "ext4_da_release_space: " 1672 "ino %lu, to_free %d with only %d reserved " 1673 "data blocks", inode->i_ino, to_free, 1674 ei->i_reserved_data_blocks); 1675 WARN_ON(1); 1676 to_free = ei->i_reserved_data_blocks; 1677 } 1678 ei->i_reserved_data_blocks -= to_free; 1679 1680 /* update fs dirty data blocks counter */ 1681 percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); 1682 1683 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1684 1685 dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); 1686 } 1687 1688 /* 1689 * Delayed allocation stuff 1690 */ 1691 1692 struct mpage_da_data { 1693 /* These are input fields for ext4_do_writepages() */ 1694 struct inode *inode; 1695 struct writeback_control *wbc; 1696 unsigned int can_map:1; /* Can writepages call map blocks? */ 1697 1698 /* These are internal state of ext4_do_writepages() */ 1699 loff_t start_pos; /* The start pos to write */ 1700 loff_t next_pos; /* Current pos to examine */ 1701 loff_t end_pos; /* Last pos to examine */ 1702 1703 /* 1704 * Extent to map - this can be after start_pos because that can be 1705 * fully mapped. We somewhat abuse m_flags to store whether the extent 1706 * is delalloc or unwritten. 1707 */ 1708 struct ext4_map_blocks map; 1709 struct ext4_io_submit io_submit; /* IO submission data */ 1710 unsigned int do_map:1; 1711 unsigned int scanned_until_end:1; 1712 unsigned int journalled_more_data:1; 1713 }; 1714 1715 static void mpage_release_unused_pages(struct mpage_da_data *mpd, 1716 bool invalidate) 1717 { 1718 unsigned nr, i; 1719 pgoff_t index, end; 1720 struct folio_batch fbatch; 1721 struct inode *inode = mpd->inode; 1722 struct address_space *mapping = inode->i_mapping; 1723 1724 /* This is necessary when next_pos == 0. 
*/ 1725 if (mpd->start_pos >= mpd->next_pos) 1726 return; 1727 1728 mpd->scanned_until_end = 0; 1729 if (invalidate) { 1730 ext4_lblk_t start, last; 1731 start = EXT4_B_TO_LBLK(inode, mpd->start_pos); 1732 last = mpd->next_pos >> inode->i_blkbits; 1733 1734 /* 1735 * avoid racing with extent status tree scans made by 1736 * ext4_insert_delayed_block() 1737 */ 1738 down_write(&EXT4_I(inode)->i_data_sem); 1739 ext4_es_remove_extent(inode, start, last - start); 1740 up_write(&EXT4_I(inode)->i_data_sem); 1741 } 1742 1743 folio_batch_init(&fbatch); 1744 index = mpd->start_pos >> PAGE_SHIFT; 1745 end = mpd->next_pos >> PAGE_SHIFT; 1746 while (index < end) { 1747 nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); 1748 if (nr == 0) 1749 break; 1750 for (i = 0; i < nr; i++) { 1751 struct folio *folio = fbatch.folios[i]; 1752 1753 if (folio_pos(folio) < mpd->start_pos) 1754 continue; 1755 if (folio_next_index(folio) > end) 1756 continue; 1757 BUG_ON(!folio_test_locked(folio)); 1758 BUG_ON(folio_test_writeback(folio)); 1759 if (invalidate) { 1760 if (folio_mapped(folio)) 1761 folio_clear_dirty_for_io(folio); 1762 block_invalidate_folio(folio, 0, 1763 folio_size(folio)); 1764 folio_clear_uptodate(folio); 1765 } 1766 folio_unlock(folio); 1767 } 1768 folio_batch_release(&fbatch); 1769 } 1770 } 1771 1772 static void ext4_print_free_blocks(struct inode *inode) 1773 { 1774 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1775 struct super_block *sb = inode->i_sb; 1776 struct ext4_inode_info *ei = EXT4_I(inode); 1777 1778 ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", 1779 EXT4_C2B(EXT4_SB(inode->i_sb), 1780 ext4_count_free_clusters(sb))); 1781 ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); 1782 ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", 1783 (long long) EXT4_C2B(EXT4_SB(sb), 1784 percpu_counter_sum(&sbi->s_freeclusters_counter))); 1785 ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", 1786 (long long) EXT4_C2B(EXT4_SB(sb), 1787 
percpu_counter_sum(&sbi->s_dirtyclusters_counter))); 1788 ext4_msg(sb, KERN_CRIT, "Block reservation details"); 1789 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", 1790 ei->i_reserved_data_blocks); 1791 return; 1792 } 1793 1794 /* 1795 * Check whether the cluster containing lblk has been allocated or has 1796 * delalloc reservation. 1797 * 1798 * Returns 0 if the cluster doesn't have either, 1 if it has delalloc 1799 * reservation, 2 if it's already been allocated, negative error code on 1800 * failure. 1801 */ 1802 static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) 1803 { 1804 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1805 int ret; 1806 1807 /* Has delalloc reservation? */ 1808 if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) 1809 return 1; 1810 1811 /* Already been allocated? */ 1812 if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) 1813 return 2; 1814 ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); 1815 if (ret < 0) 1816 return ret; 1817 if (ret > 0) 1818 return 2; 1819 1820 return 0; 1821 } 1822 1823 /* 1824 * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents 1825 * status tree, incrementing the reserved 1826 * cluster/block count or making pending 1827 * reservations where needed 1828 * 1829 * @inode - file containing the newly added block 1830 * @lblk - start logical block to be added 1831 * @len - length of blocks to be added 1832 * 1833 * Returns 0 on success, negative error code on failure. 
1834 */ 1835 static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, 1836 ext4_lblk_t len) 1837 { 1838 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1839 int ret; 1840 bool lclu_allocated = false; 1841 bool end_allocated = false; 1842 ext4_lblk_t resv_clu; 1843 ext4_lblk_t end = lblk + len - 1; 1844 1845 /* 1846 * If the cluster containing lblk or end is shared with a delayed, 1847 * written, or unwritten extent in a bigalloc file system, it's 1848 * already been accounted for and does not need to be reserved. 1849 * A pending reservation must be made for the cluster if it's 1850 * shared with a written or unwritten extent and doesn't already 1851 * have one. Written and unwritten extents can be purged from the 1852 * extents status tree if the system is under memory pressure, so 1853 * it's necessary to examine the extent tree if a search of the 1854 * extents status tree doesn't get a match. 1855 */ 1856 if (sbi->s_cluster_ratio == 1) { 1857 ret = ext4_da_reserve_space(inode, len); 1858 if (ret != 0) /* ENOSPC */ 1859 return ret; 1860 } else { /* bigalloc */ 1861 resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1; 1862 1863 ret = ext4_clu_alloc_state(inode, lblk); 1864 if (ret < 0) 1865 return ret; 1866 if (ret > 0) { 1867 resv_clu--; 1868 lclu_allocated = (ret == 2); 1869 } 1870 1871 if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) { 1872 ret = ext4_clu_alloc_state(inode, end); 1873 if (ret < 0) 1874 return ret; 1875 if (ret > 0) { 1876 resv_clu--; 1877 end_allocated = (ret == 2); 1878 } 1879 } 1880 1881 if (resv_clu) { 1882 ret = ext4_da_reserve_space(inode, resv_clu); 1883 if (ret != 0) /* ENOSPC */ 1884 return ret; 1885 } 1886 } 1887 1888 ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, 1889 end_allocated); 1890 return 0; 1891 } 1892 1893 /* 1894 * Looks up the requested blocks and sets the delalloc extent map. 
1895 * First try to look up for the extent entry that contains the requested 1896 * blocks in the extent status tree without i_data_sem, then try to look 1897 * up for the ondisk extent mapping with i_data_sem in read mode, 1898 * finally hold i_data_sem in write mode, looks up again and add a 1899 * delalloc extent entry if it still couldn't find any extent. Pass out 1900 * the mapped extent through @map and return 0 on success. 1901 */ 1902 static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) 1903 { 1904 struct extent_status es; 1905 int retval; 1906 #ifdef ES_AGGRESSIVE_TEST 1907 struct ext4_map_blocks orig_map; 1908 1909 memcpy(&orig_map, map, sizeof(*map)); 1910 #endif 1911 1912 map->m_flags = 0; 1913 ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, 1914 (unsigned long) map->m_lblk); 1915 1916 ext4_check_map_extents_env(inode); 1917 1918 /* Lookup extent status tree firstly */ 1919 if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { 1920 map->m_len = min_t(unsigned int, map->m_len, 1921 es.es_len - (map->m_lblk - es.es_lblk)); 1922 1923 if (ext4_es_is_hole(&es)) 1924 goto add_delayed; 1925 1926 found: 1927 /* 1928 * Delayed extent could be allocated by fallocate. 1929 * So we need to check it. 1930 */ 1931 if (ext4_es_is_delayed(&es)) { 1932 map->m_flags |= EXT4_MAP_DELAYED; 1933 return 0; 1934 } 1935 1936 map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; 1937 if (ext4_es_is_written(&es)) 1938 map->m_flags |= EXT4_MAP_MAPPED; 1939 else if (ext4_es_is_unwritten(&es)) 1940 map->m_flags |= EXT4_MAP_UNWRITTEN; 1941 else 1942 BUG(); 1943 1944 #ifdef ES_AGGRESSIVE_TEST 1945 ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); 1946 #endif 1947 return 0; 1948 } 1949 1950 /* 1951 * Try to see if we can get the block without requesting a new 1952 * file system block. 
1953 */ 1954 down_read(&EXT4_I(inode)->i_data_sem); 1955 if (ext4_has_inline_data(inode)) 1956 retval = 0; 1957 else 1958 retval = ext4_map_query_blocks(NULL, inode, map, 0); 1959 up_read(&EXT4_I(inode)->i_data_sem); 1960 if (retval) 1961 return retval < 0 ? retval : 0; 1962 1963 add_delayed: 1964 down_write(&EXT4_I(inode)->i_data_sem); 1965 /* 1966 * Page fault path (ext4_page_mkwrite does not take i_rwsem) 1967 * and fallocate path (no folio lock) can race. Make sure we 1968 * lookup the extent status tree here again while i_data_sem 1969 * is held in write mode, before inserting a new da entry in 1970 * the extent status tree. 1971 */ 1972 if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { 1973 map->m_len = min_t(unsigned int, map->m_len, 1974 es.es_len - (map->m_lblk - es.es_lblk)); 1975 1976 if (!ext4_es_is_hole(&es)) { 1977 up_write(&EXT4_I(inode)->i_data_sem); 1978 goto found; 1979 } 1980 } else if (!ext4_has_inline_data(inode)) { 1981 retval = ext4_map_query_blocks(NULL, inode, map, 0); 1982 if (retval) { 1983 up_write(&EXT4_I(inode)->i_data_sem); 1984 return retval < 0 ? retval : 0; 1985 } 1986 } 1987 1988 map->m_flags |= EXT4_MAP_DELAYED; 1989 retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); 1990 if (!retval) 1991 map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); 1992 up_write(&EXT4_I(inode)->i_data_sem); 1993 1994 return retval; 1995 } 1996 1997 /* 1998 * This is a special get_block_t callback which is used by 1999 * ext4_da_write_begin(). It will either return mapped block or 2000 * reserve space for a single block. 2001 * 2002 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. 2003 * We also have b_blocknr = -1 and b_bdev initialized properly 2004 * 2005 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. 2006 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev 2007 * initialized properly. 
2008 */ 2009 int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2010 struct buffer_head *bh, int create) 2011 { 2012 struct ext4_map_blocks map; 2013 sector_t invalid_block = ~((sector_t) 0xffff); 2014 int ret = 0; 2015 2016 BUG_ON(create == 0); 2017 BUG_ON(bh->b_size != inode->i_sb->s_blocksize); 2018 2019 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) 2020 invalid_block = ~0; 2021 2022 map.m_lblk = iblock; 2023 map.m_len = 1; 2024 2025 /* 2026 * first, we need to know whether the block is allocated already 2027 * preallocated blocks are unmapped but should treated 2028 * the same as allocated blocks. 2029 */ 2030 ret = ext4_da_map_blocks(inode, &map); 2031 if (ret < 0) 2032 return ret; 2033 2034 if (map.m_flags & EXT4_MAP_DELAYED) { 2035 map_bh(bh, inode->i_sb, invalid_block); 2036 set_buffer_new(bh); 2037 set_buffer_delay(bh); 2038 return 0; 2039 } 2040 2041 map_bh(bh, inode->i_sb, map.m_pblk); 2042 ext4_update_bh_state(bh, map.m_flags); 2043 2044 if (buffer_unwritten(bh)) { 2045 /* A delayed write to unwritten bh should be marked 2046 * new and mapped. Mapped ensures that we don't do 2047 * get_block multiple times when we write to the same 2048 * offset and new ensures that we do proper zero out 2049 * for partial write. 2050 */ 2051 set_buffer_new(bh); 2052 set_buffer_mapped(bh); 2053 } 2054 return 0; 2055 } 2056 2057 static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) 2058 { 2059 mpd->start_pos += folio_size(folio); 2060 mpd->wbc->nr_to_write -= folio_nr_pages(folio); 2061 folio_unlock(folio); 2062 } 2063 2064 static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) 2065 { 2066 size_t len; 2067 loff_t size; 2068 int err; 2069 2070 WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos); 2071 folio_clear_dirty_for_io(folio); 2072 /* 2073 * We have to be very careful here! 
Nothing protects writeback path 2074 * against i_size changes and the page can be writeably mapped into 2075 * page tables. So an application can be growing i_size and writing 2076 * data through mmap while writeback runs. folio_clear_dirty_for_io() 2077 * write-protects our page in page tables and the page cannot get 2078 * written to again until we release folio lock. So only after 2079 * folio_clear_dirty_for_io() we are safe to sample i_size for 2080 * ext4_bio_write_folio() to zero-out tail of the written page. We rely 2081 * on the barrier provided by folio_test_clear_dirty() in 2082 * folio_clear_dirty_for_io() to make sure i_size is really sampled only 2083 * after page tables are updated. 2084 */ 2085 size = i_size_read(mpd->inode); 2086 len = folio_size(folio); 2087 if (folio_pos(folio) + len > size && 2088 !ext4_verity_in_progress(mpd->inode)) 2089 len = size & (len - 1); 2090 err = ext4_bio_write_folio(&mpd->io_submit, folio, len); 2091 2092 return err; 2093 } 2094 2095 #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) 2096 2097 /* 2098 * mballoc gives us at most this number of blocks... 2099 * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). 2100 * The rest of mballoc seems to handle chunks up to full group size. 2101 */ 2102 #define MAX_WRITEPAGES_EXTENT_LEN 2048 2103 2104 /* 2105 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map 2106 * 2107 * @mpd - extent of blocks 2108 * @lblk - logical number of the block in the file 2109 * @bh - buffer head we want to add to the extent 2110 * 2111 * The function is used to collect contig. blocks in the same state. If the 2112 * buffer doesn't require mapping for writeback and we haven't started the 2113 * extent of buffers to map yet, the function returns 'true' immediately - the 2114 * caller can write the buffer right away. Otherwise the function returns true 2115 * if the block has been added to the extent, false if the block couldn't be 2116 * added. 
2117 */ 2118 static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, 2119 struct buffer_head *bh) 2120 { 2121 struct ext4_map_blocks *map = &mpd->map; 2122 2123 /* Buffer that doesn't need mapping for writeback? */ 2124 if (!buffer_dirty(bh) || !buffer_mapped(bh) || 2125 (!buffer_delay(bh) && !buffer_unwritten(bh))) { 2126 /* So far no extent to map => we write the buffer right away */ 2127 if (map->m_len == 0) 2128 return true; 2129 return false; 2130 } 2131 2132 /* First block in the extent? */ 2133 if (map->m_len == 0) { 2134 /* We cannot map unless handle is started... */ 2135 if (!mpd->do_map) 2136 return false; 2137 map->m_lblk = lblk; 2138 map->m_len = 1; 2139 map->m_flags = bh->b_state & BH_FLAGS; 2140 return true; 2141 } 2142 2143 /* Don't go larger than mballoc is willing to allocate */ 2144 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) 2145 return false; 2146 2147 /* Can we merge the block to our big extent? */ 2148 if (lblk == map->m_lblk + map->m_len && 2149 (bh->b_state & BH_FLAGS) == map->m_flags) { 2150 map->m_len++; 2151 return true; 2152 } 2153 return false; 2154 } 2155 2156 /* 2157 * mpage_process_page_bufs - submit page buffers for IO or add them to extent 2158 * 2159 * @mpd - extent of blocks for mapping 2160 * @head - the first buffer in the page 2161 * @bh - buffer we should start processing from 2162 * @lblk - logical number of the block in the file corresponding to @bh 2163 * 2164 * Walk through page buffers from @bh upto @head (exclusive) and either submit 2165 * the page for IO if all buffers in this page were mapped and there's no 2166 * accumulated extent of buffers to map or add buffers in the page to the 2167 * extent of buffers to map. The function returns 1 if the caller can continue 2168 * by processing the next page, 0 if it should stop adding buffers to the 2169 * extent to map because we cannot extend it anymore. It can also return value 2170 * < 0 in case of error during IO submission. 
2171 */ 2172 static int mpage_process_page_bufs(struct mpage_da_data *mpd, 2173 struct buffer_head *head, 2174 struct buffer_head *bh, 2175 ext4_lblk_t lblk) 2176 { 2177 struct inode *inode = mpd->inode; 2178 int err; 2179 ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) 2180 >> inode->i_blkbits; 2181 2182 if (ext4_verity_in_progress(inode)) 2183 blocks = EXT_MAX_BLOCKS; 2184 2185 do { 2186 BUG_ON(buffer_locked(bh)); 2187 2188 if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { 2189 /* Found extent to map? */ 2190 if (mpd->map.m_len) 2191 return 0; 2192 /* Buffer needs mapping and handle is not started? */ 2193 if (!mpd->do_map) 2194 return 0; 2195 /* Everything mapped so far and we hit EOF */ 2196 break; 2197 } 2198 } while (lblk++, (bh = bh->b_this_page) != head); 2199 /* So far everything mapped? Submit the page for IO. */ 2200 if (mpd->map.m_len == 0) { 2201 err = mpage_submit_folio(mpd, head->b_folio); 2202 if (err < 0) 2203 return err; 2204 mpage_folio_done(mpd, head->b_folio); 2205 } 2206 if (lblk >= blocks) { 2207 mpd->scanned_until_end = 1; 2208 return 0; 2209 } 2210 return 1; 2211 } 2212 2213 /* 2214 * mpage_process_folio - update folio buffers corresponding to changed extent 2215 * and may submit fully mapped page for IO 2216 * @mpd: description of extent to map, on return next extent to map 2217 * @folio: Contains these buffers. 2218 * @m_lblk: logical block mapping. 2219 * @m_pblk: corresponding physical mapping. 2220 * @map_bh: determines on return whether this page requires any further 2221 * mapping or not. 2222 * 2223 * Scan given folio buffers corresponding to changed extent and update buffer 2224 * state according to new extent state. 2225 * We map delalloc buffers to their physical location, clear unwritten bits. 2226 * If the given folio is not fully mapped, we update @mpd to the next extent in 2227 * the given folio that needs mapping & return @map_bh as true. 
/*
 * mpage_process_folio - update folio buffers corresponding to changed extent
 *			 and may submit fully mapped page for IO
 * @mpd: description of extent to map, on return next extent to map
 * @folio: Contains these buffers.
 * @m_lblk: in: logical block of the first buffer of @folio; out: logical
 *	    block at which the walk stopped.
 * @m_pblk: in: next physical block to hand out to a delayed buffer; out: the
 *	    same cursor after the walk.
 * @map_bh: set to true when the walk hit a buffer past the end of the mapped
 *	    extent (the folio was handed to mpage_process_page_bufs() and the
 *	    caller must not submit it itself), false when the walk reached the
 *	    end of the folio.
 *
 * Scan given folio buffers corresponding to changed extent and update buffer
 * state according to new extent state.
 * We map delalloc buffers to their physical location, clear unwritten bits.
 * If the given folio is not fully mapped, we update @mpd to the next extent in
 * the given folio that needs mapping & return @map_bh as true.
 *
 * Returns 0 on success, or a negative error coming from
 * mpage_process_page_bufs() or ext4_alloc_io_end_vec().
 */
static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
			      ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
			      bool *map_bh)
{
	struct buffer_head *head, *bh;
	ext4_io_end_t *io_end = mpd->io_submit.io_end;
	ext4_lblk_t lblk = *m_lblk;
	ext4_fsblk_t pblock = *m_pblk;
	int err = 0;
	/* Bytes converted in this folio, added to the io_end vector at the end */
	ssize_t io_end_size = 0;
	struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);

	bh = head = folio_buffers(folio);
	do {
		/*
		 * Buffer before the mapped extent: skip it.  Note that
		 * 'continue' in a do/while jumps to the loop condition, so
		 * lblk and bh still advance.
		 */
		if (lblk < mpd->map.m_lblk)
			continue;
		if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
			/*
			 * Buffer after end of mapped extent.
			 * Find next buffer in the folio to map.
			 */
			mpd->map.m_len = 0;
			mpd->map.m_flags = 0;
			io_end_vec->size += io_end_size;

			err = mpage_process_page_bufs(mpd, head, bh, lblk);
			/* Positive return only means "may continue"; not an error */
			if (err > 0)
				err = 0;
			/*
			 * The next extent does not start at this buffer, so
			 * it gets an io_end vector of its own.
			 */
			if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
				io_end_vec = ext4_alloc_io_end_vec(io_end);
				if (IS_ERR(io_end_vec)) {
					err = PTR_ERR(io_end_vec);
					goto out;
				}
				io_end_vec->offset = EXT4_LBLK_TO_B(mpd->inode,
								 mpd->map.m_lblk);
			}
			*map_bh = true;
			goto out;
		}
		/* Buffer inside the mapped extent: give it its final state */
		if (buffer_delay(bh)) {
			clear_buffer_delay(bh);
			bh->b_blocknr = pblock++;
		}
		clear_buffer_unwritten(bh);
		io_end_size += i_blocksize(mpd->inode);
	} while (lblk++, (bh = bh->b_this_page) != head);

	io_end_vec->size += io_end_size;
	*map_bh = false;
out:
	/* Hand the updated cursors back to the caller on every exit path */
	*m_lblk = lblk;
	*m_pblk = pblock;
	return err;
}
2293 * We map delalloc buffers to their physical location, clear unwritten bits, 2294 * and mark buffers as uninit when we perform writes to unwritten extents 2295 * and do extent conversion after IO is finished. If the last page is not fully 2296 * mapped, we update @map to the next extent in the last page that needs 2297 * mapping. Otherwise we submit the page for IO. 2298 */ 2299 static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) 2300 { 2301 struct folio_batch fbatch; 2302 unsigned nr, i; 2303 struct inode *inode = mpd->inode; 2304 pgoff_t start, end; 2305 ext4_lblk_t lblk; 2306 ext4_fsblk_t pblock; 2307 int err; 2308 bool map_bh = false; 2309 2310 start = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk); 2311 end = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk + mpd->map.m_len - 1); 2312 pblock = mpd->map.m_pblk; 2313 2314 folio_batch_init(&fbatch); 2315 while (start <= end) { 2316 nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch); 2317 if (nr == 0) 2318 break; 2319 for (i = 0; i < nr; i++) { 2320 struct folio *folio = fbatch.folios[i]; 2321 2322 lblk = EXT4_PG_TO_LBLK(inode, folio->index); 2323 err = mpage_process_folio(mpd, folio, &lblk, &pblock, 2324 &map_bh); 2325 /* 2326 * If map_bh is true, means page may require further bh 2327 * mapping, or maybe the page was submitted for IO. 2328 * So we return to call further extent mapping. 2329 */ 2330 if (err < 0 || map_bh) 2331 goto out; 2332 /* Page fully mapped - let IO run! */ 2333 err = mpage_submit_folio(mpd, folio); 2334 if (err < 0) 2335 goto out; 2336 mpage_folio_done(mpd, folio); 2337 } 2338 folio_batch_release(&fbatch); 2339 } 2340 /* Extent fully mapped and matches with page boundary. We are done. 
*/ 2341 mpd->map.m_len = 0; 2342 mpd->map.m_flags = 0; 2343 return 0; 2344 out: 2345 folio_batch_release(&fbatch); 2346 return err; 2347 } 2348 2349 static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) 2350 { 2351 struct inode *inode = mpd->inode; 2352 struct ext4_map_blocks *map = &mpd->map; 2353 int get_blocks_flags; 2354 int err, dioread_nolock; 2355 2356 /* Make sure transaction has enough credits for this extent */ 2357 err = ext4_journal_ensure_extent_credits(handle, inode); 2358 if (err < 0) 2359 return err; 2360 2361 trace_ext4_da_write_pages_extent(inode, map); 2362 /* 2363 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or 2364 * to convert an unwritten extent to be initialized (in the case 2365 * where we have written into one or more preallocated blocks). It is 2366 * possible that we're going to need more metadata blocks than 2367 * previously reserved. However we must not fail because we're in 2368 * writeback and there is nothing we can do about it so it might result 2369 * in data loss. So use reserved blocks to allocate metadata if 2370 * possible. In addition, do not cache any unrelated extents, as it 2371 * only holds the folio lock but does not hold the i_rwsem or 2372 * invalidate_lock, which could corrupt the extent status tree. 
2373 */ 2374 get_blocks_flags = EXT4_GET_BLOCKS_CREATE | 2375 EXT4_GET_BLOCKS_METADATA_NOFAIL | 2376 EXT4_GET_BLOCKS_IO_SUBMIT | 2377 EXT4_EX_NOCACHE; 2378 2379 dioread_nolock = ext4_should_dioread_nolock(inode); 2380 if (dioread_nolock) 2381 get_blocks_flags |= EXT4_GET_BLOCKS_UNWRIT_EXT; 2382 2383 err = ext4_map_blocks(handle, inode, map, get_blocks_flags); 2384 if (err < 0) 2385 return err; 2386 if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { 2387 if (!mpd->io_submit.io_end->handle && 2388 ext4_handle_valid(handle)) { 2389 mpd->io_submit.io_end->handle = handle->h_rsv_handle; 2390 handle->h_rsv_handle = NULL; 2391 } 2392 ext4_set_io_unwritten_flag(mpd->io_submit.io_end); 2393 } 2394 2395 BUG_ON(map->m_len == 0); 2396 return 0; 2397 } 2398 2399 /* 2400 * This is used to submit mapped buffers in a single folio that is not fully 2401 * mapped for various reasons, such as insufficient space or journal credits. 2402 */ 2403 static int mpage_submit_partial_folio(struct mpage_da_data *mpd) 2404 { 2405 struct inode *inode = mpd->inode; 2406 struct folio *folio; 2407 loff_t pos; 2408 int ret; 2409 2410 folio = filemap_get_folio(inode->i_mapping, 2411 mpd->start_pos >> PAGE_SHIFT); 2412 if (IS_ERR(folio)) 2413 return PTR_ERR(folio); 2414 /* 2415 * The mapped position should be within the current processing folio 2416 * but must not be the folio start position. 2417 */ 2418 pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits; 2419 if (WARN_ON_ONCE((folio_pos(folio) == pos) || 2420 !folio_contains(folio, pos >> PAGE_SHIFT))) 2421 return -EINVAL; 2422 2423 ret = mpage_submit_folio(mpd, folio); 2424 if (ret) 2425 goto out; 2426 /* 2427 * Update start_pos to prevent this folio from being released in 2428 * mpage_release_unused_pages(), it will be reset to the aligned folio 2429 * pos when this folio is written again in the next round. 
Additionally, 2430 * do not update wbc->nr_to_write here, as it will be updated once the 2431 * entire folio has finished processing. 2432 */ 2433 mpd->start_pos = pos; 2434 out: 2435 folio_unlock(folio); 2436 folio_put(folio); 2437 return ret; 2438 } 2439 2440 /* 2441 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length 2442 * mpd->len and submit pages underlying it for IO 2443 * 2444 * @handle - handle for journal operations 2445 * @mpd - extent to map 2446 * @give_up_on_write - we set this to true iff there is a fatal error and there 2447 * is no hope of writing the data. The caller should discard 2448 * dirty pages to avoid infinite loops. 2449 * 2450 * The function maps extent starting at mpd->lblk of length mpd->len. If it is 2451 * delayed, blocks are allocated, if it is unwritten, we may need to convert 2452 * them to initialized or split the described range from larger unwritten 2453 * extent. Note that we need not map all the described range since allocation 2454 * can return less blocks or the range is covered by more unwritten extents. We 2455 * cannot map more because we are limited by reserved transaction credits. On 2456 * the other hand we always make sure that the last touched page is fully 2457 * mapped so that it can be written out (and thus forward progress is 2458 * guaranteed). After mapping we submit all mapped pages for IO. 
/*
 * mpage_map_and_submit_extent - map extent starting at mpd->map.m_lblk of
 *				 length mpd->map.m_len and submit pages
 *				 underlying it for IO
 *
 * @handle - handle for journal operations
 * @mpd - extent to map
 * @give_up_on_write - we set this to true iff there is a fatal error and there
 *                     is no hope of writing the data. The caller should discard
 *                     dirty pages to avoid infinite loops.
 *
 * The function maps extent starting at mpd->map.m_lblk of length
 * mpd->map.m_len. If it is delayed, blocks are allocated, if it is unwritten,
 * we may need to convert them to initialized or split the described range
 * from larger unwritten extent. Note that we need not map all the described
 * range since allocation can return less blocks or the range is covered by
 * more unwritten extents. We cannot map more because we are limited by
 * reserved transaction credits. On the other hand we always make sure that
 * the last touched page is fully mapped so that it can be written out (and
 * thus forward progress is guaranteed). After mapping we submit all mapped
 * pages for IO.
 *
 * Returns 0 on success or a negative error.
 */
static int mpage_map_and_submit_extent(handle_t *handle,
				       struct mpage_da_data *mpd,
				       bool *give_up_on_write)
{
	struct inode *inode = mpd->inode;
	struct ext4_map_blocks *map = &mpd->map;
	int err;
	loff_t disksize;
	/* Set once at least one extent has been mapped in this call */
	int progress = 0;
	ext4_io_end_t *io_end = mpd->io_submit.io_end;
	struct ext4_io_end_vec *io_end_vec;

	io_end_vec = ext4_alloc_io_end_vec(io_end);
	if (IS_ERR(io_end_vec))
		return PTR_ERR(io_end_vec);
	io_end_vec->offset = EXT4_LBLK_TO_B(inode, map->m_lblk);
	do {
		err = mpage_map_one_extent(handle, mpd);
		if (err < 0) {
			struct super_block *sb = inode->i_sb;

			if (ext4_emergency_state(sb))
				goto invalidate_dirty_pages;
			/*
			 * Let the upper layers retry transient errors.
			 * In the case of ENOSPC, if ext4_count_free_clusters()
			 * is non-zero, a commit should free up blocks.
			 */
			if ((err == -ENOMEM) || (err == -EAGAIN) ||
			    (err == -ENOSPC && ext4_count_free_clusters(sb))) {
				/*
				 * We may have already allocated extents for
				 * some bhs inside the folio, issue the
				 * corresponding data to prevent stale data.
				 */
				if (progress) {
					if (mpage_submit_partial_folio(mpd))
						goto invalidate_dirty_pages;
					goto update_disksize;
				}
				return err;
			}
			ext4_msg(sb, KERN_CRIT,
				 "Delayed block allocation failed for "
				 "inode %lu at logical offset %llu with"
				 " max blocks %u with error %d",
				 inode->i_ino,
				 (unsigned long long)map->m_lblk,
				 (unsigned)map->m_len, -err);
			ext4_msg(sb, KERN_CRIT,
				 "This should not happen!! Data will "
				 "be lost\n");
			if (err == -ENOSPC)
				ext4_print_free_blocks(inode);
		invalidate_dirty_pages:
			/* Fatal: tell the caller to discard the dirty pages */
			*give_up_on_write = true;
			return err;
		}
		progress = 1;
		/*
		 * Update buffer state, submit mapped pages, and get us new
		 * extent to map
		 */
		err = mpage_map_and_submit_buffers(mpd);
		if (err < 0)
			goto update_disksize;
	} while (map->m_len);

update_disksize:
	/*
	 * Update on-disk size after IO is submitted.  Races with
	 * truncate are avoided by checking i_size under i_data_sem.
	 */
	disksize = mpd->start_pos;
	if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
		int err2;
		loff_t i_size;

		down_write(&EXT4_I(inode)->i_data_sem);
		i_size = i_size_read(inode);
		/* Never push i_disksize beyond the current i_size */
		if (disksize > i_size)
			disksize = i_size;
		if (disksize > EXT4_I(inode)->i_disksize)
			EXT4_I(inode)->i_disksize = disksize;
		up_write(&EXT4_I(inode)->i_data_sem);
		err2 = ext4_mark_inode_dirty(handle, inode);
		if (err2) {
			ext4_error_err(inode->i_sb, -err2,
				       "Failed to mark inode %lu dirty",
				       inode->i_ino);
		}
		/* An earlier error takes precedence over err2 */
		if (!err)
			err = err2;
	}
	return err;
}

/*
 * Journal the buffers of @folio covering bytes [0, @len) under @handle: get
 * write access to them, run write_end_fn over them, register the range with
 * ext4_jbd2_inode_add_write() and record the handle's transaction id in
 * i_datasync_tid.
 *
 * All three steps are always attempted; the first non-zero result among them
 * is returned.
 */
static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
				     size_t len)
{
	struct buffer_head *page_bufs = folio_buffers(folio);
	struct inode *inode = folio->mapping->host;
	int ret, err;

	ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
				     NULL, do_journal_get_write_access);
	err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
				     NULL, write_end_fn);
	if (ret == 0)
		ret = err;
	err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len);
	if (ret == 0)
		ret = err;
	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;

	return ret;
}

/*
 * Writeback-side wrapper around ext4_journal_folio_buffers(): clears the
 * folio's checked flag, charges the folio against wbc->nr_to_write and
 * journals the folio's buffers, limiting the length to i_size when the folio
 * straddles EOF (unless fs-verity construction is in progress).
 *
 * Returns the result of ext4_journal_folio_buffers().
 */
static int mpage_journal_page_buffers(handle_t *handle,
				      struct mpage_da_data *mpd,
				      struct folio *folio)
{
	struct inode *inode = mpd->inode;
	loff_t size = i_size_read(inode);
	size_t len = folio_size(folio);

	folio_clear_checked(folio);
	mpd->wbc->nr_to_write -= folio_nr_pages(folio);

	if (folio_pos(folio) + len > size &&
	    !ext4_verity_in_progress(inode))
		len = size & (len - 1);

	return ext4_journal_folio_buffers(handle, folio, len);
}
/*
 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
 *				 needing mapping, submit mapped pages
 *
 * @mpd - where to look for pages
 *
 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
 * IO immediately. If we cannot map blocks, we submit just already mapped
 * buffers in the page for IO and keep page dirty. When we can map blocks and
 * we find a page which isn't mapped we start accumulating extent of buffers
 * underlying these pages that needs mapping (formed by either delayed or
 * unwritten buffers). We also lock the pages containing these buffers. The
 * extent found is returned in @mpd structure (starting at mpd->map.m_lblk
 * with length mpd->map.m_len blocks).
 *
 * Note that this function can attach bios to one io_end structure which are
 * neither logically nor physically contiguous. Although it may seem as an
 * unnecessary complication, it is actually inevitable in blocksize < pagesize
 * case as we need to track IO to all buffers underlying a page in one io_end.
 *
 * Returns 0 on success or a negative error.
 */
static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
{
	struct address_space *mapping = mpd->inode->i_mapping;
	struct folio_batch fbatch;
	unsigned int nr_folios;
	pgoff_t index = mpd->start_pos >> PAGE_SHIFT;
	pgoff_t end = mpd->end_pos >> PAGE_SHIFT;
	xa_mark_t tag;
	int i, err = 0;
	ext4_lblk_t lblk;
	struct buffer_head *head;
	handle_t *handle = NULL;
	int bpp = ext4_journal_blocks_per_folio(mpd->inode);

	tag = wbc_to_tag(mpd->wbc);

	mpd->map.m_len = 0;
	mpd->next_pos = mpd->start_pos;
	/* A handle is started here only for data=journal inodes */
	if (ext4_should_journal_data(mpd->inode)) {
		handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
					    bpp);
		if (IS_ERR(handle))
			return PTR_ERR(handle);
	}
	folio_batch_init(&fbatch);
	while (index <= end) {
		nr_folios = filemap_get_folios_tag(mapping, &index, end,
				tag, &fbatch);
		if (nr_folios == 0)
			break;

		for (i = 0; i < nr_folios; i++) {
			struct folio *folio = fbatch.folios[i];

			/*
			 * Accumulated enough dirty pages? This doesn't apply
			 * to WB_SYNC_ALL mode. For integrity sync we have to
			 * keep going because someone may be concurrently
			 * dirtying pages, and we might have synced a lot of
			 * newly appeared dirty pages, but have not synced all
			 * of the old dirty pages.
			 */
			if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
			    mpd->wbc->nr_to_write <=
			    EXT4_LBLK_TO_PG(mpd->inode, mpd->map.m_len))
				goto out;

			/* If we can't merge this page, we are done. */
			if (mpd->map.m_len > 0 &&
			    mpd->next_pos != folio_pos(folio))
				goto out;

			if (handle) {
				err = ext4_journal_ensure_credits(handle, bpp,
								  0);
				if (err < 0)
					goto out;
			}

			folio_lock(folio);
			/*
			 * If the page is no longer dirty, or its mapping no
			 * longer corresponds to inode we are writing (which
			 * means it has been truncated or invalidated), or the
			 * page is already under writeback and we are not doing
			 * a data integrity writeback, skip the page
			 */
			if (!folio_test_dirty(folio) ||
			    (folio_test_writeback(folio) &&
			     (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
			    unlikely(folio->mapping != mapping)) {
				folio_unlock(folio);
				continue;
			}

			folio_wait_writeback(folio);
			BUG_ON(folio_test_writeback(folio));

			/*
			 * Should never happen but for buggy code in
			 * other subsystems that call
			 * set_page_dirty() without properly warning
			 * the file system first.  See [1] for more
			 * information.
			 *
			 * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
			 */
			if (!folio_buffers(folio)) {
				ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index);
				folio_clear_dirty(folio);
				folio_unlock(folio);
				continue;
			}

			/* The first folio of a run defines where the run starts */
			if (mpd->map.m_len == 0)
				mpd->start_pos = folio_pos(folio);
			mpd->next_pos = folio_next_pos(folio);
			/*
			 * Writeout when we cannot modify metadata is simple.
			 * Just submit the page. For data=journal mode we
			 * first handle writeout of the page for checkpoint and
			 * only after that handle delayed page dirtying. This
			 * makes sure current data is checkpointed to the final
			 * location before possibly journalling it again which
			 * is desirable when the page is frequently dirtied
			 * through a pin.
			 */
			if (!mpd->can_map) {
				err = mpage_submit_folio(mpd, folio);
				if (err < 0)
					goto out;
				/* Pending dirtying of journalled data? */
				if (folio_test_checked(folio)) {
					err = mpage_journal_page_buffers(handle,
						mpd, folio);
					if (err < 0)
						goto out;
					mpd->journalled_more_data = 1;
				}
				mpage_folio_done(mpd, folio);
			} else {
				/* Add all dirty buffers to mpd */
				lblk = EXT4_PG_TO_LBLK(mpd->inode, folio->index);
				head = folio_buffers(folio);
				err = mpage_process_page_bufs(mpd, head, head,
						lblk);
				/* 0 means "stop here", < 0 is an error */
				if (err <= 0)
					goto out;
				err = 0;
			}
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
	mpd->scanned_until_end = 1;
	if (handle)
		ext4_journal_stop(handle);
	return 0;
out:
	folio_batch_release(&fbatch);
	if (handle)
		ext4_journal_stop(handle);
	return err;
}

/*
 * Core writeback loop shared by ext4_writepages() and
 * ext4_normal_submit_inode_data_buffers().
 *
 * @mpd - writeback state; the caller fills in ->inode, ->wbc and ->can_map.
 *
 * First writes out folios that need no block mapping without starting a
 * transaction, then repeatedly starts a handle, collects one extent to map,
 * maps it and submits the folios backing it.
 *
 * Returns 0 on success or an error code.
 */
static int ext4_do_writepages(struct mpage_da_data *mpd)
{
	struct writeback_control *wbc = mpd->wbc;
	pgoff_t writeback_index = 0;
	long nr_to_write = wbc->nr_to_write;
	int range_whole = 0;
	int cycled = 1;
	handle_t *handle = NULL;
	struct inode *inode = mpd->inode;
	struct address_space *mapping = inode->i_mapping;
	int needed_blocks, rsv_blocks = 0, ret = 0;
	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
	struct blk_plug plug;
	bool give_up_on_write = false;

	trace_ext4_writepages(inode, wbc);

	/*
	 * No pages to write? This is mainly a kludge to avoid starting
	 * a transaction for special inodes like journal inode on last iput()
	 * because that could violate lock ordering on umount
	 */
	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		goto out_writepages;

	/*
	 * If the filesystem has aborted, it is read-only, so return
	 * right away instead of dumping stack traces later on that
	 * will obscure the real source of the problem.  We test
	 * fs shutdown state instead of sb->s_flag's SB_RDONLY because
	 * the latter could be true if the filesystem is mounted
	 * read-only, and in that case, ext4_writepages should
	 * *never* be called, so if that ever happens, we would want
	 * the stack trace.
	 */
	ret = ext4_emergency_state(mapping->host->i_sb);
	if (unlikely(ret))
		goto out_writepages;

	/*
	 * If we have inline data and arrive here, it means that
	 * we will soon create the block for the 1st page, so
	 * we'd better clear the inline data here.
	 */
	if (ext4_has_inline_data(inode)) {
		/* Just inode will be modified... */
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out_writepages;
		}
		BUG_ON(ext4_test_inode_state(inode,
				EXT4_STATE_MAY_INLINE_DATA));
		ext4_destroy_inline_data(handle, inode);
		ext4_journal_stop(handle);
	}

	/*
	 * data=journal mode does not do delalloc so we just need to writeout /
	 * journal already mapped buffers. On the other hand we need to commit
	 * transaction to make data stable. We expect all the data to be
	 * already in the journal (the only exception are DMA pinned pages
	 * dirtied behind our back) so we commit transaction here and run the
	 * writeback loop to checkpoint them. The checkpointing is not actually
	 * necessary to make data persistent *but* quite a few places (extent
	 * shifting operations, fsverity, ...) depend on being able to drop
	 * pagecache pages after calling filemap_write_and_wait() and for that
	 * checkpointing needs to happen.
	 */
	if (ext4_should_journal_data(inode)) {
		mpd->can_map = 0;
		if (wbc->sync_mode == WB_SYNC_ALL)
			ext4_fc_commit(sbi->s_journal,
				       EXT4_I(inode)->i_datasync_tid);
	}
	mpd->journalled_more_data = 0;

	if (ext4_should_dioread_nolock(inode)) {
		int bpf = ext4_journal_blocks_per_folio(inode);
		/*
		 * We may need to convert up to one extent per block in
		 * the folio and we may dirty the inode.
		 */
		rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf);
	}

	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
		range_whole = 1;

	if (wbc->range_cyclic) {
		/*
		 * Resume at the saved writeback index.  If that is not the
		 * start of the file, a second pass over the skipped head may
		 * follow (see the !cycled handling after 'unplug').
		 */
		writeback_index = mapping->writeback_index;
		if (writeback_index)
			cycled = 0;
		mpd->start_pos = writeback_index << PAGE_SHIFT;
		mpd->end_pos = LLONG_MAX;
	} else {
		mpd->start_pos = wbc->range_start;
		mpd->end_pos = wbc->range_end;
	}

	ext4_io_submit_init(&mpd->io_submit, wbc);
retry:
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT,
					mpd->end_pos >> PAGE_SHIFT);
	blk_start_plug(&plug);

	/*
	 * First writeback pages that don't need mapping - we can avoid
	 * starting a transaction unnecessarily and also avoid being blocked
	 * in the block layer on device congestion while having transaction
	 * started.
	 */
	mpd->do_map = 0;
	mpd->scanned_until_end = 0;
	mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
	if (!mpd->io_submit.io_end) {
		ret = -ENOMEM;
		goto unplug;
	}
	ret = mpage_prepare_extent_to_map(mpd);
	/* Unlock pages we didn't use */
	mpage_release_unused_pages(mpd, false);
	/* Submit prepared bio */
	ext4_io_submit(&mpd->io_submit);
	ext4_put_io_end_defer(mpd->io_submit.io_end);
	mpd->io_submit.io_end = NULL;
	if (ret < 0)
		goto unplug;

	while (!mpd->scanned_until_end && wbc->nr_to_write > 0) {
		/* For each extent of pages we use new io_end */
		mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
		if (!mpd->io_submit.io_end) {
			ret = -ENOMEM;
			break;
		}

		WARN_ON_ONCE(!mpd->can_map);
		/*
		 * We have two constraints: We find one extent to map and we
		 * must always write out whole page (makes a difference when
		 * blocksize < pagesize) so that we don't block on IO when we
		 * try to write out the rest of the page. Journalled mode is
		 * not supported by delalloc.
		 */
		BUG_ON(ext4_should_journal_data(inode));
		/*
		 * Calculate the number of credits needed to reserve for one
		 * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will
		 * attempt to extend the transaction or start a new iteration
		 * if the reserved credits are insufficient.
		 */
		needed_blocks = ext4_chunk_trans_blocks(inode,
						MAX_WRITEPAGES_EXTENT_LEN);
		/* start a new transaction */
		handle = ext4_journal_start_with_reserve(inode,
				EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
			       "%ld pages, ino %lu; err %d", __func__,
				wbc->nr_to_write, inode->i_ino, ret);
			/* Release allocated io_end */
			ext4_put_io_end(mpd->io_submit.io_end);
			mpd->io_submit.io_end = NULL;
			break;
		}
		mpd->do_map = 1;

		trace_ext4_da_write_folios_start(inode, mpd->start_pos,
				mpd->next_pos, wbc);
		ret = mpage_prepare_extent_to_map(mpd);
		if (!ret && mpd->map.m_len)
			ret = mpage_map_and_submit_extent(handle, mpd,
					&give_up_on_write);
		/*
		 * Caution: If the handle is synchronous,
		 * ext4_journal_stop() can wait for transaction commit
		 * to finish which may depend on writeback of pages to
		 * complete or on page lock to be released.  In that
		 * case, we have to wait until after we have
		 * submitted all the IO, released page locks we hold,
		 * and dropped io_end reference (for extent conversion
		 * to be able to complete) before stopping the handle.
		 */
		if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
			ext4_journal_stop(handle);
			handle = NULL;
			mpd->do_map = 0;
		}
		/* Unlock pages we didn't use */
		mpage_release_unused_pages(mpd, give_up_on_write);
		/* Submit prepared bio */
		ext4_io_submit(&mpd->io_submit);

		/*
		 * Drop our io_end reference we got from init. We have
		 * to be careful and use deferred io_end finishing if
		 * we are still holding the transaction as we can
		 * release the last reference to io_end which may end
		 * up doing unwritten extent conversion.
		 */
		if (handle) {
			ext4_put_io_end_defer(mpd->io_submit.io_end);
			ext4_journal_stop(handle);
		} else
			ext4_put_io_end(mpd->io_submit.io_end);
		mpd->io_submit.io_end = NULL;
		trace_ext4_da_write_folios_end(inode, mpd->start_pos,
				mpd->next_pos, wbc, ret);

		if (ret == -ENOSPC && sbi->s_journal) {
			/*
			 * Commit the transaction which would
			 * free blocks released in the transaction
			 * and try again
			 */
			jbd2_journal_force_commit_nested(sbi->s_journal);
			ret = 0;
			continue;
		}
		/* -EAGAIN is not an error here: just go around again */
		if (ret == -EAGAIN)
			ret = 0;
		/* Fatal error - ENOMEM, EIO... */
		if (ret)
			break;
	}
unplug:
	blk_finish_plug(&plug);
	/*
	 * Cyclic writeback that started in the middle of the file: make one
	 * more pass over the part before the original starting index.
	 */
	if (!ret && !cycled && wbc->nr_to_write > 0) {
		cycled = 1;
		mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1;
		mpd->start_pos = 0;
		goto retry;
	}

	/* Update index */
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		/*
		 * Set the writeback_index so that range_cyclic
		 * mode will write it back later
		 */
		mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT;

out_writepages:
	trace_ext4_writepages_result(inode, wbc, ret,
				     nr_to_write - wbc->nr_to_write);
	return ret;
}

/*
 * Writepages entry point: runs ext4_do_writepages() with block mapping
 * allowed (can_map = 1), bracketed by ext4_writepages_down_read() /
 * ext4_writepages_up_read().
 *
 * Returns the non-zero ext4_emergency_state() value if there is one,
 * otherwise the result of ext4_do_writepages().
 */
static int ext4_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	struct super_block *sb = mapping->host->i_sb;
	struct mpage_da_data mpd = {
		.inode = mapping->host,
		.wbc = wbc,
		.can_map = 1,
	};
	int ret;
	int alloc_ctx;

	ret = ext4_emergency_state(sb);
	if (unlikely(ret))
		return ret;

	alloc_ctx = ext4_writepages_down_read(sb);
	ret = ext4_do_writepages(&mpd);
	/*
	 * For data=journal writeback we could have come across pages marked
	 * for delayed dirtying (PageChecked) which were just added to the
	 * running transaction. Try once more to get them to stable storage.
	 */
	if (!ret && mpd.journalled_more_data)
		ret = ext4_do_writepages(&mpd);
	ext4_writepages_up_read(sb, alloc_ctx);

	return ret;
}

/*
 * Write out the dirty range recorded in @jinode (i_dirty_start..i_dirty_end)
 * with WB_SYNC_ALL semantics and block mapping disabled (can_map = 0).
 *
 * Returns the result of ext4_do_writepages().
 */
int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = LONG_MAX,
		.range_start = jinode->i_dirty_start,
		.range_end = jinode->i_dirty_end,
	};
	struct mpage_da_data mpd = {
		.inode = jinode->i_vfs_inode,
		.wbc = &wbc,
		.can_map = 0,
	};
	return ext4_do_writepages(&mpd);
}

/*
 * Writepages entry point for DAX mappings: hands the range to
 * dax_writeback_mapping_range() for the superblock's DAX device, bracketed
 * by ext4_writepages_down_read() / ext4_writepages_up_read().
 *
 * Returns the non-zero ext4_emergency_state() value if there is one,
 * otherwise the result of dax_writeback_mapping_range().
 */
static int ext4_dax_writepages(struct address_space *mapping,
			       struct writeback_control *wbc)
{
	int ret;
	long nr_to_write = wbc->nr_to_write;
	struct inode *inode = mapping->host;
	int alloc_ctx;

	ret = ext4_emergency_state(inode->i_sb);
	if (unlikely(ret))
		return ret;

	alloc_ctx = ext4_writepages_down_read(inode->i_sb);
	trace_ext4_writepages(inode, wbc);

	ret = dax_writeback_mapping_range(mapping,
					  EXT4_SB(inode->i_sb)->s_daxdev, wbc);
	trace_ext4_writepages_result(inode, wbc, ret,
				     nr_to_write - wbc->nr_to_write);
	ext4_writepages_up_read(inode->i_sb, alloc_ctx);
	return ret;
}

/*
 * Decide whether a write should bypass delayed allocation because free space
 * is running low relative to the amount of dirty (reserved) clusters.
 *
 * Returns 1 to switch to non-delalloc mode, 0 to keep using delalloc.
 */
static int ext4_nonda_switch(struct super_block *sb)
{
	s64 free_clusters, dirty_clusters;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	/*
	 * switch to non delalloc mode if we are running low
	 * on free block. The free block accounting via percpu
	 * counters can get slightly wrong with percpu_counter_batch getting
	 * accumulated on each CPU without updating global counters
	 * Delalloc need an accurate free block accounting. So switch
	 * to non delalloc when we are near to error range.
	 */
	free_clusters =
		percpu_counter_read_positive(&sbi->s_freeclusters_counter);
	dirty_clusters =
		percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
	/*
	 * Start pushing delalloc when 1/2 of free blocks are dirty.
	 */
	if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
		try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);

	if (2 * free_clusters < 3 * dirty_clusters ||
	    free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
		/*
		 * free block count is less than 150% of dirty blocks
		 * or free blocks is less than watermark
		 */
		return 1;
	}
	return 0;
}

/*
 * write_begin for the delayed-allocation path.
 *
 * Falls back to ext4_write_begin() (recording FALL_BACK_TO_NONDELALLOC in
 * *@fsdata) when ext4_nonda_switch() says so or fs-verity construction is in
 * progress.  Otherwise gets the folio for @pos and prepares its buffers with
 * ext4_da_get_block_prep(), clamping @len to the end of the folio.
 *
 * On success returns 0 with the prepared folio stored in *@foliop (the
 * inline-data helper may also complete the request on its own); on failure
 * returns a negative error.
 */
static int ext4_da_write_begin(const struct kiocb *iocb,
			       struct address_space *mapping,
			       loff_t pos, unsigned len,
			       struct folio **foliop, void **fsdata)
{
	int ret, retries = 0;
	struct folio *folio;
	pgoff_t index;
	struct inode *inode = mapping->host;

	ret = ext4_emergency_state(inode->i_sb);
	if (unlikely(ret))
		return ret;

	index = pos >> PAGE_SHIFT;

	if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
		return ext4_write_begin(iocb, mapping, pos,
					len, foliop, fsdata);
	}
	*fsdata = (void *)0;
	trace_ext4_da_write_begin(inode, pos, len);

	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
		ret = ext4_generic_write_inline_data(mapping, inode, pos, len,
						     foliop, fsdata, true);
		if (ret < 0)
			return ret;
		/* A return of 1 ends the request here with success */
		if (ret == 1)
			return 0;
	}

retry:
	folio = write_begin_get_folio(iocb, mapping, index, len);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/* Do not let the write extend past the end of this folio */
	if (len > folio_next_pos(folio) - pos)
		len = folio_next_pos(folio) - pos;

	ret = ext4_block_write_begin(NULL, folio, pos, len,
				     ext4_da_get_block_prep);
	if (ret < 0) {
		folio_unlock(folio);
		folio_put(folio);
		/*
		 * ext4_block_write_begin may have instantiated a few blocks
		 * outside i_size.  Trim these off again. Don't need
		 * i_size_read because we hold inode lock.
		 */
		if (pos + len > inode->i_size)
			ext4_truncate_failed_write(inode);

		if (ret == -ENOSPC &&
		    ext4_should_retry_alloc(inode->i_sb, &retries))
			goto retry;
		return ret;
	}

	*foliop = folio;
	return ret;
}

/*
 * Check if we should update i_disksize
 * when write to the end of file but not require block allocation
 *
 * @offset is a byte offset within @folio; the buffer covering it is
 * inspected.  Returns 1 if that buffer is mapped and neither delayed nor
 * unwritten, 0 otherwise.
 */
static int ext4_da_should_update_i_disksize(struct folio *folio,
					    unsigned long offset)
{
	struct buffer_head *bh;
	struct inode *inode = folio->mapping->host;
	unsigned int idx;
	int i;

	bh = folio_buffers(folio);
	idx = offset >> inode->i_blkbits;

	/* Step along the folio's buffer list to the buffer at index idx */
	for (i = 0; i < idx; i++)
		bh = bh->b_this_page;

	if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
		return 0;
	return 1;
}
3228 * 3229 * Since we are holding inode lock, we are sure i_disksize <= 3230 * i_size. We also know that if i_disksize < i_size, there are 3231 * delalloc writes pending in the range up to i_size. If the end of 3232 * the current write is <= i_size, there's no need to touch 3233 * i_disksize since writeback will push i_disksize up to i_size 3234 * eventually. If the end of the current write is > i_size and 3235 * inside an allocated block which ext4_da_should_update_i_disksize() 3236 * checked, we need to update i_disksize here as certain 3237 * ext4_writepages() paths not allocating blocks and update i_disksize. 3238 */ 3239 if (new_i_size > inode->i_size) { 3240 unsigned long end; 3241 3242 i_size_write(inode, new_i_size); 3243 end = offset_in_folio(folio, new_i_size - 1); 3244 if (copied && ext4_da_should_update_i_disksize(folio, end)) { 3245 ext4_update_i_disksize(inode, new_i_size); 3246 disksize_changed = true; 3247 } 3248 } 3249 3250 folio_unlock(folio); 3251 folio_put(folio); 3252 3253 if (pos > old_size) { 3254 pagecache_isize_extended(inode, old_size, pos); 3255 zero_len = pos - old_size; 3256 } 3257 3258 if (!disksize_changed && !zero_len) 3259 return copied; 3260 3261 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 3262 if (IS_ERR(handle)) 3263 return PTR_ERR(handle); 3264 if (zero_len) 3265 ext4_zero_partial_blocks(handle, inode, old_size, zero_len); 3266 ext4_mark_inode_dirty(handle, inode); 3267 ext4_journal_stop(handle); 3268 3269 return copied; 3270 } 3271 3272 static int ext4_da_write_end(const struct kiocb *iocb, 3273 struct address_space *mapping, 3274 loff_t pos, unsigned len, unsigned copied, 3275 struct folio *folio, void *fsdata) 3276 { 3277 struct inode *inode = mapping->host; 3278 int write_mode = (int)(unsigned long)fsdata; 3279 3280 if (write_mode == FALL_BACK_TO_NONDELALLOC) 3281 return ext4_write_end(iocb, mapping, pos, 3282 len, copied, folio, fsdata); 3283 3284 trace_ext4_da_write_end(inode, pos, len, copied); 3285 3286 if 
(write_mode != CONVERT_INLINE_DATA && 3287 ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && 3288 ext4_has_inline_data(inode)) 3289 return ext4_write_inline_data_end(inode, pos, len, copied, 3290 folio); 3291 3292 if (unlikely(copied < len) && !folio_test_uptodate(folio)) 3293 copied = 0; 3294 3295 return ext4_da_do_write_end(mapping, pos, len, copied, folio); 3296 } 3297 3298 /* 3299 * Force all delayed allocation blocks to be allocated for a given inode. 3300 */ 3301 int ext4_alloc_da_blocks(struct inode *inode) 3302 { 3303 trace_ext4_alloc_da_blocks(inode); 3304 3305 if (!EXT4_I(inode)->i_reserved_data_blocks) 3306 return 0; 3307 3308 /* 3309 * We do something simple for now. The filemap_flush() will 3310 * also start triggering a write of the data blocks, which is 3311 * not strictly speaking necessary. However, to do otherwise 3312 * would require replicating code paths in: 3313 * 3314 * ext4_writepages() -> 3315 * write_cache_pages() ---> (via passed in callback function) 3316 * __mpage_da_writepage() --> 3317 * mpage_add_bh_to_extent() 3318 * mpage_da_map_blocks() 3319 * 3320 * The problem is that write_cache_pages(), located in 3321 * mm/page-writeback.c, marks pages clean in preparation for 3322 * doing I/O, which is not desirable if we're not planning on 3323 * doing I/O at all. 3324 * 3325 * We could call write_cache_pages(), and then redirty all of 3326 * the pages by calling redirty_page_for_writepage() but that 3327 * would be ugly in the extreme. So instead we would need to 3328 * replicate parts of the code in the above functions, 3329 * simplifying them because we wouldn't actually intend to 3330 * write out the pages, but rather only collect contiguous 3331 * logical block extents, call the multi-block allocator, and 3332 * then update the buffer heads with the block allocations. 
3333 * 3334 * For now, though, we'll cheat by calling filemap_flush(), 3335 * which will map the blocks, and start the I/O, but not 3336 * actually wait for the I/O to complete. 3337 */ 3338 return filemap_flush(inode->i_mapping); 3339 } 3340 3341 /* 3342 * bmap() is special. It gets used by applications such as lilo and by 3343 * the swapper to find the on-disk block of a specific piece of data. 3344 * 3345 * Naturally, this is dangerous if the block concerned is still in the 3346 * journal. If somebody makes a swapfile on an ext4 data-journaling 3347 * filesystem and enables swap, then they may get a nasty shock when the 3348 * data getting swapped to that swapfile suddenly gets overwritten by 3349 * the original zero's written out previously to the journal and 3350 * awaiting writeback in the kernel's buffer cache. 3351 * 3352 * So, if we see any bmap calls here on a modified, data-journaled file, 3353 * take extra steps to flush any blocks which might be in the cache. 3354 */ 3355 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 3356 { 3357 struct inode *inode = mapping->host; 3358 sector_t ret = 0; 3359 3360 inode_lock_shared(inode); 3361 /* 3362 * We can get here for an inline file via the FIBMAP ioctl 3363 */ 3364 if (ext4_has_inline_data(inode)) 3365 goto out; 3366 3367 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 3368 (test_opt(inode->i_sb, DELALLOC) || 3369 ext4_should_journal_data(inode))) { 3370 /* 3371 * With delalloc or journalled data we want to sync the file so 3372 * that we can make sure we allocate blocks for file and data 3373 * is in place for the user to see it 3374 */ 3375 filemap_write_and_wait(mapping); 3376 } 3377 3378 ret = iomap_bmap(mapping, block, &ext4_iomap_ops); 3379 3380 out: 3381 inode_unlock_shared(inode); 3382 return ret; 3383 } 3384 3385 static void ext4_invalidate_folio(struct folio *folio, size_t offset, 3386 size_t length) 3387 { 3388 trace_ext4_invalidate_folio(folio, offset, length); 3389 
3390 /* No journalling happens on data buffers when this function is used */ 3391 WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio))); 3392 3393 block_invalidate_folio(folio, offset, length); 3394 } 3395 3396 static int __ext4_journalled_invalidate_folio(struct folio *folio, 3397 size_t offset, size_t length) 3398 { 3399 journal_t *journal = EXT4_JOURNAL(folio->mapping->host); 3400 3401 trace_ext4_journalled_invalidate_folio(folio, offset, length); 3402 3403 /* 3404 * If it's a full truncate we just forget about the pending dirtying 3405 */ 3406 if (offset == 0 && length == folio_size(folio)) 3407 folio_clear_checked(folio); 3408 3409 return jbd2_journal_invalidate_folio(journal, folio, offset, length); 3410 } 3411 3412 /* Wrapper for aops... */ 3413 static void ext4_journalled_invalidate_folio(struct folio *folio, 3414 size_t offset, 3415 size_t length) 3416 { 3417 WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0); 3418 } 3419 3420 static bool ext4_release_folio(struct folio *folio, gfp_t wait) 3421 { 3422 struct inode *inode = folio->mapping->host; 3423 journal_t *journal = EXT4_JOURNAL(inode); 3424 3425 trace_ext4_release_folio(inode, folio); 3426 3427 /* Page has dirty journalled data -> cannot release */ 3428 if (folio_test_checked(folio)) 3429 return false; 3430 if (journal) 3431 return jbd2_journal_try_to_free_buffers(journal, folio); 3432 else 3433 return try_to_free_buffers(folio); 3434 } 3435 3436 static bool ext4_inode_datasync_dirty(struct inode *inode) 3437 { 3438 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 3439 3440 if (journal) { 3441 if (jbd2_transaction_committed(journal, 3442 EXT4_I(inode)->i_datasync_tid)) 3443 return false; 3444 if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) 3445 return !list_empty(&EXT4_I(inode)->i_fc_list); 3446 return true; 3447 } 3448 3449 /* Any metadata buffers to write? 
*/ 3450 if (!list_empty(&inode->i_mapping->i_private_list)) 3451 return true; 3452 return inode_state_read_once(inode) & I_DIRTY_DATASYNC; 3453 } 3454 3455 static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, 3456 struct ext4_map_blocks *map, loff_t offset, 3457 loff_t length, unsigned int flags) 3458 { 3459 u8 blkbits = inode->i_blkbits; 3460 3461 /* 3462 * Writes that span EOF might trigger an I/O size update on completion, 3463 * so consider them to be dirty for the purpose of O_DSYNC, even if 3464 * there is no other metadata changes being made or are pending. 3465 */ 3466 iomap->flags = 0; 3467 if (ext4_inode_datasync_dirty(inode) || 3468 offset + length > i_size_read(inode)) 3469 iomap->flags |= IOMAP_F_DIRTY; 3470 3471 if (map->m_flags & EXT4_MAP_NEW) 3472 iomap->flags |= IOMAP_F_NEW; 3473 3474 /* HW-offload atomics are always used */ 3475 if (flags & IOMAP_ATOMIC) 3476 iomap->flags |= IOMAP_F_ATOMIC_BIO; 3477 3478 if (flags & IOMAP_DAX) 3479 iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; 3480 else 3481 iomap->bdev = inode->i_sb->s_bdev; 3482 iomap->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); 3483 iomap->length = EXT4_LBLK_TO_B(inode, map->m_len); 3484 3485 if ((map->m_flags & EXT4_MAP_MAPPED) && 3486 !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3487 iomap->flags |= IOMAP_F_MERGED; 3488 3489 /* 3490 * Flags passed to ext4_map_blocks() for direct I/O writes can result 3491 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits 3492 * set. In order for any allocated unwritten extents to be converted 3493 * into written extents correctly within the ->end_io() handler, we 3494 * need to ensure that the iomap->type is set appropriately. Hence, the 3495 * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has 3496 * been set first. 
3497 */ 3498 if (map->m_flags & EXT4_MAP_UNWRITTEN) { 3499 iomap->type = IOMAP_UNWRITTEN; 3500 iomap->addr = (u64) map->m_pblk << blkbits; 3501 if (flags & IOMAP_DAX) 3502 iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; 3503 } else if (map->m_flags & EXT4_MAP_MAPPED) { 3504 iomap->type = IOMAP_MAPPED; 3505 iomap->addr = (u64) map->m_pblk << blkbits; 3506 if (flags & IOMAP_DAX) 3507 iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; 3508 } else if (map->m_flags & EXT4_MAP_DELAYED) { 3509 iomap->type = IOMAP_DELALLOC; 3510 iomap->addr = IOMAP_NULL_ADDR; 3511 } else { 3512 iomap->type = IOMAP_HOLE; 3513 iomap->addr = IOMAP_NULL_ADDR; 3514 } 3515 } 3516 3517 static int ext4_map_blocks_atomic_write_slow(handle_t *handle, 3518 struct inode *inode, struct ext4_map_blocks *map) 3519 { 3520 ext4_lblk_t m_lblk = map->m_lblk; 3521 unsigned int m_len = map->m_len; 3522 unsigned int mapped_len = 0, m_flags = 0; 3523 ext4_fsblk_t next_pblk = 0; 3524 bool check_next_pblk = false; 3525 int ret = 0; 3526 3527 WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb)); 3528 3529 /* 3530 * This is a slow path in case of mixed mapping. We use 3531 * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single 3532 * contiguous mapped mapping. This will ensure any unwritten or hole 3533 * regions within the requested range is zeroed out and we return 3534 * a single contiguous mapped extent. 3535 */ 3536 m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; 3537 3538 do { 3539 ret = ext4_map_blocks(handle, inode, map, m_flags); 3540 if (ret < 0 && ret != -ENOSPC) 3541 goto out_err; 3542 /* 3543 * This should never happen, but let's return an error code to 3544 * avoid an infinite loop in here. 3545 */ 3546 if (ret == 0) { 3547 ret = -EFSCORRUPTED; 3548 ext4_warning_inode(inode, 3549 "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d", 3550 m_flags, ret); 3551 goto out_err; 3552 } 3553 /* 3554 * With bigalloc we should never get ENOSPC nor discontiguous 3555 * physical extents. 
3556 */ 3557 if ((check_next_pblk && next_pblk != map->m_pblk) || 3558 ret == -ENOSPC) { 3559 ext4_warning_inode(inode, 3560 "Non-contiguous allocation detected: expected %llu, got %llu, " 3561 "or ext4_map_blocks() returned out of space ret: %d", 3562 next_pblk, map->m_pblk, ret); 3563 ret = -EFSCORRUPTED; 3564 goto out_err; 3565 } 3566 next_pblk = map->m_pblk + map->m_len; 3567 check_next_pblk = true; 3568 3569 mapped_len += map->m_len; 3570 map->m_lblk += map->m_len; 3571 map->m_len = m_len - mapped_len; 3572 } while (mapped_len < m_len); 3573 3574 /* 3575 * We might have done some work in above loop, so we need to query the 3576 * start of the physical extent, based on the origin m_lblk and m_len. 3577 * Let's also ensure we were able to allocate the required range for 3578 * mixed mapping case. 3579 */ 3580 map->m_lblk = m_lblk; 3581 map->m_len = m_len; 3582 map->m_flags = 0; 3583 3584 ret = ext4_map_blocks(handle, inode, map, 3585 EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF); 3586 if (ret != m_len) { 3587 ext4_warning_inode(inode, 3588 "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n", 3589 m_lblk, m_len, ret); 3590 ret = -EINVAL; 3591 } 3592 return ret; 3593 3594 out_err: 3595 /* reset map before returning an error */ 3596 map->m_lblk = m_lblk; 3597 map->m_len = m_len; 3598 map->m_flags = 0; 3599 return ret; 3600 } 3601 3602 /* 3603 * ext4_map_blocks_atomic: Helper routine to ensure the entire requested 3604 * range in @map [lblk, lblk + len) is one single contiguous extent with no 3605 * mixed mappings. 3606 * 3607 * We first use m_flags passed to us by our caller (ext4_iomap_alloc()). 3608 * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying 3609 * physical extent for the requested range does not have a single contiguous 3610 * mapping type i.e. (Hole, Mapped, or Unwritten) throughout. 
3611 * In that case we will loop over the requested range to allocate and zero out 3612 * the unwritten / holes in between, to get a single mapped extent from 3613 * [m_lblk, m_lblk + m_len). Note that this is only possible because we know 3614 * this can be called only with bigalloc enabled filesystem where the underlying 3615 * cluster is already allocated. This avoids allocating discontiguous extents 3616 * in the slow path due to multiple calls to ext4_map_blocks(). 3617 * The slow path is mostly non-performance critical path, so it should be ok to 3618 * loop using ext4_map_blocks() with appropriate flags to allocate & zero the 3619 * underlying short holes/unwritten extents within the requested range. 3620 */ 3621 static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode, 3622 struct ext4_map_blocks *map, int m_flags, 3623 bool *force_commit) 3624 { 3625 ext4_lblk_t m_lblk = map->m_lblk; 3626 unsigned int m_len = map->m_len; 3627 int ret = 0; 3628 3629 WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb)); 3630 3631 ret = ext4_map_blocks(handle, inode, map, m_flags); 3632 if (ret < 0 || ret == m_len) 3633 goto out; 3634 /* 3635 * This is a mixed mapping case where we were not able to allocate 3636 * a single contiguous extent. In that case let's reset requested 3637 * mapping and call the slow path. 3638 */ 3639 map->m_lblk = m_lblk; 3640 map->m_len = m_len; 3641 map->m_flags = 0; 3642 3643 /* 3644 * slow path means we have mixed mapping, that means we will need 3645 * to force txn commit. 
3646 */ 3647 *force_commit = true; 3648 return ext4_map_blocks_atomic_write_slow(handle, inode, map); 3649 out: 3650 return ret; 3651 } 3652 3653 static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, 3654 unsigned int flags) 3655 { 3656 handle_t *handle; 3657 int ret, dio_credits, m_flags = 0, retries = 0; 3658 bool force_commit = false; 3659 3660 /* 3661 * Trim the mapping request to the maximum value that we can map at 3662 * once for direct I/O. 3663 */ 3664 if (map->m_len > DIO_MAX_BLOCKS) 3665 map->m_len = DIO_MAX_BLOCKS; 3666 3667 /* 3668 * journal credits estimation for atomic writes. We call 3669 * ext4_map_blocks(), to find if there could be a mixed mapping. If yes, 3670 * then let's assume the no. of pextents required can be m_len i.e. 3671 * every alternate block can be unwritten and hole. 3672 */ 3673 if (flags & IOMAP_ATOMIC) { 3674 unsigned int orig_mlen = map->m_len; 3675 3676 ret = ext4_map_blocks(NULL, inode, map, 0); 3677 if (ret < 0) 3678 return ret; 3679 if (map->m_len < orig_mlen) { 3680 map->m_len = orig_mlen; 3681 dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, 3682 map->m_len); 3683 } else { 3684 dio_credits = ext4_chunk_trans_blocks(inode, 3685 map->m_len); 3686 } 3687 } else { 3688 dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); 3689 } 3690 3691 retry: 3692 /* 3693 * Either we allocate blocks and then don't get an unwritten extent, so 3694 * in that case we have reserved enough credits. Or, the blocks are 3695 * already allocated and unwritten. In that case, the extent conversion 3696 * fits into the credits as well. 3697 */ 3698 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); 3699 if (IS_ERR(handle)) 3700 return PTR_ERR(handle); 3701 3702 /* 3703 * DAX and direct I/O are the only two operations that are currently 3704 * supported with IOMAP_WRITE. 
3705 */ 3706 WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); 3707 if (flags & IOMAP_DAX) 3708 m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; 3709 /* 3710 * We use i_size instead of i_disksize here because delalloc writeback 3711 * can complete at any point during the I/O and subsequently push the 3712 * i_disksize out to i_size. This could be beyond where direct I/O is 3713 * happening and thus expose allocated blocks to direct I/O reads. 3714 */ 3715 else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode)) 3716 m_flags = EXT4_GET_BLOCKS_CREATE; 3717 else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3718 m_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; 3719 3720 if (flags & IOMAP_ATOMIC) 3721 ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, 3722 &force_commit); 3723 else 3724 ret = ext4_map_blocks(handle, inode, map, m_flags); 3725 3726 /* 3727 * We cannot fill holes in indirect tree based inodes as that could 3728 * expose stale data in the case of a crash. Use the magic error code 3729 * to fallback to buffered I/O. 3730 */ 3731 if (!m_flags && !ret) 3732 ret = -ENOTBLK; 3733 3734 ext4_journal_stop(handle); 3735 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3736 goto retry; 3737 3738 /* 3739 * Force commit the current transaction if the allocation spans a mixed 3740 * mapping range. This ensures any pending metadata updates (like 3741 * unwritten to written extents conversion) in this range are in 3742 * consistent state with the file data blocks, before performing the 3743 * actual write I/O. If the commit fails, the whole I/O must be aborted 3744 * to prevent any possible torn writes. 
3745 */ 3746 if (ret > 0 && force_commit) { 3747 int ret2; 3748 3749 ret2 = ext4_force_commit(inode->i_sb); 3750 if (ret2) 3751 return ret2; 3752 } 3753 3754 return ret; 3755 } 3756 3757 3758 static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 3759 unsigned flags, struct iomap *iomap, struct iomap *srcmap) 3760 { 3761 int ret; 3762 struct ext4_map_blocks map; 3763 u8 blkbits = inode->i_blkbits; 3764 unsigned int orig_mlen; 3765 3766 if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) 3767 return -EINVAL; 3768 3769 if (WARN_ON_ONCE(ext4_has_inline_data(inode))) 3770 return -ERANGE; 3771 3772 /* 3773 * Calculate the first and last logical blocks respectively. 3774 */ 3775 map.m_lblk = offset >> blkbits; 3776 map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, 3777 EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; 3778 orig_mlen = map.m_len; 3779 3780 if (flags & IOMAP_WRITE) { 3781 /* 3782 * We check here if the blocks are already allocated, then we 3783 * don't need to start a journal txn and we can directly return 3784 * the mapping information. This could boost performance 3785 * especially in multi-threaded overwrite requests. 3786 */ 3787 if (offset + length <= i_size_read(inode)) { 3788 ret = ext4_map_blocks(NULL, inode, &map, 0); 3789 /* 3790 * For DAX we convert extents to initialized ones before 3791 * copying the data, otherwise we do it after I/O so 3792 * there's no need to call into ext4_iomap_alloc(). 3793 */ 3794 if ((map.m_flags & EXT4_MAP_MAPPED) || 3795 (!(flags & IOMAP_DAX) && 3796 (map.m_flags & EXT4_MAP_UNWRITTEN))) { 3797 /* 3798 * For atomic writes the entire requested 3799 * length should be mapped. 
3800 */ 3801 if (ret == orig_mlen || 3802 (!(flags & IOMAP_ATOMIC) && ret > 0)) 3803 goto out; 3804 } 3805 map.m_len = orig_mlen; 3806 } 3807 ret = ext4_iomap_alloc(inode, &map, flags); 3808 } else { 3809 ret = ext4_map_blocks(NULL, inode, &map, 0); 3810 } 3811 3812 if (ret < 0) 3813 return ret; 3814 out: 3815 /* 3816 * When inline encryption is enabled, sometimes I/O to an encrypted file 3817 * has to be broken up to guarantee DUN contiguity. Handle this by 3818 * limiting the length of the mapping returned. 3819 */ 3820 map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); 3821 3822 /* 3823 * Before returning to iomap, let's ensure the allocated mapping 3824 * covers the entire requested length for atomic writes. 3825 */ 3826 if (flags & IOMAP_ATOMIC) { 3827 if (map.m_len < (length >> blkbits)) { 3828 WARN_ON_ONCE(1); 3829 return -EINVAL; 3830 } 3831 } 3832 ext4_set_iomap(inode, iomap, &map, offset, length, flags); 3833 3834 return 0; 3835 } 3836 3837 const struct iomap_ops ext4_iomap_ops = { 3838 .iomap_begin = ext4_iomap_begin, 3839 }; 3840 3841 static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, 3842 loff_t length, unsigned int flags, 3843 struct iomap *iomap, struct iomap *srcmap) 3844 { 3845 int ret; 3846 struct ext4_map_blocks map; 3847 u8 blkbits = inode->i_blkbits; 3848 3849 if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) 3850 return -EINVAL; 3851 3852 if (ext4_has_inline_data(inode)) { 3853 ret = ext4_inline_data_iomap(inode, iomap); 3854 if (ret != -EAGAIN) { 3855 if (ret == 0 && offset >= iomap->length) 3856 ret = -ENOENT; 3857 return ret; 3858 } 3859 } 3860 3861 /* 3862 * Calculate the first and last logical block respectively. 3863 */ 3864 map.m_lblk = offset >> blkbits; 3865 map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, 3866 EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; 3867 3868 /* 3869 * Fiemap callers may call for offset beyond s_bitmap_maxbytes. 
3870 * So handle it here itself instead of querying ext4_map_blocks(). 3871 * Since ext4_map_blocks() will warn about it and will return 3872 * -EIO error. 3873 */ 3874 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 3875 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3876 3877 if (offset >= sbi->s_bitmap_maxbytes) { 3878 map.m_flags = 0; 3879 goto set_iomap; 3880 } 3881 } 3882 3883 ret = ext4_map_blocks(NULL, inode, &map, 0); 3884 if (ret < 0) 3885 return ret; 3886 set_iomap: 3887 ext4_set_iomap(inode, iomap, &map, offset, length, flags); 3888 3889 return 0; 3890 } 3891 3892 const struct iomap_ops ext4_iomap_report_ops = { 3893 .iomap_begin = ext4_iomap_begin_report, 3894 }; 3895 3896 /* 3897 * For data=journal mode, folio should be marked dirty only when it was 3898 * writeably mapped. When that happens, it was already attached to the 3899 * transaction and marked as jbddirty (we take care of this in 3900 * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings 3901 * so we should have nothing to do here, except for the case when someone 3902 * had the page pinned and dirtied the page through this pin (e.g. by doing 3903 * direct IO to it). In that case we'd need to attach buffers here to the 3904 * transaction but we cannot due to lock ordering. We cannot just dirty the 3905 * folio and leave attached buffers clean, because the buffers' dirty state is 3906 * "definitive". We cannot just set the buffers dirty or jbddirty because all 3907 * the journalling code will explode. So what we do is to mark the folio 3908 * "pending dirty" and next time ext4_writepages() is called, attach buffers 3909 * to the transaction appropriately. 
3910 */ 3911 static bool ext4_journalled_dirty_folio(struct address_space *mapping, 3912 struct folio *folio) 3913 { 3914 WARN_ON_ONCE(!folio_buffers(folio)); 3915 if (folio_maybe_dma_pinned(folio)) 3916 folio_set_checked(folio); 3917 return filemap_dirty_folio(mapping, folio); 3918 } 3919 3920 static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) 3921 { 3922 WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio)); 3923 WARN_ON_ONCE(!folio_buffers(folio)); 3924 return block_dirty_folio(mapping, folio); 3925 } 3926 3927 static int ext4_iomap_swap_activate(struct swap_info_struct *sis, 3928 struct file *file, sector_t *span) 3929 { 3930 return iomap_swapfile_activate(sis, file, span, 3931 &ext4_iomap_report_ops); 3932 } 3933 3934 static const struct address_space_operations ext4_aops = { 3935 .read_folio = ext4_read_folio, 3936 .readahead = ext4_readahead, 3937 .writepages = ext4_writepages, 3938 .write_begin = ext4_write_begin, 3939 .write_end = ext4_write_end, 3940 .dirty_folio = ext4_dirty_folio, 3941 .bmap = ext4_bmap, 3942 .invalidate_folio = ext4_invalidate_folio, 3943 .release_folio = ext4_release_folio, 3944 .migrate_folio = buffer_migrate_folio, 3945 .is_partially_uptodate = block_is_partially_uptodate, 3946 .error_remove_folio = generic_error_remove_folio, 3947 .swap_activate = ext4_iomap_swap_activate, 3948 }; 3949 3950 static const struct address_space_operations ext4_journalled_aops = { 3951 .read_folio = ext4_read_folio, 3952 .readahead = ext4_readahead, 3953 .writepages = ext4_writepages, 3954 .write_begin = ext4_write_begin, 3955 .write_end = ext4_journalled_write_end, 3956 .dirty_folio = ext4_journalled_dirty_folio, 3957 .bmap = ext4_bmap, 3958 .invalidate_folio = ext4_journalled_invalidate_folio, 3959 .release_folio = ext4_release_folio, 3960 .migrate_folio = buffer_migrate_folio_norefs, 3961 .is_partially_uptodate = block_is_partially_uptodate, 3962 .error_remove_folio = generic_error_remove_folio, 3963 
.swap_activate = ext4_iomap_swap_activate, 3964 }; 3965 3966 static const struct address_space_operations ext4_da_aops = { 3967 .read_folio = ext4_read_folio, 3968 .readahead = ext4_readahead, 3969 .writepages = ext4_writepages, 3970 .write_begin = ext4_da_write_begin, 3971 .write_end = ext4_da_write_end, 3972 .dirty_folio = ext4_dirty_folio, 3973 .bmap = ext4_bmap, 3974 .invalidate_folio = ext4_invalidate_folio, 3975 .release_folio = ext4_release_folio, 3976 .migrate_folio = buffer_migrate_folio, 3977 .is_partially_uptodate = block_is_partially_uptodate, 3978 .error_remove_folio = generic_error_remove_folio, 3979 .swap_activate = ext4_iomap_swap_activate, 3980 }; 3981 3982 static const struct address_space_operations ext4_dax_aops = { 3983 .writepages = ext4_dax_writepages, 3984 .dirty_folio = noop_dirty_folio, 3985 .bmap = ext4_bmap, 3986 .swap_activate = ext4_iomap_swap_activate, 3987 }; 3988 3989 void ext4_set_aops(struct inode *inode) 3990 { 3991 switch (ext4_inode_journal_mode(inode)) { 3992 case EXT4_INODE_ORDERED_DATA_MODE: 3993 case EXT4_INODE_WRITEBACK_DATA_MODE: 3994 break; 3995 case EXT4_INODE_JOURNAL_DATA_MODE: 3996 inode->i_mapping->a_ops = &ext4_journalled_aops; 3997 return; 3998 default: 3999 BUG(); 4000 } 4001 if (IS_DAX(inode)) 4002 inode->i_mapping->a_ops = &ext4_dax_aops; 4003 else if (test_opt(inode->i_sb, DELALLOC)) 4004 inode->i_mapping->a_ops = &ext4_da_aops; 4005 else 4006 inode->i_mapping->a_ops = &ext4_aops; 4007 } 4008 4009 /* 4010 * Here we can't skip an unwritten buffer even though it usually reads zero 4011 * because it might have data in pagecache (eg, if called from ext4_zero_range, 4012 * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a 4013 * racing writeback can come later and flush the stale pagecache to disk. 
*/
/*
 * __ext4_block_zero_page_range() - zero part of one block in the page cache.
 * @handle:  journal handle, used for journalled data and ordered-data tracking
 * @mapping: address space of the file
 * @from:    file offset where zeroing starts
 * @length:  number of bytes to zero
 *
 * Gets (creating if necessary) the folio covering @from, finds the buffer
 * head that contains @from, makes sure it is mapped and up to date, zeroes
 * the range and dirties the buffer.
 *
 * Returns 0 on success, including the cases where nothing is zeroed because
 * the buffer is freed or turns out to be a hole, or a negative errno.
 */
static int __ext4_block_zero_page_range(handle_t *handle,
		struct address_space *mapping, loff_t from, loff_t length)
{
	unsigned int offset, blocksize, pos;
	ext4_lblk_t iblock;
	struct inode *inode = mapping->host;
	struct buffer_head *bh;
	struct folio *folio;
	int err = 0;

	/* Returns the folio locked and with a reference; both dropped at unlock: */
	folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT,
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
				    mapping_gfp_constraint(mapping, ~__GFP_FS));
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	blocksize = inode->i_sb->s_blocksize;

	/* Logical block number of the first block in this folio. */
	iblock = EXT4_PG_TO_LBLK(inode, folio->index);

	bh = folio_buffers(folio);
	if (!bh)
		bh = create_empty_buffers(folio, blocksize, 0);

	/* Find the buffer that contains "offset" */
	offset = offset_in_folio(folio, from);
	pos = blocksize;
	while (offset >= pos) {
		bh = bh->b_this_page;
		iblock++;
		pos += blocksize;
	}
	if (buffer_freed(bh)) {
		BUFFER_TRACE(bh, "freed: skip");
		goto unlock;
	}
	if (!buffer_mapped(bh)) {
		BUFFER_TRACE(bh, "unmapped");
		/*
		 * The return value is not checked; the result is judged
		 * solely by whether the buffer ends up mapped.
		 */
		ext4_get_block(inode, iblock, bh, 0);
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh)) {
			BUFFER_TRACE(bh, "still unmapped");
			goto unlock;
		}
	}

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (folio_test_uptodate(folio))
		set_buffer_uptodate(bh);

	if (!buffer_uptodate(bh)) {
		err = ext4_read_bh_lock(bh, 0, true);
		if (err)
			goto unlock;
		if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
			/* We expect the key to be set. */
			BUG_ON(!fscrypt_has_encryption_key(inode));
			err = fscrypt_decrypt_pagecache_blocks(folio,
							       blocksize,
							       bh_offset(bh));
			if (err) {
				/* Contents are still ciphertext; don't trust them. */
				clear_buffer_uptodate(bh);
				goto unlock;
			}
		}
	}
	if (ext4_should_journal_data(inode)) {
		BUFFER_TRACE(bh, "get write access");
		err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
						    EXT4_JTR_NONE);
		if (err)
			goto unlock;
	}
	folio_zero_range(folio, offset, length);
	BUFFER_TRACE(bh, "zeroed end of block");

	if (ext4_should_journal_data(inode)) {
		err = ext4_dirty_journalled_data(handle, bh);
	} else {
		mark_buffer_dirty(bh);
		/*
		 * Only the written block requires ordered data to prevent
		 * exposing stale data.
		 */
		if (!buffer_unwritten(bh) && !buffer_delay(bh) &&
		    ext4_should_order_data(inode))
			err = ext4_jbd2_inode_add_write(handle, inode, from,
					length);
	}

unlock:
	folio_unlock(folio);
	folio_put(folio);
	return err;
}

/*
 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
 * starting from file offset 'from'.  The range to be zero'd must
 * be contained within one block.
If the specified range exceeds
 * the end of the block it will be shortened to end of the block
 * that corresponds to 'from'
 */
static int ext4_block_zero_page_range(handle_t *handle,
		struct address_space *mapping, loff_t from, loff_t length)
{
	struct inode *inode = mapping->host;
	unsigned int bsize = inode->i_sb->s_blocksize;
	/* Bytes between 'from' and the end of the block that contains it. */
	unsigned int room = bsize - (from & (bsize - 1));

	/* Clamp a negative or over-long request to the end of the block. */
	if (length < 0 || length > room)
		length = room;

	/* DAX inodes are zeroed through the iomap DAX helper instead. */
	if (!IS_DAX(inode))
		return __ext4_block_zero_page_range(handle, mapping, from,
						    length);

	return dax_zero_range(inode, from, length, NULL, &ext4_iomap_ops);
}

/*
 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
 * up to the end of the block which corresponds to `from'.
 * This is required during truncate. We need to physically zero the tail end
 * of that block so it doesn't yield old data if the file is later grown.
*/
static int ext4_block_truncate_page(handle_t *handle,
		struct address_space *mapping, loff_t from)
{
	unsigned length;
	unsigned blocksize;
	struct inode *inode = mapping->host;

	/* If we are processing an encrypted inode during orphan list handling */
	if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
		return 0;

	blocksize = i_blocksize(inode);
	/* Distance from 'from' to the end of its block. */
	length = blocksize - (from & (blocksize - 1));

	return ext4_block_zero_page_range(handle, mapping, from, length);
}

/*
 * ext4_zero_partial_blocks() - zero the partially covered blocks at the
 * edges of the byte range [lstart, lstart + length).
 *
 * If the range lies inside a single block and does not cover it entirely,
 * that one sub-range is zeroed.  Otherwise the partial block at the start
 * (if lstart is not block aligned) and the partial block at the end (if the
 * last byte is not the last byte of a block) are zeroed separately.
 *
 * Returns 0 on success or the error from ext4_block_zero_page_range().
 */
int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
			     loff_t lstart, loff_t length)
{
	struct super_block *sb = inode->i_sb;
	struct address_space *mapping = inode->i_mapping;
	unsigned partial_start, partial_end;
	ext4_fsblk_t start, end;
	loff_t byte_end = (lstart + length - 1);
	int err = 0;

	/* Offsets of the first and last byte within their blocks. */
	partial_start = lstart & (sb->s_blocksize - 1);
	partial_end = byte_end & (sb->s_blocksize - 1);

	/* Block numbers holding the first and last byte. */
	start = lstart >> sb->s_blocksize_bits;
	end = byte_end >> sb->s_blocksize_bits;

	/* Handle partial zero within the single block */
	if (start == end &&
	    (partial_start || (partial_end != sb->s_blocksize - 1))) {
		err = ext4_block_zero_page_range(handle, mapping,
						 lstart, length);
		return err;
	}
	/* Handle partial zero out on the start of the range */
	if (partial_start) {
		/* The callee shortens the length to the end of the block. */
		err = ext4_block_zero_page_range(handle, mapping,
						 lstart, sb->s_blocksize);
		if (err)
			return err;
	}
	/* Handle partial zero out on the end of the range */
	if (partial_end != sb->s_blocksize - 1)
		err = ext4_block_zero_page_range(handle, mapping,
						 byte_end - partial_end,
						 partial_end + 1);
	return err;
}

/*
 * ext4_can_truncate() - report whether @inode is of a type that can be
 * truncated.
 *
 * Returns 1 for regular files and directories, 1 for symlinks that are not
 * fast symlinks, and 0 for everything else.
 */
int ext4_can_truncate(struct inode *inode)
{
	if (S_ISREG(inode->i_mode))
		return 1;
	if (S_ISDIR(inode->i_mode))
		return 1;
	if (S_ISLNK(inode->i_mode))
		return !ext4_inode_is_fast_symlink(inode);
	return 0;
}

/*
 * We have to make sure i_disksize gets properly updated before we truncate
 * page cache due to hole punching or zero range. Otherwise i_disksize update
 * can get lost as it may have been postponed to submission of writeback but
 * that will never happen if we remove the folio containing i_size from the
 * page cache. Also if we punch hole within i_size but above i_disksize,
 * following ext4_page_mkwrite() may mistakenly allocate written blocks over
 * the hole and thus introduce allocated blocks beyond i_disksize which is
 * not allowed (e2fsck would complain in case of crash).
 *
 * Returns 0 if no update is needed or the update succeeded, otherwise a
 * negative errno from starting the handle or dirtying the inode.
 */
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
				      loff_t len)
{
	handle_t *handle;
	int ret;

	loff_t size = i_size_read(inode);

	WARN_ON(!inode_is_locked(inode));
	if (offset > size)
		return 0;

	/* Target is min(i_size, end of the affected range). */
	if (offset + len < size)
		size = offset + len;
	if (EXT4_I(inode)->i_disksize >= size)
		return 0;

	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	ext4_update_i_disksize(inode, size);
	ret = ext4_mark_inode_dirty(handle, inode);
	ext4_journal_stop(handle);

	return ret;
}

/*
 * Helper for ext4_truncate_page_cache_block_range(): if [start, end)
 * contains at least one complete block, look up the folio holding 'start'
 * and run folio_mkclean() on it, re-marking the folio dirty when
 * folio_mkclean() returns non-zero.  A missing folio is silently ignored.
 */
static inline void ext4_truncate_folio(struct inode *inode,
				       loff_t start, loff_t end)
{
	unsigned long blocksize = i_blocksize(inode);
	struct folio *folio;

	/* Nothing to be done if no complete block needs to be truncated. */
	if (round_up(start, blocksize) >= round_down(end, blocksize))
		return;

	folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
	if (IS_ERR(folio))
		return;

	if (folio_mkclean(folio))
		folio_mark_dirty(folio);
	folio_unlock(folio);
	folio_put(folio);
}

/*
 * ext4_truncate_page_cache_block_range() - drop page cache for the byte
 * range [start, end).
 *
 * Returns 0 on success or the error from writing out journalled data.
 */
int ext4_truncate_page_cache_block_range(struct inode *inode,
					 loff_t start, loff_t end)
{
	unsigned long blocksize = i_blocksize(inode);
	int ret;

	/*
	 * For journalled data we need to write (and checkpoint) pages
	 * before discarding page cache to avoid inconsistent data on disk
	 * in case of crash before freeing or unwritten converting trans
	 * is committed.
	 */
	if (ext4_should_journal_data(inode)) {
		ret = filemap_write_and_wait_range(inode->i_mapping, start,
						   end - 1);
		if (ret)
			return ret;
		goto truncate_pagecache;
	}

	/*
	 * If the block size is less than the page size, the file's mapped
	 * blocks within one page could be freed or converted to unwritten.
	 * So it's necessary to remove writable userspace mappings, and then
	 * ext4_page_mkwrite() can be called during subsequent write access
	 * to these partial folios.
	 */
	if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
	    blocksize < PAGE_SIZE && start < inode->i_size) {
		loff_t page_boundary = round_up(start, PAGE_SIZE);

		/* Partial page at the head of the range ... */
		ext4_truncate_folio(inode, start, min(page_boundary, end));
		/* ... and, if the range crosses a page, the one at its tail. */
		if (end > page_boundary)
			ext4_truncate_folio(inode,
					    round_down(end, PAGE_SIZE), end);
	}

truncate_pagecache:
	/* truncate_pagecache_range() takes an inclusive last byte. */
	truncate_pagecache_range(inode, start, end - 1);
	return 0;
}

/*
 * Wait callback handed to dax_break_layout_inode(): drop the mapping's
 * invalidate lock, call schedule(), then take the lock again.
 */
static void ext4_wait_dax_page(struct inode *inode)
{
	filemap_invalidate_unlock(inode->i_mapping);
	schedule();
	filemap_invalidate_lock(inode->i_mapping);
}

/*
 * ext4_break_layouts() - wrapper around dax_break_layout_inode().
 *
 * The caller must hold the mapping's invalidate_lock; otherwise a one-time
 * warning is emitted and -EINVAL is returned.
 */
int ext4_break_layouts(struct inode *inode)
{
	if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
		return -EINVAL;

	return dax_break_layout_inode(inode, ext4_wait_dax_page);
}

/*
 * ext4_punch_hole: punches a hole in a file by releasing the blocks
 * associated with the given offset and length
 *
 * @file:   File whose inode is operated on
 * @offset: The offset where the hole will begin
 * @length: The length of the hole
 *
 * The inode lock is expected to be held (a one-time warning is emitted
 * otherwise).
 *
 * Returns: 0 on success or negative on failure
 */

int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;
	ext4_lblk_t start_lblk, end_lblk;
	loff_t max_end = sb->s_maxbytes;
	loff_t end = offset + length;
	handle_t *handle;
	unsigned int credits;
	int ret;

	trace_ext4_punch_hole(inode, offset, length, 0);
	WARN_ON_ONCE(!inode_is_locked(inode));

	/*
	 * For indirect-block based inodes, make sure that the hole within
	 * one block before last range.
	 */
	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;

	/* No need to punch hole beyond i_size */
	if (offset >= inode->i_size || offset >= max_end)
		return 0;

	/*
	 * If the hole extends beyond i_size, set the hole to end after
	 * the block that contains i_size to save pointless tail block zeroing.
	 */
	if (end >= inode->i_size)
		end = round_up(inode->i_size, sb->s_blocksize);
	if (end > max_end)
		end = max_end;
	/* From here on 'length' reflects the clamped range. */
	length = end - offset;

	/*
	 * Attach jinode to inode for jbd2 if we do any zeroing of partial
	 * block.
	 */
	if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
		ret = ext4_inode_attach_jinode(inode);
		if (ret < 0)
			return ret;
	}


	ret = ext4_update_disksize_before_punch(inode, offset, length);
	if (ret)
		return ret;

	/* Now release the pages and zero block aligned part of pages*/
	ret = ext4_truncate_page_cache_block_range(inode, offset, end);
	if (ret)
		return ret;

	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		credits = ext4_chunk_trans_extent(inode, 2);
	else
		credits = ext4_blocks_for_truncate(inode);
	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		ext4_std_error(sb, ret);
		return ret;
	}

	ret = ext4_zero_partial_blocks(handle, inode, offset, length);
	if (ret)
		goto out_handle;

	/* If there are blocks to remove, do it */
	start_lblk = EXT4_B_TO_LBLK(inode, offset);
	end_lblk = end >> inode->i_blkbits;

	if (end_lblk > start_lblk) {
		ext4_lblk_t hole_len = end_lblk - start_lblk;

		ext4_fc_track_inode(handle, inode);
		ext4_check_map_extents_env(inode);
		down_write(&EXT4_I(inode)->i_data_sem);
		ext4_discard_preallocations(inode);

		ext4_es_remove_extent(inode, start_lblk, hole_len);

		/*
		 * Note the differing last arguments: the extent variant is
		 * passed end_lblk - 1, the indirect variant end_lblk.
		 */
		if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
			ret = ext4_ext_remove_space(inode, start_lblk,
						    end_lblk - 1);
		else
			ret = ext4_ind_remove_space(handle, inode, start_lblk,
						    end_lblk);
		if (ret) {
			up_write(&EXT4_I(inode)->i_data_sem);
			goto out_handle;
		}

		/* Record the punched range as a hole in the extent status tree. */
		ext4_es_insert_extent(inode, start_lblk, hole_len, ~0,
				      EXTENT_STATUS_HOLE, 0);
		up_write(&EXT4_I(inode)->i_data_sem);
	}
	ext4_fc_track_range(handle, inode, start_lblk, end_lblk);

	ret = ext4_mark_inode_dirty(handle, inode);
	if (unlikely(ret))
		goto out_handle;

	ext4_update_inode_fsync_trans(handle, inode, 1);
	if (IS_SYNC(inode))
		ext4_handle_sync(handle);
out_handle:
	ext4_journal_stop(handle);
	return ret;
}

/*
 * ext4_inode_attach_jinode() - make sure @inode has a jbd2_inode attached.
 *
 * Does nothing when one is already attached or the filesystem has no
 * journal.  The jbd2_inode is allocated before taking i_lock and installed
 * under it; if another task installed one first, the spare allocation is
 * freed.
 *
 * Returns 0 on success, or -ENOMEM if the allocation failed and no
 * jbd2_inode had been attached in the meantime.
 */
int ext4_inode_attach_jinode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct jbd2_inode *jinode;

	if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
		return 0;

	jinode = jbd2_alloc_inode(GFP_KERNEL);
	spin_lock(&inode->i_lock);
	/* Re-check under the lock; the check above was unlocked. */
	if (!ei->jinode) {
		if (!jinode) {
			spin_unlock(&inode->i_lock);
			return -ENOMEM;
		}
		jbd2_journal_init_jbd_inode(jinode, inode);
		/*
		 * Publish ->jinode only after it is fully initialized so that
		 * readers never observe a partially initialized jbd2_inode.
		 */
		smp_wmb();
		WRITE_ONCE(ei->jinode, jinode);
		/* Ownership moved to the inode; nothing left to free. */
		jinode = NULL;
	}
	spin_unlock(&inode->i_lock);
	if (unlikely(jinode != NULL))
		jbd2_free_inode(jinode);
	return 0;
}

/*
 * ext4_truncate()
 *
 * We block out ext4_get_block() block instantiations across the entire
 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
 * simultaneously on behalf of the same inode.
*
 * As we work through the truncate and commit bits of it to the journal there
 * is one core, guiding principle: the file's tree must always be consistent on
 * disk.  We must be able to restart the truncate after a crash.
 *
 * The file's tree may be transiently inconsistent in memory (although it
 * probably isn't), but whenever we close off and commit a journal transaction,
 * the contents of (the filesystem + the journal) must be consistent and
 * restartable.  It's pretty simple, really: bottom up, right to left (although
 * left-to-right works OK too).
 *
 * Note that at recovery time, journal replay occurs *before* the restart of
 * truncate against the orphan inode list.
 *
 * The committed inode has the new, desired i_size (which is the same as
 * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
 * that this inode's truncate did not complete and it will again call
 * ext4_truncate() to have another go.  So there will be instantiated blocks
 * to the right of the truncation point in a crashed ext4 filesystem.  But
 * that's fine - as long as they are linked from the inode, the post-crash
 * ext4_truncate() run will find them and release them.
 *
 * Returns 0 on success or a negative errno.  An error from the final
 * ext4_mark_inode_dirty() is reported only if no earlier error occurred.
 */
int ext4_truncate(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	unsigned int credits;
	int err = 0, err2;
	handle_t *handle;
	struct address_space *mapping = inode->i_mapping;

	/*
	 * There is a possibility that we're either freeing the inode
	 * or it's a completely new inode. In those cases we might not
	 * have i_rwsem locked because it's not necessary.
	 */
	if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING)))
		WARN_ON(!inode_is_locked(inode));
	trace_ext4_truncate_enter(inode);

	if (!ext4_can_truncate(inode))
		goto out_trace;

	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);

	if (ext4_has_inline_data(inode)) {
		int has_inline = 1;

		err = ext4_inline_data_truncate(inode, &has_inline);
		/* Done if it failed or the data is still inline afterwards. */
		if (err || has_inline)
			goto out_trace;
	}

	/* If we zero-out tail of the page, we have to create jinode for jbd2 */
	if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
		err = ext4_inode_attach_jinode(inode);
		if (err)
			goto out_trace;
	}

	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		credits = ext4_chunk_trans_extent(inode, 1);
	else
		credits = ext4_blocks_for_truncate(inode);

	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
	if (IS_ERR(handle)) {
		err = PTR_ERR(handle);
		goto out_trace;
	}

	/* NOTE(review): the return value of the tail zeroing is not checked. */
	if (inode->i_size & (inode->i_sb->s_blocksize - 1))
		ext4_block_truncate_page(handle, mapping, inode->i_size);

	/*
	 * We add the inode to the orphan list, so that if this
	 * truncate spans multiple transactions, and we crash, we will
	 * resume the truncate when the filesystem recovers.  It also
	 * marks the inode dirty, to catch the new size.
	 *
	 * Implication: the file must always be in a sane, consistent
	 * truncatable state while each transaction commits.
	 */
	err = ext4_orphan_add(handle, inode);
	if (err)
		goto out_stop;

	ext4_fc_track_inode(handle, inode);
	ext4_check_map_extents_env(inode);

	down_write(&EXT4_I(inode)->i_data_sem);
	ext4_discard_preallocations(inode);

	/* Only the extent-based truncate reports an error here. */
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		err = ext4_ext_truncate(handle, inode);
	else
		ext4_ind_truncate(handle, inode);

	up_write(&ei->i_data_sem);
	if (err)
		goto out_stop;

	if (IS_SYNC(inode))
		ext4_handle_sync(handle);

out_stop:
	/*
	 * If this was a simple ftruncate() and the file will remain alive,
	 * then we need to clear up the orphan record which we created above.
	 * However, if this was a real unlink then we were called by
	 * ext4_evict_inode(), and we allow that function to clean up the
	 * orphan info for us.
	 */
	if (inode->i_nlink)
		ext4_orphan_del(handle, inode);

	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
	err2 = ext4_mark_inode_dirty(handle, inode);
	if (unlikely(err2 && !err))
		err = err2;
	ext4_journal_stop(handle);

out_trace:
	trace_ext4_truncate_exit(inode);
	return err;
}

/*
 * Read the inode's i_version for writing to disk.  Inodes flagged
 * EXT4_EA_INODE_FL use the raw accessor; all others use the regular one.
 */
static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
{
	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
		return inode_peek_iversion_raw(inode);
	else
		return inode_peek_iversion(inode);
}

/*
 * ext4_inode_blocks_set() - encode inode->i_blocks into the raw inode.
 *
 * Counts that fit in 32 bits are stored in i_blocks_lo alone.  Larger counts
 * need the huge_file feature: up to 48 bits they are stored as-is across
 * i_blocks_lo/i_blocks_high; beyond that the count is shifted right by
 * (i_blkbits - 9) and EXT4_INODE_HUGE_FILE is set on the inode.
 *
 * Returns 0 on success, or -EFSCORRUPTED if the count needs more than
 * 32 bits but the huge_file feature is not enabled.
 */
static int ext4_inode_blocks_set(struct ext4_inode *raw_inode,
				 struct ext4_inode_info *ei)
{
	struct inode *inode = &(ei->vfs_inode);
	u64 i_blocks = READ_ONCE(inode->i_blocks);
	struct super_block *sb = inode->i_sb;

	if (i_blocks <= ~0U) {
		/*
		 * i_blocks can be represented in a 32 bit variable
		 * as multiple of 512 bytes
		 */
		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
		raw_inode->i_blocks_high = 0;
		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
		return 0;
	}

	/*
	 * This should never happen since sb->s_maxbytes should not have
	 * allowed this, sb->s_maxbytes was set according to the huge_file
	 * feature in ext4_fill_super().
	 */
	if (!ext4_has_feature_huge_file(sb))
		return -EFSCORRUPTED;

	if (i_blocks <= 0xffffffffffffULL) {
		/*
		 * i_blocks can be represented in a 48 bit variable
		 * as multiple of 512 bytes
		 */
		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
	} else {
		ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
		/* i_block is stored in file system block size */
		i_blocks = i_blocks >> (inode->i_blkbits - 9);
		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
	}
	return 0;
}

/*
 * ext4_fill_raw_inode() - serialize the in-memory inode into @raw_inode.
 *
 * Every field is written even when an error is detected, and the inode
 * checksum is set last.  Returns 0, or -EFSCORRUPTED if the block count
 * could not be encoded or a non-default project ID is present without the
 * project feature (the first error wins).
 */
static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	uid_t i_uid;
	gid_t i_gid;
	projid_t i_projid;
	int block;
	int err;

	err = ext4_inode_blocks_set(raw_inode, ei);

	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
	i_uid = i_uid_read(inode);
	i_gid = i_gid_read(inode);
	i_projid = from_kprojid(&init_user_ns, ei->i_projid);
	if (!(test_opt(inode->i_sb, NO_UID32))) {
		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
		/*
		 * Fix up interoperability with old kernels. Otherwise,
		 * old inodes get re-used with the upper 16 bits of the
		 * uid/gid intact.
		 */
		if (ei->i_dtime && !ext4_inode_orphan_tracked(inode)) {
			raw_inode->i_uid_high = 0;
			raw_inode->i_gid_high = 0;
		} else {
			raw_inode->i_uid_high =
				cpu_to_le16(high_16_bits(i_uid));
			raw_inode->i_gid_high =
				cpu_to_le16(high_16_bits(i_gid));
		}
	} else {
		/* 16-bit IDs only: fold the IDs down and clear the high halves. */
		raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
		raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
		raw_inode->i_uid_high = 0;
		raw_inode->i_gid_high = 0;
	}
	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);

	EXT4_INODE_SET_CTIME(inode, raw_inode);
	EXT4_INODE_SET_MTIME(inode, raw_inode);
	EXT4_INODE_SET_ATIME(inode, raw_inode);
	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);

	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
	raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
		raw_inode->i_file_acl_high =
			cpu_to_le16(ei->i_file_acl >> 32);
	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
	ext4_isize_set(raw_inode, ei->i_disksize);

	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
		/* Device nodes keep the device number in i_block[0] or [1]. */
		if (old_valid_dev(inode->i_rdev)) {
			raw_inode->i_block[0] =
				cpu_to_le32(old_encode_dev(inode->i_rdev));
			raw_inode->i_block[1] = 0;
		} else {
			raw_inode->i_block[0] = 0;
			raw_inode->i_block[1] =
				cpu_to_le32(new_encode_dev(inode->i_rdev));
			raw_inode->i_block[2] = 0;
		}
	} else if (!ext4_has_inline_data(inode)) {
		for (block = 0; block < EXT4_N_BLOCKS; block++)
			raw_inode->i_block[block] = ei->i_data[block];
	}

	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
		u64 ivers = ext4_inode_peek_iversion(inode);

		/* Low 32 bits always; high 32 bits only if the field fits. */
		raw_inode->i_disk_version = cpu_to_le32(ivers);
		if (ei->i_extra_isize) {
			if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
				raw_inode->i_version_hi =
					cpu_to_le32(ivers >> 32);
			raw_inode->i_extra_isize =
				cpu_to_le16(ei->i_extra_isize);
		}
	}

	/* Keep an earlier error if there was one. */
	if (i_projid != EXT4_DEF_PROJID &&
	    !ext4_has_feature_project(inode->i_sb))
		err = err ?: -EFSCORRUPTED;

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
	    EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
		raw_inode->i_projid = cpu_to_le32(i_projid);

	ext4_inode_csum_set(inode, raw_inode, ei);
	return err;
}

/*
 * ext4_get_inode_loc returns with an extra refcount against the inode's
 * underlying buffer_head on success. If we pass 'inode' and it does not
 * have in-inode xattr, we have all inode data in memory that is needed
 * to recreate the on-disk version of this inode.
 *
 * On success iloc->bh, iloc->block_group and iloc->offset are filled in and
 * 0 is returned.  On failure iloc->bh is NULL and a negative errno is
 * returned; when reading the inode table block fails (-EIO) and @ret_block
 * is non-NULL, the failing block number is stored there.
 */
static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
				struct inode *inode, struct ext4_iloc *iloc,
				ext4_fsblk_t *ret_block)
{
	struct ext4_group_desc	*gdp;
	struct buffer_head	*bh;
	ext4_fsblk_t		block;
	struct blk_plug		plug;
	int			inodes_per_block, inode_offset;

	iloc->bh = NULL;
	if (ino < EXT4_ROOT_INO ||
	    ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
		return -EFSCORRUPTED;

	iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
	if (!gdp)
		return -EIO;

	/*
	 * Figure out the offset within the block group inode table
	 */
	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
	inode_offset = ((ino - 1) %
			EXT4_INODES_PER_GROUP(sb));
	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);

	block = ext4_inode_table(sb, gdp);
	if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
	    (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
		ext4_error(sb, "Invalid inode table block %llu in "
			   "block_group %u", block, iloc->block_group);
		return -EFSCORRUPTED;
	}
	block += (inode_offset / inodes_per_block);

	bh = sb_getblk(sb, block);
	if (unlikely(!bh))
		return -ENOMEM;
	if (ext4_buffer_uptodate(bh))
		goto has_buffer;

	lock_buffer(bh);
	if (ext4_buffer_uptodate(bh)) {
		/* Someone brought it uptodate while we waited */
		unlock_buffer(bh);
		goto has_buffer;
	}

	/*
	 * If we have all information of the inode in memory and this
	 * is the only valid inode in the block, we need not read the
	 * block.
	 */
	if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
		struct buffer_head *bitmap_bh;
		int i, start;

		/* First inode index (within the group) of this table block. */
		start = inode_offset & ~(inodes_per_block - 1);

		/* Is the inode bitmap in cache? */
		bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
		if (unlikely(!bitmap_bh))
			goto make_io;

		/*
		 * If the inode bitmap isn't in cache then the
		 * optimisation may end up performing two reads instead
		 * of one, so skip it.
		 */
		if (!buffer_uptodate(bitmap_bh)) {
			brelse(bitmap_bh);
			goto make_io;
		}
		/* Look for any other in-use inode sharing this block. */
		for (i = start; i < start + inodes_per_block; i++) {
			if (i == inode_offset)
				continue;
			if (ext4_test_bit(i, bitmap_bh->b_data))
				break;
		}
		brelse(bitmap_bh);
		if (i == start + inodes_per_block) {
			struct ext4_inode *raw_inode =
				(struct ext4_inode *) (bh->b_data + iloc->offset);

			/* all other inodes are free, so skip I/O */
			memset(bh->b_data, 0, bh->b_size);
			if (!ext4_test_inode_state(inode, EXT4_STATE_NEW))
				ext4_fill_raw_inode(inode, raw_inode);
			set_buffer_uptodate(bh);
			unlock_buffer(bh);
			goto has_buffer;
		}
	}

make_io:
	/*
	 * If we need to do any I/O, try to pre-readahead extra
	 * blocks from the inode table.
	 */
	blk_start_plug(&plug);
	if (EXT4_SB(sb)->s_inode_readahead_blks) {
		ext4_fsblk_t b, end, table;
		unsigned num;
		__u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;

		table = ext4_inode_table(sb, gdp);
		/* s_inode_readahead_blks is always a power of 2 */
		b = block & ~((ext4_fsblk_t) ra_blks - 1);
		if (table > b)
			b = table;
		end = b + ra_blks;
		num = EXT4_INODES_PER_GROUP(sb);
		if (ext4_has_group_desc_csum(sb))
			num -= ext4_itable_unused_count(sb, gdp);
		/* Clamp readahead to the part of the table counted in 'num'. */
		table += num / inodes_per_block;
		if (end > table)
			end = table;
		while (b <= end)
			ext4_sb_breadahead_unmovable(sb, b++);
	}

	/*
	 * There are other valid inodes in the buffer, this inode
	 * has in-inode xattrs, or we don't have this inode in memory.
	 * Read the block from disk.
	 */
	trace_ext4_load_inode(sb, ino);
	ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL,
			    ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO));
	blk_finish_plug(&plug);
	wait_on_buffer(bh);
	if (!buffer_uptodate(bh)) {
		if (ret_block)
			*ret_block = block;
		brelse(bh);
		return -EIO;
	}
has_buffer:
	iloc->bh = bh;
	return 0;
}

/*
 * Locate @inode's on-disk location, passing a NULL inode to
 * __ext4_get_inode_loc() so the skip-I/O shortcut is never taken.
 * A read failure (-EIO) is reported through ext4_error_inode_block().
 */
static int __ext4_get_inode_loc_noinmem(struct inode *inode,
					struct ext4_iloc *iloc)
{
	ext4_fsblk_t err_blk = 0;
	int ret;

	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
					&err_blk);

	if (ret == -EIO)
		ext4_error_inode_block(inode, err_blk, EIO,
					"unable to read itable block");

	return ret;
}

/*
 * ext4_get_inode_loc() - locate @inode's on-disk location, passing the
 * inode itself so __ext4_get_inode_loc() may avoid reading the block.
 * A read failure (-EIO) is reported through ext4_error_inode_block().
 */
int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
	ext4_fsblk_t err_blk = 0;
	int ret;

	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
					&err_blk);

	if (ret == -EIO)
		ext4_error_inode_block(inode, err_blk, EIO,
					"unable to read itable block");

	return ret;
}

/*
 * ext4_get_fc_inode_loc() - locate inode number @ino by number only.
 *
 * Calls __ext4_get_inode_loc() with no in-memory inode and no error-block
 * output, so no error is reported here; the return value is passed through.
 */
int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
			  struct ext4_iloc *iloc)
{
	return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL);
}

/*
 * Decide whether DAX should be enabled for @inode.
 *
 * DAX is refused when the DAX_NEVER option is set, for non-regular files,
 * for data-journalled, inline-data, encrypted or verity inodes, and when the
 * EXT4_FLAGS_BDEV_IS_DAX bit is not set.  Otherwise it is enabled if the
 * DAX_ALWAYS option is set or the inode carries EXT4_INODE_DAX.
 */
static bool ext4_should_enable_dax(struct inode *inode)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	if (test_opt2(inode->i_sb, DAX_NEVER))
		return false;
	if (!S_ISREG(inode->i_mode))
		return false;
	if (ext4_should_journal_data(inode))
		return false;
	if (ext4_has_inline_data(inode))
		return false;
	if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
		return false;
	if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
		return false;
	if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
		return false;
	if (test_opt(inode->i_sb, DAX_ALWAYS))
		return true;

	return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
}

/*
 * ext4_set_inode_flags() - translate ext4 inode flags into VFS S_* flags.
 * @inode: inode to update
 * @init:  true when called for initial setup; only then is S_DAX newly
 *         enabled (a one-time warning fires if the inode is already DAX)
 */
void ext4_set_inode_flags(struct inode *inode, bool init)
{
	unsigned int flags = EXT4_I(inode)->i_flags;
	unsigned int new_fl = 0;

	WARN_ON_ONCE(IS_DAX(inode) && init);

	if (flags & EXT4_SYNC_FL)
		new_fl |= S_SYNC;
	if (flags & EXT4_APPEND_FL)
		new_fl |= S_APPEND;
	if (flags & EXT4_IMMUTABLE_FL)
		new_fl |= S_IMMUTABLE;
	if (flags & EXT4_NOATIME_FL)
		new_fl |= S_NOATIME;
	if (flags & EXT4_DIRSYNC_FL)
		new_fl |= S_DIRSYNC;

	/* Because of the way inode_set_flags() works we must preserve S_DAX
	 * here if already set. */
	new_fl |= (inode->i_flags & S_DAX);
	if (init && ext4_should_enable_dax(inode))
		new_fl |= S_DAX;

	if (flags & EXT4_ENCRYPT_FL)
		new_fl |= S_ENCRYPTED;
	if (flags & EXT4_CASEFOLD_FL)
		new_fl |= S_CASEFOLD;
	if (flags & EXT4_VERITY_FL)
		new_fl |= S_VERITY;
	inode_set_flags(inode, new_fl,
			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
			S_ENCRYPTED|S_CASEFOLD|S_VERITY);
}

/*
 * Decode the block count stored in @raw_inode.
 *
 * With the huge_file feature the 48-bit i_blocks_high:i_blocks_lo value is
 * used, shifted left by (i_blkbits - 9) when the inode has
 * EXT4_INODE_HUGE_FILE set.  Without the feature only i_blocks_lo is used.
 */
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
				  struct ext4_inode_info *ei)
{
	blkcnt_t i_blocks ;
	struct inode *inode = &(ei->vfs_inode);
	struct super_block *sb = inode->i_sb;

	if (ext4_has_feature_huge_file(sb)) {
		/* we are using combined 48 bit field */
		i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
					le32_to_cpu(raw_inode->i_blocks_lo);
		if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
			/* i_blocks represent file system block size */
			return i_blocks  << (inode->i_blkbits - 9);
		} else {
			return i_blocks;
		}
	} else {
		return le32_to_cpu(raw_inode->i_blocks_lo);
	}
}

/*
 * Inspect the area after the fixed inode fields plus i_extra_isize.  If the
 * inode has xattr space and the xattr magic is found there, validate the
 * in-inode xattrs, set EXT4_STATE_XATTR and look for inline data; otherwise
 * reset i_inline_off to 0.
 *
 * Returns 0 on success or the error from the xattr / inline-data checks.
 */
static inline int ext4_iget_extra_inode(struct inode *inode,
					 struct ext4_inode *raw_inode,
					 struct ext4_inode_info *ei)
{
	__le32 *magic = (void *)raw_inode +
			EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;

	if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
	    *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
		int err;

		err = xattr_check_inode(inode, IHDR(inode, raw_inode),
					ITAIL(inode, raw_inode));
		if (err)
			return err;

		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
		err = ext4_find_inline_data_nolock(inode);
		if (!err && ext4_has_inline_data(inode))
			ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
		return err;
	} else
		EXT4_I(inode)->i_inline_off = 0;
	return 0;
}

/*
 * ext4_get_projid() - fetch the project ID of @inode into @projid.
 *
 * Returns 0 on success, or -EOPNOTSUPP (leaving @projid untouched) when the
 * filesystem lacks the project feature.
 */
int ext4_get_projid(struct inode *inode, kprojid_t *projid)
{
	if (!ext4_has_feature_project(inode->i_sb))
		return -EOPNOTSUPP;
	*projid = EXT4_I(inode)->i_projid;
	return 0;
}

/*
 * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of
 * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag
 * set.
 */
static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
{
	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
		inode_set_iversion_raw(inode, val);
	else
		inode_set_iversion_queried(inode, val);
}

/*
 * check_igot_inode() - sanity-check an inode against the iget @flags.
 *
 * Verifies that the EA_INODE flag on the inode matches whether an EA inode
 * was requested, that a requested EA inode has no extended attributes of
 * its own, and that a bad inode is only returned when EXT4_IGET_BAD is set.
 *
 * Returns 0 if the inode is acceptable, -ESTALE for an unexpected EA inode
 * reached through a file handle, and -EFSCORRUPTED (after reporting through
 * ext4_error_inode() with @function/@line) otherwise.
 */
static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
			    const char *function, unsigned int line)
{
	const char *err_str;

	if (flags & EXT4_IGET_EA_INODE) {
		if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
			err_str = "missing EA_INODE flag";
			goto error;
		}
		if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
		    EXT4_I(inode)->i_file_acl) {
			err_str = "ea_inode with extended attributes";
			goto error;
		}
	} else {
		if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
			/*
			 * open_by_handle_at() could provide an old inode number
			 * that has since been reused for an ea_inode; this does
			 * not indicate filesystem corruption
			 */
			if (flags & EXT4_IGET_HANDLE)
				return -ESTALE;
			err_str = "unexpected EA_INODE flag";
			goto error;
		}
	}
	if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
		err_str = "unexpected bad inode w/o EXT4_IGET_BAD";
		goto error;
	}
	return 0;

error:
	ext4_error_inode(inode, function, line, 0, "%s", err_str);
	return -EFSCORRUPTED;
}

/*
 * ext4_set_inode_mapping_order() - set the folio order range of the
 * inode's mapping from the superblock's min/max folio orders.
 *
 * Nothing is done when the maximum order is 0, or when the minimum order is
 * 0 and the inode is not a regular file.  For inodes with
 * EXT4_INODE_JOURNAL_DATA the maximum is lowered to the minimum.
 */
void ext4_set_inode_mapping_order(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	u16 min_order, max_order;

	max_order = EXT4_SB(sb)->s_max_folio_order;
	if (!max_order)
		return;

	min_order = EXT4_SB(sb)->s_min_folio_order;
	if (!min_order && !S_ISREG(inode->i_mode))
		return;

	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
		max_order = min_order;

	mapping_set_folio_order_range(inode->i_mapping, min_order, max_order);
}

struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
			  ext4_iget_flags flags, const char *function,
			  unsigned int line)
{
	struct ext4_iloc iloc;
	struct ext4_inode *raw_inode;
	struct ext4_inode_info *ei;
	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
	struct inode *inode;
	journal_t *journal = EXT4_SB(sb)->s_journal;
	long ret;
	loff_t size;
	int block;
	uid_t i_uid;
	gid_t i_gid;
	projid_t i_projid;

	if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) ||
	    (ino < EXT4_ROOT_INO) ||
	    (ino > le32_to_cpu(es->s_inodes_count))) {
		if (flags & EXT4_IGET_HANDLE)
			return ERR_PTR(-ESTALE);
		__ext4_error(sb, function, line, false, EFSCORRUPTED, 0,
			     "inode #%lu: comm %s: iget: illegal inode #",
			     ino, current->comm);
		return ERR_PTR(-EFSCORRUPTED);
	}

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode_state_read_once(inode) & I_NEW)) {
		ret = check_igot_inode(inode, flags, function, line);
		if (ret) {
			iput(inode);
			return ERR_PTR(ret);
		}
		return inode;
	}

	ei = EXT4_I(inode);
	iloc.bh = NULL;

	ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
	if (ret < 0)
		goto bad_inode;
	raw_inode = ext4_raw_inode(&iloc);

	if ((flags & EXT4_IGET_HANDLE) &&
	    (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
		ret = -ESTALE;
		goto bad_inode;
	}

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
		if (EXT4_GOOD_OLD_INODE_SIZE +
ei->i_extra_isize > 5190 EXT4_INODE_SIZE(inode->i_sb) || 5191 (ei->i_extra_isize & 3)) { 5192 ext4_error_inode(inode, function, line, 0, 5193 "iget: bad extra_isize %u " 5194 "(inode size %u)", 5195 ei->i_extra_isize, 5196 EXT4_INODE_SIZE(inode->i_sb)); 5197 ret = -EFSCORRUPTED; 5198 goto bad_inode; 5199 } 5200 } else 5201 ei->i_extra_isize = 0; 5202 5203 /* Precompute checksum seed for inode metadata */ 5204 if (ext4_has_feature_metadata_csum(sb)) { 5205 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5206 __u32 csum; 5207 __le32 inum = cpu_to_le32(inode->i_ino); 5208 __le32 gen = raw_inode->i_generation; 5209 csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, 5210 sizeof(inum)); 5211 ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); 5212 } 5213 5214 if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || 5215 ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) && 5216 (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) { 5217 ext4_error_inode_err(inode, function, line, 0, 5218 EFSBADCRC, "iget: checksum invalid"); 5219 ret = -EFSBADCRC; 5220 goto bad_inode; 5221 } 5222 5223 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 5224 i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 5225 i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 5226 if (ext4_has_feature_project(sb) && 5227 EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && 5228 EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) 5229 i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); 5230 else 5231 i_projid = EXT4_DEF_PROJID; 5232 5233 if (!(test_opt(inode->i_sb, NO_UID32))) { 5234 i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 5235 i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 5236 } 5237 i_uid_write(inode, i_uid); 5238 i_gid_write(inode, i_gid); 5239 ei->i_projid = make_kprojid(&init_user_ns, i_projid); 5240 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); 5241 5242 ei->i_inline_off = 0; 5243 ei->i_dir_start_lookup = 0; 5244 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 5245 /* We now have 
enough fields to check if the inode was active or not. 5246 * This is needed because nfsd might try to access dead inodes 5247 * the test is that same one that e2fsck uses 5248 * NeilBrown 1999oct15 5249 */ 5250 if (inode->i_nlink == 0) { 5251 if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || 5252 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && 5253 ino != EXT4_BOOT_LOADER_INO) { 5254 /* this inode is deleted or unallocated */ 5255 if (flags & EXT4_IGET_SPECIAL) { 5256 ext4_error_inode(inode, function, line, 0, 5257 "iget: special inode unallocated"); 5258 ret = -EFSCORRUPTED; 5259 } else 5260 ret = -ESTALE; 5261 goto bad_inode; 5262 } 5263 /* The only unlinked inodes we let through here have 5264 * valid i_mode and are being read by the orphan 5265 * recovery code: that's fine, we're about to complete 5266 * the process of deleting those. 5267 * OR it is the EXT4_BOOT_LOADER_INO which is 5268 * not initialized on a new filesystem. */ 5269 } 5270 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 5271 ext4_set_inode_flags(inode, true); 5272 /* Detect invalid flag combination - can't have both inline data and extents */ 5273 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && 5274 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5275 ext4_error_inode(inode, function, line, 0, 5276 "inode has both inline data and extents flags"); 5277 ret = -EFSCORRUPTED; 5278 goto bad_inode; 5279 } 5280 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 5281 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 5282 if (ext4_has_feature_64bit(sb)) 5283 ei->i_file_acl |= 5284 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 5285 inode->i_size = ext4_isize(sb, raw_inode); 5286 size = i_size_read(inode); 5287 if (size < 0 || size > ext4_get_maxbytes(inode)) { 5288 ext4_error_inode(inode, function, line, 0, 5289 "iget: bad i_size value: %lld", size); 5290 ret = -EFSCORRUPTED; 5291 goto bad_inode; 5292 } 5293 /* 5294 * If dir_index is not enabled but 
there's dir with INDEX flag set, 5295 * we'd normally treat htree data as empty space. But with metadata 5296 * checksumming that corrupts checksums so forbid that. 5297 */ 5298 if (!ext4_has_feature_dir_index(sb) && 5299 ext4_has_feature_metadata_csum(sb) && 5300 ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { 5301 ext4_error_inode(inode, function, line, 0, 5302 "iget: Dir with htree data on filesystem without dir_index feature."); 5303 ret = -EFSCORRUPTED; 5304 goto bad_inode; 5305 } 5306 ei->i_disksize = inode->i_size; 5307 #ifdef CONFIG_QUOTA 5308 ei->i_reserved_quota = 0; 5309 #endif 5310 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 5311 ei->i_block_group = iloc.block_group; 5312 ei->i_last_alloc_group = ~0; 5313 /* 5314 * NOTE! The in-memory inode i_data array is in little-endian order 5315 * even on big-endian machines: we do NOT byteswap the block numbers! 5316 */ 5317 for (block = 0; block < EXT4_N_BLOCKS; block++) 5318 ei->i_data[block] = raw_inode->i_block[block]; 5319 INIT_LIST_HEAD(&ei->i_orphan); 5320 ext4_fc_init_inode(&ei->vfs_inode); 5321 5322 /* 5323 * Set transaction id's of transactions that have to be committed 5324 * to finish f[data]sync. We set them to currently running transaction 5325 * as we cannot be sure that the inode or some of its metadata isn't 5326 * part of the transaction - the inode could have been reclaimed and 5327 * now it is reread from disk. 
5328 */ 5329 if (journal) { 5330 transaction_t *transaction; 5331 tid_t tid; 5332 5333 read_lock(&journal->j_state_lock); 5334 if (journal->j_running_transaction) 5335 transaction = journal->j_running_transaction; 5336 else 5337 transaction = journal->j_committing_transaction; 5338 if (transaction) 5339 tid = transaction->t_tid; 5340 else 5341 tid = journal->j_commit_sequence; 5342 read_unlock(&journal->j_state_lock); 5343 ei->i_sync_tid = tid; 5344 ei->i_datasync_tid = tid; 5345 } 5346 5347 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 5348 if (ei->i_extra_isize == 0) { 5349 /* The extra space is currently unused. Use it. */ 5350 BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); 5351 ei->i_extra_isize = sizeof(struct ext4_inode) - 5352 EXT4_GOOD_OLD_INODE_SIZE; 5353 } else { 5354 ret = ext4_iget_extra_inode(inode, raw_inode, ei); 5355 if (ret) 5356 goto bad_inode; 5357 } 5358 } 5359 5360 EXT4_INODE_GET_CTIME(inode, raw_inode); 5361 EXT4_INODE_GET_ATIME(inode, raw_inode); 5362 EXT4_INODE_GET_MTIME(inode, raw_inode); 5363 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 5364 5365 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { 5366 u64 ivers = le32_to_cpu(raw_inode->i_disk_version); 5367 5368 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 5369 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 5370 ivers |= 5371 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 5372 } 5373 ext4_inode_set_iversion_queried(inode, ivers); 5374 } 5375 5376 ret = 0; 5377 if (ei->i_file_acl && 5378 !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { 5379 ext4_error_inode(inode, function, line, 0, 5380 "iget: bad extended attribute block %llu", 5381 ei->i_file_acl); 5382 ret = -EFSCORRUPTED; 5383 goto bad_inode; 5384 } else if (!ext4_has_inline_data(inode)) { 5385 /* validate the block references in the inode */ 5386 if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && 5387 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 5388 
(S_ISLNK(inode->i_mode) && 5389 !ext4_inode_is_fast_symlink(inode)))) { 5390 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 5391 ret = ext4_ext_check_inode(inode); 5392 else 5393 ret = ext4_ind_check_inode(inode); 5394 } 5395 } 5396 if (ret) 5397 goto bad_inode; 5398 5399 if (S_ISREG(inode->i_mode)) { 5400 inode->i_op = &ext4_file_inode_operations; 5401 inode->i_fop = &ext4_file_operations; 5402 ext4_set_aops(inode); 5403 } else if (S_ISDIR(inode->i_mode)) { 5404 inode->i_op = &ext4_dir_inode_operations; 5405 inode->i_fop = &ext4_dir_operations; 5406 } else if (S_ISLNK(inode->i_mode)) { 5407 /* VFS does not allow setting these so must be corruption */ 5408 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { 5409 ext4_error_inode(inode, function, line, 0, 5410 "iget: immutable or append flags " 5411 "not allowed on symlinks"); 5412 ret = -EFSCORRUPTED; 5413 goto bad_inode; 5414 } 5415 if (IS_ENCRYPTED(inode)) { 5416 inode->i_op = &ext4_encrypted_symlink_inode_operations; 5417 } else if (ext4_inode_is_fast_symlink(inode)) { 5418 inode->i_op = &ext4_fast_symlink_inode_operations; 5419 5420 /* 5421 * Orphan cleanup can see inodes with i_size == 0 5422 * and i_data uninitialized. Skip size checks in 5423 * that case. This is safe because the first thing 5424 * ext4_evict_inode() does for fast symlinks is 5425 * clearing of i_data and i_size. 
5426 */ 5427 if ((EXT4_SB(sb)->s_mount_state & EXT4_ORPHAN_FS)) { 5428 if (inode->i_nlink != 0) { 5429 ext4_error_inode(inode, function, line, 0, 5430 "invalid orphan symlink nlink %d", 5431 inode->i_nlink); 5432 ret = -EFSCORRUPTED; 5433 goto bad_inode; 5434 } 5435 } else { 5436 if (inode->i_size == 0 || 5437 inode->i_size >= sizeof(ei->i_data) || 5438 strnlen((char *)ei->i_data, inode->i_size + 1) != 5439 inode->i_size) { 5440 ext4_error_inode(inode, function, line, 0, 5441 "invalid fast symlink length %llu", 5442 (unsigned long long)inode->i_size); 5443 ret = -EFSCORRUPTED; 5444 goto bad_inode; 5445 } 5446 inode_set_cached_link(inode, (char *)ei->i_data, 5447 inode->i_size); 5448 } 5449 } else { 5450 inode->i_op = &ext4_symlink_inode_operations; 5451 } 5452 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 5453 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 5454 inode->i_op = &ext4_special_inode_operations; 5455 if (raw_inode->i_block[0]) 5456 init_special_inode(inode, inode->i_mode, 5457 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 5458 else 5459 init_special_inode(inode, inode->i_mode, 5460 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5461 } else if (ino == EXT4_BOOT_LOADER_INO) { 5462 make_bad_inode(inode); 5463 } else { 5464 ret = -EFSCORRUPTED; 5465 ext4_error_inode(inode, function, line, 0, 5466 "iget: bogus i_mode (%o)", inode->i_mode); 5467 goto bad_inode; 5468 } 5469 if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { 5470 ext4_error_inode(inode, function, line, 0, 5471 "casefold flag without casefold feature"); 5472 ret = -EFSCORRUPTED; 5473 goto bad_inode; 5474 } 5475 5476 ext4_set_inode_mapping_order(inode); 5477 5478 ret = check_igot_inode(inode, flags, function, line); 5479 /* 5480 * -ESTALE here means there is nothing inherently wrong with the inode, 5481 * it's just not an inode we can return for an fhandle lookup. 
5482 */ 5483 if (ret == -ESTALE) { 5484 brelse(iloc.bh); 5485 unlock_new_inode(inode); 5486 iput(inode); 5487 return ERR_PTR(-ESTALE); 5488 } 5489 if (ret) 5490 goto bad_inode; 5491 brelse(iloc.bh); 5492 /* Initialize the "no ACL's" state for the simple cases */ 5493 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) && !ei->i_file_acl) 5494 cache_no_acl(inode); 5495 unlock_new_inode(inode); 5496 return inode; 5497 5498 bad_inode: 5499 brelse(iloc.bh); 5500 iget_failed(inode); 5501 return ERR_PTR(ret); 5502 } 5503 5504 static void __ext4_update_other_inode_time(struct super_block *sb, 5505 unsigned long orig_ino, 5506 unsigned long ino, 5507 struct ext4_inode *raw_inode) 5508 { 5509 struct inode *inode; 5510 5511 inode = find_inode_by_ino_rcu(sb, ino); 5512 if (!inode) 5513 return; 5514 5515 if (!inode_is_dirtytime_only(inode)) 5516 return; 5517 5518 spin_lock(&inode->i_lock); 5519 if (inode_is_dirtytime_only(inode)) { 5520 struct ext4_inode_info *ei = EXT4_I(inode); 5521 5522 inode_state_clear(inode, I_DIRTY_TIME); 5523 spin_unlock(&inode->i_lock); 5524 5525 spin_lock(&ei->i_raw_lock); 5526 EXT4_INODE_SET_CTIME(inode, raw_inode); 5527 EXT4_INODE_SET_MTIME(inode, raw_inode); 5528 EXT4_INODE_SET_ATIME(inode, raw_inode); 5529 ext4_inode_csum_set(inode, raw_inode, ei); 5530 spin_unlock(&ei->i_raw_lock); 5531 trace_ext4_other_inode_update_time(inode, orig_ino); 5532 return; 5533 } 5534 spin_unlock(&inode->i_lock); 5535 } 5536 5537 /* 5538 * Opportunistically update the other time fields for other inodes in 5539 * the same inode table block. 5540 */ 5541 static void ext4_update_other_inodes_time(struct super_block *sb, 5542 unsigned long orig_ino, char *buf) 5543 { 5544 unsigned long ino; 5545 int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; 5546 int inode_size = EXT4_INODE_SIZE(sb); 5547 5548 /* 5549 * Calculate the first inode in the inode table block. Inode 5550 * numbers are one-based. 
That is, the first inode in a block 5551 * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). 5552 */ 5553 ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; 5554 rcu_read_lock(); 5555 for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { 5556 if (ino == orig_ino) 5557 continue; 5558 __ext4_update_other_inode_time(sb, orig_ino, ino, 5559 (struct ext4_inode *)buf); 5560 } 5561 rcu_read_unlock(); 5562 } 5563 5564 /* 5565 * Post the struct inode info into an on-disk inode location in the 5566 * buffer-cache. This gobbles the caller's reference to the 5567 * buffer_head in the inode location struct. 5568 * 5569 * The caller must have write access to iloc->bh. 5570 */ 5571 static int ext4_do_update_inode(handle_t *handle, 5572 struct inode *inode, 5573 struct ext4_iloc *iloc) 5574 { 5575 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 5576 struct ext4_inode_info *ei = EXT4_I(inode); 5577 struct buffer_head *bh = iloc->bh; 5578 struct super_block *sb = inode->i_sb; 5579 int err; 5580 int need_datasync = 0, set_large_file = 0; 5581 5582 spin_lock(&ei->i_raw_lock); 5583 5584 /* 5585 * For fields not tracked in the in-memory inode, initialise them 5586 * to zero for new inodes. 
5587 */ 5588 if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) 5589 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 5590 5591 if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) 5592 need_datasync = 1; 5593 if (ei->i_disksize > 0x7fffffffULL) { 5594 if (!ext4_has_feature_large_file(sb) || 5595 EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) 5596 set_large_file = 1; 5597 } 5598 5599 err = ext4_fill_raw_inode(inode, raw_inode); 5600 spin_unlock(&ei->i_raw_lock); 5601 if (err) { 5602 EXT4_ERROR_INODE(inode, "corrupted inode contents"); 5603 goto out_brelse; 5604 } 5605 5606 if (inode->i_sb->s_flags & SB_LAZYTIME) 5607 ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, 5608 bh->b_data); 5609 5610 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5611 err = ext4_handle_dirty_metadata(handle, NULL, bh); 5612 if (err) 5613 goto out_error; 5614 ext4_clear_inode_state(inode, EXT4_STATE_NEW); 5615 if (set_large_file) { 5616 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); 5617 err = ext4_journal_get_write_access(handle, sb, 5618 EXT4_SB(sb)->s_sbh, 5619 EXT4_JTR_NONE); 5620 if (err) 5621 goto out_error; 5622 lock_buffer(EXT4_SB(sb)->s_sbh); 5623 ext4_set_feature_large_file(sb); 5624 ext4_superblock_csum_set(sb); 5625 unlock_buffer(EXT4_SB(sb)->s_sbh); 5626 ext4_handle_sync(handle); 5627 err = ext4_handle_dirty_metadata(handle, NULL, 5628 EXT4_SB(sb)->s_sbh); 5629 } 5630 ext4_update_inode_fsync_trans(handle, inode, need_datasync); 5631 out_error: 5632 ext4_std_error(inode->i_sb, err); 5633 out_brelse: 5634 brelse(bh); 5635 return err; 5636 } 5637 5638 /* 5639 * ext4_write_inode() 5640 * 5641 * We are called from a few places: 5642 * 5643 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. 5644 * Here, there will be no transaction running. We wait for any running 5645 * transaction to commit. 5646 * 5647 * - Within flush work (sys_sync(), kupdate and such). 
5648 * We wait on commit, if told to. 5649 * 5650 * - Within iput_final() -> write_inode_now() 5651 * We wait on commit, if told to. 5652 * 5653 * In all cases it is actually safe for us to return without doing anything, 5654 * because the inode has been copied into a raw inode buffer in 5655 * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL 5656 * writeback. 5657 * 5658 * Note that we are absolutely dependent upon all inode dirtiers doing the 5659 * right thing: they *must* call mark_inode_dirty() after dirtying info in 5660 * which we are interested. 5661 * 5662 * It would be a bug for them to not do this. The code: 5663 * 5664 * mark_inode_dirty(inode) 5665 * stuff(); 5666 * inode->i_size = expr; 5667 * 5668 * is in error because write_inode() could occur while `stuff()' is running, 5669 * and the new i_size will be lost. Plus the inode will no longer be on the 5670 * superblock's dirty inode list. 5671 */ 5672 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) 5673 { 5674 int err; 5675 5676 if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) 5677 return 0; 5678 5679 err = ext4_emergency_state(inode->i_sb); 5680 if (unlikely(err)) 5681 return err; 5682 5683 if (EXT4_SB(inode->i_sb)->s_journal) { 5684 if (ext4_journal_current_handle()) { 5685 ext4_debug("called recursively, non-PF_MEMALLOC!\n"); 5686 dump_stack(); 5687 return -EIO; 5688 } 5689 5690 /* 5691 * No need to force transaction in WB_SYNC_NONE mode. Also 5692 * ext4_sync_fs() will force the commit after everything is 5693 * written. 5694 */ 5695 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) 5696 return 0; 5697 5698 err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, 5699 EXT4_I(inode)->i_sync_tid); 5700 } else { 5701 struct ext4_iloc iloc; 5702 5703 err = __ext4_get_inode_loc_noinmem(inode, &iloc); 5704 if (err) 5705 return err; 5706 /* 5707 * sync(2) will flush the whole buffer cache. No need to do 5708 * it here separately for each inode. 
5709 */ 5710 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) 5711 sync_dirty_buffer(iloc.bh); 5712 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5713 ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, 5714 "IO error syncing inode"); 5715 err = -EIO; 5716 } 5717 brelse(iloc.bh); 5718 } 5719 return err; 5720 } 5721 5722 /* 5723 * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate 5724 * buffers that are attached to a folio straddling i_size and are undergoing 5725 * commit. In that case we have to wait for commit to finish and try again. 5726 */ 5727 static void ext4_wait_for_tail_page_commit(struct inode *inode) 5728 { 5729 unsigned offset; 5730 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 5731 tid_t commit_tid; 5732 int ret; 5733 bool has_transaction; 5734 5735 offset = inode->i_size & (PAGE_SIZE - 1); 5736 /* 5737 * If the folio is fully truncated, we don't need to wait for any commit 5738 * (and we even should not as __ext4_journalled_invalidate_folio() may 5739 * strip all buffers from the folio but keep the folio dirty which can then 5740 * confuse e.g. concurrent ext4_writepages() seeing dirty folio without 5741 * buffers). Also we don't need to wait for any commit if all buffers in 5742 * the folio remain valid. This is most beneficial for the common case of 5743 * blocksize == PAGESIZE. 
5744 */ 5745 if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) 5746 return; 5747 while (1) { 5748 struct folio *folio = filemap_lock_folio(inode->i_mapping, 5749 inode->i_size >> PAGE_SHIFT); 5750 if (IS_ERR(folio)) 5751 return; 5752 ret = __ext4_journalled_invalidate_folio(folio, offset, 5753 folio_size(folio) - offset); 5754 folio_unlock(folio); 5755 folio_put(folio); 5756 if (ret != -EBUSY) 5757 return; 5758 has_transaction = false; 5759 read_lock(&journal->j_state_lock); 5760 if (journal->j_committing_transaction) { 5761 commit_tid = journal->j_committing_transaction->t_tid; 5762 has_transaction = true; 5763 } 5764 read_unlock(&journal->j_state_lock); 5765 if (has_transaction) 5766 jbd2_log_wait_commit(journal, commit_tid); 5767 } 5768 } 5769 5770 /* 5771 * ext4_setattr() 5772 * 5773 * Called from notify_change. 5774 * 5775 * We want to trap VFS attempts to truncate the file as soon as 5776 * possible. In particular, we want to make sure that when the VFS 5777 * shrinks i_size, we put the inode on the orphan list and modify 5778 * i_disksize immediately, so that during the subsequent flushing of 5779 * dirty pages and freeing of disk blocks, we can guarantee that any 5780 * commit will leave the blocks being flushed in an unused state on 5781 * disk. (On recovery, the inode will get truncated and the blocks will 5782 * be freed, so we have a strong guarantee that no future commit will 5783 * leave these blocks visible to the user.) 5784 * 5785 * Another thing we have to assure is that if we are in ordered mode 5786 * and inode is still attached to the committing transaction, we must 5787 * we start writeout of all the dirty pages which are being truncated. 5788 * This way we are sure that all the data written in the previous 5789 * transaction are already on disk (truncate waits for pages under 5790 * writeback). 5791 * 5792 * Called with inode->i_rwsem down. 
5793 */ 5794 int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 5795 struct iattr *attr) 5796 { 5797 struct inode *inode = d_inode(dentry); 5798 int error, rc = 0; 5799 int orphan = 0; 5800 const unsigned int ia_valid = attr->ia_valid; 5801 bool inc_ivers = true; 5802 5803 error = ext4_emergency_state(inode->i_sb); 5804 if (unlikely(error)) 5805 return error; 5806 5807 if (unlikely(IS_IMMUTABLE(inode))) 5808 return -EPERM; 5809 5810 if (unlikely(IS_APPEND(inode) && 5811 (ia_valid & (ATTR_MODE | ATTR_UID | 5812 ATTR_GID | ATTR_TIMES_SET)))) 5813 return -EPERM; 5814 5815 error = setattr_prepare(idmap, dentry, attr); 5816 if (error) 5817 return error; 5818 5819 error = fscrypt_prepare_setattr(dentry, attr); 5820 if (error) 5821 return error; 5822 5823 if (is_quota_modification(idmap, inode, attr)) { 5824 error = dquot_initialize(inode); 5825 if (error) 5826 return error; 5827 } 5828 5829 if (i_uid_needs_update(idmap, attr, inode) || 5830 i_gid_needs_update(idmap, attr, inode)) { 5831 handle_t *handle; 5832 5833 /* (user+group)*(old+new) structure, inode write (sb, 5834 * inode block, ? - but truncate inode update has it) */ 5835 handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 5836 (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + 5837 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); 5838 if (IS_ERR(handle)) { 5839 error = PTR_ERR(handle); 5840 goto err_out; 5841 } 5842 5843 /* dquot_transfer() calls back ext4_get_inode_usage() which 5844 * counts xattr inode references. 
5845 */ 5846 down_read(&EXT4_I(inode)->xattr_sem); 5847 error = dquot_transfer(idmap, inode, attr); 5848 up_read(&EXT4_I(inode)->xattr_sem); 5849 5850 if (error) { 5851 ext4_journal_stop(handle); 5852 return error; 5853 } 5854 /* Update corresponding info in inode so that everything is in 5855 * one transaction */ 5856 i_uid_update(idmap, attr, inode); 5857 i_gid_update(idmap, attr, inode); 5858 error = ext4_mark_inode_dirty(handle, inode); 5859 ext4_journal_stop(handle); 5860 if (unlikely(error)) { 5861 return error; 5862 } 5863 } 5864 5865 if (attr->ia_valid & ATTR_SIZE) { 5866 handle_t *handle; 5867 loff_t oldsize = inode->i_size; 5868 loff_t old_disksize; 5869 int shrink = (attr->ia_size < inode->i_size); 5870 5871 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 5872 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5873 5874 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 5875 return -EFBIG; 5876 } 5877 } 5878 if (!S_ISREG(inode->i_mode)) { 5879 return -EINVAL; 5880 } 5881 5882 if (attr->ia_size == inode->i_size) 5883 inc_ivers = false; 5884 5885 /* 5886 * If file has inline data but new size exceeds inline capacity, 5887 * convert to extent-based storage first to prevent inconsistent 5888 * state (inline flag set but size exceeds inline capacity). 5889 */ 5890 if (ext4_has_inline_data(inode) && 5891 attr->ia_size > EXT4_I(inode)->i_inline_size) { 5892 error = ext4_convert_inline_data(inode); 5893 if (error) 5894 goto err_out; 5895 } 5896 5897 if (shrink) { 5898 if (ext4_should_order_data(inode)) { 5899 error = ext4_begin_ordered_truncate(inode, 5900 attr->ia_size); 5901 if (error) 5902 goto err_out; 5903 } 5904 /* 5905 * Blocks are going to be removed from the inode. Wait 5906 * for dio in flight. 
5907 */ 5908 inode_dio_wait(inode); 5909 } 5910 5911 filemap_invalidate_lock(inode->i_mapping); 5912 5913 rc = ext4_break_layouts(inode); 5914 if (rc) { 5915 filemap_invalidate_unlock(inode->i_mapping); 5916 goto err_out; 5917 } 5918 5919 if (attr->ia_size != inode->i_size) { 5920 /* attach jbd2 jinode for EOF folio tail zeroing */ 5921 if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || 5922 oldsize & (inode->i_sb->s_blocksize - 1)) { 5923 error = ext4_inode_attach_jinode(inode); 5924 if (error) 5925 goto out_mmap_sem; 5926 } 5927 5928 handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); 5929 if (IS_ERR(handle)) { 5930 error = PTR_ERR(handle); 5931 goto out_mmap_sem; 5932 } 5933 if (ext4_handle_valid(handle) && shrink) { 5934 error = ext4_orphan_add(handle, inode); 5935 orphan = 1; 5936 } 5937 /* 5938 * Update c/mtime and tail zero the EOF folio on 5939 * truncate up. ext4_truncate() handles the shrink case 5940 * below. 5941 */ 5942 if (!shrink) { 5943 inode_set_mtime_to_ts(inode, 5944 inode_set_ctime_current(inode)); 5945 if (oldsize & (inode->i_sb->s_blocksize - 1)) 5946 ext4_block_truncate_page(handle, 5947 inode->i_mapping, oldsize); 5948 } 5949 5950 if (shrink) 5951 ext4_fc_track_range(handle, inode, 5952 (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> 5953 inode->i_sb->s_blocksize_bits, 5954 EXT_MAX_BLOCKS - 1); 5955 else 5956 ext4_fc_track_range( 5957 handle, inode, 5958 (oldsize > 0 ? oldsize - 1 : oldsize) >> 5959 inode->i_sb->s_blocksize_bits, 5960 (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> 5961 inode->i_sb->s_blocksize_bits); 5962 5963 down_write(&EXT4_I(inode)->i_data_sem); 5964 old_disksize = EXT4_I(inode)->i_disksize; 5965 EXT4_I(inode)->i_disksize = attr->ia_size; 5966 5967 /* 5968 * We have to update i_size under i_data_sem together 5969 * with i_disksize to avoid races with writeback code 5970 * running ext4_wb_update_i_disksize(). 
5971 */ 5972 if (!error) 5973 i_size_write(inode, attr->ia_size); 5974 else 5975 EXT4_I(inode)->i_disksize = old_disksize; 5976 up_write(&EXT4_I(inode)->i_data_sem); 5977 rc = ext4_mark_inode_dirty(handle, inode); 5978 if (!error) 5979 error = rc; 5980 ext4_journal_stop(handle); 5981 if (error) 5982 goto out_mmap_sem; 5983 if (!shrink) { 5984 pagecache_isize_extended(inode, oldsize, 5985 inode->i_size); 5986 } else if (ext4_should_journal_data(inode)) { 5987 ext4_wait_for_tail_page_commit(inode); 5988 } 5989 } 5990 5991 /* 5992 * Truncate pagecache after we've waited for commit 5993 * in data=journal mode to make pages freeable. 5994 */ 5995 truncate_pagecache(inode, inode->i_size); 5996 /* 5997 * Call ext4_truncate() even if i_size didn't change to 5998 * truncate possible preallocated blocks. 5999 */ 6000 if (attr->ia_size <= oldsize) { 6001 rc = ext4_truncate(inode); 6002 if (rc) 6003 error = rc; 6004 } 6005 out_mmap_sem: 6006 filemap_invalidate_unlock(inode->i_mapping); 6007 } 6008 6009 if (!error) { 6010 if (inc_ivers) 6011 inode_inc_iversion(inode); 6012 setattr_copy(idmap, inode, attr); 6013 mark_inode_dirty(inode); 6014 } 6015 6016 /* 6017 * If the call to ext4_truncate failed to get a transaction handle at 6018 * all, we need to clean up the in-core orphan list manually. 
6019 */ 6020 if (orphan && inode->i_nlink) 6021 ext4_orphan_del(NULL, inode); 6022 6023 if (!error && (ia_valid & ATTR_MODE)) 6024 rc = posix_acl_chmod(idmap, dentry, inode->i_mode); 6025 6026 err_out: 6027 if (error) 6028 ext4_std_error(inode->i_sb, error); 6029 if (!error) 6030 error = rc; 6031 return error; 6032 } 6033 6034 u32 ext4_dio_alignment(struct inode *inode) 6035 { 6036 if (fsverity_active(inode)) 6037 return 0; 6038 if (ext4_should_journal_data(inode)) 6039 return 0; 6040 if (ext4_has_inline_data(inode)) 6041 return 0; 6042 if (IS_ENCRYPTED(inode)) { 6043 if (!fscrypt_dio_supported(inode)) 6044 return 0; 6045 return i_blocksize(inode); 6046 } 6047 return 1; /* use the iomap defaults */ 6048 } 6049 6050 int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, 6051 struct kstat *stat, u32 request_mask, unsigned int query_flags) 6052 { 6053 struct inode *inode = d_inode(path->dentry); 6054 struct ext4_inode *raw_inode; 6055 struct ext4_inode_info *ei = EXT4_I(inode); 6056 unsigned int flags; 6057 6058 if ((request_mask & STATX_BTIME) && 6059 EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { 6060 stat->result_mask |= STATX_BTIME; 6061 stat->btime.tv_sec = ei->i_crtime.tv_sec; 6062 stat->btime.tv_nsec = ei->i_crtime.tv_nsec; 6063 } 6064 6065 /* 6066 * Return the DIO alignment restrictions if requested. We only return 6067 * this information when requested, since on encrypted files it might 6068 * take a fair bit of work to get if the file wasn't opened recently. 
6069 */ 6070 if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { 6071 u32 dio_align = ext4_dio_alignment(inode); 6072 6073 stat->result_mask |= STATX_DIOALIGN; 6074 if (dio_align == 1) { 6075 struct block_device *bdev = inode->i_sb->s_bdev; 6076 6077 /* iomap defaults */ 6078 stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; 6079 stat->dio_offset_align = bdev_logical_block_size(bdev); 6080 } else { 6081 stat->dio_mem_align = dio_align; 6082 stat->dio_offset_align = dio_align; 6083 } 6084 } 6085 6086 if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { 6087 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 6088 unsigned int awu_min = 0, awu_max = 0; 6089 6090 if (ext4_inode_can_atomic_write(inode)) { 6091 awu_min = sbi->s_awu_min; 6092 awu_max = sbi->s_awu_max; 6093 } 6094 6095 generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0); 6096 } 6097 6098 flags = ei->i_flags & EXT4_FL_USER_VISIBLE; 6099 if (flags & EXT4_APPEND_FL) 6100 stat->attributes |= STATX_ATTR_APPEND; 6101 if (flags & EXT4_COMPR_FL) 6102 stat->attributes |= STATX_ATTR_COMPRESSED; 6103 if (flags & EXT4_ENCRYPT_FL) 6104 stat->attributes |= STATX_ATTR_ENCRYPTED; 6105 if (flags & EXT4_IMMUTABLE_FL) 6106 stat->attributes |= STATX_ATTR_IMMUTABLE; 6107 if (flags & EXT4_NODUMP_FL) 6108 stat->attributes |= STATX_ATTR_NODUMP; 6109 if (flags & EXT4_VERITY_FL) 6110 stat->attributes |= STATX_ATTR_VERITY; 6111 6112 stat->attributes_mask |= (STATX_ATTR_APPEND | 6113 STATX_ATTR_COMPRESSED | 6114 STATX_ATTR_ENCRYPTED | 6115 STATX_ATTR_IMMUTABLE | 6116 STATX_ATTR_NODUMP | 6117 STATX_ATTR_VERITY); 6118 6119 generic_fillattr(idmap, request_mask, inode, stat); 6120 return 0; 6121 } 6122 6123 int ext4_file_getattr(struct mnt_idmap *idmap, 6124 const struct path *path, struct kstat *stat, 6125 u32 request_mask, unsigned int query_flags) 6126 { 6127 struct inode *inode = d_inode(path->dentry); 6128 u64 delalloc_blocks; 6129 6130 ext4_getattr(idmap, path, stat, request_mask, query_flags); 
6131 6132 /* 6133 * If there is inline data in the inode, the inode will normally not 6134 * have data blocks allocated (it may have an external xattr block). 6135 * Report at least one sector for such files, so tools like tar, rsync, 6136 * others don't incorrectly think the file is completely sparse. 6137 */ 6138 if (unlikely(ext4_has_inline_data(inode))) 6139 stat->blocks += (stat->size + 511) >> 9; 6140 6141 /* 6142 * We can't update i_blocks if the block allocation is delayed 6143 * otherwise in the case of system crash before the real block 6144 * allocation is done, we will have i_blocks inconsistent with 6145 * on-disk file blocks. 6146 * We always keep i_blocks updated together with real 6147 * allocation. But to not confuse with user, stat 6148 * will return the blocks that include the delayed allocation 6149 * blocks for this file. 6150 */ 6151 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), 6152 EXT4_I(inode)->i_reserved_data_blocks); 6153 stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); 6154 return 0; 6155 } 6156 6157 static int ext4_index_trans_blocks(struct inode *inode, int lblocks, 6158 int pextents) 6159 { 6160 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 6161 return ext4_ind_trans_blocks(inode, lblocks); 6162 return ext4_ext_index_trans_blocks(inode, pextents); 6163 } 6164 6165 /* 6166 * Account for index blocks, block groups bitmaps and block group 6167 * descriptor blocks if modify datablocks and index blocks 6168 * worse case, the indexs blocks spread over different block groups 6169 * 6170 * If datablocks are discontiguous, they are possible to spread over 6171 * different block groups too. If they are contiguous, with flexbg, 6172 * they could still across block group boundary. 
6173 * 6174 * Also account for superblock, inode, quota and xattr blocks 6175 */ 6176 int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) 6177 { 6178 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 6179 int gdpblocks; 6180 int idxblocks; 6181 int ret; 6182 6183 /* 6184 * How many index and leaf blocks need to touch to map @lblocks 6185 * logical blocks to @pextents physical extents? 6186 */ 6187 idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); 6188 6189 /* 6190 * Now let's see how many group bitmaps and group descriptors need 6191 * to account 6192 */ 6193 groups = idxblocks + pextents; 6194 gdpblocks = groups; 6195 if (groups > ngroups) 6196 groups = ngroups; 6197 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 6198 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 6199 6200 /* bitmaps and block group descriptor blocks */ 6201 ret = idxblocks + groups + gdpblocks; 6202 6203 /* Blocks for super block, inode, quota and xattr blocks */ 6204 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); 6205 6206 return ret; 6207 } 6208 6209 /* 6210 * Calculate the journal credits for modifying the number of blocks 6211 * in a single extent within one transaction. 'nrblocks' is used only 6212 * for non-extent inodes. For extent type inodes, 'nrblocks' can be 6213 * zero if the exact number of blocks is unknown. 6214 */ 6215 int ext4_chunk_trans_extent(struct inode *inode, int nrblocks) 6216 { 6217 int ret; 6218 6219 ret = ext4_meta_trans_blocks(inode, nrblocks, 1); 6220 /* Account for data blocks for journalled mode */ 6221 if (ext4_should_journal_data(inode)) 6222 ret += nrblocks; 6223 return ret; 6224 } 6225 6226 /* 6227 * Calculate the journal credits for a chunk of data modification. 6228 * 6229 * This is called from DIO, fallocate or whoever calling 6230 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. 
6231 * 6232 * journal buffers for data blocks are not included here, as DIO 6233 * and fallocate do no need to journal data buffers. 6234 */ 6235 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) 6236 { 6237 return ext4_meta_trans_blocks(inode, nrblocks, 1); 6238 } 6239 6240 /* 6241 * The caller must have previously called ext4_reserve_inode_write(). 6242 * Give this, we know that the caller already has write access to iloc->bh. 6243 */ 6244 int ext4_mark_iloc_dirty(handle_t *handle, 6245 struct inode *inode, struct ext4_iloc *iloc) 6246 { 6247 int err = 0; 6248 6249 err = ext4_emergency_state(inode->i_sb); 6250 if (unlikely(err)) { 6251 put_bh(iloc->bh); 6252 return err; 6253 } 6254 ext4_fc_track_inode(handle, inode); 6255 6256 /* the do_update_inode consumes one bh->b_count */ 6257 get_bh(iloc->bh); 6258 6259 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 6260 err = ext4_do_update_inode(handle, inode, iloc); 6261 put_bh(iloc->bh); 6262 return err; 6263 } 6264 6265 /* 6266 * On success, We end up with an outstanding reference count against 6267 * iloc->bh. This _must_ be cleaned up later. 
6268 */ 6269 6270 int 6271 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 6272 struct ext4_iloc *iloc) 6273 { 6274 int err; 6275 6276 err = ext4_emergency_state(inode->i_sb); 6277 if (unlikely(err)) 6278 return err; 6279 6280 err = ext4_get_inode_loc(inode, iloc); 6281 if (!err) { 6282 BUFFER_TRACE(iloc->bh, "get_write_access"); 6283 err = ext4_journal_get_write_access(handle, inode->i_sb, 6284 iloc->bh, EXT4_JTR_NONE); 6285 if (err) { 6286 brelse(iloc->bh); 6287 iloc->bh = NULL; 6288 } 6289 ext4_fc_track_inode(handle, inode); 6290 } 6291 ext4_std_error(inode->i_sb, err); 6292 return err; 6293 } 6294 6295 static int __ext4_expand_extra_isize(struct inode *inode, 6296 unsigned int new_extra_isize, 6297 struct ext4_iloc *iloc, 6298 handle_t *handle, int *no_expand) 6299 { 6300 struct ext4_inode *raw_inode; 6301 struct ext4_xattr_ibody_header *header; 6302 unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); 6303 struct ext4_inode_info *ei = EXT4_I(inode); 6304 int error; 6305 6306 /* this was checked at iget time, but double check for good measure */ 6307 if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || 6308 (ei->i_extra_isize & 3)) { 6309 EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", 6310 ei->i_extra_isize, 6311 EXT4_INODE_SIZE(inode->i_sb)); 6312 return -EFSCORRUPTED; 6313 } 6314 if ((new_extra_isize < ei->i_extra_isize) || 6315 (new_extra_isize < 4) || 6316 (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) 6317 return -EINVAL; /* Should never happen */ 6318 6319 raw_inode = ext4_raw_inode(iloc); 6320 6321 header = IHDR(inode, raw_inode); 6322 6323 /* No extended attributes present */ 6324 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || 6325 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 6326 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + 6327 EXT4_I(inode)->i_extra_isize, 0, 6328 new_extra_isize - EXT4_I(inode)->i_extra_isize); 6329 EXT4_I(inode)->i_extra_isize = new_extra_isize; 
6330 return 0; 6331 } 6332 6333 /* 6334 * We may need to allocate external xattr block so we need quotas 6335 * initialized. Here we can be called with various locks held so we 6336 * cannot affort to initialize quotas ourselves. So just bail. 6337 */ 6338 if (dquot_initialize_needed(inode)) 6339 return -EAGAIN; 6340 6341 /* try to expand with EAs present */ 6342 error = ext4_expand_extra_isize_ea(inode, new_extra_isize, 6343 raw_inode, handle); 6344 if (error) { 6345 /* 6346 * Inode size expansion failed; don't try again 6347 */ 6348 *no_expand = 1; 6349 } 6350 6351 return error; 6352 } 6353 6354 /* 6355 * Expand an inode by new_extra_isize bytes. 6356 * Returns 0 on success or negative error number on failure. 6357 */ 6358 static int ext4_try_to_expand_extra_isize(struct inode *inode, 6359 unsigned int new_extra_isize, 6360 struct ext4_iloc iloc, 6361 handle_t *handle) 6362 { 6363 int no_expand; 6364 int error; 6365 6366 if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) 6367 return -EOVERFLOW; 6368 6369 /* 6370 * In nojournal mode, we can immediately attempt to expand 6371 * the inode. When journaled, we first need to obtain extra 6372 * buffer credits since we may write into the EA block 6373 * with this same handle. If journal_extend fails, then it will 6374 * only result in a minor loss of functionality for that inode. 6375 * If this is felt to be critical, then e2fsck should be run to 6376 * force a large enough s_min_extra_isize. 
6377 */ 6378 if (ext4_journal_extend(handle, 6379 EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) 6380 return -ENOSPC; 6381 6382 if (ext4_write_trylock_xattr(inode, &no_expand) == 0) 6383 return -EBUSY; 6384 6385 error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, 6386 handle, &no_expand); 6387 ext4_write_unlock_xattr(inode, &no_expand); 6388 6389 return error; 6390 } 6391 6392 int ext4_expand_extra_isize(struct inode *inode, 6393 unsigned int new_extra_isize, 6394 struct ext4_iloc *iloc) 6395 { 6396 handle_t *handle; 6397 int no_expand; 6398 int error, rc; 6399 6400 if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { 6401 brelse(iloc->bh); 6402 return -EOVERFLOW; 6403 } 6404 6405 handle = ext4_journal_start(inode, EXT4_HT_INODE, 6406 EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); 6407 if (IS_ERR(handle)) { 6408 error = PTR_ERR(handle); 6409 brelse(iloc->bh); 6410 return error; 6411 } 6412 6413 ext4_write_lock_xattr(inode, &no_expand); 6414 6415 BUFFER_TRACE(iloc->bh, "get_write_access"); 6416 error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, 6417 EXT4_JTR_NONE); 6418 if (error) { 6419 brelse(iloc->bh); 6420 goto out_unlock; 6421 } 6422 6423 error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, 6424 handle, &no_expand); 6425 6426 rc = ext4_mark_iloc_dirty(handle, inode, iloc); 6427 if (!error) 6428 error = rc; 6429 6430 out_unlock: 6431 ext4_write_unlock_xattr(inode, &no_expand); 6432 ext4_journal_stop(handle); 6433 return error; 6434 } 6435 6436 /* 6437 * What we do here is to mark the in-core inode as clean with respect to inode 6438 * dirtiness (it may still be data-dirty). 6439 * This means that the in-core inode may be reaped by prune_icache 6440 * without having to perform any I/O. This is a very good thing, 6441 * because *any* task may call prune_icache - even ones which 6442 * have a transaction open against a different journal. 6443 * 6444 * Is this cheating? Not really. 
Sure, we haven't written the 6445 * inode out, but prune_icache isn't a user-visible syncing function. 6446 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 6447 * we start and wait on commits. 6448 */ 6449 int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, 6450 const char *func, unsigned int line) 6451 { 6452 struct ext4_iloc iloc; 6453 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 6454 int err; 6455 6456 might_sleep(); 6457 trace_ext4_mark_inode_dirty(inode, _RET_IP_); 6458 err = ext4_reserve_inode_write(handle, inode, &iloc); 6459 if (err) 6460 goto out; 6461 6462 if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) 6463 ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, 6464 iloc, handle); 6465 6466 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 6467 out: 6468 if (unlikely(err)) 6469 ext4_error_inode_err(inode, func, line, 0, err, 6470 "mark_inode_dirty error"); 6471 return err; 6472 } 6473 6474 /* 6475 * ext4_dirty_inode() is called from __mark_inode_dirty() 6476 * 6477 * We're really interested in the case where a file is being extended. 6478 * i_size has been changed by generic_commit_write() and we thus need 6479 * to include the updated inode in the current transaction. 6480 * 6481 * Also, dquot_alloc_block() will always dirty the inode when blocks 6482 * are allocated to the file. 6483 * 6484 * If the inode is marked synchronous, we don't honour that here - doing 6485 * so would cause a commit on atime updates, which we don't bother doing. 6486 * We handle synchronous inodes at the highest possible level. 
6487 */ 6488 void ext4_dirty_inode(struct inode *inode, int flags) 6489 { 6490 handle_t *handle; 6491 6492 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 6493 if (IS_ERR(handle)) 6494 return; 6495 ext4_mark_inode_dirty(handle, inode); 6496 ext4_journal_stop(handle); 6497 } 6498 6499 int ext4_change_inode_journal_flag(struct inode *inode, int val) 6500 { 6501 journal_t *journal; 6502 handle_t *handle; 6503 int err; 6504 int alloc_ctx; 6505 6506 /* 6507 * We have to be very careful here: changing a data block's 6508 * journaling status dynamically is dangerous. If we write a 6509 * data block to the journal, change the status and then delete 6510 * that block, we risk forgetting to revoke the old log record 6511 * from the journal and so a subsequent replay can corrupt data. 6512 * So, first we make sure that the journal is empty and that 6513 * nobody is changing anything. 6514 */ 6515 6516 journal = EXT4_JOURNAL(inode); 6517 if (!journal) 6518 return 0; 6519 if (is_journal_aborted(journal)) 6520 return -EROFS; 6521 6522 /* Wait for all existing dio workers */ 6523 inode_dio_wait(inode); 6524 6525 /* 6526 * Before flushing the journal and switching inode's aops, we have 6527 * to flush all dirty data the inode has. There can be outstanding 6528 * delayed allocations, there can be unwritten extents created by 6529 * fallocate or buffered writes in dioread_nolock mode covered by 6530 * dirty data which can be converted only after flushing the dirty 6531 * data (and journalled aops don't know how to handle these cases). 6532 */ 6533 filemap_invalidate_lock(inode->i_mapping); 6534 err = filemap_write_and_wait(inode->i_mapping); 6535 if (err < 0) { 6536 filemap_invalidate_unlock(inode->i_mapping); 6537 return err; 6538 } 6539 /* Before switch the inode journalling mode evict all the page cache. 
*/ 6540 truncate_pagecache(inode, 0); 6541 6542 alloc_ctx = ext4_writepages_down_write(inode->i_sb); 6543 jbd2_journal_lock_updates(journal); 6544 6545 /* 6546 * OK, there are no updates running now, and all cached data is 6547 * synced to disk. We are now in a completely consistent state 6548 * which doesn't have anything in the journal, and we know that 6549 * no filesystem updates are running, so it is safe to modify 6550 * the inode's in-core data-journaling state flag now. 6551 */ 6552 6553 if (val) 6554 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 6555 else { 6556 err = jbd2_journal_flush(journal, 0); 6557 if (err < 0) { 6558 jbd2_journal_unlock_updates(journal); 6559 ext4_writepages_up_write(inode->i_sb, alloc_ctx); 6560 filemap_invalidate_unlock(inode->i_mapping); 6561 return err; 6562 } 6563 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 6564 } 6565 ext4_set_aops(inode); 6566 ext4_set_inode_mapping_order(inode); 6567 6568 jbd2_journal_unlock_updates(journal); 6569 ext4_writepages_up_write(inode->i_sb, alloc_ctx); 6570 filemap_invalidate_unlock(inode->i_mapping); 6571 6572 /* Finally we can mark the inode as dirty. 
*/ 6573 6574 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 6575 if (IS_ERR(handle)) 6576 return PTR_ERR(handle); 6577 6578 ext4_fc_mark_ineligible(inode->i_sb, 6579 EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); 6580 err = ext4_mark_inode_dirty(handle, inode); 6581 ext4_handle_sync(handle); 6582 ext4_journal_stop(handle); 6583 ext4_std_error(inode->i_sb, err); 6584 6585 return err; 6586 } 6587 6588 static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, 6589 struct buffer_head *bh) 6590 { 6591 return !buffer_mapped(bh); 6592 } 6593 6594 static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio, 6595 get_block_t get_block) 6596 { 6597 handle_t *handle; 6598 loff_t size; 6599 unsigned long len; 6600 int credits; 6601 int ret; 6602 6603 credits = ext4_chunk_trans_extent(inode, 6604 ext4_journal_blocks_per_folio(inode)); 6605 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits); 6606 if (IS_ERR(handle)) 6607 return PTR_ERR(handle); 6608 6609 folio_lock(folio); 6610 size = i_size_read(inode); 6611 /* Page got truncated from under us? 
*/ 6612 if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) { 6613 ret = -EFAULT; 6614 goto out_error; 6615 } 6616 6617 len = folio_size(folio); 6618 if (folio_pos(folio) + len > size) 6619 len = size - folio_pos(folio); 6620 6621 ret = ext4_block_write_begin(handle, folio, 0, len, get_block); 6622 if (ret) 6623 goto out_error; 6624 6625 if (!ext4_should_journal_data(inode)) { 6626 block_commit_write(folio, 0, len); 6627 folio_mark_dirty(folio); 6628 } else { 6629 ret = ext4_journal_folio_buffers(handle, folio, len); 6630 if (ret) 6631 goto out_error; 6632 } 6633 ext4_journal_stop(handle); 6634 folio_wait_stable(folio); 6635 return ret; 6636 6637 out_error: 6638 folio_unlock(folio); 6639 ext4_journal_stop(handle); 6640 return ret; 6641 } 6642 6643 vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) 6644 { 6645 struct vm_area_struct *vma = vmf->vma; 6646 struct folio *folio = page_folio(vmf->page); 6647 loff_t size; 6648 unsigned long len; 6649 int err; 6650 vm_fault_t ret; 6651 struct file *file = vma->vm_file; 6652 struct inode *inode = file_inode(file); 6653 struct address_space *mapping = inode->i_mapping; 6654 get_block_t *get_block = ext4_get_block; 6655 int retries = 0; 6656 6657 if (unlikely(IS_IMMUTABLE(inode))) 6658 return VM_FAULT_SIGBUS; 6659 6660 sb_start_pagefault(inode->i_sb); 6661 file_update_time(vma->vm_file); 6662 6663 filemap_invalidate_lock_shared(mapping); 6664 6665 err = ext4_convert_inline_data(inode); 6666 if (err) 6667 goto out_ret; 6668 6669 /* 6670 * On data journalling we skip straight to the transaction handle: 6671 * there's no delalloc; page truncated will be checked later; the 6672 * early return w/ all buffers mapped (calculates size/len) can't 6673 * be used; and there's no dioread_nolock, so only ext4_get_block. 6674 */ 6675 if (ext4_should_journal_data(inode)) 6676 goto retry_alloc; 6677 6678 /* Delalloc case is easy... 
*/ 6679 if (test_opt(inode->i_sb, DELALLOC) && 6680 !ext4_nonda_switch(inode->i_sb)) { 6681 do { 6682 err = block_page_mkwrite(vma, vmf, 6683 ext4_da_get_block_prep); 6684 } while (err == -ENOSPC && 6685 ext4_should_retry_alloc(inode->i_sb, &retries)); 6686 goto out_ret; 6687 } 6688 6689 folio_lock(folio); 6690 size = i_size_read(inode); 6691 /* Page got truncated from under us? */ 6692 if (folio->mapping != mapping || folio_pos(folio) > size) { 6693 folio_unlock(folio); 6694 ret = VM_FAULT_NOPAGE; 6695 goto out; 6696 } 6697 6698 len = folio_size(folio); 6699 if (folio_pos(folio) + len > size) 6700 len = size - folio_pos(folio); 6701 /* 6702 * Return if we have all the buffers mapped. This avoids the need to do 6703 * journal_start/journal_stop which can block and take a long time 6704 * 6705 * This cannot be done for data journalling, as we have to add the 6706 * inode to the transaction's list to writeprotect pages on commit. 6707 */ 6708 if (folio_buffers(folio)) { 6709 if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio), 6710 0, len, NULL, 6711 ext4_bh_unmapped)) { 6712 /* Wait so that we don't change page under IO */ 6713 folio_wait_stable(folio); 6714 ret = VM_FAULT_LOCKED; 6715 goto out; 6716 } 6717 } 6718 folio_unlock(folio); 6719 /* OK, we need to fill the hole... */ 6720 if (ext4_should_dioread_nolock(inode)) 6721 get_block = ext4_get_block_unwritten; 6722 retry_alloc: 6723 /* Start journal and allocate blocks */ 6724 err = ext4_block_page_mkwrite(inode, folio, get_block); 6725 if (err == -EAGAIN || 6726 (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))) 6727 goto retry_alloc; 6728 out_ret: 6729 ret = vmf_fs_error(err); 6730 out: 6731 filemap_invalidate_unlock_shared(mapping); 6732 sb_end_pagefault(inode->i_sb); 6733 return ret; 6734 } 6735