// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/file.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 * from
 *
 * linux/fs/minix/file.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 *
 * ext4 fs regular file handling primitives
 *
 * 64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

#ifdef CONFIG_FS_DAX
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!inode_trylock_shared(inode)) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
		inode_lock_shared(inode);
	}
	/*
	 * Recheck under inode lock - at this point we are sure it cannot
	 * change anymore
	 */
	if (!IS_DAX(inode)) {
		inode_unlock_shared(inode);
		/* Fallback to buffered IO in case we cannot support DAX */
		return generic_file_read_iter(iocb, to);
	}
	ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}
#endif

static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
		return -EIO;

	if (!iov_iter_count(to))
		return 0; /* skip atime */

#ifdef CONFIG_FS_DAX
	if (IS_DAX(file_inode(iocb->ki_filp)))
		return ext4_dax_read_iter(iocb, to);
#endif
	return generic_file_read_iter(iocb, to);
}

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
	if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
		ext4_alloc_da_blocks(inode);
		ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
	}
	/* if we are the last writer on the inode, drop the block reservation */
	if ((filp->f_mode & FMODE_WRITE) &&
			(atomic_read(&inode->i_writecount) == 1) &&
			!EXT4_I(inode)->i_reserved_data_blocks)
	{
		down_write(&EXT4_I(inode)->i_data_sem);
		ext4_discard_preallocations(inode);
		up_write(&EXT4_I(inode)->i_data_sem);
	}
	if (is_dx(inode) && filp->private_data)
		ext4_htree_free_dir_info(filp->private_data);

	return 0;
}

static void ext4_unwritten_wait(struct inode *inode)
{
	wait_queue_head_t *wq = ext4_ioend_wq(inode);

	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
}

/*
 * This tests whether the IO in question is block-aligned or not.
 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
 * are converted to written only after the IO is complete. Until they are
 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
 * it needs to zero out portions of the start and/or end block. If 2 AIO
 * threads are at work on the same unwritten block, they must be synchronized
 * or one thread will zero the other's data, causing corruption.
 */
static int
ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
{
	struct super_block *sb = inode->i_sb;
	int blockmask = sb->s_blocksize - 1;

	if (pos >= i_size_read(inode))
		return 0;

	if ((pos | iov_iter_alignment(from)) & blockmask)
		return 1;

	return 0;
}

/* Is IO overwriting allocated and initialized blocks? */
static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
{
	struct ext4_map_blocks map;
	unsigned int blkbits = inode->i_blkbits;
	int err, blklen;

	if (pos + len > i_size_read(inode))
		return false;

	map.m_lblk = pos >> blkbits;
	map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
	blklen = map.m_len;

	err = ext4_map_blocks(NULL, inode, &map, 0);
	/*
	 * 'err==len' means that all of the blocks have been preallocated,
	 * regardless of whether they have been initialized or not. To exclude
	 * unwritten extents, we need to check m_flags.
	 */
	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
}

static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		return ret;
	/*
	 * If we have encountered a bitmap-format file, the size limit
	 * is smaller than s_maxbytes, which is for extent-mapped files.
	 */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

		if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
			return -EFBIG;
		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
	}
	return iov_iter_count(from);
}

#ifdef CONFIG_FS_DAX
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!inode_trylock(inode)) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
		inode_lock(inode);
	}
	ret = ext4_write_checks(iocb, from);
	if (ret <= 0)
		goto out;
	ret = file_remove_privs(iocb->ki_filp);
	if (ret)
		goto out;
	ret = file_update_time(iocb->ki_filp);
	if (ret)
		goto out;

	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
out:
	inode_unlock(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
#endif

static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	int o_direct = iocb->ki_flags & IOCB_DIRECT;
	int unaligned_aio = 0;
	int overwrite = 0;
	ssize_t ret;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_write_iter(iocb, from);
#endif
	if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

	if (!inode_trylock(inode)) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
		inode_lock(inode);
	}

	ret = ext4_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	/*
	 * Unaligned direct AIO must be serialized among each other as zeroing
	 * of partial blocks of two competing unaligned AIOs can result in data
	 * corruption.
	 */
	if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
	    !is_sync_kiocb(iocb) &&
	    ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
		unaligned_aio = 1;
		ext4_unwritten_wait(inode);
	}

	iocb->private = &overwrite;
	/* Check whether we do a DIO overwrite or not */
	if (o_direct && !unaligned_aio) {
		if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
			if (ext4_should_dioread_nolock(inode))
				overwrite = 1;
		} else if (iocb->ki_flags & IOCB_NOWAIT) {
			ret = -EAGAIN;
			goto out;
		}
	}

	ret = __generic_file_write_iter(iocb, from);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);

	return ret;

out:
	inode_unlock(inode);
	return ret;
}

#ifdef CONFIG_FS_DAX
static int ext4_dax_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	int result;
	handle_t *handle = NULL;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct super_block *sb = inode->i_sb;

	/*
	 * We have to distinguish real writes from writes which will result in a
	 * COW page; COW writes should *not* poke the journal (the file will not
	 * be changed). Doing so would cause unintended failures when mounted
	 * read-only.
	 *
	 * We check for VM_SHARED rather than vmf->cow_page since the latter is
	 * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
	 * other sizes, dax_iomap_fault will handle splitting / fallback so that
	 * we eventually come back with a COW page.
	 */
	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
		(vmf->vma->vm_flags & VM_SHARED);

	if (write) {
		sb_start_pagefault(sb);
		file_update_time(vmf->vma->vm_file);
		down_read(&EXT4_I(inode)->i_mmap_sem);
		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
					       EXT4_DATA_TRANS_BLOCKS(sb));
	} else {
		down_read(&EXT4_I(inode)->i_mmap_sem);
	}
	if (!IS_ERR(handle))
		result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
	else
		result = VM_FAULT_SIGBUS;
	if (write) {
		if (!IS_ERR(handle))
			ext4_journal_stop(handle);
		up_read(&EXT4_I(inode)->i_mmap_sem);
		sb_end_pagefault(sb);
	} else {
		up_read(&EXT4_I(inode)->i_mmap_sem);
	}

	return result;
}

static int ext4_dax_fault(struct vm_fault *vmf)
{
	return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
	.fault		= ext4_dax_fault,
	.huge_fault	= ext4_dax_huge_fault,
	.page_mkwrite	= ext4_dax_fault,
	.pfn_mkwrite	= ext4_dax_fault,
};
#else
#define ext4_dax_vm_ops	ext4_file_vm_ops
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
	.fault		= ext4_filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= ext4_page_mkwrite,
};

static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file->f_mapping->host;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	file_accessed(file);
	if (IS_DAX(file_inode(file))) {
		vma->vm_ops = &ext4_dax_vm_ops;
		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	} else {
		vma->vm_ops = &ext4_file_vm_ops;
	}
	return 0;
}

static int ext4_file_open(struct inode * inode, struct file * filp)
{
	struct super_block *sb = inode->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct vfsmount *mnt = filp->f_path.mnt;
	struct path path;
	char buf[64], *cp;
	int ret;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
		     !sb_rdonly(sb))) {
		sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
		/*
		 * Sample where the filesystem has been mounted and
		 * store it in the superblock for sysadmin convenience
		 * when trying to sort through large numbers of block
		 * devices or filesystem images.
		 */
		memset(buf, 0, sizeof(buf));
		path.mnt = mnt;
		path.dentry = mnt->mnt_root;
		cp = d_path(&path, buf, sizeof(buf));
		if (!IS_ERR(cp)) {
			handle_t *handle;
			int err;

			handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
			if (IS_ERR(handle))
				return PTR_ERR(handle);
			BUFFER_TRACE(sbi->s_sbh, "get_write_access");
			err = ext4_journal_get_write_access(handle, sbi->s_sbh);
			if (err) {
				ext4_journal_stop(handle);
				return err;
			}
			strlcpy(sbi->s_es->s_last_mounted, cp,
				sizeof(sbi->s_es->s_last_mounted));
			ext4_handle_dirty_super(handle, sb);
			ext4_journal_stop(handle);
		}
	}

	ret = fscrypt_file_open(inode, filp);
	if (ret)
		return ret;

	/*
	 * Set up the jbd2_inode if we are opening the inode for
	 * writing and the journal is present
	 */
	if (filp->f_mode & FMODE_WRITE) {
		ret = ext4_inode_attach_jinode(inode);
		if (ret < 0)
			return ret;
	}

	filp->f_mode |= FMODE_NOWAIT;
	return dquot_file_open(inode, filp);
}

/*
 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
 * by calling generic_file_llseek_size() with the appropriate maxbytes
 * value for each.
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	loff_t maxbytes;

	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
		maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
	else
		maxbytes = inode->i_sb->s_maxbytes;

	switch (whence) {
	default:
		return generic_file_llseek_size(file, offset, whence,
						maxbytes, i_size_read(inode));
	case SEEK_HOLE:
		inode_lock_shared(inode);
		offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
		inode_unlock_shared(inode);
		break;
	case SEEK_DATA:
		inode_lock_shared(inode);
		offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
		inode_unlock_shared(inode);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, maxbytes);
}

const struct file_operations ext4_file_operations = {
	.llseek		= ext4_llseek,
	.read_iter	= ext4_file_read_iter,
	.write_iter	= ext4_file_write_iter,
	.unlocked_ioctl	= ext4_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext4_compat_ioctl,
#endif
	.mmap		= ext4_file_mmap,
	.open		= ext4_file_open,
	.release	= ext4_release_file,
	.fsync		= ext4_sync_file,
	.get_unmapped_area = thp_get_unmapped_area,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= ext4_fallocate,
};

const struct inode_operations ext4_file_inode_operations = {
	.setattr	= ext4_setattr,
	.getattr	= ext4_file_getattr,
	.listxattr	= ext4_listxattr,
	.get_acl	= ext4_get_acl,
	.set_acl	= ext4_set_acl,
	.fiemap		= ext4_fiemap,
};