1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 4 * 5 * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. 6 */ 7 8 #include <linux/blkdev.h> 9 #include <linux/backing-dev.h> 10 #include <linux/buffer_head.h> 11 #include <linux/gfp.h> 12 #include <linux/pagemap.h> 13 #include <linux/pagevec.h> 14 #include <linux/sched/signal.h> 15 #include <linux/swap.h> 16 #include <linux/uio.h> 17 #include <linux/writeback.h> 18 19 #include <asm/page.h> 20 #include <linux/uaccess.h> 21 22 #include "attrib.h" 23 #include "bitmap.h" 24 #include "inode.h" 25 #include "debug.h" 26 #include "lcnalloc.h" 27 #include "malloc.h" 28 #include "mft.h" 29 #include "ntfs.h" 30 31 /** 32 * ntfs_file_open - called when an inode is about to be opened 33 * @vi: inode to be opened 34 * @filp: file structure describing the inode 35 * 36 * Limit file size to the page cache limit on architectures where unsigned long 37 * is 32-bits. This is the most we can do for now without overflowing the page 38 * cache page index. Doing it this way means we don't run into problems because 39 * of existing too large files. It would be better to allow the user to read 40 * the beginning of the file but I doubt very much anyone is going to hit this 41 * check on a 32-bit architecture, so there is no point in adding the extra 42 * complexity required to support this. 43 * 44 * On 64-bit architectures, the check is hopefully optimized away by the 45 * compiler. 46 * 47 * After the check passes, just call generic_file_open() to do its work. 48 */ 49 static int ntfs_file_open(struct inode *vi, struct file *filp) 50 { 51 if (sizeof(unsigned long) < 8) { 52 if (i_size_read(vi) > MAX_LFS_FILESIZE) 53 return -EOVERFLOW; 54 } 55 return generic_file_open(vi, filp); 56 } 57 58 #ifdef NTFS_RW 59 60 /** 61 * ntfs_attr_extend_initialized - extend the initialized size of an attribute 62 * @ni: ntfs inode of the attribute to extend 63 * @new_init_size: requested new initialized size in bytes 64 * 65 * Extend the initialized size of an attribute described by the ntfs inode @ni 66 * to @new_init_size bytes. This involves zeroing any non-sparse space between 67 * the old initialized size and @new_init_size both in the page cache and on 68 * disk (if relevant complete pages are already uptodate in the page cache then 69 * these are simply marked dirty). 70 * 71 * As a side-effect, the file size (vfs inode->i_size) may be incremented as, 72 * in the resident attribute case, it is tied to the initialized size and, in 73 * the non-resident attribute case, it may not fall below the initialized size. 74 * 75 * Note that if the attribute is resident, we do not need to touch the page 76 * cache at all. This is because if the page cache page is not uptodate we 77 * bring it uptodate later, when doing the write to the mft record since we 78 * then already have the page mapped. And if the page is uptodate, the 79 * non-initialized region will already have been zeroed when the page was 80 * brought uptodate and the region may in fact already have been overwritten 81 * with new data via mmap() based writes, so we cannot just zero it. And since 82 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped 83 * is unspecified, we choose not to do zeroing and thus we do not need to touch 84 * the page at all. For a more detailed explanation see ntfs_truncate() in 85 * fs/ntfs/inode.c. 86 * 87 * Return 0 on success and -errno on error. In the case that an error is 88 * encountered it is possible that the initialized size will already have been 89 * incremented some way towards @new_init_size but it is guaranteed that if 90 * this is the case, the necessary zeroing will also have happened and that all 91 * metadata is self-consistent. 92 * 93 * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be 94 * held by the caller. 95 */ 96 static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size) 97 { 98 s64 old_init_size; 99 loff_t old_i_size; 100 pgoff_t index, end_index; 101 unsigned long flags; 102 struct inode *vi = VFS_I(ni); 103 ntfs_inode *base_ni; 104 MFT_RECORD *m = NULL; 105 ATTR_RECORD *a; 106 ntfs_attr_search_ctx *ctx = NULL; 107 struct address_space *mapping; 108 struct page *page = NULL; 109 u8 *kattr; 110 int err; 111 u32 attr_len; 112 113 read_lock_irqsave(&ni->size_lock, flags); 114 old_init_size = ni->initialized_size; 115 old_i_size = i_size_read(vi); 116 BUG_ON(new_init_size > ni->allocated_size); 117 read_unlock_irqrestore(&ni->size_lock, flags); 118 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 119 "old_initialized_size 0x%llx, " 120 "new_initialized_size 0x%llx, i_size 0x%llx.", 121 vi->i_ino, (unsigned)le32_to_cpu(ni->type), 122 (unsigned long long)old_init_size, 123 (unsigned long long)new_init_size, old_i_size); 124 if (!NInoAttr(ni)) 125 base_ni = ni; 126 else 127 base_ni = ni->ext.base_ntfs_ino; 128 /* Use goto to reduce indentation and we need the label below anyway. */ 129 if (NInoNonResident(ni)) 130 goto do_non_resident_extend; 131 BUG_ON(old_init_size != old_i_size); 132 m = map_mft_record(base_ni); 133 if (IS_ERR(m)) { 134 err = PTR_ERR(m); 135 m = NULL; 136 goto err_out; 137 } 138 ctx = ntfs_attr_get_search_ctx(base_ni, m); 139 if (unlikely(!ctx)) { 140 err = -ENOMEM; 141 goto err_out; 142 } 143 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 144 CASE_SENSITIVE, 0, NULL, 0, ctx); 145 if (unlikely(err)) { 146 if (err == -ENOENT) 147 err = -EIO; 148 goto err_out; 149 } 150 m = ctx->mrec; 151 a = ctx->attr; 152 BUG_ON(a->non_resident); 153 /* The total length of the attribute value. */ 154 attr_len = le32_to_cpu(a->data.resident.value_length); 155 BUG_ON(old_i_size != (loff_t)attr_len); 156 /* 157 * Do the zeroing in the mft record and update the attribute size in 158 * the mft record. 159 */ 160 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); 161 memset(kattr + attr_len, 0, new_init_size - attr_len); 162 a->data.resident.value_length = cpu_to_le32((u32)new_init_size); 163 /* Finally, update the sizes in the vfs and ntfs inodes. */ 164 write_lock_irqsave(&ni->size_lock, flags); 165 i_size_write(vi, new_init_size); 166 ni->initialized_size = new_init_size; 167 write_unlock_irqrestore(&ni->size_lock, flags); 168 goto done; 169 do_non_resident_extend: 170 /* 171 * If the new initialized size @new_init_size exceeds the current file 172 * size (vfs inode->i_size), we need to extend the file size to the 173 * new initialized size. 174 */ 175 if (new_init_size > old_i_size) { 176 m = map_mft_record(base_ni); 177 if (IS_ERR(m)) { 178 err = PTR_ERR(m); 179 m = NULL; 180 goto err_out; 181 } 182 ctx = ntfs_attr_get_search_ctx(base_ni, m); 183 if (unlikely(!ctx)) { 184 err = -ENOMEM; 185 goto err_out; 186 } 187 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 188 CASE_SENSITIVE, 0, NULL, 0, ctx); 189 if (unlikely(err)) { 190 if (err == -ENOENT) 191 err = -EIO; 192 goto err_out; 193 } 194 m = ctx->mrec; 195 a = ctx->attr; 196 BUG_ON(!a->non_resident); 197 BUG_ON(old_i_size != (loff_t) 198 sle64_to_cpu(a->data.non_resident.data_size)); 199 a->data.non_resident.data_size = cpu_to_sle64(new_init_size); 200 flush_dcache_mft_record_page(ctx->ntfs_ino); 201 mark_mft_record_dirty(ctx->ntfs_ino); 202 /* Update the file size in the vfs inode. */ 203 i_size_write(vi, new_init_size); 204 ntfs_attr_put_search_ctx(ctx); 205 ctx = NULL; 206 unmap_mft_record(base_ni); 207 m = NULL; 208 } 209 mapping = vi->i_mapping; 210 index = old_init_size >> PAGE_SHIFT; 211 end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT; 212 do { 213 /* 214 * Read the page. If the page is not present, this will zero 215 * the uninitialized regions for us. 216 */ 217 page = read_mapping_page(mapping, index, NULL); 218 if (IS_ERR(page)) { 219 err = PTR_ERR(page); 220 goto init_err_out; 221 } 222 /* 223 * Update the initialized size in the ntfs inode. This is 224 * enough to make ntfs_writepage() work. 225 */ 226 write_lock_irqsave(&ni->size_lock, flags); 227 ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT; 228 if (ni->initialized_size > new_init_size) 229 ni->initialized_size = new_init_size; 230 write_unlock_irqrestore(&ni->size_lock, flags); 231 /* Set the page dirty so it gets written out. */ 232 set_page_dirty(page); 233 put_page(page); 234 /* 235 * Play nice with the vm and the rest of the system. This is 236 * very much needed as we can potentially be modifying the 237 * initialised size from a very small value to a really huge 238 * value, e.g. 239 * f = open(somefile, O_TRUNC); 240 * truncate(f, 10GiB); 241 * seek(f, 10GiB); 242 * write(f, 1); 243 * And this would mean we would be marking dirty hundreds of 244 * thousands of pages or as in the above example more than 245 * two and a half million pages! 246 * 247 * TODO: For sparse pages could optimize this workload by using 248 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This 249 * would be set in read_folio for sparse pages and here we would 250 * not need to mark dirty any pages which have this bit set. 251 * The only caveat is that we have to clear the bit everywhere 252 * where we allocate any clusters that lie in the page or that 253 * contain the page. 254 * 255 * TODO: An even greater optimization would be for us to only 256 * call read_folio() on pages which are not in sparse regions as 257 * determined from the runlist. This would greatly reduce the 258 * number of pages we read and make dirty in the case of sparse 259 * files. 260 */ 261 balance_dirty_pages_ratelimited(mapping); 262 cond_resched(); 263 } while (++index < end_index); 264 read_lock_irqsave(&ni->size_lock, flags); 265 BUG_ON(ni->initialized_size != new_init_size); 266 read_unlock_irqrestore(&ni->size_lock, flags); 267 /* Now bring in sync the initialized_size in the mft record. */ 268 m = map_mft_record(base_ni); 269 if (IS_ERR(m)) { 270 err = PTR_ERR(m); 271 m = NULL; 272 goto init_err_out; 273 } 274 ctx = ntfs_attr_get_search_ctx(base_ni, m); 275 if (unlikely(!ctx)) { 276 err = -ENOMEM; 277 goto init_err_out; 278 } 279 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 280 CASE_SENSITIVE, 0, NULL, 0, ctx); 281 if (unlikely(err)) { 282 if (err == -ENOENT) 283 err = -EIO; 284 goto init_err_out; 285 } 286 m = ctx->mrec; 287 a = ctx->attr; 288 BUG_ON(!a->non_resident); 289 a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); 290 done: 291 flush_dcache_mft_record_page(ctx->ntfs_ino); 292 mark_mft_record_dirty(ctx->ntfs_ino); 293 if (ctx) 294 ntfs_attr_put_search_ctx(ctx); 295 if (m) 296 unmap_mft_record(base_ni); 297 ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", 298 (unsigned long long)new_init_size, i_size_read(vi)); 299 return 0; 300 init_err_out: 301 write_lock_irqsave(&ni->size_lock, flags); 302 ni->initialized_size = old_init_size; 303 write_unlock_irqrestore(&ni->size_lock, flags); 304 err_out: 305 if (ctx) 306 ntfs_attr_put_search_ctx(ctx); 307 if (m) 308 unmap_mft_record(base_ni); 309 ntfs_debug("Failed. Returning error code %i.", err); 310 return err; 311 } 312 313 static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, 314 struct iov_iter *from) 315 { 316 loff_t pos; 317 s64 end, ll; 318 ssize_t err; 319 unsigned long flags; 320 struct file *file = iocb->ki_filp; 321 struct inode *vi = file_inode(file); 322 ntfs_inode *ni = NTFS_I(vi); 323 ntfs_volume *vol = ni->vol; 324 325 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " 326 "0x%llx, count 0x%zx.", vi->i_ino, 327 (unsigned)le32_to_cpu(ni->type), 328 (unsigned long long)iocb->ki_pos, 329 iov_iter_count(from)); 330 err = generic_write_checks(iocb, from); 331 if (unlikely(err <= 0)) 332 goto out; 333 /* 334 * All checks have passed. Before we start doing any writing we want 335 * to abort any totally illegal writes. 336 */ 337 BUG_ON(NInoMstProtected(ni)); 338 BUG_ON(ni->type != AT_DATA); 339 /* If file is encrypted, deny access, just like NT4. */ 340 if (NInoEncrypted(ni)) { 341 /* Only $DATA attributes can be encrypted. */ 342 /* 343 * Reminder for later: Encrypted files are _always_ 344 * non-resident so that the content can always be encrypted. 345 */ 346 ntfs_debug("Denying write access to encrypted file."); 347 err = -EACCES; 348 goto out; 349 } 350 if (NInoCompressed(ni)) { 351 /* Only unnamed $DATA attribute can be compressed. */ 352 BUG_ON(ni->name_len); 353 /* 354 * Reminder for later: If resident, the data is not actually 355 * compressed. Only on the switch to non-resident does 356 * compression kick in. This is in contrast to encrypted files 357 * (see above). 358 */ 359 ntfs_error(vi->i_sb, "Writing to compressed files is not " 360 "implemented yet. Sorry."); 361 err = -EOPNOTSUPP; 362 goto out; 363 } 364 err = file_remove_privs(file); 365 if (unlikely(err)) 366 goto out; 367 /* 368 * Our ->update_time method always succeeds thus file_update_time() 369 * cannot fail either so there is no need to check the return code. 370 */ 371 file_update_time(file); 372 pos = iocb->ki_pos; 373 /* The first byte after the last cluster being written to. */ 374 end = (pos + iov_iter_count(from) + vol->cluster_size_mask) & 375 ~(u64)vol->cluster_size_mask; 376 /* 377 * If the write goes beyond the allocated size, extend the allocation 378 * to cover the whole of the write, rounded up to the nearest cluster. 379 */ 380 read_lock_irqsave(&ni->size_lock, flags); 381 ll = ni->allocated_size; 382 read_unlock_irqrestore(&ni->size_lock, flags); 383 if (end > ll) { 384 /* 385 * Extend the allocation without changing the data size. 386 * 387 * Note we ensure the allocation is big enough to at least 388 * write some data but we do not require the allocation to be 389 * complete, i.e. it may be partial. 390 */ 391 ll = ntfs_attr_extend_allocation(ni, end, -1, pos); 392 if (likely(ll >= 0)) { 393 BUG_ON(pos >= ll); 394 /* If the extension was partial truncate the write. */ 395 if (end > ll) { 396 ntfs_debug("Truncating write to inode 0x%lx, " 397 "attribute type 0x%x, because " 398 "the allocation was only " 399 "partially extended.", 400 vi->i_ino, (unsigned) 401 le32_to_cpu(ni->type)); 402 iov_iter_truncate(from, ll - pos); 403 } 404 } else { 405 err = ll; 406 read_lock_irqsave(&ni->size_lock, flags); 407 ll = ni->allocated_size; 408 read_unlock_irqrestore(&ni->size_lock, flags); 409 /* Perform a partial write if possible or fail. */ 410 if (pos < ll) { 411 ntfs_debug("Truncating write to inode 0x%lx " 412 "attribute type 0x%x, because " 413 "extending the allocation " 414 "failed (error %d).", 415 vi->i_ino, (unsigned) 416 le32_to_cpu(ni->type), 417 (int)-err); 418 iov_iter_truncate(from, ll - pos); 419 } else { 420 if (err != -ENOSPC) 421 ntfs_error(vi->i_sb, "Cannot perform " 422 "write to inode " 423 "0x%lx, attribute " 424 "type 0x%x, because " 425 "extending the " 426 "allocation failed " 427 "(error %ld).", 428 vi->i_ino, (unsigned) 429 le32_to_cpu(ni->type), 430 (long)-err); 431 else 432 ntfs_debug("Cannot perform write to " 433 "inode 0x%lx, " 434 "attribute type 0x%x, " 435 "because there is not " 436 "space left.", 437 vi->i_ino, (unsigned) 438 le32_to_cpu(ni->type)); 439 goto out; 440 } 441 } 442 } 443 /* 444 * If the write starts beyond the initialized size, extend it up to the 445 * beginning of the write and initialize all non-sparse space between 446 * the old initialized size and the new one. This automatically also 447 * increments the vfs inode->i_size to keep it above or equal to the 448 * initialized_size. 449 */ 450 read_lock_irqsave(&ni->size_lock, flags); 451 ll = ni->initialized_size; 452 read_unlock_irqrestore(&ni->size_lock, flags); 453 if (pos > ll) { 454 /* 455 * Wait for ongoing direct i/o to complete before proceeding. 456 * New direct i/o cannot start as we hold i_mutex. 457 */ 458 inode_dio_wait(vi); 459 err = ntfs_attr_extend_initialized(ni, pos); 460 if (unlikely(err < 0)) 461 ntfs_error(vi->i_sb, "Cannot perform write to inode " 462 "0x%lx, attribute type 0x%x, because " 463 "extending the initialized size " 464 "failed (error %d).", vi->i_ino, 465 (unsigned)le32_to_cpu(ni->type), 466 (int)-err); 467 } 468 out: 469 return err; 470 } 471 472 /** 473 * __ntfs_grab_cache_pages - obtain a number of locked pages 474 * @mapping: address space mapping from which to obtain page cache pages 475 * @index: starting index in @mapping at which to begin obtaining pages 476 * @nr_pages: number of page cache pages to obtain 477 * @pages: array of pages in which to return the obtained page cache pages 478 * @cached_page: allocated but as yet unused page 479 * 480 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 481 * starting at index @index. 482 * 483 * If a page is newly created, add it to lru list 484 * 485 * Note, the page locks are obtained in ascending page index order. 486 */ 487 static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 488 pgoff_t index, const unsigned nr_pages, struct page **pages, 489 struct page **cached_page) 490 { 491 int err, nr; 492 493 BUG_ON(!nr_pages); 494 err = nr = 0; 495 do { 496 pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | 497 FGP_ACCESSED); 498 if (!pages[nr]) { 499 if (!*cached_page) { 500 *cached_page = page_cache_alloc(mapping); 501 if (unlikely(!*cached_page)) { 502 err = -ENOMEM; 503 goto err_out; 504 } 505 } 506 err = add_to_page_cache_lru(*cached_page, mapping, 507 index, 508 mapping_gfp_constraint(mapping, GFP_KERNEL)); 509 if (unlikely(err)) { 510 if (err == -EEXIST) 511 continue; 512 goto err_out; 513 } 514 pages[nr] = *cached_page; 515 *cached_page = NULL; 516 } 517 index++; 518 nr++; 519 } while (nr < nr_pages); 520 out: 521 return err; 522 err_out: 523 while (nr > 0) { 524 unlock_page(pages[--nr]); 525 put_page(pages[nr]); 526 } 527 goto out; 528 } 529 530 static inline void ntfs_submit_bh_for_read(struct buffer_head *bh) 531 { 532 lock_buffer(bh); 533 get_bh(bh); 534 bh->b_end_io = end_buffer_read_sync; 535 submit_bh(REQ_OP_READ, bh); 536 } 537 538 /** 539 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data 540 * @pages: array of destination pages 541 * @nr_pages: number of pages in @pages 542 * @pos: byte position in file at which the write begins 543 * @bytes: number of bytes to be written 544 * 545 * This is called for non-resident attributes from ntfs_file_buffered_write() 546 * with i_mutex held on the inode (@pages[0]->mapping->host). There are 547 * @nr_pages pages in @pages which are locked but not kmap()ped. The source 548 * data has not yet been copied into the @pages. 549 * 550 * Need to fill any holes with actual clusters, allocate buffers if necessary, 551 * ensure all the buffers are mapped, and bring uptodate any buffers that are 552 * only partially being written to. 553 * 554 * If @nr_pages is greater than one, we are guaranteed that the cluster size is 555 * greater than PAGE_SIZE, that all pages in @pages are entirely inside 556 * the same cluster and that they are the entirety of that cluster, and that 557 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. 558 * 559 * i_size is not to be modified yet. 560 * 561 * Return 0 on success or -errno on error. 562 */ 563 static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, 564 unsigned nr_pages, s64 pos, size_t bytes) 565 { 566 VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; 567 LCN lcn; 568 s64 bh_pos, vcn_len, end, initialized_size; 569 sector_t lcn_block; 570 struct folio *folio; 571 struct inode *vi; 572 ntfs_inode *ni, *base_ni = NULL; 573 ntfs_volume *vol; 574 runlist_element *rl, *rl2; 575 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; 576 ntfs_attr_search_ctx *ctx = NULL; 577 MFT_RECORD *m = NULL; 578 ATTR_RECORD *a = NULL; 579 unsigned long flags; 580 u32 attr_rec_len = 0; 581 unsigned blocksize, u; 582 int err, mp_size; 583 bool rl_write_locked, was_hole, is_retry; 584 unsigned char blocksize_bits; 585 struct { 586 u8 runlist_merged:1; 587 u8 mft_attr_mapped:1; 588 u8 mp_rebuilt:1; 589 u8 attr_switched:1; 590 } status = { 0, 0, 0, 0 }; 591 592 BUG_ON(!nr_pages); 593 BUG_ON(!pages); 594 BUG_ON(!*pages); 595 vi = pages[0]->mapping->host; 596 ni = NTFS_I(vi); 597 vol = ni->vol; 598 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " 599 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", 600 vi->i_ino, ni->type, pages[0]->index, nr_pages, 601 (long long)pos, bytes); 602 blocksize = vol->sb->s_blocksize; 603 blocksize_bits = vol->sb->s_blocksize_bits; 604 rl_write_locked = false; 605 rl = NULL; 606 err = 0; 607 vcn = lcn = -1; 608 vcn_len = 0; 609 lcn_block = -1; 610 was_hole = false; 611 cpos = pos >> vol->cluster_size_bits; 612 end = pos + bytes; 613 cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; 614 /* 615 * Loop over each buffer in each folio. Use goto to 616 * reduce indentation. 617 */ 618 u = 0; 619 do_next_folio: 620 folio = page_folio(pages[u]); 621 bh_pos = folio_pos(folio); 622 head = folio_buffers(folio); 623 if (!head) 624 /* 625 * create_empty_buffers() will create uptodate/dirty 626 * buffers if the folio is uptodate/dirty. 627 */ 628 head = create_empty_buffers(folio, blocksize, 0); 629 bh = head; 630 do { 631 VCN cdelta; 632 s64 bh_end; 633 unsigned bh_cofs; 634 635 /* Clear buffer_new on all buffers to reinitialise state. */ 636 if (buffer_new(bh)) 637 clear_buffer_new(bh); 638 bh_end = bh_pos + blocksize; 639 bh_cpos = bh_pos >> vol->cluster_size_bits; 640 bh_cofs = bh_pos & vol->cluster_size_mask; 641 if (buffer_mapped(bh)) { 642 /* 643 * The buffer is already mapped. If it is uptodate, 644 * ignore it. 645 */ 646 if (buffer_uptodate(bh)) 647 continue; 648 /* 649 * The buffer is not uptodate. If the folio is uptodate 650 * set the buffer uptodate and otherwise ignore it. 651 */ 652 if (folio_test_uptodate(folio)) { 653 set_buffer_uptodate(bh); 654 continue; 655 } 656 /* 657 * Neither the folio nor the buffer are uptodate. If 658 * the buffer is only partially being written to, we 659 * need to read it in before the write, i.e. now. 660 */ 661 if ((bh_pos < pos && bh_end > pos) || 662 (bh_pos < end && bh_end > end)) { 663 /* 664 * If the buffer is fully or partially within 665 * the initialized size, do an actual read. 666 * Otherwise, simply zero the buffer. 667 */ 668 read_lock_irqsave(&ni->size_lock, flags); 669 initialized_size = ni->initialized_size; 670 read_unlock_irqrestore(&ni->size_lock, flags); 671 if (bh_pos < initialized_size) { 672 ntfs_submit_bh_for_read(bh); 673 *wait_bh++ = bh; 674 } else { 675 folio_zero_range(folio, bh_offset(bh), 676 blocksize); 677 set_buffer_uptodate(bh); 678 } 679 } 680 continue; 681 } 682 /* Unmapped buffer. Need to map it. */ 683 bh->b_bdev = vol->sb->s_bdev; 684 /* 685 * If the current buffer is in the same clusters as the map 686 * cache, there is no need to check the runlist again. The 687 * map cache is made up of @vcn, which is the first cached file 688 * cluster, @vcn_len which is the number of cached file 689 * clusters, @lcn is the device cluster corresponding to @vcn, 690 * and @lcn_block is the block number corresponding to @lcn. 691 */ 692 cdelta = bh_cpos - vcn; 693 if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { 694 map_buffer_cached: 695 BUG_ON(lcn < 0); 696 bh->b_blocknr = lcn_block + 697 (cdelta << (vol->cluster_size_bits - 698 blocksize_bits)) + 699 (bh_cofs >> blocksize_bits); 700 set_buffer_mapped(bh); 701 /* 702 * If the folio is uptodate so is the buffer. If the 703 * buffer is fully outside the write, we ignore it if 704 * it was already allocated and we mark it dirty so it 705 * gets written out if we allocated it. On the other 706 * hand, if we allocated the buffer but we are not 707 * marking it dirty we set buffer_new so we can do 708 * error recovery. 709 */ 710 if (folio_test_uptodate(folio)) { 711 if (!buffer_uptodate(bh)) 712 set_buffer_uptodate(bh); 713 if (unlikely(was_hole)) { 714 /* We allocated the buffer. */ 715 clean_bdev_bh_alias(bh); 716 if (bh_end <= pos || bh_pos >= end) 717 mark_buffer_dirty(bh); 718 else 719 set_buffer_new(bh); 720 } 721 continue; 722 } 723 /* Page is _not_ uptodate. */ 724 if (likely(!was_hole)) { 725 /* 726 * Buffer was already allocated. If it is not 727 * uptodate and is only partially being written 728 * to, we need to read it in before the write, 729 * i.e. now. 730 */ 731 if (!buffer_uptodate(bh) && bh_pos < end && 732 bh_end > pos && 733 (bh_pos < pos || 734 bh_end > end)) { 735 /* 736 * If the buffer is fully or partially 737 * within the initialized size, do an 738 * actual read. Otherwise, simply zero 739 * the buffer. 740 */ 741 read_lock_irqsave(&ni->size_lock, 742 flags); 743 initialized_size = ni->initialized_size; 744 read_unlock_irqrestore(&ni->size_lock, 745 flags); 746 if (bh_pos < initialized_size) { 747 ntfs_submit_bh_for_read(bh); 748 *wait_bh++ = bh; 749 } else { 750 folio_zero_range(folio, 751 bh_offset(bh), 752 blocksize); 753 set_buffer_uptodate(bh); 754 } 755 } 756 continue; 757 } 758 /* We allocated the buffer. */ 759 clean_bdev_bh_alias(bh); 760 /* 761 * If the buffer is fully outside the write, zero it, 762 * set it uptodate, and mark it dirty so it gets 763 * written out. If it is partially being written to, 764 * zero region surrounding the write but leave it to 765 * commit write to do anything else. Finally, if the 766 * buffer is fully being overwritten, do nothing. 767 */ 768 if (bh_end <= pos || bh_pos >= end) { 769 if (!buffer_uptodate(bh)) { 770 folio_zero_range(folio, bh_offset(bh), 771 blocksize); 772 set_buffer_uptodate(bh); 773 } 774 mark_buffer_dirty(bh); 775 continue; 776 } 777 set_buffer_new(bh); 778 if (!buffer_uptodate(bh) && 779 (bh_pos < pos || bh_end > end)) { 780 u8 *kaddr; 781 unsigned pofs; 782 783 kaddr = kmap_local_folio(folio, 0); 784 if (bh_pos < pos) { 785 pofs = bh_pos & ~PAGE_MASK; 786 memset(kaddr + pofs, 0, pos - bh_pos); 787 } 788 if (bh_end > end) { 789 pofs = end & ~PAGE_MASK; 790 memset(kaddr + pofs, 0, bh_end - end); 791 } 792 kunmap_local(kaddr); 793 flush_dcache_folio(folio); 794 } 795 continue; 796 } 797 /* 798 * Slow path: this is the first buffer in the cluster. If it 799 * is outside allocated size and is not uptodate, zero it and 800 * set it uptodate. 801 */ 802 read_lock_irqsave(&ni->size_lock, flags); 803 initialized_size = ni->allocated_size; 804 read_unlock_irqrestore(&ni->size_lock, flags); 805 if (bh_pos > initialized_size) { 806 if (folio_test_uptodate(folio)) { 807 if (!buffer_uptodate(bh)) 808 set_buffer_uptodate(bh); 809 } else if (!buffer_uptodate(bh)) { 810 folio_zero_range(folio, bh_offset(bh), 811 blocksize); 812 set_buffer_uptodate(bh); 813 } 814 continue; 815 } 816 is_retry = false; 817 if (!rl) { 818 down_read(&ni->runlist.lock); 819 retry_remap: 820 rl = ni->runlist.rl; 821 } 822 if (likely(rl != NULL)) { 823 /* Seek to element containing target cluster. */ 824 while (rl->length && rl[1].vcn <= bh_cpos) 825 rl++; 826 lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); 827 if (likely(lcn >= 0)) { 828 /* 829 * Successful remap, setup the map cache and 830 * use that to deal with the buffer. 831 */ 832 was_hole = false; 833 vcn = bh_cpos; 834 vcn_len = rl[1].vcn - vcn; 835 lcn_block = lcn << (vol->cluster_size_bits - 836 blocksize_bits); 837 cdelta = 0; 838 /* 839 * If the number of remaining clusters touched 840 * by the write is smaller or equal to the 841 * number of cached clusters, unlock the 842 * runlist as the map cache will be used from 843 * now on. 844 */ 845 if (likely(vcn + vcn_len >= cend)) { 846 if (rl_write_locked) { 847 up_write(&ni->runlist.lock); 848 rl_write_locked = false; 849 } else 850 up_read(&ni->runlist.lock); 851 rl = NULL; 852 } 853 goto map_buffer_cached; 854 } 855 } else 856 lcn = LCN_RL_NOT_MAPPED; 857 /* 858 * If it is not a hole and not out of bounds, the runlist is 859 * probably unmapped so try to map it now. 860 */ 861 if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { 862 if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { 863 /* Attempt to map runlist. */ 864 if (!rl_write_locked) { 865 /* 866 * We need the runlist locked for 867 * writing, so if it is locked for 868 * reading relock it now and retry in 869 * case it changed whilst we dropped 870 * the lock. 871 */ 872 up_read(&ni->runlist.lock); 873 down_write(&ni->runlist.lock); 874 rl_write_locked = true; 875 goto retry_remap; 876 } 877 err = ntfs_map_runlist_nolock(ni, bh_cpos, 878 NULL); 879 if (likely(!err)) { 880 is_retry = true; 881 goto retry_remap; 882 } 883 /* 884 * If @vcn is out of bounds, pretend @lcn is 885 * LCN_ENOENT. As long as the buffer is out 886 * of bounds this will work fine. 887 */ 888 if (err == -ENOENT) { 889 lcn = LCN_ENOENT; 890 err = 0; 891 goto rl_not_mapped_enoent; 892 } 893 } else 894 err = -EIO; 895 /* Failed to map the buffer, even after retrying. */ 896 bh->b_blocknr = -1; 897 ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " 898 "attribute type 0x%x, vcn 0x%llx, " 899 "vcn offset 0x%x, because its " 900 "location on disk could not be " 901 "determined%s (error code %i).", 902 ni->mft_no, ni->type, 903 (unsigned long long)bh_cpos, 904 (unsigned)bh_pos & 905 vol->cluster_size_mask, 906 is_retry ? " even after retrying" : "", 907 err); 908 break; 909 } 910 rl_not_mapped_enoent: 911 /* 912 * The buffer is in a hole or out of bounds. We need to fill 913 * the hole, unless the buffer is in a cluster which is not 914 * touched by the write, in which case we just leave the buffer 915 * unmapped. This can only happen when the cluster size is 916 * less than the page cache size. 917 */ 918 if (unlikely(vol->cluster_size < PAGE_SIZE)) { 919 bh_cend = (bh_end + vol->cluster_size - 1) >> 920 vol->cluster_size_bits; 921 if ((bh_cend <= cpos || bh_cpos >= cend)) { 922 bh->b_blocknr = -1; 923 /* 924 * If the buffer is uptodate we skip it. If it 925 * is not but the folio is uptodate, we can set 926 * the buffer uptodate. If the folio is not 927 * uptodate, we can clear the buffer and set it 928 * uptodate. Whether this is worthwhile is 929 * debatable and this could be removed. 930 */ 931 if (folio_test_uptodate(folio)) { 932 if (!buffer_uptodate(bh)) 933 set_buffer_uptodate(bh); 934 } else if (!buffer_uptodate(bh)) { 935 folio_zero_range(folio, bh_offset(bh), 936 blocksize); 937 set_buffer_uptodate(bh); 938 } 939 continue; 940 } 941 } 942 /* 943 * Out of bounds buffer is invalid if it was not really out of 944 * bounds. 945 */ 946 BUG_ON(lcn != LCN_HOLE); 947 /* 948 * We need the runlist locked for writing, so if it is locked 949 * for reading relock it now and retry in case it changed 950 * whilst we dropped the lock. 951 */ 952 BUG_ON(!rl); 953 if (!rl_write_locked) { 954 up_read(&ni->runlist.lock); 955 down_write(&ni->runlist.lock); 956 rl_write_locked = true; 957 goto retry_remap; 958 } 959 /* Find the previous last allocated cluster. */ 960 BUG_ON(rl->lcn != LCN_HOLE); 961 lcn = -1; 962 rl2 = rl; 963 while (--rl2 >= ni->runlist.rl) { 964 if (rl2->lcn >= 0) { 965 lcn = rl2->lcn + rl2->length; 966 break; 967 } 968 } 969 rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, 970 false); 971 if (IS_ERR(rl2)) { 972 err = PTR_ERR(rl2); 973 ntfs_debug("Failed to allocate cluster, error code %i.", 974 err); 975 break; 976 } 977 lcn = rl2->lcn; 978 rl = ntfs_runlists_merge(ni->runlist.rl, rl2); 979 if (IS_ERR(rl)) { 980 err = PTR_ERR(rl); 981 if (err != -ENOMEM) 982 err = -EIO; 983 if (ntfs_cluster_free_from_rl(vol, rl2)) { 984 ntfs_error(vol->sb, "Failed to release " 985 "allocated cluster in error " 986 "code path. Run chkdsk to " 987 "recover the lost cluster."); 988 NVolSetErrors(vol); 989 } 990 ntfs_free(rl2); 991 break; 992 } 993 ni->runlist.rl = rl; 994 status.runlist_merged = 1; 995 ntfs_debug("Allocated cluster, lcn 0x%llx.", 996 (unsigned long long)lcn); 997 /* Map and lock the mft record and get the attribute record. */ 998 if (!NInoAttr(ni)) 999 base_ni = ni; 1000 else 1001 base_ni = ni->ext.base_ntfs_ino; 1002 m = map_mft_record(base_ni); 1003 if (IS_ERR(m)) { 1004 err = PTR_ERR(m); 1005 break; 1006 } 1007 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1008 if (unlikely(!ctx)) { 1009 err = -ENOMEM; 1010 unmap_mft_record(base_ni); 1011 break; 1012 } 1013 status.mft_attr_mapped = 1; 1014 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1015 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); 1016 if (unlikely(err)) { 1017 if (err == -ENOENT) 1018 err = -EIO; 1019 break; 1020 } 1021 m = ctx->mrec; 1022 a = ctx->attr; 1023 /* 1024 * Find the runlist element with which the attribute extent 1025 * starts. Note, we cannot use the _attr_ version because we 1026 * have mapped the mft record. That is ok because we know the 1027 * runlist fragment must be mapped already to have ever gotten 1028 * here, so we can just use the _rl_ version. 1029 */ 1030 vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); 1031 rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); 1032 BUG_ON(!rl2); 1033 BUG_ON(!rl2->length); 1034 BUG_ON(rl2->lcn < LCN_HOLE); 1035 highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); 1036 /* 1037 * If @highest_vcn is zero, calculate the real highest_vcn 1038 * (which can really be zero). 1039 */ 1040 if (!highest_vcn) 1041 highest_vcn = (sle64_to_cpu( 1042 a->data.non_resident.allocated_size) >> 1043 vol->cluster_size_bits) - 1; 1044 /* 1045 * Determine the size of the mapping pairs array for the new 1046 * extent, i.e. the old extent with the hole filled. 1047 */ 1048 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, 1049 highest_vcn); 1050 if (unlikely(mp_size <= 0)) { 1051 if (!(err = mp_size)) 1052 err = -EIO; 1053 ntfs_debug("Failed to get size for mapping pairs " 1054 "array, error code %i.", err); 1055 break; 1056 } 1057 /* 1058 * Resize the attribute record to fit the new mapping pairs 1059 * array. 1060 */ 1061 attr_rec_len = le32_to_cpu(a->length); 1062 err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( 1063 a->data.non_resident.mapping_pairs_offset)); 1064 if (unlikely(err)) { 1065 BUG_ON(err != -ENOSPC); 1066 // TODO: Deal with this by using the current attribute 1067 // and fill it with as much of the mapping pairs 1068 // array as possible. Then loop over each attribute 1069 // extent rewriting the mapping pairs arrays as we go 1070 // along and if when we reach the end we have not 1071 // enough space, try to resize the last attribute 1072 // extent and if even that fails, add a new attribute 1073 // extent. 1074 // We could also try to resize at each step in the hope 1075 // that we will not need to rewrite every single extent. 1076 // Note, we may need to decompress some extents to fill 1077 // the runlist as we are walking the extents... 1078 ntfs_error(vol->sb, "Not enough space in the mft " 1079 "record for the extended attribute " 1080 "record. This case is not " 1081 "implemented yet."); 1082 err = -EOPNOTSUPP; 1083 break ; 1084 } 1085 status.mp_rebuilt = 1; 1086 /* 1087 * Generate the mapping pairs array directly into the attribute 1088 * record. 1089 */ 1090 err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( 1091 a->data.non_resident.mapping_pairs_offset), 1092 mp_size, rl2, vcn, highest_vcn, NULL); 1093 if (unlikely(err)) { 1094 ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " 1095 "attribute type 0x%x, because building " 1096 "the mapping pairs failed with error " 1097 "code %i.", vi->i_ino, 1098 (unsigned)le32_to_cpu(ni->type), err); 1099 err = -EIO; 1100 break; 1101 } 1102 /* Update the highest_vcn but only if it was not set. */ 1103 if (unlikely(!a->data.non_resident.highest_vcn)) 1104 a->data.non_resident.highest_vcn = 1105 cpu_to_sle64(highest_vcn); 1106 /* 1107 * If the attribute is sparse/compressed, update the compressed 1108 * size in the ntfs_inode structure and the attribute record. 1109 */ 1110 if (likely(NInoSparse(ni) || NInoCompressed(ni))) { 1111 /* 1112 * If we are not in the first attribute extent, switch 1113 * to it, but first ensure the changes will make it to 1114 * disk later. 1115 */ 1116 if (a->data.non_resident.lowest_vcn) { 1117 flush_dcache_mft_record_page(ctx->ntfs_ino); 1118 mark_mft_record_dirty(ctx->ntfs_ino); 1119 ntfs_attr_reinit_search_ctx(ctx); 1120 err = ntfs_attr_lookup(ni->type, ni->name, 1121 ni->name_len, CASE_SENSITIVE, 1122 0, NULL, 0, ctx); 1123 if (unlikely(err)) { 1124 status.attr_switched = 1; 1125 break; 1126 } 1127 /* @m is not used any more so do not set it. */ 1128 a = ctx->attr; 1129 } 1130 write_lock_irqsave(&ni->size_lock, flags); 1131 ni->itype.compressed.size += vol->cluster_size; 1132 a->data.non_resident.compressed_size = 1133 cpu_to_sle64(ni->itype.compressed.size); 1134 write_unlock_irqrestore(&ni->size_lock, flags); 1135 } 1136 /* Ensure the changes make it to disk. */ 1137 flush_dcache_mft_record_page(ctx->ntfs_ino); 1138 mark_mft_record_dirty(ctx->ntfs_ino); 1139 ntfs_attr_put_search_ctx(ctx); 1140 unmap_mft_record(base_ni); 1141 /* Successfully filled the hole. */ 1142 status.runlist_merged = 0; 1143 status.mft_attr_mapped = 0; 1144 status.mp_rebuilt = 0; 1145 /* Setup the map cache and use that to deal with the buffer. */ 1146 was_hole = true; 1147 vcn = bh_cpos; 1148 vcn_len = 1; 1149 lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); 1150 cdelta = 0; 1151 /* 1152 * If the number of remaining clusters in the @pages is smaller 1153 * or equal to the number of cached clusters, unlock the 1154 * runlist as the map cache will be used from now on. 1155 */ 1156 if (likely(vcn + vcn_len >= cend)) { 1157 up_write(&ni->runlist.lock); 1158 rl_write_locked = false; 1159 rl = NULL; 1160 } 1161 goto map_buffer_cached; 1162 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); 1163 /* If there are no errors, do the next page. */ 1164 if (likely(!err && ++u < nr_pages)) 1165 goto do_next_folio; 1166 /* If there are no errors, release the runlist lock if we took it. */ 1167 if (likely(!err)) { 1168 if (unlikely(rl_write_locked)) { 1169 up_write(&ni->runlist.lock); 1170 rl_write_locked = false; 1171 } else if (unlikely(rl)) 1172 up_read(&ni->runlist.lock); 1173 rl = NULL; 1174 } 1175 /* If we issued read requests, let them complete. */ 1176 read_lock_irqsave(&ni->size_lock, flags); 1177 initialized_size = ni->initialized_size; 1178 read_unlock_irqrestore(&ni->size_lock, flags); 1179 while (wait_bh > wait) { 1180 bh = *--wait_bh; 1181 wait_on_buffer(bh); 1182 if (likely(buffer_uptodate(bh))) { 1183 folio = bh->b_folio; 1184 bh_pos = folio_pos(folio) + bh_offset(bh); 1185 /* 1186 * If the buffer overflows the initialized size, need 1187 * to zero the overflowing region. 1188 */ 1189 if (unlikely(bh_pos + blocksize > initialized_size)) { 1190 int ofs = 0; 1191 1192 if (likely(bh_pos < initialized_size)) 1193 ofs = initialized_size - bh_pos; 1194 folio_zero_segment(folio, bh_offset(bh) + ofs, 1195 blocksize); 1196 } 1197 } else /* if (unlikely(!buffer_uptodate(bh))) */ 1198 err = -EIO; 1199 } 1200 if (likely(!err)) { 1201 /* Clear buffer_new on all buffers. */ 1202 u = 0; 1203 do { 1204 bh = head = page_buffers(pages[u]); 1205 do { 1206 if (buffer_new(bh)) 1207 clear_buffer_new(bh); 1208 } while ((bh = bh->b_this_page) != head); 1209 } while (++u < nr_pages); 1210 ntfs_debug("Done."); 1211 return err; 1212 } 1213 if (status.attr_switched) { 1214 /* Get back to the attribute extent we modified. */ 1215 ntfs_attr_reinit_search_ctx(ctx); 1216 if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1217 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { 1218 ntfs_error(vol->sb, "Failed to find required " 1219 "attribute extent of attribute in " 1220 "error code path. Run chkdsk to " 1221 "recover."); 1222 write_lock_irqsave(&ni->size_lock, flags); 1223 ni->itype.compressed.size += vol->cluster_size; 1224 write_unlock_irqrestore(&ni->size_lock, flags); 1225 flush_dcache_mft_record_page(ctx->ntfs_ino); 1226 mark_mft_record_dirty(ctx->ntfs_ino); 1227 /* 1228 * The only thing that is now wrong is the compressed 1229 * size of the base attribute extent which chkdsk 1230 * should be able to fix. 1231 */ 1232 NVolSetErrors(vol); 1233 } else { 1234 m = ctx->mrec; 1235 a = ctx->attr; 1236 status.attr_switched = 0; 1237 } 1238 } 1239 /* 1240 * If the runlist has been modified, need to restore it by punching a 1241 * hole into it and we then need to deallocate the on-disk cluster as 1242 * well. Note, we only modify the runlist if we are able to generate a 1243 * new mapping pairs array, i.e. only when the mapped attribute extent 1244 * is not switched. 1245 */ 1246 if (status.runlist_merged && !status.attr_switched) { 1247 BUG_ON(!rl_write_locked); 1248 /* Make the file cluster we allocated sparse in the runlist. */ 1249 if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { 1250 ntfs_error(vol->sb, "Failed to punch hole into " 1251 "attribute runlist in error code " 1252 "path. Run chkdsk to recover the " 1253 "lost cluster."); 1254 NVolSetErrors(vol); 1255 } else /* if (success) */ { 1256 status.runlist_merged = 0; 1257 /* 1258 * Deallocate the on-disk cluster we allocated but only 1259 * if we succeeded in punching its vcn out of the 1260 * runlist. 1261 */ 1262 down_write(&vol->lcnbmp_lock); 1263 if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { 1264 ntfs_error(vol->sb, "Failed to release " 1265 "allocated cluster in error " 1266 "code path. Run chkdsk to " 1267 "recover the lost cluster."); 1268 NVolSetErrors(vol); 1269 } 1270 up_write(&vol->lcnbmp_lock); 1271 } 1272 } 1273 /* 1274 * Resize the attribute record to its old size and rebuild the mapping 1275 * pairs array. Note, we only can do this if the runlist has been 1276 * restored to its old state which also implies that the mapped 1277 * attribute extent is not switched. 1278 */ 1279 if (status.mp_rebuilt && !status.runlist_merged) { 1280 if (ntfs_attr_record_resize(m, a, attr_rec_len)) { 1281 ntfs_error(vol->sb, "Failed to restore attribute " 1282 "record in error code path. Run " 1283 "chkdsk to recover."); 1284 NVolSetErrors(vol); 1285 } else /* if (success) */ { 1286 if (ntfs_mapping_pairs_build(vol, (u8*)a + 1287 le16_to_cpu(a->data.non_resident. 1288 mapping_pairs_offset), attr_rec_len - 1289 le16_to_cpu(a->data.non_resident. 1290 mapping_pairs_offset), ni->runlist.rl, 1291 vcn, highest_vcn, NULL)) { 1292 ntfs_error(vol->sb, "Failed to restore " 1293 "mapping pairs array in error " 1294 "code path. Run chkdsk to " 1295 "recover."); 1296 NVolSetErrors(vol); 1297 } 1298 flush_dcache_mft_record_page(ctx->ntfs_ino); 1299 mark_mft_record_dirty(ctx->ntfs_ino); 1300 } 1301 } 1302 /* Release the mft record and the attribute. */ 1303 if (status.mft_attr_mapped) { 1304 ntfs_attr_put_search_ctx(ctx); 1305 unmap_mft_record(base_ni); 1306 } 1307 /* Release the runlist lock. */ 1308 if (rl_write_locked) 1309 up_write(&ni->runlist.lock); 1310 else if (rl) 1311 up_read(&ni->runlist.lock); 1312 /* 1313 * Zero out any newly allocated blocks to avoid exposing stale data. 1314 * If BH_New is set, we know that the block was newly allocated above 1315 * and that it has not been fully zeroed and marked dirty yet. 1316 */ 1317 nr_pages = u; 1318 u = 0; 1319 end = bh_cpos << vol->cluster_size_bits; 1320 do { 1321 folio = page_folio(pages[u]); 1322 bh = head = folio_buffers(folio); 1323 do { 1324 if (u == nr_pages && 1325 folio_pos(folio) + bh_offset(bh) >= end) 1326 break; 1327 if (!buffer_new(bh)) 1328 continue; 1329 clear_buffer_new(bh); 1330 if (!buffer_uptodate(bh)) { 1331 if (folio_test_uptodate(folio)) 1332 set_buffer_uptodate(bh); 1333 else { 1334 folio_zero_range(folio, bh_offset(bh), 1335 blocksize); 1336 set_buffer_uptodate(bh); 1337 } 1338 } 1339 mark_buffer_dirty(bh); 1340 } while ((bh = bh->b_this_page) != head); 1341 } while (++u <= nr_pages); 1342 ntfs_error(vol->sb, "Failed. Returning error code %i.", err); 1343 return err; 1344 } 1345 1346 static inline void ntfs_flush_dcache_pages(struct page **pages, 1347 unsigned nr_pages) 1348 { 1349 BUG_ON(!nr_pages); 1350 /* 1351 * Warning: Do not do the decrement at the same time as the call to 1352 * flush_dcache_page() because it is a NULL macro on i386 and hence the 1353 * decrement never happens so the loop never terminates. 1354 */ 1355 do { 1356 --nr_pages; 1357 flush_dcache_page(pages[nr_pages]); 1358 } while (nr_pages > 0); 1359 } 1360 1361 /** 1362 * ntfs_commit_pages_after_non_resident_write - commit the received data 1363 * @pages: array of destination pages 1364 * @nr_pages: number of pages in @pages 1365 * @pos: byte position in file at which the write begins 1366 * @bytes: number of bytes to be written 1367 * 1368 * See description of ntfs_commit_pages_after_write(), below. 1369 */ 1370 static inline int ntfs_commit_pages_after_non_resident_write( 1371 struct page **pages, const unsigned nr_pages, 1372 s64 pos, size_t bytes) 1373 { 1374 s64 end, initialized_size; 1375 struct inode *vi; 1376 ntfs_inode *ni, *base_ni; 1377 struct buffer_head *bh, *head; 1378 ntfs_attr_search_ctx *ctx; 1379 MFT_RECORD *m; 1380 ATTR_RECORD *a; 1381 unsigned long flags; 1382 unsigned blocksize, u; 1383 int err; 1384 1385 vi = pages[0]->mapping->host; 1386 ni = NTFS_I(vi); 1387 blocksize = vi->i_sb->s_blocksize; 1388 end = pos + bytes; 1389 u = 0; 1390 do { 1391 s64 bh_pos; 1392 struct page *page; 1393 bool partial; 1394 1395 page = pages[u]; 1396 bh_pos = (s64)page->index << PAGE_SHIFT; 1397 bh = head = page_buffers(page); 1398 partial = false; 1399 do { 1400 s64 bh_end; 1401 1402 bh_end = bh_pos + blocksize; 1403 if (bh_end <= pos || bh_pos >= end) { 1404 if (!buffer_uptodate(bh)) 1405 partial = true; 1406 } else { 1407 set_buffer_uptodate(bh); 1408 mark_buffer_dirty(bh); 1409 } 1410 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); 1411 /* 1412 * If all buffers are now uptodate but the page is not, set the 1413 * page uptodate. 1414 */ 1415 if (!partial && !PageUptodate(page)) 1416 SetPageUptodate(page); 1417 } while (++u < nr_pages); 1418 /* 1419 * Finally, if we do not need to update initialized_size or i_size we 1420 * are finished. 1421 */ 1422 read_lock_irqsave(&ni->size_lock, flags); 1423 initialized_size = ni->initialized_size; 1424 read_unlock_irqrestore(&ni->size_lock, flags); 1425 if (end <= initialized_size) { 1426 ntfs_debug("Done."); 1427 return 0; 1428 } 1429 /* 1430 * Update initialized_size/i_size as appropriate, both in the inode and 1431 * the mft record. 1432 */ 1433 if (!NInoAttr(ni)) 1434 base_ni = ni; 1435 else 1436 base_ni = ni->ext.base_ntfs_ino; 1437 /* Map, pin, and lock the mft record. */ 1438 m = map_mft_record(base_ni); 1439 if (IS_ERR(m)) { 1440 err = PTR_ERR(m); 1441 m = NULL; 1442 ctx = NULL; 1443 goto err_out; 1444 } 1445 BUG_ON(!NInoNonResident(ni)); 1446 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1447 if (unlikely(!ctx)) { 1448 err = -ENOMEM; 1449 goto err_out; 1450 } 1451 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1452 CASE_SENSITIVE, 0, NULL, 0, ctx); 1453 if (unlikely(err)) { 1454 if (err == -ENOENT) 1455 err = -EIO; 1456 goto err_out; 1457 } 1458 a = ctx->attr; 1459 BUG_ON(!a->non_resident); 1460 write_lock_irqsave(&ni->size_lock, flags); 1461 BUG_ON(end > ni->allocated_size); 1462 ni->initialized_size = end; 1463 a->data.non_resident.initialized_size = cpu_to_sle64(end); 1464 if (end > i_size_read(vi)) { 1465 i_size_write(vi, end); 1466 a->data.non_resident.data_size = 1467 a->data.non_resident.initialized_size; 1468 } 1469 write_unlock_irqrestore(&ni->size_lock, flags); 1470 /* Mark the mft record dirty, so it gets written back. */ 1471 flush_dcache_mft_record_page(ctx->ntfs_ino); 1472 mark_mft_record_dirty(ctx->ntfs_ino); 1473 ntfs_attr_put_search_ctx(ctx); 1474 unmap_mft_record(base_ni); 1475 ntfs_debug("Done."); 1476 return 0; 1477 err_out: 1478 if (ctx) 1479 ntfs_attr_put_search_ctx(ctx); 1480 if (m) 1481 unmap_mft_record(base_ni); 1482 ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " 1483 "code %i).", err); 1484 if (err != -ENOMEM) 1485 NVolSetErrors(ni->vol); 1486 return err; 1487 } 1488 1489 /** 1490 * ntfs_commit_pages_after_write - commit the received data 1491 * @pages: array of destination pages 1492 * @nr_pages: number of pages in @pages 1493 * @pos: byte position in file at which the write begins 1494 * @bytes: number of bytes to be written 1495 * 1496 * This is called from ntfs_file_buffered_write() with i_mutex held on the inode 1497 * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are 1498 * locked but not kmap()ped. The source data has already been copied into the 1499 * @page. ntfs_prepare_pages_for_non_resident_write() has been called before 1500 * the data was copied (for non-resident attributes only) and it returned 1501 * success. 1502 * 1503 * Need to set uptodate and mark dirty all buffers within the boundary of the 1504 * write. If all buffers in a page are uptodate we set the page uptodate, too. 1505 * 1506 * Setting the buffers dirty ensures that they get written out later when 1507 * ntfs_writepage() is invoked by the VM. 1508 * 1509 * Finally, we need to update i_size and initialized_size as appropriate both 1510 * in the inode and the mft record. 1511 * 1512 * This is modelled after fs/buffer.c::generic_commit_write(), which marks 1513 * buffers uptodate and dirty, sets the page uptodate if all buffers in the 1514 * page are uptodate, and updates i_size if the end of io is beyond i_size. In 1515 * that case, it also marks the inode dirty. 1516 * 1517 * If things have gone as outlined in 1518 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page 1519 * content modifications here for non-resident attributes. For resident 1520 * attributes we need to do the uptodate bringing here which we combine with 1521 * the copying into the mft record which means we save one atomic kmap. 1522 * 1523 * Return 0 on success or -errno on error. 1524 */ 1525 static int ntfs_commit_pages_after_write(struct page **pages, 1526 const unsigned nr_pages, s64 pos, size_t bytes) 1527 { 1528 s64 end, initialized_size; 1529 loff_t i_size; 1530 struct inode *vi; 1531 ntfs_inode *ni, *base_ni; 1532 struct page *page; 1533 ntfs_attr_search_ctx *ctx; 1534 MFT_RECORD *m; 1535 ATTR_RECORD *a; 1536 char *kattr, *kaddr; 1537 unsigned long flags; 1538 u32 attr_len; 1539 int err; 1540 1541 BUG_ON(!nr_pages); 1542 BUG_ON(!pages); 1543 page = pages[0]; 1544 BUG_ON(!page); 1545 vi = page->mapping->host; 1546 ni = NTFS_I(vi); 1547 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " 1548 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", 1549 vi->i_ino, ni->type, page->index, nr_pages, 1550 (long long)pos, bytes); 1551 if (NInoNonResident(ni)) 1552 return ntfs_commit_pages_after_non_resident_write(pages, 1553 nr_pages, pos, bytes); 1554 BUG_ON(nr_pages > 1); 1555 /* 1556 * Attribute is resident, implying it is not compressed, encrypted, or 1557 * sparse. 1558 */ 1559 if (!NInoAttr(ni)) 1560 base_ni = ni; 1561 else 1562 base_ni = ni->ext.base_ntfs_ino; 1563 BUG_ON(NInoNonResident(ni)); 1564 /* Map, pin, and lock the mft record. */ 1565 m = map_mft_record(base_ni); 1566 if (IS_ERR(m)) { 1567 err = PTR_ERR(m); 1568 m = NULL; 1569 ctx = NULL; 1570 goto err_out; 1571 } 1572 ctx = ntfs_attr_get_search_ctx(base_ni, m); 1573 if (unlikely(!ctx)) { 1574 err = -ENOMEM; 1575 goto err_out; 1576 } 1577 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 1578 CASE_SENSITIVE, 0, NULL, 0, ctx); 1579 if (unlikely(err)) { 1580 if (err == -ENOENT) 1581 err = -EIO; 1582 goto err_out; 1583 } 1584 a = ctx->attr; 1585 BUG_ON(a->non_resident); 1586 /* The total length of the attribute value. */ 1587 attr_len = le32_to_cpu(a->data.resident.value_length); 1588 i_size = i_size_read(vi); 1589 BUG_ON(attr_len != i_size); 1590 BUG_ON(pos > attr_len); 1591 end = pos + bytes; 1592 BUG_ON(end > le32_to_cpu(a->length) - 1593 le16_to_cpu(a->data.resident.value_offset)); 1594 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); 1595 kaddr = kmap_atomic(page); 1596 /* Copy the received data from the page to the mft record. */ 1597 memcpy(kattr + pos, kaddr + pos, bytes); 1598 /* Update the attribute length if necessary. */ 1599 if (end > attr_len) { 1600 attr_len = end; 1601 a->data.resident.value_length = cpu_to_le32(attr_len); 1602 } 1603 /* 1604 * If the page is not uptodate, bring the out of bounds area(s) 1605 * uptodate by copying data from the mft record to the page. 1606 */ 1607 if (!PageUptodate(page)) { 1608 if (pos > 0) 1609 memcpy(kaddr, kattr, pos); 1610 if (end < attr_len) 1611 memcpy(kaddr + end, kattr + end, attr_len - end); 1612 /* Zero the region outside the end of the attribute value. */ 1613 memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len); 1614 flush_dcache_page(page); 1615 SetPageUptodate(page); 1616 } 1617 kunmap_atomic(kaddr); 1618 /* Update initialized_size/i_size if necessary. */ 1619 read_lock_irqsave(&ni->size_lock, flags); 1620 initialized_size = ni->initialized_size; 1621 BUG_ON(end > ni->allocated_size); 1622 read_unlock_irqrestore(&ni->size_lock, flags); 1623 BUG_ON(initialized_size != i_size); 1624 if (end > initialized_size) { 1625 write_lock_irqsave(&ni->size_lock, flags); 1626 ni->initialized_size = end; 1627 i_size_write(vi, end); 1628 write_unlock_irqrestore(&ni->size_lock, flags); 1629 } 1630 /* Mark the mft record dirty, so it gets written back. */ 1631 flush_dcache_mft_record_page(ctx->ntfs_ino); 1632 mark_mft_record_dirty(ctx->ntfs_ino); 1633 ntfs_attr_put_search_ctx(ctx); 1634 unmap_mft_record(base_ni); 1635 ntfs_debug("Done."); 1636 return 0; 1637 err_out: 1638 if (err == -ENOMEM) { 1639 ntfs_warning(vi->i_sb, "Error allocating memory required to " 1640 "commit the write."); 1641 if (PageUptodate(page)) { 1642 ntfs_warning(vi->i_sb, "Page is uptodate, setting " 1643 "dirty so the write will be retried " 1644 "later on by the VM."); 1645 /* 1646 * Put the page on mapping->dirty_pages, but leave its 1647 * buffers' dirty state as-is. 1648 */ 1649 __set_page_dirty_nobuffers(page); 1650 err = 0; 1651 } else 1652 ntfs_error(vi->i_sb, "Page is not uptodate. Written " 1653 "data has been lost."); 1654 } else { 1655 ntfs_error(vi->i_sb, "Resident attribute commit write failed " 1656 "with error %i.", err); 1657 NVolSetErrors(ni->vol); 1658 } 1659 if (ctx) 1660 ntfs_attr_put_search_ctx(ctx); 1661 if (m) 1662 unmap_mft_record(base_ni); 1663 return err; 1664 } 1665 1666 /* 1667 * Copy as much as we can into the pages and return the number of bytes which 1668 * were successfully copied. If a fault is encountered then clear the pages 1669 * out to (ofs + bytes) and return the number of bytes which were copied. 1670 */ 1671 static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, 1672 unsigned ofs, struct iov_iter *i, size_t bytes) 1673 { 1674 struct page **last_page = pages + nr_pages; 1675 size_t total = 0; 1676 unsigned len, copied; 1677 1678 do { 1679 len = PAGE_SIZE - ofs; 1680 if (len > bytes) 1681 len = bytes; 1682 copied = copy_page_from_iter_atomic(*pages, ofs, len, i); 1683 total += copied; 1684 bytes -= copied; 1685 if (!bytes) 1686 break; 1687 if (copied < len) 1688 goto err; 1689 ofs = 0; 1690 } while (++pages < last_page); 1691 out: 1692 return total; 1693 err: 1694 /* Zero the rest of the target like __copy_from_user(). */ 1695 len = PAGE_SIZE - copied; 1696 do { 1697 if (len > bytes) 1698 len = bytes; 1699 zero_user(*pages, copied, len); 1700 bytes -= len; 1701 copied = 0; 1702 len = PAGE_SIZE; 1703 } while (++pages < last_page); 1704 goto out; 1705 } 1706 1707 /** 1708 * ntfs_perform_write - perform buffered write to a file 1709 * @file: file to write to 1710 * @i: iov_iter with data to write 1711 * @pos: byte offset in file at which to begin writing to 1712 */ 1713 static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, 1714 loff_t pos) 1715 { 1716 struct address_space *mapping = file->f_mapping; 1717 struct inode *vi = mapping->host; 1718 ntfs_inode *ni = NTFS_I(vi); 1719 ntfs_volume *vol = ni->vol; 1720 struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; 1721 struct page *cached_page = NULL; 1722 VCN last_vcn; 1723 LCN lcn; 1724 size_t bytes; 1725 ssize_t status, written = 0; 1726 unsigned nr_pages; 1727 1728 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " 1729 "0x%llx, count 0x%lx.", vi->i_ino, 1730 (unsigned)le32_to_cpu(ni->type), 1731 (unsigned long long)pos, 1732 (unsigned long)iov_iter_count(i)); 1733 /* 1734 * If a previous ntfs_truncate() failed, repeat it and abort if it 1735 * fails again. 1736 */ 1737 if (unlikely(NInoTruncateFailed(ni))) { 1738 int err; 1739 1740 inode_dio_wait(vi); 1741 err = ntfs_truncate(vi); 1742 if (err || NInoTruncateFailed(ni)) { 1743 if (!err) 1744 err = -EIO; 1745 ntfs_error(vol->sb, "Cannot perform write to inode " 1746 "0x%lx, attribute type 0x%x, because " 1747 "ntfs_truncate() failed (error code " 1748 "%i).", vi->i_ino, 1749 (unsigned)le32_to_cpu(ni->type), err); 1750 return err; 1751 } 1752 } 1753 /* 1754 * Determine the number of pages per cluster for non-resident 1755 * attributes. 1756 */ 1757 nr_pages = 1; 1758 if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni)) 1759 nr_pages = vol->cluster_size >> PAGE_SHIFT; 1760 last_vcn = -1; 1761 do { 1762 VCN vcn; 1763 pgoff_t start_idx; 1764 unsigned ofs, do_pages, u; 1765 size_t copied; 1766 1767 start_idx = pos >> PAGE_SHIFT; 1768 ofs = pos & ~PAGE_MASK; 1769 bytes = PAGE_SIZE - ofs; 1770 do_pages = 1; 1771 if (nr_pages > 1) { 1772 vcn = pos >> vol->cluster_size_bits; 1773 if (vcn != last_vcn) { 1774 last_vcn = vcn; 1775 /* 1776 * Get the lcn of the vcn the write is in. If 1777 * it is a hole, need to lock down all pages in 1778 * the cluster. 1779 */ 1780 down_read(&ni->runlist.lock); 1781 lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >> 1782 vol->cluster_size_bits, false); 1783 up_read(&ni->runlist.lock); 1784 if (unlikely(lcn < LCN_HOLE)) { 1785 if (lcn == LCN_ENOMEM) 1786 status = -ENOMEM; 1787 else { 1788 status = -EIO; 1789 ntfs_error(vol->sb, "Cannot " 1790 "perform write to " 1791 "inode 0x%lx, " 1792 "attribute type 0x%x, " 1793 "because the attribute " 1794 "is corrupt.", 1795 vi->i_ino, (unsigned) 1796 le32_to_cpu(ni->type)); 1797 } 1798 break; 1799 } 1800 if (lcn == LCN_HOLE) { 1801 start_idx = (pos & ~(s64) 1802 vol->cluster_size_mask) 1803 >> PAGE_SHIFT; 1804 bytes = vol->cluster_size - (pos & 1805 vol->cluster_size_mask); 1806 do_pages = nr_pages; 1807 } 1808 } 1809 } 1810 if (bytes > iov_iter_count(i)) 1811 bytes = iov_iter_count(i); 1812 again: 1813 /* 1814 * Bring in the user page(s) that we will copy from _first_. 1815 * Otherwise there is a nasty deadlock on copying from the same 1816 * page(s) as we are writing to, without it/them being marked 1817 * up-to-date. Note, at present there is nothing to stop the 1818 * pages being swapped out between us bringing them into memory 1819 * and doing the actual copying. 1820 */ 1821 if (unlikely(fault_in_iov_iter_readable(i, bytes))) { 1822 status = -EFAULT; 1823 break; 1824 } 1825 /* Get and lock @do_pages starting at index @start_idx. */ 1826 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 1827 pages, &cached_page); 1828 if (unlikely(status)) 1829 break; 1830 /* 1831 * For non-resident attributes, we need to fill any holes with 1832 * actual clusters and ensure all bufferes are mapped. We also 1833 * need to bring uptodate any buffers that are only partially 1834 * being written to. 1835 */ 1836 if (NInoNonResident(ni)) { 1837 status = ntfs_prepare_pages_for_non_resident_write( 1838 pages, do_pages, pos, bytes); 1839 if (unlikely(status)) { 1840 do { 1841 unlock_page(pages[--do_pages]); 1842 put_page(pages[do_pages]); 1843 } while (do_pages); 1844 break; 1845 } 1846 } 1847 u = (pos >> PAGE_SHIFT) - pages[0]->index; 1848 copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs, 1849 i, bytes); 1850 ntfs_flush_dcache_pages(pages + u, do_pages - u); 1851 status = 0; 1852 if (likely(copied == bytes)) { 1853 status = ntfs_commit_pages_after_write(pages, do_pages, 1854 pos, bytes); 1855 } 1856 do { 1857 unlock_page(pages[--do_pages]); 1858 put_page(pages[do_pages]); 1859 } while (do_pages); 1860 if (unlikely(status < 0)) { 1861 iov_iter_revert(i, copied); 1862 break; 1863 } 1864 cond_resched(); 1865 if (unlikely(copied < bytes)) { 1866 iov_iter_revert(i, copied); 1867 if (copied) 1868 bytes = copied; 1869 else if (bytes > PAGE_SIZE - ofs) 1870 bytes = PAGE_SIZE - ofs; 1871 goto again; 1872 } 1873 pos += copied; 1874 written += copied; 1875 balance_dirty_pages_ratelimited(mapping); 1876 if (fatal_signal_pending(current)) { 1877 status = -EINTR; 1878 break; 1879 } 1880 } while (iov_iter_count(i)); 1881 if (cached_page) 1882 put_page(cached_page); 1883 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 1884 written ? "written" : "status", (unsigned long)written, 1885 (long)status); 1886 return written ? written : status; 1887 } 1888 1889 /** 1890 * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock() 1891 * @iocb: IO state structure 1892 * @from: iov_iter with data to write 1893 * 1894 * Basically the same as generic_file_write_iter() except that it ends up 1895 * up calling ntfs_perform_write() instead of generic_perform_write() and that 1896 * O_DIRECT is not implemented. 1897 */ 1898 static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 1899 { 1900 struct file *file = iocb->ki_filp; 1901 struct inode *vi = file_inode(file); 1902 ssize_t written = 0; 1903 ssize_t err; 1904 1905 inode_lock(vi); 1906 /* We can write back this queue in page reclaim. */ 1907 err = ntfs_prepare_file_for_write(iocb, from); 1908 if (iov_iter_count(from) && !err) 1909 written = ntfs_perform_write(file, from, iocb->ki_pos); 1910 inode_unlock(vi); 1911 iocb->ki_pos += written; 1912 if (likely(written > 0)) 1913 written = generic_write_sync(iocb, written); 1914 return written ? written : err; 1915 } 1916 1917 /** 1918 * ntfs_file_fsync - sync a file to disk 1919 * @filp: file to be synced 1920 * @datasync: if non-zero only flush user data and not metadata 1921 * 1922 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync 1923 * system calls. This function is inspired by fs/buffer.c::file_fsync(). 1924 * 1925 * If @datasync is false, write the mft record and all associated extent mft 1926 * records as well as the $DATA attribute and then sync the block device. 1927 * 1928 * If @datasync is true and the attribute is non-resident, we skip the writing 1929 * of the mft record and all associated extent mft records (this might still 1930 * happen due to the write_inode_now() call). 1931 * 1932 * Also, if @datasync is true, we do not wait on the inode to be written out 1933 * but we always wait on the page cache pages to be written out. 1934 * 1935 * Locking: Caller must hold i_mutex on the inode. 1936 * 1937 * TODO: We should probably also write all attribute/index inodes associated 1938 * with this inode but since we have no simple way of getting to them we ignore 1939 * this problem for now. 1940 */ 1941 static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, 1942 int datasync) 1943 { 1944 struct inode *vi = filp->f_mapping->host; 1945 int err, ret = 0; 1946 1947 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 1948 1949 err = file_write_and_wait_range(filp, start, end); 1950 if (err) 1951 return err; 1952 inode_lock(vi); 1953 1954 BUG_ON(S_ISDIR(vi->i_mode)); 1955 if (!datasync || !NInoNonResident(NTFS_I(vi))) 1956 ret = __ntfs_write_inode(vi, 1); 1957 write_inode_now(vi, !datasync); 1958 /* 1959 * NOTE: If we were to use mapping->private_list (see ext2 and 1960 * fs/buffer.c) for dirty blocks then we could optimize the below to be 1961 * sync_mapping_buffers(vi->i_mapping). 1962 */ 1963 err = sync_blockdev(vi->i_sb->s_bdev); 1964 if (unlikely(err && !ret)) 1965 ret = err; 1966 if (likely(!ret)) 1967 ntfs_debug("Done."); 1968 else 1969 ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " 1970 "%u.", datasync ? "data" : "", vi->i_ino, -ret); 1971 inode_unlock(vi); 1972 return ret; 1973 } 1974 1975 #endif /* NTFS_RW */ 1976 1977 const struct file_operations ntfs_file_ops = { 1978 .llseek = generic_file_llseek, 1979 .read_iter = generic_file_read_iter, 1980 #ifdef NTFS_RW 1981 .write_iter = ntfs_file_write_iter, 1982 .fsync = ntfs_file_fsync, 1983 #endif /* NTFS_RW */ 1984 .mmap = generic_file_mmap, 1985 .open = ntfs_file_open, 1986 .splice_read = filemap_splice_read, 1987 }; 1988 1989 const struct inode_operations ntfs_file_inode_ops = { 1990 #ifdef NTFS_RW 1991 .setattr = ntfs_setattr, 1992 #endif /* NTFS_RW */ 1993 }; 1994 1995 const struct file_operations ntfs_empty_file_ops = {}; 1996 1997 const struct inode_operations ntfs_empty_inode_ops = {}; 1998