1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Network filesystem high-level buffered write support. 3 * 4 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. 5 * Written by David Howells (dhowells@redhat.com) 6 */ 7 8 #include <linux/export.h> 9 #include <linux/fs.h> 10 #include <linux/mm.h> 11 #include <linux/pagemap.h> 12 #include <linux/slab.h> 13 #include "internal.h" 14 15 /* 16 * Grab a folio for writing and lock it. Attempt to allocate as large a folio 17 * as possible to hold as much of the remaining length as possible in one go. 18 */ 19 static struct folio *netfs_grab_folio_for_write(struct address_space *mapping, 20 loff_t pos, size_t part) 21 { 22 pgoff_t index = pos / PAGE_SIZE; 23 fgf_t fgp_flags = FGP_WRITEBEGIN; 24 25 if (mapping_large_folio_support(mapping)) 26 fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part); 27 28 return __filemap_get_folio(mapping, index, fgp_flags, 29 mapping_gfp_mask(mapping)); 30 } 31 32 /* 33 * Update i_size and estimate the update to i_blocks to reflect the additional 34 * data written into the pagecache until we can find out from the server what 35 * the values actually are. 
36 */ 37 void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode, 38 loff_t pos, size_t copied) 39 { 40 loff_t i_size, end = pos + copied; 41 blkcnt_t add; 42 size_t gap; 43 44 if (end <= i_size_read(inode)) 45 return; 46 47 if (ctx->ops->update_i_size) { 48 ctx->ops->update_i_size(inode, end); 49 return; 50 } 51 52 spin_lock(&inode->i_lock); 53 54 i_size = i_size_read(inode); 55 if (end > i_size) { 56 i_size_write(inode, end); 57 #if IS_ENABLED(CONFIG_FSCACHE) 58 fscache_update_cookie(ctx->cache, NULL, &end); 59 #endif 60 61 gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1)); 62 if (copied > gap) { 63 add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE); 64 65 inode->i_blocks = min_t(blkcnt_t, 66 DIV_ROUND_UP(end, SECTOR_SIZE), 67 inode->i_blocks + add); 68 } 69 } 70 spin_unlock(&inode->i_lock); 71 } 72 73 /** 74 * netfs_perform_write - Copy data into the pagecache. 75 * @iocb: The operation parameters 76 * @iter: The source buffer 77 * @netfs_group: Grouping for dirty folios (eg. ceph snaps). 78 * 79 * Copy data into pagecache folios attached to the inode specified by @iocb. 80 * The caller must hold appropriate inode locks. 81 * 82 * Dirty folios are tagged with a netfs_folio struct if they're not up to date 83 * to indicate the range modified. Dirty folios may also be tagged with a 84 * netfs-specific grouping such that data from an old group gets flushed before 85 * a new one is started. 
 *
 * Return: The number of bytes copied into the pagecache if that is non-zero,
 * otherwise the error recorded at the point the loop was abandoned (e.g.
 * -EFAULT, -EINTR/-ERESTARTSYS or an error passed up from a helper).  If a
 * writethrough request was started and netfs_end_writethrough() returns
 * -EIOCBQUEUED, that is returned instead and @iocb->ki_pos is not advanced
 * here.
 */
ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
			    struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct netfs_inode *ctx = netfs_inode(inode);
	/* NOTE(review): range_end here and the end passed to
	 * filemap_write_and_wait_range() below are both one byte past the
	 * write, whereas the flush_content path uses an inclusive end
	 * (fpos + flen - 1) - confirm whether that is intended.
	 */
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.for_sync	= true,
		.nr_to_write	= LONG_MAX,
		.range_start	= iocb->ki_pos,
		.range_end	= iocb->ki_pos + iter->count,
	};
	struct netfs_io_request *wreq = NULL;
	struct folio *folio = NULL, *writethrough = NULL;
	unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
	ssize_t written = 0, ret, ret2;
	loff_t pos = iocb->ki_pos;
	size_t max_chunk = mapping_max_folio_size(mapping);
	/* Set after a short copy into what should have been a whole-folio
	 * overwrite; it disables the whole-folio path for the rest of the call.
	 */
	bool maybe_trouble = false;

	/* For a sync-type write, flush anything already dirty in the target
	 * range and then set up a writethrough request to which each folio we
	 * modify is handed by netfs_advance_writethrough() in the loop below.
	 */
	if (unlikely(iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
	    ) {
		wbc_attach_fdatawrite_inode(&wbc, mapping->host);

		ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
		if (ret < 0) {
			wbc_detach_inode(&wbc);
			goto out;
		}

		wreq = netfs_begin_writethrough(iocb, iter->count);
		if (IS_ERR(wreq)) {
			wbc_detach_inode(&wbc);
			ret = PTR_ERR(wreq);
			/* Make sure the out: path doesn't try to end it. */
			wreq = NULL;
			goto out;
		}
		/* Only an asynchronous kiocb is recorded in the request. */
		if (!is_sync_kiocb(iocb))
			wreq->iocb = iocb;
		netfs_stat(&netfs_n_wh_writethrough);
	} else {
		netfs_stat(&netfs_n_wh_buffered_write);
	}

	/* Each pass through the loop deals with (at most) one folio. */
	do {
		enum netfs_folio_trace trace;
		struct netfs_folio *finfo;
		struct netfs_group *group;
		unsigned long long fpos;
		size_t flen;
		size_t offset;	/* Offset into pagecache folio */
		size_t part;	/* Bytes to write to folio */
		size_t copied;	/* Bytes copied from user */
		void *priv;

		/* First estimate, based on the largest folio the mapping may
		 * hold; both values are recalculated once we have the folio.
		 */
		offset = pos & (max_chunk - 1);
		part = min(max_chunk - offset, iov_iter_count(iter));

		/* Bring in the user pages that we will copy from _first_ lest
		 * we hit a nasty deadlock on copying from the same page as
		 * we're writing to, without it being marked uptodate.
		 *
		 * Not only is this an optimisation, but it is also required to
		 * check that the address is actually valid, when atomic
		 * usercopies are used below.
		 *
		 * We rely on the page being held onto long enough by the LRU
		 * that we can grab it below if this causes it to be read.
		 */
		ret = -EFAULT;
		if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
			break;

		folio = netfs_grab_folio_for_write(mapping, pos, part);
		if (IS_ERR(folio)) {
			ret = PTR_ERR(folio);
			break;
		}

		/* Recalculate relative to the folio we actually got. */
		flen = folio_size(folio);
		fpos = folio_pos(folio);
		offset = pos - fpos;
		part = min_t(size_t, flen - offset, part);

		/* Wait for writeback to complete.  The writeback engine owns
		 * the info in folio->private and may change it until it
		 * removes the WB mark.
		 */
		if (folio_get_private(folio) &&
		    folio_wait_writeback_killable(folio)) {
			ret = written ? -EINTR : -ERESTARTSYS;
			goto error_folio_unlock;
		}

		if (signal_pending(current)) {
			ret = written ? -EINTR : -ERESTARTSYS;
			goto error_folio_unlock;
		}

		finfo = netfs_folio_info(folio);
		group = netfs_folio_group(folio);

		/* If the requested group differs from the group set on the
		 * page, then we need to flush out the folio if it has a group
		 * set (ie. is non-NULL).  Note that COPY_TO_CACHE is a special
		 * case, being a netfs annotation rather than an actual group.
		 *
		 * The filesystem isn't permitted to mix writes with groups and
		 * writes without groups as the NULL group is used to indicate
		 * that no group is set.
		 */
		if (unlikely(group != netfs_group) &&
		    group != NETFS_FOLIO_COPY_TO_CACHE &&
		    group) {
			WARN_ON_ONCE(!netfs_group);
			goto flush_content;
		}

		/* Decide how we should modify a folio.  We might be attempting
		 * to do write-streaming, as we don't want to do a local RMW
		 * cycle if we can avoid it.  If we're doing local caching or
		 * content crypto, we award that priority over avoiding RMW.
		 * If the file is open readably, then we let ->read_folio()
		 * fill in the gaps.
		 */
		if (folio_test_uptodate(folio)) {
			if (mapping_writably_mapped(mapping))
				flush_dcache_folio(folio);
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			trace = netfs_folio_is_uptodate;
			goto copied_uptodate;
		}

		/* If the page is above the zero-point then we assume that the
		 * server would just return a block of zeros or a short read if
		 * we try to read it.  Zero the parts of the folio either side
		 * of what we copied so that the whole folio can be marked
		 * uptodate.
		 */
		if (fpos >= netfs_read_zero_point(inode)) {
			folio_zero_segment(folio, 0, offset);
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			folio_zero_segment(folio, offset + copied, flen);
			if (finfo)
				trace = netfs_modify_and_clear_rm_finfo;
			else
				trace = netfs_modify_and_clear;
			goto mark_uptodate;
		}

		/* See if we can write a whole folio in one go. */
		if (!maybe_trouble && offset == 0 && part >= flen) {
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (likely(copied == part)) {
				if (finfo)
					trace = netfs_whole_folio_modify_filled;
				else
					trace = netfs_whole_folio_modify;
				goto mark_uptodate;
			}
			if (copied == 0)
				goto copy_failed;
			/* Short copy that didn't reach any previously recorded
			 * dirty region: hand the bytes back to the iterator
			 * and go round again without the whole-folio path.
			 */
			if (!finfo || copied <= finfo->dirty_offset) {
				maybe_trouble = true;
				iov_iter_revert(iter, copied);
				copied = 0;
				folio_unlock(folio);
				goto retry;
			}

			/* We overwrote some existing dirty data, so we have to
			 * accept the partial write.  dirty_len is turned into
			 * the end offset of the old dirty region; the region
			 * then becomes [0, max(old end, copied)).
			 */
			finfo->dirty_len += finfo->dirty_offset;
			if (finfo->dirty_len == flen) {
				trace = netfs_whole_folio_modify_filled_efault;
				goto mark_uptodate;
			}
			if (copied > finfo->dirty_len)
				finfo->dirty_len = copied;
			finfo->dirty_offset = 0;
			trace = netfs_whole_folio_modify_efault;
			goto copied;
		}

		/* We don't want to do a streaming write on a file that loses
		 * caching service temporarily because the backing store got
		 * culled.
		 */
		if (netfs_is_cache_enabled(ctx)) {
			if (finfo) {
				netfs_stat(&netfs_n_wh_wstream_conflict);
				goto flush_content;
			}
			ret = netfs_prefetch_for_write(file, folio, offset, part);
			if (ret < 0) {
				_debug("prefetch = %zd", ret);
				goto error_folio_unlock;
			}
			/* Note that copy-to-cache may have been set. */

			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			trace = netfs_just_prefetch;
			goto copied_uptodate;
		}

		/* Do a streaming write on a folio that has nothing in it yet. */
		if (!finfo) {
			ret = -EIO;
			if (WARN_ON(folio_get_private(folio)))
				goto error_folio_unlock;
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			if (offset == 0 && copied == flen) {
				trace = netfs_streaming_filled_page;
				goto mark_uptodate;
			}

			/* Partially filled: record the dirty range in a
			 * netfs_folio attached to the folio.  If we can't
			 * allocate one, give the copied bytes back to the
			 * iterator so they aren't counted as written.
			 */
			finfo = kzalloc_obj(*finfo);
			if (!finfo) {
				iov_iter_revert(iter, copied);
				ret = -ENOMEM;
				goto error_folio_unlock;
			}
			finfo->netfs_group = netfs_get_group(netfs_group);
			finfo->dirty_offset = offset;
			finfo->dirty_len = copied;
			/* The pointer is tagged so that folio->private can be
			 * told apart from a plain group pointer.
			 */
			folio_attach_private(folio, (void *)((unsigned long)finfo |
							     NETFS_FOLIO_INFO));
			trace = netfs_streaming_write;
			goto copied;
		}

		/* We can continue a streaming write only if it continues on
		 * from the previous.  If it overlaps, we must flush lest we
		 * suffer a partial copy and disjoint dirty regions.
		 */
		if (offset == finfo->dirty_offset + finfo->dirty_len) {
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			finfo->dirty_len += copied;
			if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
				trace = netfs_streaming_cont_filled_page;
				goto mark_uptodate;
			}
			trace = netfs_streaming_write_cont;
			goto copied;
		}

		/* Incompatible write; flush the folio and try again.  The
		 * folio is unlocked and released before the flush, so a
		 * failure must go straight to out rather than through the
		 * folio-unlocking error path.
		 */
	flush_content:
		trace_netfs_folio(folio, netfs_flush_content);
		folio_unlock(folio);
		folio_put(folio);
		ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
		if (ret < 0)
			goto out;
		continue;

		/* Mark a folio as being up to date when we've filled it
		 * completely.  If the folio has a group attached, then it must
		 * be the same group, otherwise we should have flushed it out
		 * above.  We have to get rid of the netfs_folio struct if
		 * there was one.
		 */
	mark_uptodate:
		folio_mark_uptodate(folio);

		/* The folio is now uptodate, so folio->private should end up
		 * holding just the group (or nothing if there is no group).
		 */
	copied_uptodate:
		priv = folio_get_private(folio);
		if (likely(priv == netfs_group)) {
			/* Already set correctly; no change required. */
		} else if (priv == NETFS_FOLIO_COPY_TO_CACHE) {
			if (!netfs_group)
				folio_detach_private(folio);
			else
				folio_change_private(folio, netfs_get_group(netfs_group));
		} else if (!priv) {
			folio_attach_private(folio, netfs_get_group(netfs_group));
		} else {
			/* Anything else is expected to be a netfs_folio. */
			WARN_ON_ONCE(!finfo);
			if (netfs_group)
				/* finfo->netfs_group has a ref */
				folio_change_private(folio, netfs_group);
			else
				folio_detach_private(folio);
			kfree(finfo);
		}

		/* Common tail for every path that copied data into the folio. */
	copied:
		trace_netfs_folio(folio, trace);
		flush_dcache_folio(folio);

		/* Update the inode size if we moved the EOF marker */
		netfs_update_i_size(ctx, inode, pos, copied);
		pos += copied;
		written += copied;

		if (likely(!wreq)) {
			folio_mark_dirty(folio);
			folio_unlock(folio);
		} else {
			netfs_advance_writethrough(wreq, &wbc, folio, copied,
						   offset + copied == flen,
						   &writethrough);
			/* Folio unlocked */
		}
		/* Also the re-entry point after a reverted short copy, by
		 * which time the folio has already been unlocked.
		 */
	retry:
		folio_put(folio);
		folio = NULL;

		ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
		if (unlikely(ret < 0))
			break;

		cond_resched();
	} while (iov_iter_count(iter));

out:
	if (likely(written)) {
		/* Set indication that ctime and mtime got updated in case
		 * close is deferred.
		 */
		set_bit(NETFS_ICTX_MODIFIED_ATTR, &ctx->flags);
		if (unlikely(ctx->ops->post_modify))
			ctx->ops->post_modify(inode);
	}

	if (unlikely(wreq)) {
		ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
		wbc_detach_inode(&wbc);
		if (ret2 == -EIOCBQUEUED)
			return ret2;
		/* Don't let the writethrough result mask an earlier error. */
		if (ret == 0 && ret2 < 0)
			ret = ret2;
	}

	iocb->ki_pos += written;
	_leave(" = %zd [%zd]", written, ret);
	return written ? written : ret;

	/* Both error labels expect the folio to be locked and a ref held. */
copy_failed:
	ret = -EFAULT;
error_folio_unlock:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}
EXPORT_SYMBOL(netfs_perform_write);

/**
 * netfs_buffered_write_iter_locked - write data to a file
 * @iocb:	IO state structure (file, offset, etc.)
 * @from:	iov_iter with data to write
 * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * The caller must hold appropriate locks around this function and have called
 * generic_write_checks() already.  The caller is also responsible for doing
 * any necessary syncing afterwards.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
463 * 464 * Return: 465 * * number of bytes written, even for truncated writes 466 * * negative error code if no data has been written at all 467 */ 468 ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from, 469 struct netfs_group *netfs_group) 470 { 471 struct file *file = iocb->ki_filp; 472 ssize_t ret; 473 474 trace_netfs_write_iter(iocb, from); 475 476 ret = file_remove_privs(file); 477 if (ret) 478 return ret; 479 480 ret = file_update_time(file); 481 if (ret) 482 return ret; 483 484 return netfs_perform_write(iocb, from, netfs_group); 485 } 486 EXPORT_SYMBOL(netfs_buffered_write_iter_locked); 487 488 /** 489 * netfs_file_write_iter - write data to a file 490 * @iocb: IO state structure 491 * @from: iov_iter with data to write 492 * 493 * Perform a write to a file, writing into the pagecache if possible and doing 494 * an unbuffered write instead if not. 495 * 496 * Return: 497 * * Negative error code if no data has been written at all of 498 * vfs_fsync_range() failed for a synchronous write 499 * * Number of bytes written, even for truncated writes 500 */ 501 ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 502 { 503 struct file *file = iocb->ki_filp; 504 struct inode *inode = file->f_mapping->host; 505 struct netfs_inode *ictx = netfs_inode(inode); 506 ssize_t ret; 507 508 _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); 509 510 if (!iov_iter_count(from)) 511 return 0; 512 513 if ((iocb->ki_flags & IOCB_DIRECT) || 514 test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)) 515 return netfs_unbuffered_write_iter(iocb, from); 516 517 ret = netfs_start_io_write(inode); 518 if (ret < 0) 519 return ret; 520 521 ret = generic_write_checks(iocb, from); 522 if (ret > 0) 523 ret = netfs_buffered_write_iter_locked(iocb, from, NULL); 524 netfs_end_io_write(inode); 525 if (ret > 0) 526 ret = generic_write_sync(iocb, ret); 527 return ret; 528 } 529 EXPORT_SYMBOL(netfs_file_write_iter); 530 531 
/* 532 * Notification that a previously read-only page is about to become writable. 533 * The caller indicates the precise page that needs to be written to, but 534 * we only track group on a per-folio basis, so we block more often than 535 * we might otherwise. 536 */ 537 vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group) 538 { 539 struct netfs_group *group; 540 struct folio *folio = page_folio(vmf->page); 541 struct file *file = vmf->vma->vm_file; 542 struct address_space *mapping = file->f_mapping; 543 struct inode *inode = file_inode(file); 544 struct netfs_inode *ictx = netfs_inode(inode); 545 vm_fault_t ret = VM_FAULT_NOPAGE; 546 void *priv; 547 int err; 548 549 _enter("%lx", folio->index); 550 551 sb_start_pagefault(inode->i_sb); 552 553 if (folio_lock_killable(folio) < 0) 554 goto out; 555 if (folio->mapping != mapping) 556 goto unlock; 557 if (folio_wait_writeback_killable(folio) < 0) 558 goto unlock; 559 560 /* Can we see a streaming write here? 
*/ 561 if (WARN_ON(!folio_test_uptodate(folio))) { 562 ret = VM_FAULT_SIGBUS; 563 goto unlock; 564 } 565 566 group = netfs_folio_group(folio); 567 if (group && 568 group != netfs_group && 569 group != NETFS_FOLIO_COPY_TO_CACHE) { 570 folio_unlock(folio); 571 err = filemap_fdatawrite_range(mapping, 572 folio_pos(folio), 573 folio_next_pos(folio)); 574 switch (err) { 575 case 0: 576 ret = VM_FAULT_RETRY; 577 goto out; 578 case -ENOMEM: 579 ret = VM_FAULT_OOM; 580 goto out; 581 default: 582 ret = VM_FAULT_SIGBUS; 583 goto out; 584 } 585 } 586 587 if (folio_test_dirty(folio)) 588 trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus); 589 else 590 trace_netfs_folio(folio, netfs_folio_trace_mkwrite); 591 592 priv = folio_get_private(folio); 593 if (priv != netfs_group) { 594 if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) 595 folio_detach_private(folio); 596 else if (netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) 597 folio_change_private(folio, netfs_get_group(netfs_group)); 598 else if (netfs_group && !priv) 599 folio_attach_private(folio, netfs_get_group(netfs_group)); 600 else 601 WARN_ON_ONCE(1); 602 } 603 604 file_update_time(file); 605 set_bit(NETFS_ICTX_MODIFIED_ATTR, &ictx->flags); 606 if (ictx->ops->post_modify) 607 ictx->ops->post_modify(inode); 608 ret = VM_FAULT_LOCKED; 609 out: 610 sb_end_pagefault(inode->i_sb); 611 return ret; 612 unlock: 613 folio_unlock(folio); 614 goto out; 615 } 616 EXPORT_SYMBOL(netfs_page_mkwrite); 617