1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Network filesystem high-level buffered write support. 3 * 4 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. 5 * Written by David Howells (dhowells@redhat.com) 6 */ 7 8 #include <linux/export.h> 9 #include <linux/fs.h> 10 #include <linux/mm.h> 11 #include <linux/pagemap.h> 12 #include <linux/slab.h> 13 #include "internal.h" 14 15 static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) 16 { 17 if (netfs_group) 18 folio_attach_private(folio, netfs_get_group(netfs_group)); 19 } 20 21 static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) 22 { 23 void *priv = folio_get_private(folio); 24 25 if (unlikely(priv != netfs_group)) { 26 if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE)) 27 folio_attach_private(folio, netfs_get_group(netfs_group)); 28 else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) 29 folio_detach_private(folio); 30 } 31 } 32 33 /* 34 * Grab a folio for writing and lock it. Attempt to allocate as large a folio 35 * as possible to hold as much of the remaining length as possible in one go. 36 */ 37 static struct folio *netfs_grab_folio_for_write(struct address_space *mapping, 38 loff_t pos, size_t part) 39 { 40 pgoff_t index = pos / PAGE_SIZE; 41 fgf_t fgp_flags = FGP_WRITEBEGIN; 42 43 if (mapping_large_folio_support(mapping)) 44 fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part); 45 46 return __filemap_get_folio(mapping, index, fgp_flags, 47 mapping_gfp_mask(mapping)); 48 } 49 50 /* 51 * Update i_size and estimate the update to i_blocks to reflect the additional 52 * data written into the pagecache until we can find out from the server what 53 * the values actually are. 
54 */ 55 void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode, 56 loff_t pos, size_t copied) 57 { 58 loff_t i_size, end = pos + copied; 59 blkcnt_t add; 60 size_t gap; 61 62 if (end <= i_size_read(inode)) 63 return; 64 65 if (ctx->ops->update_i_size) { 66 ctx->ops->update_i_size(inode, end); 67 return; 68 } 69 70 spin_lock(&inode->i_lock); 71 72 i_size = i_size_read(inode); 73 if (end > i_size) { 74 i_size_write(inode, end); 75 #if IS_ENABLED(CONFIG_FSCACHE) 76 fscache_update_cookie(ctx->cache, NULL, &end); 77 #endif 78 79 gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1)); 80 if (copied > gap) { 81 add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE); 82 83 inode->i_blocks = min_t(blkcnt_t, 84 DIV_ROUND_UP(end, SECTOR_SIZE), 85 inode->i_blocks + add); 86 } 87 } 88 spin_unlock(&inode->i_lock); 89 } 90 91 /** 92 * netfs_perform_write - Copy data into the pagecache. 93 * @iocb: The operation parameters 94 * @iter: The source buffer 95 * @netfs_group: Grouping for dirty folios (eg. ceph snaps). 96 * 97 * Copy data into pagecache folios attached to the inode specified by @iocb. 98 * The caller must hold appropriate inode locks. 99 * 100 * Dirty folios are tagged with a netfs_folio struct if they're not up to date 101 * to indicate the range modified. Dirty folios may also be tagged with a 102 * netfs-specific grouping such that data from an old group gets flushed before 103 * a new one is started. 
 *
 * Return: The number of bytes copied into the pagecache if any were copied;
 * otherwise the last status recorded (zero or a negative error code).
 * -EIOCBQUEUED is returned directly if netfs_end_writethrough() returns it.
 */
ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
			    struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct netfs_inode *ctx = netfs_inode(inode);
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.for_sync	= true,
		.nr_to_write	= LONG_MAX,
		.range_start	= iocb->ki_pos,
		.range_end	= iocb->ki_pos + iter->count,
	};
	struct netfs_io_request *wreq = NULL;
	struct folio *folio = NULL, *writethrough = NULL;
	unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
	ssize_t written = 0, ret, ret2;
	loff_t pos = iocb->ki_pos;
	size_t max_chunk = mapping_max_folio_size(mapping);
	/* Set once a whole-folio copy came up short; disables that shortcut. */
	bool maybe_trouble = false;

	/* For IOCB_DSYNC/IOCB_SYNC writes, first write back and wait on the
	 * target range and then set up a write-through request (wreq) that the
	 * folios modified below get fed into.  Every failure exit from this
	 * branch detaches the wbc again and leaves wreq NULL.
	 *
	 * NOTE(review): the flush at flush_content below passes an end of
	 * "fpos + flen - 1" to filemap_write_and_wait_range(), whereas this
	 * call passes "pos + iter->count" - confirm whether covering the extra
	 * byte here is intended.
	 */
	if (unlikely(iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
	    ) {
		wbc_attach_fdatawrite_inode(&wbc, mapping->host);

		ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
		if (ret < 0) {
			wbc_detach_inode(&wbc);
			goto out;
		}

		wreq = netfs_begin_writethrough(iocb, iter->count);
		if (IS_ERR(wreq)) {
			wbc_detach_inode(&wbc);
			ret = PTR_ERR(wreq);
			wreq = NULL;
			goto out;
		}
		/* Only a non-synchronous kiocb is recorded in the request. */
		if (!is_sync_kiocb(iocb))
			wreq->iocb = iocb;
		netfs_stat(&netfs_n_wh_writethrough);
	} else {
		netfs_stat(&netfs_n_wh_buffered_write);
	}

	/* Each pass through the loop handles at most one folio's worth. */
	do {
		struct netfs_folio *finfo;
		struct netfs_group *group;
		unsigned long long fpos;
		size_t flen;
		size_t offset;	/* Offset into pagecache folio */
		size_t part;	/* Bytes to write to folio */
		size_t copied;	/* Bytes copied from user */

		/* Provisional values based on the largest folio the mapping
		 * allows; they are recomputed once we know which folio we
		 * actually got.  The mask presumably relies on max_chunk being
		 * a power of two - TODO confirm.
		 */
		offset = pos & (max_chunk - 1);
		part = min(max_chunk - offset, iov_iter_count(iter));

		/* Bring in the user pages that we will copy from _first_ lest
		 * we hit a nasty deadlock on copying from the same page as
		 * we're writing to, without it being marked uptodate.
		 *
		 * Not only is this an optimisation, but it is also required to
		 * check that the address is actually valid, when atomic
		 * usercopies are used below.
		 *
		 * We rely on the page being held onto long enough by the LRU
		 * that we can grab it below if this causes it to be read.
		 */
		ret = -EFAULT;
		if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
			break;

		folio = netfs_grab_folio_for_write(mapping, pos, part);
		if (IS_ERR(folio)) {
			ret = PTR_ERR(folio);
			break;
		}

		/* Now work out the real offset and length within this folio. */
		flen = folio_size(folio);
		fpos = folio_pos(folio);
		offset = pos - fpos;
		part = min_t(size_t, flen - offset, part);

		/* Wait for writeback to complete.  The writeback engine owns
		 * the info in folio->private and may change it until it
		 * removes the WB mark.
		 */
		if (folio_get_private(folio) &&
		    folio_wait_writeback_killable(folio)) {
			ret = written ? -EINTR : -ERESTARTSYS;
			goto error_folio_unlock;
		}

		if (signal_pending(current)) {
			ret = written ? -EINTR : -ERESTARTSYS;
			goto error_folio_unlock;
		}

		/* Decide how we should modify a folio.  We might be attempting
		 * to do write-streaming, in which case we don't want to do a
		 * local RMW cycle if we can avoid it.  If we're doing local
		 * caching or content crypto, we award that priority over
		 * avoiding RMW.  If the file is open readably, then we also
		 * assume that we may want to read what we wrote.
		 */
		finfo = netfs_folio_info(folio);
		group = netfs_folio_group(folio);

		/* Data belonging to a different group has to be flushed before
		 * this folio can be modified for the new group.
		 */
		if (unlikely(group != netfs_group) &&
		    group != NETFS_FOLIO_COPY_TO_CACHE)
			goto flush_content;

		/* Case 1: the folio is already up to date; just overwrite. */
		if (folio_test_uptodate(folio)) {
			if (mapping_writably_mapped(mapping))
				flush_dcache_folio(folio);
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			netfs_set_group(folio, netfs_group);
			trace_netfs_folio(folio, netfs_folio_is_uptodate);
			goto copied;
		}

		/* If the page is above the zero-point then we assume that the
		 * server would just return a block of zeros or a short read if
		 * we try to read it.
		 */
		if (fpos >= ctx->zero_point) {
			/* Zero either side of the copied data so that the
			 * folio can be marked up to date without a read.
			 */
			folio_zero_segment(folio, 0, offset);
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			folio_zero_segment(folio, offset + copied, flen);
			__netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			trace_netfs_folio(folio, netfs_modify_and_clear);
			goto copied;
		}

		/* See if we can write a whole folio in one go. */
		if (!maybe_trouble && offset == 0 && part >= flen) {
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			if (unlikely(copied < part)) {
				/* Short copy: rewind the iterator, release the
				 * folio and go round again with this shortcut
				 * disabled for the rest of the call.
				 */
				maybe_trouble = true;
				iov_iter_revert(iter, copied);
				copied = 0;
				folio_unlock(folio);
				goto retry;
			}
			__netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			trace_netfs_folio(folio, netfs_whole_folio_modify);
			goto copied;
		}

		/* We don't want to do a streaming write on a file that loses
		 * caching service temporarily because the backing store got
		 * culled and we don't really want to get a streaming write on
		 * a file that's open for reading as ->read_folio() then has to
		 * be able to flush it.
		 */
		if ((file->f_mode & FMODE_READ) ||
		    netfs_is_cache_enabled(ctx)) {
			/* A streaming write is already recorded on this folio
			 * (finfo is set); flush it and retry.
			 */
			if (finfo) {
				netfs_stat(&netfs_n_wh_wstream_conflict);
				goto flush_content;
			}
			ret = netfs_prefetch_for_write(file, folio, offset, part);
			if (ret < 0) {
				_debug("prefetch = %zd", ret);
				goto error_folio_unlock;
			}
			/* Note that copy-to-cache may have been set. */

			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			netfs_set_group(folio, netfs_group);
			trace_netfs_folio(folio, netfs_just_prefetch);
			goto copied;
		}

		/* Start a new streaming write: the folio isn't up to date and
		 * has no dirty-region record yet.
		 */
		if (!finfo) {
			/* -EIO is what gets returned if the WARN_ON trips. */
			ret = -EIO;
			if (WARN_ON(folio_get_private(folio)))
				goto error_folio_unlock;
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			/* The copy covered the entire folio, so no dirty-region
			 * record is needed.
			 */
			if (offset == 0 && copied == flen) {
				__netfs_set_group(folio, netfs_group);
				folio_mark_uptodate(folio);
				trace_netfs_folio(folio, netfs_streaming_filled_page);
				goto copied;
			}

			finfo = kzalloc_obj(*finfo);
			if (!finfo) {
				/* Rewind the iterator so the bytes just copied
				 * aren't counted as consumed.
				 */
				iov_iter_revert(iter, copied);
				ret = -ENOMEM;
				goto error_folio_unlock;
			}
			finfo->netfs_group = netfs_get_group(netfs_group);
			finfo->dirty_offset = offset;
			finfo->dirty_len = copied;
			/* The record pointer is stored in folio->private with
			 * the NETFS_FOLIO_INFO bit OR'd in.
			 */
			folio_attach_private(folio, (void *)((unsigned long)finfo |
							     NETFS_FOLIO_INFO));
			trace_netfs_folio(folio, netfs_streaming_write);
			goto copied;
		}

		/* We can continue a streaming write only if it continues on
		 * from the previous.  If it overlaps, we must flush lest we
		 * suffer a partial copy and disjoint dirty regions.
		 */
		if (offset == finfo->dirty_offset + finfo->dirty_len) {
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			finfo->dirty_len += copied;
			/* Once the dirty region spans the whole folio, replace
			 * the record with the bare group pointer (or nothing),
			 * mark the folio up to date and free the record.
			 */
			if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
				if (finfo->netfs_group)
					folio_change_private(folio, finfo->netfs_group);
				else
					folio_detach_private(folio);
				folio_mark_uptodate(folio);
				kfree(finfo);
				trace_netfs_folio(folio, netfs_streaming_cont_filled_page);
			} else {
				trace_netfs_folio(folio, netfs_streaming_write_cont);
			}
			goto copied;
		}

		/* Incompatible write; flush the folio and try again. */
	flush_content:
		trace_netfs_folio(folio, netfs_flush_content);
		folio_unlock(folio);
		folio_put(folio);
		ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
		if (ret < 0)
			goto out;
		/* Nothing was consumed from the iterator, so this goes back
		 * round via the loop condition to have another go at the same
		 * position.
		 */
		continue;

	copied:
		flush_dcache_folio(folio);

		/* Update the inode size if we moved the EOF marker */
		netfs_update_i_size(ctx, inode, pos, copied);
		pos += copied;
		written += copied;

		if (likely(!wreq)) {
			folio_mark_dirty(folio);
			folio_unlock(folio);
		} else {
			/* The fifth argument is true if this copy ran up to
			 * the end of the folio.
			 */
			netfs_advance_writethrough(wreq, &wbc, folio, copied,
						   offset + copied == flen,
						   &writethrough);
			/* Folio unlocked */
		}
	retry:
		/* Drop our reference on the folio; it is already unlocked on
		 * every path that reaches here.
		 */
		folio_put(folio);
		folio = NULL;

		ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
		if (unlikely(ret < 0))
			break;

		cond_resched();
	} while (iov_iter_count(iter));

out:
	if (likely(written)) {
		/* Set indication that ctime and mtime got updated in case
		 * close is deferred.
		 */
		set_bit(NETFS_ICTX_MODIFIED_ATTR, &ctx->flags);
		if (unlikely(ctx->ops->post_modify))
			ctx->ops->post_modify(inode);
	}

	/* Wind up the write-through request, if there is one.  Its error is
	 * only adopted if nothing has gone wrong so far.
	 */
	if (unlikely(wreq)) {
		ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
		wbc_detach_inode(&wbc);
		/* Note that this returns without advancing iocb->ki_pos. */
		if (ret2 == -EIOCBQUEUED)
			return ret2;
		if (ret == 0 && ret2 < 0)
			ret = ret2;
	}

	iocb->ki_pos += written;
	_leave(" = %zd [%zd]", written, ret);
	/* A partial write is reported as success for the amount written. */
	return written ? written : ret;

	/* Error exits for when we hold the folio locked with a ref on it. */
copy_failed:
	ret = -EFAULT;
error_folio_unlock:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}
EXPORT_SYMBOL(netfs_perform_write);

/**
 * netfs_buffered_write_iter_locked - write data to a file
 * @iocb: IO state structure (file, offset, etc.)
 * @from: iov_iter with data to write
 * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * The caller must hold appropriate locks around this function and have called
 * generic_write_checks() already.  The caller is also responsible for doing
 * any necessary syncing afterwards.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
430 * 431 * Return: 432 * * number of bytes written, even for truncated writes 433 * * negative error code if no data has been written at all 434 */ 435 ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from, 436 struct netfs_group *netfs_group) 437 { 438 struct file *file = iocb->ki_filp; 439 ssize_t ret; 440 441 trace_netfs_write_iter(iocb, from); 442 443 ret = file_remove_privs(file); 444 if (ret) 445 return ret; 446 447 ret = file_update_time(file); 448 if (ret) 449 return ret; 450 451 return netfs_perform_write(iocb, from, netfs_group); 452 } 453 EXPORT_SYMBOL(netfs_buffered_write_iter_locked); 454 455 /** 456 * netfs_file_write_iter - write data to a file 457 * @iocb: IO state structure 458 * @from: iov_iter with data to write 459 * 460 * Perform a write to a file, writing into the pagecache if possible and doing 461 * an unbuffered write instead if not. 462 * 463 * Return: 464 * * Negative error code if no data has been written at all of 465 * vfs_fsync_range() failed for a synchronous write 466 * * Number of bytes written, even for truncated writes 467 */ 468 ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 469 { 470 struct file *file = iocb->ki_filp; 471 struct inode *inode = file->f_mapping->host; 472 struct netfs_inode *ictx = netfs_inode(inode); 473 ssize_t ret; 474 475 _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); 476 477 if (!iov_iter_count(from)) 478 return 0; 479 480 if ((iocb->ki_flags & IOCB_DIRECT) || 481 test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)) 482 return netfs_unbuffered_write_iter(iocb, from); 483 484 ret = netfs_start_io_write(inode); 485 if (ret < 0) 486 return ret; 487 488 ret = generic_write_checks(iocb, from); 489 if (ret > 0) 490 ret = netfs_buffered_write_iter_locked(iocb, from, NULL); 491 netfs_end_io_write(inode); 492 if (ret > 0) 493 ret = generic_write_sync(iocb, ret); 494 return ret; 495 } 496 EXPORT_SYMBOL(netfs_file_write_iter); 497 498 
/* 499 * Notification that a previously read-only page is about to become writable. 500 * The caller indicates the precise page that needs to be written to, but 501 * we only track group on a per-folio basis, so we block more often than 502 * we might otherwise. 503 */ 504 vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group) 505 { 506 struct netfs_group *group; 507 struct folio *folio = page_folio(vmf->page); 508 struct file *file = vmf->vma->vm_file; 509 struct address_space *mapping = file->f_mapping; 510 struct inode *inode = file_inode(file); 511 struct netfs_inode *ictx = netfs_inode(inode); 512 vm_fault_t ret = VM_FAULT_NOPAGE; 513 int err; 514 515 _enter("%lx", folio->index); 516 517 sb_start_pagefault(inode->i_sb); 518 519 if (folio_lock_killable(folio) < 0) 520 goto out; 521 if (folio->mapping != mapping) 522 goto unlock; 523 if (folio_wait_writeback_killable(folio) < 0) 524 goto unlock; 525 526 /* Can we see a streaming write here? */ 527 if (WARN_ON(!folio_test_uptodate(folio))) { 528 ret = VM_FAULT_SIGBUS; 529 goto unlock; 530 } 531 532 group = netfs_folio_group(folio); 533 if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) { 534 folio_unlock(folio); 535 err = filemap_fdatawrite_range(mapping, 536 folio_pos(folio), 537 folio_next_pos(folio)); 538 switch (err) { 539 case 0: 540 ret = VM_FAULT_RETRY; 541 goto out; 542 case -ENOMEM: 543 ret = VM_FAULT_OOM; 544 goto out; 545 default: 546 ret = VM_FAULT_SIGBUS; 547 goto out; 548 } 549 } 550 551 if (folio_test_dirty(folio)) 552 trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus); 553 else 554 trace_netfs_folio(folio, netfs_folio_trace_mkwrite); 555 netfs_set_group(folio, netfs_group); 556 file_update_time(file); 557 set_bit(NETFS_ICTX_MODIFIED_ATTR, &ictx->flags); 558 if (ictx->ops->post_modify) 559 ictx->ops->post_modify(inode); 560 ret = VM_FAULT_LOCKED; 561 out: 562 sb_end_pagefault(inode->i_sb); 563 return ret; 564 unlock: 565 folio_unlock(folio); 566 goto 
out; 567 } 568 EXPORT_SYMBOL(netfs_page_mkwrite); 569