// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem high-level buffered write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/pagevec.h>
#include "internal.h"

static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
        if (netfs_group)
                folio_attach_private(folio, netfs_get_group(netfs_group));
}

static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
        void *priv = folio_get_private(folio);

        if (unlikely(priv != netfs_group)) {
                if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
                        folio_attach_private(folio, netfs_get_group(netfs_group));
                else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
                        folio_detach_private(folio);
        }
}

/*
 * Grab a folio for writing and lock it.  Attempt to allocate as large a folio
 * as possible to hold as much of the remaining length as possible in one go.
 */
static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
                                                loff_t pos, size_t part)
{
        pgoff_t index = pos / PAGE_SIZE;
        fgf_t fgp_flags = FGP_WRITEBEGIN;

        if (mapping_large_folio_support(mapping))
                fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);

        return __filemap_get_folio(mapping, index, fgp_flags,
                                   mapping_gfp_mask(mapping));
}

/*
 * Update i_size and estimate the update to i_blocks to reflect the additional
 * data written into the pagecache until we can find out from the server what
 * the values actually are.
 */
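/*
 * Illustrative example (not part of the code): with SECTOR_SIZE 512, an old
 * i_size of 1000 and a 3000-byte copy landing at the old EOF, gap is
 * 512 - (1000 & 511) = 24, so add is DIV_ROUND_UP(3000 - 24, 512) = 6 sectors,
 * and i_blocks is clamped so that it never exceeds DIV_ROUND_UP(end, 512).
 */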
void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
                         loff_t pos, size_t copied)
{
        loff_t i_size, end = pos + copied;
        blkcnt_t add;
        size_t gap;

        if (end <= i_size_read(inode))
                return;

        if (ctx->ops->update_i_size) {
                ctx->ops->update_i_size(inode, end);
                return;
        }

        spin_lock(&inode->i_lock);

        i_size = i_size_read(inode);
        if (end > i_size) {
                i_size_write(inode, end);
#if IS_ENABLED(CONFIG_FSCACHE)
                fscache_update_cookie(ctx->cache, NULL, &end);
#endif

                gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
                if (copied > gap) {
                        add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);

                        inode->i_blocks = min_t(blkcnt_t,
                                                DIV_ROUND_UP(end, SECTOR_SIZE),
                                                inode->i_blocks + add);
                }
        }
        spin_unlock(&inode->i_lock);
}

/**
 * netfs_perform_write - Copy data into the pagecache.
 * @iocb: The operation parameters
 * @iter: The source buffer
 * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
 *
 * Copy data into pagecache folios attached to the inode specified by @iocb.
 * The caller must hold appropriate inode locks.
 *
 * Dirty folios are tagged with a netfs_folio struct if they're not up to date
 * to indicate the range modified.  Dirty folios may also be tagged with a
 * netfs-specific grouping such that data from an old group gets flushed before
 * a new one is started.
 */
ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
                            struct netfs_group *netfs_group)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct address_space *mapping = inode->i_mapping;
        struct netfs_inode *ctx = netfs_inode(inode);
        struct writeback_control wbc = {
                .sync_mode      = WB_SYNC_NONE,
                .for_sync       = true,
                .nr_to_write    = LONG_MAX,
                .range_start    = iocb->ki_pos,
                .range_end      = iocb->ki_pos + iter->count,
        };
        struct netfs_io_request *wreq = NULL;
        struct folio *folio = NULL, *writethrough = NULL;
        unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
        ssize_t written = 0, ret, ret2;
        loff_t pos = iocb->ki_pos;
        size_t max_chunk = mapping_max_folio_size(mapping);
        bool maybe_trouble = false;

        if (unlikely(iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))) {
                wbc_attach_fdatawrite_inode(&wbc, mapping->host);

                ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
                if (ret < 0) {
                        wbc_detach_inode(&wbc);
                        goto out;
                }

                wreq = netfs_begin_writethrough(iocb, iter->count);
                if (IS_ERR(wreq)) {
                        wbc_detach_inode(&wbc);
                        ret = PTR_ERR(wreq);
                        wreq = NULL;
                        goto out;
                }
                if (!is_sync_kiocb(iocb))
                        wreq->iocb = iocb;
                netfs_stat(&netfs_n_wh_writethrough);
        } else {
                netfs_stat(&netfs_n_wh_buffered_write);
        }

        do {
                struct netfs_folio *finfo;
                struct netfs_group *group;
                unsigned long long fpos;
                size_t flen;
                size_t offset;  /* Offset into pagecache folio */
                size_t part;    /* Bytes to write to folio */
                size_t copied;  /* Bytes copied from user */

                offset = pos & (max_chunk - 1);
                part = min(max_chunk - offset, iov_iter_count(iter));

                /* Bring in the user pages that we will copy from _first_ lest
                 * we hit a nasty deadlock on copying from the same page as
                 * we're writing to, without it being marked uptodate.
                 *
                 * Not only is this an optimisation, but it is also required to
                 * check that the address is actually valid, when atomic
                 * usercopies are used below.
                 *
                 * We rely on the page being held onto long enough by the LRU
                 * that we can grab it below if this causes it to be read.
                 */
                ret = -EFAULT;
                if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
                        break;

                folio = netfs_grab_folio_for_write(mapping, pos, part);
                if (IS_ERR(folio)) {
                        ret = PTR_ERR(folio);
                        break;
                }

                flen = folio_size(folio);
                fpos = folio_pos(folio);
                offset = pos - fpos;
                part = min_t(size_t, flen - offset, part);

                /* Wait for writeback to complete.  The writeback engine owns
                 * the info in folio->private and may change it until it
                 * removes the WB mark.
                 */
                if (folio_get_private(folio) &&
                    folio_wait_writeback_killable(folio)) {
                        ret = written ? -EINTR : -ERESTARTSYS;
                        goto error_folio_unlock;
                }

                if (signal_pending(current)) {
                        ret = written ? -EINTR : -ERESTARTSYS;
                        goto error_folio_unlock;
                }

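                /* Roughly, the cases below are: a folio in a different dirty
                 * group is flushed first; an uptodate folio is copied into
                 * directly; a folio starting at or above the zero_point is
                 * copied into and zero-padded; a fully-covered folio is
                 * overwritten outright; otherwise the folio is either read in
                 * first (if cached or open for reading) or modified as a
                 * streaming write tracked by a netfs_folio record.
                 */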
                /* Decide how we should modify a folio.  We might be attempting
                 * to do write-streaming, in which case we don't want to do a
                 * local RMW cycle if we can avoid it.  If we're doing local
                 * caching or content crypto, we award that priority over
                 * avoiding RMW.  If the file is open readably, then we also
                 * assume that we may want to read what we wrote.
                 */
                finfo = netfs_folio_info(folio);
                group = netfs_folio_group(folio);

                if (unlikely(group != netfs_group) &&
                    group != NETFS_FOLIO_COPY_TO_CACHE)
                        goto flush_content;

                if (folio_test_uptodate(folio)) {
                        if (mapping_writably_mapped(mapping))
                                flush_dcache_folio(folio);
                        copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
                        if (unlikely(copied == 0))
                                goto copy_failed;
                        netfs_set_group(folio, netfs_group);
                        trace_netfs_folio(folio, netfs_folio_is_uptodate);
                        goto copied;
                }

                /* If the page is above the zero-point then we assume that the
                 * server would just return a block of zeros or a short read if
                 * we try to read it.
                 */
                if (fpos >= ctx->zero_point) {
                        folio_zero_segment(folio, 0, offset);
                        copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
                        if (unlikely(copied == 0))
                                goto copy_failed;
                        folio_zero_segment(folio, offset + copied, flen);
                        __netfs_set_group(folio, netfs_group);
                        folio_mark_uptodate(folio);
                        trace_netfs_folio(folio, netfs_modify_and_clear);
                        goto copied;
                }

                /* See if we can write a whole folio in one go. */
                if (!maybe_trouble && offset == 0 && part >= flen) {
                        copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
                        if (unlikely(copied == 0))
                                goto copy_failed;
                        if (unlikely(copied < part)) {
                                maybe_trouble = true;
                                iov_iter_revert(iter, copied);
                                copied = 0;
                                folio_unlock(folio);
                                goto retry;
                        }
                        __netfs_set_group(folio, netfs_group);
                        folio_mark_uptodate(folio);
                        trace_netfs_folio(folio, netfs_whole_folio_modify);
                        goto copied;
                }

                /* We don't want to do a streaming write on a file that loses
                 * caching service temporarily because the backing store got
                 * culled and we don't really want to do a streaming write on
                 * a file that's open for reading as ->read_folio() then has to
                 * be able to flush it.
                 */
                if ((file->f_mode & FMODE_READ) ||
                    netfs_is_cache_enabled(ctx)) {
                        if (finfo) {
                                netfs_stat(&netfs_n_wh_wstream_conflict);
                                goto flush_content;
                        }
                        ret = netfs_prefetch_for_write(file, folio, offset, part);
                        if (ret < 0) {
                                _debug("prefetch = %zd", ret);
                                goto error_folio_unlock;
                        }
                        /* Note that copy-to-cache may have been set. */

                        copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
                        if (unlikely(copied == 0))
                                goto copy_failed;
                        netfs_set_group(folio, netfs_group);
                        trace_netfs_folio(folio, netfs_just_prefetch);
                        goto copied;
                }

                if (!finfo) {
                        ret = -EIO;
                        if (WARN_ON(folio_get_private(folio)))
                                goto error_folio_unlock;
                        copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
                        if (unlikely(copied == 0))
                                goto copy_failed;
                        if (offset == 0 && copied == flen) {
                                __netfs_set_group(folio, netfs_group);
                                folio_mark_uptodate(folio);
                                trace_netfs_folio(folio, netfs_streaming_filled_page);
                                goto copied;
                        }

                        finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
                        if (!finfo) {
                                iov_iter_revert(iter, copied);
                                ret = -ENOMEM;
                                goto error_folio_unlock;
                        }
                        finfo->netfs_group = netfs_get_group(netfs_group);
                        finfo->dirty_offset = offset;
                        finfo->dirty_len = copied;
                        folio_attach_private(folio, (void *)((unsigned long)finfo |
                                                             NETFS_FOLIO_INFO));
                        trace_netfs_folio(folio, netfs_streaming_write);
                        goto copied;
                }

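                /* Illustrative example (not taken from the code): a write of
                 * 100 bytes at offset 200 into a folio that isn't uptodate is
                 * recorded as dirty_offset 200, dirty_len 100; a follow-on
                 * write starting at offset 300 simply extends dirty_len below,
                 * while any other placement forces a flush so that the dirty
                 * region stays contiguous.
                 */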
                /* We can continue a streaming write only if it continues on
                 * from the previous.  If it overlaps, we must flush lest we
                 * suffer a partial copy and disjoint dirty regions.
                 */
                if (offset == finfo->dirty_offset + finfo->dirty_len) {
                        copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
                        if (unlikely(copied == 0))
                                goto copy_failed;
                        finfo->dirty_len += copied;
                        if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
                                if (finfo->netfs_group)
                                        folio_change_private(folio, finfo->netfs_group);
                                else
                                        folio_detach_private(folio);
                                folio_mark_uptodate(folio);
                                kfree(finfo);
                                trace_netfs_folio(folio, netfs_streaming_cont_filled_page);
                        } else {
                                trace_netfs_folio(folio, netfs_streaming_write_cont);
                        }
                        goto copied;
                }

                /* Incompatible write; flush the folio and try again. */
        flush_content:
                trace_netfs_folio(folio, netfs_flush_content);
                folio_unlock(folio);
                folio_put(folio);
                ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
                if (ret < 0)
                        goto error_folio_unlock;
                continue;

        copied:
                flush_dcache_folio(folio);

                /* Update the inode size if we moved the EOF marker */
                netfs_update_i_size(ctx, inode, pos, copied);
                pos += copied;
                written += copied;

                if (likely(!wreq)) {
                        folio_mark_dirty(folio);
                        folio_unlock(folio);
                } else {
                        netfs_advance_writethrough(wreq, &wbc, folio, copied,
                                                   offset + copied == flen,
                                                   &writethrough);
                        /* Folio unlocked */
                }
        retry:
                folio_put(folio);
                folio = NULL;

                ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
                if (unlikely(ret < 0))
                        break;

                cond_resched();
        } while (iov_iter_count(iter));

out:
        if (likely(written)) {
                /* Set indication that ctime and mtime got updated in case
                 * close is deferred.
                 */
                set_bit(NETFS_ICTX_MODIFIED_ATTR, &ctx->flags);
                if (unlikely(ctx->ops->post_modify))
                        ctx->ops->post_modify(inode);
        }

        if (unlikely(wreq)) {
                ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
                wbc_detach_inode(&wbc);
                if (ret2 == -EIOCBQUEUED)
                        return ret2;
                if (ret == 0 && ret2 < 0)
                        ret = ret2;
        }

        iocb->ki_pos += written;
        _leave(" = %zd [%zd]", written, ret);
        return written ? written : ret;

copy_failed:
        ret = -EFAULT;
error_folio_unlock:
        folio_unlock(folio);
        folio_put(folio);
        goto out;
}
EXPORT_SYMBOL(netfs_perform_write);

/**
 * netfs_buffered_write_iter_locked - write data to a file
 * @iocb: IO state structure (file, offset, etc.)
 * @from: iov_iter with data to write
 * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
 *
 * This function does all the work needed for actually writing data to a
 * file.  It does all basic checks, removes SUID from the file, updates
 * modification times and then copies the data into the pagecache via
 * netfs_perform_write().
 *
 * The caller must hold appropriate locks around this function and have called
 * generic_write_checks() already.  The caller is also responsible for doing
 * any necessary syncing afterwards.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it.  This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
                                         struct netfs_group *netfs_group)
{
        struct file *file = iocb->ki_filp;
        ssize_t ret;

        trace_netfs_write_iter(iocb, from);

        ret = file_remove_privs(file);
        if (ret)
                return ret;

        ret = file_update_time(file);
        if (ret)
                return ret;

        return netfs_perform_write(iocb, from, netfs_group);
}
EXPORT_SYMBOL(netfs_buffered_write_iter_locked);

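/*
 * Sketch of a caller (hypothetical "myfs", not part of this file): a
 * filesystem that does its own inode locking might implement ->write_iter
 * along these lines, taking i_rwsem itself, calling the locked helper above
 * and handling O_SYNC afterwards:
 *
 *	static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		ssize_t ret;
 *
 *		inode_lock(inode);
 *		ret = generic_write_checks(iocb, from);
 *		if (ret > 0)
 *			ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
 *		inode_unlock(inode);
 *		if (ret > 0)
 *			ret = generic_write_sync(iocb, ret);
 *		return ret;
 *	}
 *
 * netfs_file_write_iter() below is the stock version of this for filesystems
 * that don't need their own locking or dirty-folio grouping.
 */
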
/**
 * netfs_file_write_iter - write data to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Perform a write to a file, writing into the pagecache if possible and doing
 * an unbuffered write instead if not.
 *
 * Return:
 * * Negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * Number of bytes written, even for truncated writes
 */
ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        struct netfs_inode *ictx = netfs_inode(inode);
        ssize_t ret;

        _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));

        if (!iov_iter_count(from))
                return 0;

        if ((iocb->ki_flags & IOCB_DIRECT) ||
            test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
                return netfs_unbuffered_write_iter(iocb, from);

        ret = netfs_start_io_write(inode);
        if (ret < 0)
                return ret;

        ret = generic_write_checks(iocb, from);
        if (ret > 0)
                ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
        netfs_end_io_write(inode);
        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
        return ret;
}
EXPORT_SYMBOL(netfs_file_write_iter);

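/*
 * Sketch of how a filesystem might wire this in (hypothetical "myfs"; the
 * myfs_* names are illustrative, not taken from this file):
 *
 *	const struct file_operations myfs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= netfs_file_read_iter,
 *		.write_iter	= netfs_file_write_iter,
 *		.mmap		= myfs_file_mmap,
 *		.fsync		= myfs_fsync,
 *	};
 */
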
/*
 * Notification that a previously read-only page is about to become writable.
 * The caller indicates the precise page that needs to be written to, but
 * we only track group on a per-folio basis, so we block more often than
 * we might otherwise.
 */
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
        struct netfs_group *group;
        struct folio *folio = page_folio(vmf->page);
        struct file *file = vmf->vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = file_inode(file);
        struct netfs_inode *ictx = netfs_inode(inode);
        vm_fault_t ret = VM_FAULT_NOPAGE;
        int err;

        _enter("%lx", folio->index);

        sb_start_pagefault(inode->i_sb);

        if (folio_lock_killable(folio) < 0)
                goto out;
        if (folio->mapping != mapping)
                goto unlock;
        if (folio_wait_writeback_killable(folio) < 0)
                goto unlock;

        /* Can we see a streaming write here? */
        if (WARN_ON(!folio_test_uptodate(folio))) {
                ret = VM_FAULT_SIGBUS;
                goto unlock;
        }

        group = netfs_folio_group(folio);
        if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
                folio_unlock(folio);
                err = filemap_fdatawrite_range(mapping,
                                               folio_pos(folio),
                                               folio_pos(folio) + folio_size(folio));
                switch (err) {
                case 0:
                        ret = VM_FAULT_RETRY;
                        goto out;
                case -ENOMEM:
                        ret = VM_FAULT_OOM;
                        goto out;
                default:
                        ret = VM_FAULT_SIGBUS;
                        goto out;
                }
        }

        if (folio_test_dirty(folio))
                trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
        else
                trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
        netfs_set_group(folio, netfs_group);
        file_update_time(file);
        set_bit(NETFS_ICTX_MODIFIED_ATTR, &ictx->flags);
        if (ictx->ops->post_modify)
                ictx->ops->post_modify(inode);
        ret = VM_FAULT_LOCKED;
out:
        sb_end_pagefault(inode->i_sb);
        return ret;
unlock:
        folio_unlock(folio);
        goto out;
}
EXPORT_SYMBOL(netfs_page_mkwrite);
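
/*
 * Sketch of a ->page_mkwrite wrapper (hypothetical "myfs", shown only to
 * illustrate how the helper above is typically reached from a write fault):
 *
 *	static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
 *	{
 *		return netfs_page_mkwrite(vmf, NULL);
 *	}
 *
 *	static const struct vm_operations_struct myfs_vm_ops = {
 *		.fault		= filemap_fault,
 *		.map_pages	= filemap_map_pages,
 *		.page_mkwrite	= myfs_page_mkwrite,
 *	};
 */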