// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem high-level write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/pagevec.h>
#include "internal.h"

/*
 * Determined write method.  Adjust netfs_folio_traces if this is changed.
 */
enum netfs_how_to_modify {
	NETFS_FOLIO_IS_UPTODATE,	/* Folio is uptodate already */
	NETFS_JUST_PREFETCH,		/* We have to read the folio anyway */
	NETFS_WHOLE_FOLIO_MODIFY,	/* We're going to overwrite the whole folio */
	NETFS_MODIFY_AND_CLEAR,		/* We can assume there is no data to be downloaded. */
	NETFS_STREAMING_WRITE,		/* Store incomplete data in non-uptodate page. */
	NETFS_STREAMING_WRITE_CONT,	/* Continue streaming write. */
	NETFS_FLUSH_CONTENT,		/* Flush incompatible content. */
};

static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);

static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
	void *priv = folio_get_private(folio);

	if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
		folio_attach_private(folio, netfs_get_group(netfs_group));
	else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
		folio_detach_private(folio);
}

/*
 * Decide how we should modify a folio.  We might be attempting to do
 * write-streaming, in which case we don't want to do a local RMW cycle if we
 * can avoid it.  If we're doing local caching or content crypto, we award that
 * priority over avoiding RMW.  If the file is open readably, then we also
 * assume that we may want to read what we wrote.
 */
static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
						    struct file *file,
						    struct folio *folio,
						    void *netfs_group,
						    size_t flen,
						    size_t offset,
						    size_t len,
						    bool maybe_trouble)
{
	struct netfs_folio *finfo = netfs_folio_info(folio);
	struct netfs_group *group = netfs_folio_group(folio);
	loff_t pos = folio_file_pos(folio);

	_enter("");

	if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE)
		return NETFS_FLUSH_CONTENT;

	if (folio_test_uptodate(folio))
		return NETFS_FOLIO_IS_UPTODATE;

	if (pos >= ctx->zero_point)
		return NETFS_MODIFY_AND_CLEAR;

	if (!maybe_trouble && offset == 0 && len >= flen)
		return NETFS_WHOLE_FOLIO_MODIFY;

	if (file->f_mode & FMODE_READ)
		goto no_write_streaming;

	if (netfs_is_cache_enabled(ctx)) {
		/* We don't want to get a streaming write on a file that loses
		 * caching service temporarily because the backing store got
		 * culled.
		 */
		goto no_write_streaming;
	}

	if (!finfo)
		return NETFS_STREAMING_WRITE;

	/* We can continue a streaming write only if it continues on from the
	 * previous.  If it overlaps, we must flush lest we suffer a partial
	 * copy and disjoint dirty regions.
	 */
	if (offset == finfo->dirty_offset + finfo->dirty_len)
		return NETFS_STREAMING_WRITE_CONT;
	return NETFS_FLUSH_CONTENT;

no_write_streaming:
	if (finfo) {
		netfs_stat(&netfs_n_wh_wstream_conflict);
		return NETFS_FLUSH_CONTENT;
	}
	return NETFS_JUST_PREFETCH;
}

/*
 * Grab a folio for writing and lock it.  Attempt to allocate as large a folio
 * as possible to hold as much of the remaining length as possible in one go.
 */
static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
						loff_t pos, size_t part)
{
	pgoff_t index = pos / PAGE_SIZE;
	fgf_t fgp_flags = FGP_WRITEBEGIN;

	if (mapping_large_folio_support(mapping))
		fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);

	return __filemap_get_folio(mapping, index, fgp_flags,
				   mapping_gfp_mask(mapping));
}

/*
 * Update i_size and estimate the update to i_blocks to reflect the additional
 * data written into the pagecache until we can find out from the server what
 * the values actually are.
 */
static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
				loff_t i_size, loff_t pos, size_t copied)
{
	blkcnt_t add;
	size_t gap;

	if (ctx->ops->update_i_size) {
		ctx->ops->update_i_size(inode, pos);
		return;
	}

	i_size_write(inode, pos);
#if IS_ENABLED(CONFIG_FSCACHE)
	fscache_update_cookie(ctx->cache, NULL, &pos);
#endif

	gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
	if (copied > gap) {
		add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);

		inode->i_blocks = min_t(blkcnt_t,
					DIV_ROUND_UP(pos, SECTOR_SIZE),
					inode->i_blocks + add);
	}
}

/**
 * netfs_perform_write - Copy data into the pagecache.
 * @iocb: The operation parameters
 * @iter: The source buffer
 * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
 *
 * Copy data into pagecache pages attached to the inode specified by @iocb.
 * The caller must hold appropriate inode locks.
 *
 * Dirty pages are tagged with a netfs_folio struct if they're not up to date
 * to indicate the range modified.  Dirty pages may also be tagged with a
 * netfs-specific grouping such that data from an old group gets flushed before
 * a new one is started.
 */
ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
			    struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct netfs_inode *ctx = netfs_inode(inode);
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.for_sync	= true,
		.nr_to_write	= LONG_MAX,
		.range_start	= iocb->ki_pos,
		.range_end	= iocb->ki_pos + iter->count,
	};
	struct netfs_io_request *wreq = NULL;
	struct netfs_folio *finfo;
	struct folio *folio;
	enum netfs_how_to_modify howto;
	enum netfs_folio_trace trace;
	unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ?
		0 : BDP_ASYNC;
	ssize_t written = 0, ret, ret2;
	loff_t i_size, pos = iocb->ki_pos, from, to;
	size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
	bool maybe_trouble = false;

	if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
		     iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
	    ) {
		wbc_attach_fdatawrite_inode(&wbc, mapping->host);

		ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
		if (ret < 0) {
			wbc_detach_inode(&wbc);
			goto out;
		}

		wreq = netfs_begin_writethrough(iocb, iter->count);
		if (IS_ERR(wreq)) {
			wbc_detach_inode(&wbc);
			ret = PTR_ERR(wreq);
			wreq = NULL;
			goto out;
		}
		if (!is_sync_kiocb(iocb))
			wreq->iocb = iocb;
		wreq->cleanup = netfs_cleanup_buffered_write;
		netfs_stat(&netfs_n_wh_writethrough);
	} else {
		netfs_stat(&netfs_n_wh_buffered_write);
	}

	do {
		size_t flen;
		size_t offset;	/* Offset into pagecache folio */
		size_t part;	/* Bytes to write to folio */
		size_t copied;	/* Bytes copied from user */

		ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
		if (unlikely(ret < 0))
			break;

		offset = pos & (max_chunk - 1);
		part = min(max_chunk - offset, iov_iter_count(iter));

		/* Bring in the user pages that we will copy from _first_ lest
		 * we hit a nasty deadlock on copying from the same page as
		 * we're writing to, without it being marked uptodate.
		 *
		 * Not only is this an optimisation, but it is also required to
		 * check that the address is actually valid, when atomic
		 * usercopies are used below.
		 *
		 * We rely on the page being held onto long enough by the LRU
		 * that we can grab it below if this causes it to be read.
		 */
		ret = -EFAULT;
		if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
			break;

		folio = netfs_grab_folio_for_write(mapping, pos, part);
		if (IS_ERR(folio)) {
			ret = PTR_ERR(folio);
			break;
		}

		flen = folio_size(folio);
		offset = pos & (flen - 1);
		part = min_t(size_t, flen - offset, part);

		if (signal_pending(current)) {
			ret = written ? -EINTR : -ERESTARTSYS;
			goto error_folio_unlock;
		}

		/* See if we need to prefetch the area we're going to modify.
		 * We need to do this before we get a lock on the folio in case
		 * there's more than one writer competing for the same cache
		 * block.
		 */
		howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
					    flen, offset, part, maybe_trouble);
		_debug("howto %u", howto);
		switch (howto) {
		case NETFS_JUST_PREFETCH:
			ret = netfs_prefetch_for_write(file, folio, offset, part);
			if (ret < 0) {
				_debug("prefetch = %zd", ret);
				goto error_folio_unlock;
			}
			break;
		case NETFS_FOLIO_IS_UPTODATE:
		case NETFS_WHOLE_FOLIO_MODIFY:
		case NETFS_STREAMING_WRITE_CONT:
			break;
		case NETFS_MODIFY_AND_CLEAR:
			zero_user_segment(&folio->page, 0, offset);
			break;
		case NETFS_STREAMING_WRITE:
			ret = -EIO;
			if (WARN_ON(folio_get_private(folio)))
				goto error_folio_unlock;
			break;
		case NETFS_FLUSH_CONTENT:
			trace_netfs_folio(folio, netfs_flush_content);
			from = folio_pos(folio);
			to = from + folio_size(folio) - 1;
			folio_unlock(folio);
			folio_put(folio);
			ret = filemap_write_and_wait_range(mapping, from, to);
			if (ret < 0)
				goto out; /* Folio already unlocked and put. */
			continue;
		}

		if (mapping_writably_mapped(mapping))
			flush_dcache_folio(folio);

		copied = copy_folio_from_iter_atomic(folio, offset, part, iter);

		flush_dcache_folio(folio);

		/* Deal with a (partially) failed copy */
		if (copied == 0) {
			ret = -EFAULT;
			goto error_folio_unlock;
		}

		trace = (enum netfs_folio_trace)howto;
		switch (howto) {
		case NETFS_FOLIO_IS_UPTODATE:
		case NETFS_JUST_PREFETCH:
			netfs_set_group(folio, netfs_group);
			break;
		case NETFS_MODIFY_AND_CLEAR:
			zero_user_segment(&folio->page, offset + copied, flen);
			netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			break;
		case NETFS_WHOLE_FOLIO_MODIFY:
			if (unlikely(copied < part)) {
				maybe_trouble = true;
				iov_iter_revert(iter, copied);
				copied = 0;
				goto retry;
			}
			netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			break;
		case NETFS_STREAMING_WRITE:
			if (offset == 0 && copied == flen) {
				netfs_set_group(folio, netfs_group);
				folio_mark_uptodate(folio);
				trace = netfs_streaming_filled_page;
				break;
			}
			finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
			if (!finfo) {
				iov_iter_revert(iter, copied);
				ret = -ENOMEM;
				goto error_folio_unlock;
			}
			finfo->netfs_group = netfs_get_group(netfs_group);
			finfo->dirty_offset = offset;
			finfo->dirty_len = copied;
			folio_attach_private(folio, (void *)((unsigned long)finfo |
							     NETFS_FOLIO_INFO));
			break;
		case NETFS_STREAMING_WRITE_CONT:
			finfo = netfs_folio_info(folio);
			finfo->dirty_len += copied;
			if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
				if (finfo->netfs_group)
					folio_change_private(folio, finfo->netfs_group);
				else
					folio_detach_private(folio);
				folio_mark_uptodate(folio);
				kfree(finfo);
				trace = netfs_streaming_cont_filled_page;
			}
			break;
		default:
			WARN(true, "Unexpected modify type %u ix=%lx\n",
			     howto, folio->index);
			ret = -EIO;
			goto error_folio_unlock;
		}

		trace_netfs_folio(folio, trace);

		/* Update the inode size if we moved the EOF marker */
		pos += copied;
		i_size = i_size_read(inode);
		if (pos > i_size)
			netfs_update_i_size(ctx, inode, i_size, pos, copied);
		written += copied;

		if (likely(!wreq)) {
			folio_mark_dirty(folio);
		} else {
			if (folio_test_dirty(folio))
				/* Sigh.  mmap. */
				folio_clear_dirty_for_io(folio);
			/* We make multiple writes to the folio... */
			if (!folio_test_writeback(folio)) {
				folio_start_writeback(folio);
				if (wreq->iter.count == 0)
					trace_netfs_folio(folio, netfs_folio_trace_wthru);
				else
					trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
			}
			netfs_advance_writethrough(wreq, copied,
						   offset + copied == flen);
		}
	retry:
		folio_unlock(folio);
		folio_put(folio);
		folio = NULL;

		cond_resched();
	} while (iov_iter_count(iter));

out:
	if (unlikely(wreq)) {
		ret2 = netfs_end_writethrough(wreq, iocb);
		wbc_detach_inode(&wbc);
		if (ret2 == -EIOCBQUEUED)
			return ret2;
		if (ret == 0)
			ret = ret2;
	}

	iocb->ki_pos += written;
	_leave(" = %zd [%zd]", written, ret);
	return written ? written : ret;

error_folio_unlock:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}
EXPORT_SYMBOL(netfs_perform_write);

/**
 * netfs_buffered_write_iter_locked - write data to a file
 * @iocb: IO state structure (file, offset, etc.)
 * @from: iov_iter with data to write
 * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * The caller must hold appropriate locks around this function and have called
 * generic_write_checks() already.  The caller is also responsible for doing
 * any necessary syncing afterwards.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
					 struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	ssize_t ret;

	trace_netfs_write_iter(iocb, from);

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	ret = file_update_time(file);
	if (ret)
		return ret;

	return netfs_perform_write(iocb, from, netfs_group);
}
EXPORT_SYMBOL(netfs_buffered_write_iter_locked);

/**
 * netfs_file_write_iter - write data to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Perform a write to a file, writing into the pagecache if possible and doing
 * an unbuffered write instead if not.
 *
 * Return:
 * * Negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * Number of bytes written, even for truncated writes
 */
ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;

	_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));

	if (!iov_iter_count(from))
		return 0;

	if ((iocb->ki_flags & IOCB_DIRECT) ||
	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
		return netfs_unbuffered_write_iter(iocb, from);

	ret = netfs_start_io_write(inode);
	if (ret < 0)
		return ret;

	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
	netfs_end_io_write(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(netfs_file_write_iter);

/*
 * Notification that a previously read-only page is about to become writable.
 * Note that the caller indicates a single page of a multipage folio.
 */
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
	struct netfs_group *group;
	struct folio *folio = page_folio(vmf->page);
	struct file *file = vmf->vma->vm_file;
	struct inode *inode = file_inode(file);
	vm_fault_t ret = VM_FAULT_RETRY;
	int err;

	_enter("%lx", folio->index);

	sb_start_pagefault(inode->i_sb);

	if (folio_wait_writeback_killable(folio))
		goto out;

	if (folio_lock_killable(folio) < 0)
		goto out;

	/* Can we see a streaming write here? */
	if (WARN_ON(!folio_test_uptodate(folio))) {
		ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
		goto out;
	}

	group = netfs_folio_group(folio);
	if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
		folio_unlock(folio);
		err = filemap_fdatawait_range(inode->i_mapping,
					      folio_pos(folio),
					      folio_pos(folio) + folio_size(folio));
		switch (err) {
		case 0:
			ret = VM_FAULT_RETRY;
			goto out;
		case -ENOMEM:
			ret = VM_FAULT_OOM;
			goto out;
		default:
			ret = VM_FAULT_SIGBUS;
			goto out;
		}
	}

	if (folio_test_dirty(folio))
		trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
	else
		trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
	netfs_set_group(folio, netfs_group);
	file_update_time(file);
	ret = VM_FAULT_LOCKED;
out:
	sb_end_pagefault(inode->i_sb);
	return ret;
}
EXPORT_SYMBOL(netfs_page_mkwrite);

/*
 * Kill all the pages in the given range
 */
static void netfs_kill_pages(struct address_space *mapping,
			     loff_t start, loff_t len)
{
	struct folio *folio;
	pgoff_t index = start / PAGE_SIZE;
	pgoff_t last = (start + len - 1) / PAGE_SIZE, next;

	_enter("%llx-%llx", start, start + len - 1);

	do {
		_debug("kill %lx (to %lx)", index, last);

		folio = filemap_get_folio(mapping, index);
		if (IS_ERR(folio)) {
			next = index + 1;
			continue;
		}

		next = folio_next_index(folio);

		trace_netfs_folio(folio, netfs_folio_trace_kill);
		folio_clear_uptodate(folio);
		folio_end_writeback(folio);
		folio_lock(folio);
		generic_error_remove_folio(mapping, folio);
		folio_unlock(folio);
		folio_put(folio);

	} while (index = next, index <= last);

	_leave("");
}

/*
 * Redirty all the pages in a given range.
 */
static void netfs_redirty_pages(struct address_space *mapping,
				loff_t start, loff_t len)
{
	struct folio *folio;
	pgoff_t index = start / PAGE_SIZE;
	pgoff_t last = (start + len - 1) / PAGE_SIZE, next;

	_enter("%llx-%llx", start, start + len - 1);

	do {
		_debug("redirty %llx @%llx", len, start);

		folio = filemap_get_folio(mapping, index);
		if (IS_ERR(folio)) {
			next = index + 1;
			continue;
		}

		next = folio_next_index(folio);
		trace_netfs_folio(folio, netfs_folio_trace_redirty);
		filemap_dirty_folio(mapping, folio);
		folio_end_writeback(folio);
		folio_put(folio);
	} while (index = next, index <= last);

	balance_dirty_pages_ratelimited(mapping);

	_leave("");
}

/*
 * Completion of write to server
 */
static void netfs_pages_written_back(struct netfs_io_request *wreq)
{
	struct address_space *mapping = wreq->mapping;
	struct netfs_folio *finfo;
	struct netfs_group *group = NULL;
	struct folio *folio;
	pgoff_t last;
	int gcount = 0;

	XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);

	_enter("%llx-%llx", wreq->start, wreq->start + wreq->len);

	rcu_read_lock();

	last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
	xas_for_each(&xas, folio, last) {
		WARN(!folio_test_writeback(folio),
		     "bad %llx @%llx page %lx %lx\n",
		     wreq->len, wreq->start, folio->index, last);

		if ((finfo = netfs_folio_info(folio))) {
			/* Streaming writes cannot be redirtied whilst under
			 * writeback, so discard the streaming record.
			 */
			folio_detach_private(folio);
			group = finfo->netfs_group;
			gcount++;
			trace_netfs_folio(folio, netfs_folio_trace_clear_s);
			kfree(finfo);
		} else if ((group = netfs_folio_group(folio))) {
			/* Need to detach the group pointer if the page didn't
			 * get redirtied.  If it has been redirtied, then it
			 * must be within the same group.
			 */
			if (folio_test_dirty(folio)) {
				trace_netfs_folio(folio, netfs_folio_trace_redirtied);
				goto end_wb;
			}
			if (folio_trylock(folio)) {
				if (!folio_test_dirty(folio)) {
					folio_detach_private(folio);
					gcount++;
					if (group == NETFS_FOLIO_COPY_TO_CACHE)
						trace_netfs_folio(folio,
								  netfs_folio_trace_end_copy);
					else
						trace_netfs_folio(folio, netfs_folio_trace_clear_g);
				} else {
					trace_netfs_folio(folio, netfs_folio_trace_redirtied);
				}
				folio_unlock(folio);
				goto end_wb;
			}

			xas_pause(&xas);
			rcu_read_unlock();
			folio_lock(folio);
			if (!folio_test_dirty(folio)) {
				folio_detach_private(folio);
				gcount++;
				trace_netfs_folio(folio, netfs_folio_trace_clear_g);
			} else {
				trace_netfs_folio(folio, netfs_folio_trace_redirtied);
			}
			folio_unlock(folio);
			rcu_read_lock();
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_clear);
		}
	end_wb:
		xas_advance(&xas, folio_next_index(folio) - 1);
		folio_end_writeback(folio);
	}

	rcu_read_unlock();
	netfs_put_group_many(group, gcount);
	_leave("");
}

/*
 * Deal with the disposition of the folios that are under writeback to close
 * out the operation.
 */
static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
{
	struct address_space *mapping = wreq->mapping;

	_enter("");

	switch (wreq->error) {
	case 0:
		netfs_pages_written_back(wreq);
		break;

	default:
		pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
		fallthrough;
	case -EACCES:
	case -EPERM:
	case -ENOKEY:
	case -EKEYEXPIRED:
	case -EKEYREJECTED:
	case -EKEYREVOKED:
	case -ENETRESET:
	case -EDQUOT:
	case -ENOSPC:
		netfs_redirty_pages(mapping, wreq->start, wreq->len);
		break;

	case -EROFS:
	case -EIO:
	case -EREMOTEIO:
	case -EFBIG:
	case -ENOENT:
	case -ENOMEDIUM:
	case -ENXIO:
		netfs_kill_pages(mapping, wreq->start, wreq->len);
		break;
	}

	if (wreq->error)
		mapping_set_error(mapping, wreq->error);
	if (wreq->netfs_ops->done)
		wreq->netfs_ops->done(wreq);
}

/*
 * Extend the region to be written back to include subsequent contiguously
 * dirty pages if possible, but don't sleep while doing so.
 *
 * If this page holds new content, then we can include filler zeros in the
 * writeback.
 */
static void netfs_extend_writeback(struct address_space *mapping,
				   struct netfs_group *group,
				   struct xa_state *xas,
				   long *_count,
				   loff_t start,
				   loff_t max_len,
				   size_t *_len,
				   size_t *_top)
{
	struct netfs_folio *finfo;
	struct folio_batch fbatch;
	struct folio *folio;
	unsigned int i;
	pgoff_t index = (start + *_len) / PAGE_SIZE;
	size_t len;
	void *priv;
	bool stop = true;

	folio_batch_init(&fbatch);

	do {
		/* Firstly, we gather up a batch of contiguous dirty pages
		 * under the RCU read lock - but we can't clear the dirty flags
		 * there if any of those pages are mapped.
		 */
		rcu_read_lock();

		xas_for_each(xas, folio, ULONG_MAX) {
			stop = true;
			if (xas_retry(xas, folio))
				continue;
			if (xa_is_value(folio))
				break;
			if (folio->index != index) {
				xas_reset(xas);
				break;
			}

			if (!folio_try_get_rcu(folio)) {
				xas_reset(xas);
				continue;
			}

			/* Has the folio moved or been split? */
			if (unlikely(folio != xas_reload(xas))) {
				folio_put(folio);
				xas_reset(xas);
				break;
			}

			if (!folio_trylock(folio)) {
				folio_put(folio);
				xas_reset(xas);
				break;
			}
			if (!folio_test_dirty(folio) ||
			    folio_test_writeback(folio)) {
				folio_unlock(folio);
				folio_put(folio);
				xas_reset(xas);
				break;
			}

			stop = false;
			len = folio_size(folio);
			priv = folio_get_private(folio);
			if ((const struct netfs_group *)priv != group) {
				stop = true;
				finfo = netfs_folio_info(folio);
				if (!finfo ||
				    finfo->netfs_group != group ||
				    finfo->dirty_offset > 0) {
					folio_unlock(folio);
					folio_put(folio);
					xas_reset(xas);
					break;
				}
				len = finfo->dirty_len;
			}

			*_top += folio_size(folio);
			index += folio_nr_pages(folio);
			*_count -= folio_nr_pages(folio);
			*_len += len;
			if (*_len >= max_len || *_count <= 0)
				stop = true;

			if (!folio_batch_add(&fbatch, folio))
				break;
			if (stop)
				break;
		}

		xas_pause(xas);
		rcu_read_unlock();

		/* Now, if we obtained any folios, we can shift them to being
		 * writable and mark them for caching.
		 */
		if (!folio_batch_count(&fbatch))
			break;

		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];
			if (group == NETFS_FOLIO_COPY_TO_CACHE)
				trace_netfs_folio(folio, netfs_folio_trace_copy_plus);
			else
				trace_netfs_folio(folio, netfs_folio_trace_store_plus);

			if (!folio_clear_dirty_for_io(folio))
				BUG();
			folio_start_writeback(folio);
			folio_unlock(folio);
		}

		folio_batch_release(&fbatch);
		cond_resched();
	} while (!stop);
}

/*
 * Synchronously write back the locked page and any subsequent non-locked dirty
 * pages.
 */
static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
						  struct writeback_control *wbc,
						  struct netfs_group *group,
						  struct xa_state *xas,
						  struct folio *folio,
						  unsigned long long start,
						  unsigned long long end)
{
	struct netfs_io_request *wreq;
	struct netfs_folio *finfo;
	struct netfs_inode *ctx = netfs_inode(mapping->host);
	unsigned long long i_size = i_size_read(&ctx->inode);
	size_t len, max_len;
	long count = wbc->nr_to_write;
	int ret;

	_enter(",%lx,%llx-%llx", folio->index, start, end);

	wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
				   group == NETFS_FOLIO_COPY_TO_CACHE ?
				   NETFS_COPY_TO_CACHE : NETFS_WRITEBACK);
	if (IS_ERR(wreq)) {
		folio_unlock(folio);
		return PTR_ERR(wreq);
	}

	if (!folio_clear_dirty_for_io(folio))
		BUG();
	folio_start_writeback(folio);

	count -= folio_nr_pages(folio);

	/* Find all consecutive lockable dirty pages that have contiguous
	 * written regions, stopping when we find a page that is not
	 * immediately lockable, is not dirty or is missing, or we reach the
	 * end of the range.
	 */
	if (group == NETFS_FOLIO_COPY_TO_CACHE)
		trace_netfs_folio(folio, netfs_folio_trace_copy);
	else
		trace_netfs_folio(folio, netfs_folio_trace_store);

	len = wreq->len;
	finfo = netfs_folio_info(folio);
	if (finfo) {
		start += finfo->dirty_offset;
		if (finfo->dirty_offset + finfo->dirty_len != len) {
			len = finfo->dirty_len;
			goto cant_expand;
		}
		len = finfo->dirty_len;
	}

	if (start < i_size) {
		/* Trim the write to the EOF; the extra data is ignored.  Also
		 * put an upper limit on the size of a single storedata op.
		 */
		max_len = 65536 * 4096;
		max_len = min_t(unsigned long long, max_len, end - start + 1);
		max_len = min_t(unsigned long long, max_len, i_size - start);

		if (len < max_len)
			netfs_extend_writeback(mapping, group, xas, &count, start,
					       max_len, &len, &wreq->upper_len);
	}

cant_expand:
	len = min_t(unsigned long long, len, i_size - start);

	/* We now have a contiguous set of dirty pages, each with writeback
	 * set; the first page is still locked at this point, but all the rest
	 * have been unlocked.
	 */
	folio_unlock(folio);
	wreq->start = start;
	wreq->len = len;

	if (start < i_size) {
		_debug("write back %zx @%llx [%llx]", len, start, i_size);

		/* Speculatively write to the cache.  We have to fix this up
		 * later if the store fails.
		 */
		wreq->cleanup = netfs_cleanup_buffered_write;

		iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
				wreq->upper_len);
		if (group != NETFS_FOLIO_COPY_TO_CACHE) {
			__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
			ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
		} else {
			ret = netfs_begin_write(wreq, true, netfs_write_trace_copy_to_cache);
		}
		if (ret == 0 || ret == -EIOCBQUEUED)
			wbc->nr_to_write -= len / PAGE_SIZE;
	} else {
		_debug("write discard %zx @%llx [%llx]", len, start, i_size);

		/* The dirty region was entirely beyond the EOF. */
		netfs_pages_written_back(wreq);
		ret = 0;
	}

	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
	_leave(" = 1");
	return 1;
}

/*
 * Write a region of pages back to the server
 */
static ssize_t netfs_writepages_begin(struct address_space *mapping,
				      struct writeback_control *wbc,
				      struct netfs_group *group,
				      struct xa_state *xas,
				      unsigned long long *_start,
				      unsigned long long end)
{
	const struct netfs_folio *finfo;
	struct folio *folio;
	unsigned long long start = *_start;
	ssize_t ret;
	void *priv;
	int skips = 0;

	_enter("%llx,%llx,", start, end);

search_again:
	/* Find the first dirty page in the group. */
	rcu_read_lock();

	for (;;) {
		folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
		if (xas_retry(xas, folio) || xa_is_value(folio))
			continue;
		if (!folio)
			break;

		if (!folio_try_get_rcu(folio)) {
			xas_reset(xas);
			continue;
		}

		if (unlikely(folio != xas_reload(xas))) {
			folio_put(folio);
			xas_reset(xas);
			continue;
		}

		/* Skip any dirty folio that's not in the group of interest. */
		priv = folio_get_private(folio);
		if ((const struct netfs_group *)priv == NETFS_FOLIO_COPY_TO_CACHE) {
			group = NETFS_FOLIO_COPY_TO_CACHE;
		} else if ((const struct netfs_group *)priv != group) {
			finfo = __netfs_folio_info(priv);
			if (!finfo || finfo->netfs_group != group) {
				folio_put(folio);
				continue;
			}
		}

		xas_pause(xas);
		break;
	}
	rcu_read_unlock();
	if (!folio)
		return 0;

	start = folio_pos(folio); /* May regress with THPs */

	_debug("wback %lx", folio->index);

	/* At this point we hold neither the i_pages lock nor the page lock:
	 * the page may be truncated or invalidated (changing page->mapping to
	 * NULL), or even swizzled back from swapper_space to tmpfs file
	 * mapping
	 */
lock_again:
	if (wbc->sync_mode != WB_SYNC_NONE) {
		ret = folio_lock_killable(folio);
		if (ret < 0)
			return ret;
	} else {
		if (!folio_trylock(folio))
			goto search_again;
	}

	if (folio->mapping != mapping ||
	    !folio_test_dirty(folio)) {
		start += folio_size(folio);
		folio_unlock(folio);
		goto search_again;
	}

	if (folio_test_writeback(folio)) {
		folio_unlock(folio);
		if (wbc->sync_mode != WB_SYNC_NONE) {
			folio_wait_writeback(folio);
			goto lock_again;
		}

		start += folio_size(folio);
		if (wbc->sync_mode == WB_SYNC_NONE) {
			if (skips >= 5 || need_resched()) {
				ret = 0;
				goto out;
			}
			skips++;
		}
		goto search_again;
	}

	ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
						 folio, start, end);
out:
	if (ret > 0)
		*_start = start + ret;
	_leave(" = %zd [%llx]", ret, *_start);
	return ret;
}

/*
 * Write a region of pages back to the server
 */
static int netfs_writepages_region(struct address_space *mapping,
				   struct writeback_control *wbc,
				   struct netfs_group *group,
				   unsigned long long *_start,
				   unsigned long long end)
{
	ssize_t ret;

	XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);

	do {
		ret = netfs_writepages_begin(mapping, wbc, group, &xas,
					     _start, end);
		if (ret > 0 && wbc->nr_to_write > 0)
			cond_resched();
	} while (ret > 0 && wbc->nr_to_write > 0);

	return ret > 0 ? 0 : ret;
}

/*
 * write some of the pending data back to the server
 */
int netfs_writepages(struct address_space *mapping,
		     struct writeback_control *wbc)
{
	struct netfs_group *group = NULL;
	loff_t start, end;
	int ret;

	_enter("");

	/* We have to be careful as we can end up racing with setattr()
	 * truncating the pagecache since the caller doesn't take a lock here
	 * to prevent it.
	 */

	if (wbc->range_cyclic && mapping->writeback_index) {
		start = mapping->writeback_index * PAGE_SIZE;
		ret = netfs_writepages_region(mapping, wbc, group,
					      &start, LLONG_MAX);
		if (ret < 0)
			goto out;

		if (wbc->nr_to_write <= 0) {
			mapping->writeback_index = start / PAGE_SIZE;
			goto out;
		}

		start = 0;
		end = mapping->writeback_index * PAGE_SIZE;
		mapping->writeback_index = 0;
		ret = netfs_writepages_region(mapping, wbc, group, &start, end);
		if (ret == 0)
			mapping->writeback_index = start / PAGE_SIZE;
	} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
		start = 0;
		ret = netfs_writepages_region(mapping, wbc, group,
					      &start, LLONG_MAX);
		if (wbc->nr_to_write > 0 && ret == 0)
			mapping->writeback_index = start / PAGE_SIZE;
	} else {
		start = wbc->range_start;
		ret = netfs_writepages_region(mapping, wbc, group,
					      &start, wbc->range_end);
	}

out:
	_leave(" = %d", ret);
	return ret;
}
EXPORT_SYMBOL(netfs_writepages);