// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem high-level write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/pagevec.h>
#include "internal.h"

/*
 * Determined write method.  Adjust netfs_folio_traces if this is changed.
 */
enum netfs_how_to_modify {
	NETFS_FOLIO_IS_UPTODATE,	/* Folio is uptodate already */
	NETFS_JUST_PREFETCH,		/* We have to read the folio anyway */
	NETFS_WHOLE_FOLIO_MODIFY,	/* We're going to overwrite the whole folio */
	NETFS_MODIFY_AND_CLEAR,		/* We can assume there is no data to be downloaded. */
	NETFS_STREAMING_WRITE,		/* Store incomplete data in non-uptodate page. */
	NETFS_STREAMING_WRITE_CONT,	/* Continue streaming write. */
	NETFS_FLUSH_CONTENT,		/* Flush incompatible content. */
};

static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);

static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
	if (netfs_group && !folio_get_private(folio))
		folio_attach_private(folio, netfs_get_group(netfs_group));
}

#if IS_ENABLED(CONFIG_FSCACHE)
static void netfs_folio_start_fscache(bool caching, struct folio *folio)
{
	if (caching)
		folio_start_fscache(folio);
}
#else
static void netfs_folio_start_fscache(bool caching, struct folio *folio)
{
}
#endif

/*
 * Decide how we should modify a folio.  We might be attempting to do
 * write-streaming, in which case we don't want to do a local RMW cycle if we
 * can avoid it.  If we're doing local caching or content crypto, we award
 * that priority over avoiding RMW.  If the file is open readably, then we
 * also assume that we may want to read what we wrote.
 */
static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
						    struct file *file,
						    struct folio *folio,
						    void *netfs_group,
						    size_t flen,
						    size_t offset,
						    size_t len,
						    bool maybe_trouble)
{
	struct netfs_folio *finfo = netfs_folio_info(folio);
	loff_t pos = folio_file_pos(folio);

	_enter("");

	if (netfs_folio_group(folio) != netfs_group)
		return NETFS_FLUSH_CONTENT;

	if (folio_test_uptodate(folio))
		return NETFS_FOLIO_IS_UPTODATE;

	if (pos >= ctx->zero_point)
		return NETFS_MODIFY_AND_CLEAR;

	if (!maybe_trouble && offset == 0 && len >= flen)
		return NETFS_WHOLE_FOLIO_MODIFY;

	if (file->f_mode & FMODE_READ)
		goto no_write_streaming;
	if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
		goto no_write_streaming;

	if (netfs_is_cache_enabled(ctx)) {
		/* We don't want to get a streaming write on a file that loses
		 * caching service temporarily because the backing store got
		 * culled.
		 */
		if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
			set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
		goto no_write_streaming;
	}

	if (!finfo)
		return NETFS_STREAMING_WRITE;

	/* We can continue a streaming write only if it continues on from the
	 * previous.  If it overlaps, we must flush lest we suffer a partial
	 * copy and disjoint dirty regions.
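	 * For example, if the folio already has a dirty region covering bytes
	 * 0-99, a new write beginning at byte 100 simply extends it, whereas
	 * a write beginning at byte 50 (overlap) or byte 200 (gap) forces a
	 * flush.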
	 */
	if (offset == finfo->dirty_offset + finfo->dirty_len)
		return NETFS_STREAMING_WRITE_CONT;
	return NETFS_FLUSH_CONTENT;

no_write_streaming:
	if (finfo) {
		netfs_stat(&netfs_n_wh_wstream_conflict);
		return NETFS_FLUSH_CONTENT;
	}
	return NETFS_JUST_PREFETCH;
}

/*
 * Grab a folio for writing and lock it.  Attempt to allocate as large a folio
 * as possible to hold as much of the remaining length as possible in one go.
 */
static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
						loff_t pos, size_t part)
{
	pgoff_t index = pos / PAGE_SIZE;
	fgf_t fgp_flags = FGP_WRITEBEGIN;

	if (mapping_large_folio_support(mapping))
		fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);

	return __filemap_get_folio(mapping, index, fgp_flags,
				   mapping_gfp_mask(mapping));
}

/**
 * netfs_perform_write - Copy data into the pagecache.
 * @iocb: The operation parameters
 * @iter: The source buffer
 * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
 *
 * Copy data into pagecache pages attached to the inode specified by @iocb.
 * The caller must hold appropriate inode locks.
 *
 * Dirty pages are tagged with a netfs_folio struct if they're not up to date
 * to indicate the range modified.  Dirty pages may also be tagged with a
 * netfs-specific grouping such that data from an old group gets flushed before
 * a new one is started.
 */
ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
			    struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct netfs_inode *ctx = netfs_inode(inode);
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.for_sync	= true,
		.nr_to_write	= LONG_MAX,
		.range_start	= iocb->ki_pos,
		.range_end	= iocb->ki_pos + iter->count,
	};
	struct netfs_io_request *wreq = NULL;
	struct netfs_folio *finfo;
	struct folio *folio;
	enum netfs_how_to_modify howto;
	enum netfs_folio_trace trace;
	unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0 : BDP_ASYNC;
	ssize_t written = 0, ret;
	loff_t i_size, pos = iocb->ki_pos, from, to;
	size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
	bool maybe_trouble = false;

	if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
		     iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
	    ) {
		if (pos < i_size_read(inode)) {
			ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
			if (ret < 0) {
				goto out;
			}
		}

		wbc_attach_fdatawrite_inode(&wbc, mapping->host);

		wreq = netfs_begin_writethrough(iocb, iter->count);
		if (IS_ERR(wreq)) {
			wbc_detach_inode(&wbc);
			ret = PTR_ERR(wreq);
			wreq = NULL;
			goto out;
		}
		if (!is_sync_kiocb(iocb))
			wreq->iocb = iocb;
		wreq->cleanup = netfs_cleanup_buffered_write;
	}

	do {
		size_t flen;
		size_t offset;	/* Offset into pagecache folio */
		size_t part;	/* Bytes to write to folio */
		size_t copied;	/* Bytes copied from user */

		ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
		if (unlikely(ret < 0))
			break;

		offset = pos & (max_chunk - 1);
		part = min(max_chunk - offset, iov_iter_count(iter));

		/* Bring in the user pages that we will copy from _first_ lest
		 * we hit a nasty deadlock on copying from the same page as
		 * we're writing to, without it being marked uptodate.
		 *
		 * Not only is this an optimisation, but it is also required to
		 * check that the address is actually valid, when atomic
		 * usercopies are used below.
		 *
		 * We rely on the page being held onto long enough by the LRU
		 * that we can grab it below if this causes it to be read.
		 */
		ret = -EFAULT;
		if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
			break;

		folio = netfs_grab_folio_for_write(mapping, pos, part);
		if (IS_ERR(folio)) {
			ret = PTR_ERR(folio);
			break;
		}

		flen = folio_size(folio);
		offset = pos & (flen - 1);
		part = min_t(size_t, flen - offset, part);

		if (signal_pending(current)) {
			ret = written ? -EINTR : -ERESTARTSYS;
			goto error_folio_unlock;
		}

		/* See if we need to prefetch the area we're going to modify.
		 * We need to do this before we get a lock on the folio in case
		 * there's more than one writer competing for the same cache
		 * block.
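		 * (NETFS_JUST_PREFETCH causes the folio to be read in first so
		 * that a sub-folio write doesn't destroy the data around the
		 * region being modified.)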
		 */
		howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
					    flen, offset, part, maybe_trouble);
		_debug("howto %u", howto);
		switch (howto) {
		case NETFS_JUST_PREFETCH:
			ret = netfs_prefetch_for_write(file, folio, offset, part);
			if (ret < 0) {
				_debug("prefetch = %zd", ret);
				goto error_folio_unlock;
			}
			break;
		case NETFS_FOLIO_IS_UPTODATE:
		case NETFS_WHOLE_FOLIO_MODIFY:
		case NETFS_STREAMING_WRITE_CONT:
			break;
		case NETFS_MODIFY_AND_CLEAR:
			zero_user_segment(&folio->page, 0, offset);
			break;
		case NETFS_STREAMING_WRITE:
			ret = -EIO;
			if (WARN_ON(folio_get_private(folio)))
				goto error_folio_unlock;
			break;
		case NETFS_FLUSH_CONTENT:
			trace_netfs_folio(folio, netfs_flush_content);
			from = folio_pos(folio);
			to = from + folio_size(folio) - 1;
			folio_unlock(folio);
			folio_put(folio);
			ret = filemap_write_and_wait_range(mapping, from, to);
			if (ret < 0)
				goto out; /* The folio is already unlocked and put */
			continue;
		}

		if (mapping_writably_mapped(mapping))
			flush_dcache_folio(folio);

		copied = copy_folio_from_iter_atomic(folio, offset, part, iter);

		flush_dcache_folio(folio);

		/* Deal with a (partially) failed copy */
		if (copied == 0) {
			ret = -EFAULT;
			goto error_folio_unlock;
		}

		trace = (enum netfs_folio_trace)howto;
		switch (howto) {
		case NETFS_FOLIO_IS_UPTODATE:
		case NETFS_JUST_PREFETCH:
			netfs_set_group(folio, netfs_group);
			break;
		case NETFS_MODIFY_AND_CLEAR:
			zero_user_segment(&folio->page, offset + copied, flen);
			netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			break;
		case NETFS_WHOLE_FOLIO_MODIFY:
			if (unlikely(copied < part)) {
				maybe_trouble = true;
				iov_iter_revert(iter, copied);
				copied = 0;
				goto retry;
			}
			netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			break;
		case NETFS_STREAMING_WRITE:
			if (offset == 0 && copied == flen) {
				netfs_set_group(folio, netfs_group);
				folio_mark_uptodate(folio);
				trace = netfs_streaming_filled_page;
				break;
			}
			finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
			if (!finfo) {
				iov_iter_revert(iter, copied);
				ret = -ENOMEM;
				goto error_folio_unlock;
			}
			finfo->netfs_group = netfs_get_group(netfs_group);
			finfo->dirty_offset = offset;
			finfo->dirty_len = copied;
			folio_attach_private(folio, (void *)((unsigned long)finfo |
							     NETFS_FOLIO_INFO));
			break;
		case NETFS_STREAMING_WRITE_CONT:
			finfo = netfs_folio_info(folio);
			finfo->dirty_len += copied;
			if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
				if (finfo->netfs_group)
					folio_change_private(folio, finfo->netfs_group);
				else
					folio_detach_private(folio);
				folio_mark_uptodate(folio);
				kfree(finfo);
				trace = netfs_streaming_cont_filled_page;
			}
			break;
		default:
			WARN(true, "Unexpected modify type %u ix=%lx\n",
			     howto, folio->index);
			ret = -EIO;
			goto error_folio_unlock;
		}

		trace_netfs_folio(folio, trace);

		/* Update the inode size if we moved the EOF marker */
		i_size = i_size_read(inode);
		pos += copied;
		if (pos > i_size) {
			if (ctx->ops->update_i_size) {
				ctx->ops->update_i_size(inode, pos);
			} else {
				i_size_write(inode, pos);
#if IS_ENABLED(CONFIG_FSCACHE)
				fscache_update_cookie(ctx->cache, NULL, &pos);
#endif
			}
		}
		written += copied;

		if (likely(!wreq)) {
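			/* Not a write-through write: just leave the folio
			 * dirty for ordinary writeback to pick up later.
			 */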
			folio_mark_dirty(folio);
		} else {
			if (folio_test_dirty(folio))
				/* Sigh.  mmap. */
				folio_clear_dirty_for_io(folio);
			/* We make multiple writes to the folio... */
			if (!folio_test_writeback(folio)) {
				folio_wait_fscache(folio);
				folio_start_writeback(folio);
				folio_start_fscache(folio);
				if (wreq->iter.count == 0)
					trace_netfs_folio(folio, netfs_folio_trace_wthru);
				else
					trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
			}
			netfs_advance_writethrough(wreq, copied,
						   offset + copied == flen);
		}
retry:
		folio_unlock(folio);
		folio_put(folio);
		folio = NULL;

		cond_resched();
	} while (iov_iter_count(iter));

out:
	if (unlikely(wreq)) {
		ret = netfs_end_writethrough(wreq, iocb);
		wbc_detach_inode(&wbc);
		if (ret == -EIOCBQUEUED)
			return ret;
	}

	iocb->ki_pos += written;
	_leave(" = %zd [%zd]", written, ret);
	return written ? written : ret;

error_folio_unlock:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}
EXPORT_SYMBOL(netfs_perform_write);

/**
 * netfs_buffered_write_iter_locked - write data to a file
 * @iocb: IO state structure (file, offset, etc.)
 * @from: iov_iter with data to write
 * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
 *
 * This function does all the work needed for actually writing data to a
 * file.  It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * The caller must hold appropriate locks around this function and have called
 * generic_write_checks() already.  The caller is also responsible for doing
 * any necessary syncing afterwards.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it.  This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
					 struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	ssize_t ret;

	trace_netfs_write_iter(iocb, from);

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	ret = file_update_time(file);
	if (ret)
		return ret;

	return netfs_perform_write(iocb, from, netfs_group);
}
EXPORT_SYMBOL(netfs_buffered_write_iter_locked);

/**
 * netfs_file_write_iter - write data to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Perform a write to a file, writing into the pagecache if possible and doing
 * an unbuffered write instead if not.
 *
 * Return:
 * * Negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * Number of bytes written, even for truncated writes
 */
ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;

	_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));

	if ((iocb->ki_flags & IOCB_DIRECT) ||
	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
		return netfs_unbuffered_write_iter(iocb, from);

	ret = netfs_start_io_write(inode);
	if (ret < 0)
		return ret;

	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
	netfs_end_io_write(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(netfs_file_write_iter);

/*
 * Notification that a previously read-only page is about to become writable.
 * Note that the caller indicates a single page of a multipage folio.
 */
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
	struct folio *folio = page_folio(vmf->page);
	struct file *file = vmf->vma->vm_file;
	struct inode *inode = file_inode(file);
	vm_fault_t ret = VM_FAULT_RETRY;
	int err;

	_enter("%lx", folio->index);

	sb_start_pagefault(inode->i_sb);

	if (folio_wait_writeback_killable(folio))
		goto out;

	if (folio_lock_killable(folio) < 0)
		goto out;

	/* Can we see a streaming write here? */
	if (WARN_ON(!folio_test_uptodate(folio))) {
		ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
		goto out;
	}

	if (netfs_folio_group(folio) != netfs_group) {
		folio_unlock(folio);
		err = filemap_fdatawait_range(inode->i_mapping,
					      folio_pos(folio),
					      folio_pos(folio) + folio_size(folio));
		switch (err) {
		case 0:
			ret = VM_FAULT_RETRY;
			goto out;
		case -ENOMEM:
			ret = VM_FAULT_OOM;
			goto out;
		default:
			ret = VM_FAULT_SIGBUS;
			goto out;
		}
	}

	if (folio_test_dirty(folio))
		trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
	else
		trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
	netfs_set_group(folio, netfs_group);
	file_update_time(file);
	ret = VM_FAULT_LOCKED;
out:
	sb_end_pagefault(inode->i_sb);
	return ret;
}
EXPORT_SYMBOL(netfs_page_mkwrite);

/*
 * Kill all the pages in the given range
 */
static void netfs_kill_pages(struct address_space *mapping,
			     loff_t start, loff_t len)
{
	struct folio *folio;
	pgoff_t index = start / PAGE_SIZE;
	pgoff_t last = (start + len - 1) / PAGE_SIZE, next;

	_enter("%llx-%llx", start, start + len - 1);

	do {
		_debug("kill %lx (to %lx)", index, last);

		folio = filemap_get_folio(mapping, index);
		if (IS_ERR(folio)) {
			next = index + 1;
			continue;
		}

		next = folio_next_index(folio);

		trace_netfs_folio(folio, netfs_folio_trace_kill);
		folio_clear_uptodate(folio);
		if (folio_test_fscache(folio))
			folio_end_fscache(folio);
		folio_end_writeback(folio);
		folio_lock(folio);
		generic_error_remove_folio(mapping, folio);
		folio_unlock(folio);
		folio_put(folio);

	} while (index = next, index <= last);

	_leave("");
}

/*
 * Redirty all the pages in a given range.
 */
static void netfs_redirty_pages(struct address_space *mapping,
				loff_t start, loff_t len)
{
	struct folio *folio;
	pgoff_t index = start / PAGE_SIZE;
	pgoff_t last = (start + len - 1) / PAGE_SIZE, next;

	_enter("%llx-%llx", start, start + len - 1);

	do {
		_debug("redirty %llx @%llx", len, start);

		folio = filemap_get_folio(mapping, index);
		if (IS_ERR(folio)) {
			next = index + 1;
			continue;
		}

		next = folio_next_index(folio);
		trace_netfs_folio(folio, netfs_folio_trace_redirty);
		filemap_dirty_folio(mapping, folio);
		if (folio_test_fscache(folio))
			folio_end_fscache(folio);
		folio_end_writeback(folio);
		folio_put(folio);
	} while (index = next, index <= last);

	balance_dirty_pages_ratelimited(mapping);

	_leave("");
}

/*
 * Completion of write to server
 */
static void netfs_pages_written_back(struct netfs_io_request *wreq)
{
	struct address_space *mapping = wreq->mapping;
	struct netfs_folio *finfo;
	struct netfs_group *group = NULL;
	struct folio *folio;
	pgoff_t last;
	int gcount = 0;

	XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);

	_enter("%llx-%llx", wreq->start, wreq->start + wreq->len);

	rcu_read_lock();

	last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
	xas_for_each(&xas, folio, last) {
		WARN(!folio_test_writeback(folio),
		     "bad %zx @%llx page %lx %lx\n",
		     wreq->len, wreq->start, folio->index, last);

		if ((finfo = netfs_folio_info(folio))) {
			/* Streaming writes cannot be redirtied whilst under
			 * writeback, so discard the streaming record.
			 */
			folio_detach_private(folio);
			group = finfo->netfs_group;
			gcount++;
			trace_netfs_folio(folio, netfs_folio_trace_clear_s);
			kfree(finfo);
		} else if ((group = netfs_folio_group(folio))) {
			/* Need to detach the group pointer if the page didn't
			 * get redirtied.  If it has been redirtied, then it
			 * must be within the same group.
			 */
			if (folio_test_dirty(folio)) {
				trace_netfs_folio(folio, netfs_folio_trace_redirtied);
				goto end_wb;
			}
			if (folio_trylock(folio)) {
				if (!folio_test_dirty(folio)) {
					folio_detach_private(folio);
					gcount++;
					trace_netfs_folio(folio, netfs_folio_trace_clear_g);
				} else {
					trace_netfs_folio(folio, netfs_folio_trace_redirtied);
				}
				folio_unlock(folio);
				goto end_wb;
			}

			xas_pause(&xas);
			rcu_read_unlock();
			folio_lock(folio);
			if (!folio_test_dirty(folio)) {
				folio_detach_private(folio);
				gcount++;
				trace_netfs_folio(folio, netfs_folio_trace_clear_g);
			} else {
				trace_netfs_folio(folio, netfs_folio_trace_redirtied);
			}
			folio_unlock(folio);
			rcu_read_lock();
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_clear);
		}
	end_wb:
		if (folio_test_fscache(folio))
			folio_end_fscache(folio);
		xas_advance(&xas, folio_next_index(folio) - 1);
		folio_end_writeback(folio);
	}

	rcu_read_unlock();
	netfs_put_group_many(group, gcount);
	_leave("");
}

/*
 * Deal with the disposition of the folios that are under writeback to close
 * out the operation.
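 * On success the folios are simply cleaned up; retryable errors (ENOSPC,
 * EDQUOT, key problems and the like) cause them to be redirtied; fatal errors
 * cause them to be discarded.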
 */
static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
{
	struct address_space *mapping = wreq->mapping;

	_enter("");

	switch (wreq->error) {
	case 0:
		netfs_pages_written_back(wreq);
		break;

	default:
		pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
		fallthrough;
	case -EACCES:
	case -EPERM:
	case -ENOKEY:
	case -EKEYEXPIRED:
	case -EKEYREJECTED:
	case -EKEYREVOKED:
	case -ENETRESET:
	case -EDQUOT:
	case -ENOSPC:
		netfs_redirty_pages(mapping, wreq->start, wreq->len);
		break;

	case -EROFS:
	case -EIO:
	case -EREMOTEIO:
	case -EFBIG:
	case -ENOENT:
	case -ENOMEDIUM:
	case -ENXIO:
		netfs_kill_pages(mapping, wreq->start, wreq->len);
		break;
	}

	if (wreq->error)
		mapping_set_error(mapping, wreq->error);
	if (wreq->netfs_ops->done)
		wreq->netfs_ops->done(wreq);
}

/*
 * Extend the region to be written back to include subsequent contiguously
 * dirty pages if possible, but don't sleep while doing so.
 *
 * If this page holds new content, then we can include filler zeros in the
 * writeback.
 */
static void netfs_extend_writeback(struct address_space *mapping,
				   struct netfs_group *group,
				   struct xa_state *xas,
				   long *_count,
				   loff_t start,
				   loff_t max_len,
				   bool caching,
				   size_t *_len,
				   size_t *_top)
{
	struct netfs_folio *finfo;
	struct folio_batch fbatch;
	struct folio *folio;
	unsigned int i;
	pgoff_t index = (start + *_len) / PAGE_SIZE;
	size_t len;
	void *priv;
	bool stop = true;

	folio_batch_init(&fbatch);

	do {
		/* Firstly, we gather up a batch of contiguous dirty pages
		 * under the RCU read lock - but we can't clear the dirty flags
		 * there if any of those pages are mapped.
		 */
		rcu_read_lock();

		xas_for_each(xas, folio, ULONG_MAX) {
			stop = true;
			if (xas_retry(xas, folio))
				continue;
			if (xa_is_value(folio))
				break;
			if (folio->index != index) {
				xas_reset(xas);
				break;
			}

			if (!folio_try_get_rcu(folio)) {
				xas_reset(xas);
				continue;
			}

			/* Has the folio moved or been split? */
			if (unlikely(folio != xas_reload(xas))) {
				folio_put(folio);
				xas_reset(xas);
				break;
			}

			if (!folio_trylock(folio)) {
				folio_put(folio);
				xas_reset(xas);
				break;
			}
			if (!folio_test_dirty(folio) ||
			    folio_test_writeback(folio) ||
			    folio_test_fscache(folio)) {
				folio_unlock(folio);
				folio_put(folio);
				xas_reset(xas);
				break;
			}

			stop = false;
			len = folio_size(folio);
			priv = folio_get_private(folio);
			if ((const struct netfs_group *)priv != group) {
				stop = true;
				finfo = netfs_folio_info(folio);
				if (finfo->netfs_group != group ||
				    finfo->dirty_offset > 0) {
					folio_unlock(folio);
					folio_put(folio);
					xas_reset(xas);
					break;
				}
				len = finfo->dirty_len;
			}

			*_top += folio_size(folio);
			index += folio_nr_pages(folio);
			*_count -= folio_nr_pages(folio);
			*_len += len;
			if (*_len >= max_len || *_count <= 0)
				stop = true;

			if (!folio_batch_add(&fbatch, folio))
				break;
			if (stop)
				break;
		}

		xas_pause(xas);
		rcu_read_unlock();

		/* Now, if we obtained any folios, we can shift them to being
		 * writable and mark them for caching.
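		 * (That is, clear the dirty flag, set the writeback flag and,
		 * if we're going to write to the cache, set the fscache flag
		 * too.)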
		 */
		if (!folio_batch_count(&fbatch))
			break;

		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];
			trace_netfs_folio(folio, netfs_folio_trace_store_plus);

			if (!folio_clear_dirty_for_io(folio))
				BUG();
			folio_start_writeback(folio);
			netfs_folio_start_fscache(caching, folio);
			folio_unlock(folio);
		}

		folio_batch_release(&fbatch);
		cond_resched();
	} while (!stop);
}

/*
 * Synchronously write back the locked page and any subsequent non-locked dirty
 * pages.
 */
static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
						  struct writeback_control *wbc,
						  struct netfs_group *group,
						  struct xa_state *xas,
						  struct folio *folio,
						  unsigned long long start,
						  unsigned long long end)
{
	struct netfs_io_request *wreq;
	struct netfs_folio *finfo;
	struct netfs_inode *ctx = netfs_inode(mapping->host);
	unsigned long long i_size = i_size_read(&ctx->inode);
	size_t len, max_len;
	bool caching = netfs_is_cache_enabled(ctx);
	long count = wbc->nr_to_write;
	int ret;

	_enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching);

	wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
				   NETFS_WRITEBACK);
	if (IS_ERR(wreq)) {
		folio_unlock(folio);
		return PTR_ERR(wreq);
	}

	if (!folio_clear_dirty_for_io(folio))
		BUG();
	folio_start_writeback(folio);
	netfs_folio_start_fscache(caching, folio);

	count -= folio_nr_pages(folio);

	/* Find all consecutive lockable dirty pages that have contiguous
	 * written regions, stopping when we find a page that is not
	 * immediately lockable, is not dirty or is missing, or we reach the
	 * end of the range.
	 */
	trace_netfs_folio(folio, netfs_folio_trace_store);

	len = wreq->len;
	finfo = netfs_folio_info(folio);
	if (finfo) {
		start += finfo->dirty_offset;
		if (finfo->dirty_offset + finfo->dirty_len != len) {
			len = finfo->dirty_len;
			goto cant_expand;
		}
		len = finfo->dirty_len;
	}

	if (start < i_size) {
		/* Trim the write to the EOF; the extra data is ignored.  Also
		 * put an upper limit on the size of a single storedata op.
		 */
		max_len = 65536 * 4096;
		max_len = min_t(unsigned long long, max_len, end - start + 1);
		max_len = min_t(unsigned long long, max_len, i_size - start);

		if (len < max_len)
			netfs_extend_writeback(mapping, group, xas, &count, start,
					       max_len, caching, &len, &wreq->upper_len);
	}

cant_expand:
	len = min_t(unsigned long long, len, i_size - start);

	/* We now have a contiguous set of dirty pages, each with writeback
	 * set; the first page is still locked at this point, but all the rest
	 * have been unlocked.
	 */
	folio_unlock(folio);
	wreq->start = start;
	wreq->len = len;

	if (start < i_size) {
		_debug("write back %zx @%llx [%llx]", len, start, i_size);

		/* Speculatively write to the cache.  We have to fix this up
		 * later if the store fails.
		 */
		wreq->cleanup = netfs_cleanup_buffered_write;

		iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
				wreq->upper_len);
		__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
		ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
		if (ret == 0 || ret == -EIOCBQUEUED)
			wbc->nr_to_write -= len / PAGE_SIZE;
	} else {
		_debug("write discard %zx @%llx [%llx]", len, start, i_size);

		/* The dirty region was entirely beyond the EOF. */
		fscache_clear_page_bits(mapping, start, len, caching);
		netfs_pages_written_back(wreq);
		ret = 0;
	}

	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
	_leave(" = 1");
	return 1;
}

/*
 * Write a region of pages back to the server
 */
static ssize_t netfs_writepages_begin(struct address_space *mapping,
				      struct writeback_control *wbc,
				      struct netfs_group *group,
				      struct xa_state *xas,
				      unsigned long long *_start,
				      unsigned long long end)
{
	const struct netfs_folio *finfo;
	struct folio *folio;
	unsigned long long start = *_start;
	ssize_t ret;
	void *priv;
	int skips = 0;

	_enter("%llx,%llx,", start, end);

search_again:
	/* Find the first dirty page in the group. */
	rcu_read_lock();

	for (;;) {
		folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
		if (xas_retry(xas, folio) || xa_is_value(folio))
			continue;
		if (!folio)
			break;

		if (!folio_try_get_rcu(folio)) {
			xas_reset(xas);
			continue;
		}

		if (unlikely(folio != xas_reload(xas))) {
			folio_put(folio);
			xas_reset(xas);
			continue;
		}

		/* Skip any dirty folio that's not in the group of interest.
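		 * A streaming-write folio carries a netfs_folio record rather
		 * than a bare group pointer, so check the group recorded
		 * there as well.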
		 */
		priv = folio_get_private(folio);
		if ((const struct netfs_group *)priv != group) {
			finfo = netfs_folio_info(folio);
			if (finfo->netfs_group != group) {
				folio_put(folio);
				continue;
			}
		}

		xas_pause(xas);
		break;
	}
	rcu_read_unlock();
	if (!folio)
		return 0;

	start = folio_pos(folio); /* May regress with THPs */

	_debug("wback %lx", folio->index);

	/* At this point we hold neither the i_pages lock nor the page lock:
	 * the page may be truncated or invalidated (changing page->mapping to
	 * NULL), or even swizzled back from swapper_space to tmpfs file
	 * mapping
	 */
lock_again:
	if (wbc->sync_mode != WB_SYNC_NONE) {
		ret = folio_lock_killable(folio);
		if (ret < 0)
			return ret;
	} else {
		if (!folio_trylock(folio))
			goto search_again;
	}

	if (folio->mapping != mapping ||
	    !folio_test_dirty(folio)) {
		start += folio_size(folio);
		folio_unlock(folio);
		goto search_again;
	}

	if (folio_test_writeback(folio) ||
	    folio_test_fscache(folio)) {
		folio_unlock(folio);
		if (wbc->sync_mode != WB_SYNC_NONE) {
			folio_wait_writeback(folio);
#ifdef CONFIG_FSCACHE
			folio_wait_fscache(folio);
#endif
			goto lock_again;
		}

		start += folio_size(folio);
		if (wbc->sync_mode == WB_SYNC_NONE) {
			if (skips >= 5 || need_resched()) {
				ret = 0;
				goto out;
			}
			skips++;
		}
		goto search_again;
	}

	ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
						 folio, start, end);
out:
	if (ret > 0)
		*_start = start + ret;
	_leave(" = %zd [%llx]", ret, *_start);
	return ret;
}

/*
 * Write a region of pages back to the server
 */
static int netfs_writepages_region(struct address_space *mapping,
				   struct writeback_control *wbc,
				   struct netfs_group *group,
				   unsigned long long *_start,
				   unsigned long long end)
{
	ssize_t ret;

	XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);

	do {
		ret = netfs_writepages_begin(mapping, wbc, group, &xas,
					     _start, end);
		if (ret > 0 && wbc->nr_to_write > 0)
			cond_resched();
	} while (ret > 0 && wbc->nr_to_write > 0);

	return ret > 0 ? 0 : ret;
}

/*
 * write some of the pending data back to the server
 */
int netfs_writepages(struct address_space *mapping,
		     struct writeback_control *wbc)
{
	struct netfs_group *group = NULL;
	loff_t start, end;
	int ret;

	_enter("");

	/* We have to be careful as we can end up racing with setattr()
	 * truncating the pagecache since the caller doesn't take a lock here
	 * to prevent it.
	 */

	if (wbc->range_cyclic && mapping->writeback_index) {
		start = mapping->writeback_index * PAGE_SIZE;
		ret = netfs_writepages_region(mapping, wbc, group,
					      &start, LLONG_MAX);
		if (ret < 0)
			goto out;

		if (wbc->nr_to_write <= 0) {
			mapping->writeback_index = start / PAGE_SIZE;
			goto out;
		}

		start = 0;
		end = mapping->writeback_index * PAGE_SIZE;
		mapping->writeback_index = 0;
		ret = netfs_writepages_region(mapping, wbc, group, &start, end);
		if (ret == 0)
			mapping->writeback_index = start / PAGE_SIZE;
	} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
		start = 0;
		ret = netfs_writepages_region(mapping, wbc, group,
					      &start, LLONG_MAX);
		if (wbc->nr_to_write > 0 && ret == 0)
			mapping->writeback_index = start / PAGE_SIZE;
	} else {
		start = wbc->range_start;
		ret = netfs_writepages_region(mapping, wbc, group,
					      &start, wbc->range_end);
	}

out:
	_leave(" = %d", ret);
	return ret;
}
EXPORT_SYMBOL(netfs_writepages);

/*
 * Deal with the disposition of a laundered folio.
 */
static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
{
	if (wreq->error) {
		pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
		mapping_set_error(wreq->mapping, wreq->error);
	}
}

/**
 * netfs_launder_folio - Clean up a dirty folio that's being invalidated
 * @folio: The folio to clean
 *
 * This is called to write back a folio that's being invalidated when an inode
 * is getting torn down.  Ideally, writepages would be used instead.
 */
int netfs_launder_folio(struct folio *folio)
{
	struct netfs_io_request *wreq;
	struct address_space *mapping = folio->mapping;
	struct netfs_folio *finfo = netfs_folio_info(folio);
	struct netfs_group *group = netfs_folio_group(folio);
	struct bio_vec bvec;
	unsigned long long i_size = i_size_read(mapping->host);
	unsigned long long start = folio_pos(folio);
	size_t offset = 0, len;
	int ret = 0;

	if (finfo) {
		offset = finfo->dirty_offset;
		start += offset;
		len = finfo->dirty_len;
	} else {
		len = folio_size(folio);
	}
	len = min_t(unsigned long long, len, i_size - start);

	wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
	if (IS_ERR(wreq)) {
		ret = PTR_ERR(wreq);
		goto out;
	}

	if (!folio_clear_dirty_for_io(folio))
		goto out_put;

	trace_netfs_folio(folio, netfs_folio_trace_launder);

	_debug("launder %llx-%llx", start, start + len - 1);

	/* Speculatively write to the cache.  We have to fix this up later if
	 * the store fails.
	 */
	wreq->cleanup = netfs_cleanup_launder_folio;

	bvec_set_folio(&bvec, folio, len, offset);
	iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
	ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);

out_put:
	folio_detach_private(folio);
	netfs_put_group(group);
	kfree(finfo);
	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
out:
	folio_wait_fscache(folio);
	_leave(" = %d", ret);
	return ret;
}
EXPORT_SYMBOL(netfs_launder_folio);