// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem high-level write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/pagevec.h>
#include "internal.h"

/*
 * Determined write method.  Adjust netfs_folio_traces if this is changed.
 */
enum netfs_how_to_modify {
	NETFS_FOLIO_IS_UPTODATE,	/* Folio is uptodate already */
	NETFS_JUST_PREFETCH,		/* We have to read the folio anyway */
	NETFS_WHOLE_FOLIO_MODIFY,	/* We're going to overwrite the whole folio */
	NETFS_MODIFY_AND_CLEAR,		/* We can assume there is no data to be downloaded. */
	NETFS_STREAMING_WRITE,		/* Store incomplete data in non-uptodate page. */
	NETFS_STREAMING_WRITE_CONT,	/* Continue streaming write. */
	NETFS_FLUSH_CONTENT,		/* Flush incompatible content. */
};

static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);

static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
	if (netfs_group && !folio_get_private(folio))
		folio_attach_private(folio, netfs_get_group(netfs_group));
}

#if IS_ENABLED(CONFIG_FSCACHE)
static void netfs_folio_start_fscache(bool caching, struct folio *folio)
{
	if (caching)
		folio_start_fscache(folio);
}
#else
static void netfs_folio_start_fscache(bool caching, struct folio *folio)
{
}
#endif

/*
 * Decide how we should modify a folio.  We might be attempting to do
 * write-streaming, in which case we don't want to do a local RMW cycle if we
 * can avoid it.  If we're doing local caching or content crypto, we award that
 * priority over avoiding RMW.  If the file is open readably, then we also
 * assume that we may want to read what we wrote.
 */
static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
						    struct file *file,
						    struct folio *folio,
						    void *netfs_group,
						    size_t flen,
						    size_t offset,
						    size_t len,
						    bool maybe_trouble)
{
	struct netfs_folio *finfo = netfs_folio_info(folio);
	loff_t pos = folio_file_pos(folio);

	_enter("");

	if (netfs_folio_group(folio) != netfs_group)
		return NETFS_FLUSH_CONTENT;

	if (folio_test_uptodate(folio))
		return NETFS_FOLIO_IS_UPTODATE;

	if (pos >= ctx->zero_point)
		return NETFS_MODIFY_AND_CLEAR;

	if (!maybe_trouble && offset == 0 && len >= flen)
		return NETFS_WHOLE_FOLIO_MODIFY;

	if (file->f_mode & FMODE_READ)
		goto no_write_streaming;
	if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
		goto no_write_streaming;

	if (netfs_is_cache_enabled(ctx)) {
		/* We don't want to get a streaming write on a file that loses
		 * caching service temporarily because the backing store got
		 * culled.
		 */
		if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
			set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
		goto no_write_streaming;
	}

	if (!finfo)
		return NETFS_STREAMING_WRITE;

	/* We can continue a streaming write only if it continues on from the
	 * previous.  If it overlaps, we must flush lest we suffer a partial
	 * copy and disjoint dirty regions.
	 */
	if (offset == finfo->dirty_offset + finfo->dirty_len)
		return NETFS_STREAMING_WRITE_CONT;
	return NETFS_FLUSH_CONTENT;

no_write_streaming:
	if (finfo) {
		netfs_stat(&netfs_n_wh_wstream_conflict);
		return NETFS_FLUSH_CONTENT;
	}
	return NETFS_JUST_PREFETCH;
}

/*
 * Grab a folio for writing and lock it.  Attempt to allocate as large a folio
 * as possible to hold as much of the remaining length as possible in one go.
 */
static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
						loff_t pos, size_t part)
{
	pgoff_t index = pos / PAGE_SIZE;
	fgf_t fgp_flags = FGP_WRITEBEGIN;

	if (mapping_large_folio_support(mapping))
		fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);

	return __filemap_get_folio(mapping, index, fgp_flags,
				   mapping_gfp_mask(mapping));
}

/**
 * netfs_perform_write - Copy data into the pagecache.
 * @iocb: The operation parameters
 * @iter: The source buffer
 * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
 *
 * Copy data into pagecache pages attached to the inode specified by @iocb.
 * The caller must hold appropriate inode locks.
 *
 * Dirty pages are tagged with a netfs_folio struct if they're not up to date
 * to indicate the range modified.  Dirty pages may also be tagged with a
 * netfs-specific grouping such that data from an old group gets flushed before
 * a new one is started.
 */
ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
			    struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct netfs_inode *ctx = netfs_inode(inode);
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.for_sync	= true,
		.nr_to_write	= LONG_MAX,
		.range_start	= iocb->ki_pos,
		.range_end	= iocb->ki_pos + iter->count,
	};
	struct netfs_io_request *wreq = NULL;
	struct netfs_folio *finfo;
	struct folio *folio;
	enum netfs_how_to_modify howto;
	enum netfs_folio_trace trace;
	unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
	ssize_t written = 0, ret;
	loff_t i_size, pos = iocb->ki_pos, from, to;
	size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
	bool maybe_trouble = false;

	if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
		     iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
	    ) {
		if (pos < i_size_read(inode)) {
			ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
			if (ret < 0) {
				goto out;
			}
		}

		wbc_attach_fdatawrite_inode(&wbc, mapping->host);

		wreq = netfs_begin_writethrough(iocb, iter->count);
		if (IS_ERR(wreq)) {
			wbc_detach_inode(&wbc);
			ret = PTR_ERR(wreq);
			wreq = NULL;
			goto out;
		}
		if (!is_sync_kiocb(iocb))
			wreq->iocb = iocb;
		wreq->cleanup = netfs_cleanup_buffered_write;
	}

	do {
		size_t flen;
		size_t offset;	/* Offset into pagecache folio */
		size_t part;	/* Bytes to write to folio */
		size_t copied;	/* Bytes copied from user */

		ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
		if (unlikely(ret < 0))
			break;

		offset = pos & (max_chunk - 1);
		part = min(max_chunk - offset, iov_iter_count(iter));

		/* Bring in the user pages that we will copy from _first_ lest
		 * we hit a nasty deadlock on copying from the same page as
		 * we're writing to, without it being marked uptodate.
		 *
		 * Not only is this an optimisation, but it is also required to
		 * check that the address is actually valid, when atomic
		 * usercopies are used below.
		 *
		 * We rely on the page being held onto long enough by the LRU
		 * that we can grab it below if this causes it to be read.
		 */
		ret = -EFAULT;
		if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
			break;

		ret = -ENOMEM;
		folio = netfs_grab_folio_for_write(mapping, pos, part);
		if (!folio)
			break;

		flen = folio_size(folio);
		offset = pos & (flen - 1);
		part = min_t(size_t, flen - offset, part);

		if (signal_pending(current)) {
			ret = written ? -EINTR : -ERESTARTSYS;
			goto error_folio_unlock;
		}

		/* See if we need to prefetch the area we're going to modify.
		 * We need to do this before we get a lock on the folio in case
		 * there's more than one writer competing for the same cache
		 * block.
		 */
		howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
					    flen, offset, part, maybe_trouble);
		_debug("howto %u", howto);
		switch (howto) {
		case NETFS_JUST_PREFETCH:
			ret = netfs_prefetch_for_write(file, folio, offset, part);
			if (ret < 0) {
				_debug("prefetch = %zd", ret);
				goto error_folio_unlock;
			}
			break;
		case NETFS_FOLIO_IS_UPTODATE:
		case NETFS_WHOLE_FOLIO_MODIFY:
		case NETFS_STREAMING_WRITE_CONT:
			break;
		case NETFS_MODIFY_AND_CLEAR:
			zero_user_segment(&folio->page, 0, offset);
			break;
		case NETFS_STREAMING_WRITE:
			ret = -EIO;
			if (WARN_ON(folio_get_private(folio)))
				goto error_folio_unlock;
			break;
		case NETFS_FLUSH_CONTENT:
			trace_netfs_folio(folio, netfs_flush_content);
			from = folio_pos(folio);
			to = from + folio_size(folio) - 1;
			folio_unlock(folio);
			folio_put(folio);
			ret = filemap_write_and_wait_range(mapping, from, to);
			if (ret < 0)
				goto error_folio_unlock;
			continue;
		}

		if (mapping_writably_mapped(mapping))
			flush_dcache_folio(folio);

		copied = copy_folio_from_iter_atomic(folio, offset, part, iter);

		flush_dcache_folio(folio);

		/* Deal with a (partially) failed copy */
		if (copied == 0) {
			ret = -EFAULT;
			goto error_folio_unlock;
		}

		trace = (enum netfs_folio_trace)howto;
		switch (howto) {
		case NETFS_FOLIO_IS_UPTODATE:
		case NETFS_JUST_PREFETCH:
			netfs_set_group(folio, netfs_group);
			break;
		case NETFS_MODIFY_AND_CLEAR:
			zero_user_segment(&folio->page, offset + copied, flen);
			netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			break;
		case NETFS_WHOLE_FOLIO_MODIFY:
			if (unlikely(copied < part)) {
				maybe_trouble = true;
				iov_iter_revert(iter, copied);
				copied = 0;
				goto retry;
			}
			netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			break;
		case NETFS_STREAMING_WRITE:
			if (offset == 0 && copied == flen) {
				netfs_set_group(folio, netfs_group);
				folio_mark_uptodate(folio);
				trace = netfs_streaming_filled_page;
				break;
			}
			finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
			if (!finfo) {
				iov_iter_revert(iter, copied);
				ret = -ENOMEM;
				goto error_folio_unlock;
			}
			finfo->netfs_group = netfs_get_group(netfs_group);
			finfo->dirty_offset = offset;
			finfo->dirty_len = copied;
			folio_attach_private(folio, (void *)((unsigned long)finfo |
							     NETFS_FOLIO_INFO));
			break;
		case NETFS_STREAMING_WRITE_CONT:
			finfo = netfs_folio_info(folio);
			finfo->dirty_len += copied;
			if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
				if (finfo->netfs_group)
					folio_change_private(folio, finfo->netfs_group);
				else
					folio_detach_private(folio);
				folio_mark_uptodate(folio);
				kfree(finfo);
				trace = netfs_streaming_cont_filled_page;
			}
			break;
		default:
			WARN(true, "Unexpected modify type %u ix=%lx\n",
			     howto, folio_index(folio));
			ret = -EIO;
			goto error_folio_unlock;
		}

		trace_netfs_folio(folio, trace);

		/* Update the inode size if we moved the EOF marker */
		i_size = i_size_read(inode);
		pos += copied;
		if (pos > i_size) {
			if (ctx->ops->update_i_size) {
				ctx->ops->update_i_size(inode, pos);
			} else {
				i_size_write(inode, pos);
#if IS_ENABLED(CONFIG_FSCACHE)
				fscache_update_cookie(ctx->cache, NULL, &pos);
#endif
			}
		}
		written += copied;

		if (likely(!wreq)) {
			folio_mark_dirty(folio);
		} else {
			if (folio_test_dirty(folio))
				/* Sigh. mmap. */
				folio_clear_dirty_for_io(folio);
			/* We make multiple writes to the folio... */
			if (!folio_test_writeback(folio)) {
				folio_wait_fscache(folio);
				folio_start_writeback(folio);
				folio_start_fscache(folio);
				if (wreq->iter.count == 0)
					trace_netfs_folio(folio, netfs_folio_trace_wthru);
				else
					trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
			}
			netfs_advance_writethrough(wreq, copied,
						   offset + copied == flen);
		}
	retry:
		folio_unlock(folio);
		folio_put(folio);
		folio = NULL;

		cond_resched();
	} while (iov_iter_count(iter));

out:
	if (unlikely(wreq)) {
		ret = netfs_end_writethrough(wreq, iocb);
		wbc_detach_inode(&wbc);
		if (ret == -EIOCBQUEUED)
			return ret;
	}

	iocb->ki_pos += written;
	_leave(" = %zd [%zd]", written, ret);
	return written ? written : ret;

error_folio_unlock:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}
EXPORT_SYMBOL(netfs_perform_write);

/**
 * netfs_buffered_write_iter_locked - write data to a file
 * @iocb: IO state structure (file, offset, etc.)
 * @from: iov_iter with data to write
 * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * The caller must hold appropriate locks around this function and have called
 * generic_write_checks() already.  The caller is also responsible for doing
 * any necessary syncing afterwards.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
					 struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	ssize_t ret;

	trace_netfs_write_iter(iocb, from);

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	ret = file_update_time(file);
	if (ret)
		return ret;

	return netfs_perform_write(iocb, from, netfs_group);
}
EXPORT_SYMBOL(netfs_buffered_write_iter_locked);
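
/*
 * Example: a filesystem that tags dirty data with a write grouping (in the
 * manner of ceph snap contexts) might embed struct netfs_group in its own
 * context object and pass it to netfs_buffered_write_iter_locked().  This is
 * only an illustrative sketch; the "myfs" names are hypothetical and not part
 * of this API:
 *
 *	struct myfs_snap_context {
 *		struct netfs_group group;	// refcounted via netfs_get/put_group()
 *		u64 snap_id;
 *	};
 *
 *	static ssize_t myfs_write_iter(struct kiocb *iocb, struct iov_iter *from)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		struct myfs_snap_context *ctx = myfs_current_snap_context(inode);
 *		ssize_t ret;
 *
 *		inode_lock(inode);
 *		ret = generic_write_checks(iocb, from);
 *		if (ret > 0)
 *			ret = netfs_buffered_write_iter_locked(iocb, from,
 *							       &ctx->group);
 *		inode_unlock(inode);
 *		return ret;
 *	}
 *
 * A folio dirtied under one group is flushed before data belonging to a
 * different group is allowed to land in it (see NETFS_FLUSH_CONTENT above).
 */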

/**
 * netfs_file_write_iter - write data to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Perform a write to a file, writing into the pagecache if possible and doing
 * an unbuffered write instead if not.
 *
 * Return:
 * * Negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * Number of bytes written, even for truncated writes
 */
ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;

	_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));

	if ((iocb->ki_flags & IOCB_DIRECT) ||
	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
		return netfs_unbuffered_write_iter(iocb, from);

	ret = netfs_start_io_write(inode);
	if (ret < 0)
		return ret;

	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
	netfs_end_io_write(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(netfs_file_write_iter);

/*
 * Notification that a previously read-only page is about to become writable.
 * Note that the caller indicates a single page of a multipage folio.
 */
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
	struct folio *folio = page_folio(vmf->page);
	struct file *file = vmf->vma->vm_file;
	struct inode *inode = file_inode(file);
	vm_fault_t ret = VM_FAULT_RETRY;
	int err;

	_enter("%lx", folio->index);

	sb_start_pagefault(inode->i_sb);

	if (folio_wait_writeback_killable(folio))
		goto out;

	if (folio_lock_killable(folio) < 0)
		goto out;

	/* Can we see a streaming write here? */
	if (WARN_ON(!folio_test_uptodate(folio))) {
		ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
		goto out;
	}

	if (netfs_folio_group(folio) != netfs_group) {
		folio_unlock(folio);
		err = filemap_fdatawait_range(inode->i_mapping,
					      folio_pos(folio),
					      folio_pos(folio) + folio_size(folio));
		switch (err) {
		case 0:
			ret = VM_FAULT_RETRY;
			goto out;
		case -ENOMEM:
			ret = VM_FAULT_OOM;
			goto out;
		default:
			ret = VM_FAULT_SIGBUS;
			goto out;
		}
	}

	if (folio_test_dirty(folio))
		trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
	else
		trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
	netfs_set_group(folio, netfs_group);
	file_update_time(file);
	ret = VM_FAULT_LOCKED;
out:
	sb_end_pagefault(inode->i_sb);
	return ret;
}
EXPORT_SYMBOL(netfs_page_mkwrite);
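
/*
 * Example: wiring the helpers above into a network filesystem.  A typical
 * user points ->write_iter at netfs_file_write_iter() (or a thin wrapper) and
 * calls netfs_page_mkwrite() from its ->page_mkwrite handler, passing the
 * dirty-data group the faulted folio should belong to (NULL if grouping isn't
 * used).  An illustrative sketch only; the "myfs" names are hypothetical:
 *
 *	static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
 *	{
 *		return netfs_page_mkwrite(vmf, NULL);
 *	}
 *
 *	static const struct vm_operations_struct myfs_vm_ops = {
 *		.fault		= filemap_fault,
 *		.map_pages	= filemap_map_pages,
 *		.page_mkwrite	= myfs_page_mkwrite,
 *	};
 *
 *	const struct file_operations myfs_file_operations = {
 *		.read_iter	= netfs_file_read_iter,
 *		.write_iter	= netfs_file_write_iter,
 *		.mmap		= myfs_file_mmap,	// installs myfs_vm_ops
 *		.fsync		= myfs_fsync,
 *		.llseek		= generic_file_llseek,
 *	};
 */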

/*
 * Kill all the pages in the given range
 */
static void netfs_kill_pages(struct address_space *mapping,
			     loff_t start, loff_t len)
{
	struct folio *folio;
	pgoff_t index = start / PAGE_SIZE;
	pgoff_t last = (start + len - 1) / PAGE_SIZE, next;

	_enter("%llx-%llx", start, start + len - 1);

	do {
		_debug("kill %lx (to %lx)", index, last);

		folio = filemap_get_folio(mapping, index);
		if (IS_ERR(folio)) {
			next = index + 1;
			continue;
		}

		next = folio_next_index(folio);

		trace_netfs_folio(folio, netfs_folio_trace_kill);
		folio_clear_uptodate(folio);
		if (folio_test_fscache(folio))
			folio_end_fscache(folio);
		folio_end_writeback(folio);
		folio_lock(folio);
		generic_error_remove_folio(mapping, folio);
		folio_unlock(folio);
		folio_put(folio);

	} while (index = next, index <= last);

	_leave("");
}

/*
 * Redirty all the pages in a given range.
 */
static void netfs_redirty_pages(struct address_space *mapping,
				loff_t start, loff_t len)
{
	struct folio *folio;
	pgoff_t index = start / PAGE_SIZE;
	pgoff_t last = (start + len - 1) / PAGE_SIZE, next;

	_enter("%llx-%llx", start, start + len - 1);

	do {
		_debug("redirty %llx @%llx", len, start);

		folio = filemap_get_folio(mapping, index);
		if (IS_ERR(folio)) {
			next = index + 1;
			continue;
		}

		next = folio_next_index(folio);
		trace_netfs_folio(folio, netfs_folio_trace_redirty);
		filemap_dirty_folio(mapping, folio);
		if (folio_test_fscache(folio))
			folio_end_fscache(folio);
		folio_end_writeback(folio);
		folio_put(folio);
	} while (index = next, index <= last);

	balance_dirty_pages_ratelimited(mapping);

	_leave("");
}

/*
 * Completion of write to server
 */
static void netfs_pages_written_back(struct netfs_io_request *wreq)
{
	struct address_space *mapping = wreq->mapping;
	struct netfs_folio *finfo;
	struct netfs_group *group = NULL;
	struct folio *folio;
	pgoff_t last;
	int gcount = 0;

	XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);

	_enter("%llx-%llx", wreq->start, wreq->start + wreq->len);

	rcu_read_lock();

	last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
	xas_for_each(&xas, folio, last) {
		WARN(!folio_test_writeback(folio),
		     "bad %zx @%llx page %lx %lx\n",
		     wreq->len, wreq->start, folio_index(folio), last);

		if ((finfo = netfs_folio_info(folio))) {
			/* Streaming writes cannot be redirtied whilst under
			 * writeback, so discard the streaming record.
			 */
			folio_detach_private(folio);
			group = finfo->netfs_group;
			gcount++;
			trace_netfs_folio(folio, netfs_folio_trace_clear_s);
			kfree(finfo);
		} else if ((group = netfs_folio_group(folio))) {
			/* Need to detach the group pointer if the page didn't
			 * get redirtied.  If it has been redirtied, then it
			 * must be within the same group.
			 */
			if (folio_test_dirty(folio)) {
				trace_netfs_folio(folio, netfs_folio_trace_redirtied);
				goto end_wb;
			}
			if (folio_trylock(folio)) {
				if (!folio_test_dirty(folio)) {
					folio_detach_private(folio);
					gcount++;
					trace_netfs_folio(folio, netfs_folio_trace_clear_g);
				} else {
					trace_netfs_folio(folio, netfs_folio_trace_redirtied);
				}
				folio_unlock(folio);
				goto end_wb;
			}

			xas_pause(&xas);
			rcu_read_unlock();
			folio_lock(folio);
			if (!folio_test_dirty(folio)) {
				folio_detach_private(folio);
				gcount++;
				trace_netfs_folio(folio, netfs_folio_trace_clear_g);
			} else {
				trace_netfs_folio(folio, netfs_folio_trace_redirtied);
			}
			folio_unlock(folio);
			rcu_read_lock();
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_clear);
		}
	end_wb:
		if (folio_test_fscache(folio))
			folio_end_fscache(folio);
		xas_advance(&xas, folio_next_index(folio) - 1);
		folio_end_writeback(folio);
	}

	rcu_read_unlock();
	netfs_put_group_many(group, gcount);
	_leave("");
}

/*
 * Deal with the disposition of the folios that are under writeback to close
 * out the operation.
 */
static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
{
	struct address_space *mapping = wreq->mapping;

	_enter("");

	switch (wreq->error) {
	case 0:
		netfs_pages_written_back(wreq);
		break;

	default:
		pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
		fallthrough;
	case -EACCES:
	case -EPERM:
	case -ENOKEY:
	case -EKEYEXPIRED:
	case -EKEYREJECTED:
	case -EKEYREVOKED:
	case -ENETRESET:
	case -EDQUOT:
	case -ENOSPC:
		netfs_redirty_pages(mapping, wreq->start, wreq->len);
		break;

	case -EROFS:
	case -EIO:
	case -EREMOTEIO:
	case -EFBIG:
	case -ENOENT:
	case -ENOMEDIUM:
	case -ENXIO:
		netfs_kill_pages(mapping, wreq->start, wreq->len);
		break;
	}

	if (wreq->error)
		mapping_set_error(mapping, wreq->error);
	if (wreq->netfs_ops->done)
		wreq->netfs_ops->done(wreq);
}

/*
 * Extend the region to be written back to include subsequent contiguously
 * dirty pages if possible, but don't sleep while doing so.
 *
 * If this page holds new content, then we can include filler zeros in the
 * writeback.
 */
static void netfs_extend_writeback(struct address_space *mapping,
				   struct netfs_group *group,
				   struct xa_state *xas,
				   long *_count,
				   loff_t start,
				   loff_t max_len,
				   bool caching,
				   size_t *_len,
				   size_t *_top)
{
	struct netfs_folio *finfo;
	struct folio_batch fbatch;
	struct folio *folio;
	unsigned int i;
	pgoff_t index = (start + *_len) / PAGE_SIZE;
	size_t len;
	void *priv;
	bool stop = true;

	folio_batch_init(&fbatch);

	do {
		/* Firstly, we gather up a batch of contiguous dirty pages
		 * under the RCU read lock - but we can't clear the dirty flags
		 * there if any of those pages are mapped.
		 */
		rcu_read_lock();

		xas_for_each(xas, folio, ULONG_MAX) {
			stop = true;
			if (xas_retry(xas, folio))
				continue;
			if (xa_is_value(folio))
				break;
			if (folio_index(folio) != index) {
				xas_reset(xas);
				break;
			}

			if (!folio_try_get_rcu(folio)) {
				xas_reset(xas);
				continue;
			}

			/* Has the folio moved or been split? */
			if (unlikely(folio != xas_reload(xas))) {
				folio_put(folio);
				xas_reset(xas);
				break;
			}

			if (!folio_trylock(folio)) {
				folio_put(folio);
				xas_reset(xas);
				break;
			}
			if (!folio_test_dirty(folio) ||
			    folio_test_writeback(folio) ||
			    folio_test_fscache(folio)) {
				folio_unlock(folio);
				folio_put(folio);
				xas_reset(xas);
				break;
			}

			stop = false;
			len = folio_size(folio);
			priv = folio_get_private(folio);
			if ((const struct netfs_group *)priv != group) {
				stop = true;
				finfo = netfs_folio_info(folio);
				if (finfo->netfs_group != group ||
				    finfo->dirty_offset > 0) {
					folio_unlock(folio);
					folio_put(folio);
					xas_reset(xas);
					break;
				}
				len = finfo->dirty_len;
			}

			*_top += folio_size(folio);
			index += folio_nr_pages(folio);
			*_count -= folio_nr_pages(folio);
			*_len += len;
			if (*_len >= max_len || *_count <= 0)
				stop = true;

			if (!folio_batch_add(&fbatch, folio))
				break;
			if (stop)
				break;
		}

		xas_pause(xas);
		rcu_read_unlock();

		/* Now, if we obtained any folios, we can shift them to being
		 * writable and mark them for caching.
		 */
		if (!folio_batch_count(&fbatch))
			break;

		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];
			trace_netfs_folio(folio, netfs_folio_trace_store_plus);

			if (!folio_clear_dirty_for_io(folio))
				BUG();
			folio_start_writeback(folio);
			netfs_folio_start_fscache(caching, folio);
			folio_unlock(folio);
		}

		folio_batch_release(&fbatch);
		cond_resched();
	} while (!stop);
}

/*
 * Synchronously write back the locked page and any subsequent non-locked dirty
 * pages.
 */
static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
						  struct writeback_control *wbc,
						  struct netfs_group *group,
						  struct xa_state *xas,
						  struct folio *folio,
						  unsigned long long start,
						  unsigned long long end)
{
	struct netfs_io_request *wreq;
	struct netfs_folio *finfo;
	struct netfs_inode *ctx = netfs_inode(mapping->host);
	unsigned long long i_size = i_size_read(&ctx->inode);
	size_t len, max_len;
	bool caching = netfs_is_cache_enabled(ctx);
	long count = wbc->nr_to_write;
	int ret;

	_enter(",%lx,%llx-%llx,%u", folio_index(folio), start, end, caching);

	wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
				   NETFS_WRITEBACK);
	if (IS_ERR(wreq)) {
		folio_unlock(folio);
		return PTR_ERR(wreq);
	}

	if (!folio_clear_dirty_for_io(folio))
		BUG();
	folio_start_writeback(folio);
	netfs_folio_start_fscache(caching, folio);

	count -= folio_nr_pages(folio);

	/* Find all consecutive lockable dirty pages that have contiguous
	 * written regions, stopping when we find a page that is not
	 * immediately lockable, is not dirty or is missing, or we reach the
	 * end of the range.
	 */
	trace_netfs_folio(folio, netfs_folio_trace_store);

	len = wreq->len;
	finfo = netfs_folio_info(folio);
	if (finfo) {
		start += finfo->dirty_offset;
		if (finfo->dirty_offset + finfo->dirty_len != len) {
			len = finfo->dirty_len;
			goto cant_expand;
		}
		len = finfo->dirty_len;
	}

	if (start < i_size) {
		/* Trim the write to the EOF; the extra data is ignored.  Also
		 * put an upper limit on the size of a single storedata op.
		 */
		max_len = 65536 * 4096;
		max_len = min_t(unsigned long long, max_len, end - start + 1);
		max_len = min_t(unsigned long long, max_len, i_size - start);

		if (len < max_len)
			netfs_extend_writeback(mapping, group, xas, &count, start,
					       max_len, caching, &len, &wreq->upper_len);
	}

cant_expand:
	len = min_t(unsigned long long, len, i_size - start);

	/* We now have a contiguous set of dirty pages, each with writeback
	 * set; the first page is still locked at this point, but all the rest
	 * have been unlocked.
	 */
	folio_unlock(folio);
	wreq->start = start;
	wreq->len = len;

	if (start < i_size) {
		_debug("write back %zx @%llx [%llx]", len, start, i_size);

		/* Speculatively write to the cache.  We have to fix this up
		 * later if the store fails.
		 */
		wreq->cleanup = netfs_cleanup_buffered_write;

		iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
				wreq->upper_len);
		__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
		ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
		if (ret == 0 || ret == -EIOCBQUEUED)
			wbc->nr_to_write -= len / PAGE_SIZE;
	} else {
		_debug("write discard %zx @%llx [%llx]", len, start, i_size);

		/* The dirty region was entirely beyond the EOF. */
		fscache_clear_page_bits(mapping, start, len, caching);
		netfs_pages_written_back(wreq);
		ret = 0;
	}

	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
	_leave(" = 1");
	return 1;
}

/*
 * Write a region of pages back to the server
 */
static ssize_t netfs_writepages_begin(struct address_space *mapping,
				      struct writeback_control *wbc,
				      struct netfs_group *group,
				      struct xa_state *xas,
				      unsigned long long *_start,
				      unsigned long long end)
{
	const struct netfs_folio *finfo;
	struct folio *folio;
	unsigned long long start = *_start;
	ssize_t ret;
	void *priv;
	int skips = 0;

	_enter("%llx,%llx,", start, end);

search_again:
	/* Find the first dirty page in the group. */
	rcu_read_lock();

	for (;;) {
		folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
		if (xas_retry(xas, folio) || xa_is_value(folio))
			continue;
		if (!folio)
			break;

		if (!folio_try_get_rcu(folio)) {
			xas_reset(xas);
			continue;
		}

		if (unlikely(folio != xas_reload(xas))) {
			folio_put(folio);
			xas_reset(xas);
			continue;
		}

		/* Skip any dirty folio that's not in the group of interest. */
		priv = folio_get_private(folio);
		if ((const struct netfs_group *)priv != group) {
			finfo = netfs_folio_info(folio);
			if (finfo->netfs_group != group) {
				folio_put(folio);
				continue;
			}
		}

		xas_pause(xas);
		break;
	}
	rcu_read_unlock();
	if (!folio)
		return 0;

	start = folio_pos(folio); /* May regress with THPs */

	_debug("wback %lx", folio_index(folio));

	/* At this point we hold neither the i_pages lock nor the page lock:
	 * the page may be truncated or invalidated (changing page->mapping to
	 * NULL), or even swizzled back from swapper_space to tmpfs file
	 * mapping
	 */
lock_again:
	if (wbc->sync_mode != WB_SYNC_NONE) {
		ret = folio_lock_killable(folio);
		if (ret < 0)
			return ret;
	} else {
		if (!folio_trylock(folio))
			goto search_again;
	}

	if (folio->mapping != mapping ||
	    !folio_test_dirty(folio)) {
		start += folio_size(folio);
		folio_unlock(folio);
		goto search_again;
	}

	if (folio_test_writeback(folio) ||
	    folio_test_fscache(folio)) {
		folio_unlock(folio);
		if (wbc->sync_mode != WB_SYNC_NONE) {
			folio_wait_writeback(folio);
#ifdef CONFIG_FSCACHE
			folio_wait_fscache(folio);
#endif
			goto lock_again;
		}

		start += folio_size(folio);
		if (wbc->sync_mode == WB_SYNC_NONE) {
			if (skips >= 5 || need_resched()) {
				ret = 0;
				goto out;
			}
			skips++;
		}
		goto search_again;
	}

	ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
						 folio, start, end);
out:
	if (ret > 0)
		*_start = start + ret;
	_leave(" = %zd [%llx]", ret, *_start);
	return ret;
}

/*
 * Write a region of pages back to the server
 */
static int netfs_writepages_region(struct address_space *mapping,
				   struct writeback_control *wbc,
				   struct netfs_group *group,
				   unsigned long long *_start,
				   unsigned long long end)
{
	ssize_t ret;

	XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);

	do {
		ret = netfs_writepages_begin(mapping, wbc, group, &xas,
					     _start, end);
		if (ret > 0 && wbc->nr_to_write > 0)
			cond_resched();
	} while (ret > 0 && wbc->nr_to_write > 0);

	return ret > 0 ? 0 : ret;
}

/*
 * write some of the pending data back to the server
 */
int netfs_writepages(struct address_space *mapping,
		     struct writeback_control *wbc)
{
	struct netfs_group *group = NULL;
	loff_t start, end;
	int ret;

	_enter("");

	/* We have to be careful as we can end up racing with setattr()
	 * truncating the pagecache since the caller doesn't take a lock here
	 * to prevent it.
	 */

	if (wbc->range_cyclic && mapping->writeback_index) {
		start = mapping->writeback_index * PAGE_SIZE;
		ret = netfs_writepages_region(mapping, wbc, group,
					      &start, LLONG_MAX);
		if (ret < 0)
			goto out;

		if (wbc->nr_to_write <= 0) {
			mapping->writeback_index = start / PAGE_SIZE;
			goto out;
		}

		start = 0;
		end = mapping->writeback_index * PAGE_SIZE;
		mapping->writeback_index = 0;
		ret = netfs_writepages_region(mapping, wbc, group, &start, end);
		if (ret == 0)
			mapping->writeback_index = start / PAGE_SIZE;
	} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
		start = 0;
		ret = netfs_writepages_region(mapping, wbc, group,
					      &start, LLONG_MAX);
		if (wbc->nr_to_write > 0 && ret == 0)
			mapping->writeback_index = start / PAGE_SIZE;
	} else {
		start = wbc->range_start;
		ret = netfs_writepages_region(mapping, wbc, group,
					      &start, wbc->range_end);
	}

out:
	_leave(" = %d", ret);
	return ret;
}
EXPORT_SYMBOL(netfs_writepages);

/*
 * Deal with the disposition of a laundered folio.
 */
static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
{
	if (wreq->error) {
		pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
		mapping_set_error(wreq->mapping, wreq->error);
	}
}

/**
 * netfs_launder_folio - Clean up a dirty folio that's being invalidated
 * @folio: The folio to clean
 *
 * This is called to write back a folio that's being invalidated when an inode
 * is getting torn down.  Ideally, writepages would be used instead.
 */
int netfs_launder_folio(struct folio *folio)
{
	struct netfs_io_request *wreq;
	struct address_space *mapping = folio->mapping;
	struct netfs_folio *finfo = netfs_folio_info(folio);
	struct netfs_group *group = netfs_folio_group(folio);
	struct bio_vec bvec;
	unsigned long long i_size = i_size_read(mapping->host);
	unsigned long long start = folio_pos(folio);
	size_t offset = 0, len;
	int ret = 0;

	if (finfo) {
		offset = finfo->dirty_offset;
		start += offset;
		len = finfo->dirty_len;
	} else {
		len = folio_size(folio);
	}
	len = min_t(unsigned long long, len, i_size - start);

	wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
	if (IS_ERR(wreq)) {
		ret = PTR_ERR(wreq);
		goto out;
	}

	if (!folio_clear_dirty_for_io(folio))
		goto out_put;

	trace_netfs_folio(folio, netfs_folio_trace_launder);

	_debug("launder %llx-%llx", start, start + len - 1);

	/* Speculatively write to the cache.  We have to fix this up later if
	 * the store fails.
	 */
	wreq->cleanup = netfs_cleanup_launder_folio;

	bvec_set_folio(&bvec, folio, len, offset);
	iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
	ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);

out_put:
	folio_detach_private(folio);
	netfs_put_group(group);
	kfree(finfo);
	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
out:
	folio_wait_fscache(folio);
	_leave(" = %d", ret);
	return ret;
}
EXPORT_SYMBOL(netfs_launder_folio);
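
/*
 * Example: the writeback entry points above are normally hooked up through
 * the filesystem's address_space_operations.  A minimal sketch only
 * (hypothetical "myfs"; the read-side and invalidation helpers come from
 * other netfs files and are assumptions here, not part of this file):
 *
 *	const struct address_space_operations myfs_aops = {
 *		.read_folio	= netfs_read_folio,
 *		.readahead	= netfs_readahead,
 *		.dirty_folio	= filemap_dirty_folio,
 *		.writepages	= netfs_writepages,
 *		.launder_folio	= netfs_launder_folio,
 *		.release_folio	= netfs_release_folio,
 *		.invalidate_folio = netfs_invalidate_folio,
 *	};
 *
 * netfs_writepages() walks the dirty tag in ->i_pages, batches contiguous
 * dirty folios into a netfs_io_request and then lets
 * netfs_cleanup_buffered_write() redirty or kill the folios according to the
 * server's result.
 */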