// SPDX-License-Identifier: GPL-2.0-or-later
/* Network filesystem high-level buffered read support.
 *
 * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"

/*
 * Unlock the folios in a read operation.  We need to set PG_fscache on any
 * folios we're going to write back before we unlock them.
 */
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;
	struct netfs_folio *finfo;
	struct folio *folio;
	pgoff_t start_page = rreq->start / PAGE_SIZE;
	pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
	size_t account = 0;
	bool subreq_failed = false;

	XA_STATE(xas, &rreq->mapping->i_pages, start_page);

	if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
		__clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
		list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
			__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
		}
	}

	/* Walk through the pagecache and the I/O request lists simultaneously.
	 * We may have a mixture of cached and uncached sections and we only
	 * really want to write out the uncached sections.  This is slightly
	 * complicated by the possibility that we might have huge pages with a
	 * mixture inside.
	 */
	subreq = list_first_entry(&rreq->subrequests,
				  struct netfs_io_subrequest, rreq_link);
	subreq_failed = (subreq->error < 0);

	trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);

	rcu_read_lock();
	xas_for_each(&xas, folio, last_page) {
		loff_t pg_end;
		bool pg_failed = false;
		bool folio_started;

		if (xas_retry(&xas, folio))
			continue;

		pg_end = folio_pos(folio) + folio_size(folio) - 1;

		folio_started = false;
		for (;;) {
			loff_t sreq_end;

			if (!subreq) {
				pg_failed = true;
				break;
			}
			if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
				trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
				folio_start_fscache(folio);
				folio_started = true;
			}
			pg_failed |= subreq_failed;
			sreq_end = subreq->start + subreq->len - 1;
			if (pg_end < sreq_end)
				break;

			account += subreq->transferred;
			if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
				subreq = list_next_entry(subreq, rreq_link);
				subreq_failed = (subreq->error < 0);
			} else {
				subreq = NULL;
				subreq_failed = false;
			}

			if (pg_end == sreq_end)
				break;
		}

		if (!pg_failed) {
			flush_dcache_folio(folio);
			finfo = netfs_folio_info(folio);
			if (finfo) {
				trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
				if (finfo->netfs_group)
					folio_change_private(folio, finfo->netfs_group);
				else
					folio_detach_private(folio);
				kfree(finfo);
			}
			folio_mark_uptodate(folio);
		}

		if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
			if (folio->index == rreq->no_unlock_folio &&
			    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
				_debug("no unlock");
			else
				folio_unlock(folio);
		}
	}
	rcu_read_unlock();

	task_io_account_read(account);
	if (rreq->netfs_ops->done)
		rreq->netfs_ops->done(rreq);
}

static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
					 loff_t *_start, size_t *_len, loff_t i_size)
{
	struct netfs_cache_resources *cres = &rreq->cache_resources;

	if (cres->ops && cres->ops->expand_readahead)
		cres->ops->expand_readahead(cres, _start, _len, i_size);
}

static void netfs_rreq_expand(struct netfs_io_request *rreq,
			      struct readahead_control *ractl)
{
	/* Give the cache a chance to change the request parameters.  The
	 * resultant request must contain the original region.
	 */
	netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);

	/* Give the netfs a chance to change the request parameters.  The
	 * resultant request must contain the original region.
	 */
	if (rreq->netfs_ops->expand_readahead)
		rreq->netfs_ops->expand_readahead(rreq);

	/* Expand the request if the cache wants it to start earlier.  Note
	 * that the expansion may get further extended if the VM wishes to
	 * insert THPs and the preferred start and/or end wind up in the middle
	 * of THPs.
	 *
	 * If this is the case, however, the THP size should be an integer
	 * multiple of the cache granule size, so we get a whole number of
	 * granules to deal with.
	 */
	if (rreq->start != readahead_pos(ractl) ||
	    rreq->len != readahead_length(ractl)) {
		readahead_expand(ractl, rreq->start, rreq->len);
		rreq->start = readahead_pos(ractl);
		rreq->len = readahead_length(ractl);

		trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
				 netfs_read_trace_expanded);
	}
}

/*
 * Begin an operation, and fetch the stored zero point value from the cookie if
 * available.
 */
static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
{
	return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
}

/**
 * netfs_readahead - Helper to manage a read request
 * @ractl: The description of the readahead request
 *
 * Fulfil a readahead request by drawing data from the cache if possible, or
 * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
 * requests from different sources will get munged together.  If necessary, the
 * readahead window can be expanded in either direction to a more convenient
 * alignment for RPC efficiency or to make storage in the cache feasible.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
void netfs_readahead(struct readahead_control *ractl)
{
	struct netfs_io_request *rreq;
	struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
	int ret;

	_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));

	if (readahead_count(ractl) == 0)
		return;

	rreq = netfs_alloc_request(ractl->mapping, ractl->file,
				   readahead_pos(ractl),
				   readahead_length(ractl),
				   NETFS_READAHEAD);
	if (IS_ERR(rreq))
		return;

	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto cleanup_free;

	netfs_stat(&netfs_n_rh_readahead);
	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
			 netfs_read_trace_readahead);

	netfs_rreq_expand(rreq, ractl);

	/* Set up the output buffer */
	iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
			rreq->start, rreq->len);

	/* Drop the refs on the folios here rather than in the cache or
	 * filesystem.  The locks will be dropped in netfs_rreq_unlock_folios().
	 */
	while (readahead_folio(ractl))
		;

	netfs_begin_read(rreq, false);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
	return;

cleanup_free:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
	return;
}
EXPORT_SYMBOL(netfs_readahead);
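
/*
 * Illustrative sketch (not part of this file): the "netfs context
 * contiguous to the vfs inode" mentioned above is normally obtained by
 * embedding struct netfs_inode in the filesystem's own inode structure
 * (conventionally as the first member), so that netfs_inode() can recover
 * it from a bare struct inode *.  The names myfs_inode and MYFS_I below
 * are hypothetical placeholders:
 *
 *	struct myfs_inode {
 *		struct netfs_inode netfs;	// netfs context + vfs inode
 *		// ... filesystem-private fields ...
 *	};
 *
 *	static inline struct myfs_inode *MYFS_I(struct inode *inode)
 *	{
 *		return container_of(inode, struct myfs_inode, netfs.inode);
 *	}
 *
 * The embedded context is initialised (e.g. via netfs_inode_init()) when
 * the inode is set up, before any of these helpers are called.
 */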

/**
 * netfs_read_folio - Helper to manage a read_folio request
 * @file: The file to read from
 * @folio: The folio to read
 *
 * Fulfil a read_folio request by drawing data from the cache if
 * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_read_folio(struct file *file, struct folio *folio)
{
	struct address_space *mapping = folio->mapping;
	struct netfs_io_request *rreq;
	struct netfs_inode *ctx = netfs_inode(mapping->host);
	struct folio *sink = NULL;
	int ret;

	_enter("%lx", folio->index);

	rreq = netfs_alloc_request(mapping, file,
				   folio_file_pos(folio), folio_size(folio),
				   NETFS_READPAGE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto alloc_error;
	}

	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto discard;

	netfs_stat(&netfs_n_rh_readpage);
	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);

	/* Set up the output buffer */
	if (folio_test_dirty(folio)) {
		/* Handle someone trying to read from an unflushed streaming
		 * write.  We fiddle the buffer so that a gap at the beginning
		 * and/or a gap at the end get copied to, but the middle is
		 * discarded.
		 */
		struct netfs_folio *finfo = netfs_folio_info(folio);
		struct bio_vec *bvec;
		unsigned int from = finfo->dirty_offset;
		unsigned int to = from + finfo->dirty_len;
		unsigned int off = 0, i = 0;
		size_t flen = folio_size(folio);
		size_t nr_bvec = flen / PAGE_SIZE + 2;
		size_t part;

		ret = -ENOMEM;
		bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
		if (!bvec)
			goto discard;

		sink = folio_alloc(GFP_KERNEL, 0);
		if (!sink)
			goto discard;

		trace_netfs_folio(folio, netfs_folio_trace_read_gaps);

		rreq->direct_bv = bvec;
		rreq->direct_bv_count = nr_bvec;
		if (from > 0) {
			bvec_set_folio(&bvec[i++], folio, from, 0);
			off = from;
		}
		while (off < to) {
			part = min_t(size_t, to - off, PAGE_SIZE);
			bvec_set_folio(&bvec[i++], sink, part, 0);
			off += part;
		}
		if (to < flen)
			bvec_set_folio(&bvec[i++], folio, flen - to, to);
		iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
	} else {
		iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
				rreq->start, rreq->len);
	}

	ret = netfs_begin_read(rreq, true);
	if (sink)
		folio_put(sink);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
	return ret < 0 ? ret : 0;

discard:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
alloc_error:
	folio_unlock(folio);
	return ret;
}
EXPORT_SYMBOL(netfs_read_folio);
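
/*
 * Illustrative sketch (not part of this file): a network filesystem
 * typically uses netfs_read_folio() and netfs_readahead() directly as its
 * address_space_operations read methods; their prototypes match
 * ->read_folio() and ->readahead().  The name myfs_aops and the other
 * methods hinted at are hypothetical placeholders:
 *
 *	const struct address_space_operations myfs_aops = {
 *		.read_folio	= netfs_read_folio,
 *		.readahead	= netfs_readahead,
 *		// .dirty_folio, .writepages, etc. as provided by the fs
 *	};
 */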

/*
 * Prepare a folio for writing without reading first
 * @folio: The folio being prepared
 * @pos: starting position for the write
 * @len: length of write
 * @always_fill: T if the folio should always be completely filled/cleared
 *
 * In some cases, write_begin doesn't need to read at all:
 * - full folio write
 * - write that lies in a folio that is completely beyond EOF
 * - write that covers the folio from start to EOF or beyond it
 *
 * If any of these criteria are met, then zero out the unwritten parts
 * of the folio and return true.  Otherwise, return false.
 */
static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
				  bool always_fill)
{
	struct inode *inode = folio_inode(folio);
	loff_t i_size = i_size_read(inode);
	size_t offset = offset_in_folio(folio, pos);
	size_t plen = folio_size(folio);

	if (unlikely(always_fill)) {
		if (pos - offset + len <= i_size)
			return false; /* Page entirely before EOF */
		zero_user_segment(&folio->page, 0, plen);
		folio_mark_uptodate(folio);
		return true;
	}

	/* Full folio write */
	if (offset == 0 && len >= plen)
		return true;

	/* Page entirely beyond the end of the file */
	if (pos - offset >= i_size)
		goto zero_out;

	/* Write that covers from the start of the folio to EOF or beyond */
	if (offset == 0 && (pos + len) >= i_size)
		goto zero_out;

	return false;
zero_out:
	zero_user_segments(&folio->page, 0, offset, offset + len, plen);
	return true;
}

/**
 * netfs_write_begin - Helper to prepare for writing
 * @ctx: The netfs context
 * @file: The file to read from
 * @mapping: The mapping to read from
 * @pos: File position at which the write will begin
 * @len: The length of the write (may extend beyond the end of the folio chosen)
 * @_folio: Where to put the resultant folio
 * @_fsdata: Place for the netfs to store a cookie
 *
 * Pre-read data for a write-begin request by drawing data from the cache if
 * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together.  If
 * necessary, the readahead window can be expanded in either direction to a
 * more convenient alignment for RPC efficiency or to make storage in the cache
 * feasible.
 *
 * The calling netfs must provide a table of operations, only one of which,
 * issue_read, is mandatory.
 *
 * The check_write_begin() operation can be provided to check for and flush
 * conflicting writes once the folio is grabbed and locked.  It is passed a
 * pointer to the fsdata cookie that gets returned to the VM to be passed to
 * write_end.  It is permitted to sleep.  It should return 0 if the request
 * should go ahead or it may return an error.  It may also unlock and put the
 * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
 * will cause the folio to be re-got and the process to be retried.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_write_begin(struct netfs_inode *ctx,
		      struct file *file, struct address_space *mapping,
		      loff_t pos, unsigned int len, struct folio **_folio,
		      void **_fsdata)
{
	struct netfs_io_request *rreq;
	struct folio *folio;
	pgoff_t index = pos >> PAGE_SHIFT;
	int ret;

	DEFINE_READAHEAD(ractl, file, NULL, mapping, index);

retry:
	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
				    mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (ctx->ops->check_write_begin) {
		/* Allow the netfs (eg. ceph) to flush conflicts. */
		ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
		if (ret < 0) {
			trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
			goto error;
		}
		if (!folio)
			goto retry;
	}

	if (folio_test_uptodate(folio))
		goto have_folio;

	/* If the page is beyond the EOF, we want to clear it - unless it's
	 * within the cache granule containing the EOF, in which case we need
	 * to preload the granule.
	 */
	if (!netfs_is_cache_enabled(ctx) &&
	    netfs_skip_folio_read(folio, pos, len, false)) {
		netfs_stat(&netfs_n_rh_write_zskip);
		goto have_folio_no_wait;
	}

	rreq = netfs_alloc_request(mapping, file,
				   folio_file_pos(folio), folio_size(folio),
				   NETFS_READ_FOR_WRITE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto error;
	}
	rreq->no_unlock_folio = folio->index;
	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);

	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto error_put;

	netfs_stat(&netfs_n_rh_write_begin);
	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);

	/* Expand the request to meet caching requirements and download
	 * preferences.
	 */
	ractl._nr_pages = folio_nr_pages(folio);
	netfs_rreq_expand(rreq, &ractl);

	/* Set up the output buffer */
	iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
			rreq->start, rreq->len);

	/* We hold the folio locks, so we can drop the references */
	folio_get(folio);
	while (readahead_folio(&ractl))
		;

	ret = netfs_begin_read(rreq, true);
	if (ret < 0)
		goto error;
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);

have_folio:
	ret = folio_wait_fscache_killable(folio);
	if (ret < 0)
		goto error;
have_folio_no_wait:
	*_folio = folio;
	_leave(" = 0");
	return 0;

error_put:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
error:
	if (folio) {
		folio_unlock(folio);
		folio_put(folio);
	}
	_leave(" = %d", ret);
	return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
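
/*
 * Illustrative sketch (not part of this file): a filesystem's own
 * ->write_begin() is typically a thin wrapper around netfs_write_begin().
 * The names myfs_write_begin and MYFS_I are hypothetical, and the
 * struct page **-based prototype assumes a kernel in which ->write_begin()
 * has not yet been converted to folios:
 *
 *	static int myfs_write_begin(struct file *file,
 *				    struct address_space *mapping,
 *				    loff_t pos, unsigned int len,
 *				    struct page **pagep, void **fsdata)
 *	{
 *		struct folio *folio;
 *		int ret;
 *
 *		ret = netfs_write_begin(&MYFS_I(mapping->host)->netfs, file,
 *					mapping, pos, len, &folio, fsdata);
 *		if (ret == 0)
 *			*pagep = folio_file_page(folio, pos / PAGE_SIZE);
 *		return ret;
 *	}
 */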

/*
 * Preload the data into a page we're proposing to write into.
 */
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
			     size_t offset, size_t len)
{
	struct netfs_io_request *rreq;
	struct address_space *mapping = folio->mapping;
	struct netfs_inode *ctx = netfs_inode(mapping->host);
	unsigned long long start = folio_pos(folio);
	size_t flen = folio_size(folio);
	int ret;

	_enter("%zx @%llx", flen, start);

	ret = -ENOMEM;

	rreq = netfs_alloc_request(mapping, file, start, flen,
				   NETFS_READ_FOR_WRITE);
	if (IS_ERR(rreq)) {
		ret = PTR_ERR(rreq);
		goto error;
	}

	rreq->no_unlock_folio = folio->index;
	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
	ret = netfs_begin_cache_read(rreq, ctx);
	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
		goto error_put;

	netfs_stat(&netfs_n_rh_write_begin);
	trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);

	/* Set up the output buffer */
	iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
			rreq->start, rreq->len);

	ret = netfs_begin_read(rreq, true);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
	return ret;

error_put:
	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
error:
	_leave(" = %d", ret);
	return ret;
}

/**
 * netfs_buffered_read_iter - Filesystem buffered I/O read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the ->read_iter() routine for all filesystems that can use the page
 * cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
 * returned when no data can be read without waiting for I/O requests to
 * complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
 * shall be made for the read or for readahead.  When no data can be read,
 * -EAGAIN shall be returned.  When readahead would be triggered, a partial,
 * possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;

	if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
			 test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
		return -EINVAL;

	ret = netfs_start_io_read(inode);
	if (ret == 0) {
		ret = filemap_read(iocb, iter, 0);
		netfs_end_io_read(inode);
	}
	return ret;
}
EXPORT_SYMBOL(netfs_buffered_read_iter);

/**
 * netfs_file_read_iter - Generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the ->read_iter() routine for all filesystems that can use the page
 * cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
 * returned when no data can be read without waiting for I/O requests to
 * complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
 * shall be made for the read or for readahead.  When no data can be read,
 * -EAGAIN shall be returned.  When readahead would be triggered, a partial,
 * possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);

	if ((iocb->ki_flags & IOCB_DIRECT) ||
	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
		return netfs_unbuffered_read_iter(iocb, iter);

	return netfs_buffered_read_iter(iocb, iter);
}
EXPORT_SYMBOL(netfs_file_read_iter);
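
/*
 * Illustrative sketch (not part of this file): netfs_file_read_iter() is
 * intended to be plugged straight into a filesystem's file_operations, so
 * that direct-I/O and unbuffered inodes are routed to
 * netfs_unbuffered_read_iter() and everything else falls through to the
 * buffered path above.  myfs_file_ops and the choice of other methods are
 * hypothetical placeholders:
 *
 *	const struct file_operations myfs_file_ops = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= netfs_file_read_iter,
 *		.write_iter	= netfs_file_write_iter,	// if using netfs writes
 *		.mmap		= generic_file_mmap,
 *		// ...
 *	};
 */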