// SPDX-License-Identifier: GPL-2.0-or-later
/* Network filesystem high-level buffered read support.
 *
 * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"

/*
 * Unlock the folios in a read operation. We need to set PG_writeback on any
 * folios we're going to write back before we unlock them.
 *
 * Note that if the deprecated NETFS_RREQ_USE_PGPRIV2 is set then we use
 * PG_private_2 and do a direct write to the cache from here instead.
 */
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
        struct netfs_io_subrequest *subreq;
        struct netfs_folio *finfo;
        struct folio *folio;
        pgoff_t start_page = rreq->start / PAGE_SIZE;
        pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
        size_t account = 0;
        bool subreq_failed = false;

        XA_STATE(xas, &rreq->mapping->i_pages, start_page);

        if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
                __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
                list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
                        __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
                }
        }

        /* Walk through the pagecache and the I/O request lists simultaneously.
         * We may have a mixture of cached and uncached sections and we only
         * really want to write out the uncached sections. This is slightly
         * complicated by the possibility that we might have huge pages with a
         * mixture inside.
         */
        subreq = list_first_entry(&rreq->subrequests,
                                  struct netfs_io_subrequest, rreq_link);
        subreq_failed = (subreq->error < 0);

        trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);

        rcu_read_lock();
        xas_for_each(&xas, folio, last_page) {
                loff_t pg_end;
                bool pg_failed = false;
                bool wback_to_cache = false;
                bool folio_started = false;

                if (xas_retry(&xas, folio))
                        continue;

                pg_end = folio_pos(folio) + folio_size(folio) - 1;

                for (;;) {
                        loff_t sreq_end;

                        if (!subreq) {
                                pg_failed = true;
                                break;
                        }
                        if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
                                if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE,
                                                               &subreq->flags)) {
                                        trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
                                        folio_start_private_2(folio);
                                        folio_started = true;
                                }
                        } else {
                                wback_to_cache |=
                                        test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
                        }
                        pg_failed |= subreq_failed;
                        sreq_end = subreq->start + subreq->len - 1;
                        if (pg_end < sreq_end)
                                break;

                        account += subreq->transferred;
                        if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
                                subreq = list_next_entry(subreq, rreq_link);
                                subreq_failed = (subreq->error < 0);
                        } else {
                                subreq = NULL;
                                subreq_failed = false;
                        }

                        if (pg_end == sreq_end)
                                break;
                }

                if (!pg_failed) {
                        flush_dcache_folio(folio);
                        finfo = netfs_folio_info(folio);
                        if (finfo) {
                                trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
                                if (finfo->netfs_group)
                                        folio_change_private(folio, finfo->netfs_group);
                                else
                                        folio_detach_private(folio);
                                kfree(finfo);
                        }
                        folio_mark_uptodate(folio);
                        if (wback_to_cache && !WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
                                trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
                                folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
                                filemap_dirty_folio(folio->mapping, folio);
                        }
                }

                if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
                        if (folio->index == rreq->no_unlock_folio &&
                            test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
                                _debug("no unlock");
                        else
                                folio_unlock(folio);
                }
        }
        rcu_read_unlock();

        task_io_account_read(account);
        if (rreq->netfs_ops->done)
                rreq->netfs_ops->done(rreq);
}

static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
                                         unsigned long long *_start,
                                         unsigned long long *_len,
                                         unsigned long long i_size)
{
        struct netfs_cache_resources *cres = &rreq->cache_resources;

        if (cres->ops && cres->ops->expand_readahead)
                cres->ops->expand_readahead(cres, _start, _len, i_size);
}

static void netfs_rreq_expand(struct netfs_io_request *rreq,
                              struct readahead_control *ractl)
{
        /* Give the cache a chance to change the request parameters. The
         * resultant request must contain the original region.
         */
        netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);

        /* Give the netfs a chance to change the request parameters. The
         * resultant request must contain the original region.
         */
        if (rreq->netfs_ops->expand_readahead)
                rreq->netfs_ops->expand_readahead(rreq);

        /* Expand the request if the cache wants it to start earlier. Note
         * that the expansion may get further extended if the VM wishes to
         * insert THPs and the preferred start and/or end wind up in the middle
         * of THPs.
         *
         * If this is the case, however, the THP size should be an integer
         * multiple of the cache granule size, so we get a whole number of
         * granules to deal with.
         */
        if (rreq->start != readahead_pos(ractl) ||
            rreq->len != readahead_length(ractl)) {
                readahead_expand(ractl, rreq->start, rreq->len);
                rreq->start = readahead_pos(ractl);
                rreq->len = readahead_length(ractl);

                trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
                                 netfs_read_trace_expanded);
        }
}
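
/*
 * Illustrative sketch (hypothetical, not an in-tree example): the
 * ->expand_readahead() hook invoked by netfs_rreq_expand() above lets the
 * filesystem widen the request before I/O is issued, as long as the result
 * still contains the original region.  One plausible implementation would
 * simply round the request out to a preferred wire I/O size - the 256KiB
 * granule below is an assumed example value:
 *
 *	static void example_expand_readahead(struct netfs_io_request *rreq)
 *	{
 *		unsigned long long granule = 256 * 1024;
 *		unsigned long long start = round_down(rreq->start, granule);
 *		unsigned long long end = round_up(rreq->start + rreq->len, granule);
 *
 *		rreq->start = start;
 *		rreq->len = end - start;
 *	}
 */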

/*
 * Begin an operation, and fetch the stored zero point value from the cookie if
 * available.
 */
static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
{
        return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
}

/**
 * netfs_readahead - Helper to manage a read request
 * @ractl: The description of the readahead request
 *
 * Fulfil a readahead request by drawing data from the cache if possible, or
 * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O
 * requests from different sources will get munged together. If necessary, the
 * readahead window can be expanded in either direction to a more convenient
 * alignment for RPC efficiency or to make storage in the cache feasible.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
void netfs_readahead(struct readahead_control *ractl)
{
        struct netfs_io_request *rreq;
        struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
        int ret;

        _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));

        if (readahead_count(ractl) == 0)
                return;

        rreq = netfs_alloc_request(ractl->mapping, ractl->file,
                                   readahead_pos(ractl),
                                   readahead_length(ractl),
                                   NETFS_READAHEAD);
        if (IS_ERR(rreq))
                return;

        ret = netfs_begin_cache_read(rreq, ctx);
        if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
                goto cleanup_free;

        netfs_stat(&netfs_n_rh_readahead);
        trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
                         netfs_read_trace_readahead);

        netfs_rreq_expand(rreq, ractl);

        /* Set up the output buffer */
        iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
                        rreq->start, rreq->len);

        /* Drop the refs on the folios here rather than in the cache or
         * filesystem. The locks will be dropped in netfs_rreq_unlock().
         */
        while (readahead_folio(ractl))
                ;

        netfs_begin_read(rreq, false);
        netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
        return;

cleanup_free:
        netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
        return;
}
EXPORT_SYMBOL(netfs_readahead);

/**
 * netfs_read_folio - Helper to manage a read_folio request
 * @file: The file to read from
 * @folio: The folio to read
 *
 * Fulfil a read_folio request by drawing data from the cache if
 * possible, or the netfs if not. Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_read_folio(struct file *file, struct folio *folio)
{
        struct address_space *mapping = folio->mapping;
        struct netfs_io_request *rreq;
        struct netfs_inode *ctx = netfs_inode(mapping->host);
        struct folio *sink = NULL;
        int ret;

        _enter("%lx", folio->index);

        rreq = netfs_alloc_request(mapping, file,
                                   folio_file_pos(folio), folio_size(folio),
                                   NETFS_READPAGE);
        if (IS_ERR(rreq)) {
                ret = PTR_ERR(rreq);
                goto alloc_error;
        }

        ret = netfs_begin_cache_read(rreq, ctx);
        if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
                goto discard;

        netfs_stat(&netfs_n_rh_read_folio);
        trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);

        /* Set up the output buffer */
        if (folio_test_dirty(folio)) {
                /* Handle someone trying to read from an unflushed streaming
                 * write. We fiddle the buffer so that a gap at the beginning
                 * and/or a gap at the end get copied to, but the middle is
                 * discarded.
                 */
                struct netfs_folio *finfo = netfs_folio_info(folio);
                struct bio_vec *bvec;
                unsigned int from = finfo->dirty_offset;
                unsigned int to = from + finfo->dirty_len;
                unsigned int off = 0, i = 0;
                size_t flen = folio_size(folio);
                size_t nr_bvec = flen / PAGE_SIZE + 2;
                size_t part;

                ret = -ENOMEM;
                bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
                if (!bvec)
                        goto discard;

                sink = folio_alloc(GFP_KERNEL, 0);
                if (!sink)
                        goto discard;

                trace_netfs_folio(folio, netfs_folio_trace_read_gaps);

                rreq->direct_bv = bvec;
                rreq->direct_bv_count = nr_bvec;
                if (from > 0) {
                        bvec_set_folio(&bvec[i++], folio, from, 0);
                        off = from;
                }
                while (off < to) {
                        part = min_t(size_t, to - off, PAGE_SIZE);
                        bvec_set_folio(&bvec[i++], sink, part, 0);
                        off += part;
                }
                if (to < flen)
                        bvec_set_folio(&bvec[i++], folio, flen - to, to);
                iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
        } else {
                iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
                                rreq->start, rreq->len);
        }

        ret = netfs_begin_read(rreq, true);
        if (sink)
                folio_put(sink);
        netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
        return ret < 0 ? ret : 0;

discard:
        netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
alloc_error:
        folio_unlock(folio);
        return ret;
}
EXPORT_SYMBOL(netfs_read_folio);
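
/*
 * Illustrative sketch (hypothetical, not an in-tree user): netfs_readahead()
 * and netfs_read_folio() are written so that a network filesystem can point
 * its address_space_operations straight at them, along the lines of the
 * structure below.  The remaining address_space ops are omitted here:
 *
 *	const struct address_space_operations example_netfs_aops = {
 *		.read_folio	= netfs_read_folio,
 *		.readahead	= netfs_readahead,
 *	};
 */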

/*
 * Prepare a folio for writing without reading first
 * @folio: The folio being prepared
 * @pos: starting position for the write
 * @len: length of write
 * @always_fill: T if the folio should always be completely filled/cleared
 *
 * In some cases, write_begin doesn't need to read at all:
 * - full folio write
 * - write that lies in a folio that is completely beyond EOF
 * - write that covers the folio from start to EOF or beyond it
 *
 * If any of these criteria are met, then zero out the unwritten parts
 * of the folio and return true. Otherwise, return false.
 */
static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
                                  bool always_fill)
{
        struct inode *inode = folio_inode(folio);
        loff_t i_size = i_size_read(inode);
        size_t offset = offset_in_folio(folio, pos);
        size_t plen = folio_size(folio);

        if (unlikely(always_fill)) {
                if (pos - offset + len <= i_size)
                        return false; /* Page entirely before EOF */
                zero_user_segment(&folio->page, 0, plen);
                folio_mark_uptodate(folio);
                return true;
        }

        /* Full folio write */
        if (offset == 0 && len >= plen)
                return true;

        /* Page entirely beyond the end of the file */
        if (pos - offset >= i_size)
                goto zero_out;

        /* Write that covers from the start of the folio to EOF or beyond */
        if (offset == 0 && (pos + len) >= i_size)
                goto zero_out;

        return false;
zero_out:
        zero_user_segments(&folio->page, 0, offset, offset + len, plen);
        return true;
}

/**
 * netfs_write_begin - Helper to prepare for writing
 * @ctx: The netfs context
 * @file: The file to read from
 * @mapping: The mapping to read from
 * @pos: File position at which the write will begin
 * @len: The length of the write (may extend beyond the end of the folio chosen)
 * @_folio: Where to put the resultant folio
 * @_fsdata: Place for the netfs to store a cookie
 *
 * Pre-read data for a write-begin request by drawing data from the cache if
 * possible, or the netfs if not. Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together. If
 * necessary, the readahead window can be expanded in either direction to a
 * more convenient alignment for RPC efficiency or to make storage in the cache
 * feasible.
 *
 * The calling netfs must provide a table of operations, only one of which,
 * issue_op, is mandatory.
 *
 * The check_write_begin() operation can be provided to check for and flush
 * conflicting writes once the folio is grabbed and locked. It is passed a
 * pointer to the fsdata cookie that gets returned to the VM to be passed to
 * write_end. It is permitted to sleep. It should return 0 if the request
 * should go ahead or it may return an error. It may also unlock and put the
 * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
 * will cause the folio to be re-got and the process to be retried.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_write_begin(struct netfs_inode *ctx,
                      struct file *file, struct address_space *mapping,
                      loff_t pos, unsigned int len, struct folio **_folio,
                      void **_fsdata)
{
        struct netfs_io_request *rreq;
        struct folio *folio;
        pgoff_t index = pos >> PAGE_SHIFT;
        int ret;

        DEFINE_READAHEAD(ractl, file, NULL, mapping, index);

retry:
        folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
                                    mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        if (ctx->ops->check_write_begin) {
                /* Allow the netfs (eg. ceph) to flush conflicts. */
                ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
                if (ret < 0) {
                        trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
                        goto error;
                }
                if (!folio)
                        goto retry;
        }

        if (folio_test_uptodate(folio))
                goto have_folio;

        /* If the page is beyond the EOF, we want to clear it - unless it's
         * within the cache granule containing the EOF, in which case we need
         * to preload the granule.
         */
        if (!netfs_is_cache_enabled(ctx) &&
            netfs_skip_folio_read(folio, pos, len, false)) {
                netfs_stat(&netfs_n_rh_write_zskip);
                goto have_folio;
        }

        rreq = netfs_alloc_request(mapping, file,
                                   folio_file_pos(folio), folio_size(folio),
                                   NETFS_READ_FOR_WRITE);
        if (IS_ERR(rreq)) {
                ret = PTR_ERR(rreq);
                goto error;
        }
        rreq->no_unlock_folio = folio->index;
        __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);

        ret = netfs_begin_cache_read(rreq, ctx);
        if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
                goto error_put;

        netfs_stat(&netfs_n_rh_write_begin);
        trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);

        /* Expand the request to meet caching requirements and download
         * preferences.
         */
        ractl._nr_pages = folio_nr_pages(folio);
        netfs_rreq_expand(rreq, &ractl);

        /* Set up the output buffer */
        iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
                        rreq->start, rreq->len);

        /* We hold the folio locks, so we can drop the references */
        folio_get(folio);
        while (readahead_folio(&ractl))
                ;

        ret = netfs_begin_read(rreq, true);
        if (ret < 0)
                goto error;
        netfs_put_request(rreq, false, netfs_rreq_trace_put_return);

have_folio:
        *_folio = folio;
        _leave(" = 0");
        return 0;

error_put:
        netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
error:
        if (folio) {
                folio_unlock(folio);
                folio_put(folio);
        }
        _leave(" = %d", ret);
        return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
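
/*
 * Illustrative sketch (hypothetical helper, not part of this file): a
 * filesystem's buffered write path might use netfs_write_begin() to obtain a
 * locked, suitably pre-read folio before copying user data into it, e.g.:
 *
 *	static int example_prepare_folio(struct file *file, loff_t pos,
 *					 unsigned int len, struct folio **foliop,
 *					 void **fsdata)
 *	{
 *		struct netfs_inode *ictx = netfs_inode(file_inode(file));
 *
 *		return netfs_write_begin(ictx, file, file->f_mapping, pos, len,
 *					 foliop, fsdata);
 *	}
 */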

/*
 * Preload the data into a page we're proposing to write into.
 */
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
                             size_t offset, size_t len)
{
        struct netfs_io_request *rreq;
        struct address_space *mapping = folio->mapping;
        struct netfs_inode *ctx = netfs_inode(mapping->host);
        unsigned long long start = folio_pos(folio);
        size_t flen = folio_size(folio);
        int ret;

        _enter("%zx @%llx", flen, start);

        ret = -ENOMEM;

        rreq = netfs_alloc_request(mapping, file, start, flen,
                                   NETFS_READ_FOR_WRITE);
        if (IS_ERR(rreq)) {
                ret = PTR_ERR(rreq);
                goto error;
        }

        rreq->no_unlock_folio = folio->index;
        __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
        ret = netfs_begin_cache_read(rreq, ctx);
        if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
                goto error_put;

        netfs_stat(&netfs_n_rh_write_begin);
        trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);

        /* Set up the output buffer */
        iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
                        rreq->start, rreq->len);

        ret = netfs_begin_read(rreq, true);
        netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
        return ret;

error_put:
        netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
error:
        _leave(" = %d", ret);
        return ret;
}

/**
 * netfs_buffered_read_iter - Filesystem buffered I/O read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the ->read_iter() routine for all filesystems that can use the page
 * cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
 * returned when no data can be read without waiting for I/O requests to
 * complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
 * shall be made for the read or for readahead. When no data can be read,
 * -EAGAIN shall be returned. When readahead would be triggered, a partial,
 * possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        struct netfs_inode *ictx = netfs_inode(inode);
        ssize_t ret;

        if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
                         test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
                return -EINVAL;

        ret = netfs_start_io_read(inode);
        if (ret == 0) {
                ret = filemap_read(iocb, iter, 0);
                netfs_end_io_read(inode);
        }
        return ret;
}
EXPORT_SYMBOL(netfs_buffered_read_iter);

/**
 * netfs_file_read_iter - Generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the ->read_iter() routine for all filesystems that can use the page
 * cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
 * returned when no data can be read without waiting for I/O requests to
 * complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
 * shall be made for the read or for readahead. When no data can be read,
 * -EAGAIN shall be returned. When readahead would be triggered, a partial,
 * possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);

        if ((iocb->ki_flags & IOCB_DIRECT) ||
            test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
                return netfs_unbuffered_read_iter(iocb, iter);

        return netfs_buffered_read_iter(iocb, iter);
}
EXPORT_SYMBOL(netfs_file_read_iter);
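
/*
 * Illustrative sketch (hypothetical, not an in-tree user): because
 * netfs_file_read_iter() chooses between the buffered and unbuffered read
 * paths itself, a filesystem can typically wire it directly into its
 * file_operations:
 *
 *	const struct file_operations example_netfs_file_ops = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= netfs_file_read_iter,
 *	};
 */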