// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem read subrequest result collection, assessment and
 * retrying.
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"

/*
 * Clear the unread part of an I/O request.
 */
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
	netfs_reset_iter(subreq);
	WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter));
	iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
	if (subreq->start + subreq->transferred >= subreq->rreq->i_size)
		__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
}

/*
 * Flush, mark and unlock a folio that's now completely read. If we want to
 * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it
 * dirty and let writeback handle it.
 */
static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
				    struct netfs_io_request *rreq,
				    struct folio_queue *folioq,
				    int slot)
{
	struct netfs_folio *finfo;
	struct folio *folio = folioq_folio(folioq, slot);

	flush_dcache_folio(folio);
	folio_mark_uptodate(folio);

	if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
		finfo = netfs_folio_info(folio);
		if (finfo) {
			trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
			if (finfo->netfs_group)
				folio_change_private(folio, finfo->netfs_group);
			else
				folio_detach_private(folio);
			kfree(finfo);
		}

		if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
			if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
				trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
				folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
				folio_mark_dirty(folio);
			}
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_read_done);
		}

		folioq_clear(folioq, slot);
	} else {
		// TODO: Use of PG_private_2 is deprecated.
		if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
			netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot);
		else
			folioq_clear(folioq, slot);
	}

	if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
		if (folio->index == rreq->no_unlock_folio &&
		    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
			_debug("no unlock");
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
			folio_unlock(folio);
		}
	}
}

/*
 * Unlock any folios that are now completely read. Returns true if the
 * subrequest is removed from the list.
 */
static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async)
{
	struct netfs_io_subrequest *prev, *next;
	struct netfs_io_request *rreq = subreq->rreq;
	struct folio_queue *folioq = subreq->curr_folioq;
	size_t avail, prev_donated, next_donated, fsize, part, excess;
	loff_t fpos, start;
	loff_t fend;
	int slot = subreq->curr_folioq_slot;

	if (WARN(subreq->transferred > subreq->len,
		 "Subreq overread: R%x[%x] %zu > %zu",
		 rreq->debug_id, subreq->debug_index,
		 subreq->transferred, subreq->len))
		subreq->transferred = subreq->len;

next_folio:
	fsize = PAGE_SIZE << subreq->curr_folio_order;
	fpos = round_down(subreq->start + subreq->consumed, fsize);
	fend = fpos + fsize;

	if (WARN_ON_ONCE(!folioq) ||
	    WARN_ON_ONCE(!folioq_folio(folioq, slot)) ||
	    WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) {
		pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n",
		       rreq->debug_id, subreq->debug_index,
		       subreq->start, subreq->start + subreq->transferred - 1,
		       subreq->consumed, subreq->transferred, subreq->len,
		       slot);
		if (folioq) {
			struct folio *folio = folioq_folio(folioq, slot);

			pr_err("folioq: orders=%02x%02x%02x%02x\n",
			       folioq->orders[0], folioq->orders[1],
			       folioq->orders[2], folioq->orders[3]);
			if (folio)
				pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n",
				       fpos, fend - 1, folio_pos(folio), folio_order(folio),
				       folioq_folio_order(folioq, slot));
		}
	}

donation_changed:
	/* Try to consume the current folio if we've hit or passed the end of
	 * it. There's a possibility that this subreq doesn't start at the
	 * beginning of the folio, in which case we need to donate to/from the
	 * preceding subreq.
	 *
	 * We also need to include any potential donation back from the
	 * following subreq.
	 */
	prev_donated = READ_ONCE(subreq->prev_donated);
	next_donated = READ_ONCE(subreq->next_donated);
	if (prev_donated || next_donated) {
		spin_lock_bh(&rreq->lock);
		prev_donated = subreq->prev_donated;
		next_donated = subreq->next_donated;
		subreq->start -= prev_donated;
		subreq->len += prev_donated;
		subreq->transferred += prev_donated;
		prev_donated = subreq->prev_donated = 0;
		if (subreq->transferred == subreq->len) {
			subreq->len += next_donated;
			subreq->transferred += next_donated;
			next_donated = subreq->next_donated = 0;
		}
		trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations);
		spin_unlock_bh(&rreq->lock);
	}

	avail = subreq->transferred;
	if (avail == subreq->len)
		avail += next_donated;
	start = subreq->start;
	if (subreq->consumed == 0) {
		start -= prev_donated;
		avail += prev_donated;
	} else {
		start += subreq->consumed;
		avail -= subreq->consumed;
	}
	part = umin(avail, fsize);

	trace_netfs_progress(subreq, start, avail, part);

	if (start + avail >= fend) {
		if (fpos == start) {
			/* Flush, unlock and mark for caching any folio we've just read. */
			subreq->consumed = fend - subreq->start;
			netfs_unlock_read_folio(subreq, rreq, folioq, slot);
			folioq_mark2(folioq, slot);
			if (subreq->consumed >= subreq->len)
				goto remove_subreq;
		} else if (fpos < start) {
			excess = fend - subreq->start;

			spin_lock_bh(&rreq->lock);
			/* If we complete first on a folio split with the
			 * preceding subreq, donate to that subreq - otherwise
			 * we get the responsibility.
			 */
			if (subreq->prev_donated != prev_donated) {
				spin_unlock_bh(&rreq->lock);
				goto donation_changed;
			}

			if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
				spin_unlock_bh(&rreq->lock);
				pr_err("Can't donate prior to front\n");
				goto bad;
			}

			prev = list_prev_entry(subreq, rreq_link);
			WRITE_ONCE(prev->next_donated, prev->next_donated + excess);
			subreq->start += excess;
			subreq->len -= excess;
			subreq->transferred -= excess;
			trace_netfs_donate(rreq, subreq, prev, excess,
					   netfs_trace_donate_tail_to_prev);
			trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);

			if (subreq->consumed >= subreq->len)
				goto remove_subreq_locked;
			spin_unlock_bh(&rreq->lock);
		} else {
			pr_err("fpos > start\n");
			goto bad;
		}

		/* Advance the rolling buffer to the next folio. */
		slot++;
		if (slot >= folioq_nr_slots(folioq)) {
			slot = 0;
			folioq = folioq->next;
			subreq->curr_folioq = folioq;
		}
		subreq->curr_folioq_slot = slot;
		if (folioq && folioq_folio(folioq, slot))
			subreq->curr_folio_order = folioq->orders[slot];
		if (!was_async)
			cond_resched();
		goto next_folio;
	}

	/* Deal with partial progress. */
	if (subreq->transferred < subreq->len)
		return false;

	/* Donate the remaining downloaded data to one of the neighbouring
	 * subrequests. Note that we may race with them doing the same thing.
	 */
	spin_lock_bh(&rreq->lock);

	if (subreq->prev_donated != prev_donated ||
	    subreq->next_donated != next_donated) {
		spin_unlock_bh(&rreq->lock);
		cond_resched();
		goto donation_changed;
	}

	/* Deal with the trickiest case: that this subreq is in the middle of a
	 * folio, not touching either edge, but finishes first. In such a
	 * case, we donate to the previous subreq, if there is one and if it is
	 * contiguous, so that the donation is only handled when that completes
	 * - and remove this subreq from the list.
	 *
	 * If the previous subreq finished first, we will have acquired their
	 * donation and should be able to unlock folios and/or donate nextwards.
	 */
	if (!subreq->consumed &&
	    !prev_donated &&
	    !list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
		prev = list_prev_entry(subreq, rreq_link);
		if (subreq->start == prev->start + prev->len) {
			WRITE_ONCE(prev->next_donated,
				   prev->next_donated + subreq->len);
			trace_netfs_donate(rreq, subreq, prev, subreq->len,
					   netfs_trace_donate_to_prev);
			trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
			subreq->start += subreq->len;
			subreq->len = 0;
			subreq->transferred = 0;
			goto remove_subreq_locked;
		}
	}

	/* If we can't donate down the chain, donate up the chain instead. */
	excess = subreq->len - subreq->consumed + next_donated;

	if (!subreq->consumed)
		excess += prev_donated;

	if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
		rreq->prev_donated = excess;
		trace_netfs_donate(rreq, subreq, NULL, excess,
				   netfs_trace_donate_to_deferred_next);
	} else {
		next = list_next_entry(subreq, rreq_link);
		WRITE_ONCE(next->prev_donated, excess);
		trace_netfs_donate(rreq, subreq, next, excess,
				   netfs_trace_donate_to_next);
	}
	trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next);
	subreq->len = subreq->consumed;
	subreq->transferred = subreq->consumed;
	goto remove_subreq_locked;

remove_subreq:
	spin_lock_bh(&rreq->lock);
remove_subreq_locked:
	subreq->consumed = subreq->len;
	list_del(&subreq->rreq_link);
	spin_unlock_bh(&rreq->lock);
	netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed);
	return true;

bad:
	/* Errr... prev and next both donated to us, but insufficient to finish
	 * the folio.
	 */
	printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n",
	       rreq->debug_id, subreq->debug_index,
	       subreq->start, subreq->start + subreq->transferred - 1,
	       subreq->consumed, subreq->transferred, subreq->len);
	printk("folio: %llx-%llx\n", fpos, fend - 1);
	printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated);
	printk("s=%llx av=%zx part=%zx\n", start, avail, part);
	BUG();
}

/*
 * Do page flushing and suchlike after DIO.
 */
static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;
	unsigned int i;

	/* Collect unbuffered reads and direct reads, adding up the transfer
	 * sizes until we find the first short or failed subrequest.
	 */
	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
		rreq->transferred += subreq->transferred;

		if (subreq->transferred < subreq->len ||
		    test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
			rreq->error = subreq->error;
			break;
		}
	}

	if (rreq->origin == NETFS_DIO_READ) {
		for (i = 0; i < rreq->direct_bv_count; i++) {
			flush_dcache_page(rreq->direct_bv[i].bv_page);
			// TODO: cifs marks pages in the destination buffer
			// dirty under some circumstances after a read. Do we
			// need to do that too?
			set_page_dirty(rreq->direct_bv[i].bv_page);
		}
	}

	if (rreq->iocb) {
		rreq->iocb->ki_pos += rreq->transferred;
		if (rreq->iocb->ki_complete)
			rreq->iocb->ki_complete(
				rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
	}
	if (rreq->netfs_ops->done)
		rreq->netfs_ops->done(rreq);
	if (rreq->origin == NETFS_DIO_READ)
		inode_dio_end(rreq->inode);
}

/*
 * Assess the state of a read request and decide what to do next.
 *
 * Note that we're in normal kernel thread context at this point, possibly
 * running on a workqueue.
 */
static void netfs_rreq_assess(struct netfs_io_request *rreq)
{
	trace_netfs_rreq(rreq, netfs_rreq_trace_assess);

	//netfs_rreq_is_still_valid(rreq);

	if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) {
		netfs_retry_reads(rreq);
		return;
	}

	if (rreq->origin == NETFS_DIO_READ ||
	    rreq->origin == NETFS_READ_GAPS)
		netfs_rreq_assess_dio(rreq);
	task_io_account_read(rreq->transferred);

	trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
	clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);

	trace_netfs_rreq(rreq, netfs_rreq_trace_done);
	netfs_clear_subrequests(rreq, false);
	netfs_unlock_abandoned_read_pages(rreq);
	if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)))
		netfs_pgpriv2_write_to_the_cache(rreq);
}

void netfs_read_termination_worker(struct work_struct *work)
{
	struct netfs_io_request *rreq =
		container_of(work, struct netfs_io_request, work);

	netfs_see_request(rreq, netfs_rreq_trace_see_work);
	netfs_rreq_assess(rreq);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete);
}

/*
 * Handle the completion of all outstanding I/O operations on a read request.
 * We inherit a ref from the caller.
 */
void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async)
{
	if (!was_async)
		return netfs_rreq_assess(rreq);
	if (!work_pending(&rreq->work)) {
		netfs_get_request(rreq, netfs_rreq_trace_get_work);
		if (!queue_work(system_unbound_wq, &rreq->work))
			netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq);
	}
}

/**
 * netfs_read_subreq_progress - Note progress of a read operation.
 * @subreq: The read subrequest that has made progress.
 * @was_async: True if we're in an asynchronous context.
 *
 * This tells the read side of the netfs library that a contributory I/O
 * operation has made some progress and that it may be possible to unlock some
 * folios.
 *
 * Before calling, the filesystem should update subreq->transferred to track
 * the amount of data copied into the output buffer.
 *
 * If @was_async is true, the caller might be running in softirq or interrupt
 * context and we can't sleep.
 */
void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq,
				bool was_async)
{
	struct netfs_io_request *rreq = subreq->rreq;

	trace_netfs_sreq(subreq, netfs_sreq_trace_progress);

	if (subreq->transferred > subreq->consumed &&
	    (rreq->origin == NETFS_READAHEAD ||
	     rreq->origin == NETFS_READPAGE ||
	     rreq->origin == NETFS_READ_FOR_WRITE)) {
		netfs_consume_read_data(subreq, was_async);
		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
	}
}
EXPORT_SYMBOL(netfs_read_subreq_progress);

/**
 * netfs_read_subreq_terminated - Note the termination of an I/O operation.
 * @subreq: The I/O subrequest that has terminated.
 * @error: Error code indicating type of completion.
 * @was_async: The termination was asynchronous.
 *
 * This tells the read helper that a contributory I/O operation has terminated,
 * one way or another, and that it should integrate the results.
 *
 * The caller indicates the outcome of the operation through @error, supplying
 * 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY
 * is set) or a negative error code.  The helper will look after reissuing I/O
 * operations as appropriate and writing downloaded data to the cache.
 *
 * Before calling, the filesystem should update subreq->transferred to track
 * the amount of data copied into the output buffer.
 *
 * If @was_async is true, the caller might be running in softirq or interrupt
 * context and we can't sleep.
 */
void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
				  int error, bool was_async)
{
	struct netfs_io_request *rreq = subreq->rreq;

	switch (subreq->source) {
	case NETFS_READ_FROM_CACHE:
		netfs_stat(&netfs_n_rh_read_done);
		break;
	case NETFS_DOWNLOAD_FROM_SERVER:
		netfs_stat(&netfs_n_rh_download_done);
		break;
	default:
		break;
	}

	if (rreq->origin != NETFS_DIO_READ) {
		/* Collect buffered reads.
		 *
		 * If the read completed validly short, then we can clear the
		 * tail before going on to unlock the folios.
		 */
		if (error == 0 && subreq->transferred < subreq->len &&
		    (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) ||
		     test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) {
			netfs_clear_unread(subreq);
			subreq->transferred = subreq->len;
			trace_netfs_sreq(subreq, netfs_sreq_trace_clear);
		}
		if (subreq->transferred > subreq->consumed &&
		    (rreq->origin == NETFS_READAHEAD ||
		     rreq->origin == NETFS_READPAGE ||
		     rreq->origin == NETFS_READ_FOR_WRITE)) {
			netfs_consume_read_data(subreq, was_async);
			__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
		}
		rreq->transferred += subreq->transferred;
	}

	/* Deal with retry requests, short reads and errors. If we retry
	 * but don't make progress, we abandon the attempt.
	 */
	if (!error && subreq->transferred < subreq->len) {
		if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) {
			trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof);
		} else {
			trace_netfs_sreq(subreq, netfs_sreq_trace_short);
			if (subreq->transferred > subreq->consumed) {
				/* If we didn't read new data, abandon retry. */
				if (subreq->retry_count &&
				    test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) {
					__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
					set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
				}
			} else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) {
				__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
				set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
			} else {
				__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
				error = -ENODATA;
			}
		}
	}

	subreq->error = error;
	trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);

	if (unlikely(error < 0)) {
		trace_netfs_failure(rreq, subreq, error, netfs_fail_read);
		if (subreq->source == NETFS_READ_FROM_CACHE) {
			netfs_stat(&netfs_n_rh_read_failed);
		} else {
			netfs_stat(&netfs_n_rh_download_failed);
			set_bit(NETFS_RREQ_FAILED, &rreq->flags);
			rreq->error = subreq->error;
		}
	}

	if (atomic_dec_and_test(&rreq->nr_outstanding))
		netfs_rreq_terminated(rreq, was_async);

	netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_read_subreq_terminated);
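
/*
 * Usage sketch (illustrative only, not part of this file's logic): a network
 * filesystem's transport completion handler would typically account the bytes
 * it copied into the subrequest buffer and then hand the result back to
 * netfslib.  The myfs_* naming below is hypothetical; only the
 * netfs_read_subreq_*() calls are real API:
 *
 *	static void myfs_read_done(struct netfs_io_subrequest *subreq,
 *				   ssize_t result, bool was_async)
 *	{
 *		if (result > 0)
 *			subreq->transferred += result;
 *		netfs_read_subreq_terminated(subreq,
 *					     result < 0 ? result : 0, was_async);
 *	}
 *
 * netfs_read_subreq_progress() may additionally be called as data trickles
 * in, before the final termination, to let folios be unlocked early.
 */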