// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem read subrequest result collection, assessment and
 * retrying.
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"

/*
 * Clear the unread part of an I/O request.
 */
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
	netfs_reset_iter(subreq);
	WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter));
	iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
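	/* If the data that was actually read already runs up to or past the
	 * end of the file, note that we hit the EOF.
	 */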
	if (subreq->start + subreq->transferred >= subreq->rreq->i_size)
		__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
}

/*
 * Flush, mark and unlock a folio that's now completely read. If we want to
 * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it
 * dirty and let writeback handle it.
 */
static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
				    struct netfs_io_request *rreq,
				    struct folio_queue *folioq,
				    int slot)
{
	struct netfs_folio *finfo;
	struct folio *folio = folioq_folio(folioq, slot);

	flush_dcache_folio(folio);
	folio_mark_uptodate(folio);

	if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
		finfo = netfs_folio_info(folio);
		if (finfo) {
			trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
			if (finfo->netfs_group)
				folio_change_private(folio, finfo->netfs_group);
			else
				folio_detach_private(folio);
			kfree(finfo);
		}

		if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
			if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
				trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
				folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
				folio_mark_dirty(folio);
			}
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_read_done);
		}
	} else {
		// TODO: Use of PG_private_2 is deprecated.
		if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
			netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot);
	}

	if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
		if (folio->index == rreq->no_unlock_folio &&
		    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
			_debug("no unlock");
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
			folio_unlock(folio);
		}
	}

	folioq_clear(folioq, slot);
}

/*
 * Unlock any folios that are now completely read. Returns true if the
 * subrequest is removed from the list.
 */
static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async)
{
	struct netfs_io_subrequest *prev, *next;
	struct netfs_io_request *rreq = subreq->rreq;
	struct folio_queue *folioq = subreq->curr_folioq;
	size_t avail, prev_donated, next_donated, fsize, part, excess;
	loff_t fpos, start;
	loff_t fend;
	int slot = subreq->curr_folioq_slot;

	if (WARN(subreq->transferred > subreq->len,
		 "Subreq overread: R%x[%x] %zu > %zu",
		 rreq->debug_id, subreq->debug_index,
		 subreq->transferred, subreq->len))
		subreq->transferred = subreq->len;

next_folio:
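	/* Work out the dimensions of the folio we're currently looking at
	 * from the amount of data consumed so far.
	 */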
	fsize = PAGE_SIZE << subreq->curr_folio_order;
	fpos = round_down(subreq->start + subreq->consumed, fsize);
	fend = fpos + fsize;

	if (WARN_ON_ONCE(!folioq) ||
	    WARN_ON_ONCE(!folioq_folio(folioq, slot)) ||
	    WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) {
		pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n",
		       rreq->debug_id, subreq->debug_index,
		       subreq->start, subreq->start + subreq->transferred - 1,
		       subreq->consumed, subreq->transferred, subreq->len,
		       slot);
		if (folioq) {
			struct folio *folio = folioq_folio(folioq, slot);

			pr_err("folioq: orders=%02x%02x%02x%02x\n",
			       folioq->orders[0], folioq->orders[1],
			       folioq->orders[2], folioq->orders[3]);
			if (folio)
				pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n",
				       fpos, fend - 1, folio_pos(folio), folio_order(folio),
				       folioq_folio_order(folioq, slot));
		}
	}

donation_changed:
	/* Try to consume the current folio if we've hit or passed the end of
	 * it. There's a possibility that this subreq doesn't start at the
	 * beginning of the folio, in which case we need to donate to/from the
	 * preceding subreq.
	 *
	 * We also need to include any potential donation back from the
	 * following subreq.
	 */
	prev_donated = READ_ONCE(subreq->prev_donated);
	next_donated = READ_ONCE(subreq->next_donated);
	if (prev_donated || next_donated) {
		spin_lock_bh(&rreq->lock);
		prev_donated = subreq->prev_donated;
		next_donated = subreq->next_donated;
		subreq->start -= prev_donated;
		subreq->len += prev_donated;
		subreq->transferred += prev_donated;
		prev_donated = subreq->prev_donated = 0;
		if (subreq->transferred == subreq->len) {
			subreq->len += next_donated;
			subreq->transferred += next_donated;
			next_donated = subreq->next_donated = 0;
		}
		trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations);
		spin_unlock_bh(&rreq->lock);
	}

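	/* Work out where the unconsumed data starts and how much of it is
	 * available, taking into account anything donated by the
	 * neighbouring subrequests.
	 */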
	avail = subreq->transferred;
	if (avail == subreq->len)
		avail += next_donated;
	start = subreq->start;
	if (subreq->consumed == 0) {
		start -= prev_donated;
		avail += prev_donated;
	} else {
		start += subreq->consumed;
		avail -= subreq->consumed;
	}
	part = umin(avail, fsize);

	trace_netfs_progress(subreq, start, avail, part);

	if (start + avail >= fend) {
		if (fpos == start) {
			/* Flush, unlock and mark for caching any folio we've just read. */
			subreq->consumed = fend - subreq->start;
			netfs_unlock_read_folio(subreq, rreq, folioq, slot);
			folioq_mark2(folioq, slot);
			if (subreq->consumed >= subreq->len)
				goto remove_subreq;
		} else if (fpos < start) {
			excess = fend - subreq->start;

			spin_lock_bh(&rreq->lock);
			/* If we complete first on a folio split with the
			 * preceding subreq, donate to that subreq - otherwise
			 * we get the responsibility.
			 */
			if (subreq->prev_donated != prev_donated) {
				spin_unlock_bh(&rreq->lock);
				goto donation_changed;
			}

			if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
				spin_unlock_bh(&rreq->lock);
				pr_err("Can't donate prior to front\n");
				goto bad;
			}

			prev = list_prev_entry(subreq, rreq_link);
			WRITE_ONCE(prev->next_donated, prev->next_donated + excess);
			subreq->start += excess;
			subreq->len -= excess;
			subreq->transferred -= excess;
			trace_netfs_donate(rreq, subreq, prev, excess,
					   netfs_trace_donate_tail_to_prev);
			trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);

			if (subreq->consumed >= subreq->len)
				goto remove_subreq_locked;
			spin_unlock_bh(&rreq->lock);
		} else {
			pr_err("fpos > start\n");
			goto bad;
		}

		/* Advance the rolling buffer to the next folio. */
		slot++;
		if (slot >= folioq_nr_slots(folioq)) {
			slot = 0;
			folioq = folioq->next;
			subreq->curr_folioq = folioq;
		}
		subreq->curr_folioq_slot = slot;
		if (folioq && folioq_folio(folioq, slot))
			subreq->curr_folio_order = folioq->orders[slot];
		if (!was_async)
			cond_resched();
		goto next_folio;
	}

	/* Deal with partial progress. */
	if (subreq->transferred < subreq->len)
		return false;

	/* Donate the remaining downloaded data to one of the neighbouring
	 * subrequests. Note that we may race with them doing the same thing.
	 */
	spin_lock_bh(&rreq->lock);

	if (subreq->prev_donated != prev_donated ||
	    subreq->next_donated != next_donated) {
		spin_unlock_bh(&rreq->lock);
		cond_resched();
		goto donation_changed;
	}

	/* Deal with the trickiest case: that this subreq is in the middle of a
	 * folio, not touching either edge, but finishes first. In such a
	 * case, we donate to the previous subreq, if there is one, so that the
	 * donation is only handled when that completes - and remove this
	 * subreq from the list.
	 *
	 * If the previous subreq finished first, we will have acquired their
	 * donation and should be able to unlock folios and/or donate nextwards.
	 */
	if (!subreq->consumed &&
	    !prev_donated &&
	    !list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
		prev = list_prev_entry(subreq, rreq_link);
		WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len);
		subreq->start += subreq->len;
		subreq->len = 0;
		subreq->transferred = 0;
		trace_netfs_donate(rreq, subreq, prev, subreq->len,
				   netfs_trace_donate_to_prev);
		trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
		goto remove_subreq_locked;
	}

	/* If we can't donate down the chain, donate up the chain instead. */
	excess = subreq->len - subreq->consumed + next_donated;

	if (!subreq->consumed)
		excess += prev_donated;

	if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
		rreq->prev_donated = excess;
		trace_netfs_donate(rreq, subreq, NULL, excess,
				   netfs_trace_donate_to_deferred_next);
	} else {
		next = list_next_entry(subreq, rreq_link);
		WRITE_ONCE(next->prev_donated, excess);
		trace_netfs_donate(rreq, subreq, next, excess,
				   netfs_trace_donate_to_next);
	}
	trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next);
	subreq->len = subreq->consumed;
	subreq->transferred = subreq->consumed;
	goto remove_subreq_locked;

remove_subreq:
	spin_lock_bh(&rreq->lock);
remove_subreq_locked:
	subreq->consumed = subreq->len;
	list_del(&subreq->rreq_link);
	spin_unlock_bh(&rreq->lock);
	netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed);
	return true;

bad:
	/* Errr... prev and next both donated to us, but insufficient to finish
	 * the folio.
	 */
	printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n",
	       rreq->debug_id, subreq->debug_index,
	       subreq->start, subreq->start + subreq->transferred - 1,
	       subreq->consumed, subreq->transferred, subreq->len);
	printk("folio: %llx-%llx\n", fpos, fend - 1);
	printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated);
	printk("s=%llx av=%zx part=%zx\n", start, avail, part);
	BUG();
}

/*
 * Do page flushing and suchlike after DIO.
 */
static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;
	unsigned int i;

	/* Collect unbuffered reads and direct reads, adding up the transfer
	 * sizes until we find the first short or failed subrequest.
	 */
	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
		rreq->transferred += subreq->transferred;

		if (subreq->transferred < subreq->len ||
		    test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
			rreq->error = subreq->error;
			break;
		}
	}

	if (rreq->origin == NETFS_DIO_READ) {
		for (i = 0; i < rreq->direct_bv_count; i++) {
			flush_dcache_page(rreq->direct_bv[i].bv_page);
			// TODO: cifs marks pages in the destination buffer
			// dirty under some circumstances after a read. Do we
			// need to do that too?
			set_page_dirty(rreq->direct_bv[i].bv_page);
		}
	}

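	/* If the read was driven by an iocb, update the file position and
	 * invoke the completion handler, if there is one.
	 */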
	if (rreq->iocb) {
		rreq->iocb->ki_pos += rreq->transferred;
		if (rreq->iocb->ki_complete)
			rreq->iocb->ki_complete(
				rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
	}
	if (rreq->netfs_ops->done)
		rreq->netfs_ops->done(rreq);
	if (rreq->origin == NETFS_DIO_READ)
		inode_dio_end(rreq->inode);
}

/*
 * Assess the state of a read request and decide what to do next.
 *
 * Note that we're in normal kernel thread context at this point, possibly
 * running on a workqueue.
 */
static void netfs_rreq_assess(struct netfs_io_request *rreq)
{
	trace_netfs_rreq(rreq, netfs_rreq_trace_assess);

	//netfs_rreq_is_still_valid(rreq);

	if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) {
		netfs_retry_reads(rreq);
		return;
	}

	if (rreq->origin == NETFS_DIO_READ ||
	    rreq->origin == NETFS_READ_GAPS)
		netfs_rreq_assess_dio(rreq);
	task_io_account_read(rreq->transferred);

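	/* Mark the request as no longer being in progress and wake up anyone
	 * waiting for it to complete.
	 */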
	trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
	wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);

	trace_netfs_rreq(rreq, netfs_rreq_trace_done);
	netfs_clear_subrequests(rreq, false);
	netfs_unlock_abandoned_read_pages(rreq);
	if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)))
		netfs_pgpriv2_write_to_the_cache(rreq);
}

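/*
 * Work item to which assessment of a read request is deferred when the final
 * subrequest terminates in a context where the assessment can't be done
 * directly (see netfs_rreq_terminated() below).
 */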
void netfs_read_termination_worker(struct work_struct *work)
{
	struct netfs_io_request *rreq =
		container_of(work, struct netfs_io_request, work);
	netfs_see_request(rreq, netfs_rreq_trace_see_work);
	netfs_rreq_assess(rreq);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete);
}

/*
 * Handle the completion of all outstanding I/O operations on a read request.
 * We inherit a ref from the caller.
 */
void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async)
{
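	/* If we're in a context in which we can sleep, do the assessment
	 * directly; otherwise defer it to the workqueue.
	 */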
	if (!was_async)
		return netfs_rreq_assess(rreq);
	if (!work_pending(&rreq->work)) {
		netfs_get_request(rreq, netfs_rreq_trace_get_work);
		if (!queue_work(system_unbound_wq, &rreq->work))
			netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq);
	}
}

/**
 * netfs_read_subreq_progress - Note progress of a read operation.
 * @subreq: The read subrequest that has made progress.
 * @was_async: True if we're in an asynchronous context.
 *
 * This tells the read side of netfs lib that a contributory I/O operation has
 * made some progress and that it may be possible to unlock some folios.
 *
 * Before calling, the filesystem should update subreq->transferred to track
 * the amount of data copied into the output buffer.
 *
 * If @was_async is true, the caller might be running in softirq or interrupt
 * context and we can't sleep.
 */
void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq,
				bool was_async)
{
	struct netfs_io_request *rreq = subreq->rreq;

	trace_netfs_sreq(subreq, netfs_sreq_trace_progress);

	if (subreq->transferred > subreq->consumed &&
	    (rreq->origin == NETFS_READAHEAD ||
	     rreq->origin == NETFS_READPAGE ||
	     rreq->origin == NETFS_READ_FOR_WRITE)) {
		netfs_consume_read_data(subreq, was_async);
		__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
	}
}
EXPORT_SYMBOL(netfs_read_subreq_progress);

/**
 * netfs_read_subreq_terminated - Note the termination of an I/O operation.
 * @subreq: The I/O request that has terminated.
 * @error: Error code indicating type of completion.
 * @was_async: The termination was asynchronous
 *
 * This tells the read helper that a contributory I/O operation has terminated,
 * one way or another, and that it should integrate the results.
 *
 * The caller indicates the outcome of the operation through @error, supplying
 * 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY
 * is set) or a negative error code. The helper will look after reissuing I/O
 * operations as appropriate and writing downloaded data to the cache.
 *
 * Before calling, the filesystem should update subreq->transferred to track
 * the amount of data copied into the output buffer.
 *
 * If @was_async is true, the caller might be running in softirq or interrupt
 * context and we can't sleep.
 */
void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
				  int error, bool was_async)
{
	struct netfs_io_request *rreq = subreq->rreq;

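	/* Account the termination in the statistics according to the source
	 * of the data.
	 */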
	switch (subreq->source) {
	case NETFS_READ_FROM_CACHE:
		netfs_stat(&netfs_n_rh_read_done);
		break;
	case NETFS_DOWNLOAD_FROM_SERVER:
		netfs_stat(&netfs_n_rh_download_done);
		break;
	default:
		break;
	}

	if (rreq->origin != NETFS_DIO_READ) {
		/* Collect buffered reads.
		 *
		 * If the read completed validly short, then we can clear the
		 * tail before going on to unlock the folios.
		 */
		if (error == 0 && subreq->transferred < subreq->len &&
		    (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) ||
		     test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) {
			netfs_clear_unread(subreq);
			subreq->transferred = subreq->len;
			trace_netfs_sreq(subreq, netfs_sreq_trace_clear);
		}
		if (subreq->transferred > subreq->consumed &&
		    (rreq->origin == NETFS_READAHEAD ||
		     rreq->origin == NETFS_READPAGE ||
		     rreq->origin == NETFS_READ_FOR_WRITE)) {
			netfs_consume_read_data(subreq, was_async);
			__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
		}
		rreq->transferred += subreq->transferred;
	}

	/* Deal with retry requests, short reads and errors. If we retry
	 * but don't make progress, we abandon the attempt.
	 */
	if (!error && subreq->transferred < subreq->len) {
		if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) {
			trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof);
		} else {
			trace_netfs_sreq(subreq, netfs_sreq_trace_short);
			if (subreq->transferred > subreq->consumed) {
				__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
				__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
				set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
			} else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
				__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
				set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
			} else {
				__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
				error = -ENODATA;
			}
		}
	}

	subreq->error = error;
	trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);

	if (unlikely(error < 0)) {
		trace_netfs_failure(rreq, subreq, error, netfs_fail_read);
		if (subreq->source == NETFS_READ_FROM_CACHE) {
			netfs_stat(&netfs_n_rh_read_failed);
		} else {
			netfs_stat(&netfs_n_rh_download_failed);
			set_bit(NETFS_RREQ_FAILED, &rreq->flags);
			rreq->error = subreq->error;
		}
	}

	if (atomic_dec_and_test(&rreq->nr_outstanding))
		netfs_rreq_terminated(rreq, was_async);

	netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_read_subreq_terminated);