1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Network filesystem write subrequest result collection, assessment
3 * and retrying.
4 *
5 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
6 * Written by David Howells (dhowells@redhat.com)
7 */
8
9 #include <linux/export.h>
10 #include <linux/fs.h>
11 #include <linux/mm.h>
12 #include <linux/pagemap.h>
13 #include <linux/slab.h>
14 #include "internal.h"
15
/*
 * Notes made in the collector.  These are bit flags that get OR'd together
 * into the "notes" word built up during each collection pass.
 */
#define HIT_PENDING 0x01 /* A front op was still pending */
#define NEED_REASSESS 0x02 /* Need to loop round and reassess */
#define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */
#define NEED_UNLOCK 0x08 /* The pagecache needs unlocking */
#define NEED_RETRY 0x10 /* A front op requests retrying */
#define SAW_FAILURE 0x20 /* A stream has failed or its front op reported a failure */
23
netfs_dump_request(const struct netfs_io_request * rreq)24 static void netfs_dump_request(const struct netfs_io_request *rreq)
25 {
26 pr_err("Request R=%08x r=%d fl=%lx or=%x e=%ld\n",
27 rreq->debug_id, refcount_read(&rreq->ref), rreq->flags,
28 rreq->origin, rreq->error);
29 pr_err(" st=%llx tsl=%zx/%llx/%llx\n",
30 rreq->start, rreq->transferred, rreq->submitted, rreq->len);
31 pr_err(" cci=%llx/%llx/%llx\n",
32 rreq->cleaned_to, rreq->collected_to, atomic64_read(&rreq->issued_to));
33 pr_err(" iw=%pSR\n", rreq->netfs_ops->issue_write);
34 for (int i = 0; i < NR_IO_STREAMS; i++) {
35 const struct netfs_io_subrequest *sreq;
36 const struct netfs_io_stream *s = &rreq->io_streams[i];
37
38 pr_err(" str[%x] s=%x e=%d acnf=%u,%u,%u,%u\n",
39 s->stream_nr, s->source, s->error,
40 s->avail, s->active, s->need_retry, s->failed);
41 pr_err(" str[%x] ct=%llx t=%zx\n",
42 s->stream_nr, s->collected_to, s->transferred);
43 list_for_each_entry(sreq, &s->subrequests, rreq_link) {
44 pr_err(" sreq[%x:%x] sc=%u s=%llx t=%zx/%zx r=%d f=%lx\n",
45 sreq->stream_nr, sreq->debug_index, sreq->source,
46 sreq->start, sreq->transferred, sreq->len,
47 refcount_read(&sreq->ref), sreq->flags);
48 }
49 }
50 }
51
52 /*
53 * Successful completion of write of a folio to the server and/or cache. Note
54 * that we are not allowed to lock the folio here on pain of deadlocking with
55 * truncate.
56 */
netfs_folio_written_back(struct folio * folio)57 int netfs_folio_written_back(struct folio *folio)
58 {
59 enum netfs_folio_trace why = netfs_folio_trace_clear;
60 struct inode *inode = folio_inode(folio);
61 struct netfs_inode *ictx = netfs_inode(inode);
62 struct netfs_folio *finfo;
63 struct netfs_group *group = NULL;
64 int gcount = 0;
65
66 if ((finfo = netfs_folio_info(folio))) {
67 /* Streaming writes cannot be redirtied whilst under writeback,
68 * so discard the streaming record.
69 */
70 unsigned long long fend;
71
72 fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len;
73 spin_lock(&ictx->inode.i_lock);
74 if (fend > ictx->_zero_point)
75 netfs_write_zero_point(inode, fend);
76 spin_unlock(&ictx->inode.i_lock);
77
78 folio_detach_private(folio);
79 group = finfo->netfs_group;
80 gcount++;
81 kfree(finfo);
82 why = netfs_folio_trace_clear_s;
83 goto end_wb;
84 }
85
86 if ((group = netfs_folio_group(folio))) {
87 if (group == NETFS_FOLIO_COPY_TO_CACHE) {
88 why = netfs_folio_trace_clear_cc;
89 folio_detach_private(folio);
90 goto end_wb;
91 }
92
93 /* Need to detach the group pointer if the page didn't get
94 * redirtied. If it has been redirtied, then it must be within
95 * the same group.
96 */
97 why = netfs_folio_trace_redirtied;
98 if (!folio_test_dirty(folio)) {
99 folio_detach_private(folio);
100 gcount++;
101 why = netfs_folio_trace_clear_g;
102 }
103 }
104
105 end_wb:
106 trace_netfs_folio(folio, why);
107 folio_end_writeback(folio);
108 return gcount;
109 }
110
111 /*
112 * Unlock any folios we've finished with.
113 */
netfs_writeback_unlock_folios(struct netfs_io_request * wreq,unsigned int * notes)114 static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
115 unsigned int *notes)
116 {
117 struct folio_queue *folioq = wreq->buffer.tail;
118 unsigned long long collected_to = wreq->collected_to;
119 unsigned int slot = wreq->buffer.first_tail_slot;
120
121 if (WARN_ON_ONCE(!folioq)) {
122 pr_err("[!] Writeback unlock found empty rolling buffer!\n");
123 netfs_dump_request(wreq);
124 return;
125 }
126
127 if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) {
128 if (netfs_pgpriv2_unlock_copied_folios(wreq))
129 *notes |= MADE_PROGRESS;
130 return;
131 }
132
133 if (slot >= folioq_nr_slots(folioq)) {
134 folioq = rolling_buffer_delete_spent(&wreq->buffer);
135 if (!folioq)
136 return;
137 slot = 0;
138 }
139
140 for (;;) {
141 struct folio *folio;
142 struct netfs_folio *finfo;
143 unsigned long long fpos, fend;
144 size_t fsize, flen;
145
146 folio = folioq_folio(folioq, slot);
147 if (WARN_ONCE(!folio_test_writeback(folio),
148 "R=%08x: folio %lx is not under writeback\n",
149 wreq->debug_id, folio->index))
150 trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
151
152 fpos = folio_pos(folio);
153 fsize = folio_size(folio);
154 finfo = netfs_folio_info(folio);
155 flen = finfo ? finfo->dirty_offset + finfo->dirty_len : fsize;
156
157 fend = min_t(unsigned long long, fpos + flen, wreq->i_size);
158
159 trace_netfs_collect_folio(wreq, folio, fend, collected_to);
160
161 /* Unlock any folio we've transferred all of. */
162 if (collected_to < fend)
163 break;
164
165 wreq->nr_group_rel += netfs_folio_written_back(folio);
166 wreq->cleaned_to = fpos + fsize;
167 *notes |= MADE_PROGRESS;
168
169 /* Clean up the head folioq. If we clear an entire folioq, then
170 * we can get rid of it provided it's not also the tail folioq
171 * being filled by the issuer.
172 */
173 folioq_clear(folioq, slot);
174 slot++;
175 if (slot >= folioq_nr_slots(folioq)) {
176 folioq = rolling_buffer_delete_spent(&wreq->buffer);
177 if (!folioq)
178 goto done;
179 slot = 0;
180 }
181
182 if (fpos + fsize >= collected_to)
183 break;
184 }
185
186 wreq->buffer.tail = folioq;
187 done:
188 wreq->buffer.first_tail_slot = slot;
189 }
190
191 /*
192 * Collect and assess the results of various write subrequests. We may need to
193 * retry some of the results - or even do an RMW cycle for content crypto.
194 *
195 * Note that we have a number of parallel, overlapping lists of subrequests,
196 * one to the server and one to the local cache for example, which may not be
197 * the same size or starting position and may not even correspond in boundary
198 * alignment.
199 */
netfs_collect_write_results(struct netfs_io_request * wreq)200 static void netfs_collect_write_results(struct netfs_io_request *wreq)
201 {
202 struct netfs_io_subrequest *front, *remove;
203 struct netfs_io_stream *stream;
204 unsigned long long collected_to, issued_to;
205 unsigned int notes;
206 int s;
207
208 _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
209 trace_netfs_collect(wreq);
210 trace_netfs_rreq(wreq, netfs_rreq_trace_collect);
211
212 reassess_streams:
213 issued_to = atomic64_read(&wreq->issued_to);
214 smp_rmb();
215 collected_to = ULLONG_MAX;
216 if (wreq->origin == NETFS_WRITEBACK ||
217 wreq->origin == NETFS_WRITETHROUGH ||
218 wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE)
219 notes = NEED_UNLOCK;
220 else
221 notes = 0;
222
223 /* Remove completed subrequests from the front of the streams and
224 * advance the completion point on each stream. We stop when we hit
225 * something that's in progress. The issuer thread may be adding stuff
226 * to the tail whilst we're doing this.
227 */
228 for (s = 0; s < NR_IO_STREAMS; s++) {
229 stream = &wreq->io_streams[s];
230 /* Read active flag before list pointers */
231 if (!smp_load_acquire(&stream->active))
232 continue;
233
234 front = list_first_entry_or_null_acquire(&stream->subrequests,
235 struct netfs_io_subrequest, rreq_link);
236 /* Read first subreq pointer before IN_PROGRESS flag. */
237
238 while (front) {
239 trace_netfs_collect_sreq(wreq, front);
240 //_debug("sreq [%x] %llx %zx/%zx",
241 // front->debug_index, front->start, front->transferred, front->len);
242
243 if (stream->collected_to < front->start) {
244 trace_netfs_collect_gap(wreq, stream, issued_to, 'F');
245 stream->collected_to = front->start;
246 }
247
248 /* Stall if the front is still undergoing I/O. */
249 if (netfs_check_subreq_in_progress(front)) {
250 notes |= HIT_PENDING;
251 break;
252 }
253 smp_rmb(); /* Read counters after I-P flag. */
254
255 if (stream->failed) {
256 stream->collected_to = front->start + front->len;
257 notes |= MADE_PROGRESS | SAW_FAILURE;
258 goto cancel;
259 }
260 if (front->start + front->transferred > stream->collected_to) {
261 stream->collected_to = front->start + front->transferred;
262 stream->transferred = stream->collected_to - wreq->start;
263 stream->transferred_valid = true;
264 notes |= MADE_PROGRESS;
265 }
266 if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
267 stream->failed = true;
268 stream->error = front->error;
269 if (stream->source == NETFS_UPLOAD_TO_SERVER)
270 mapping_set_error(wreq->mapping, front->error);
271 notes |= NEED_REASSESS | SAW_FAILURE;
272 break;
273 }
274 if (front->transferred < front->len) {
275 stream->need_retry = true;
276 notes |= NEED_RETRY | MADE_PROGRESS;
277 break;
278 }
279
280 cancel:
281 /* Remove if completely consumed. */
282 spin_lock(&wreq->lock);
283
284 remove = front;
285 list_del_init(&front->rreq_link);
286 front = list_first_entry_or_null(&stream->subrequests,
287 struct netfs_io_subrequest, rreq_link);
288 spin_unlock(&wreq->lock);
289 netfs_put_subrequest(remove,
290 notes & SAW_FAILURE ?
291 netfs_sreq_trace_put_cancel :
292 netfs_sreq_trace_put_done);
293 }
294
295 /* If we have an empty stream, we need to jump it forward
296 * otherwise the collection point will never advance.
297 */
298 if (!front && issued_to > stream->collected_to) {
299 trace_netfs_collect_gap(wreq, stream, issued_to, 'E');
300 stream->collected_to = issued_to;
301 }
302
303 if (stream->collected_to < collected_to)
304 collected_to = stream->collected_to;
305 }
306
307 if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to)
308 wreq->collected_to = collected_to;
309
310 for (s = 0; s < NR_IO_STREAMS; s++) {
311 stream = &wreq->io_streams[s];
312 if (stream->active)
313 trace_netfs_collect_stream(wreq, stream);
314 }
315
316 trace_netfs_collect_state(wreq, wreq->collected_to, notes);
317
318 /* Unlock any folios that we have now finished with. */
319 if (notes & NEED_UNLOCK) {
320 if (wreq->cleaned_to < wreq->collected_to)
321 netfs_writeback_unlock_folios(wreq, ¬es);
322 } else {
323 wreq->cleaned_to = wreq->collected_to;
324 }
325
326 // TODO: Discard encryption buffers
327
328 if (notes & NEED_RETRY)
329 goto need_retry;
330
331 if (notes & MADE_PROGRESS) {
332 netfs_wake_rreq_flag(wreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause);
333 //cond_resched();
334 goto reassess_streams;
335 }
336
337 if (notes & NEED_REASSESS) {
338 //cond_resched();
339 goto reassess_streams;
340 }
341
342 out:
343 netfs_put_group_many(wreq->group, wreq->nr_group_rel);
344 wreq->nr_group_rel = 0;
345 _leave(" = %x", notes);
346 return;
347
348 need_retry:
349 /* Okay... We're going to have to retry one or both streams. Note
350 * that any partially completed op will have had any wholly transferred
351 * folios removed from it.
352 */
353 _debug("retry");
354 netfs_retry_writes(wreq);
355 goto out;
356 }
357
358 /*
359 * Perform the collection of subrequests, folios and encryption buffers.
360 */
netfs_write_collection(struct netfs_io_request * wreq)361 bool netfs_write_collection(struct netfs_io_request *wreq)
362 {
363 struct netfs_inode *ictx = netfs_inode(wreq->inode);
364 size_t transferred;
365 bool transferred_valid = false;
366 int s;
367
368 _enter("R=%x", wreq->debug_id);
369
370 netfs_collect_write_results(wreq);
371
372 /* We're done when the app thread has finished posting subreqs and all
373 * the queues in all the streams are empty.
374 */
375 if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags))
376 return false;
377 smp_rmb(); /* Read ALL_QUEUED before lists. */
378
379 transferred = LONG_MAX;
380 for (s = 0; s < NR_IO_STREAMS; s++) {
381 struct netfs_io_stream *stream = &wreq->io_streams[s];
382 if (!stream->active)
383 continue;
384 if (!list_empty(&stream->subrequests))
385 return false;
386 if (stream->transferred_valid &&
387 stream->transferred < transferred) {
388 transferred = stream->transferred;
389 transferred_valid = true;
390 }
391 }
392
393 /* Okay, declare that all I/O is complete. */
394 if (transferred_valid)
395 wreq->transferred = transferred;
396 trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
397
398 if (wreq->io_streams[1].active &&
399 wreq->io_streams[1].failed &&
400 ictx->ops->invalidate_cache) {
401 /* Cache write failure doesn't prevent writeback completion
402 * unless we're in disconnected mode.
403 */
404 ictx->ops->invalidate_cache(wreq);
405 }
406
407 _debug("finished");
408 netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
409 /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */
410
411 if (wreq->iocb) {
412 size_t written = min(wreq->transferred, wreq->len);
413 wreq->iocb->ki_pos += written;
414 if (wreq->iocb->ki_complete) {
415 trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
416 wreq->iocb->ki_complete(
417 wreq->iocb, wreq->error ? wreq->error : written);
418 }
419 wreq->iocb = VFS_PTR_POISON;
420 }
421
422 netfs_clear_subrequests(wreq);
423 return true;
424 }
425
netfs_write_collection_worker(struct work_struct * work)426 void netfs_write_collection_worker(struct work_struct *work)
427 {
428 struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work);
429
430 netfs_see_request(rreq, netfs_rreq_trace_see_work);
431 if (netfs_check_rreq_in_progress(rreq)) {
432 if (netfs_write_collection(rreq))
433 /* Drop the ref from the IN_PROGRESS flag. */
434 netfs_put_request(rreq, netfs_rreq_trace_put_work_ip);
435 else
436 netfs_see_request(rreq, netfs_rreq_trace_see_work_complete);
437 }
438 }
439
440 /**
441 * netfs_write_subrequest_terminated - Note the termination of a write operation.
442 * @_op: The I/O request that has terminated.
443 * @transferred_or_error: The amount of data transferred or an error code.
444 *
445 * This tells the library that a contributory write I/O operation has
446 * terminated, one way or another, and that it should collect the results.
447 *
448 * The caller indicates in @transferred_or_error the outcome of the operation,
449 * supplying a positive value to indicate the number of bytes transferred or a
450 * negative error code. The library will look after reissuing I/O operations
451 * as appropriate and writing downloaded data to the cache.
452 *
453 * When this is called, ownership of the subrequest is transferred back to the
454 * library, along with a ref.
455 *
456 * Note that %_op is a void* so that the function can be passed to
457 * kiocb::term_func without the need for a casting wrapper.
458 */
netfs_write_subrequest_terminated(void * _op,ssize_t transferred_or_error)459 void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error)
460 {
461 struct netfs_io_subrequest *subreq = _op;
462 struct netfs_io_request *wreq = subreq->rreq;
463
464 _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
465
466 switch (subreq->source) {
467 case NETFS_UPLOAD_TO_SERVER:
468 netfs_stat(&netfs_n_wh_upload_done);
469 break;
470 case NETFS_WRITE_TO_CACHE:
471 netfs_stat(&netfs_n_wh_write_done);
472 break;
473 default:
474 BUG();
475 }
476
477 if (IS_ERR_VALUE(transferred_or_error)) {
478 subreq->error = transferred_or_error;
479 /* if need retry is set, error should not matter */
480 if (!test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
481 set_bit(NETFS_SREQ_FAILED, &subreq->flags);
482 trace_netfs_failure(wreq, subreq, transferred_or_error, netfs_fail_write);
483 }
484
485 switch (subreq->source) {
486 case NETFS_WRITE_TO_CACHE:
487 netfs_stat(&netfs_n_wh_write_failed);
488 break;
489 case NETFS_UPLOAD_TO_SERVER:
490 netfs_stat(&netfs_n_wh_upload_failed);
491 break;
492 default:
493 break;
494 }
495 trace_netfs_rreq(wreq, netfs_rreq_trace_set_pause);
496 set_bit(NETFS_RREQ_PAUSE, &wreq->flags);
497 } else {
498 if (WARN(transferred_or_error > subreq->len - subreq->transferred,
499 "Subreq excess write: R=%x[%x] %zd > %zu - %zu",
500 wreq->debug_id, subreq->debug_index,
501 transferred_or_error, subreq->len, subreq->transferred))
502 transferred_or_error = subreq->len - subreq->transferred;
503
504 subreq->error = 0;
505 subreq->transferred += transferred_or_error;
506
507 if (subreq->transferred < subreq->len)
508 set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
509 }
510
511 trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
512 netfs_subreq_clear_in_progress(subreq);
513 netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated);
514 }
515 EXPORT_SYMBOL(netfs_write_subrequest_terminated);
516