1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Network filesystem write subrequest result collection, assessment
3  * and retrying.
4  *
5  * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
6  * Written by David Howells (dhowells@redhat.com)
7  */
8 
9 #include <linux/export.h>
10 #include <linux/fs.h>
11 #include <linux/mm.h>
12 #include <linux/pagemap.h>
13 #include <linux/slab.h>
14 #include "internal.h"
15 
/* Notes made in the collector.  These are bit flags that get OR'd together
 * in the "notes" word used by netfs_collect_write_results() and
 * netfs_writeback_unlock_folios() to record what happened during a pass.
 */
#define HIT_PENDING		0x01	/* A front op was still pending */
#define NEED_REASSESS		0x02	/* Need to loop round and reassess */
#define MADE_PROGRESS		0x04	/* Made progress cleaning up a stream or the folio set */
#define NEED_UNLOCK		0x08	/* The pagecache needs unlocking */
#define NEED_RETRY		0x10	/* A front op requests retrying */
#define SAW_FAILURE		0x20	/* A stream has failed or a front op hit a permanent failure */
23 
netfs_dump_request(const struct netfs_io_request * rreq)24 static void netfs_dump_request(const struct netfs_io_request *rreq)
25 {
26 	pr_err("Request R=%08x r=%d fl=%lx or=%x e=%ld\n",
27 	       rreq->debug_id, refcount_read(&rreq->ref), rreq->flags,
28 	       rreq->origin, rreq->error);
29 	pr_err("  st=%llx tsl=%zx/%llx/%llx\n",
30 	       rreq->start, rreq->transferred, rreq->submitted, rreq->len);
31 	pr_err("  cci=%llx/%llx/%llx\n",
32 	       rreq->cleaned_to, rreq->collected_to, atomic64_read(&rreq->issued_to));
33 	pr_err("  iw=%pSR\n", rreq->netfs_ops->issue_write);
34 	for (int i = 0; i < NR_IO_STREAMS; i++) {
35 		const struct netfs_io_subrequest *sreq;
36 		const struct netfs_io_stream *s = &rreq->io_streams[i];
37 
38 		pr_err("  str[%x] s=%x e=%d acnf=%u,%u,%u,%u\n",
39 		       s->stream_nr, s->source, s->error,
40 		       s->avail, s->active, s->need_retry, s->failed);
41 		pr_err("  str[%x] ct=%llx t=%zx\n",
42 		       s->stream_nr, s->collected_to, s->transferred);
43 		list_for_each_entry(sreq, &s->subrequests, rreq_link) {
44 			pr_err("  sreq[%x:%x] sc=%u s=%llx t=%zx/%zx r=%d f=%lx\n",
45 			       sreq->stream_nr, sreq->debug_index, sreq->source,
46 			       sreq->start, sreq->transferred, sreq->len,
47 			       refcount_read(&sreq->ref), sreq->flags);
48 		}
49 	}
50 }
51 
52 /*
53  * Successful completion of write of a folio to the server and/or cache.  Note
54  * that we are not allowed to lock the folio here on pain of deadlocking with
55  * truncate.
56  */
netfs_folio_written_back(struct folio * folio)57 int netfs_folio_written_back(struct folio *folio)
58 {
59 	enum netfs_folio_trace why = netfs_folio_trace_clear;
60 	struct netfs_inode *ictx = netfs_inode(folio->mapping->host);
61 	struct netfs_folio *finfo;
62 	struct netfs_group *group = NULL;
63 	int gcount = 0;
64 
65 	if ((finfo = netfs_folio_info(folio))) {
66 		/* Streaming writes cannot be redirtied whilst under writeback,
67 		 * so discard the streaming record.
68 		 */
69 		unsigned long long fend;
70 
71 		fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len;
72 		if (fend > ictx->zero_point)
73 			ictx->zero_point = fend;
74 
75 		folio_detach_private(folio);
76 		group = finfo->netfs_group;
77 		gcount++;
78 		kfree(finfo);
79 		why = netfs_folio_trace_clear_s;
80 		goto end_wb;
81 	}
82 
83 	if ((group = netfs_folio_group(folio))) {
84 		if (group == NETFS_FOLIO_COPY_TO_CACHE) {
85 			why = netfs_folio_trace_clear_cc;
86 			folio_detach_private(folio);
87 			goto end_wb;
88 		}
89 
90 		/* Need to detach the group pointer if the page didn't get
91 		 * redirtied.  If it has been redirtied, then it must be within
92 		 * the same group.
93 		 */
94 		why = netfs_folio_trace_redirtied;
95 		if (!folio_test_dirty(folio)) {
96 			folio_detach_private(folio);
97 			gcount++;
98 			why = netfs_folio_trace_clear_g;
99 		}
100 	}
101 
102 end_wb:
103 	trace_netfs_folio(folio, why);
104 	folio_end_writeback(folio);
105 	return gcount;
106 }
107 
108 /*
109  * Unlock any folios we've finished with.
110  */
netfs_writeback_unlock_folios(struct netfs_io_request * wreq,unsigned int * notes)111 static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
112 					  unsigned int *notes)
113 {
114 	struct folio_queue *folioq = wreq->buffer.tail;
115 	unsigned long long collected_to = wreq->collected_to;
116 	unsigned int slot = wreq->buffer.first_tail_slot;
117 
118 	if (WARN_ON_ONCE(!folioq)) {
119 		pr_err("[!] Writeback unlock found empty rolling buffer!\n");
120 		netfs_dump_request(wreq);
121 		return;
122 	}
123 
124 	if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) {
125 		if (netfs_pgpriv2_unlock_copied_folios(wreq))
126 			*notes |= MADE_PROGRESS;
127 		return;
128 	}
129 
130 	if (slot >= folioq_nr_slots(folioq)) {
131 		folioq = rolling_buffer_delete_spent(&wreq->buffer);
132 		if (!folioq)
133 			return;
134 		slot = 0;
135 	}
136 
137 	for (;;) {
138 		struct folio *folio;
139 		struct netfs_folio *finfo;
140 		unsigned long long fpos, fend;
141 		size_t fsize, flen;
142 
143 		folio = folioq_folio(folioq, slot);
144 		if (WARN_ONCE(!folio_test_writeback(folio),
145 			      "R=%08x: folio %lx is not under writeback\n",
146 			      wreq->debug_id, folio->index))
147 			trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
148 
149 		fpos = folio_pos(folio);
150 		fsize = folio_size(folio);
151 		finfo = netfs_folio_info(folio);
152 		flen = finfo ? finfo->dirty_offset + finfo->dirty_len : fsize;
153 
154 		fend = min_t(unsigned long long, fpos + flen, wreq->i_size);
155 
156 		trace_netfs_collect_folio(wreq, folio, fend, collected_to);
157 
158 		/* Unlock any folio we've transferred all of. */
159 		if (collected_to < fend)
160 			break;
161 
162 		wreq->nr_group_rel += netfs_folio_written_back(folio);
163 		wreq->cleaned_to = fpos + fsize;
164 		*notes |= MADE_PROGRESS;
165 
166 		/* Clean up the head folioq.  If we clear an entire folioq, then
167 		 * we can get rid of it provided it's not also the tail folioq
168 		 * being filled by the issuer.
169 		 */
170 		folioq_clear(folioq, slot);
171 		slot++;
172 		if (slot >= folioq_nr_slots(folioq)) {
173 			folioq = rolling_buffer_delete_spent(&wreq->buffer);
174 			if (!folioq)
175 				goto done;
176 			slot = 0;
177 		}
178 
179 		if (fpos + fsize >= collected_to)
180 			break;
181 	}
182 
183 	wreq->buffer.tail = folioq;
184 done:
185 	wreq->buffer.first_tail_slot = slot;
186 }
187 
188 /*
189  * Collect and assess the results of various write subrequests.  We may need to
190  * retry some of the results - or even do an RMW cycle for content crypto.
191  *
192  * Note that we have a number of parallel, overlapping lists of subrequests,
193  * one to the server and one to the local cache for example, which may not be
194  * the same size or starting position and may not even correspond in boundary
195  * alignment.
196  */
netfs_collect_write_results(struct netfs_io_request * wreq)197 static void netfs_collect_write_results(struct netfs_io_request *wreq)
198 {
199 	struct netfs_io_subrequest *front, *remove;
200 	struct netfs_io_stream *stream;
201 	unsigned long long collected_to, issued_to;
202 	unsigned int notes;
203 	int s;
204 
205 	_enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
206 	trace_netfs_collect(wreq);
207 	trace_netfs_rreq(wreq, netfs_rreq_trace_collect);
208 
209 reassess_streams:
210 	issued_to = atomic64_read(&wreq->issued_to);
211 	smp_rmb();
212 	collected_to = ULLONG_MAX;
213 	if (wreq->origin == NETFS_WRITEBACK ||
214 	    wreq->origin == NETFS_WRITETHROUGH ||
215 	    wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE)
216 		notes = NEED_UNLOCK;
217 	else
218 		notes = 0;
219 
220 	/* Remove completed subrequests from the front of the streams and
221 	 * advance the completion point on each stream.  We stop when we hit
222 	 * something that's in progress.  The issuer thread may be adding stuff
223 	 * to the tail whilst we're doing this.
224 	 */
225 	for (s = 0; s < NR_IO_STREAMS; s++) {
226 		stream = &wreq->io_streams[s];
227 		/* Read active flag before list pointers */
228 		if (!smp_load_acquire(&stream->active))
229 			continue;
230 
231 		front = stream->front;
232 		while (front) {
233 			trace_netfs_collect_sreq(wreq, front);
234 			//_debug("sreq [%x] %llx %zx/%zx",
235 			//       front->debug_index, front->start, front->transferred, front->len);
236 
237 			if (stream->collected_to < front->start) {
238 				trace_netfs_collect_gap(wreq, stream, issued_to, 'F');
239 				stream->collected_to = front->start;
240 			}
241 
242 			/* Stall if the front is still undergoing I/O. */
243 			if (netfs_check_subreq_in_progress(front)) {
244 				notes |= HIT_PENDING;
245 				break;
246 			}
247 			smp_rmb(); /* Read counters after I-P flag. */
248 
249 			if (stream->failed) {
250 				stream->collected_to = front->start + front->len;
251 				notes |= MADE_PROGRESS | SAW_FAILURE;
252 				goto cancel;
253 			}
254 			if (front->start + front->transferred > stream->collected_to) {
255 				stream->collected_to = front->start + front->transferred;
256 				stream->transferred = stream->collected_to - wreq->start;
257 				stream->transferred_valid = true;
258 				notes |= MADE_PROGRESS;
259 			}
260 			if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
261 				stream->failed = true;
262 				stream->error = front->error;
263 				if (stream->source == NETFS_UPLOAD_TO_SERVER)
264 					mapping_set_error(wreq->mapping, front->error);
265 				notes |= NEED_REASSESS | SAW_FAILURE;
266 				break;
267 			}
268 			if (front->transferred < front->len) {
269 				stream->need_retry = true;
270 				notes |= NEED_RETRY | MADE_PROGRESS;
271 				break;
272 			}
273 
274 		cancel:
275 			/* Remove if completely consumed. */
276 			spin_lock(&wreq->lock);
277 
278 			remove = front;
279 			list_del_init(&front->rreq_link);
280 			front = list_first_entry_or_null(&stream->subrequests,
281 							 struct netfs_io_subrequest, rreq_link);
282 			stream->front = front;
283 			spin_unlock(&wreq->lock);
284 			netfs_put_subrequest(remove,
285 					     notes & SAW_FAILURE ?
286 					     netfs_sreq_trace_put_cancel :
287 					     netfs_sreq_trace_put_done);
288 		}
289 
290 		/* If we have an empty stream, we need to jump it forward
291 		 * otherwise the collection point will never advance.
292 		 */
293 		if (!front && issued_to > stream->collected_to) {
294 			trace_netfs_collect_gap(wreq, stream, issued_to, 'E');
295 			stream->collected_to = issued_to;
296 		}
297 
298 		if (stream->collected_to < collected_to)
299 			collected_to = stream->collected_to;
300 	}
301 
302 	if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to)
303 		wreq->collected_to = collected_to;
304 
305 	for (s = 0; s < NR_IO_STREAMS; s++) {
306 		stream = &wreq->io_streams[s];
307 		if (stream->active)
308 			trace_netfs_collect_stream(wreq, stream);
309 	}
310 
311 	trace_netfs_collect_state(wreq, wreq->collected_to, notes);
312 
313 	/* Unlock any folios that we have now finished with. */
314 	if (notes & NEED_UNLOCK) {
315 		if (wreq->cleaned_to < wreq->collected_to)
316 			netfs_writeback_unlock_folios(wreq, ¬es);
317 	} else {
318 		wreq->cleaned_to = wreq->collected_to;
319 	}
320 
321 	// TODO: Discard encryption buffers
322 
323 	if (notes & NEED_RETRY)
324 		goto need_retry;
325 
326 	if (notes & MADE_PROGRESS) {
327 		netfs_wake_rreq_flag(wreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause);
328 		//cond_resched();
329 		goto reassess_streams;
330 	}
331 
332 	if (notes & NEED_REASSESS) {
333 		//cond_resched();
334 		goto reassess_streams;
335 	}
336 
337 out:
338 	netfs_put_group_many(wreq->group, wreq->nr_group_rel);
339 	wreq->nr_group_rel = 0;
340 	_leave(" = %x", notes);
341 	return;
342 
343 need_retry:
344 	/* Okay...  We're going to have to retry one or both streams.  Note
345 	 * that any partially completed op will have had any wholly transferred
346 	 * folios removed from it.
347 	 */
348 	_debug("retry");
349 	netfs_retry_writes(wreq);
350 	goto out;
351 }
352 
353 /*
354  * Perform the collection of subrequests, folios and encryption buffers.
355  */
netfs_write_collection(struct netfs_io_request * wreq)356 bool netfs_write_collection(struct netfs_io_request *wreq)
357 {
358 	struct netfs_inode *ictx = netfs_inode(wreq->inode);
359 	size_t transferred;
360 	bool transferred_valid = false;
361 	int s;
362 
363 	_enter("R=%x", wreq->debug_id);
364 
365 	netfs_collect_write_results(wreq);
366 
367 	/* We're done when the app thread has finished posting subreqs and all
368 	 * the queues in all the streams are empty.
369 	 */
370 	if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags))
371 		return false;
372 	smp_rmb(); /* Read ALL_QUEUED before lists. */
373 
374 	transferred = LONG_MAX;
375 	for (s = 0; s < NR_IO_STREAMS; s++) {
376 		struct netfs_io_stream *stream = &wreq->io_streams[s];
377 		if (!stream->active)
378 			continue;
379 		if (!list_empty(&stream->subrequests))
380 			return false;
381 		if (stream->transferred_valid &&
382 		    stream->transferred < transferred) {
383 			transferred = stream->transferred;
384 			transferred_valid = true;
385 		}
386 	}
387 
388 	/* Okay, declare that all I/O is complete. */
389 	if (transferred_valid)
390 		wreq->transferred = transferred;
391 	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
392 
393 	if (wreq->io_streams[1].active &&
394 	    wreq->io_streams[1].failed &&
395 	    ictx->ops->invalidate_cache) {
396 		/* Cache write failure doesn't prevent writeback completion
397 		 * unless we're in disconnected mode.
398 		 */
399 		ictx->ops->invalidate_cache(wreq);
400 	}
401 
402 	if ((wreq->origin == NETFS_UNBUFFERED_WRITE ||
403 	     wreq->origin == NETFS_DIO_WRITE) &&
404 	    !wreq->error)
405 		netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);
406 
407 	if (wreq->origin == NETFS_DIO_WRITE &&
408 	    wreq->mapping->nrpages) {
409 		/* mmap may have got underfoot and we may now have folios
410 		 * locally covering the region we just wrote.  Attempt to
411 		 * discard the folios, but leave in place any modified locally.
412 		 * ->write_iter() is prevented from interfering by the DIO
413 		 * counter.
414 		 */
415 		pgoff_t first = wreq->start >> PAGE_SHIFT;
416 		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
417 		invalidate_inode_pages2_range(wreq->mapping, first, last);
418 	}
419 
420 	if (wreq->origin == NETFS_DIO_WRITE)
421 		inode_dio_end(wreq->inode);
422 
423 	_debug("finished");
424 	netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
425 	/* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */
426 
427 	if (wreq->iocb) {
428 		size_t written = min(wreq->transferred, wreq->len);
429 		wreq->iocb->ki_pos += written;
430 		if (wreq->iocb->ki_complete) {
431 			trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
432 			wreq->iocb->ki_complete(
433 				wreq->iocb, wreq->error ? wreq->error : written);
434 		}
435 		wreq->iocb = VFS_PTR_POISON;
436 	}
437 
438 	netfs_clear_subrequests(wreq);
439 	return true;
440 }
441 
netfs_write_collection_worker(struct work_struct * work)442 void netfs_write_collection_worker(struct work_struct *work)
443 {
444 	struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work);
445 
446 	netfs_see_request(rreq, netfs_rreq_trace_see_work);
447 	if (netfs_check_rreq_in_progress(rreq)) {
448 		if (netfs_write_collection(rreq))
449 			/* Drop the ref from the IN_PROGRESS flag. */
450 			netfs_put_request(rreq, netfs_rreq_trace_put_work_ip);
451 		else
452 			netfs_see_request(rreq, netfs_rreq_trace_see_work_complete);
453 	}
454 }
455 
456 /**
457  * netfs_write_subrequest_terminated - Note the termination of a write operation.
458  * @_op: The I/O request that has terminated.
459  * @transferred_or_error: The amount of data transferred or an error code.
460  *
461  * This tells the library that a contributory write I/O operation has
462  * terminated, one way or another, and that it should collect the results.
463  *
464  * The caller indicates in @transferred_or_error the outcome of the operation,
465  * supplying a positive value to indicate the number of bytes transferred or a
466  * negative error code.  The library will look after reissuing I/O operations
467  * as appropriate and writing downloaded data to the cache.
468  *
469  * When this is called, ownership of the subrequest is transferred back to the
470  * library, along with a ref.
471  *
472  * Note that %_op is a void* so that the function can be passed to
473  * kiocb::term_func without the need for a casting wrapper.
474  */
netfs_write_subrequest_terminated(void * _op,ssize_t transferred_or_error)475 void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error)
476 {
477 	struct netfs_io_subrequest *subreq = _op;
478 	struct netfs_io_request *wreq = subreq->rreq;
479 
480 	_enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
481 
482 	switch (subreq->source) {
483 	case NETFS_UPLOAD_TO_SERVER:
484 		netfs_stat(&netfs_n_wh_upload_done);
485 		break;
486 	case NETFS_WRITE_TO_CACHE:
487 		netfs_stat(&netfs_n_wh_write_done);
488 		break;
489 	default:
490 		BUG();
491 	}
492 
493 	if (IS_ERR_VALUE(transferred_or_error)) {
494 		subreq->error = transferred_or_error;
495 		if (subreq->error == -EAGAIN)
496 			set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
497 		else
498 			set_bit(NETFS_SREQ_FAILED, &subreq->flags);
499 		trace_netfs_failure(wreq, subreq, transferred_or_error, netfs_fail_write);
500 
501 		switch (subreq->source) {
502 		case NETFS_WRITE_TO_CACHE:
503 			netfs_stat(&netfs_n_wh_write_failed);
504 			break;
505 		case NETFS_UPLOAD_TO_SERVER:
506 			netfs_stat(&netfs_n_wh_upload_failed);
507 			break;
508 		default:
509 			break;
510 		}
511 		trace_netfs_rreq(wreq, netfs_rreq_trace_set_pause);
512 		set_bit(NETFS_RREQ_PAUSE, &wreq->flags);
513 	} else {
514 		if (WARN(transferred_or_error > subreq->len - subreq->transferred,
515 			 "Subreq excess write: R=%x[%x] %zd > %zu - %zu",
516 			 wreq->debug_id, subreq->debug_index,
517 			 transferred_or_error, subreq->len, subreq->transferred))
518 			transferred_or_error = subreq->len - subreq->transferred;
519 
520 		subreq->error = 0;
521 		subreq->transferred += transferred_or_error;
522 
523 		if (subreq->transferred < subreq->len)
524 			set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
525 	}
526 
527 	trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
528 	netfs_subreq_clear_in_progress(subreq);
529 	netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated);
530 }
531 EXPORT_SYMBOL(netfs_write_subrequest_terminated);
532