xref: /linux/fs/netfs/write_issue.c (revision 13b6931c44969ff057ad2a3aa1c20a1b28b79b6f)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Network filesystem high-level (buffered) writeback.
3  *
4  * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
5  * Written by David Howells (dhowells@redhat.com)
6  *
7  *
8  * To support network filesystems with local caching, we manage a situation
9  * that looks like the following:
10  *
11  *               +---+---+-----+-----+---+----------+
12  *    Folios:    |   |   |     |     |   |          |
13  *               +---+---+-----+-----+---+----------+
14  *
15  *                 +------+------+     +----+----+
16  *    Upload:      |      |      |.....|    |    |
17  *  (Stream 0)     +------+------+     +----+----+
18  *
19  *               +------+------+------+------+------+
20  *    Cache:     |      |      |      |      |      |
21  *  (Stream 1)   +------+------+------+------+------+
22  *
23  * Where we have a sequence of folios of varying sizes that we need to overlay
24  * with multiple parallel streams of I/O requests, where the I/O requests in a
25  * stream may also be of various sizes (in cifs, for example, the sizes are
26  * negotiated with the server; in something like ceph, they may represent the
27  * sizes of storage objects).
28  *
29  * The sequence in each stream may contain gaps, and noncontiguous subrequests
30  * may be glued together into single vectored write RPCs.
31  */
32 
33 #include <linux/export.h>
34 #include <linux/fs.h>
35 #include <linux/mm.h>
36 #include <linux/pagemap.h>
37 #include "internal.h"
38 
39 /*
40  * Kill all dirty folios in the event of an unrecoverable error, starting with
41  * a locked folio we've already obtained from writeback_iter().
42  */
43 static void netfs_kill_dirty_pages(struct address_space *mapping,
44 				   struct writeback_control *wbc,
45 				   struct folio *folio)
46 {
47 	int error = 0;
48 
49 	do {
50 		enum netfs_folio_trace why = netfs_folio_trace_kill;
51 		struct netfs_group *group = NULL;
52 		struct netfs_folio *finfo = NULL;
53 		void *priv;
54 
55 		priv = folio_detach_private(folio);
56 		if (priv) {
57 			finfo = __netfs_folio_info(priv);
58 			if (finfo) {
59 				/* Kill folio from streaming write. */
60 				group = finfo->netfs_group;
61 				why = netfs_folio_trace_kill_s;
62 			} else {
63 				group = priv;
64 				if (group == NETFS_FOLIO_COPY_TO_CACHE) {
65 					/* Kill copy-to-cache folio */
66 					why = netfs_folio_trace_kill_cc;
67 					group = NULL;
68 				} else {
69 					/* Kill folio with group */
70 					why = netfs_folio_trace_kill_g;
71 				}
72 			}
73 		}
74 
75 		trace_netfs_folio(folio, why);
76 
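		/* Cycle the folio through the writeback state without issuing
		 * any I/O, thereby waking anyone waiting for writeback on it.
		 */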
77 		folio_start_writeback(folio);
78 		folio_unlock(folio);
79 		folio_end_writeback(folio);
80 
81 		netfs_put_group(group);
82 		kfree(finfo);
83 
84 	} while ((folio = writeback_iter(mapping, wbc, folio, &error)));
85 }
86 
87 /*
88  * Create a write request and set it up appropriately for the origin type.
89  */
90 struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
91 						struct file *file,
92 						loff_t start,
93 						enum netfs_io_origin origin)
94 {
95 	struct netfs_io_request *wreq;
96 	struct netfs_inode *ictx;
97 	bool is_cacheable = (origin == NETFS_WRITEBACK ||
98 			     origin == NETFS_WRITEBACK_SINGLE ||
99 			     origin == NETFS_WRITETHROUGH ||
100 			     origin == NETFS_PGPRIV2_COPY_TO_CACHE);
101 
102 	wreq = netfs_alloc_request(mapping, file, start, 0, origin);
103 	if (IS_ERR(wreq))
104 		return wreq;
105 
106 	_enter("R=%x", wreq->debug_id);
107 
108 	ictx = netfs_inode(wreq->inode);
109 	if (is_cacheable && netfs_is_cache_enabled(ictx))
110 		fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));
111 	if (rolling_buffer_init(&wreq->buffer, wreq->debug_id, ITER_SOURCE) < 0)
112 		goto nomem;
113 
114 	wreq->cleaned_to = wreq->start;
115 
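	/* Stream 0 always uploads to the server; stream 1 (only activated
	 * below if cache resources were obtained) writes to the local cache.
	 * See the diagram at the top of this file.
	 */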
116 	wreq->io_streams[0].stream_nr		= 0;
117 	wreq->io_streams[0].source		= NETFS_UPLOAD_TO_SERVER;
118 	wreq->io_streams[0].prepare_write	= ictx->ops->prepare_write;
119 	wreq->io_streams[0].issue_write		= ictx->ops->issue_write;
120 	wreq->io_streams[0].collected_to	= start;
121 	wreq->io_streams[0].transferred		= LONG_MAX;
122 
123 	wreq->io_streams[1].stream_nr		= 1;
124 	wreq->io_streams[1].source		= NETFS_WRITE_TO_CACHE;
125 	wreq->io_streams[1].collected_to	= start;
126 	wreq->io_streams[1].transferred		= LONG_MAX;
127 	if (fscache_resources_valid(&wreq->cache_resources)) {
128 		wreq->io_streams[1].avail	= true;
129 		wreq->io_streams[1].active	= true;
130 		wreq->io_streams[1].prepare_write = wreq->cache_resources.ops->prepare_write_subreq;
131 		wreq->io_streams[1].issue_write = wreq->cache_resources.ops->issue_write;
132 	}
133 
134 	return wreq;
135 nomem:
136 	wreq->error = -ENOMEM;
137 	netfs_put_request(wreq, false, netfs_rreq_trace_put_failed);
138 	return ERR_PTR(-ENOMEM);
139 }
140 
141 /**
142  * netfs_prepare_write_failed - Note write preparation failed
143  * @subreq: The subrequest to mark
144  *
145  * Mark a subrequest to note that preparation for write failed.
146  */
147 void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq)
148 {
149 	__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
150 	trace_netfs_sreq(subreq, netfs_sreq_trace_prep_failed);
151 }
152 EXPORT_SYMBOL(netfs_prepare_write_failed);
153 
154 /*
155  * Prepare a write subrequest.  We need to allocate a new subrequest
156  * if we don't have one.
157  */
158 static void netfs_prepare_write(struct netfs_io_request *wreq,
159 				struct netfs_io_stream *stream,
160 				loff_t start)
161 {
162 	struct netfs_io_subrequest *subreq;
163 	struct iov_iter *wreq_iter = &wreq->buffer.iter;
164 
165 	/* Make sure we don't point the iterator at a used-up folio_queue
166 	 * struct being used as a placeholder to prevent the queue from
167 	 * collapsing.  In such a case, extend the queue.
168 	 */
169 	if (iov_iter_is_folioq(wreq_iter) &&
170 	    wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq))
171 		rolling_buffer_make_space(&wreq->buffer);
172 
173 	subreq = netfs_alloc_subrequest(wreq);
174 	subreq->source		= stream->source;
175 	subreq->start		= start;
176 	subreq->stream_nr	= stream->stream_nr;
177 	subreq->io_iter		= *wreq_iter;
178 
179 	_enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
180 
181 	trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
182 
183 	stream->sreq_max_len	= UINT_MAX;
184 	stream->sreq_max_segs	= INT_MAX;
185 	switch (stream->source) {
186 	case NETFS_UPLOAD_TO_SERVER:
187 		netfs_stat(&netfs_n_wh_upload);
188 		stream->sreq_max_len = wreq->wsize;
189 		break;
190 	case NETFS_WRITE_TO_CACHE:
191 		netfs_stat(&netfs_n_wh_write);
192 		break;
193 	default:
194 		WARN_ON_ONCE(1);
195 		break;
196 	}
197 
198 	if (stream->prepare_write)
199 		stream->prepare_write(subreq);
200 
201 	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
202 
203 	/* We add to the end of the list whilst the collector may be walking
 204 	 * the list.  The collector only goes forwards and uses the lock to
 205 	 * remove entries off the front.
206 	 */
207 	spin_lock(&wreq->lock);
208 	list_add_tail(&subreq->rreq_link, &stream->subrequests);
209 	if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
210 		stream->front = subreq;
211 		if (!stream->active) {
212 			stream->collected_to = stream->front->start;
213 			/* Write list pointers before active flag */
214 			smp_store_release(&stream->active, true);
215 		}
216 	}
217 
218 	spin_unlock(&wreq->lock);
219 
220 	stream->construct = subreq;
221 }
222 
223 /*
224  * Set the I/O iterator for the filesystem/cache to use and dispatch the I/O
225  * operation.  The operation may be asynchronous and should call
226  * netfs_write_subrequest_terminated() when complete.
227  */
228 static void netfs_do_issue_write(struct netfs_io_stream *stream,
229 				 struct netfs_io_subrequest *subreq)
230 {
231 	struct netfs_io_request *wreq = subreq->rreq;
232 
233 	_enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);
234 
235 	if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
236 		return netfs_write_subrequest_terminated(subreq, subreq->error, false);
237 
238 	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
239 	stream->issue_write(subreq);
240 }
241 
242 void netfs_reissue_write(struct netfs_io_stream *stream,
243 			 struct netfs_io_subrequest *subreq,
244 			 struct iov_iter *source)
245 {
246 	size_t size = subreq->len - subreq->transferred;
247 
248 	// TODO: Use encrypted buffer
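	/* Give the subrequest a copy of the source iterator limited to the
	 * untransferred remainder of the subrequest and advance the source
	 * past that range ready for the next subrequest.
	 */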
249 	subreq->io_iter = *source;
250 	iov_iter_advance(source, size);
251 	iov_iter_truncate(&subreq->io_iter, size);
252 
253 	subreq->retry_count++;
254 	__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
255 	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
256 	netfs_do_issue_write(stream, subreq);
257 }
258 
259 void netfs_issue_write(struct netfs_io_request *wreq,
260 		       struct netfs_io_stream *stream)
261 {
262 	struct netfs_io_subrequest *subreq = stream->construct;
263 
264 	if (!subreq)
265 		return;
266 	stream->construct = NULL;
267 	subreq->io_iter.count = subreq->len;
268 	netfs_do_issue_write(stream, subreq);
269 }
270 
271 /*
272  * Add data to the write subrequest, dispatching each subrequest as it fills
273  * up or when the new data is discontiguous with it.  We only fill one part at
274  * a time so that we can avoid overrunning the credits obtained (cifs) and try
275  * to parallelise content-crypto preparation with network writes.
276  */
277 size_t netfs_advance_write(struct netfs_io_request *wreq,
278 			   struct netfs_io_stream *stream,
279 			   loff_t start, size_t len, bool to_eof)
280 {
281 	struct netfs_io_subrequest *subreq = stream->construct;
282 	size_t part;
283 
284 	if (!stream->avail) {
285 		_leave("no write");
286 		return len;
287 	}
288 
289 	_enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);
290 
291 	if (subreq && start != subreq->start + subreq->len) {
292 		netfs_issue_write(wreq, stream);
293 		subreq = NULL;
294 	}
295 
296 	if (!stream->construct)
297 		netfs_prepare_write(wreq, stream, start);
298 	subreq = stream->construct;
299 
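	/* Add no more than will fit in the subrequest under construction,
	 * given the maximum size the filesystem/cache set for it.
	 */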
300 	part = umin(stream->sreq_max_len - subreq->len, len);
301 	_debug("part %zx/%zx %zx/%zx", subreq->len, stream->sreq_max_len, part, len);
302 	subreq->len += part;
303 	subreq->nr_segs++;
304 	stream->submit_extendable_to -= part;
305 
306 	if (subreq->len >= stream->sreq_max_len ||
307 	    subreq->nr_segs >= stream->sreq_max_segs ||
308 	    to_eof) {
309 		netfs_issue_write(wreq, stream);
310 		subreq = NULL;
311 	}
312 
313 	return part;
314 }
315 
316 /*
317  * Write some of a pending folio's data back to the server.
318  */
319 static int netfs_write_folio(struct netfs_io_request *wreq,
320 			     struct writeback_control *wbc,
321 			     struct folio *folio)
322 {
323 	struct netfs_io_stream *upload = &wreq->io_streams[0];
324 	struct netfs_io_stream *cache  = &wreq->io_streams[1];
325 	struct netfs_io_stream *stream;
326 	struct netfs_group *fgroup; /* TODO: Use this with ceph */
327 	struct netfs_folio *finfo;
328 	size_t iter_off = 0;
329 	size_t fsize = folio_size(folio), flen = fsize, foff = 0;
330 	loff_t fpos = folio_pos(folio), i_size;
331 	bool to_eof = false, streamw = false;
332 	bool debug = false;
333 
334 	_enter("");
335 
336 	if (rolling_buffer_make_space(&wreq->buffer) < 0)
337 		return -ENOMEM;
338 
339 	/* netfs_perform_write() may shift i_size around the page or from outside
340 	 * the page to beyond it, but it cannot move i_size into or through the
341 	 * page since we have it locked.
342 	 */
343 	i_size = i_size_read(wreq->inode);
344 
345 	if (fpos >= i_size) {
346 		/* mmap beyond eof. */
347 		_debug("beyond eof");
348 		folio_start_writeback(folio);
349 		folio_unlock(folio);
350 		wreq->nr_group_rel += netfs_folio_written_back(folio);
351 		netfs_put_group_many(wreq->group, wreq->nr_group_rel);
352 		wreq->nr_group_rel = 0;
353 		return 0;
354 	}
355 
356 	if (fpos + fsize > wreq->i_size)
357 		wreq->i_size = i_size;
358 
359 	fgroup = netfs_folio_group(folio);
360 	finfo = netfs_folio_info(folio);
361 	if (finfo) {
362 		foff = finfo->dirty_offset;
363 		flen = foff + finfo->dirty_len;
364 		streamw = true;
365 	}
366 
367 	if (wreq->origin == NETFS_WRITETHROUGH) {
368 		to_eof = false;
369 		if (flen > i_size - fpos)
370 			flen = i_size - fpos;
371 	} else if (flen > i_size - fpos) {
372 		flen = i_size - fpos;
373 		if (!streamw)
374 			folio_zero_segment(folio, flen, fsize);
375 		to_eof = true;
376 	} else if (flen == i_size - fpos) {
377 		to_eof = true;
378 	}
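	/* flen is currently the end of the data to be written within the
	 * folio (clamped to EOF); turn it into the length of data from foff
	 * onwards.
	 */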
379 	flen -= foff;
380 
381 	_debug("folio %zx %zx %zx", foff, flen, fsize);
382 
383 	/* Deal with discontinuities in the stream of dirty pages.  These can
384 	 * arise from a number of sources:
385 	 *
386 	 * (1) Intervening non-dirty pages from random-access writes, multiple
387 	 *     flushers writing back different parts simultaneously and manual
388 	 *     syncing.
389 	 *
390 	 * (2) Partially-written pages from write-streaming.
391 	 *
392 	 * (3) Pages that belong to a different write-back group (eg.  Ceph
393 	 *     snapshots).
394 	 *
395 	 * (4) Actually-clean pages that were marked for write to the cache
396 	 *     when they were read.  Note that these appear as a special
397 	 *     write-back group.
398 	 */
399 	if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
400 		netfs_issue_write(wreq, upload);
401 	} else if (fgroup != wreq->group) {
402 		/* We can't write this page to the server yet. */
403 		kdebug("wrong group");
404 		folio_redirty_for_writepage(wbc, folio);
405 		folio_unlock(folio);
406 		netfs_issue_write(wreq, upload);
407 		netfs_issue_write(wreq, cache);
408 		return 0;
409 	}
410 
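	/* Flush whatever a stream has under construction if it can't take
	 * this folio contiguously: the upload if the dirty data doesn't start
	 * at the front of the folio, and the cache if this is a streaming
	 * write (which the cache doesn't handle).
	 */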
411 	if (foff > 0)
412 		netfs_issue_write(wreq, upload);
413 	if (streamw)
414 		netfs_issue_write(wreq, cache);
415 
416 	/* Flip the page to the writeback state and unlock.  If we're called
417 	 * from write-through, then the page has already been put into the wb
418 	 * state.
419 	 */
420 	if (wreq->origin == NETFS_WRITEBACK)
421 		folio_start_writeback(folio);
422 	folio_unlock(folio);
423 
424 	if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
425 		if (!cache->avail) {
426 			trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
427 			netfs_issue_write(wreq, upload);
428 			netfs_folio_written_back(folio);
429 			return 0;
430 		}
431 		trace_netfs_folio(folio, netfs_folio_trace_store_copy);
432 	} else if (!upload->avail && !cache->avail) {
433 		trace_netfs_folio(folio, netfs_folio_trace_cancel_store);
434 		netfs_folio_written_back(folio);
435 		return 0;
436 	} else if (!upload->construct) {
437 		trace_netfs_folio(folio, netfs_folio_trace_store);
438 	} else {
439 		trace_netfs_folio(folio, netfs_folio_trace_store_plus);
440 	}
441 
442 	/* Attach the folio to the rolling buffer. */
443 	rolling_buffer_append(&wreq->buffer, folio, 0);
444 
445 	/* Move the submission point forward to allow for write-streaming data
446 	 * not starting at the front of the page.  We don't do write-streaming
447 	 * with the cache as the cache requires DIO alignment.
448 	 *
449 	 * Also skip uploading for data that's been read and just needs copying
450 	 * to the cache.
451 	 */
452 	for (int s = 0; s < NR_IO_STREAMS; s++) {
453 		stream = &wreq->io_streams[s];
454 		stream->submit_off = foff;
455 		stream->submit_len = flen;
456 		if (!stream->avail ||
457 		    (stream->source == NETFS_WRITE_TO_CACHE && streamw) ||
458 		    (stream->source == NETFS_UPLOAD_TO_SERVER &&
459 		     fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {
460 			stream->submit_off = UINT_MAX;
461 			stream->submit_len = 0;
462 		}
463 	}
464 
465 	/* Attach the folio to one or more subrequests.  For a big folio, we
466 	 * could end up with thousands of subrequests if the wsize is small -
467 	 * but we might need to wait during the creation of subrequests for
468 	 * network resources (eg. SMB credits).
469 	 */
470 	for (;;) {
471 		ssize_t part;
472 		size_t lowest_off = ULONG_MAX;
473 		int choose_s = -1;
474 
475 		/* Always add to the lowest-submitted stream first. */
476 		for (int s = 0; s < NR_IO_STREAMS; s++) {
477 			stream = &wreq->io_streams[s];
478 			if (stream->submit_len > 0 &&
479 			    stream->submit_off < lowest_off) {
480 				lowest_off = stream->submit_off;
481 				choose_s = s;
482 			}
483 		}
484 
485 		if (choose_s < 0)
486 			break;
487 		stream = &wreq->io_streams[choose_s];
488 
489 		/* Advance the iterator(s). */
490 		if (stream->submit_off > iter_off) {
491 			rolling_buffer_advance(&wreq->buffer, stream->submit_off - iter_off);
492 			iter_off = stream->submit_off;
493 		}
494 
495 		atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
496 		stream->submit_extendable_to = fsize - stream->submit_off;
497 		part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
498 					   stream->submit_len, to_eof);
499 		stream->submit_off += part;
500 		if (part > stream->submit_len)
501 			stream->submit_len = 0;
502 		else
503 			stream->submit_len -= part;
504 		if (part > 0)
505 			debug = true;
506 	}
507 
508 	if (fsize > iter_off)
509 		rolling_buffer_advance(&wreq->buffer, fsize - iter_off);
510 	atomic64_set(&wreq->issued_to, fpos + fsize);
511 
512 	if (!debug)
513 		kdebug("R=%x: No submit", wreq->debug_id);
514 
515 	if (foff + flen < fsize)
516 		for (int s = 0; s < NR_IO_STREAMS; s++)
517 			netfs_issue_write(wreq, &wreq->io_streams[s]);
518 
519 	_leave(" = 0");
520 	return 0;
521 }
522 
523 /*
524  * End the issuing of writes, letting the collector know we're done.
525  */
526 static void netfs_end_issue_write(struct netfs_io_request *wreq)
527 {
528 	bool needs_poke = true;
529 
530 	smp_wmb(); /* Write subreq lists before ALL_QUEUED. */
531 	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
532 
533 	for (int s = 0; s < NR_IO_STREAMS; s++) {
534 		struct netfs_io_stream *stream = &wreq->io_streams[s];
535 
536 		if (!stream->active)
537 			continue;
538 		if (!list_empty(&stream->subrequests))
539 			needs_poke = false;
540 		netfs_issue_write(wreq, stream);
541 	}
542 
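	/* If none of the active streams had any subrequests queued, nothing
	 * will complete to prod the collector, so wake it directly.
	 */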
543 	if (needs_poke)
544 		netfs_wake_write_collector(wreq, false);
545 }
546 
547 /*
548  * Write some of the pending data back to the server
549  */
550 int netfs_writepages(struct address_space *mapping,
551 		     struct writeback_control *wbc)
552 {
553 	struct netfs_inode *ictx = netfs_inode(mapping->host);
554 	struct netfs_io_request *wreq = NULL;
555 	struct folio *folio;
556 	int error = 0;
557 
558 	if (!mutex_trylock(&ictx->wb_lock)) {
559 		if (wbc->sync_mode == WB_SYNC_NONE) {
560 			netfs_stat(&netfs_n_wb_lock_skip);
561 			return 0;
562 		}
563 		netfs_stat(&netfs_n_wb_lock_wait);
564 		mutex_lock(&ictx->wb_lock);
565 	}
566 
567 	/* Need the first folio to be able to set up the op. */
568 	folio = writeback_iter(mapping, wbc, NULL, &error);
569 	if (!folio)
570 		goto out;
571 
572 	wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK);
573 	if (IS_ERR(wreq)) {
574 		error = PTR_ERR(wreq);
575 		goto couldnt_start;
576 	}
577 
578 	trace_netfs_write(wreq, netfs_write_trace_writeback);
579 	netfs_stat(&netfs_n_wh_writepages);
580 
581 	do {
582 		_debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to));
583 
584 		/* It appears we don't have to handle cyclic writeback wrapping. */
585 		WARN_ON_ONCE(wreq && folio_pos(folio) < atomic64_read(&wreq->issued_to));
586 
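		/* Only tell the filesystem that writeback has begun when we
		 * find the first folio that actually needs uploading
		 * (copy-to-cache-only folios don't).
		 */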
587 		if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE &&
588 		    unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {
589 			set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
590 			wreq->netfs_ops->begin_writeback(wreq);
591 		}
592 
593 		error = netfs_write_folio(wreq, wbc, folio);
594 		if (error < 0)
595 			break;
596 	} while ((folio = writeback_iter(mapping, wbc, folio, &error)));
597 
598 	netfs_end_issue_write(wreq);
599 
600 	mutex_unlock(&ictx->wb_lock);
601 
602 	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
603 	_leave(" = %d", error);
604 	return error;
605 
606 couldnt_start:
607 	netfs_kill_dirty_pages(mapping, wbc, folio);
608 out:
609 	mutex_unlock(&ictx->wb_lock);
610 	_leave(" = %d", error);
611 	return error;
612 }
613 EXPORT_SYMBOL(netfs_writepages);
614 
615 /*
616  * Begin a write operation for writing through the pagecache.
617  */
618 struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
619 {
620 	struct netfs_io_request *wreq = NULL;
621 	struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp));
622 
623 	mutex_lock(&ictx->wb_lock);
624 
625 	wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp,
626 				      iocb->ki_pos, NETFS_WRITETHROUGH);
627 	if (IS_ERR(wreq)) {
628 		mutex_unlock(&ictx->wb_lock);
629 		return wreq;
630 	}
631 
632 	wreq->io_streams[0].avail = true;
633 	trace_netfs_write(wreq, netfs_write_trace_writethrough);
634 	return wreq;
635 }
636 
637 /*
638  * Advance the state of the write operation used when writing through the
639  * pagecache.  Data has been copied into the pagecache that we need to append
640  * to the request.  If we've added more than wsize then we need to create a new
641  * subrequest.
642  */
643 int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
644 			       struct folio *folio, size_t copied, bool to_page_end,
645 			       struct folio **writethrough_cache)
646 {
647 	_enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
648 	       wreq->debug_id, wreq->buffer.iter.count, wreq->wsize, copied, to_page_end);
649 
650 	if (!*writethrough_cache) {
651 		if (folio_test_dirty(folio))
652 			/* Sigh.  mmap. */
653 			folio_clear_dirty_for_io(folio);
654 
655 		/* We can make multiple writes to the folio... */
656 		folio_start_writeback(folio);
657 		if (wreq->len == 0)
658 			trace_netfs_folio(folio, netfs_folio_trace_wthru);
659 		else
660 			trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
661 		*writethrough_cache = folio;
662 	}
663 
664 	wreq->len += copied;
665 	if (!to_page_end)
666 		return 0;
667 
668 	*writethrough_cache = NULL;
669 	return netfs_write_folio(wreq, wbc, folio);
670 }
671 
672 /*
673  * End a write operation used when writing through the pagecache.
674  */
675 int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
676 			   struct folio *writethrough_cache)
677 {
678 	struct netfs_inode *ictx = netfs_inode(wreq->inode);
679 	int ret;
680 
681 	_enter("R=%x", wreq->debug_id);
682 
683 	if (writethrough_cache)
684 		netfs_write_folio(wreq, wbc, writethrough_cache);
685 
686 	netfs_end_issue_write(wreq);
687 
688 	mutex_unlock(&ictx->wb_lock);
689 
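	/* For an async kiocb, the result is delivered through the iocb
	 * completion; otherwise wait for the request to finish and collect
	 * its error.
	 */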
690 	if (wreq->iocb) {
691 		ret = -EIOCBQUEUED;
692 	} else {
693 		wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
694 		ret = wreq->error;
695 	}
696 	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
697 	return ret;
698 }
699 
700 /*
701  * Write data to the server without going through the pagecache and without
702  * writing it to the local cache.
703  */
704 int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len)
705 {
706 	struct netfs_io_stream *upload = &wreq->io_streams[0];
707 	ssize_t part;
708 	loff_t start = wreq->start;
709 	int error = 0;
710 
711 	_enter("%zx", len);
712 
713 	if (wreq->origin == NETFS_DIO_WRITE)
714 		inode_dio_begin(wreq->inode);
715 
716 	while (len) {
717 		// TODO: Prepare content encryption
718 
719 		_debug("unbuffered %zx", len);
720 		part = netfs_advance_write(wreq, upload, start, len, false);
721 		start += part;
722 		len -= part;
723 		rolling_buffer_advance(&wreq->buffer, part);
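		/* The collector may pause the issuing loop (NETFS_RREQ_PAUSE);
		 * if so, wait for the pause to be lifted before issuing any
		 * more.
		 */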
724 		if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
725 			trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
726 			wait_event(wreq->waitq, !test_bit(NETFS_RREQ_PAUSE, &wreq->flags));
727 		}
728 		if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
729 			break;
730 	}
731 
732 	netfs_end_issue_write(wreq);
733 	_leave(" = %d", error);
734 	return error;
735 }
736 
737 /*
738  * Write some of a pending folio's data back to the server and/or the cache.
739  */
740 static int netfs_write_folio_single(struct netfs_io_request *wreq,
741 				    struct folio *folio)
742 {
743 	struct netfs_io_stream *upload = &wreq->io_streams[0];
744 	struct netfs_io_stream *cache  = &wreq->io_streams[1];
745 	struct netfs_io_stream *stream;
746 	size_t iter_off = 0;
747 	size_t fsize = folio_size(folio), flen;
748 	loff_t fpos = folio_pos(folio);
749 	bool to_eof = false;
750 	bool no_debug = false;
751 
752 	_enter("");
753 
754 	flen = folio_size(folio);
755 	if (flen > wreq->i_size - fpos) {
756 		flen = wreq->i_size - fpos;
757 		folio_zero_segment(folio, flen, fsize);
758 		to_eof = true;
759 	} else if (flen == wreq->i_size - fpos) {
760 		to_eof = true;
761 	}
762 
763 	_debug("folio %zx/%zx", flen, fsize);
764 
765 	if (!upload->avail && !cache->avail) {
766 		trace_netfs_folio(folio, netfs_folio_trace_cancel_store);
767 		return 0;
768 	}
769 
770 	if (!upload->construct)
771 		trace_netfs_folio(folio, netfs_folio_trace_store);
772 	else
773 		trace_netfs_folio(folio, netfs_folio_trace_store_plus);
774 
775 	/* Attach the folio to the rolling buffer. */
776 	folio_get(folio);
777 	rolling_buffer_append(&wreq->buffer, folio, NETFS_ROLLBUF_PUT_MARK);
778 
779 	/* Move the submission point forward to allow for write-streaming data
780 	 * not starting at the front of the page.  We don't do write-streaming
781 	 * with the cache as the cache requires DIO alignment.
782 	 *
783 	 * Also skip uploading for data that's been read and just needs copying
784 	 * to the cache.
785 	 */
786 	for (int s = 0; s < NR_IO_STREAMS; s++) {
787 		stream = &wreq->io_streams[s];
788 		stream->submit_off = 0;
789 		stream->submit_len = flen;
790 		if (!stream->avail) {
791 			stream->submit_off = UINT_MAX;
792 			stream->submit_len = 0;
793 		}
794 	}
795 
796 	/* Attach the folio to one or more subrequests.  For a big folio, we
797 	 * could end up with thousands of subrequests if the wsize is small -
798 	 * but we might need to wait during the creation of subrequests for
799 	 * network resources (eg. SMB credits).
800 	 */
801 	for (;;) {
802 		ssize_t part;
803 		size_t lowest_off = ULONG_MAX;
804 		int choose_s = -1;
805 
806 		/* Always add to the lowest-submitted stream first. */
807 		for (int s = 0; s < NR_IO_STREAMS; s++) {
808 			stream = &wreq->io_streams[s];
809 			if (stream->submit_len > 0 &&
810 			    stream->submit_off < lowest_off) {
811 				lowest_off = stream->submit_off;
812 				choose_s = s;
813 			}
814 		}
815 
816 		if (choose_s < 0)
817 			break;
818 		stream = &wreq->io_streams[choose_s];
819 
820 		/* Advance the iterator(s). */
821 		if (stream->submit_off > iter_off) {
822 			rolling_buffer_advance(&wreq->buffer, stream->submit_off - iter_off);
823 			iter_off = stream->submit_off;
824 		}
825 
826 		atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
827 		stream->submit_extendable_to = fsize - stream->submit_off;
828 		part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
829 					   stream->submit_len, to_eof);
830 		stream->submit_off += part;
831 		if (part > stream->submit_len)
832 			stream->submit_len = 0;
833 		else
834 			stream->submit_len -= part;
835 		if (part > 0)
836 			no_debug = true;
837 	}
838 
839 	wreq->buffer.iter.iov_offset = 0;
840 	if (fsize > iter_off)
841 		rolling_buffer_advance(&wreq->buffer, fsize - iter_off);
842 	atomic64_set(&wreq->issued_to, fpos + fsize);
843 
844 	if (!no_debug)
845 		kdebug("R=%x: No submit", wreq->debug_id);
846 	_leave(" = 0");
847 	return 0;
848 }
849 
850 /**
851  * netfs_writeback_single - Write back a monolithic payload
852  * @mapping: The mapping to write from
853  * @wbc: Hints from the VM
854  * @iter: Data to write, must be ITER_FOLIOQ.
855  *
856  * Write a monolithic, non-pagecache object back to the server and/or
857  * the cache.
858  */
859 int netfs_writeback_single(struct address_space *mapping,
860 			   struct writeback_control *wbc,
861 			   struct iov_iter *iter)
862 {
863 	struct netfs_io_request *wreq;
864 	struct netfs_inode *ictx = netfs_inode(mapping->host);
865 	struct folio_queue *fq;
866 	size_t size = iov_iter_count(iter);
867 	int ret;
868 
869 	if (WARN_ON_ONCE(!iov_iter_is_folioq(iter)))
870 		return -EIO;
871 
872 	if (!mutex_trylock(&ictx->wb_lock)) {
873 		if (wbc->sync_mode == WB_SYNC_NONE) {
874 			netfs_stat(&netfs_n_wb_lock_skip);
875 			return 0;
876 		}
877 		netfs_stat(&netfs_n_wb_lock_wait);
878 		mutex_lock(&ictx->wb_lock);
879 	}
880 
881 	wreq = netfs_create_write_req(mapping, NULL, 0, NETFS_WRITEBACK_SINGLE);
882 	if (IS_ERR(wreq)) {
883 		ret = PTR_ERR(wreq);
884 		goto couldnt_start;
885 	}
886 
887 	trace_netfs_write(wreq, netfs_write_trace_writeback);
888 	netfs_stat(&netfs_n_wh_writepages);
889 
890 	if (__test_and_set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
891 		wreq->netfs_ops->begin_writeback(wreq);
892 
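	/* Walk the caller's folio_queue directly, writing out each folio in
	 * turn until the requested amount of data has been consumed.
	 */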
893 	for (fq = (struct folio_queue *)iter->folioq; fq; fq = fq->next) {
894 		for (int slot = 0; slot < folioq_count(fq); slot++) {
895 			struct folio *folio = folioq_folio(fq, slot);
896 			size_t part = umin(folioq_folio_size(fq, slot), size);
897 
898 			_debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to));
899 
900 			ret = netfs_write_folio_single(wreq, folio);
901 			if (ret < 0)
902 				goto stop;
903 			size -= part;
904 			if (size <= 0)
905 				goto stop;
906 		}
907 	}
908 
909 stop:
910 	for (int s = 0; s < NR_IO_STREAMS; s++)
911 		netfs_issue_write(wreq, &wreq->io_streams[s]);
912 	smp_wmb(); /* Write lists before ALL_QUEUED. */
913 	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
914 
915 	mutex_unlock(&ictx->wb_lock);
916 
917 	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
918 	_leave(" = %d", ret);
919 	return ret;
920 
921 couldnt_start:
922 	mutex_unlock(&ictx->wb_lock);
923 	_leave(" = %d", ret);
924 	return ret;
925 }
926 EXPORT_SYMBOL(netfs_writeback_single);
927