xref: /linux/fs/netfs/buffered_read.c (revision 9ac45d4628dec6d78b17846115f6df6c1d1de69e)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* Network filesystem high-level buffered read support.
3  *
4  * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
5  * Written by David Howells (dhowells@redhat.com)
6  */
7 
8 #include <linux/export.h>
9 #include <linux/task_io_accounting_ops.h>
10 #include "internal.h"
11 
12 static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
13 					 unsigned long long *_start,
14 					 unsigned long long *_len,
15 					 unsigned long long i_size)
16 {
17 	struct netfs_cache_resources *cres = &rreq->cache_resources;
18 
19 	if (cres->ops && cres->ops->expand_readahead)
20 		cres->ops->expand_readahead(cres, _start, _len, i_size);
21 }
22 
23 static void netfs_rreq_expand(struct netfs_io_request *rreq,
24 			      struct readahead_control *ractl)
25 {
26 	/* Give the cache a chance to change the request parameters.  The
27 	 * resultant request must contain the original region.
28 	 */
29 	netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
30 
31 	/* Give the netfs a chance to change the request parameters.  The
32 	 * resultant request must contain the original region.
33 	 */
34 	if (rreq->netfs_ops->expand_readahead)
35 		rreq->netfs_ops->expand_readahead(rreq);
36 
37 	/* Expand the request if the cache wants it to start earlier.  Note
38 	 * that the expansion may get further extended if the VM wishes to
39 	 * insert THPs and the preferred start and/or end wind up in the middle
40 	 * of THPs.
41 	 *
42 	 * If this is the case, however, the THP size should be an integer
43 	 * multiple of the cache granule size, so we get a whole number of
44 	 * granules to deal with.
45 	 */
46 	if (rreq->start  != readahead_pos(ractl) ||
47 	    rreq->len != readahead_length(ractl)) {
48 		readahead_expand(ractl, rreq->start, rreq->len);
49 		rreq->start  = readahead_pos(ractl);
50 		rreq->len = readahead_length(ractl);
51 
52 		trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
53 				 netfs_read_trace_expanded);
54 	}
55 }
56 
57 /*
58  * Begin an operation, and fetch the stored zero point value from the cookie if
59  * available.
60  */
61 static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
62 {
63 	return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
64 }
65 
66 /*
67  * Decant the list of folios to read into a rolling buffer.
68  */
69 static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq,
70 					struct folio_queue *folioq)
71 {
72 	unsigned int order, nr;
73 	size_t size = 0;
74 
75 	nr = __readahead_batch(rreq->ractl, (struct page **)folioq->vec.folios,
76 			       ARRAY_SIZE(folioq->vec.folios));
77 	folioq->vec.nr = nr;
78 	for (int i = 0; i < nr; i++) {
79 		struct folio *folio = folioq_folio(folioq, i);
80 
81 		trace_netfs_folio(folio, netfs_folio_trace_read);
82 		order = folio_order(folio);
83 		folioq->orders[i] = order;
84 		size += PAGE_SIZE << order;
85 	}
86 
87 	for (int i = nr; i < folioq_nr_slots(folioq); i++)
88 		folioq_clear(folioq, i);
89 
90 	return size;
91 }
92 
93 /*
94  * netfs_prepare_read_iterator - Prepare the subreq iterator for I/O
95  * @subreq: The subrequest to be set up
96  *
97  * Prepare the I/O iterator representing the read buffer on a subrequest for
98  * the filesystem to use for I/O (it can be passed directly to a socket).  This
99  * is intended to be called from the ->issue_read() method once the filesystem
100  * has trimmed the request to the size it wants.
101  *
102  * Returns the limited size if successful and -ENOMEM if insufficient memory
103  * available.
104  *
105  * [!] NOTE: This must be run in the same thread as ->issue_read() was called
106  * in as we access the readahead_control struct.
107  */
108 static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
109 {
110 	struct netfs_io_request *rreq = subreq->rreq;
111 	size_t rsize = subreq->len;
112 
113 	if (subreq->source == NETFS_DOWNLOAD_FROM_SERVER)
114 		rsize = umin(rsize, rreq->io_streams[0].sreq_max_len);
115 
116 	if (rreq->ractl) {
117 		/* If we don't have sufficient folios in the rolling buffer,
118 		 * extract a folioq's worth from the readahead region at a time
119 		 * into the buffer.  Note that this acquires a ref on each page
120 		 * that we will need to release later - but we don't want to do
121 		 * that until after we've started the I/O.
122 		 */
123 		while (rreq->submitted < subreq->start + rsize) {
124 			struct folio_queue *tail = rreq->buffer_tail, *new;
125 			size_t added;
126 
127 			new = kmalloc(sizeof(*new), GFP_NOFS);
128 			if (!new)
129 				return -ENOMEM;
130 			netfs_stat(&netfs_n_folioq);
131 			folioq_init(new);
132 			new->prev = tail;
133 			tail->next = new;
134 			rreq->buffer_tail = new;
135 			added = netfs_load_buffer_from_ra(rreq, new);
136 			rreq->iter.count += added;
137 			rreq->submitted += added;
138 		}
139 	}
140 
141 	subreq->len = rsize;
142 	if (unlikely(rreq->io_streams[0].sreq_max_segs)) {
143 		size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize,
144 						rreq->io_streams[0].sreq_max_segs);
145 
146 		if (limit < rsize) {
147 			subreq->len = limit;
148 			trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
149 		}
150 	}
151 
152 	subreq->io_iter	= rreq->iter;
153 
154 	if (iov_iter_is_folioq(&subreq->io_iter)) {
155 		if (subreq->io_iter.folioq_slot >= folioq_nr_slots(subreq->io_iter.folioq)) {
156 			subreq->io_iter.folioq = subreq->io_iter.folioq->next;
157 			subreq->io_iter.folioq_slot = 0;
158 		}
159 		subreq->curr_folioq = (struct folio_queue *)subreq->io_iter.folioq;
160 		subreq->curr_folioq_slot = subreq->io_iter.folioq_slot;
161 		subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot];
162 	}
163 
164 	iov_iter_truncate(&subreq->io_iter, subreq->len);
165 	iov_iter_advance(&rreq->iter, subreq->len);
166 	return subreq->len;
167 }
168 
169 static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_request *rreq,
170 						     struct netfs_io_subrequest *subreq,
171 						     loff_t i_size)
172 {
173 	struct netfs_cache_resources *cres = &rreq->cache_resources;
174 
175 	if (!cres->ops)
176 		return NETFS_DOWNLOAD_FROM_SERVER;
177 	return cres->ops->prepare_read(subreq, i_size);
178 }
179 
180 static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
181 					bool was_async)
182 {
183 	struct netfs_io_subrequest *subreq = priv;
184 
185 	if (transferred_or_error < 0) {
186 		netfs_read_subreq_terminated(subreq, transferred_or_error, was_async);
187 		return;
188 	}
189 
190 	if (transferred_or_error > 0)
191 		subreq->transferred += transferred_or_error;
192 	netfs_read_subreq_terminated(subreq, 0, was_async);
193 }
194 
195 /*
196  * Issue a read against the cache.
197  * - Eats the caller's ref on subreq.
198  */
199 static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq,
200 					  struct netfs_io_subrequest *subreq)
201 {
202 	struct netfs_cache_resources *cres = &rreq->cache_resources;
203 
204 	netfs_stat(&netfs_n_rh_read);
205 	cres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_IGNORE,
206 			netfs_cache_read_terminated, subreq);
207 }
208 
209 /*
210  * Perform a read to the pagecache from a series of sources of different types,
211  * slicing up the region to be read according to available cache blocks and
212  * network rsize.
213  */
214 static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
215 {
216 	struct netfs_inode *ictx = netfs_inode(rreq->inode);
217 	unsigned long long start = rreq->start;
218 	ssize_t size = rreq->len;
219 	int ret = 0;
220 
221 	atomic_inc(&rreq->nr_outstanding);
222 
223 	do {
224 		struct netfs_io_subrequest *subreq;
225 		enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
226 		ssize_t slice;
227 
228 		subreq = netfs_alloc_subrequest(rreq);
229 		if (!subreq) {
230 			ret = -ENOMEM;
231 			break;
232 		}
233 
234 		subreq->start	= start;
235 		subreq->len	= size;
236 
237 		atomic_inc(&rreq->nr_outstanding);
238 		spin_lock_bh(&rreq->lock);
239 		list_add_tail(&subreq->rreq_link, &rreq->subrequests);
240 		subreq->prev_donated = rreq->prev_donated;
241 		rreq->prev_donated = 0;
242 		trace_netfs_sreq(subreq, netfs_sreq_trace_added);
243 		spin_unlock_bh(&rreq->lock);
244 
245 		source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size);
246 		subreq->source = source;
247 		if (source == NETFS_DOWNLOAD_FROM_SERVER) {
248 			unsigned long long zp = umin(ictx->zero_point, rreq->i_size);
249 			size_t len = subreq->len;
250 
251 			if (subreq->start >= zp) {
252 				subreq->source = source = NETFS_FILL_WITH_ZEROES;
253 				goto fill_with_zeroes;
254 			}
255 
256 			if (len > zp - subreq->start)
257 				len = zp - subreq->start;
258 			if (len == 0) {
259 				pr_err("ZERO-LEN READ: R=%08x[%x] l=%zx/%zx s=%llx z=%llx i=%llx",
260 				       rreq->debug_id, subreq->debug_index,
261 				       subreq->len, size,
262 				       subreq->start, ictx->zero_point, rreq->i_size);
263 				break;
264 			}
265 			subreq->len = len;
266 
267 			netfs_stat(&netfs_n_rh_download);
268 			if (rreq->netfs_ops->prepare_read) {
269 				ret = rreq->netfs_ops->prepare_read(subreq);
270 				if (ret < 0) {
271 					atomic_dec(&rreq->nr_outstanding);
272 					netfs_put_subrequest(subreq, false,
273 							     netfs_sreq_trace_put_cancel);
274 					break;
275 				}
276 				trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
277 			}
278 
279 			slice = netfs_prepare_read_iterator(subreq);
280 			if (slice < 0) {
281 				atomic_dec(&rreq->nr_outstanding);
282 				netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
283 				ret = slice;
284 				break;
285 			}
286 
287 			rreq->netfs_ops->issue_read(subreq);
288 			goto done;
289 		}
290 
291 	fill_with_zeroes:
292 		if (source == NETFS_FILL_WITH_ZEROES) {
293 			subreq->source = NETFS_FILL_WITH_ZEROES;
294 			trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
295 			netfs_stat(&netfs_n_rh_zero);
296 			slice = netfs_prepare_read_iterator(subreq);
297 			__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
298 			netfs_read_subreq_terminated(subreq, 0, false);
299 			goto done;
300 		}
301 
302 		if (source == NETFS_READ_FROM_CACHE) {
303 			trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
304 			slice = netfs_prepare_read_iterator(subreq);
305 			netfs_read_cache_to_pagecache(rreq, subreq);
306 			goto done;
307 		}
308 
309 		pr_err("Unexpected read source %u\n", source);
310 		WARN_ON_ONCE(1);
311 		break;
312 
313 	done:
314 		size -= slice;
315 		start += slice;
316 		cond_resched();
317 	} while (size > 0);
318 
319 	if (atomic_dec_and_test(&rreq->nr_outstanding))
320 		netfs_rreq_terminated(rreq, false);
321 
322 	/* Defer error return as we may need to wait for outstanding I/O. */
323 	cmpxchg(&rreq->error, 0, ret);
324 }
325 
326 /*
327  * Wait for the read operation to complete, successfully or otherwise.
328  */
329 static int netfs_wait_for_read(struct netfs_io_request *rreq)
330 {
331 	int ret;
332 
333 	trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
334 	wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
335 	ret = rreq->error;
336 	if (ret == 0 && rreq->submitted < rreq->len) {
337 		trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
338 		ret = -EIO;
339 	}
340 
341 	return ret;
342 }
343 
344 /*
345  * Set up the initial folioq of buffer folios in the rolling buffer and set the
346  * iterator to refer to it.
347  */
348 static int netfs_prime_buffer(struct netfs_io_request *rreq)
349 {
350 	struct folio_queue *folioq;
351 	size_t added;
352 
353 	folioq = kmalloc(sizeof(*folioq), GFP_KERNEL);
354 	if (!folioq)
355 		return -ENOMEM;
356 	netfs_stat(&netfs_n_folioq);
357 	folioq_init(folioq);
358 	rreq->buffer = folioq;
359 	rreq->buffer_tail = folioq;
360 	rreq->submitted = rreq->start;
361 	iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, 0);
362 
363 	added = netfs_load_buffer_from_ra(rreq, folioq);
364 	rreq->iter.count += added;
365 	rreq->submitted += added;
366 	return 0;
367 }
368 
369 /*
370  * Drop the ref on each folio that we inherited from the VM readahead code.  We
371  * still have the folio locks to pin the page until we complete the I/O.
372  *
373  * Note that we can't just release the batch in each queue struct as we use the
374  * occupancy count in other places.
375  */
376 static void netfs_put_ra_refs(struct folio_queue *folioq)
377 {
378 	struct folio_batch fbatch;
379 
380 	folio_batch_init(&fbatch);
381 	while (folioq) {
382 		for (unsigned int slot = 0; slot < folioq_count(folioq); slot++) {
383 			struct folio *folio = folioq_folio(folioq, slot);
384 			if (!folio)
385 				continue;
386 			trace_netfs_folio(folio, netfs_folio_trace_read_put);
387 			if (!folio_batch_add(&fbatch, folio))
388 				folio_batch_release(&fbatch);
389 		}
390 		folioq = folioq->next;
391 	}
392 
393 	folio_batch_release(&fbatch);
394 }
395 
396 /**
397  * netfs_readahead - Helper to manage a read request
398  * @ractl: The description of the readahead request
399  *
400  * Fulfil a readahead request by drawing data from the cache if possible, or
401  * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
402  * requests from different sources will get munged together.  If necessary, the
403  * readahead window can be expanded in either direction to a more convenient
404  * alighment for RPC efficiency or to make storage in the cache feasible.
405  *
406  * The calling netfs must initialise a netfs context contiguous to the vfs
407  * inode before calling this.
408  *
409  * This is usable whether or not caching is enabled.
410  */
411 void netfs_readahead(struct readahead_control *ractl)
412 {
413 	struct netfs_io_request *rreq;
414 	struct netfs_inode *ictx = netfs_inode(ractl->mapping->host);
415 	unsigned long long start = readahead_pos(ractl);
416 	size_t size = readahead_length(ractl);
417 	int ret;
418 
419 	rreq = netfs_alloc_request(ractl->mapping, ractl->file, start, size,
420 				   NETFS_READAHEAD);
421 	if (IS_ERR(rreq))
422 		return;
423 
424 	ret = netfs_begin_cache_read(rreq, ictx);
425 	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
426 		goto cleanup_free;
427 
428 	netfs_stat(&netfs_n_rh_readahead);
429 	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
430 			 netfs_read_trace_readahead);
431 
432 	netfs_rreq_expand(rreq, ractl);
433 
434 	rreq->ractl = ractl;
435 	if (netfs_prime_buffer(rreq) < 0)
436 		goto cleanup_free;
437 	netfs_read_to_pagecache(rreq);
438 
439 	/* Release the folio refs whilst we're waiting for the I/O. */
440 	netfs_put_ra_refs(rreq->buffer);
441 
442 	netfs_put_request(rreq, true, netfs_rreq_trace_put_return);
443 	return;
444 
445 cleanup_free:
446 	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
447 	return;
448 }
449 EXPORT_SYMBOL(netfs_readahead);
450 
451 /*
452  * Create a rolling buffer with a single occupying folio.
453  */
454 static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio)
455 {
456 	struct folio_queue *folioq;
457 
458 	folioq = kmalloc(sizeof(*folioq), GFP_KERNEL);
459 	if (!folioq)
460 		return -ENOMEM;
461 
462 	netfs_stat(&netfs_n_folioq);
463 	folioq_init(folioq);
464 	folioq_append(folioq, folio);
465 	BUG_ON(folioq_folio(folioq, 0) != folio);
466 	BUG_ON(folioq_folio_order(folioq, 0) != folio_order(folio));
467 	rreq->buffer = folioq;
468 	rreq->buffer_tail = folioq;
469 	rreq->submitted = rreq->start + rreq->len;
470 	iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, rreq->len);
471 	rreq->ractl = (struct readahead_control *)1UL;
472 	return 0;
473 }
474 
475 /*
476  * Read into gaps in a folio partially filled by a streaming write.
477  */
478 static int netfs_read_gaps(struct file *file, struct folio *folio)
479 {
480 	struct netfs_io_request *rreq;
481 	struct address_space *mapping = folio->mapping;
482 	struct netfs_folio *finfo = netfs_folio_info(folio);
483 	struct netfs_inode *ctx = netfs_inode(mapping->host);
484 	struct folio *sink = NULL;
485 	struct bio_vec *bvec;
486 	unsigned int from = finfo->dirty_offset;
487 	unsigned int to = from + finfo->dirty_len;
488 	unsigned int off = 0, i = 0;
489 	size_t flen = folio_size(folio);
490 	size_t nr_bvec = flen / PAGE_SIZE + 2;
491 	size_t part;
492 	int ret;
493 
494 	_enter("%lx", folio->index);
495 
496 	rreq = netfs_alloc_request(mapping, file, folio_pos(folio), flen, NETFS_READ_GAPS);
497 	if (IS_ERR(rreq)) {
498 		ret = PTR_ERR(rreq);
499 		goto alloc_error;
500 	}
501 
502 	ret = netfs_begin_cache_read(rreq, ctx);
503 	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
504 		goto discard;
505 
506 	netfs_stat(&netfs_n_rh_read_folio);
507 	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_read_gaps);
508 
509 	/* Fiddle the buffer so that a gap at the beginning and/or a gap at the
510 	 * end get copied to, but the middle is discarded.
511 	 */
512 	ret = -ENOMEM;
513 	bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
514 	if (!bvec)
515 		goto discard;
516 
517 	sink = folio_alloc(GFP_KERNEL, 0);
518 	if (!sink) {
519 		kfree(bvec);
520 		goto discard;
521 	}
522 
523 	trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
524 
525 	rreq->direct_bv = bvec;
526 	rreq->direct_bv_count = nr_bvec;
527 	if (from > 0) {
528 		bvec_set_folio(&bvec[i++], folio, from, 0);
529 		off = from;
530 	}
531 	while (off < to) {
532 		part = min_t(size_t, to - off, PAGE_SIZE);
533 		bvec_set_folio(&bvec[i++], sink, part, 0);
534 		off += part;
535 	}
536 	if (to < flen)
537 		bvec_set_folio(&bvec[i++], folio, flen - to, to);
538 	iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
539 	rreq->submitted = rreq->start + flen;
540 
541 	netfs_read_to_pagecache(rreq);
542 
543 	if (sink)
544 		folio_put(sink);
545 
546 	ret = netfs_wait_for_read(rreq);
547 	if (ret == 0) {
548 		flush_dcache_folio(folio);
549 		folio_mark_uptodate(folio);
550 	}
551 	folio_unlock(folio);
552 	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
553 	return ret < 0 ? ret : 0;
554 
555 discard:
556 	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
557 alloc_error:
558 	folio_unlock(folio);
559 	return ret;
560 }
561 
562 /**
563  * netfs_read_folio - Helper to manage a read_folio request
564  * @file: The file to read from
565  * @folio: The folio to read
566  *
567  * Fulfil a read_folio request by drawing data from the cache if
568  * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
569  * Multiple I/O requests from different sources will get munged together.
570  *
571  * The calling netfs must initialise a netfs context contiguous to the vfs
572  * inode before calling this.
573  *
574  * This is usable whether or not caching is enabled.
575  */
576 int netfs_read_folio(struct file *file, struct folio *folio)
577 {
578 	struct address_space *mapping = folio->mapping;
579 	struct netfs_io_request *rreq;
580 	struct netfs_inode *ctx = netfs_inode(mapping->host);
581 	int ret;
582 
583 	if (folio_test_dirty(folio)) {
584 		trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
585 		return netfs_read_gaps(file, folio);
586 	}
587 
588 	_enter("%lx", folio->index);
589 
590 	rreq = netfs_alloc_request(mapping, file,
591 				   folio_pos(folio), folio_size(folio),
592 				   NETFS_READPAGE);
593 	if (IS_ERR(rreq)) {
594 		ret = PTR_ERR(rreq);
595 		goto alloc_error;
596 	}
597 
598 	ret = netfs_begin_cache_read(rreq, ctx);
599 	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
600 		goto discard;
601 
602 	netfs_stat(&netfs_n_rh_read_folio);
603 	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
604 
605 	/* Set up the output buffer */
606 	ret = netfs_create_singular_buffer(rreq, folio);
607 	if (ret < 0)
608 		goto discard;
609 
610 	netfs_read_to_pagecache(rreq);
611 	ret = netfs_wait_for_read(rreq);
612 	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
613 	return ret < 0 ? ret : 0;
614 
615 discard:
616 	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
617 alloc_error:
618 	folio_unlock(folio);
619 	return ret;
620 }
621 EXPORT_SYMBOL(netfs_read_folio);
622 
623 /*
624  * Prepare a folio for writing without reading first
625  * @folio: The folio being prepared
626  * @pos: starting position for the write
627  * @len: length of write
628  * @always_fill: T if the folio should always be completely filled/cleared
629  *
630  * In some cases, write_begin doesn't need to read at all:
631  * - full folio write
632  * - write that lies in a folio that is completely beyond EOF
633  * - write that covers the folio from start to EOF or beyond it
634  *
635  * If any of these criteria are met, then zero out the unwritten parts
636  * of the folio and return true. Otherwise, return false.
637  */
638 static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
639 				 bool always_fill)
640 {
641 	struct inode *inode = folio_inode(folio);
642 	loff_t i_size = i_size_read(inode);
643 	size_t offset = offset_in_folio(folio, pos);
644 	size_t plen = folio_size(folio);
645 
646 	if (unlikely(always_fill)) {
647 		if (pos - offset + len <= i_size)
648 			return false; /* Page entirely before EOF */
649 		zero_user_segment(&folio->page, 0, plen);
650 		folio_mark_uptodate(folio);
651 		return true;
652 	}
653 
654 	/* Full folio write */
655 	if (offset == 0 && len >= plen)
656 		return true;
657 
658 	/* Page entirely beyond the end of the file */
659 	if (pos - offset >= i_size)
660 		goto zero_out;
661 
662 	/* Write that covers from the start of the folio to EOF or beyond */
663 	if (offset == 0 && (pos + len) >= i_size)
664 		goto zero_out;
665 
666 	return false;
667 zero_out:
668 	zero_user_segments(&folio->page, 0, offset, offset + len, plen);
669 	return true;
670 }
671 
672 /**
673  * netfs_write_begin - Helper to prepare for writing [DEPRECATED]
674  * @ctx: The netfs context
675  * @file: The file to read from
676  * @mapping: The mapping to read from
677  * @pos: File position at which the write will begin
678  * @len: The length of the write (may extend beyond the end of the folio chosen)
679  * @_folio: Where to put the resultant folio
680  * @_fsdata: Place for the netfs to store a cookie
681  *
682  * Pre-read data for a write-begin request by drawing data from the cache if
683  * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
684  * Multiple I/O requests from different sources will get munged together.
685  *
686  * The calling netfs must provide a table of operations, only one of which,
687  * issue_read, is mandatory.
688  *
689  * The check_write_begin() operation can be provided to check for and flush
690  * conflicting writes once the folio is grabbed and locked.  It is passed a
691  * pointer to the fsdata cookie that gets returned to the VM to be passed to
692  * write_end.  It is permitted to sleep.  It should return 0 if the request
693  * should go ahead or it may return an error.  It may also unlock and put the
694  * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
695  * will cause the folio to be re-got and the process to be retried.
696  *
697  * The calling netfs must initialise a netfs context contiguous to the vfs
698  * inode before calling this.
699  *
700  * This is usable whether or not caching is enabled.
701  *
702  * Note that this should be considered deprecated and netfs_perform_write()
703  * used instead.
704  */
705 int netfs_write_begin(struct netfs_inode *ctx,
706 		      struct file *file, struct address_space *mapping,
707 		      loff_t pos, unsigned int len, struct folio **_folio,
708 		      void **_fsdata)
709 {
710 	struct netfs_io_request *rreq;
711 	struct folio *folio;
712 	pgoff_t index = pos >> PAGE_SHIFT;
713 	int ret;
714 
715 retry:
716 	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
717 				    mapping_gfp_mask(mapping));
718 	if (IS_ERR(folio))
719 		return PTR_ERR(folio);
720 
721 	if (ctx->ops->check_write_begin) {
722 		/* Allow the netfs (eg. ceph) to flush conflicts. */
723 		ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
724 		if (ret < 0) {
725 			trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
726 			goto error;
727 		}
728 		if (!folio)
729 			goto retry;
730 	}
731 
732 	if (folio_test_uptodate(folio))
733 		goto have_folio;
734 
735 	/* If the page is beyond the EOF, we want to clear it - unless it's
736 	 * within the cache granule containing the EOF, in which case we need
737 	 * to preload the granule.
738 	 */
739 	if (!netfs_is_cache_enabled(ctx) &&
740 	    netfs_skip_folio_read(folio, pos, len, false)) {
741 		netfs_stat(&netfs_n_rh_write_zskip);
742 		goto have_folio_no_wait;
743 	}
744 
745 	rreq = netfs_alloc_request(mapping, file,
746 				   folio_pos(folio), folio_size(folio),
747 				   NETFS_READ_FOR_WRITE);
748 	if (IS_ERR(rreq)) {
749 		ret = PTR_ERR(rreq);
750 		goto error;
751 	}
752 	rreq->no_unlock_folio	= folio->index;
753 	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
754 
755 	ret = netfs_begin_cache_read(rreq, ctx);
756 	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
757 		goto error_put;
758 
759 	netfs_stat(&netfs_n_rh_write_begin);
760 	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
761 
762 	/* Set up the output buffer */
763 	ret = netfs_create_singular_buffer(rreq, folio);
764 	if (ret < 0)
765 		goto error_put;
766 
767 	netfs_read_to_pagecache(rreq);
768 	ret = netfs_wait_for_read(rreq);
769 	if (ret < 0)
770 		goto error;
771 	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
772 
773 have_folio:
774 	ret = folio_wait_private_2_killable(folio);
775 	if (ret < 0)
776 		goto error;
777 have_folio_no_wait:
778 	*_folio = folio;
779 	_leave(" = 0");
780 	return 0;
781 
782 error_put:
783 	netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
784 error:
785 	if (folio) {
786 		folio_unlock(folio);
787 		folio_put(folio);
788 	}
789 	_leave(" = %d", ret);
790 	return ret;
791 }
792 EXPORT_SYMBOL(netfs_write_begin);
793 
794 /*
795  * Preload the data into a page we're proposing to write into.
796  */
797 int netfs_prefetch_for_write(struct file *file, struct folio *folio,
798 			     size_t offset, size_t len)
799 {
800 	struct netfs_io_request *rreq;
801 	struct address_space *mapping = folio->mapping;
802 	struct netfs_inode *ctx = netfs_inode(mapping->host);
803 	unsigned long long start = folio_pos(folio);
804 	size_t flen = folio_size(folio);
805 	int ret;
806 
807 	_enter("%zx @%llx", flen, start);
808 
809 	ret = -ENOMEM;
810 
811 	rreq = netfs_alloc_request(mapping, file, start, flen,
812 				   NETFS_READ_FOR_WRITE);
813 	if (IS_ERR(rreq)) {
814 		ret = PTR_ERR(rreq);
815 		goto error;
816 	}
817 
818 	rreq->no_unlock_folio = folio->index;
819 	__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
820 	ret = netfs_begin_cache_read(rreq, ctx);
821 	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
822 		goto error_put;
823 
824 	netfs_stat(&netfs_n_rh_write_begin);
825 	trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);
826 
827 	/* Set up the output buffer */
828 	ret = netfs_create_singular_buffer(rreq, folio);
829 	if (ret < 0)
830 		goto error_put;
831 
832 	folioq_mark2(rreq->buffer, 0);
833 	netfs_read_to_pagecache(rreq);
834 	ret = netfs_wait_for_read(rreq);
835 	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
836 	return ret;
837 
838 error_put:
839 	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
840 error:
841 	_leave(" = %d", ret);
842 	return ret;
843 }
844 
845 /**
846  * netfs_buffered_read_iter - Filesystem buffered I/O read routine
847  * @iocb: kernel I/O control block
848  * @iter: destination for the data read
849  *
850  * This is the ->read_iter() routine for all filesystems that can use the page
851  * cache directly.
852  *
853  * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
854  * returned when no data can be read without waiting for I/O requests to
855  * complete; it doesn't prevent readahead.
856  *
857  * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
858  * shall be made for the read or for readahead.  When no data can be read,
859  * -EAGAIN shall be returned.  When readahead would be triggered, a partial,
860  * possibly empty read shall be returned.
861  *
862  * Return:
863  * * number of bytes copied, even for partial reads
864  * * negative error code (or 0 if IOCB_NOIO) if nothing was read
865  */
866 ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
867 {
868 	struct inode *inode = file_inode(iocb->ki_filp);
869 	struct netfs_inode *ictx = netfs_inode(inode);
870 	ssize_t ret;
871 
872 	if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
873 			 test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
874 		return -EINVAL;
875 
876 	ret = netfs_start_io_read(inode);
877 	if (ret == 0) {
878 		ret = filemap_read(iocb, iter, 0);
879 		netfs_end_io_read(inode);
880 	}
881 	return ret;
882 }
883 EXPORT_SYMBOL(netfs_buffered_read_iter);
884 
885 /**
886  * netfs_file_read_iter - Generic filesystem read routine
887  * @iocb: kernel I/O control block
888  * @iter: destination for the data read
889  *
890  * This is the ->read_iter() routine for all filesystems that can use the page
891  * cache directly.
892  *
893  * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
894  * returned when no data can be read without waiting for I/O requests to
895  * complete; it doesn't prevent readahead.
896  *
897  * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
898  * shall be made for the read or for readahead.  When no data can be read,
899  * -EAGAIN shall be returned.  When readahead would be triggered, a partial,
900  * possibly empty read shall be returned.
901  *
902  * Return:
903  * * number of bytes copied, even for partial reads
904  * * negative error code (or 0 if IOCB_NOIO) if nothing was read
905  */
906 ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
907 {
908 	struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);
909 
910 	if ((iocb->ki_flags & IOCB_DIRECT) ||
911 	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
912 		return netfs_unbuffered_read_iter(iocb, iter);
913 
914 	return netfs_buffered_read_iter(iocb, iter);
915 }
916 EXPORT_SYMBOL(netfs_file_read_iter);
917