xref: /linux/fs/bcachefs/fs-io-pagecache.c (revision 13b25489b6f8bd73ed65f07928f7c27a481f1820)
1 // SPDX-License-Identifier: GPL-2.0
2 #ifndef NO_BCACHEFS_FS
3 
4 #include "bcachefs.h"
5 #include "btree_iter.h"
6 #include "extents.h"
7 #include "fs-io.h"
8 #include "fs-io-pagecache.h"
9 #include "subvolume.h"
10 
11 #include <linux/pagevec.h>
12 #include <linux/writeback.h>
13 
14 int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
15 				     loff_t start, u64 end,
16 				     fgf_t fgp_flags, gfp_t gfp,
17 				     folios *fs)
18 {
19 	struct folio *f;
20 	u64 pos = start;
21 	int ret = 0;
22 
23 	while (pos < end) {
24 		if ((u64) pos >= (u64) start + (1ULL << 20))
25 			fgp_flags &= ~FGP_CREAT;
26 
27 		ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
28 		if (ret)
29 			break;
30 
31 		f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
32 		if (IS_ERR_OR_NULL(f))
33 			break;
34 
35 		BUG_ON(fs->nr && folio_pos(f) != pos);
36 
37 		pos = folio_end_pos(f);
38 		darray_push(fs, f);
39 	}
40 
41 	if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
42 		ret = -ENOMEM;
43 
44 	return fs->nr ? 0 : ret;
45 }
46 
47 /* pagecache_block must be held */
48 int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
49 					    loff_t start, loff_t end)
50 {
51 	int ret;
52 
53 	/*
54 	 * XXX: the way this is currently implemented, we can spin if a process
55 	 * is continually redirtying a specific page
56 	 */
57 	do {
58 		if (!mapping->nrpages)
59 			return 0;
60 
61 		ret = filemap_write_and_wait_range(mapping, start, end);
62 		if (ret)
63 			break;
64 
65 		if (!mapping->nrpages)
66 			return 0;
67 
68 		ret = invalidate_inode_pages2_range(mapping,
69 				start >> PAGE_SHIFT,
70 				end >> PAGE_SHIFT);
71 	} while (ret == -EBUSY);
72 
73 	return ret;
74 }
75 
#if 0
/*
 * Useful for debug tracing: a NULL-terminated table of strings generated by
 * stringifying each entry of BCH_FOLIO_SECTOR_STATE(). Currently compiled out.
 */
static const char * const bch2_folio_sector_states[] = {
#define x(n)	#n,
	BCH_FOLIO_SECTOR_STATE()
#undef x
	NULL
};
#endif
85 
86 static inline enum bch_folio_sector_state
87 folio_sector_dirty(enum bch_folio_sector_state state)
88 {
89 	switch (state) {
90 	case SECTOR_unallocated:
91 		return SECTOR_dirty;
92 	case SECTOR_reserved:
93 		return SECTOR_dirty_reserved;
94 	default:
95 		return state;
96 	}
97 }
98 
99 static inline enum bch_folio_sector_state
100 folio_sector_undirty(enum bch_folio_sector_state state)
101 {
102 	switch (state) {
103 	case SECTOR_dirty:
104 		return SECTOR_unallocated;
105 	case SECTOR_dirty_reserved:
106 		return SECTOR_reserved;
107 	default:
108 		return state;
109 	}
110 }
111 
112 static inline enum bch_folio_sector_state
113 folio_sector_reserve(enum bch_folio_sector_state state)
114 {
115 	switch (state) {
116 	case SECTOR_unallocated:
117 		return SECTOR_reserved;
118 	case SECTOR_dirty:
119 		return SECTOR_dirty_reserved;
120 	default:
121 		return state;
122 	}
123 }
124 
125 /* for newly allocated folios: */
126 struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
127 {
128 	struct bch_folio *s;
129 
130 	s = kzalloc(sizeof(*s) +
131 		    sizeof(struct bch_folio_sector) *
132 		    folio_sectors(folio), gfp);
133 	if (!s)
134 		return NULL;
135 
136 	spin_lock_init(&s->lock);
137 	folio_attach_private(folio, s);
138 	return s;
139 }
140 
141 struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
142 {
143 	return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
144 }
145 
146 static unsigned bkey_to_sector_state(struct bkey_s_c k)
147 {
148 	if (bkey_extent_is_reservation(k))
149 		return SECTOR_reserved;
150 	if (bkey_extent_is_allocation(k.k))
151 		return SECTOR_allocated;
152 	return SECTOR_unallocated;
153 }
154 
155 static void __bch2_folio_set(struct folio *folio,
156 			     unsigned pg_offset, unsigned pg_len,
157 			     unsigned nr_ptrs, unsigned state)
158 {
159 	struct bch_folio *s = bch2_folio(folio);
160 	unsigned i, sectors = folio_sectors(folio);
161 
162 	BUG_ON(pg_offset >= sectors);
163 	BUG_ON(pg_offset + pg_len > sectors);
164 
165 	spin_lock(&s->lock);
166 
167 	for (i = pg_offset; i < pg_offset + pg_len; i++) {
168 		s->s[i].nr_replicas	= nr_ptrs;
169 		bch2_folio_sector_set(folio, s, i, state);
170 	}
171 
172 	if (i == sectors)
173 		s->uptodate = true;
174 
175 	spin_unlock(&s->lock);
176 }
177 
178 /*
179  * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
180  * extents btree:
181  */
182 int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
183 		   struct folio **fs, unsigned nr_folios)
184 {
185 	u64 offset = folio_sector(fs[0]);
186 	bool need_set = false;
187 
188 	for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
189 		struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
190 		if (!s)
191 			return -ENOMEM;
192 
193 		need_set |= !s->uptodate;
194 	}
195 
196 	if (!need_set)
197 		return 0;
198 
199 	unsigned folio_idx = 0;
200 
201 	return bch2_trans_run(c,
202 		for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
203 				   POS(inum.inum, offset),
204 				   POS(inum.inum, U64_MAX),
205 				   inum.subvol, BTREE_ITER_slots, k, ({
206 			unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
207 			unsigned state = bkey_to_sector_state(k);
208 
209 			while (folio_idx < nr_folios) {
210 				struct folio *folio = fs[folio_idx];
211 				u64 folio_start	= folio_sector(folio);
212 				u64 folio_end	= folio_end_sector(folio);
213 				unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
214 					folio_start;
215 				unsigned folio_len = min(k.k->p.offset, folio_end) -
216 					folio_offset - folio_start;
217 
218 				BUG_ON(k.k->p.offset < folio_start);
219 				BUG_ON(bkey_start_offset(k.k) > folio_end);
220 
221 				if (!bch2_folio(folio)->uptodate)
222 					__bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
223 
224 				if (k.k->p.offset < folio_end)
225 					break;
226 				folio_idx++;
227 			}
228 
229 			if (folio_idx == nr_folios)
230 				break;
231 			0;
232 		})));
233 }
234 
235 void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
236 {
237 	struct bvec_iter iter;
238 	struct folio_vec fv;
239 	unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
240 		? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
241 	unsigned state = bkey_to_sector_state(k);
242 
243 	bio_for_each_folio(fv, bio, iter)
244 		__bch2_folio_set(fv.fv_folio,
245 				 fv.fv_offset >> 9,
246 				 fv.fv_len >> 9,
247 				 nr_ptrs, state);
248 }
249 
250 void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
251 				     u64 start, u64 end)
252 {
253 	pgoff_t index = start >> PAGE_SECTORS_SHIFT;
254 	pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
255 	struct folio_batch fbatch;
256 	unsigned i, j;
257 
258 	if (end <= start)
259 		return;
260 
261 	folio_batch_init(&fbatch);
262 
263 	while (filemap_get_folios(inode->v.i_mapping,
264 				  &index, end_index, &fbatch)) {
265 		for (i = 0; i < folio_batch_count(&fbatch); i++) {
266 			struct folio *folio = fbatch.folios[i];
267 			u64 folio_start = folio_sector(folio);
268 			u64 folio_end = folio_end_sector(folio);
269 			unsigned folio_offset = max(start, folio_start) - folio_start;
270 			unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
271 			struct bch_folio *s;
272 
273 			BUG_ON(end <= folio_start);
274 
275 			folio_lock(folio);
276 			s = bch2_folio(folio);
277 
278 			if (s) {
279 				spin_lock(&s->lock);
280 				for (j = folio_offset; j < folio_offset + folio_len; j++)
281 					s->s[j].nr_replicas = 0;
282 				spin_unlock(&s->lock);
283 			}
284 
285 			folio_unlock(folio);
286 		}
287 		folio_batch_release(&fbatch);
288 		cond_resched();
289 	}
290 }
291 
292 int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
293 				 u64 *start, u64 end,
294 				 bool nonblocking)
295 {
296 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
297 	pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
298 	pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
299 	struct folio_batch fbatch;
300 	s64 i_sectors_delta = 0;
301 	int ret = 0;
302 
303 	if (end <= *start)
304 		return 0;
305 
306 	folio_batch_init(&fbatch);
307 
308 	while (filemap_get_folios(inode->v.i_mapping,
309 				  &index, end_index, &fbatch)) {
310 		for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
311 			struct folio *folio = fbatch.folios[i];
312 
313 			if (!nonblocking)
314 				folio_lock(folio);
315 			else if (!folio_trylock(folio)) {
316 				folio_batch_release(&fbatch);
317 				ret = -EAGAIN;
318 				break;
319 			}
320 
321 			u64 folio_start = folio_sector(folio);
322 			u64 folio_end = folio_end_sector(folio);
323 
324 			BUG_ON(end <= folio_start);
325 
326 			*start = min(end, folio_end);
327 
328 			struct bch_folio *s = bch2_folio(folio);
329 			if (s) {
330 				unsigned folio_offset = max(*start, folio_start) - folio_start;
331 				unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
332 
333 				spin_lock(&s->lock);
334 				for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
335 					i_sectors_delta -= s->s[j].state == SECTOR_dirty;
336 					bch2_folio_sector_set(folio, s, j,
337 						folio_sector_reserve(s->s[j].state));
338 				}
339 				spin_unlock(&s->lock);
340 			}
341 
342 			folio_unlock(folio);
343 		}
344 		folio_batch_release(&fbatch);
345 		cond_resched();
346 	}
347 
348 	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
349 	return ret;
350 }
351 
352 static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
353 					  unsigned nr_replicas)
354 {
355 	return max(0, (int) nr_replicas -
356 		   s->nr_replicas -
357 		   s->replicas_reserved);
358 }
359 
360 int bch2_get_folio_disk_reservation(struct bch_fs *c,
361 				struct bch_inode_info *inode,
362 				struct folio *folio, bool check_enospc)
363 {
364 	struct bch_folio *s = bch2_folio_create(folio, 0);
365 	unsigned nr_replicas = inode_nr_replicas(c, inode);
366 	struct disk_reservation disk_res = { 0 };
367 	unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
368 	int ret;
369 
370 	if (!s)
371 		return -ENOMEM;
372 
373 	for (i = 0; i < sectors; i++)
374 		disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
375 
376 	if (!disk_res_sectors)
377 		return 0;
378 
379 	ret = bch2_disk_reservation_get(c, &disk_res,
380 					disk_res_sectors, 1,
381 					!check_enospc
382 					? BCH_DISK_RESERVATION_NOFAIL
383 					: 0);
384 	if (unlikely(ret))
385 		return ret;
386 
387 	for (i = 0; i < sectors; i++)
388 		s->s[i].replicas_reserved +=
389 			sectors_to_reserve(&s->s[i], nr_replicas);
390 
391 	return 0;
392 }
393 
394 void bch2_folio_reservation_put(struct bch_fs *c,
395 			struct bch_inode_info *inode,
396 			struct bch2_folio_reservation *res)
397 {
398 	bch2_disk_reservation_put(c, &res->disk);
399 	bch2_quota_reservation_put(c, inode, &res->quota);
400 }
401 
402 static int __bch2_folio_reservation_get(struct bch_fs *c,
403 			struct bch_inode_info *inode,
404 			struct folio *folio,
405 			struct bch2_folio_reservation *res,
406 			size_t offset, size_t len,
407 			bool partial)
408 {
409 	struct bch_folio *s = bch2_folio_create(folio, 0);
410 	unsigned i, disk_sectors = 0, quota_sectors = 0;
411 	struct disk_reservation disk_res = {};
412 	size_t reserved = len;
413 	int ret;
414 
415 	if (!s)
416 		return -ENOMEM;
417 
418 	BUG_ON(!s->uptodate);
419 
420 	for (i = round_down(offset, block_bytes(c)) >> 9;
421 	     i < round_up(offset + len, block_bytes(c)) >> 9;
422 	     i++) {
423 		disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
424 		quota_sectors += s->s[i].state == SECTOR_unallocated;
425 	}
426 
427 	if (disk_sectors) {
428 		ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors,
429 				partial ? BCH_DISK_RESERVATION_PARTIAL : 0);
430 		if (unlikely(ret))
431 			return ret;
432 
433 		if (unlikely(disk_res.sectors != disk_sectors)) {
434 			disk_sectors = quota_sectors = 0;
435 
436 			for (i = round_down(offset, block_bytes(c)) >> 9;
437 			     i < round_up(offset + len, block_bytes(c)) >> 9;
438 			     i++) {
439 				disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
440 				if (disk_sectors > disk_res.sectors) {
441 					/*
442 					 * Make sure to get a reservation that's
443 					 * aligned to the filesystem blocksize:
444 					 */
445 					unsigned reserved_offset = round_down(i << 9, block_bytes(c));
446 					reserved = clamp(reserved_offset, offset, offset + len) - offset;
447 
448 					if (!reserved) {
449 						bch2_disk_reservation_put(c, &disk_res);
450 						return -BCH_ERR_ENOSPC_disk_reservation;
451 					}
452 					break;
453 				}
454 				quota_sectors += s->s[i].state == SECTOR_unallocated;
455 			}
456 		}
457 	}
458 
459 	if (quota_sectors) {
460 		ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true);
461 		if (unlikely(ret)) {
462 			bch2_disk_reservation_put(c, &disk_res);
463 			return ret;
464 		}
465 	}
466 
467 	res->disk.sectors += disk_res.sectors;
468 	return partial ? reserved : 0;
469 }
470 
471 int bch2_folio_reservation_get(struct bch_fs *c,
472 			struct bch_inode_info *inode,
473 			struct folio *folio,
474 			struct bch2_folio_reservation *res,
475 			size_t offset, size_t len)
476 {
477 	return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false);
478 }
479 
480 ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
481 			struct bch_inode_info *inode,
482 			struct folio *folio,
483 			struct bch2_folio_reservation *res,
484 			size_t offset, size_t len)
485 {
486 	return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true);
487 }
488 
489 static void bch2_clear_folio_bits(struct folio *folio)
490 {
491 	struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
492 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
493 	struct bch_folio *s = bch2_folio(folio);
494 	struct disk_reservation disk_res = { 0 };
495 	int i, sectors = folio_sectors(folio), dirty_sectors = 0;
496 
497 	if (!s)
498 		return;
499 
500 	EBUG_ON(!folio_test_locked(folio));
501 	EBUG_ON(folio_test_writeback(folio));
502 
503 	for (i = 0; i < sectors; i++) {
504 		disk_res.sectors += s->s[i].replicas_reserved;
505 		s->s[i].replicas_reserved = 0;
506 
507 		dirty_sectors -= s->s[i].state == SECTOR_dirty;
508 		bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
509 	}
510 
511 	bch2_disk_reservation_put(c, &disk_res);
512 
513 	bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);
514 
515 	bch2_folio_release(folio);
516 }
517 
518 void bch2_set_folio_dirty(struct bch_fs *c,
519 			  struct bch_inode_info *inode,
520 			  struct folio *folio,
521 			  struct bch2_folio_reservation *res,
522 			  unsigned offset, unsigned len)
523 {
524 	struct bch_folio *s = bch2_folio(folio);
525 	unsigned i, dirty_sectors = 0;
526 
527 	WARN_ON((u64) folio_pos(folio) + offset + len >
528 		round_up((u64) i_size_read(&inode->v), block_bytes(c)));
529 
530 	BUG_ON(!s->uptodate);
531 
532 	spin_lock(&s->lock);
533 
534 	for (i = round_down(offset, block_bytes(c)) >> 9;
535 	     i < round_up(offset + len, block_bytes(c)) >> 9;
536 	     i++) {
537 		unsigned sectors = sectors_to_reserve(&s->s[i],
538 						res->disk.nr_replicas);
539 
540 		/*
541 		 * This can happen if we race with the error path in
542 		 * bch2_writepage_io_done():
543 		 */
544 		sectors = min_t(unsigned, sectors, res->disk.sectors);
545 
546 		s->s[i].replicas_reserved += sectors;
547 		res->disk.sectors -= sectors;
548 
549 		dirty_sectors += s->s[i].state == SECTOR_unallocated;
550 
551 		bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
552 	}
553 
554 	spin_unlock(&s->lock);
555 
556 	bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
557 
558 	if (!folio_test_dirty(folio))
559 		filemap_dirty_folio(inode->v.i_mapping, folio);
560 }
561 
562 vm_fault_t bch2_page_fault(struct vm_fault *vmf)
563 {
564 	struct file *file = vmf->vma->vm_file;
565 	struct address_space *mapping = file->f_mapping;
566 	struct address_space *fdm = faults_disabled_mapping();
567 	struct bch_inode_info *inode = file_bch_inode(file);
568 	vm_fault_t ret;
569 
570 	if (fdm == mapping)
571 		return VM_FAULT_SIGBUS;
572 
573 	/* Lock ordering: */
574 	if (fdm > mapping) {
575 		struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
576 
577 		if (bch2_pagecache_add_tryget(inode))
578 			goto got_lock;
579 
580 		bch2_pagecache_block_put(fdm_host);
581 
582 		bch2_pagecache_add_get(inode);
583 		bch2_pagecache_add_put(inode);
584 
585 		bch2_pagecache_block_get(fdm_host);
586 
587 		/* Signal that lock has been dropped: */
588 		set_fdm_dropped_locks();
589 		return VM_FAULT_SIGBUS;
590 	}
591 
592 	bch2_pagecache_add_get(inode);
593 got_lock:
594 	ret = filemap_fault(vmf);
595 	bch2_pagecache_add_put(inode);
596 
597 	return ret;
598 }
599 
600 vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
601 {
602 	struct folio *folio = page_folio(vmf->page);
603 	struct file *file = vmf->vma->vm_file;
604 	struct bch_inode_info *inode = file_bch_inode(file);
605 	struct address_space *mapping = file->f_mapping;
606 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
607 	struct bch2_folio_reservation res;
608 	unsigned len;
609 	loff_t isize;
610 	vm_fault_t ret;
611 
612 	bch2_folio_reservation_init(c, inode, &res);
613 
614 	sb_start_pagefault(inode->v.i_sb);
615 	file_update_time(file);
616 
617 	/*
618 	 * Not strictly necessary, but helps avoid dio writes livelocking in
619 	 * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get
620 	 * a bch2_write_invalidate_inode_pages_range() that works without dropping
621 	 * page lock before invalidating page
622 	 */
623 	bch2_pagecache_add_get(inode);
624 
625 	folio_lock(folio);
626 	isize = i_size_read(&inode->v);
627 
628 	if (folio->mapping != mapping || folio_pos(folio) >= isize) {
629 		folio_unlock(folio);
630 		ret = VM_FAULT_NOPAGE;
631 		goto out;
632 	}
633 
634 	len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
635 
636 	if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
637 	    bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
638 		folio_unlock(folio);
639 		ret = VM_FAULT_SIGBUS;
640 		goto out;
641 	}
642 
643 	bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
644 	bch2_folio_reservation_put(c, inode, &res);
645 
646 	folio_wait_stable(folio);
647 	ret = VM_FAULT_LOCKED;
648 out:
649 	bch2_pagecache_add_put(inode);
650 	sb_end_pagefault(inode->v.i_sb);
651 
652 	return ret;
653 }
654 
655 void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
656 {
657 	if (offset || length < folio_size(folio))
658 		return;
659 
660 	bch2_clear_folio_bits(folio);
661 }
662 
663 bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
664 {
665 	if (folio_test_dirty(folio) || folio_test_writeback(folio))
666 		return false;
667 
668 	bch2_clear_folio_bits(folio);
669 	return true;
670 }
671 
672 /* fseek: */
673 
674 static int folio_data_offset(struct folio *folio, loff_t pos,
675 			     unsigned min_replicas)
676 {
677 	struct bch_folio *s = bch2_folio(folio);
678 	unsigned i, sectors = folio_sectors(folio);
679 
680 	if (s)
681 		for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
682 			if (s->s[i].state >= SECTOR_dirty &&
683 			    s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
684 				return i << SECTOR_SHIFT;
685 
686 	return -1;
687 }
688 
689 loff_t bch2_seek_pagecache_data(struct inode *vinode,
690 				loff_t start_offset,
691 				loff_t end_offset,
692 				unsigned min_replicas,
693 				bool nonblock)
694 {
695 	struct folio_batch fbatch;
696 	pgoff_t start_index	= start_offset >> PAGE_SHIFT;
697 	pgoff_t end_index	= end_offset >> PAGE_SHIFT;
698 	pgoff_t index		= start_index;
699 	unsigned i;
700 	loff_t ret;
701 	int offset;
702 
703 	folio_batch_init(&fbatch);
704 
705 	while (filemap_get_folios(vinode->i_mapping,
706 				  &index, end_index, &fbatch)) {
707 		for (i = 0; i < folio_batch_count(&fbatch); i++) {
708 			struct folio *folio = fbatch.folios[i];
709 
710 			if (!nonblock) {
711 				folio_lock(folio);
712 			} else if (!folio_trylock(folio)) {
713 				folio_batch_release(&fbatch);
714 				return -EAGAIN;
715 			}
716 
717 			offset = folio_data_offset(folio,
718 					max(folio_pos(folio), start_offset),
719 					min_replicas);
720 			if (offset >= 0) {
721 				ret = clamp(folio_pos(folio) + offset,
722 					    start_offset, end_offset);
723 				folio_unlock(folio);
724 				folio_batch_release(&fbatch);
725 				return ret;
726 			}
727 			folio_unlock(folio);
728 		}
729 		folio_batch_release(&fbatch);
730 		cond_resched();
731 	}
732 
733 	return end_offset;
734 }
735 
736 /*
737  * Search for a hole in a folio.
738  *
739  * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
740  * code to indicate a pagecache hole exists at the returned offset. Otherwise
741  * return 0 if the folio is filled with data, or an error code. This function
742  * can return -EAGAIN if nonblock is specified.
743  */
744 static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
745 			      unsigned min_replicas, bool nonblock)
746 {
747 	struct folio *folio;
748 	struct bch_folio *s;
749 	unsigned i, sectors;
750 	int ret = -ENOENT;
751 
752 	folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
753 				    FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
754 	if (IS_ERR(folio))
755 		return PTR_ERR(folio);
756 
757 	s = bch2_folio(folio);
758 	if (!s)
759 		goto unlock;
760 
761 	sectors = folio_sectors(folio);
762 	for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
763 		if (s->s[i].state < SECTOR_dirty ||
764 		    s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
765 			*offset = max(*offset,
766 				      folio_pos(folio) + (i << SECTOR_SHIFT));
767 			goto unlock;
768 		}
769 
770 	*offset = folio_end_pos(folio);
771 	ret = 0;
772 unlock:
773 	folio_unlock(folio);
774 	folio_put(folio);
775 	return ret;
776 }
777 
778 loff_t bch2_seek_pagecache_hole(struct inode *vinode,
779 				loff_t start_offset,
780 				loff_t end_offset,
781 				unsigned min_replicas,
782 				bool nonblock)
783 {
784 	struct address_space *mapping = vinode->i_mapping;
785 	loff_t offset = start_offset;
786 	loff_t ret = 0;
787 
788 	while (!ret && offset < end_offset)
789 		ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock);
790 
791 	if (ret && ret != -ENOENT)
792 		return ret;
793 	return min(offset, end_offset);
794 }
795 
796 int bch2_clamp_data_hole(struct inode *inode,
797 			 u64 *hole_start,
798 			 u64 *hole_end,
799 			 unsigned min_replicas,
800 			 bool nonblock)
801 {
802 	loff_t ret;
803 
804 	ret = bch2_seek_pagecache_hole(inode,
805 		*hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
806 	if (ret < 0)
807 		return ret;
808 
809 	*hole_start = ret;
810 
811 	if (*hole_start == *hole_end)
812 		return 0;
813 
814 	ret = bch2_seek_pagecache_data(inode,
815 		*hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
816 	if (ret < 0)
817 		return ret;
818 
819 	*hole_end = ret;
820 	return 0;
821 }
822 
823 #endif /* NO_BCACHEFS_FS */
824