xref: /linux/fs/buffer.c (revision 6b3f7af57881f6d6250c6dcc4d910fe8e855a607)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  linux/fs/buffer.c
4  *
5  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
6  */
7 
8 /*
9  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10  *
11  * Removed a lot of unnecessary code and simplified things now that
12  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13  *
14  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
15  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
16  *
17  * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18  *
19  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
20  */
21 
22 #include <linux/kernel.h>
23 #include <linux/sched/signal.h>
24 #include <linux/syscalls.h>
25 #include <linux/fs.h>
26 #include <linux/iomap.h>
27 #include <linux/mm.h>
28 #include <linux/percpu.h>
29 #include <linux/slab.h>
30 #include <linux/capability.h>
31 #include <linux/blkdev.h>
32 #include <linux/blk-crypto.h>
33 #include <linux/file.h>
34 #include <linux/quotaops.h>
35 #include <linux/highmem.h>
36 #include <linux/export.h>
37 #include <linux/backing-dev.h>
38 #include <linux/writeback.h>
39 #include <linux/hash.h>
40 #include <linux/suspend.h>
41 #include <linux/buffer_head.h>
42 #include <linux/task_io_accounting_ops.h>
43 #include <linux/bio.h>
44 #include <linux/cpu.h>
45 #include <linux/bitops.h>
46 #include <linux/mpage.h>
47 #include <linux/bit_spinlock.h>
48 #include <linux/folio_batch.h>
49 #include <linux/sched/mm.h>
50 #include <trace/events/block.h>
51 #include <linux/fscrypt.h>
52 #include <linux/fsverity.h>
53 #include <linux/sched/isolation.h>
54 
55 #include "internal.h"
56 
/* Map a b_assoc_buffers list node back to the buffer_head embedding it. */
#define BH_ENTRY(list) container_of((list), struct buffer_head, b_assoc_buffers)
58 
59 inline void touch_buffer(struct buffer_head *bh)
60 {
61 	trace_block_touch_buffer(bh);
62 	folio_mark_accessed(bh->b_folio);
63 }
64 EXPORT_SYMBOL(touch_buffer);
65 
66 void __lock_buffer(struct buffer_head *bh)
67 {
68 	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
69 }
70 EXPORT_SYMBOL(__lock_buffer);
71 
72 void unlock_buffer(struct buffer_head *bh)
73 {
74 	clear_and_wake_up_bit(BH_Lock, &bh->b_state);
75 }
76 EXPORT_SYMBOL(unlock_buffer);
77 
78 /*
79  * Returns if the folio has dirty or writeback buffers. If all the buffers
80  * are unlocked and clean then the folio_test_dirty information is stale. If
81  * any of the buffers are locked, it is assumed they are locked for IO.
82  */
83 void buffer_check_dirty_writeback(struct folio *folio,
84 				     bool *dirty, bool *writeback)
85 {
86 	struct buffer_head *head, *bh;
87 	*dirty = false;
88 	*writeback = false;
89 
90 	BUG_ON(!folio_test_locked(folio));
91 
92 	head = folio_buffers(folio);
93 	if (!head)
94 		return;
95 
96 	if (folio_test_writeback(folio))
97 		*writeback = true;
98 
99 	bh = head;
100 	do {
101 		if (buffer_locked(bh))
102 			*writeback = true;
103 
104 		if (buffer_dirty(bh))
105 			*dirty = true;
106 
107 		bh = bh->b_this_page;
108 	} while (bh != head);
109 }
110 
111 /*
112  * Block until a buffer comes unlocked.  This doesn't stop it
113  * from becoming locked again - you have to lock it yourself
114  * if you want to preserve its state.
115  */
116 void __wait_on_buffer(struct buffer_head * bh)
117 {
118 	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
119 }
120 EXPORT_SYMBOL(__wait_on_buffer);
121 
122 static void buffer_io_error(struct buffer_head *bh, char *msg)
123 {
124 	if (!test_bit(BH_Quiet, &bh->b_state))
125 		printk_ratelimited(KERN_ERR
126 			"Buffer I/O error on dev %pg, logical block %llu%s\n",
127 			bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
128 }
129 
130 /**
131  * bio_endio_bh - Discard the bio used to submit a buffer.
132  * @bio: The bio.
133  * @bhp: Where to return the buffer_head.
134  *
135  * Call this in your bio_end_io handler to retrieve the buffer_head
136  * submitted in bh_submit().  If you did not call bh_submit(), do not
137  * call this function; it will return garbage.
138  *
139  * This function consumes the bio refcount which will probably free the
140  * bio.
141  *
142  * Return: True if the I/O succeeded.
143  */
144 bool bio_endio_bh(struct bio *bio, struct buffer_head **bhp)
145 {
146 	bool success = bio->bi_status == BLK_STS_OK;
147 	struct buffer_head *bh = bio->bi_private;
148 
149 	if (unlikely(bio_flagged(bio, BIO_QUIET)))
150 		set_bit(BH_Quiet, &bh->b_state);
151 	bio_put(bio);
152 
153 	*bhp = bh;
154 	return success;
155 }
156 EXPORT_SYMBOL(bio_endio_bh);
157 
/**
 * end_buffer_read_sync - Handle buffer reads finishing
 * @bh: The buffer.
 * @uptodate: True if the read was successful.
 *
 * For buffers read through a mechanism other than bh_submit(): records
 * the outcome in the buffer's uptodate bit and then unlocks the buffer.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	if (!uptodate) {
		/* This happens, due to failed read-ahead attempts. */
		clear_buffer_uptodate(bh);
	} else {
		set_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);
177 
178 /**
179  * bh_end_read - I/O end handler for reads
180  * @bio: The bio being completed.
181  *
182  * Pass this function to bh_submit() if you're reading into the buffer,
183  * unless you need your own special I/O end handler.
184  */
185 void bh_end_read(struct bio *bio)
186 {
187 	struct buffer_head *bh;
188 	bool uptodate = bio_endio_bh(bio, &bh);
189 	end_buffer_read_sync(bh, uptodate);
190 }
191 EXPORT_SYMBOL(bh_end_read);
192 
/**
 * bh_end_write - I/O end handler for writes
 * @bio: The bio being completed.
 *
 * Pass this function to bh_submit() if you're writing from the buffer,
 * unless you need your own special I/O end handler.
 *
 * On success the buffer is marked uptodate.  On failure the error is
 * logged, the buffer is flagged with mark_buffer_write_io_error() and its
 * uptodate bit is cleared.  The buffer is unlocked in both cases.
 */
void bh_end_write(struct bio *bio)
{
	struct buffer_head *bh;

	if (!bio_endio_bh(bio, &bh)) {
		buffer_io_error(bh, ", lost sync page write");
		mark_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	} else {
		set_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}
EXPORT_SYMBOL(bh_end_write);
215 
216 static struct buffer_head *
217 __find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
218 {
219 	struct address_space *bd_mapping = bdev->bd_mapping;
220 	const int blkbits = bd_mapping->host->i_blkbits;
221 	struct buffer_head *ret = NULL;
222 	pgoff_t index;
223 	struct buffer_head *bh;
224 	struct buffer_head *head;
225 	struct folio *folio;
226 	int all_mapped = 1;
227 	static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
228 
229 	index = ((loff_t)block << blkbits) / PAGE_SIZE;
230 	folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
231 	if (IS_ERR(folio))
232 		goto out;
233 
234 	/*
235 	 * Folio lock protects the buffers. Callers that cannot block
236 	 * will fallback to serializing vs try_to_free_buffers() via
237 	 * the i_private_lock.
238 	 */
239 	if (atomic)
240 		spin_lock(&bd_mapping->i_private_lock);
241 	else
242 		folio_lock(folio);
243 
244 	head = folio_buffers(folio);
245 	if (!head)
246 		goto out_unlock;
247 	/*
248 	 * Upon a noref migration, the folio lock serializes here;
249 	 * otherwise bail.
250 	 */
251 	if (test_bit_acquire(BH_Migrate, &head->b_state)) {
252 		WARN_ON(!atomic);
253 		goto out_unlock;
254 	}
255 
256 	bh = head;
257 	do {
258 		if (!buffer_mapped(bh))
259 			all_mapped = 0;
260 		else if (bh->b_blocknr == block) {
261 			ret = bh;
262 			get_bh(bh);
263 			goto out_unlock;
264 		}
265 		bh = bh->b_this_page;
266 	} while (bh != head);
267 
268 	/* we might be here because some of the buffers on this page are
269 	 * not mapped.  This is due to various races between
270 	 * file io on the block device and getblk.  It gets dealt with
271 	 * elsewhere, don't buffer_error if we had some unmapped buffers
272 	 */
273 	ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
274 	if (all_mapped && __ratelimit(&last_warned)) {
275 		printk("__find_get_block_slow() failed. block=%llu, "
276 		       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
277 		       "device %pg blocksize: %d\n",
278 		       (unsigned long long)block,
279 		       (unsigned long long)bh->b_blocknr,
280 		       bh->b_state, bh->b_size, bdev,
281 		       1 << blkbits);
282 	}
283 out_unlock:
284 	if (atomic)
285 		spin_unlock(&bd_mapping->i_private_lock);
286 	else
287 		folio_unlock(folio);
288 	folio_put(folio);
289 out:
290 	return ret;
291 }
292 
293 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
294 {
295 	unsigned long flags;
296 	struct buffer_head *first;
297 	struct buffer_head *tmp;
298 	struct folio *folio;
299 	int folio_uptodate = 1;
300 
301 	BUG_ON(!buffer_async_read(bh));
302 
303 	folio = bh->b_folio;
304 	if (uptodate) {
305 		set_buffer_uptodate(bh);
306 	} else {
307 		clear_buffer_uptodate(bh);
308 		buffer_io_error(bh, ", async page read");
309 	}
310 
311 	/*
312 	 * Be _very_ careful from here on. Bad things can happen if
313 	 * two buffer heads end IO at almost the same time and both
314 	 * decide that the page is now completely done.
315 	 */
316 	first = folio_buffers(folio);
317 	spin_lock_irqsave(&first->b_uptodate_lock, flags);
318 	clear_buffer_async_read(bh);
319 	unlock_buffer(bh);
320 	tmp = bh;
321 	do {
322 		if (!buffer_uptodate(tmp))
323 			folio_uptodate = 0;
324 		if (buffer_async_read(tmp)) {
325 			BUG_ON(!buffer_locked(tmp));
326 			goto still_busy;
327 		}
328 		tmp = tmp->b_this_page;
329 	} while (tmp != bh);
330 	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
331 
332 	folio_end_read(folio, folio_uptodate);
333 	return;
334 
335 still_busy:
336 	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
337 }
338 
339 struct postprocess_bh_ctx {
340 	struct work_struct work;
341 	struct buffer_head *bh;
342 	struct fsverity_info *vi;
343 };
344 
345 static void verify_bh(struct work_struct *work)
346 {
347 	struct postprocess_bh_ctx *ctx =
348 		container_of(work, struct postprocess_bh_ctx, work);
349 	struct buffer_head *bh = ctx->bh;
350 	bool valid;
351 
352 	valid = fsverity_verify_blocks(ctx->vi, bh->b_folio, bh->b_size,
353 				       bh_offset(bh));
354 	end_buffer_async_read(bh, valid);
355 	kfree(ctx);
356 }
357 
358 static void decrypt_bh(struct work_struct *work)
359 {
360 	struct postprocess_bh_ctx *ctx =
361 		container_of(work, struct postprocess_bh_ctx, work);
362 	struct buffer_head *bh = ctx->bh;
363 	int err;
364 
365 	err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
366 					       bh_offset(bh));
367 	if (err == 0 && ctx->vi) {
368 		/*
369 		 * We use different work queues for decryption and for verity
370 		 * because verity may require reading metadata pages that need
371 		 * decryption, and we shouldn't recurse to the same workqueue.
372 		 */
373 		INIT_WORK(&ctx->work, verify_bh);
374 		fsverity_enqueue_verify_work(&ctx->work);
375 		return;
376 	}
377 	end_buffer_async_read(bh, err == 0);
378 	kfree(ctx);
379 }
380 
381 /*
382  * I/O completion handler for block_read_full_folio() - folios
383  * which come unlocked at the end of I/O.
384  */
385 static void bh_end_async_read(struct bio *bio)
386 {
387 	struct buffer_head *bh;
388 	bool uptodate = bio_endio_bh(bio, &bh);
389 	struct inode *inode = bh->b_folio->mapping->host;
390 	bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
391 	struct fsverity_info *vi = NULL;
392 
393 	/* needed by ext4 */
394 	if (bh->b_folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE))
395 		vi = fsverity_get_info(inode);
396 
397 	/* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
398 	if (uptodate && (decrypt || vi)) {
399 		struct postprocess_bh_ctx *ctx = kmalloc_obj(*ctx, GFP_ATOMIC);
400 
401 		if (ctx) {
402 			ctx->bh = bh;
403 			ctx->vi = vi;
404 			if (decrypt) {
405 				INIT_WORK(&ctx->work, decrypt_bh);
406 				fscrypt_enqueue_decrypt_work(&ctx->work);
407 			} else {
408 				INIT_WORK(&ctx->work, verify_bh);
409 				fsverity_enqueue_verify_work(&ctx->work);
410 			}
411 			return;
412 		}
413 		uptodate = false;
414 	}
415 	end_buffer_async_read(bh, uptodate);
416 }
417 
418 /**
419  * bh_end_async_write - I/O end handler for async folio writes
420  * @bio: The bio being completed.
421  *
422  * Pass this function to bh_submit() if you're doing the equivalent of
423  * block_write_full_folio().  That is, the folio is unlocked, and will
424  * have its writeback flag cleared once all async write buffers have
425  * completed.
426  */
427 void bh_end_async_write(struct bio *bio)
428 {
429 	struct buffer_head *bh;
430 	bool success = bio_endio_bh(bio, &bh);
431 	unsigned long flags;
432 	struct buffer_head *first;
433 	struct buffer_head *tmp;
434 	struct folio *folio;
435 
436 	BUG_ON(!buffer_async_write(bh));
437 
438 	folio = bh->b_folio;
439 	if (success) {
440 		set_buffer_uptodate(bh);
441 	} else {
442 		buffer_io_error(bh, ", lost async page write");
443 		mark_buffer_write_io_error(bh);
444 		clear_buffer_uptodate(bh);
445 	}
446 
447 	first = folio_buffers(folio);
448 	spin_lock_irqsave(&first->b_uptodate_lock, flags);
449 
450 	clear_buffer_async_write(bh);
451 	unlock_buffer(bh);
452 	tmp = bh->b_this_page;
453 	while (tmp != bh) {
454 		if (buffer_async_write(tmp)) {
455 			BUG_ON(!buffer_locked(tmp));
456 			goto still_busy;
457 		}
458 		tmp = tmp->b_this_page;
459 	}
460 	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
461 	folio_end_writeback(folio);
462 	return;
463 
464 still_busy:
465 	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
466 }
467 EXPORT_SYMBOL(bh_end_async_write);
468 
469 
470 /*
471  * fs/buffer.c contains helper functions for buffer-backed address space's
472  * fsync functions.  A common requirement for buffer-based filesystems is
473  * that certain data from the backing blockdev needs to be written out for
474  * a successful fsync().  For example, ext2 indirect blocks need to be
475  * written back and waited upon before fsync() returns.
476  *
477  * The functions mmb_mark_buffer_dirty(), mmb_sync(), mmb_has_buffers()
478  * and mmb_invalidate() are provided for the management of a list of dependent
479  * buffers in mapping_metadata_bhs struct.
480  *
481  * The locking is a little subtle: The list of buffer heads is protected by
482  * the lock in mapping_metadata_bhs so functions coming from bdev mapping
483  * (such as try_to_free_buffers()) need to safely get to mapping_metadata_bhs
484  * using RCU, grab the lock, verify we didn't race with somebody detaching the
485  * bh / moving it to different inode and only then proceeding.
486  */
487 
488 void mmb_init(struct mapping_metadata_bhs *mmb, struct address_space *mapping)
489 {
490 	spin_lock_init(&mmb->lock);
491 	INIT_LIST_HEAD(&mmb->list);
492 	mmb->mapping = mapping;
493 }
494 EXPORT_SYMBOL(mmb_init);
495 
496 static void __remove_assoc_queue(struct mapping_metadata_bhs *mmb,
497 			         struct buffer_head *bh)
498 {
499 	lockdep_assert_held(&mmb->lock);
500 	list_del_init(&bh->b_assoc_buffers);
501 	WARN_ON(!bh->b_mmb);
502 	bh->b_mmb = NULL;
503 }
504 
505 static void remove_assoc_queue(struct buffer_head *bh)
506 {
507 	struct mapping_metadata_bhs *mmb;
508 
509 	/*
510 	 * The locking dance is ugly here. We need to acquire the lock
511 	 * protecting the metadata bh list while possibly racing with bh
512 	 * being removed from the list or moved to a different one.  We
513 	 * use RCU to pin mapping_metadata_bhs in memory to
514 	 * opportunistically acquire the lock and then recheck the bh
515 	 * didn't move under us.
516 	 */
517 	while (bh->b_mmb) {
518 		rcu_read_lock();
519 		mmb = READ_ONCE(bh->b_mmb);
520 		if (mmb) {
521 			spin_lock(&mmb->lock);
522 			if (bh->b_mmb == mmb)
523 				__remove_assoc_queue(mmb, bh);
524 			spin_unlock(&mmb->lock);
525 		}
526 		rcu_read_unlock();
527 	}
528 }
529 
530 bool mmb_has_buffers(struct mapping_metadata_bhs *mmb)
531 {
532 	return !list_empty(&mmb->list);
533 }
534 EXPORT_SYMBOL_GPL(mmb_has_buffers);
535 
536 /**
537  * mmb_sync - write out & wait upon all buffers in a list
538  * @mmb: the list of buffers to write
539  *
540  * Starts I/O against the buffers in the given list and waits upon
541  * that I/O. Basically, this is a convenience function for fsync().  @mmb is
542  * for a file or directory which needs those buffers to be written for a
543  * successful fsync().
544  *
545  * We have conflicting pressures: we want to make sure that all
546  * initially dirty buffers get waited on, but that any subsequently
547  * dirtied buffers don't.  After all, we don't want fsync to last
548  * forever if somebody is actively writing to the file.
549  *
550  * Do this in two main stages: first we copy dirty buffers to a
551  * temporary inode list, queueing the writes as we go. Then we clean
552  * up, waiting for those writes to complete. mark_buffer_dirty_inode()
553  * doesn't touch b_assoc_buffers list if b_mmb is not NULL so we are sure the
554  * buffer stays on our list until IO completes (at which point it can be
555  * reaped).
556  */
557 int mmb_sync(struct mapping_metadata_bhs *mmb)
558 {
559 	struct buffer_head *bh;
560 	int err = 0;
561 	struct blk_plug plug;
562 	LIST_HEAD(tmp);
563 
564 	if (!mmb_has_buffers(mmb))
565 		return 0;
566 
567 	blk_start_plug(&plug);
568 
569 	spin_lock(&mmb->lock);
570 	while (!list_empty(&mmb->list)) {
571 		bh = BH_ENTRY(mmb->list.next);
572 		WARN_ON_ONCE(bh->b_mmb != mmb);
573 		__remove_assoc_queue(mmb, bh);
574 		/* Avoid race with mark_buffer_dirty_inode() which does
575 		 * a lockless check and we rely on seeing the dirty bit */
576 		smp_mb();
577 		if (buffer_dirty(bh) || buffer_locked(bh)) {
578 			list_add(&bh->b_assoc_buffers, &tmp);
579 			bh->b_mmb = mmb;
580 			if (buffer_dirty(bh)) {
581 				get_bh(bh);
582 				spin_unlock(&mmb->lock);
583 				/*
584 				 * Ensure any pending I/O completes so that
585 				 * write_dirty_buffer() actually writes the
586 				 * current contents - it is a noop if I/O is
587 				 * still in flight on potentially older
588 				 * contents.
589 				 */
590 				write_dirty_buffer(bh, REQ_SYNC);
591 
592 				/*
593 				 * Kick off IO for the previous mapping. Note
594 				 * that we will not run the very last mapping,
595 				 * wait_on_buffer() will do that for us
596 				 * through sync_buffer().
597 				 */
598 				brelse(bh);
599 				spin_lock(&mmb->lock);
600 			}
601 		}
602 	}
603 
604 	spin_unlock(&mmb->lock);
605 	blk_finish_plug(&plug);
606 	spin_lock(&mmb->lock);
607 
608 	while (!list_empty(&tmp)) {
609 		bh = BH_ENTRY(tmp.prev);
610 		get_bh(bh);
611 		__remove_assoc_queue(mmb, bh);
612 		/* Avoid race with mark_buffer_dirty_inode() which does
613 		 * a lockless check and we rely on seeing the dirty bit */
614 		smp_mb();
615 		if (buffer_dirty(bh)) {
616 			list_add(&bh->b_assoc_buffers, &mmb->list);
617 			bh->b_mmb = mmb;
618 		}
619 		spin_unlock(&mmb->lock);
620 		wait_on_buffer(bh);
621 		if (!buffer_uptodate(bh))
622 			err = -EIO;
623 		brelse(bh);
624 		spin_lock(&mmb->lock);
625 	}
626 	spin_unlock(&mmb->lock);
627 	return err;
628 }
629 EXPORT_SYMBOL(mmb_sync);
630 
631 /**
632  * mmb_fsync_noflush - fsync implementation for simple filesystems with
633  * 		       metadata buffers list
634  *
635  * @file:	file to synchronize
636  * @mmb:	list of metadata bhs to flush
637  * @start:	start offset in bytes
638  * @end:	end offset in bytes (inclusive)
639  * @datasync:	only synchronize essential metadata if true
640  *
641  * This is an implementation of the fsync method for simple filesystems which
642  * track all non-inode metadata in the buffers list hanging off the @mmb
643  * structure.
644  */
645 int mmb_fsync_noflush(struct file *file, struct mapping_metadata_bhs *mmb,
646 		      loff_t start, loff_t end, bool datasync)
647 {
648 	struct inode *inode = file->f_mapping->host;
649 	int err;
650 	int ret = 0;
651 
652 	err = file_write_and_wait_range(file, start, end);
653 	if (err)
654 		return err;
655 
656 	if (mmb)
657 		ret = mmb_sync(mmb);
658 	if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
659 		goto out;
660 	if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
661 		goto out;
662 
663 	err = sync_inode_metadata(inode, 1);
664 	if (ret == 0)
665 		ret = err;
666 
667 out:
668 	/* check and advance again to catch errors after syncing out buffers */
669 	err = file_check_and_advance_wb_err(file);
670 	if (ret == 0)
671 		ret = err;
672 	return ret;
673 }
674 EXPORT_SYMBOL(mmb_fsync_noflush);
675 
676 /**
677  * mmb_fsync - fsync implementation for simple filesystems with metadata
678  * 	       buffers list
679  *
680  * @file:	file to synchronize
681  * @mmb:	list of metadata bhs to flush
682  * @start:	start offset in bytes
683  * @end:	end offset in bytes (inclusive)
684  * @datasync:	only synchronize essential metadata if true
685  *
686  * This is an implementation of the fsync method for simple filesystems which
687  * track all non-inode metadata in the buffers list hanging off the @mmb
688  * structure. This also makes sure that a device cache flush operation is
689  * called at the end.
690  */
691 int mmb_fsync(struct file *file, struct mapping_metadata_bhs *mmb,
692 	      loff_t start, loff_t end, bool datasync)
693 {
694 	struct inode *inode = file->f_mapping->host;
695 	int ret;
696 
697 	ret = mmb_fsync_noflush(file, mmb, start, end, datasync);
698 	if (!ret)
699 		ret = blkdev_issue_flush(inode->i_sb->s_bdev);
700 	return ret;
701 }
702 EXPORT_SYMBOL(mmb_fsync);
703 
704 /*
705  * Called when we've recently written block `bblock', and it is known that
706  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
707  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
708  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
709  */
710 void write_boundary_block(struct block_device *bdev,
711 			sector_t bblock, unsigned blocksize)
712 {
713 	struct buffer_head *bh;
714 
715 	bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
716 	if (bh) {
717 		if (buffer_dirty(bh))
718 			write_dirty_buffer(bh, 0);
719 		put_bh(bh);
720 	}
721 }
722 
723 void mmb_mark_buffer_dirty(struct buffer_head *bh,
724 			   struct mapping_metadata_bhs *mmb)
725 {
726 	mark_buffer_dirty(bh);
727 	if (!bh->b_mmb) {
728 		spin_lock(&mmb->lock);
729 		/*
730 		 * For a corrupted filesystem with multiply claimed blocks this
731 		 * can fail. Avoid corrupting the linked list in that case.
732 		 */
733 		if (cmpxchg(&bh->b_mmb, NULL, mmb) != NULL) {
734 			spin_unlock(&mmb->lock);
735 			return;
736 		}
737 		list_move_tail(&bh->b_assoc_buffers, &mmb->list);
738 		spin_unlock(&mmb->lock);
739 	}
740 }
741 EXPORT_SYMBOL(mmb_mark_buffer_dirty);
742 
743 /**
744  * block_dirty_folio - Mark a folio as dirty.
745  * @mapping: The address space containing this folio.
746  * @folio: The folio to mark dirty.
747  *
748  * Filesystems which use buffer_heads can use this function as their
749  * ->dirty_folio implementation.  Some filesystems need to do a little
750  * work before calling this function.  Filesystems which do not use
751  * buffer_heads should call filemap_dirty_folio() instead.
752  *
753  * If the folio has buffers, the uptodate buffers are set dirty, to
754  * preserve dirty-state coherency between the folio and the buffers.
755  * Buffers added to a dirty folio are created dirty.
756  *
757  * The buffers are dirtied before the folio is dirtied.  There's a small
758  * race window in which writeback may see the folio cleanness but not the
759  * buffer dirtiness.  That's fine.  If this code were to set the folio
760  * dirty before the buffers, writeback could clear the folio dirty flag,
761  * see a bunch of clean buffers and we'd end up with dirty buffers/clean
762  * folio on the dirty folio list.
763  *
764  * We use i_private_lock to lock against try_to_free_buffers() while
765  * using the folio's buffer list.  This also prevents clean buffers
766  * being added to the folio after it was set dirty.
767  *
768  * Context: May only be called from process context.  Does not sleep.
769  * Caller must ensure that @folio cannot be truncated during this call,
770  * typically by holding the folio lock or having a page in the folio
771  * mapped and holding the page table lock.
772  *
773  * Return: True if the folio was dirtied; false if it was already dirtied.
774  */
775 bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
776 {
777 	struct buffer_head *head;
778 	bool newly_dirty;
779 
780 	spin_lock(&mapping->i_private_lock);
781 	head = folio_buffers(folio);
782 	if (head) {
783 		struct buffer_head *bh = head;
784 
785 		do {
786 			set_buffer_dirty(bh);
787 			bh = bh->b_this_page;
788 		} while (bh != head);
789 	}
790 	/*
791 	 * Lock out page's memcg migration to keep PageDirty
792 	 * synchronized with per-memcg dirty page counters.
793 	 */
794 	newly_dirty = !folio_test_set_dirty(folio);
795 	spin_unlock(&mapping->i_private_lock);
796 
797 	if (newly_dirty)
798 		__folio_mark_dirty(folio, mapping, 1);
799 
800 	if (newly_dirty)
801 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
802 
803 	return newly_dirty;
804 }
805 EXPORT_SYMBOL(block_dirty_folio);
806 
807 /*
808  * Invalidate any and all dirty buffers on a given buffers list.  We are
809  * probably unmounting the fs, but that doesn't mean we have already
810  * done a sync().  Just drop the buffers from the inode list.
811  */
812 void mmb_invalidate(struct mapping_metadata_bhs *mmb)
813 {
814 	if (mmb_has_buffers(mmb)) {
815 		spin_lock(&mmb->lock);
816 		while (!list_empty(&mmb->list))
817 			__remove_assoc_queue(mmb, BH_ENTRY(mmb->list.next));
818 		spin_unlock(&mmb->lock);
819 	}
820 }
821 EXPORT_SYMBOL(mmb_invalidate);
822 
823 /*
824  * Create the appropriate buffers when given a folio for data area and
825  * the size of each buffer.. Use the bh->b_this_page linked list to
826  * follow the buffers created.  Return NULL if unable to create more
827  * buffers.
828  *
829  * The retry flag is used to differentiate async IO (paging, swapping)
830  * which may not fail from ordinary buffer allocations.
831  */
832 struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
833 					gfp_t gfp)
834 {
835 	struct buffer_head *bh, *head;
836 	long offset;
837 	struct mem_cgroup *memcg, *old_memcg;
838 
839 	memcg = get_mem_cgroup_from_folio(folio);
840 	old_memcg = set_active_memcg(memcg);
841 
842 	head = NULL;
843 	offset = folio_size(folio);
844 	while ((offset -= size) >= 0) {
845 		bh = alloc_buffer_head(gfp);
846 		if (!bh)
847 			goto no_grow;
848 
849 		bh->b_this_page = head;
850 		bh->b_blocknr = -1;
851 		head = bh;
852 
853 		bh->b_size = size;
854 
855 		/* Link the buffer to its folio */
856 		folio_set_bh(bh, folio, offset);
857 	}
858 out:
859 	set_active_memcg(old_memcg);
860 	mem_cgroup_put(memcg);
861 	return head;
862 /*
863  * In case anything failed, we just free everything we got.
864  */
865 no_grow:
866 	if (head) {
867 		do {
868 			bh = head;
869 			head = head->b_this_page;
870 			free_buffer_head(bh);
871 		} while (head);
872 	}
873 
874 	goto out;
875 }
876 EXPORT_SYMBOL_GPL(folio_alloc_buffers);
877 
878 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size)
879 {
880 	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
881 
882 	return folio_alloc_buffers(page_folio(page), size, gfp);
883 }
884 EXPORT_SYMBOL_GPL(alloc_page_buffers);
885 
886 static inline void link_dev_buffers(struct folio *folio,
887 		struct buffer_head *head)
888 {
889 	struct buffer_head *bh, *tail;
890 
891 	bh = head;
892 	do {
893 		tail = bh;
894 		bh = bh->b_this_page;
895 	} while (bh);
896 	tail->b_this_page = head;
897 	folio_attach_private(folio, head);
898 }
899 
900 static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
901 {
902 	sector_t retval = ~((sector_t)0);
903 	loff_t sz = bdev_nr_bytes(bdev);
904 
905 	if (sz) {
906 		unsigned int sizebits = blksize_bits(size);
907 		retval = (sz >> sizebits);
908 	}
909 	return retval;
910 }
911 
912 /*
913  * Initialise the state of a blockdev folio's buffers.
914  */
915 static sector_t folio_init_buffers(struct folio *folio,
916 		struct block_device *bdev, unsigned size)
917 {
918 	struct buffer_head *head = folio_buffers(folio);
919 	struct buffer_head *bh = head;
920 	bool uptodate = folio_test_uptodate(folio);
921 	sector_t block = div_u64(folio_pos(folio), size);
922 	sector_t end_block = blkdev_max_block(bdev, size);
923 
924 	do {
925 		if (!buffer_mapped(bh)) {
926 			bh->b_private = NULL;
927 			bh->b_bdev = bdev;
928 			bh->b_blocknr = block;
929 			if (uptodate)
930 				set_buffer_uptodate(bh);
931 			if (block < end_block)
932 				set_buffer_mapped(bh);
933 		}
934 		block++;
935 		bh = bh->b_this_page;
936 	} while (bh != head);
937 
938 	/*
939 	 * Caller needs to validate requested block against end of device.
940 	 */
941 	return end_block;
942 }
943 
944 /*
945  * Create the page-cache folio that contains the requested block.
946  *
947  * This is used purely for blockdev mappings.
948  *
949  * Returns false if we have a failure which cannot be cured by retrying
950  * without sleeping.  Returns true if we succeeded, or the caller should retry.
951  */
952 static bool grow_dev_folio(struct block_device *bdev, sector_t block,
953 		pgoff_t index, unsigned size, gfp_t gfp)
954 {
955 	struct address_space *mapping = bdev->bd_mapping;
956 	struct folio *folio;
957 	struct buffer_head *bh;
958 	sector_t end_block = 0;
959 
960 	folio = __filemap_get_folio(mapping, index,
961 			FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
962 	if (IS_ERR(folio))
963 		return false;
964 
965 	bh = folio_buffers(folio);
966 	if (bh) {
967 		if (bh->b_size == size) {
968 			end_block = folio_init_buffers(folio, bdev, size);
969 			goto unlock;
970 		}
971 
972 		/*
973 		 * Retrying may succeed; for example the folio may finish
974 		 * writeback, or buffers may be cleaned.  This should not
975 		 * happen very often; maybe we have old buffers attached to
976 		 * this blockdev's page cache and we're trying to change
977 		 * the block size?
978 		 */
979 		if (!try_to_free_buffers(folio)) {
980 			end_block = ~0ULL;
981 			goto unlock;
982 		}
983 	}
984 
985 	bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
986 	if (!bh)
987 		goto unlock;
988 
989 	/*
990 	 * Link the folio to the buffers and initialise them.  Take the
991 	 * lock to be atomic wrt __find_get_block(), which does not
992 	 * run under the folio lock.
993 	 */
994 	spin_lock(&mapping->i_private_lock);
995 	link_dev_buffers(folio, bh);
996 	end_block = folio_init_buffers(folio, bdev, size);
997 	spin_unlock(&mapping->i_private_lock);
998 unlock:
999 	folio_unlock(folio);
1000 	folio_put(folio);
1001 	return block < end_block;
1002 }
1003 
1004 /*
1005  * Create buffers for the specified block device block's folio.  If
1006  * that folio was dirty, the buffers are set dirty also.  Returns false
1007  * if we've hit a permanent error.
1008  */
1009 static bool grow_buffers(struct block_device *bdev, sector_t block,
1010 		unsigned size, gfp_t gfp)
1011 {
1012 	loff_t pos;
1013 
1014 	/*
1015 	 * Check for a block which lies outside our maximum possible
1016 	 * pagecache index.
1017 	 */
1018 	if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) {
1019 		printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
1020 			__func__, (unsigned long long)block,
1021 			bdev);
1022 		return false;
1023 	}
1024 
1025 	/* Create a folio with the proper size buffers */
1026 	return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp);
1027 }
1028 
1029 static struct buffer_head *
1030 __getblk_slow(struct block_device *bdev, sector_t block,
1031 	     unsigned size, gfp_t gfp)
1032 {
1033 	bool blocking = gfpflags_allow_blocking(gfp);
1034 
1035 	if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) {
1036 		printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n",
1037 		       size, bdev_logical_block_size(bdev));
1038 		return NULL;
1039 	}
1040 
1041 	for (;;) {
1042 		struct buffer_head *bh;
1043 
1044 		if (!grow_buffers(bdev, block, size, gfp))
1045 			return NULL;
1046 
1047 		if (blocking)
1048 			bh = __find_get_block_nonatomic(bdev, block, size);
1049 		else
1050 			bh = __find_get_block(bdev, block, size);
1051 		if (bh)
1052 			return bh;
1053 	}
1054 }
1055 
1056 /*
1057  * The relationship between dirty buffers and dirty pages:
1058  *
1059  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1060  * the page is tagged dirty in the page cache.
1061  *
1062  * At all times, the dirtiness of the buffers represents the dirtiness of
1063  * subsections of the page.  If the page has buffers, the page dirty bit is
1064  * merely a hint about the true dirty state.
1065  *
1066  * When a page is set dirty in its entirety, all its buffers are marked dirty
1067  * (if the page has buffers).
1068  *
1069  * When a buffer is marked dirty, its page is dirtied, but the page's other
1070  * buffers are not.
1071  *
1072  * Also.  When blockdev buffers are explicitly read with bread(), they
1073  * individually become uptodate.  But their backing page remains not
1074  * uptodate - even if all of its buffers are uptodate.  A subsequent
1075  * block_read_full_folio() against that folio will discover all the uptodate
1076  * buffers, will set the folio uptodate and will perform no I/O.
1077  */
1078 
1079 /**
1080  * mark_buffer_dirty - mark a buffer_head as needing writeout
1081  * @bh: the buffer_head to mark dirty
1082  *
1083  * mark_buffer_dirty() will set the dirty bit against the buffer, then set
1084  * its backing page dirty, then tag the page as dirty in the page cache
1085  * and then attach the address_space's inode to its superblock's dirty
1086  * inode list.
1087  *
1088  * mark_buffer_dirty() is atomic.  It takes bh->b_folio->mapping->i_private_lock,
1089  * i_pages lock and mapping->host->i_lock.
1090  */
1091 void mark_buffer_dirty(struct buffer_head *bh)
1092 {
1093 	WARN_ON_ONCE(!buffer_uptodate(bh));
1094 
1095 	trace_block_dirty_buffer(bh);
1096 
1097 	/*
1098 	 * Very *carefully* optimize the it-is-already-dirty case.
1099 	 *
1100 	 * Don't let the final "is it dirty" escape to before we
1101 	 * perhaps modified the buffer.
1102 	 */
1103 	if (buffer_dirty(bh)) {
1104 		smp_mb();
1105 		if (buffer_dirty(bh))
1106 			return;
1107 	}
1108 
1109 	if (!test_set_buffer_dirty(bh)) {
1110 		struct folio *folio = bh->b_folio;
1111 		struct address_space *mapping = NULL;
1112 
1113 		if (!folio_test_set_dirty(folio)) {
1114 			mapping = folio->mapping;
1115 			if (mapping)
1116 				__folio_mark_dirty(folio, mapping, 0);
1117 		}
1118 		if (mapping)
1119 			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1120 	}
1121 }
1122 EXPORT_SYMBOL(mark_buffer_dirty);
1123 
1124 void mark_buffer_write_io_error(struct buffer_head *bh)
1125 {
1126 	set_buffer_write_io_error(bh);
1127 	/* FIXME: do we need to set this in both places? */
1128 	if (bh->b_folio && bh->b_folio->mapping)
1129 		mapping_set_error(bh->b_folio->mapping, -EIO);
1130 	if (bh->b_mmb)
1131 		mapping_set_error(bh->b_mmb->mapping, -EIO);
1132 }
1133 EXPORT_SYMBOL(mark_buffer_write_io_error);
1134 
1135 /**
1136  * __brelse - Release a buffer.
1137  * @bh: The buffer to release.
1138  *
1139  * This variant of brelse() can be called if @bh is guaranteed to not be NULL.
1140  */
1141 void __brelse(struct buffer_head *bh)
1142 {
1143 	if (atomic_read(&bh->b_count)) {
1144 		put_bh(bh);
1145 		return;
1146 	}
1147 	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1148 }
1149 EXPORT_SYMBOL(__brelse);
1150 
1151 /**
1152  * __bforget - Discard any dirty data in a buffer.
1153  * @bh: The buffer to forget.
1154  *
1155  * This variant of bforget() can be called if @bh is guaranteed to not
1156  * be NULL.
1157  */
void __bforget(struct buffer_head *bh)
{
	/* Drop the dirty bit so the stale contents are not written back */
	clear_buffer_dirty(bh);
	/* Detach the buffer from any association queue it is on */
	remove_assoc_queue(bh);
	/* Finally give up the caller's reference */
	__brelse(bh);
}
EXPORT_SYMBOL(__bforget);
1165 
1166 static void buffer_set_crypto_ctx(struct bio *bio, const struct buffer_head *bh,
1167 				  gfp_t gfp_mask)
1168 {
1169 	const struct address_space *mapping = folio_mapping(bh->b_folio);
1170 
1171 	/*
1172 	 * The ext4 journal (jbd2) can submit a buffer_head it directly created
1173 	 * for a non-pagecache page.  fscrypt doesn't care about these.
1174 	 */
1175 	if (!mapping)
1176 		return;
1177 	fscrypt_set_bio_crypt_ctx(bio, mapping->host,
1178 			folio_pos(bh->b_folio) + bh_offset(bh), gfp_mask);
1179 }
1180 
/*
 * Build a single-segment bio for @bh and submit it.
 *
 * @bh:         buffer to do I/O on; must be locked and mapped, and must be
 *              neither delayed-allocation nor unwritten (enforced by BUG_ON)
 * @opf:        operation and flags for the bio; REQ_META / REQ_PRIO are
 *              added when the corresponding buffer flags are set
 * @write_hint: value stored in the bio's bi_write_hint
 * @wbc:        optional writeback control; when non-NULL the bio is
 *              initialised from it and the I/O is accounted against it
 * @end_bio:    completion handler stored in the bio; the bio's bi_private
 *              is set to @bh
 */
static void __bh_submit(struct buffer_head *bh, blk_opf_t opf,
		enum rw_hint write_hint, struct writeback_control *wbc,
		bio_end_io_t end_bio)
{
	const enum req_op op = opf & REQ_OP_MASK;
	struct bio *bio;

	BUG_ON(!buffer_locked(bh));
	BUG_ON(!buffer_mapped(bh));
	BUG_ON(buffer_delay(bh));
	BUG_ON(buffer_unwritten(bh));

	/*
	 * Only clear out a write error when rewriting
	 */
	if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
		clear_buffer_write_io_error(bh);

	if (buffer_meta(bh))
		opf |= REQ_META;
	if (buffer_prio(bh))
		opf |= REQ_PRIO;

	bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);

	if (IS_ENABLED(CONFIG_FS_ENCRYPTION))
		buffer_set_crypto_ctx(bio, bh, GFP_NOIO);

	/* b_blocknr counts b_size-sized blocks; convert to 512-byte sectors */
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_write_hint = write_hint;

	bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));

	bio->bi_end_io = end_bio;
	bio->bi_private = bh;

	/* Take care of bh's that straddle the end of the device */
	guard_bio_eod(bio);

	if (wbc) {
		wbc_init_bio(wbc, bio);
		wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
	}

	blk_crypto_submit_bio(bio);
}
1227 
1228 /**
1229  * bh_submit - Start I/O against a buffer head
1230  * @bh: The buffer head to perform I/O on.
1231  * @opf: Operation and flags for bio.
1232  * @end_io: The routine to call when I/O has completed.
1233  *
1234  * If you need to do I/O on an individual bh (instead of allowing the
1235  * page cache to do I/O on the folio that it is in), call this function.
1236  */
void bh_submit(struct buffer_head *bh, blk_opf_t opf, bio_end_io_t end_io)
{
	/* No write hint and no writeback control for a stand-alone submit */
	__bh_submit(bh, opf, WRITE_LIFE_NOT_SET, NULL, end_io);
}
EXPORT_SYMBOL(bh_submit);
1242 
1243 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1244 {
1245 	lock_buffer(bh);
1246 	if (buffer_uptodate(bh)) {
1247 		unlock_buffer(bh);
1248 		return bh;
1249 	} else {
1250 		bh_submit(bh, REQ_OP_READ, bh_end_read);
1251 		wait_on_buffer(bh);
1252 		if (buffer_uptodate(bh))
1253 			return bh;
1254 	}
1255 	brelse(bh);
1256 	return NULL;
1257 }
1258 
1259 /*
1260  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1261  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1262  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1263  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1264  * CPU's LRUs at the same time.
1265  *
1266  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1267  * sb_find_get_block().
1268  *
1269  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1270  * a local interrupt disable for that.
1271  */
1272 
/* Number of buffer_head slots in each CPU's LRU */
#define BH_LRU_SIZE	16

/* One CPU's LRU: a fixed-size array of buffer_head pointers */
struct bh_lru {
	struct buffer_head *bhs[BH_LRU_SIZE];
};

/* The per-CPU LRU instances, starting out with NULL slots */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

/*
 * bh_lru_lock()/bh_lru_unlock() bracket accesses to the current CPU's LRU:
 * local interrupts are disabled on CONFIG_SMP builds, preemption is
 * disabled otherwise.
 */
#ifdef CONFIG_SMP
#define bh_lru_lock()	local_irq_disable()
#define bh_lru_unlock()	local_irq_enable()
#else
#define bh_lru_lock()	preempt_disable()
#define bh_lru_unlock()	preempt_enable()
#endif
1288 
/*
 * Sanity check used before taking bh_lru_lock(): BUG if interrupts are
 * currently disabled.  The check is compiled in only when irqs_disabled
 * is defined as a macro.
 */
static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
	BUG_ON(irqs_disabled());
#endif
}
1295 
1296 /*
1297  * Install a buffer_head into this cpu's LRU.  If not already in the LRU, it is
1298  * inserted at the front, and the buffer_head at the back if any is evicted.
1299  * Or, if already in the LRU it is moved to the front.
1300  */
1301 static void bh_lru_install(struct buffer_head *bh)
1302 {
1303 	struct buffer_head *evictee = bh;
1304 	struct bh_lru *b;
1305 	int i;
1306 
1307 	check_irqs_on();
1308 	bh_lru_lock();
1309 
1310 	/*
1311 	 * the refcount of buffer_head in bh_lru prevents dropping the
1312 	 * attached page(i.e., try_to_free_buffers) so it could cause
1313 	 * failing page migration.
1314 	 * Skip putting upcoming bh into bh_lru until migration is done.
1315 	 */
1316 	if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) {
1317 		bh_lru_unlock();
1318 		return;
1319 	}
1320 
1321 	b = this_cpu_ptr(&bh_lrus);
1322 	for (i = 0; i < BH_LRU_SIZE; i++) {
1323 		swap(evictee, b->bhs[i]);
1324 		if (evictee == bh) {
1325 			bh_lru_unlock();
1326 			return;
1327 		}
1328 	}
1329 
1330 	get_bh(bh);
1331 	bh_lru_unlock();
1332 	brelse(evictee);
1333 }
1334 
1335 /*
1336  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1337  */
1338 static struct buffer_head *
1339 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1340 {
1341 	struct buffer_head *ret = NULL;
1342 	unsigned int i;
1343 
1344 	check_irqs_on();
1345 	bh_lru_lock();
1346 	if (cpu_is_isolated(smp_processor_id())) {
1347 		bh_lru_unlock();
1348 		return NULL;
1349 	}
1350 	for (i = 0; i < BH_LRU_SIZE; i++) {
1351 		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1352 
1353 		if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1354 		    bh->b_size == size) {
1355 			if (i) {
1356 				while (i) {
1357 					__this_cpu_write(bh_lrus.bhs[i],
1358 						__this_cpu_read(bh_lrus.bhs[i - 1]));
1359 					i--;
1360 				}
1361 				__this_cpu_write(bh_lrus.bhs[0], bh);
1362 			}
1363 			get_bh(bh);
1364 			ret = bh;
1365 			break;
1366 		}
1367 	}
1368 	bh_lru_unlock();
1369 	return ret;
1370 }
1371 
1372 /*
1373  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1374  * it in the LRU and mark it as accessed.  If it is not present then return
1375  * NULL. Atomic context callers may also return NULL if the buffer is being
1376  * migrated; similarly the page is not marked accessed either.
1377  */
1378 static struct buffer_head *
1379 find_get_block_common(struct block_device *bdev, sector_t block,
1380 			unsigned size, bool atomic)
1381 {
1382 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1383 
1384 	if (bh == NULL) {
1385 		/* __find_get_block_slow will mark the page accessed */
1386 		bh = __find_get_block_slow(bdev, block, atomic);
1387 		if (bh)
1388 			bh_lru_install(bh);
1389 	} else
1390 		touch_buffer(bh);
1391 
1392 	return bh;
1393 }
1394 
/*
 * Look up the buffer_head for @block of @bdev with block size @size.
 * This is find_get_block_common() with its "atomic" argument set to true.
 * Returns the buffer_head, or NULL when the lookup finds nothing.
 */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
	return find_get_block_common(bdev, block, size, true);
}
EXPORT_SYMBOL(__find_get_block);
1401 
1402 /* same as __find_get_block() but allows sleeping contexts */
struct buffer_head *
__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
			   unsigned size)
{
	/* atomic == false selects the lookup that is allowed to sleep */
	return find_get_block_common(bdev, block, size, false);
}
EXPORT_SYMBOL(__find_get_block_nonatomic);
1410 
1411 /**
1412  * bdev_getblk - Get a buffer_head in a block device's buffer cache.
1413  * @bdev: The block device.
1414  * @block: The block number.
1415  * @size: The size of buffer_heads for this @bdev.
1416  * @gfp: The memory allocation flags to use.
1417  *
1418  * The returned buffer head has its reference count incremented, but is
1419  * not locked.  The caller should call brelse() when it has finished
1420  * with the buffer.  The buffer may not be uptodate.  If needed, the
1421  * caller can bring it uptodate either by reading it or overwriting it.
1422  *
1423  * Return: The buffer head, or NULL if memory could not be allocated.
1424  */
1425 struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
1426 		unsigned size, gfp_t gfp)
1427 {
1428 	struct buffer_head *bh;
1429 
1430 	if (gfpflags_allow_blocking(gfp))
1431 		bh = __find_get_block_nonatomic(bdev, block, size);
1432 	else
1433 		bh = __find_get_block(bdev, block, size);
1434 
1435 	might_alloc(gfp);
1436 	if (bh)
1437 		return bh;
1438 
1439 	return __getblk_slow(bdev, block, size, gfp);
1440 }
1441 EXPORT_SYMBOL(bdev_getblk);
1442 
1443 /*
 * Do async read-ahead on a buffer.
1445  */
1446 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1447 {
1448 	struct buffer_head *bh = bdev_getblk(bdev, block, size,
1449 			GFP_NOWAIT | __GFP_MOVABLE);
1450 
1451 	if (likely(bh)) {
1452 		bh_readahead(bh, REQ_RAHEAD);
1453 		brelse(bh);
1454 	}
1455 }
1456 EXPORT_SYMBOL(__breadahead);
1457 
1458 /**
1459  * __bread_gfp() - Read a block.
1460  * @bdev: The block device to read from.
1461  * @block: Block number in units of block size.
1462  * @size: The block size of this device in bytes.
1463  * @gfp: Not page allocation flags; see below.
1464  *
1465  * You are not expected to call this function.  You should use one of
1466  * sb_bread(), sb_bread_unmovable() or __bread().
1467  *
1468  * Read a specified block, and return the buffer head that refers to it.
1469  * If @gfp is 0, the memory will be allocated using the block device's
1470  * default GFP flags.  If @gfp is __GFP_MOVABLE, the memory may be
1471  * allocated from a movable area.  Do not pass in a complete set of
1472  * GFP flags.
1473  *
1474  * The returned buffer head has its refcount increased.  The caller should
1475  * call brelse() when it has finished with the buffer.
1476  *
1477  * Context: May sleep waiting for I/O.
1478  * Return: NULL if the block was unreadable.
1479  */
1480 struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block,
1481 		unsigned size, gfp_t gfp)
1482 {
1483 	struct buffer_head *bh;
1484 
1485 	gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
1486 
1487 	/*
1488 	 * Prefer looping in the allocator rather than here, at least that
1489 	 * code knows what it's doing.
1490 	 */
1491 	gfp |= __GFP_NOFAIL;
1492 
1493 	bh = bdev_getblk(bdev, block, size, gfp);
1494 
1495 	if (likely(bh) && !buffer_uptodate(bh))
1496 		bh = __bread_slow(bh);
1497 	return bh;
1498 }
1499 EXPORT_SYMBOL(__bread_gfp);
1500 
1501 static void __invalidate_bh_lrus(struct bh_lru *b)
1502 {
1503 	int i;
1504 
1505 	for (i = 0; i < BH_LRU_SIZE; i++) {
1506 		brelse(b->bhs[i]);
1507 		b->bhs[i] = NULL;
1508 	}
1509 }
1510 /*
1511  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1512  * This doesn't race because it runs in each cpu either in irq
1513  * or with preempt disabled.
1514  */
static void invalidate_bh_lru(void *arg)
{
	/* @arg is unused; get_cpu_var()/put_cpu_var() bracket the access */
	struct bh_lru *b = &get_cpu_var(bh_lrus);

	__invalidate_bh_lrus(b);
	put_cpu_var(bh_lrus);
}
1522 
1523 bool has_bh_in_lru(int cpu, void *dummy)
1524 {
1525 	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1526 	int i;
1527 
1528 	for (i = 0; i < BH_LRU_SIZE; i++) {
1529 		if (b->bhs[i])
1530 			return true;
1531 	}
1532 
1533 	return false;
1534 }
1535 
/*
 * Empty the buffer_head LRUs: invalidate_bh_lru() is run on the CPUs for
 * which has_bh_in_lru() returns true.
 */
void invalidate_bh_lrus(void)
{
	/* NOTE(review): the final 1 is presumably the "wait" flag - confirm */
	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1541 
1542 /*
1543  * It's called from workqueue context so we need a bh_lru_lock to close
1544  * the race with preemption/irq.
1545  */
1546 void invalidate_bh_lrus_cpu(void)
1547 {
1548 	struct bh_lru *b;
1549 
1550 	bh_lru_lock();
1551 	b = this_cpu_ptr(&bh_lrus);
1552 	__invalidate_bh_lrus(b);
1553 	bh_lru_unlock();
1554 }
1555 
1556 void folio_set_bh(struct buffer_head *bh, struct folio *folio,
1557 		  unsigned long offset)
1558 {
1559 	bh->b_folio = folio;
1560 	BUG_ON(offset >= folio_size(folio));
1561 	if (folio_test_highmem(folio))
1562 		/*
1563 		 * This catches illegal uses and preserves the offset:
1564 		 */
1565 		bh->b_data = (char *)(0 + offset);
1566 	else
1567 		bh->b_data = folio_address(folio) + offset;
1568 }
1569 EXPORT_SYMBOL(folio_set_bh);
1570 
1571 /*
1572  * Called when truncating a buffer on a page completely.
1573  */
1574 
1575 /* Bits that are cleared during an invalidate */
#define BUFFER_FLAGS_DISCARD \
	(1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
	 1 << BH_Delay | 1 << BH_Unwritten)

/*
 * Invalidate a single buffer: under the buffer lock, clear its dirty bit,
 * forget its block device and clear all BUFFER_FLAGS_DISCARD bits from
 * b_state.
 */
static void discard_buffer(struct buffer_head * bh)
{
	unsigned long b_state;

	lock_buffer(bh);
	clear_buffer_dirty(bh);
	bh->b_bdev = NULL;
	b_state = READ_ONCE(bh->b_state);
	/*
	 * Retry the compare-and-exchange until the flags are cleared without
	 * a concurrent change to b_state.  The loop body is empty;
	 * try_cmpxchg_relaxed() is expected to refresh the local b_state
	 * with the current value whenever it fails.
	 */
	do {
	} while (!try_cmpxchg_relaxed(&bh->b_state, &b_state,
				      b_state & ~BUFFER_FLAGS_DISCARD));
	unlock_buffer(bh);
}
1593 
1594 /**
1595  * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
1596  * @folio: The folio which is affected.
1597  * @offset: start of the range to invalidate
1598  * @length: length of the range to invalidate
1599  *
1600  * block_invalidate_folio() is called when all or part of the folio has been
1601  * invalidated by a truncate operation.
1602  *
1603  * block_invalidate_folio() does not have to release all buffers, but it must
1604  * ensure that no dirty buffer is left outside @offset and that no I/O
1605  * is underway against any of the blocks which are outside the truncation
1606  * point.  Because the caller is about to free (and possibly reuse) those
1607  * blocks on-disk.
1608  */
1609 void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
1610 {
1611 	struct buffer_head *head, *bh, *next;
1612 	size_t curr_off = 0;
1613 	size_t stop = length + offset;
1614 
1615 	BUG_ON(!folio_test_locked(folio));
1616 
1617 	/*
1618 	 * Check for overflow
1619 	 */
1620 	BUG_ON(stop > folio_size(folio) || stop < length);
1621 
1622 	head = folio_buffers(folio);
1623 	if (!head)
1624 		return;
1625 
1626 	bh = head;
1627 	do {
1628 		size_t next_off = curr_off + bh->b_size;
1629 		next = bh->b_this_page;
1630 
1631 		/*
1632 		 * Are we still fully in range ?
1633 		 */
1634 		if (next_off > stop)
1635 			goto out;
1636 
1637 		/*
1638 		 * is this block fully invalidated?
1639 		 */
1640 		if (offset <= curr_off)
1641 			discard_buffer(bh);
1642 		curr_off = next_off;
1643 		bh = next;
1644 	} while (bh != head);
1645 
1646 	/*
1647 	 * We release buffers only if the entire folio is being invalidated.
1648 	 * The get_block cached value has been unconditionally invalidated,
1649 	 * so real IO is not possible anymore.
1650 	 */
1651 	if (length == folio_size(folio))
1652 		filemap_release_folio(folio, 0);
1653 out:
1654 	folio_clear_mappedtodisk(folio);
1655 }
1656 EXPORT_SYMBOL(block_invalidate_folio);
1657 
1658 /*
1659  * We attach and possibly dirty the buffers atomically wrt
1660  * block_dirty_folio() via i_private_lock.  try_to_free_buffers
1661  * is already excluded via the folio lock.
1662  */
struct buffer_head *create_empty_buffers(struct folio *folio,
		unsigned long blocksize, unsigned long b_state)
{
	struct buffer_head *bh, *head, *tail;
	/* __GFP_NOFAIL: the result is used below without a NULL check */
	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;

	head = folio_alloc_buffers(folio, blocksize, gfp);
	/*
	 * Walk the NULL-terminated list: OR the requested state bits into
	 * every buffer and remember the last one.
	 */
	bh = head;
	do {
		bh->b_state |= b_state;
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	/* Close the list into a ring */
	tail->b_this_page = head;

	spin_lock(&folio->mapping->i_private_lock);
	/* Mirror the folio's dirty/uptodate flags onto the new buffers */
	if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
		bh = head;
		do {
			if (folio_test_dirty(folio))
				set_buffer_dirty(bh);
			if (folio_test_uptodate(folio))
				set_buffer_uptodate(bh);
			bh = bh->b_this_page;
		} while (bh != head);
	}
	/* Attach the ring to the folio while still holding the lock */
	folio_attach_private(folio, head);
	spin_unlock(&folio->mapping->i_private_lock);

	return head;
}
EXPORT_SYMBOL(create_empty_buffers);
1695 
1696 /**
 * clean_bdev_aliases - clean a range of buffers in a block device
1698  * @bdev: Block device to clean buffers in
1699  * @block: Start of a range of blocks to clean
1700  * @len: Number of blocks to clean
1701  *
1702  * We are taking a range of blocks for data and we don't want writeback of any
1703  * buffer-cache aliases starting from return from this function and until the
1704  * moment when something will explicitly mark the buffer dirty (hopefully that
1705  * will not happen until we will free that block ;-) We don't even need to mark
1706  * it not-uptodate - nobody can expect anything from a newly allocated buffer
1707  * anyway. We used to use unmap_buffer() for such invalidation, but that was
1708  * wrong. We definitely don't want to mark the alias unmapped, for example - it
1709  * would confuse anyone who might pick it with bread() afterwards...
1710  *
1711  * Also..  Note that bforget() doesn't lock the buffer.  So there can be
1712  * writeout I/O going on against recently-freed buffers.  We don't wait on that
1713  * I/O in bforget() - it's more efficient to wait on the I/O only if we really
1714  * need to.  That happens here.
1715  */
void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
	struct address_space *bd_mapping = bdev->bd_mapping;
	const int blkbits = bd_mapping->host->i_blkbits;
	struct folio_batch fbatch;
	/* Page cache index of the first block in the range */
	pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE;
	pgoff_t end;
	int i, count;
	struct buffer_head *bh;
	struct buffer_head *head;

	/* Page cache index of the last block in the range */
	end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE;
	folio_batch_init(&fbatch);
	while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
		count = folio_batch_count(&fbatch);
		for (i = 0; i < count; i++) {
			struct folio *folio = fbatch.folios[i];

			/* Unlocked peek: folios without buffers are skipped */
			if (!folio_buffers(folio))
				continue;
			/*
			 * We use folio lock instead of bd_mapping->i_private_lock
			 * to pin buffers here since we can afford to sleep and
			 * it scales better than a global spinlock lock.
			 */
			folio_lock(folio);
			/* Recheck when the folio is locked which pins bhs */
			head = folio_buffers(folio);
			if (!head)
				goto unlock_page;
			bh = head;
			do {
				/* Unmapped buffers and blocks before the range */
				if (!buffer_mapped(bh) || (bh->b_blocknr < block))
					goto next;
				/* First block past the range: done with this folio */
				if (bh->b_blocknr >= block + len)
					break;
				/*
				 * In range: clear the dirty bit, wait for the
				 * buffer, then clear its "req" bit.
				 */
				clear_buffer_dirty(bh);
				wait_on_buffer(bh);
				clear_buffer_req(bh);
next:
				bh = bh->b_this_page;
			} while (bh != head);
unlock_page:
			folio_unlock(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
		/* End of range already reached? */
		if (index > end || !index)
			break;
	}
}
EXPORT_SYMBOL(clean_bdev_aliases);
1769 
1770 static struct buffer_head *folio_create_buffers(struct folio *folio,
1771 						struct inode *inode,
1772 						unsigned int b_state)
1773 {
1774 	struct buffer_head *bh;
1775 
1776 	BUG_ON(!folio_test_locked(folio));
1777 
1778 	bh = folio_buffers(folio);
1779 	if (!bh)
1780 		bh = create_empty_buffers(folio,
1781 				1 << READ_ONCE(inode->i_blkbits), b_state);
1782 	return bh;
1783 }
1784 
1785 /*
1786  * NOTE! All mapped/uptodate combinations are valid:
1787  *
1788  *	Mapped	Uptodate	Meaning
1789  *
1790  *	No	No		"unknown" - must do get_block()
1791  *	No	Yes		"hole" - zero-filled
1792  *	Yes	No		"allocated" - allocated on disk, not read in
1793  *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1794  *
1795  * "Dirty" is valid only with the last case (mapped+uptodate).
1796  */
1797 
1798 /*
1799  * While block_write_full_folio is writing back the dirty buffers under
1800  * the folio lock, whoever dirtied the buffers may decide to clean them
1801  * again at any time.  We handle that by only looking at the buffer
1802  * state inside lock_buffer().
1803  *
1804  * If block_write_full_folio() is called for regular writeback
1805  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a folio which
1806  * has a locked buffer.   This only can happen if someone has written
1807  * the buffer directly, with bh_submit().  At the address_space level
1808  * the folio writeback flag prevents this contention from occurring.
1809  *
1810  * If block_write_full_folio() is called with wbc->sync_mode ==
1811  * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1812  * causes the writes to be flagged as synchronous writes.
1813  */
int __block_write_full_folio(struct inode *inode, struct folio *folio,
			get_block_t *get_block, struct writeback_control *wbc)
{
	int err;
	sector_t block;
	sector_t last_block;
	struct buffer_head *bh, *head;
	size_t blocksize;
	/* Number of buffers submitted for write by this call */
	int nr_underway = 0;
	blk_opf_t write_flags = wbc_to_write_flags(wbc);

	/* Buffers created here start out dirty and uptodate */
	head = folio_create_buffers(folio, inode,
				    (1 << BH_Dirty) | (1 << BH_Uptodate));

	/*
	 * Be very careful.  We have no exclusion from block_dirty_folio
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time.  If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the folio stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by block_dirty_folio;
	 * handle that here by just cleaning them.
	 */

	bh = head;
	blocksize = bh->b_size;

	/* First block of the folio and last block inside i_size */
	block = div_u64(folio_pos(folio), blocksize);
	last_block = div_u64(i_size_read(inode) - 1, blocksize);

	/*
	 * Get all the dirty buffers mapped to disk addresses and
	 * handle any aliases from the underlying blockdev's mapping.
	 */
	do {
		if (block > last_block) {
			/*
			 * mapped buffers outside i_size will occur, because
			 * this folio can be outside i_size when there is a
			 * truncate in progress.
			 */
			/*
			 * The buffer was zeroed by block_write_full_folio()
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
			   buffer_dirty(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				goto recover;
			clear_buffer_delay(bh);
			if (buffer_new(bh)) {
				/* blockdev mappings never come here */
				clear_buffer_new(bh);
				clean_bdev_bh_alias(bh);
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/*
	 * Second pass: lock every mapped buffer and tag the ones that are
	 * still dirty for asynchronous write.
	 */
	do {
		if (!buffer_mapped(bh))
			continue;
		/*
		 * If it's a fully non-blocking write attempt and we cannot
		 * lock the buffer then redirty the folio.  Note that this can
		 * potentially cause a busy-wait loop from writeback threads
		 * and kswapd activity, but those code paths have their own
		 * higher-level throttling.
		 */
		if (wbc->sync_mode != WB_SYNC_NONE) {
			lock_buffer(bh);
		} else if (!trylock_buffer(bh)) {
			folio_redirty_for_writepage(wbc, folio);
			continue;
		}
		if (test_clear_buffer_dirty(bh)) {
			set_buffer_async_write(bh);
		} else {
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	/*
	 * The folio and its buffers are protected by the writeback flag,
	 * so we can drop the bh refcounts early.
	 */
	BUG_ON(folio_test_writeback(folio));
	folio_start_writeback(folio);

	/*
	 * Third pass: submit the tagged buffers.  b_this_page is read
	 * before the buffer is submitted.
	 */
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			__bh_submit(bh, REQ_OP_WRITE | write_flags,
					inode->i_write_hint, wbc,
					bh_end_async_write);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	folio_unlock(folio);

	err = 0;
done:
	if (nr_underway == 0) {
		/*
		 * The folio was marked dirty, but the buffers were
		 * clean.  Someone wrote them back by hand with
		 * write_dirty_buffer/bh_submit.  A rare case.
		 */
		folio_end_writeback(folio);

		/*
		 * The folio and buffer_heads can be released at any time from
		 * here on.
		 */
	}
	return err;

recover:
	/*
	 * ENOSPC, or some other error.  We may already have added some
	 * blocks to the file, so we need to write these out to avoid
	 * exposing stale data.
	 * The folio is currently locked and not marked for writeback
	 */
	bh = head;
	/* Recovery: lock and submit the mapped buffers */
	do {
		if (buffer_mapped(bh) && buffer_dirty(bh) &&
		    !buffer_delay(bh)) {
			lock_buffer(bh);
			set_buffer_async_write(bh);
		} else {
			/*
			 * The buffer may have been set dirty during
			 * attachment to a dirty folio.
			 */
			clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);
	BUG_ON(folio_test_writeback(folio));
	/* Record the get_block() error on the mapping before writing out */
	mapping_set_error(folio->mapping, err);
	folio_start_writeback(folio);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			__bh_submit(bh, REQ_OP_WRITE | write_flags,
					inode->i_write_hint, wbc,
					bh_end_async_write);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	folio_unlock(folio);
	goto done;
}
EXPORT_SYMBOL(__block_write_full_folio);
1976 
1977 /*
1978  * If a folio has any new buffers, zero them out here, and mark them uptodate
1979  * and dirty so they'll be written out (in order to prevent uninitialised
1980  * block data from leaking). And clear the new bit.
1981  */
1982 void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
1983 {
1984 	size_t block_start, block_end;
1985 	struct buffer_head *head, *bh;
1986 
1987 	BUG_ON(!folio_test_locked(folio));
1988 	head = folio_buffers(folio);
1989 	if (!head)
1990 		return;
1991 
1992 	bh = head;
1993 	block_start = 0;
1994 	do {
1995 		block_end = block_start + bh->b_size;
1996 
1997 		if (buffer_new(bh)) {
1998 			if (block_end > from && block_start < to) {
1999 				if (!folio_test_uptodate(folio)) {
2000 					size_t start, xend;
2001 
2002 					start = max(from, block_start);
2003 					xend = min(to, block_end);
2004 
2005 					folio_zero_segment(folio, start, xend);
2006 					set_buffer_uptodate(bh);
2007 				}
2008 
2009 				clear_buffer_new(bh);
2010 				mark_buffer_dirty(bh);
2011 			}
2012 		}
2013 
2014 		block_start = block_end;
2015 		bh = bh->b_this_page;
2016 	} while (bh != head);
2017 }
2018 EXPORT_SYMBOL(folio_zero_new_buffers);
2019 
2020 static int
2021 iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
2022 		const struct iomap *iomap)
2023 {
2024 	loff_t offset = (loff_t)block << inode->i_blkbits;
2025 
2026 	bh->b_bdev = iomap->bdev;
2027 
2028 	/*
2029 	 * Block points to offset in file we need to map, iomap contains
2030 	 * the offset at which the map starts. If the map ends before the
2031 	 * current block, then do not map the buffer and let the caller
2032 	 * handle it.
2033 	 */
2034 	if (offset >= iomap->offset + iomap->length)
2035 		return -EIO;
2036 
2037 	switch (iomap->type) {
2038 	case IOMAP_HOLE:
2039 		/*
2040 		 * If the buffer is not up to date or beyond the current EOF,
2041 		 * we need to mark it as new to ensure sub-block zeroing is
2042 		 * executed if necessary.
2043 		 */
2044 		if (!buffer_uptodate(bh) ||
2045 		    (offset >= i_size_read(inode)))
2046 			set_buffer_new(bh);
2047 		return 0;
2048 	case IOMAP_DELALLOC:
2049 		if (!buffer_uptodate(bh) ||
2050 		    (offset >= i_size_read(inode)))
2051 			set_buffer_new(bh);
2052 		set_buffer_uptodate(bh);
2053 		set_buffer_mapped(bh);
2054 		set_buffer_delay(bh);
2055 		return 0;
2056 	case IOMAP_UNWRITTEN:
2057 		/*
2058 		 * For unwritten regions, we always need to ensure that regions
2059 		 * in the block we are not writing to are zeroed. Mark the
2060 		 * buffer as new to ensure this.
2061 		 */
2062 		set_buffer_new(bh);
2063 		set_buffer_unwritten(bh);
2064 		fallthrough;
2065 	case IOMAP_MAPPED:
2066 		if ((iomap->flags & IOMAP_F_NEW) ||
2067 		    offset >= i_size_read(inode)) {
2068 			/*
2069 			 * This can happen if truncating the block device races
2070 			 * with the check in the caller as i_size updates on
2071 			 * block devices aren't synchronized by i_rwsem for
2072 			 * block devices.
2073 			 */
2074 			if (S_ISBLK(inode->i_mode))
2075 				return -EIO;
2076 			set_buffer_new(bh);
2077 		}
2078 		bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
2079 				inode->i_blkbits;
2080 		set_buffer_mapped(bh);
2081 		return 0;
2082 	default:
2083 		WARN_ON_ONCE(1);
2084 		return -EIO;
2085 	}
2086 }
2087 
/*
 * Prepare the buffers of a locked folio for a write of @len bytes at file
 * position @pos.
 *
 * Every buffer overlapping the write range that is not yet mapped is mapped
 * through @get_block (called with create == 1) or, when @get_block is NULL,
 * through iomap_to_bh() using @iomap.  Buffers that come back flagged new
 * have the parts lying outside the write range zeroed; buffers that are only
 * partially covered by the write and are not uptodate are read in.
 *
 * Returns 0 on success or a negative errno from the mapping callback, or
 * -EIO if a read did not leave its buffer uptodate.  On any error
 * folio_zero_new_buffers() is run over the write range before returning.
 */
int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
		get_block_t *get_block, const struct iomap *iomap)
{
	size_t from = offset_in_folio(folio, pos);
	size_t to = from + len;
	struct inode *inode = folio->mapping->host;
	size_t block_start, block_end;
	sector_t block;
	int err = 0;
	size_t blocksize;
	/*
	 * Reads are only issued for buffers that overlap [from, to) without
	 * being fully covered by it, which can only be the first and the last
	 * buffer of the range - hence two slots.
	 */
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

	BUG_ON(!folio_test_locked(folio));
	BUG_ON(to > folio_size(folio));
	BUG_ON(from > to);

	head = folio_create_buffers(folio, inode, 0);
	blocksize = head->b_size;
	/* Logical block number of the folio's first buffer. */
	block = div_u64(folio_pos(folio), blocksize);

	/*
	 * Visit each buffer once: block_start is zero only on the first pass,
	 * which lets the loop start at head and stop when it comes back to it.
	 */
	for (bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		/* Buffer entirely outside the write range. */
		if (block_end <= from || block_start >= to) {
			if (folio_test_uptodate(folio)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);
			if (get_block)
				err = get_block(inode, block, bh, 1);
			else
				err = iomap_to_bh(inode, block, bh, iomap);
			if (err)
				break;

			if (buffer_new(bh)) {
				clean_bdev_bh_alias(bh);
				if (folio_test_uptodate(folio)) {
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
					continue;
				}
				/*
				 * Zero the parts of the new block that the
				 * write will not cover: [to, block_end) and
				 * [block_start, from).
				 */
				if (block_end > to || block_start < from)
					folio_zero_segments(folio,
						to, block_end,
						block_start, from);
				continue;
			}
		}
		if (folio_test_uptodate(folio)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue;
		}
		/*
		 * Partially overwritten buffer with stale contents: start a
		 * read and remember the buffer so it can be waited for below.
		 */
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
		     (block_start < from || block_end > to)) {
			bh_read_nowait(bh, 0);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
	}
	if (unlikely(err))
		folio_zero_new_buffers(folio, from, to);
	return err;
}
2168 
2169 int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
2170 		get_block_t *get_block)
2171 {
2172 	return __block_write_begin_int(folio, pos, len, get_block, NULL);
2173 }
2174 EXPORT_SYMBOL(__block_write_begin);
2175 
2176 void block_commit_write(struct folio *folio, size_t from, size_t to)
2177 {
2178 	size_t block_start, block_end;
2179 	bool partial = false;
2180 	unsigned blocksize;
2181 	struct buffer_head *bh, *head;
2182 
2183 	bh = head = folio_buffers(folio);
2184 	if (!bh)
2185 		return;
2186 	blocksize = bh->b_size;
2187 
2188 	block_start = 0;
2189 	do {
2190 		block_end = block_start + blocksize;
2191 		if (block_end <= from || block_start >= to) {
2192 			if (!buffer_uptodate(bh))
2193 				partial = true;
2194 		} else {
2195 			set_buffer_uptodate(bh);
2196 			mark_buffer_dirty(bh);
2197 		}
2198 		if (buffer_new(bh))
2199 			clear_buffer_new(bh);
2200 
2201 		block_start = block_end;
2202 		bh = bh->b_this_page;
2203 	} while (bh != head);
2204 
2205 	/*
2206 	 * If this is a partial write which happened to make all buffers
2207 	 * uptodate then we can optimize away a bogus read_folio() for
2208 	 * the next read(). Here we 'discover' whether the folio went
2209 	 * uptodate as a result of this (potentially partial) write.
2210 	 */
2211 	if (!partial)
2212 		folio_mark_uptodate(folio);
2213 }
2214 EXPORT_SYMBOL(block_commit_write);
2215 
2216 /*
2217  * block_write_begin takes care of the basic task of block allocation and
2218  * bringing partial write blocks uptodate first.
2219  *
2220  * The filesystem needs to handle block truncation upon failure.
2221  */
2222 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2223 		struct folio **foliop, get_block_t *get_block)
2224 {
2225 	pgoff_t index = pos >> PAGE_SHIFT;
2226 	struct folio *folio;
2227 	int status;
2228 
2229 	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
2230 			mapping_gfp_mask(mapping));
2231 	if (IS_ERR(folio))
2232 		return PTR_ERR(folio);
2233 
2234 	status = __block_write_begin_int(folio, pos, len, get_block, NULL);
2235 	if (unlikely(status)) {
2236 		folio_unlock(folio);
2237 		folio_put(folio);
2238 		folio = NULL;
2239 	}
2240 
2241 	*foliop = folio;
2242 	return status;
2243 }
2244 EXPORT_SYMBOL(block_write_begin);
2245 
2246 int block_write_end(loff_t pos, unsigned len, unsigned copied,
2247 		struct folio *folio)
2248 {
2249 	size_t start = pos - folio_pos(folio);
2250 
2251 	if (unlikely(copied < len)) {
2252 		/*
2253 		 * The buffers that were written will now be uptodate, so
2254 		 * we don't have to worry about a read_folio reading them
2255 		 * and overwriting a partial write. However if we have
2256 		 * encountered a short write and only partially written
2257 		 * into a buffer, it will not be marked uptodate, so a
2258 		 * read_folio might come in and destroy our partial write.
2259 		 *
2260 		 * Do the simplest thing, and just treat any short write to a
2261 		 * non uptodate folio as a zero-length write, and force the
2262 		 * caller to redo the whole thing.
2263 		 */
2264 		if (!folio_test_uptodate(folio))
2265 			copied = 0;
2266 
2267 		folio_zero_new_buffers(folio, start+copied, start+len);
2268 	}
2269 	flush_dcache_folio(folio);
2270 
2271 	/* This could be a short (even 0-length) commit */
2272 	block_commit_write(folio, start, start + copied);
2273 
2274 	return copied;
2275 }
2276 EXPORT_SYMBOL(block_write_end);
2277 
2278 int generic_write_end(const struct kiocb *iocb, struct address_space *mapping,
2279 		      loff_t pos, unsigned len, unsigned copied,
2280 		      struct folio *folio, void *fsdata)
2281 {
2282 	struct inode *inode = mapping->host;
2283 	loff_t old_size = inode->i_size;
2284 	bool i_size_changed = false;
2285 
2286 	copied = block_write_end(pos, len, copied, folio);
2287 
2288 	/*
2289 	 * No need to use i_size_read() here, the i_size cannot change under us
2290 	 * because we hold i_rwsem.
2291 	 *
2292 	 * But it's important to update i_size while still holding folio lock:
2293 	 * page writeout could otherwise come in and zero beyond i_size.
2294 	 */
2295 	if (pos + copied > inode->i_size) {
2296 		i_size_write(inode, pos + copied);
2297 		i_size_changed = true;
2298 	}
2299 
2300 	folio_unlock(folio);
2301 	folio_put(folio);
2302 
2303 	if (old_size < pos)
2304 		pagecache_isize_extended(inode, old_size, pos);
2305 	/*
2306 	 * Don't mark the inode dirty under page lock. First, it unnecessarily
2307 	 * makes the holding time of page lock longer. Second, it forces lock
2308 	 * ordering of page lock and transaction start for journaling
2309 	 * filesystems.
2310 	 */
2311 	if (i_size_changed)
2312 		mark_inode_dirty(inode);
2313 	return copied;
2314 }
2315 EXPORT_SYMBOL(generic_write_end);
2316 
2317 /*
2318  * block_is_partially_uptodate checks whether buffers within a folio are
2319  * uptodate or not.
2320  *
2321  * Returns true if all buffers which correspond to the specified part
2322  * of the folio are uptodate.
2323  */
2324 bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
2325 {
2326 	unsigned block_start, block_end, blocksize;
2327 	unsigned to;
2328 	struct buffer_head *bh, *head;
2329 	bool ret = true;
2330 
2331 	head = folio_buffers(folio);
2332 	if (!head)
2333 		return false;
2334 	blocksize = head->b_size;
2335 	to = min(folio_size(folio) - from, count);
2336 	to = from + to;
2337 	if (from < blocksize && to > folio_size(folio) - blocksize)
2338 		return false;
2339 
2340 	bh = head;
2341 	block_start = 0;
2342 	do {
2343 		block_end = block_start + blocksize;
2344 		if (block_end > from && block_start < to) {
2345 			if (!buffer_uptodate(bh)) {
2346 				ret = false;
2347 				break;
2348 			}
2349 			if (block_end >= to)
2350 				break;
2351 		}
2352 		block_start = block_end;
2353 		bh = bh->b_this_page;
2354 	} while (bh != head);
2355 
2356 	return ret;
2357 }
2358 EXPORT_SYMBOL(block_is_partially_uptodate);
2359 
/*
 * Generic "read_folio" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the folio asynchronously --- the unlock_buffer() and
 * set/clear_buffer_uptodate() functions propagate buffer state into the
 * folio once IO has completed.
 *
 * @folio: the folio to read; buffers are created for it if needed.
 * @get_block: callback used to map buffers that are not mapped yet
 *             (called with create == 0).
 *
 * Always returns 0.  When no read had to be submitted the folio read is
 * finished here via folio_end_read(), with success depending on whether
 * @get_block reported an error for any buffer.
 */
int block_read_full_folio(struct folio *folio, get_block_t *get_block)
{
	struct inode *inode = folio->mapping->host;
	sector_t iblock, lblock;
	struct buffer_head *bh, *head, *prev = NULL;
	size_t blocksize;
	int fully_mapped = 1;
	bool page_error = false;
	loff_t limit = i_size_read(inode);

	/* This is needed for ext4. */
	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
		limit = inode->i_sb->s_maxbytes;

	head = folio_create_buffers(folio, inode, 0);
	blocksize = head->b_size;

	/* First block of the folio, and first block past the read limit. */
	iblock = div_u64(folio_pos(folio), blocksize);
	lblock = div_u64(limit + blocksize - 1, blocksize);
	bh = head;

	/*
	 * Note that "continue" jumps to the loop condition below, so iblock
	 * and bh still advance for buffers that are skipped.
	 */
	do {
		if (buffer_uptodate(bh))
			continue;

		if (!buffer_mapped(bh)) {
			int err = 0;

			fully_mapped = 0;
			/* Only blocks below the limit are looked up. */
			if (iblock < lblock) {
				WARN_ON(bh->b_size != blocksize);
				err = get_block(inode, iblock, bh, 0);
				if (err)
					page_error = true;
			}
			/*
			 * Still unmapped: there is nothing to read, so the
			 * buffer's part of the folio is zeroed.  It is only
			 * marked uptodate if the lookup did not fail.
			 */
			if (!buffer_mapped(bh)) {
				folio_zero_range(folio, bh_offset(bh),
						blocksize);
				if (!err)
					set_buffer_uptodate(bh);
				continue;
			}
			/*
			 * get_block() might have updated the buffer
			 * synchronously
			 */
			if (buffer_uptodate(bh))
				continue;
		}

		lock_buffer(bh);
		if (buffer_uptodate(bh)) {
			unlock_buffer(bh);
			continue;
		}

		/*
		 * If a folio's buffers are under async readin
		 * (end_buffer_async_read completion) then there is a
		 * possibility that another thread of control could lock
		 * one of the buffers after it has completed but while
		 * some of the other buffers have not completed.  This
		 * locked buffer would confuse end_buffer_async_read()
		 * into not unlocking the folio.  So the absence of
		 * BH_Async_Read tells end_buffer_async_read() that this
		 * buffer is not under async I/O.
		 *
		 * The folio comes unlocked when it has no locked
		 * buffer_async buffers left.
		 *
		 * The folio lock prevents anyone starting new async
		 * I/O reads into any of the buffers.
		 *
		 * The writeback flag is used to prevent simultaneous
		 * writeout of the same folio.
		 *
		 * The folio lock prevents anyone from starting writeback
		 * of a folio which is under read I/O (the writeback
		 * flag is only ever set on a locked folio).
		 */
		set_buffer_async_read(bh);
		/*
		 * Submission runs one buffer behind: the previous buffer is
		 * submitted only once this one has been locked and flagged,
		 * and the final one is submitted after the loop.
		 */
		if (prev)
			bh_submit(prev, REQ_OP_READ, bh_end_async_read);
		prev = bh;
	} while (iblock++, (bh = bh->b_this_page) != head);

	if (fully_mapped)
		folio_set_mappedtodisk(folio);

	/*
	 * All buffers are uptodate or get_block() returned an error
	 * when trying to map them - we must finish the read because
	 * end_buffer_async_read() will never be called on any buffer
	 * in this folio.
	 */
	if (prev)
		bh_submit(prev, REQ_OP_READ, bh_end_async_read);
	else
		folio_end_read(folio, !page_error);

	return 0;
}
EXPORT_SYMBOL(block_read_full_folio);
2470 
2471 /* utility function for filesystems that need to do work on expanding
2472  * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2473  * deal with the hole.
2474  */
2475 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2476 {
2477 	struct address_space *mapping = inode->i_mapping;
2478 	const struct address_space_operations *aops = mapping->a_ops;
2479 	struct folio *folio;
2480 	void *fsdata = NULL;
2481 	int err;
2482 
2483 	err = inode_newsize_ok(inode, size);
2484 	if (err)
2485 		goto out;
2486 
2487 	err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata);
2488 	if (err)
2489 		goto out;
2490 
2491 	err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata);
2492 	BUG_ON(err > 0);
2493 
2494 out:
2495 	return err;
2496 }
2497 EXPORT_SYMBOL(generic_cont_expand_simple);
2498 
/*
 * Zero-fill the file between *@bytes and @pos, one page at a time, by pushing
 * zeroed data through the address space's write_begin/write_end.
 *
 * Whenever the position being zeroed from is not block aligned, *@bytes is
 * rounded up to the next block boundary before the write is issued.
 *
 * Returns 0 on success, a negative errno from write_begin/write_end, or
 * -EINTR if a fatal signal is pending after a full page has been zeroed.
 */
static int cont_expand_zero(const struct kiocb *iocb,
			    struct address_space *mapping,
			    loff_t pos, loff_t *bytes)
{
	struct inode *inode = mapping->host;
	const struct address_space_operations *aops = mapping->a_ops;
	unsigned int blocksize = i_blocksize(inode);
	struct folio *folio;
	void *fsdata = NULL;
	pgoff_t index, curidx;
	loff_t curpos;
	unsigned zerofrom, offset, len;
	int err = 0;

	/* Page index and in-page offset of the target position. */
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;

	/*
	 * Zero whole page tails for as long as *bytes lies on a page before
	 * the one containing pos.  curpos and curidx are re-read from *bytes
	 * on every pass.
	 */
	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
		zerofrom = curpos & ~PAGE_MASK;
		if (zerofrom & (blocksize-1)) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}
		len = PAGE_SIZE - zerofrom;

		err = aops->write_begin(iocb, mapping, curpos, len,
					    &folio, &fsdata);
		if (err)
			goto out;
		folio_zero_range(folio, offset_in_folio(folio, curpos), len);
		err = aops->write_end(iocb, mapping, curpos, len, len,
						folio, fsdata);
		if (err < 0)
			goto out;
		/* A short write_end is not tolerated here. */
		BUG_ON(err != len);
		err = 0;

		balance_dirty_pages_ratelimited(mapping);

		if (fatal_signal_pending(current)) {
			err = -EINTR;
			goto out;
		}
	}

	/* page covers the boundary, find the boundary offset */
	if (index == curidx) {
		zerofrom = curpos & ~PAGE_MASK;
		/* if we will expand the thing last block will be filled */
		if (offset <= zerofrom) {
			goto out;
		}
		if (zerofrom & (blocksize-1)) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}
		/* Zero only up to the target offset within this page. */
		len = offset - zerofrom;

		err = aops->write_begin(iocb, mapping, curpos, len,
					    &folio, &fsdata);
		if (err)
			goto out;
		folio_zero_range(folio, offset_in_folio(folio, curpos), len);
		err = aops->write_end(iocb, mapping, curpos, len, len,
						folio, fsdata);
		if (err < 0)
			goto out;
		BUG_ON(err != len);
		err = 0;
	}
out:
	return err;
}
2572 
2573 /*
2574  * For moronic filesystems that do not allow holes in file.
2575  * We may have to extend the file.
2576  */
2577 int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping,
2578 		     loff_t pos, unsigned len, struct folio **foliop,
2579 		     void **fsdata, get_block_t *get_block, loff_t *bytes)
2580 {
2581 	struct inode *inode = mapping->host;
2582 	unsigned int blocksize = i_blocksize(inode);
2583 	unsigned int zerofrom;
2584 	int err;
2585 
2586 	err = cont_expand_zero(iocb, mapping, pos, bytes);
2587 	if (err)
2588 		return err;
2589 
2590 	zerofrom = *bytes & ~PAGE_MASK;
2591 	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2592 		*bytes |= (blocksize-1);
2593 		(*bytes)++;
2594 	}
2595 
2596 	return block_write_begin(mapping, pos, len, foliop, get_block);
2597 }
2598 EXPORT_SYMBOL(cont_write_begin);
2599 
2600 /*
2601  * block_page_mkwrite() is not allowed to change the file size as it gets
2602  * called from a page fault handler when a page is first dirtied. Hence we must
2603  * be careful to check for EOF conditions here. We set the page up correctly
2604  * for a written page which means we get ENOSPC checking when writing into
2605  * holes and correct delalloc and unwritten extent mapping on filesystems that
2606  * support these features.
2607  *
2608  * We are not allowed to take the i_rwsem here so we have to play games to
2609  * protect against truncate races as the page could now be beyond EOF.  Because
2610  * truncate writes the inode size before removing pages, once we have the
2611  * page lock we can determine safely if the page is beyond EOF. If it is not
2612  * beyond EOF, then the page is guaranteed safe against truncation until we
2613  * unlock the page.
2614  *
2615  * Direct callers of this function should protect against filesystem freezing
2616  * using sb_start_pagefault() - sb_end_pagefault() functions.
2617  */
2618 int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2619 			 get_block_t get_block)
2620 {
2621 	struct folio *folio = page_folio(vmf->page);
2622 	struct inode *inode = file_inode(vma->vm_file);
2623 	unsigned long end;
2624 	loff_t size;
2625 	int ret;
2626 
2627 	folio_lock(folio);
2628 	size = i_size_read(inode);
2629 	if ((folio->mapping != inode->i_mapping) ||
2630 	    (folio_pos(folio) >= size)) {
2631 		/* We overload EFAULT to mean page got truncated */
2632 		ret = -EFAULT;
2633 		goto out_unlock;
2634 	}
2635 
2636 	end = folio_size(folio);
2637 	/* folio is wholly or partially inside EOF */
2638 	if (folio_pos(folio) + end > size)
2639 		end = size - folio_pos(folio);
2640 
2641 	ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
2642 	if (unlikely(ret))
2643 		goto out_unlock;
2644 
2645 	block_commit_write(folio, 0, end);
2646 
2647 	folio_mark_dirty(folio);
2648 	folio_wait_stable(folio);
2649 	return 0;
2650 out_unlock:
2651 	folio_unlock(folio);
2652 	return ret;
2653 }
2654 EXPORT_SYMBOL(block_page_mkwrite);
2655 
2656 int block_truncate_page(struct address_space *mapping,
2657 			loff_t from, get_block_t *get_block)
2658 {
2659 	pgoff_t index = from >> PAGE_SHIFT;
2660 	unsigned blocksize;
2661 	sector_t iblock;
2662 	size_t offset, length, pos;
2663 	struct inode *inode = mapping->host;
2664 	struct folio *folio;
2665 	struct buffer_head *bh;
2666 	int err = 0;
2667 
2668 	blocksize = i_blocksize(inode);
2669 	length = from & (blocksize - 1);
2670 
2671 	/* Block boundary? Nothing to do */
2672 	if (!length)
2673 		return 0;
2674 
2675 	length = blocksize - length;
2676 	iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits;
2677 
2678 	folio = filemap_grab_folio(mapping, index);
2679 	if (IS_ERR(folio))
2680 		return PTR_ERR(folio);
2681 
2682 	bh = folio_buffers(folio);
2683 	if (!bh)
2684 		bh = create_empty_buffers(folio, blocksize, 0);
2685 
2686 	/* Find the buffer that contains "offset" */
2687 	offset = offset_in_folio(folio, from);
2688 	pos = blocksize;
2689 	while (offset >= pos) {
2690 		bh = bh->b_this_page;
2691 		iblock++;
2692 		pos += blocksize;
2693 	}
2694 
2695 	if (!buffer_mapped(bh)) {
2696 		WARN_ON(bh->b_size != blocksize);
2697 		err = get_block(inode, iblock, bh, 0);
2698 		if (err)
2699 			goto unlock;
2700 		/* unmapped? It's a hole - nothing to do */
2701 		if (!buffer_mapped(bh))
2702 			goto unlock;
2703 	}
2704 
2705 	/* Ok, it's mapped. Make sure it's up-to-date */
2706 	if (folio_test_uptodate(folio))
2707 		set_buffer_uptodate(bh);
2708 
2709 	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2710 		err = bh_read(bh, 0);
2711 		/* Uhhuh. Read error. Complain and punt. */
2712 		if (err < 0)
2713 			goto unlock;
2714 	}
2715 
2716 	folio_zero_range(folio, offset, length);
2717 	mark_buffer_dirty(bh);
2718 
2719 unlock:
2720 	folio_unlock(folio);
2721 	folio_put(folio);
2722 
2723 	return err;
2724 }
2725 EXPORT_SYMBOL(block_truncate_page);
2726 
2727 /*
2728  * The generic write folio function for buffer-backed address_spaces
2729  */
2730 int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
2731 		void *get_block)
2732 {
2733 	struct inode * const inode = folio->mapping->host;
2734 	loff_t i_size = i_size_read(inode);
2735 
2736 	/* Is the folio fully inside i_size? */
2737 	if (folio_next_pos(folio) <= i_size)
2738 		return __block_write_full_folio(inode, folio, get_block, wbc);
2739 
2740 	/* Is the folio fully outside i_size? (truncate in progress) */
2741 	if (folio_pos(folio) >= i_size) {
2742 		folio_unlock(folio);
2743 		return 0; /* don't care */
2744 	}
2745 
2746 	/*
2747 	 * The folio straddles i_size.  It must be zeroed out on each and every
2748 	 * writeback invocation because it may be mmapped.  "A file is mapped
2749 	 * in multiples of the page size.  For a file that is not a multiple of
2750 	 * the page size, the remaining memory is zeroed when mapped, and
2751 	 * writes to that region are not written out to the file."
2752 	 */
2753 	folio_zero_segment(folio, offset_in_folio(folio, i_size),
2754 			folio_size(folio));
2755 	return __block_write_full_folio(inode, folio, get_block, wbc);
2756 }
2757 
2758 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2759 			    get_block_t *get_block)
2760 {
2761 	struct inode *inode = mapping->host;
2762 	struct buffer_head tmp = {
2763 		.b_size = i_blocksize(inode),
2764 	};
2765 
2766 	get_block(inode, block, &tmp, 0);
2767 	return tmp.b_blocknr;
2768 }
2769 EXPORT_SYMBOL(generic_block_bmap);
2770 
2771 void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2772 {
2773 	lock_buffer(bh);
2774 	if (!test_clear_buffer_dirty(bh)) {
2775 		unlock_buffer(bh);
2776 		return;
2777 	}
2778 	bh_submit(bh, REQ_OP_WRITE | op_flags, bh_end_write);
2779 }
2780 EXPORT_SYMBOL(write_dirty_buffer);
2781 
2782 /*
2783  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2784  * and then start new I/O and then wait upon it.  The caller must have a ref on
2785  * the buffer_head.
2786  */
2787 int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2788 {
2789 	WARN_ON(atomic_read(&bh->b_count) < 1);
2790 	lock_buffer(bh);
2791 	if (test_clear_buffer_dirty(bh)) {
2792 		/*
2793 		 * The bh should be mapped, but it might not be if the
2794 		 * device was hot-removed. Not much we can do but fail the I/O.
2795 		 */
2796 		if (!buffer_mapped(bh)) {
2797 			unlock_buffer(bh);
2798 			return -EIO;
2799 		}
2800 
2801 		bh_submit(bh, REQ_OP_WRITE | op_flags, bh_end_write);
2802 		wait_on_buffer(bh);
2803 		if (!buffer_uptodate(bh))
2804 			return -EIO;
2805 	} else {
2806 		unlock_buffer(bh);
2807 	}
2808 	return 0;
2809 }
2810 EXPORT_SYMBOL(__sync_dirty_buffer);
2811 
2812 int sync_dirty_buffer(struct buffer_head *bh)
2813 {
2814 	return __sync_dirty_buffer(bh, REQ_SYNC);
2815 }
2816 EXPORT_SYMBOL(sync_dirty_buffer);
2817 
2818 static inline int buffer_busy(struct buffer_head *bh)
2819 {
2820 	return atomic_read(&bh->b_count) |
2821 		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2822 }
2823 
2824 static bool
2825 drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
2826 {
2827 	struct buffer_head *head = folio_buffers(folio);
2828 	struct buffer_head *bh;
2829 
2830 	bh = head;
2831 	do {
2832 		if (buffer_busy(bh))
2833 			goto failed;
2834 		bh = bh->b_this_page;
2835 	} while (bh != head);
2836 
2837 	do {
2838 		struct buffer_head *next = bh->b_this_page;
2839 
2840 		remove_assoc_queue(bh);
2841 		bh = next;
2842 	} while (bh != head);
2843 	*buffers_to_free = head;
2844 	folio_detach_private(folio);
2845 	return true;
2846 failed:
2847 	return false;
2848 }
2849 
2850 /**
2851  * try_to_free_buffers - Release buffers attached to this folio.
2852  * @folio: The folio.
2853  *
2854  * If any buffers are in use (dirty, under writeback, elevated refcount),
2855  * no buffers will be freed.
2856  *
2857  * If the folio is dirty but all the buffers are clean then we need to
2858  * be sure to mark the folio clean as well.  This is because the folio
2859  * may be against a block device, and a later reattachment of buffers
2860  * to a dirty folio will set *all* buffers dirty.  Which would corrupt
2861  * filesystem data on the same device.
2862  *
2863  * The same applies to regular filesystem folios: if all the buffers are
2864  * clean then we set the folio clean and proceed.  To do that, we require
2865  * total exclusion from block_dirty_folio().  That is obtained with
2866  * i_private_lock.
2867  *
2868  * Exclusion against try_to_free_buffers may be obtained by either
2869  * locking the folio or by holding its mapping's i_private_lock.
2870  *
2871  * Context: Process context.  @folio must be locked.  Will not sleep.
2872  * Return: true if all buffers attached to this folio were freed.
2873  */
2874 bool try_to_free_buffers(struct folio *folio)
2875 {
2876 	struct address_space * const mapping = folio->mapping;
2877 	struct buffer_head *buffers_to_free = NULL;
2878 	bool ret = 0;
2879 
2880 	BUG_ON(!folio_test_locked(folio));
2881 	if (folio_test_writeback(folio))
2882 		return false;
2883 
2884 	/* Misconfigured folio check */
2885 	if (WARN_ON_ONCE(!folio_buffers(folio)))
2886 		return true;
2887 
2888 	if (mapping == NULL) {		/* can this still happen? */
2889 		ret = drop_buffers(folio, &buffers_to_free);
2890 		goto out;
2891 	}
2892 
2893 	spin_lock(&mapping->i_private_lock);
2894 	ret = drop_buffers(folio, &buffers_to_free);
2895 
2896 	/*
2897 	 * If the filesystem writes its buffers by hand (eg ext3)
2898 	 * then we can have clean buffers against a dirty folio.  We
2899 	 * clean the folio here; otherwise the VM will never notice
2900 	 * that the filesystem did any IO at all.
2901 	 *
2902 	 * Also, during truncate, discard_buffer will have marked all
2903 	 * the folio's buffers clean.  We discover that here and clean
2904 	 * the folio also.
2905 	 *
2906 	 * i_private_lock must be held over this entire operation in order
2907 	 * to synchronise against block_dirty_folio and prevent the
2908 	 * dirty bit from being lost.
2909 	 */
2910 	if (ret)
2911 		folio_cancel_dirty(folio);
2912 	spin_unlock(&mapping->i_private_lock);
2913 out:
2914 	if (buffers_to_free) {
2915 		struct buffer_head *bh = buffers_to_free;
2916 
2917 		do {
2918 			struct buffer_head *next = bh->b_this_page;
2919 			free_buffer_head(bh);
2920 			bh = next;
2921 		} while (bh != buffers_to_free);
2922 	}
2923 	return ret;
2924 }
2925 EXPORT_SYMBOL(try_to_free_buffers);
2926 
/*
 * Buffer-head allocation
 *
 * bh_cachep is the slab cache that alloc_buffer_head() allocates from and
 * free_buffer_head() returns to; it is created in buffer_init().
 */
static struct kmem_cache *bh_cachep __ro_after_init;

/*
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.
 *
 * The value is computed once in buffer_init().
 */
static unsigned long max_buffer_heads __ro_after_init;

/*
 * Set by recalc_bh_state() to whether the summed per-CPU count of live
 * buffer_heads exceeds max_buffer_heads.
 */
int buffer_heads_over_limit;

/* Per-CPU buffer_head bookkeeping, see recalc_bh_state(). */
struct bh_accounting {
	int nr;			/* Number of live bh's */
	int ratelimit;		/* Limit cacheline bouncing */
};

static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2946 
2947 static void recalc_bh_state(void)
2948 {
2949 	int i;
2950 	int tot = 0;
2951 
2952 	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
2953 		return;
2954 	__this_cpu_write(bh_accounting.ratelimit, 0);
2955 	for_each_online_cpu(i)
2956 		tot += per_cpu(bh_accounting, i).nr;
2957 	buffer_heads_over_limit = (tot > max_buffer_heads);
2958 }
2959 
2960 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
2961 {
2962 	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
2963 	if (ret) {
2964 		INIT_LIST_HEAD(&ret->b_assoc_buffers);
2965 		spin_lock_init(&ret->b_uptodate_lock);
2966 		preempt_disable();
2967 		__this_cpu_inc(bh_accounting.nr);
2968 		recalc_bh_state();
2969 		preempt_enable();
2970 	}
2971 	return ret;
2972 }
2973 EXPORT_SYMBOL(alloc_buffer_head);
2974 
/*
 * Return @bh to the buffer_head slab cache and drop it from this CPU's live
 * count.  The buffer must no longer be on an association list; that is a
 * BUG otherwise.
 */
void free_buffer_head(struct buffer_head *bh)
{
	BUG_ON(!list_empty(&bh->b_assoc_buffers));
	kmem_cache_free(bh_cachep, bh);
	/* The per-CPU counters are updated with preemption off. */
	preempt_disable();
	__this_cpu_dec(bh_accounting.nr);
	recalc_bh_state();
	preempt_enable();
}
EXPORT_SYMBOL(free_buffer_head);
2985 
2986 static int buffer_exit_cpu_dead(unsigned int cpu)
2987 {
2988 	int i;
2989 	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
2990 
2991 	for (i = 0; i < BH_LRU_SIZE; i++) {
2992 		brelse(b->bhs[i]);
2993 		b->bhs[i] = NULL;
2994 	}
2995 	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
2996 	per_cpu(bh_accounting, cpu).nr = 0;
2997 	return 0;
2998 }
2999 
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return: 1 if the buffer is up-to-date (the buffer is left unlocked),
 * 0 if it is not, in which case the buffer is returned locked.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	/* Fast path: no need to take the lock at all. */
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	if (!buffer_uptodate(bh))
		return 0;

	/* It became uptodate while we were acquiring the lock. */
	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3018 
3019 /**
3020  * __bh_read - Submit read for a locked buffer
3021  * @bh: struct buffer_head
3022  * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3023  * @wait: wait until reading finish
3024  *
3025  * Returns zero on success or don't wait, and -EIO on error.
3026  */
3027 int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
3028 {
3029 	int ret = 0;
3030 
3031 	BUG_ON(!buffer_locked(bh));
3032 
3033 	bh_submit(bh, REQ_OP_READ | op_flags, bh_end_read);
3034 	if (wait) {
3035 		wait_on_buffer(bh);
3036 		if (!buffer_uptodate(bh))
3037 			ret = -EIO;
3038 	}
3039 	return ret;
3040 }
3041 EXPORT_SYMBOL(__bh_read);
3042 
3043 /**
3044  * __bh_read_batch - Submit read for a batch of unlocked buffers
3045  * @nr: entry number of the buffer batch
3046  * @bhs: a batch of struct buffer_head
3047  * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3048  * @force_lock: force to get a lock on the buffer if set, otherwise drops any
3049  *              buffer that cannot lock.
3050  *
3051  * Returns zero on success or don't wait, and -EIO on error.
3052  */
3053 void __bh_read_batch(int nr, struct buffer_head *bhs[],
3054 		     blk_opf_t op_flags, bool force_lock)
3055 {
3056 	int i;
3057 
3058 	for (i = 0; i < nr; i++) {
3059 		struct buffer_head *bh = bhs[i];
3060 
3061 		if (buffer_uptodate(bh))
3062 			continue;
3063 
3064 		if (force_lock)
3065 			lock_buffer(bh);
3066 		else
3067 			if (!trylock_buffer(bh))
3068 				continue;
3069 
3070 		if (buffer_uptodate(bh)) {
3071 			unlock_buffer(bh);
3072 			continue;
3073 		}
3074 
3075 		bh_submit(bh, REQ_OP_READ | op_flags, bh_end_read);
3076 	}
3077 }
3078 EXPORT_SYMBOL(__bh_read_batch);
3079 
3080 void __init buffer_init(void)
3081 {
3082 	unsigned long nrpages;
3083 	int ret;
3084 
3085 	bh_cachep = KMEM_CACHE(buffer_head,
3086 				SLAB_RECLAIM_ACCOUNT|SLAB_PANIC);
3087 	/*
3088 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3089 	 */
3090 	nrpages = (nr_free_buffer_pages() * 10) / 100;
3091 	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3092 	ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
3093 					NULL, buffer_exit_cpu_dead);
3094 	WARN_ON(ret < 0);
3095 }
3096