xref: /linux/block/bdev.c (revision 11a299a7933e03c83818b431e6a1c53ad387423d)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Copyright (C) 1991, 1992  Linus Torvalds
4  *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
5  *  Copyright (C) 2016 - 2020 Christoph Hellwig
6  */
7 
8 #include <linux/init.h>
9 #include <linux/mm.h>
10 #include <linux/slab.h>
11 #include <linux/kmod.h>
12 #include <linux/major.h>
13 #include <linux/device_cgroup.h>
14 #include <linux/blkdev.h>
15 #include <linux/blk-integrity.h>
16 #include <linux/backing-dev.h>
17 #include <linux/module.h>
18 #include <linux/blkpg.h>
19 #include <linux/magic.h>
20 #include <linux/buffer_head.h>
21 #include <linux/swap.h>
22 #include <linux/writeback.h>
23 #include <linux/mount.h>
24 #include <linux/pseudo_fs.h>
25 #include <linux/uio.h>
26 #include <linux/namei.h>
27 #include <linux/security.h>
28 #include <linux/part_stat.h>
29 #include <linux/uaccess.h>
30 #include <linux/stat.h>
31 #include "../fs/internal.h"
32 #include "blk.h"
33 
34 /* Should we allow writing to mounted block devices? */
35 static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);
36 
37 struct bdev_inode {
38 	struct block_device bdev;
39 	struct inode vfs_inode;
40 };
41 
BDEV_I(struct inode * inode)42 static inline struct bdev_inode *BDEV_I(struct inode *inode)
43 {
44 	return container_of(inode, struct bdev_inode, vfs_inode);
45 }
46 
BD_INODE(struct block_device * bdev)47 static inline struct inode *BD_INODE(struct block_device *bdev)
48 {
49 	return &container_of(bdev, struct bdev_inode, bdev)->vfs_inode;
50 }
51 
I_BDEV(struct inode * inode)52 struct block_device *I_BDEV(struct inode *inode)
53 {
54 	return &BDEV_I(inode)->bdev;
55 }
56 EXPORT_SYMBOL(I_BDEV);
57 
file_bdev(struct file * bdev_file)58 struct block_device *file_bdev(struct file *bdev_file)
59 {
60 	return I_BDEV(bdev_file->f_mapping->host);
61 }
62 EXPORT_SYMBOL(file_bdev);
63 
bdev_write_inode(struct block_device * bdev)64 static void bdev_write_inode(struct block_device *bdev)
65 {
66 	struct inode *inode = BD_INODE(bdev);
67 	int ret;
68 
69 	spin_lock(&inode->i_lock);
70 	while (inode->i_state & I_DIRTY) {
71 		spin_unlock(&inode->i_lock);
72 		ret = write_inode_now(inode, true);
73 		if (ret)
74 			pr_warn_ratelimited(
75 	"VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
76 				bdev, ret);
77 		spin_lock(&inode->i_lock);
78 	}
79 	spin_unlock(&inode->i_lock);
80 }
81 
82 /* Kill _all_ buffers and pagecache , dirty or not.. */
kill_bdev(struct block_device * bdev)83 static void kill_bdev(struct block_device *bdev)
84 {
85 	struct address_space *mapping = bdev->bd_mapping;
86 
87 	if (mapping_empty(mapping))
88 		return;
89 
90 	invalidate_bh_lrus();
91 	truncate_inode_pages(mapping, 0);
92 }
93 
94 /* Invalidate clean unused buffers and pagecache. */
invalidate_bdev(struct block_device * bdev)95 void invalidate_bdev(struct block_device *bdev)
96 {
97 	struct address_space *mapping = bdev->bd_mapping;
98 
99 	if (mapping->nrpages) {
100 		invalidate_bh_lrus();
101 		lru_add_drain_all();	/* make sure all lru add caches are flushed */
102 		invalidate_mapping_pages(mapping, 0, -1);
103 	}
104 }
105 EXPORT_SYMBOL(invalidate_bdev);
106 
107 /*
108  * Drop all buffers & page cache for given bdev range. This function bails
109  * with error if bdev has other exclusive owner (such as filesystem).
110  */
truncate_bdev_range(struct block_device * bdev,blk_mode_t mode,loff_t lstart,loff_t lend)111 int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
112 			loff_t lstart, loff_t lend)
113 {
114 	/*
115 	 * If we don't hold exclusive handle for the device, upgrade to it
116 	 * while we discard the buffer cache to avoid discarding buffers
117 	 * under live filesystem.
118 	 */
119 	if (!(mode & BLK_OPEN_EXCL)) {
120 		int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL);
121 		if (err)
122 			goto invalidate;
123 	}
124 
125 	truncate_inode_pages_range(bdev->bd_mapping, lstart, lend);
126 	if (!(mode & BLK_OPEN_EXCL))
127 		bd_abort_claiming(bdev, truncate_bdev_range);
128 	return 0;
129 
130 invalidate:
131 	/*
132 	 * Someone else has handle exclusively open. Try invalidating instead.
133 	 * The 'end' argument is inclusive so the rounding is safe.
134 	 */
135 	return invalidate_inode_pages2_range(bdev->bd_mapping,
136 					     lstart >> PAGE_SHIFT,
137 					     lend >> PAGE_SHIFT);
138 }
139 
set_init_blocksize(struct block_device * bdev)140 static void set_init_blocksize(struct block_device *bdev)
141 {
142 	unsigned int bsize = bdev_logical_block_size(bdev);
143 	loff_t size = i_size_read(BD_INODE(bdev));
144 
145 	while (bsize < PAGE_SIZE) {
146 		if (size & bsize)
147 			break;
148 		bsize <<= 1;
149 	}
150 	BD_INODE(bdev)->i_blkbits = blksize_bits(bsize);
151 }
152 
set_blocksize(struct file * file,int size)153 int set_blocksize(struct file *file, int size)
154 {
155 	struct inode *inode = file->f_mapping->host;
156 	struct block_device *bdev = I_BDEV(inode);
157 
158 	/* Size must be a power of two, and between 512 and PAGE_SIZE */
159 	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
160 		return -EINVAL;
161 
162 	/* Size cannot be smaller than the size supported by the device */
163 	if (size < bdev_logical_block_size(bdev))
164 		return -EINVAL;
165 
166 	if (!file->private_data)
167 		return -EINVAL;
168 
169 	/* Don't change the size if it is same as current */
170 	if (inode->i_blkbits != blksize_bits(size)) {
171 		sync_blockdev(bdev);
172 		inode->i_blkbits = blksize_bits(size);
173 		kill_bdev(bdev);
174 	}
175 	return 0;
176 }
177 
178 EXPORT_SYMBOL(set_blocksize);
179 
sb_set_blocksize(struct super_block * sb,int size)180 int sb_set_blocksize(struct super_block *sb, int size)
181 {
182 	if (set_blocksize(sb->s_bdev_file, size))
183 		return 0;
184 	/* If we get here, we know size is power of two
185 	 * and it's value is between 512 and PAGE_SIZE */
186 	sb->s_blocksize = size;
187 	sb->s_blocksize_bits = blksize_bits(size);
188 	return sb->s_blocksize;
189 }
190 
191 EXPORT_SYMBOL(sb_set_blocksize);
192 
sb_min_blocksize(struct super_block * sb,int size)193 int sb_min_blocksize(struct super_block *sb, int size)
194 {
195 	int minsize = bdev_logical_block_size(sb->s_bdev);
196 	if (size < minsize)
197 		size = minsize;
198 	return sb_set_blocksize(sb, size);
199 }
200 
201 EXPORT_SYMBOL(sb_min_blocksize);
202 
sync_blockdev_nowait(struct block_device * bdev)203 int sync_blockdev_nowait(struct block_device *bdev)
204 {
205 	if (!bdev)
206 		return 0;
207 	return filemap_flush(bdev->bd_mapping);
208 }
209 EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
210 
211 /*
212  * Write out and wait upon all the dirty data associated with a block
213  * device via its mapping.  Does not take the superblock lock.
214  */
sync_blockdev(struct block_device * bdev)215 int sync_blockdev(struct block_device *bdev)
216 {
217 	if (!bdev)
218 		return 0;
219 	return filemap_write_and_wait(bdev->bd_mapping);
220 }
221 EXPORT_SYMBOL(sync_blockdev);
222 
/*
 * Write out, and wait upon, the dirty data of @bdev in the byte range
 * from @lstart to @lend.  Unlike sync_blockdev(), @bdev must not be NULL.
 */
int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
{
	struct address_space *cache = bdev->bd_mapping;

	return filemap_write_and_wait_range(cache, lstart, lend);
}
EXPORT_SYMBOL(sync_blockdev_range);
229 
230 /**
231  * bdev_freeze - lock a filesystem and force it into a consistent state
232  * @bdev:	blockdevice to lock
233  *
234  * If a superblock is found on this device, we take the s_umount semaphore
235  * on it to make sure nobody unmounts until the snapshot creation is done.
236  * The reference counter (bd_fsfreeze_count) guarantees that only the last
237  * unfreeze process can unfreeze the frozen filesystem actually when multiple
238  * freeze requests arrive simultaneously. It counts up in bdev_freeze() and
239  * count down in bdev_thaw(). When it becomes 0, thaw_bdev() will unfreeze
240  * actually.
241  *
242  * Return: On success zero is returned, negative error code on failure.
243  */
bdev_freeze(struct block_device * bdev)244 int bdev_freeze(struct block_device *bdev)
245 {
246 	int error = 0;
247 
248 	mutex_lock(&bdev->bd_fsfreeze_mutex);
249 
250 	if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
251 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
252 		return 0;
253 	}
254 
255 	mutex_lock(&bdev->bd_holder_lock);
256 	if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
257 		error = bdev->bd_holder_ops->freeze(bdev);
258 		lockdep_assert_not_held(&bdev->bd_holder_lock);
259 	} else {
260 		mutex_unlock(&bdev->bd_holder_lock);
261 		error = sync_blockdev(bdev);
262 	}
263 
264 	if (error)
265 		atomic_dec(&bdev->bd_fsfreeze_count);
266 
267 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
268 	return error;
269 }
270 EXPORT_SYMBOL(bdev_freeze);
271 
272 /**
273  * bdev_thaw - unlock filesystem
274  * @bdev:	blockdevice to unlock
275  *
276  * Unlocks the filesystem and marks it writeable again after bdev_freeze().
277  *
278  * Return: On success zero is returned, negative error code on failure.
279  */
bdev_thaw(struct block_device * bdev)280 int bdev_thaw(struct block_device *bdev)
281 {
282 	int error = -EINVAL, nr_freeze;
283 
284 	mutex_lock(&bdev->bd_fsfreeze_mutex);
285 
286 	/*
287 	 * If this returns < 0 it means that @bd_fsfreeze_count was
288 	 * already 0 and no decrement was performed.
289 	 */
290 	nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
291 	if (nr_freeze < 0)
292 		goto out;
293 
294 	error = 0;
295 	if (nr_freeze > 0)
296 		goto out;
297 
298 	mutex_lock(&bdev->bd_holder_lock);
299 	if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
300 		error = bdev->bd_holder_ops->thaw(bdev);
301 		lockdep_assert_not_held(&bdev->bd_holder_lock);
302 	} else {
303 		mutex_unlock(&bdev->bd_holder_lock);
304 	}
305 
306 	if (error)
307 		atomic_inc(&bdev->bd_fsfreeze_count);
308 out:
309 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
310 	return error;
311 }
312 EXPORT_SYMBOL(bdev_thaw);
313 
314 /*
315  * pseudo-fs
316  */
317 
318 static  __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
319 static struct kmem_cache *bdev_cachep __ro_after_init;
320 
bdev_alloc_inode(struct super_block * sb)321 static struct inode *bdev_alloc_inode(struct super_block *sb)
322 {
323 	struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);
324 
325 	if (!ei)
326 		return NULL;
327 	memset(&ei->bdev, 0, sizeof(ei->bdev));
328 
329 	if (security_bdev_alloc(&ei->bdev)) {
330 		kmem_cache_free(bdev_cachep, ei);
331 		return NULL;
332 	}
333 	return &ei->vfs_inode;
334 }
335 
bdev_free_inode(struct inode * inode)336 static void bdev_free_inode(struct inode *inode)
337 {
338 	struct block_device *bdev = I_BDEV(inode);
339 
340 	free_percpu(bdev->bd_stats);
341 	kfree(bdev->bd_meta_info);
342 	security_bdev_free(bdev);
343 
344 	if (!bdev_is_partition(bdev)) {
345 		if (bdev->bd_disk && bdev->bd_disk->bdi)
346 			bdi_put(bdev->bd_disk->bdi);
347 		kfree(bdev->bd_disk);
348 	}
349 
350 	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
351 		blk_free_ext_minor(MINOR(bdev->bd_dev));
352 
353 	kmem_cache_free(bdev_cachep, BDEV_I(inode));
354 }
355 
init_once(void * data)356 static void init_once(void *data)
357 {
358 	struct bdev_inode *ei = data;
359 
360 	inode_init_once(&ei->vfs_inode);
361 }
362 
bdev_evict_inode(struct inode * inode)363 static void bdev_evict_inode(struct inode *inode)
364 {
365 	truncate_inode_pages_final(&inode->i_data);
366 	invalidate_inode_buffers(inode); /* is it needed here? */
367 	clear_inode(inode);
368 }
369 
370 static const struct super_operations bdev_sops = {
371 	.statfs = simple_statfs,
372 	.alloc_inode = bdev_alloc_inode,
373 	.free_inode = bdev_free_inode,
374 	.drop_inode = generic_delete_inode,
375 	.evict_inode = bdev_evict_inode,
376 };
377 
bd_init_fs_context(struct fs_context * fc)378 static int bd_init_fs_context(struct fs_context *fc)
379 {
380 	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
381 	if (!ctx)
382 		return -ENOMEM;
383 	fc->s_iflags |= SB_I_CGROUPWB;
384 	ctx->ops = &bdev_sops;
385 	return 0;
386 }
387 
388 static struct file_system_type bd_type = {
389 	.name		= "bdev",
390 	.init_fs_context = bd_init_fs_context,
391 	.kill_sb	= kill_anon_super,
392 };
393 
394 struct super_block *blockdev_superblock __ro_after_init;
395 static struct vfsmount *blockdev_mnt __ro_after_init;
396 EXPORT_SYMBOL_GPL(blockdev_superblock);
397 
bdev_cache_init(void)398 void __init bdev_cache_init(void)
399 {
400 	int err;
401 
402 	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
403 			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
404 				SLAB_ACCOUNT|SLAB_PANIC),
405 			init_once);
406 	err = register_filesystem(&bd_type);
407 	if (err)
408 		panic("Cannot register bdev pseudo-fs");
409 	blockdev_mnt = kern_mount(&bd_type);
410 	if (IS_ERR(blockdev_mnt))
411 		panic("Cannot create bdev pseudo-fs");
412 	blockdev_superblock = blockdev_mnt->mnt_sb;   /* For writeback */
413 }
414 
/*
 * Allocate and initialise a block_device for partition number @partno of
 * @disk, together with its inode on the bdev pseudo-fs.
 *
 * Returns the new block_device, or NULL if the inode or the per-cpu
 * statistics could not be allocated.
 */
struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
{
	struct address_space *cache;
	struct block_device *bdev;
	struct inode *inode;

	inode = new_inode(blockdev_superblock);
	if (!inode)
		return NULL;

	cache = &inode->i_data;
	inode->i_mode = S_IFBLK;
	inode->i_rdev = 0;
	cache->a_ops = &def_blk_aops;
	mapping_set_gfp_mask(cache, GFP_USER);

	bdev = I_BDEV(inode);
	mutex_init(&bdev->bd_fsfreeze_mutex);
	spin_lock_init(&bdev->bd_size_lock);
	mutex_init(&bdev->bd_holder_lock);
	/* The flags word starts out holding just the partition number. */
	atomic_set(&bdev->__bd_flags, partno);
	bdev->bd_mapping = cache;
	bdev->bd_queue = disk->queue;

	/* A partition takes BD_HAS_SUBMIT_BIO over from part0. */
	if (partno && bdev_test_flag(disk->part0, BD_HAS_SUBMIT_BIO))
		bdev_set_flag(bdev, BD_HAS_SUBMIT_BIO);

	bdev->bd_stats = alloc_percpu(struct disk_stats);
	if (!bdev->bd_stats) {
		iput(inode);
		return NULL;
	}

	/*
	 * bd_disk is assigned only after the last failure point, so the
	 * iput() above tears down a bdev whose bd_disk is still NULL.
	 */
	bdev->bd_disk = disk;
	return bdev;
}
445 
/*
 * Set the size of @bdev to @sectors.  The inode size (in bytes) and
 * bd_nr_sectors are updated together under bd_size_lock.
 */
void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
	loff_t bytes = (loff_t)sectors << SECTOR_SHIFT;

	spin_lock(&bdev->bd_size_lock);
	i_size_write(BD_INODE(bdev), bytes);
	bdev->bd_nr_sectors = sectors;
	spin_unlock(&bdev->bd_size_lock);
}
453 
/*
 * Publish @bdev under device number @dev: record @dev in the bdev and in
 * its inode (as i_rdev and i_ino) and insert the inode into the inode
 * hash.  Stable writes are enabled on the mapping first if the device
 * asks for them.
 */
void bdev_add(struct block_device *bdev, dev_t dev)
{
	struct inode *inode = BD_INODE(bdev);

	if (bdev_stable_writes(bdev))
		mapping_set_stable_writes(bdev->bd_mapping);

	bdev->bd_dev = dev;
	inode->i_rdev = dev;
	inode->i_ino = dev;
	insert_inode_hash(inode);
}
464 
/* Remove the inode of @bdev from the inode hash. */
void bdev_unhash(struct block_device *bdev)
{
	struct inode *inode = BD_INODE(bdev);

	remove_inode_hash(inode);
}
469 
/* Drop one reference on the inode backing @bdev. */
void bdev_drop(struct block_device *bdev)
{
	struct inode *inode = BD_INODE(bdev);

	iput(inode);
}
474 
nr_blockdev_pages(void)475 long nr_blockdev_pages(void)
476 {
477 	struct inode *inode;
478 	long ret = 0;
479 
480 	spin_lock(&blockdev_superblock->s_inode_list_lock);
481 	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
482 		ret += inode->i_mapping->nrpages;
483 	spin_unlock(&blockdev_superblock->s_inode_list_lock);
484 
485 	return ret;
486 }
487 
488 /**
489  * bd_may_claim - test whether a block device can be claimed
490  * @bdev: block device of interest
491  * @holder: holder trying to claim @bdev
492  * @hops: holder ops
493  *
494  * Test whether @bdev can be claimed by @holder.
495  *
496  * RETURNS:
497  * %true if @bdev can be claimed, %false otherwise.
498  */
bd_may_claim(struct block_device * bdev,void * holder,const struct blk_holder_ops * hops)499 static bool bd_may_claim(struct block_device *bdev, void *holder,
500 		const struct blk_holder_ops *hops)
501 {
502 	struct block_device *whole = bdev_whole(bdev);
503 
504 	lockdep_assert_held(&bdev_lock);
505 
506 	if (bdev->bd_holder) {
507 		/*
508 		 * The same holder can always re-claim.
509 		 */
510 		if (bdev->bd_holder == holder) {
511 			if (WARN_ON_ONCE(bdev->bd_holder_ops != hops))
512 				return false;
513 			return true;
514 		}
515 		return false;
516 	}
517 
518 	/*
519 	 * If the whole devices holder is set to bd_may_claim, a partition on
520 	 * the device is claimed, but not the whole device.
521 	 */
522 	if (whole != bdev &&
523 	    whole->bd_holder && whole->bd_holder != bd_may_claim)
524 		return false;
525 	return true;
526 }
527 
528 /**
529  * bd_prepare_to_claim - claim a block device
530  * @bdev: block device of interest
531  * @holder: holder trying to claim @bdev
532  * @hops: holder ops.
533  *
534  * Claim @bdev.  This function fails if @bdev is already claimed by another
535  * holder and waits if another claiming is in progress. return, the caller
536  * has ownership of bd_claiming and bd_holder[s].
537  *
538  * RETURNS:
539  * 0 if @bdev can be claimed, -EBUSY otherwise.
540  */
bd_prepare_to_claim(struct block_device * bdev,void * holder,const struct blk_holder_ops * hops)541 int bd_prepare_to_claim(struct block_device *bdev, void *holder,
542 		const struct blk_holder_ops *hops)
543 {
544 	struct block_device *whole = bdev_whole(bdev);
545 
546 	if (WARN_ON_ONCE(!holder))
547 		return -EINVAL;
548 retry:
549 	mutex_lock(&bdev_lock);
550 	/* if someone else claimed, fail */
551 	if (!bd_may_claim(bdev, holder, hops)) {
552 		mutex_unlock(&bdev_lock);
553 		return -EBUSY;
554 	}
555 
556 	/* if claiming is already in progress, wait for it to finish */
557 	if (whole->bd_claiming) {
558 		wait_queue_head_t *wq = __var_waitqueue(&whole->bd_claiming);
559 		DEFINE_WAIT(wait);
560 
561 		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
562 		mutex_unlock(&bdev_lock);
563 		schedule();
564 		finish_wait(wq, &wait);
565 		goto retry;
566 	}
567 
568 	/* yay, all mine */
569 	whole->bd_claiming = holder;
570 	mutex_unlock(&bdev_lock);
571 	return 0;
572 }
573 EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
574 
bd_clear_claiming(struct block_device * whole,void * holder)575 static void bd_clear_claiming(struct block_device *whole, void *holder)
576 {
577 	lockdep_assert_held(&bdev_lock);
578 	/* tell others that we're done */
579 	BUG_ON(whole->bd_claiming != holder);
580 	whole->bd_claiming = NULL;
581 	wake_up_var(&whole->bd_claiming);
582 }
583 
584 /**
585  * bd_finish_claiming - finish claiming of a block device
586  * @bdev: block device of interest
587  * @holder: holder that has claimed @bdev
588  * @hops: block device holder operations
589  *
590  * Finish exclusive open of a block device. Mark the device as exlusively
591  * open by the holder and wake up all waiters for exclusive open to finish.
592  */
bd_finish_claiming(struct block_device * bdev,void * holder,const struct blk_holder_ops * hops)593 static void bd_finish_claiming(struct block_device *bdev, void *holder,
594 		const struct blk_holder_ops *hops)
595 {
596 	struct block_device *whole = bdev_whole(bdev);
597 
598 	mutex_lock(&bdev_lock);
599 	BUG_ON(!bd_may_claim(bdev, holder, hops));
600 	/*
601 	 * Note that for a whole device bd_holders will be incremented twice,
602 	 * and bd_holder will be set to bd_may_claim before being set to holder
603 	 */
604 	whole->bd_holders++;
605 	whole->bd_holder = bd_may_claim;
606 	bdev->bd_holders++;
607 	mutex_lock(&bdev->bd_holder_lock);
608 	bdev->bd_holder = holder;
609 	bdev->bd_holder_ops = hops;
610 	mutex_unlock(&bdev->bd_holder_lock);
611 	bd_clear_claiming(whole, holder);
612 	mutex_unlock(&bdev_lock);
613 }
614 
615 /**
616  * bd_abort_claiming - abort claiming of a block device
617  * @bdev: block device of interest
618  * @holder: holder that has claimed @bdev
619  *
620  * Abort claiming of a block device when the exclusive open failed. This can be
621  * also used when exclusive open is not actually desired and we just needed
622  * to block other exclusive openers for a while.
623  */
bd_abort_claiming(struct block_device * bdev,void * holder)624 void bd_abort_claiming(struct block_device *bdev, void *holder)
625 {
626 	mutex_lock(&bdev_lock);
627 	bd_clear_claiming(bdev_whole(bdev), holder);
628 	mutex_unlock(&bdev_lock);
629 }
630 EXPORT_SYMBOL(bd_abort_claiming);
631 
bd_end_claim(struct block_device * bdev,void * holder)632 static void bd_end_claim(struct block_device *bdev, void *holder)
633 {
634 	struct block_device *whole = bdev_whole(bdev);
635 	bool unblock = false;
636 
637 	/*
638 	 * Release a claim on the device.  The holder fields are protected with
639 	 * bdev_lock.  open_mutex is used to synchronize disk_holder unlinking.
640 	 */
641 	mutex_lock(&bdev_lock);
642 	WARN_ON_ONCE(bdev->bd_holder != holder);
643 	WARN_ON_ONCE(--bdev->bd_holders < 0);
644 	WARN_ON_ONCE(--whole->bd_holders < 0);
645 	if (!bdev->bd_holders) {
646 		mutex_lock(&bdev->bd_holder_lock);
647 		bdev->bd_holder = NULL;
648 		bdev->bd_holder_ops = NULL;
649 		mutex_unlock(&bdev->bd_holder_lock);
650 		if (bdev_test_flag(bdev, BD_WRITE_HOLDER))
651 			unblock = true;
652 	}
653 	if (!whole->bd_holders)
654 		whole->bd_holder = NULL;
655 	mutex_unlock(&bdev_lock);
656 
657 	/*
658 	 * If this was the last claim, remove holder link and unblock evpoll if
659 	 * it was a write holder.
660 	 */
661 	if (unblock) {
662 		disk_unblock_events(bdev->bd_disk);
663 		bdev_clear_flag(bdev, BD_WRITE_HOLDER);
664 	}
665 }
666 
blkdev_flush_mapping(struct block_device * bdev)667 static void blkdev_flush_mapping(struct block_device *bdev)
668 {
669 	WARN_ON_ONCE(bdev->bd_holders);
670 	sync_blockdev(bdev);
671 	kill_bdev(bdev);
672 	bdev_write_inode(bdev);
673 }
674 
blkdev_put_whole(struct block_device * bdev)675 static void blkdev_put_whole(struct block_device *bdev)
676 {
677 	if (atomic_dec_and_test(&bdev->bd_openers))
678 		blkdev_flush_mapping(bdev);
679 	if (bdev->bd_disk->fops->release)
680 		bdev->bd_disk->fops->release(bdev->bd_disk);
681 }
682 
/*
 * Take one opener on a whole device.
 *
 * Calls the driver's ->open() if present, sets the initial block size for
 * the first opener and rescans partitions when GD_NEED_PART_SCAN is set.
 *
 * Returns 0 on success, the ->open() error, or - only with
 * BLK_OPEN_STRICT_SCAN in @mode - the partition scan error.
 */
static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	int err;

	if (disk->fops->open) {
		err = disk->fops->open(disk, mode);
		/* avoid ghost partitions on a removed medium */
		if (err == -ENOMEDIUM &&
		    test_bit(GD_NEED_PART_SCAN, &disk->state))
			bdev_disk_changed(disk, true);
		if (err)
			return err;
	}

	/* The first opener picks the initial block size. */
	if (!atomic_read(&bdev->bd_openers))
		set_init_blocksize(bdev);
	atomic_inc(&bdev->bd_openers);

	if (!test_bit(GD_NEED_PART_SCAN, &disk->state))
		return 0;

	/*
	 * Only return scanning errors if we are called from contexts that
	 * explicitly want them, e.g. the BLKRRPART ioctl.
	 */
	err = bdev_disk_changed(disk, false);
	if (err && (mode & BLK_OPEN_STRICT_SCAN)) {
		blkdev_put_whole(bdev);
		return err;
	}
	return 0;
}
715 
/*
 * Take one opener on partition @part.  The whole device is opened first;
 * a partition without sectors is refused with -ENXIO.  The first opener
 * of the partition bumps the disk's open_partitions count and sets the
 * initial block size.
 */
static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
{
	struct block_device *whole = bdev_whole(part);
	struct gendisk *disk = part->bd_disk;
	int err;

	err = blkdev_get_whole(whole, mode);
	if (err)
		return err;

	if (!bdev_nr_sectors(part)) {
		/* undo the open of the whole device */
		blkdev_put_whole(whole);
		return -ENXIO;
	}

	if (!atomic_read(&part->bd_openers)) {
		disk->open_partitions++;
		set_init_blocksize(part);
	}
	atomic_inc(&part->bd_openers);
	return 0;
}
740 
/*
 * Check whether block device @dev may be opened with @mode by @holder.
 *
 * Returns the device cgroup verdict if that is an error, -EINVAL if
 * writes are to be restricted without a holder or if @holder is an error
 * pointer, and 0 otherwise.
 */
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder)
{
	int access = 0;
	int ret;

	if (mode & BLK_OPEN_READ)
		access |= DEVCG_ACC_READ;
	if (mode & BLK_OPEN_WRITE)
		access |= DEVCG_ACC_WRITE;

	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
			MAJOR(dev), MINOR(dev), access);
	if (ret)
		return ret;

	/* Blocking writes requires exclusive opener */
	if ((mode & BLK_OPEN_RESTRICT_WRITES) && !holder)
		return -EINVAL;

	/*
	 * Error pointers are used to tell ->release() that opening the
	 * block device failed, so they cannot double as holders.  An error
	 * pointer as holder would not make sense anyway.
	 */
	if (WARN_ON_ONCE(IS_ERR(holder)))
		return -EINVAL;

	return 0;
}
765 
blkdev_put_part(struct block_device * part)766 static void blkdev_put_part(struct block_device *part)
767 {
768 	struct block_device *whole = bdev_whole(part);
769 
770 	if (atomic_dec_and_test(&part->bd_openers)) {
771 		blkdev_flush_mapping(part);
772 		whole->bd_disk->open_partitions--;
773 	}
774 	blkdev_put_whole(whole);
775 }
776 
/*
 * Look up the block device with device number @dev and take a reference
 * on its struct device, without opening it.
 *
 * With CONFIG_BLOCK_LEGACY_AUTOLOAD a failed lookup triggers a module
 * request followed by one more lookup; success that way is reported with
 * a deprecation warning.
 *
 * Returns the block device, or NULL if there is none or its device
 * reference count had already dropped to zero.  Release the reference
 * with blkdev_put_no_open().
 */
struct block_device *blkdev_get_no_open(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = ilookup(blockdev_superblock, dev);
	if (!inode) {
		if (!IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD))
			return NULL;

		blk_request_module(dev);
		inode = ilookup(blockdev_superblock, dev);
		if (!inode)
			return NULL;

		pr_warn_ratelimited(
"block device autoloading is deprecated and will be removed.\n");
	}

	/* switch from the inode reference to a device mode one: */
	bdev = I_BDEV(inode);
	if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
		bdev = NULL;
	iput(inode);
	return bdev;
}
800 
blkdev_put_no_open(struct block_device * bdev)801 void blkdev_put_no_open(struct block_device *bdev)
802 {
803 	put_device(&bdev->bd_device);
804 }
805 
bdev_writes_blocked(struct block_device * bdev)806 static bool bdev_writes_blocked(struct block_device *bdev)
807 {
808 	return bdev->bd_writers < 0;
809 }
810 
bdev_block_writes(struct block_device * bdev)811 static void bdev_block_writes(struct block_device *bdev)
812 {
813 	bdev->bd_writers--;
814 }
815 
bdev_unblock_writes(struct block_device * bdev)816 static void bdev_unblock_writes(struct block_device *bdev)
817 {
818 	bdev->bd_writers++;
819 }
820 
bdev_may_open(struct block_device * bdev,blk_mode_t mode)821 static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
822 {
823 	if (bdev_allow_write_mounted)
824 		return true;
825 	/* Writes blocked? */
826 	if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
827 		return false;
828 	if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
829 		return false;
830 	return true;
831 }
832 
/*
 * Account a successful open in bd_writers: a write-restricting open
 * registers as a write blocker, a plain write open as a writer.  Nothing
 * is tracked when writing to mounted devices is allowed.
 */
static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return;

	/* Claim exclusive or shared write access. */
	if (mode & BLK_OPEN_RESTRICT_WRITES) {
		bdev_block_writes(bdev);
		return;
	}
	if (mode & BLK_OPEN_WRITE)
		bdev->bd_writers++;
}
844 
bdev_unclaimed(const struct file * bdev_file)845 static inline bool bdev_unclaimed(const struct file *bdev_file)
846 {
847 	return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host);
848 }
849 
bdev_yield_write_access(struct file * bdev_file)850 static void bdev_yield_write_access(struct file *bdev_file)
851 {
852 	struct block_device *bdev;
853 
854 	if (bdev_allow_write_mounted)
855 		return;
856 
857 	if (bdev_unclaimed(bdev_file))
858 		return;
859 
860 	bdev = file_bdev(bdev_file);
861 
862 	if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED)
863 		bdev_unblock_writes(bdev);
864 	else if (bdev_file->f_mode & FMODE_WRITE)
865 		bdev->bd_writers--;
866 }
867 
868 /**
869  * bdev_open - open a block device
870  * @bdev: block device to open
871  * @mode: open mode (BLK_OPEN_*)
872  * @holder: exclusive holder identifier
873  * @hops: holder operations
874  * @bdev_file: file for the block device
875  *
876  * Open the block device. If @holder is not %NULL, the block device is opened
877  * with exclusive access.  Exclusive opens may nest for the same @holder.
878  *
879  * CONTEXT:
880  * Might sleep.
881  *
882  * RETURNS:
883  * zero on success, -errno on failure.
884  */
bdev_open(struct block_device * bdev,blk_mode_t mode,void * holder,const struct blk_holder_ops * hops,struct file * bdev_file)885 int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
886 	      const struct blk_holder_ops *hops, struct file *bdev_file)
887 {
888 	bool unblock_events = true;
889 	struct gendisk *disk = bdev->bd_disk;
890 	int ret;
891 
892 	if (holder) {
893 		mode |= BLK_OPEN_EXCL;
894 		ret = bd_prepare_to_claim(bdev, holder, hops);
895 		if (ret)
896 			return ret;
897 	} else {
898 		if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL))
899 			return -EIO;
900 	}
901 
902 	disk_block_events(disk);
903 
904 	mutex_lock(&disk->open_mutex);
905 	ret = -ENXIO;
906 	if (!disk_live(disk))
907 		goto abort_claiming;
908 	if (!try_module_get(disk->fops->owner))
909 		goto abort_claiming;
910 	ret = -EBUSY;
911 	if (!bdev_may_open(bdev, mode))
912 		goto put_module;
913 	if (bdev_is_partition(bdev))
914 		ret = blkdev_get_part(bdev, mode);
915 	else
916 		ret = blkdev_get_whole(bdev, mode);
917 	if (ret)
918 		goto put_module;
919 	bdev_claim_write_access(bdev, mode);
920 	if (holder) {
921 		bd_finish_claiming(bdev, holder, hops);
922 
923 		/*
924 		 * Block event polling for write claims if requested.  Any write
925 		 * holder makes the write_holder state stick until all are
926 		 * released.  This is good enough and tracking individual
927 		 * writeable reference is too fragile given the way @mode is
928 		 * used in blkdev_get/put().
929 		 */
930 		if ((mode & BLK_OPEN_WRITE) &&
931 		    !bdev_test_flag(bdev, BD_WRITE_HOLDER) &&
932 		    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
933 			bdev_set_flag(bdev, BD_WRITE_HOLDER);
934 			unblock_events = false;
935 		}
936 	}
937 	mutex_unlock(&disk->open_mutex);
938 
939 	if (unblock_events)
940 		disk_unblock_events(disk);
941 
942 	bdev_file->f_flags |= O_LARGEFILE;
943 	bdev_file->f_mode |= FMODE_CAN_ODIRECT;
944 	if (bdev_nowait(bdev))
945 		bdev_file->f_mode |= FMODE_NOWAIT;
946 	if (mode & BLK_OPEN_RESTRICT_WRITES)
947 		bdev_file->f_mode |= FMODE_WRITE_RESTRICTED;
948 	bdev_file->f_mapping = bdev->bd_mapping;
949 	bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
950 	bdev_file->private_data = holder;
951 
952 	return 0;
953 put_module:
954 	module_put(disk->fops->owner);
955 abort_claiming:
956 	if (holder)
957 		bd_abort_claiming(bdev, holder);
958 	mutex_unlock(&disk->open_mutex);
959 	disk_unblock_events(disk);
960 	return ret;
961 }
962 
963 /*
964  * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk
965  * associated with the floppy driver where it has allowed ioctls if the
966  * file was opened for writing, but does not allow reads or writes.
967  * Make sure that this quirk is reflected in @f_flags.
968  *
969  * It can also happen if a block device is opened as O_RDWR | O_WRONLY.
970  */
blk_to_file_flags(blk_mode_t mode)971 static unsigned blk_to_file_flags(blk_mode_t mode)
972 {
973 	unsigned int flags = 0;
974 
975 	if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) ==
976 	    (BLK_OPEN_READ | BLK_OPEN_WRITE))
977 		flags |= O_RDWR;
978 	else if (mode & BLK_OPEN_WRITE_IOCTL)
979 		flags |= O_RDWR | O_WRONLY;
980 	else if (mode & BLK_OPEN_WRITE)
981 		flags |= O_WRONLY;
982 	else if (mode & BLK_OPEN_READ)
983 		flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */
984 	else
985 		WARN_ON_ONCE(true);
986 
987 	if (mode & BLK_OPEN_NDELAY)
988 		flags |= O_NDELAY;
989 
990 	return flags;
991 }
992 
/*
 * Open the block device with device number @dev and return a file for it.
 *
 * @mode is a BLK_OPEN_* mask; @holder and @hops are passed on to
 * bdev_open() and select exclusive access when @holder is not NULL.
 *
 * Returns the opened file or an ERR_PTR(): the bdev_permission() error,
 * -ENXIO if no such device can be found, the file allocation error, or
 * the bdev_open() error.
 */
struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
				   const struct blk_holder_ops *hops)
{
	struct block_device *bdev;
	struct file *bdev_file;
	struct inode *inode;
	int ret;

	ret = bdev_permission(dev, mode, holder);
	if (ret)
		return ERR_PTR(ret);

	bdev = blkdev_get_no_open(dev);
	if (!bdev)
		return ERR_PTR(-ENXIO);

	inode = BD_INODE(bdev);
	bdev_file = alloc_file_pseudo_noaccount(inode, blockdev_mnt, "",
			blk_to_file_flags(mode) | O_LARGEFILE, &def_blk_fops);
	if (IS_ERR(bdev_file)) {
		blkdev_put_no_open(bdev);
		return bdev_file;
	}
	/* Take an extra inode reference now that the file exists. */
	ihold(inode);

	ret = bdev_open(bdev, mode, holder, hops, bdev_file);
	if (!ret)
		return bdev_file;

	/* We failed to open the block device. Let ->release() know. */
	bdev_file->private_data = ERR_PTR(ret);
	fput(bdev_file);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(bdev_file_open_by_dev);
1028 
/*
 * Open the block device named by @path.
 *
 * The path is resolved to a dev_t with lookup_bdev() and the device is
 * then opened through bdev_file_open_by_dev().  If write access was
 * requested (BLK_OPEN_WRITE) but the device turns out to be read-only,
 * the file is put again and -EACCES is reported instead.
 *
 * Returns the opened file, or an ERR_PTR() on failure.
 */
struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
				    void *holder,
				    const struct blk_holder_ops *hops)
{
	struct file *file;
	dev_t dev;
	int error = lookup_bdev(path, &dev);

	if (error)
		return ERR_PTR(error);

	file = bdev_file_open_by_dev(dev, mode, holder, hops);
	if (IS_ERR(file) || !(mode & BLK_OPEN_WRITE))
		return file;

	if (!bdev_read_only(file_bdev(file)))
		return file;

	/* Writing was asked for, but the device is read-only. */
	fput(file);
	return ERR_PTR(-EACCES);
}
EXPORT_SYMBOL(bdev_file_open_by_path);
1052 
bd_yield_claim(struct file * bdev_file)1053 static inline void bd_yield_claim(struct file *bdev_file)
1054 {
1055 	struct block_device *bdev = file_bdev(bdev_file);
1056 	void *holder = bdev_file->private_data;
1057 
1058 	lockdep_assert_held(&bdev->bd_disk->open_mutex);
1059 
1060 	if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder)))
1061 		return;
1062 
1063 	if (!bdev_unclaimed(bdev_file))
1064 		bd_end_claim(bdev, holder);
1065 }
1066 
bdev_release(struct file * bdev_file)1067 void bdev_release(struct file *bdev_file)
1068 {
1069 	struct block_device *bdev = file_bdev(bdev_file);
1070 	void *holder = bdev_file->private_data;
1071 	struct gendisk *disk = bdev->bd_disk;
1072 
1073 	/* We failed to open that block device. */
1074 	if (IS_ERR(holder))
1075 		goto put_no_open;
1076 
1077 	/*
1078 	 * Sync early if it looks like we're the last one.  If someone else
1079 	 * opens the block device between now and the decrement of bd_openers
1080 	 * then we did a sync that we didn't need to, but that's not the end
1081 	 * of the world and we want to avoid long (could be several minute)
1082 	 * syncs while holding the mutex.
1083 	 */
1084 	if (atomic_read(&bdev->bd_openers) == 1)
1085 		sync_blockdev(bdev);
1086 
1087 	mutex_lock(&disk->open_mutex);
1088 	bdev_yield_write_access(bdev_file);
1089 
1090 	if (holder)
1091 		bd_yield_claim(bdev_file);
1092 
1093 	/*
1094 	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
1095 	 * event.  This is to ensure detection of media removal commanded
1096 	 * from userland - e.g. eject(1).
1097 	 */
1098 	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);
1099 
1100 	if (bdev_is_partition(bdev))
1101 		blkdev_put_part(bdev);
1102 	else
1103 		blkdev_put_whole(bdev);
1104 	mutex_unlock(&disk->open_mutex);
1105 
1106 	module_put(disk->fops->owner);
1107 put_no_open:
1108 	blkdev_put_no_open(bdev);
1109 }
1110 
1111 /**
1112  * bdev_fput - yield claim to the block device and put the file
1113  * @bdev_file: open block device
1114  *
1115  * Yield claim on the block device and put the file. Ensure that the
1116  * block device can be reclaimed before the file is closed which is a
1117  * deferred operation.
1118  */
bdev_fput(struct file * bdev_file)1119 void bdev_fput(struct file *bdev_file)
1120 {
1121 	if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
1122 		return;
1123 
1124 	if (bdev_file->private_data) {
1125 		struct block_device *bdev = file_bdev(bdev_file);
1126 		struct gendisk *disk = bdev->bd_disk;
1127 
1128 		mutex_lock(&disk->open_mutex);
1129 		bdev_yield_write_access(bdev_file);
1130 		bd_yield_claim(bdev_file);
1131 		/*
1132 		 * Tell release we already gave up our hold on the
1133 		 * device and if write restrictions are available that
1134 		 * we already gave up write access to the device.
1135 		 */
1136 		bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
1137 		mutex_unlock(&disk->open_mutex);
1138 	}
1139 
1140 	fput(bdev_file);
1141 }
1142 EXPORT_SYMBOL(bdev_fput);
1143 
1144 /**
1145  * lookup_bdev() - Look up a struct block_device by name.
1146  * @pathname: Name of the block device in the filesystem.
1147  * @dev: Pointer to the block device's dev_t, if found.
1148  *
1149  * Lookup the block device's dev_t at @pathname in the current
1150  * namespace if possible and return it in @dev.
1151  *
1152  * Context: May sleep.
1153  * Return: 0 if succeeded, negative errno otherwise.
1154  */
lookup_bdev(const char * pathname,dev_t * dev)1155 int lookup_bdev(const char *pathname, dev_t *dev)
1156 {
1157 	struct inode *inode;
1158 	struct path path;
1159 	int error;
1160 
1161 	if (!pathname || !*pathname)
1162 		return -EINVAL;
1163 
1164 	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
1165 	if (error)
1166 		return error;
1167 
1168 	inode = d_backing_inode(path.dentry);
1169 	error = -ENOTBLK;
1170 	if (!S_ISBLK(inode->i_mode))
1171 		goto out_path_put;
1172 	error = -EACCES;
1173 	if (!may_open_dev(&path))
1174 		goto out_path_put;
1175 
1176 	*dev = inode->i_rdev;
1177 	error = 0;
1178 out_path_put:
1179 	path_put(&path);
1180 	return error;
1181 }
1182 EXPORT_SYMBOL(lookup_bdev);
1183 
1184 /**
1185  * bdev_mark_dead - mark a block device as dead
1186  * @bdev: block device to operate on
1187  * @surprise: indicate a surprise removal
1188  *
1189  * Tell the file system that this devices or media is dead.  If @surprise is set
1190  * to %true the device or media is already gone, if not we are preparing for an
1191  * orderly removal.
1192  *
1193  * This calls into the file system, which then typicall syncs out all dirty data
1194  * and writes back inodes and then invalidates any cached data in the inodes on
1195  * the file system.  In addition we also invalidate the block device mapping.
1196  */
bdev_mark_dead(struct block_device * bdev,bool surprise)1197 void bdev_mark_dead(struct block_device *bdev, bool surprise)
1198 {
1199 	mutex_lock(&bdev->bd_holder_lock);
1200 	if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
1201 		bdev->bd_holder_ops->mark_dead(bdev, surprise);
1202 	else {
1203 		mutex_unlock(&bdev->bd_holder_lock);
1204 		sync_blockdev(bdev);
1205 	}
1206 
1207 	invalidate_bdev(bdev);
1208 }
1209 /*
1210  * New drivers should not use this directly.  There are some drivers however
1211  * that needs this for historical reasons. For example, the DASD driver has
1212  * historically had a shutdown to offline mode that doesn't actually remove the
1213  * gendisk that otherwise looks a lot like a safe device removal.
1214  */
1215 EXPORT_SYMBOL_GPL(bdev_mark_dead);
1216 
/*
 * Walk every inode on the block device superblock and, for each device
 * that has cached pages and at least one opener, either call
 * filemap_fdatawait_keep_errors() (@wait true) or filemap_fdatawrite()
 * (@wait false) on its mapping.
 */
void sync_bdevs(bool wait)
{
	struct inode *inode;
	struct inode *prev = NULL;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;
		struct block_device *bdev;

		spin_lock(&inode->i_lock);
		if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
		    !mapping->nrpages) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&blockdev_superblock->s_inode_list_lock);

		/*
		 * The reference we now hold on 'inode' keeps it on the
		 * s_inodes list while s_inode_list_lock is dropped.  The
		 * reference cannot be put right away: it may be the last
		 * one, and iput() must not be called under
		 * s_inode_list_lock.  So it is carried along and put on the
		 * next iteration, or after the loop.
		 */
		iput(prev);
		prev = inode;
		bdev = I_BDEV(inode);

		mutex_lock(&bdev->bd_disk->open_mutex);
		if (atomic_read(&bdev->bd_openers)) {
			/*
			 * When waiting, the error status of the individual
			 * mapping is kept so that applications can still
			 * catch the writeback error using fsync(2).  See
			 * filemap_fdatawait_keep_errors() for details.
			 */
			if (wait)
				filemap_fdatawait_keep_errors(inode->i_mapping);
			else
				filemap_fdatawrite(inode->i_mapping);
		}
		/* Devices without openers are skipped. */
		mutex_unlock(&bdev->bd_disk->open_mutex);

		spin_lock(&blockdev_superblock->s_inode_list_lock);
	}
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	iput(prev);
}
1268 
1269 /*
1270  * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices.
1271  */
bdev_statx(struct path * path,struct kstat * stat,u32 request_mask)1272 void bdev_statx(struct path *path, struct kstat *stat,
1273 		u32 request_mask)
1274 {
1275 	struct inode *backing_inode;
1276 	struct block_device *bdev;
1277 
1278 	if (!(request_mask & (STATX_DIOALIGN | STATX_WRITE_ATOMIC)))
1279 		return;
1280 
1281 	backing_inode = d_backing_inode(path->dentry);
1282 
1283 	/*
1284 	 * Note that backing_inode is the inode of a block device node file,
1285 	 * not the block device's internal inode.  Therefore it is *not* valid
1286 	 * to use I_BDEV() here; the block device has to be looked up by i_rdev
1287 	 * instead.
1288 	 */
1289 	bdev = blkdev_get_no_open(backing_inode->i_rdev);
1290 	if (!bdev)
1291 		return;
1292 
1293 	if (request_mask & STATX_DIOALIGN) {
1294 		stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
1295 		stat->dio_offset_align = bdev_logical_block_size(bdev);
1296 		stat->result_mask |= STATX_DIOALIGN;
1297 	}
1298 
1299 	if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) {
1300 		struct request_queue *bd_queue = bdev->bd_queue;
1301 
1302 		generic_fill_statx_atomic_writes(stat,
1303 			queue_atomic_write_unit_min_bytes(bd_queue),
1304 			queue_atomic_write_unit_max_bytes(bd_queue));
1305 	}
1306 
1307 	blkdev_put_no_open(bdev);
1308 }
1309 
disk_live(struct gendisk * disk)1310 bool disk_live(struct gendisk *disk)
1311 {
1312 	return !inode_unhashed(BD_INODE(disk->part0));
1313 }
1314 EXPORT_SYMBOL_GPL(disk_live);
1315 
block_size(struct block_device * bdev)1316 unsigned int block_size(struct block_device *bdev)
1317 {
1318 	return 1 << BD_INODE(bdev)->i_blkbits;
1319 }
1320 EXPORT_SYMBOL_GPL(block_size);
1321 
setup_bdev_allow_write_mounted(char * str)1322 static int __init setup_bdev_allow_write_mounted(char *str)
1323 {
1324 	if (kstrtobool(str, &bdev_allow_write_mounted))
1325 		pr_warn("Invalid option string for bdev_allow_write_mounted:"
1326 			" '%s'\n", str);
1327 	return 1;
1328 }
1329 __setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
1330