// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright (C) 2016 - 2020 Christoph Hellwig
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/stat.h>
#include "../fs/internal.h"
#include "blk.h"

/* Should we allow writing to mounted block devices? */
static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

static inline struct inode *BD_INODE(struct block_device *bdev)
{
	return &container_of(bdev, struct bdev_inode, bdev)->vfs_inode;
}

struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

struct block_device *file_bdev(struct file *bdev_file)
{
	return I_BDEV(bdev_file->f_mapping->host);
}
EXPORT_SYMBOL(file_bdev);

static void bdev_write_inode(struct block_device *bdev)
{
	struct inode *inode = BD_INODE(bdev);
	int ret;

	spin_lock(&inode->i_lock);
	while (inode_state_read(inode) & I_DIRTY) {
		spin_unlock(&inode->i_lock);
		ret = write_inode_now(inode, true);
		if (ret)
			pr_warn_ratelimited(
	"VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
				bdev, ret);
		spin_lock(&inode->i_lock);
	}
	spin_unlock(&inode->i_lock);
}

/* Kill _all_ buffers and pagecache, dirty or not. */
static void kill_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_mapping;

	if (mapping_empty(mapping))
		return;

	invalidate_bh_lrus();
	truncate_inode_pages(mapping, 0);
}

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_mapping;

	if (mapping->nrpages) {
		invalidate_bh_lrus();
		lru_add_drain_all();	/* make sure all lru add caches are flushed */
		invalidate_mapping_pages(mapping, 0, -1);
	}
}
EXPORT_SYMBOL(invalidate_bdev);

/*
 * Drop all buffers & page cache for the given bdev range. This function bails
 * out with an error if the bdev has another exclusive owner (such as a
 * filesystem).
 */
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
			loff_t lstart, loff_t lend)
{
	/*
	 * If we don't hold an exclusive handle for the device, upgrade to it
	 * while we discard the buffer cache to avoid discarding buffers
	 * under a live filesystem.
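	 * The temporary claim is dropped again via bd_abort_claiming() once
	 * the range has been truncated.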
	 */
	if (!(mode & BLK_OPEN_EXCL)) {
		int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL);
		if (err)
			goto invalidate;
	}

	truncate_inode_pages_range(bdev->bd_mapping, lstart, lend);
	if (!(mode & BLK_OPEN_EXCL))
		bd_abort_claiming(bdev, truncate_bdev_range);
	return 0;

invalidate:
	/*
	 * Someone else has the handle exclusively open. Try invalidating
	 * instead. The 'end' argument is inclusive so the rounding is safe.
	 */
	return invalidate_inode_pages2_range(bdev->bd_mapping,
					     lstart >> PAGE_SHIFT,
					     lend >> PAGE_SHIFT);
}

static void set_init_blocksize(struct block_device *bdev)
{
	unsigned int bsize = bdev_logical_block_size(bdev);
	loff_t size = i_size_read(BD_INODE(bdev));

	while (bsize < PAGE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	BD_INODE(bdev)->i_blkbits = blksize_bits(bsize);
	mapping_set_folio_min_order(BD_INODE(bdev)->i_mapping,
				    get_order(bsize));
}

/**
 * bdev_validate_blocksize - check that this block size is acceptable
 * @bdev: blockdevice to check
 * @block_size: block size to check
 *
 * For block device users that do not use buffer heads or the block device
 * page cache, make sure that this block size can be used with the device.
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_validate_blocksize(struct block_device *bdev, int block_size)
{
	if (blk_validate_block_size(block_size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (block_size < bdev_logical_block_size(bdev))
		return -EINVAL;

	return 0;
}
EXPORT_SYMBOL_GPL(bdev_validate_blocksize);

int set_blocksize(struct file *file, int size)
{
	struct inode *inode = file->f_mapping->host;
	struct block_device *bdev = I_BDEV(inode);
	int ret;

	ret = bdev_validate_blocksize(bdev, size);
	if (ret)
		return ret;

	if (!file->private_data)
		return -EINVAL;

	/* Don't change the size if it is the same as the current one */
	if (inode->i_blkbits != blksize_bits(size)) {
		/*
		 * Flush and truncate the pagecache before we reconfigure the
		 * mapping geometry because folio sizes are variable now. If a
		 * reader has already allocated a folio whose size is smaller
		 * than the new min_order but invokes readahead after the new
		 * min_order becomes visible, readahead will think there are
		 * "zero" blocks per folio and crash. Take the inode and
		 * invalidation locks to avoid racing with
		 * read/write/fallocate.
		 */
		inode_lock(inode);
		filemap_invalidate_lock(inode->i_mapping);

		sync_blockdev(bdev);
		kill_bdev(bdev);

		inode->i_blkbits = blksize_bits(size);
		mapping_set_folio_min_order(inode->i_mapping, get_order(size));
		filemap_invalidate_unlock(inode->i_mapping);
		inode_unlock(inode);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

static int sb_validate_large_blocksize(struct super_block *sb, int size)
{
	const char *err_str = NULL;

	if (!(sb->s_type->fs_flags & FS_LBS))
		err_str = "not supported by filesystem";
	else if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		err_str = "is only supported with CONFIG_TRANSPARENT_HUGEPAGE";

	if (!err_str)
		return 0;

	pr_warn_ratelimited("%s: block size(%d) > page size(%lu) %s\n",
			    sb->s_type->name, size, PAGE_SIZE, err_str);
	return -EINVAL;
}

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (size > PAGE_SIZE && sb_validate_large_blocksize(sb, size))
		return 0;
	if (set_blocksize(sb->s_bdev_file, size))
		return 0;
	/* If we get here, we know size is validated */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int __must_check sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);

int sync_blockdev_nowait(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_flush(bdev->bd_mapping);
}
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping. Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_write_and_wait(bdev->bd_mapping);
}
EXPORT_SYMBOL(sync_blockdev);

int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
{
	return filemap_write_and_wait_range(bdev->bd_mapping,
			lstart, lend);
}
EXPORT_SYMBOL(sync_blockdev_range);

/**
 * bdev_freeze - lock a filesystem and force it into a consistent state
 * @bdev: blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 * freeze requests arrive simultaneously. It counts up in bdev_freeze() and
 * counts down in bdev_thaw(). When it reaches 0, bdev_thaw() actually
 * unfreezes the filesystem.
 *
 * Return: On success zero is returned, negative error code on failure.
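 *
 * A minimal usage sketch (the snapshot helper is a placeholder for the
 * caller's own work between freeze and thaw, not a real API):
 *
 *	error = bdev_freeze(bdev);
 *	if (!error) {
 *		take_snapshot(bdev);	/* placeholder */
 *		error = bdev_thaw(bdev);
 *	}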
 */
int bdev_freeze(struct block_device *bdev)
{
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return 0;
	}

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
		error = bdev->bd_holder_ops->freeze(bdev);
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		mutex_unlock(&bdev->bd_holder_lock);
		error = sync_blockdev(bdev);
	}

	if (error)
		atomic_dec(&bdev->bd_fsfreeze_count);

	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_freeze);

/**
 * bdev_thaw - unlock filesystem
 * @bdev: blockdevice to unlock
 *
 * Unlocks the filesystem and marks it writeable again after bdev_freeze().
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_thaw(struct block_device *bdev)
{
	int error = -EINVAL, nr_freeze;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	/*
	 * If this returns < 0 it means that @bd_fsfreeze_count was
	 * already 0 and no decrement was performed.
	 */
	nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
	if (nr_freeze < 0)
		goto out;

	error = 0;
	if (nr_freeze > 0)
		goto out;

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
		error = bdev->bd_holder_ops->thaw(bdev);
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		mutex_unlock(&bdev->bd_holder_lock);
	}

	if (error)
		atomic_inc(&bdev->bd_fsfreeze_count);
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_thaw);

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
static struct kmem_cache *bdev_cachep __ro_after_init;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);

	if (!ei)
		return NULL;
	memset(&ei->bdev, 0, sizeof(ei->bdev));

	if (security_bdev_alloc(&ei->bdev)) {
		kmem_cache_free(bdev_cachep, ei);
		return NULL;
	}
	return &ei->vfs_inode;
}

static void bdev_free_inode(struct inode *inode)
{
	struct block_device *bdev = I_BDEV(inode);

	free_percpu(bdev->bd_stats);
	kfree(bdev->bd_meta_info);
	security_bdev_free(bdev);

	if (!bdev_is_partition(bdev)) {
		if (bdev->bd_disk && bdev->bd_disk->bdi)
			bdi_put(bdev->bd_disk->bdi);
		kfree(bdev->bd_disk);
	}

	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(MINOR(bdev->bd_dev));

	kmem_cache_free(bdev_cachep, BDEV_I(inode));
}

static void init_once(void *data)
{
	struct bdev_inode *ei = data;

	inode_init_once(&ei->vfs_inode);
}

static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.free_inode = bdev_free_inode,
	.drop_inode = inode_just_drop,
};

static int bd_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	fc->s_iflags |= SB_I_CGROUPWB;
	ctx->ops = &bdev_sops;
	return 0;
}

static struct file_system_type bd_type = {
	.name = "bdev",
	.init_fs_context = bd_init_fs_context,
	.kill_sb = kill_anon_super,
};

struct super_block *blockdev_superblock __ro_after_init;
static struct vfsmount *blockdev_mnt __ro_after_init;
EXPORT_SYMBOL_GPL(blockdev_superblock);

void __init bdev_cache_init(void)
{
	int err;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_ACCOUNT|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	blockdev_mnt = kern_mount(&bd_type);
	if (IS_ERR(blockdev_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = blockdev_mnt->mnt_sb;	/* For writeback */
}

struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = new_inode(blockdev_superblock);
	if (!inode)
		return NULL;
	inode->i_mode = S_IFBLK;
	inode->i_rdev = 0;
	inode->i_data.a_ops = &def_blk_aops;
	mapping_set_gfp_mask(&inode->i_data, GFP_USER);

	bdev = I_BDEV(inode);
	mutex_init(&bdev->bd_fsfreeze_mutex);
	spin_lock_init(&bdev->bd_size_lock);
	mutex_init(&bdev->bd_holder_lock);
	atomic_set(&bdev->__bd_flags, partno);
	bdev->bd_mapping = &inode->i_data;
	bdev->bd_queue = disk->queue;
	if (partno && bdev_test_flag(disk->part0, BD_HAS_SUBMIT_BIO))
		bdev_set_flag(bdev, BD_HAS_SUBMIT_BIO);
	bdev->bd_stats = alloc_percpu(struct disk_stats);
	if (!bdev->bd_stats) {
		iput(inode);
		return NULL;
	}
	bdev->bd_disk = disk;
	return bdev;
}

void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
	spin_lock(&bdev->bd_size_lock);
	i_size_write(BD_INODE(bdev), (loff_t)sectors << SECTOR_SHIFT);
	bdev->bd_nr_sectors = sectors;
	spin_unlock(&bdev->bd_size_lock);
}

void bdev_add(struct block_device *bdev, dev_t dev)
{
	struct inode *inode = BD_INODE(bdev);
	if (bdev_stable_writes(bdev))
		mapping_set_stable_writes(bdev->bd_mapping);
	bdev->bd_dev = dev;
	inode->i_rdev = dev;
	inode->i_ino = dev;
	insert_inode_hash(inode);
}

void bdev_unhash(struct block_device *bdev)
{
	remove_inode_hash(BD_INODE(bdev));
}

void bdev_drop(struct block_device *bdev)
{
	iput(BD_INODE(bdev));
}

long nr_blockdev_pages(void)
{
	struct inode *inode;
	long ret = 0;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
		ret += inode->i_mapping->nrpages;
	spin_unlock(&blockdev_superblock->s_inode_list_lock);

	return ret;
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	lockdep_assert_held(&bdev_lock);

	if (bdev->bd_holder) {
		/*
		 * The same holder can always re-claim.
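		 * A re-claim that passes different holder ops indicates a bug
		 * in the caller, hence the WARN below.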
		 */
		if (bdev->bd_holder == holder) {
			if (WARN_ON_ONCE(bdev->bd_holder_ops != hops))
				return false;
			return true;
		}
		return false;
	}

	/*
	 * If the whole device's holder is set to bd_may_claim, a partition on
	 * the device is claimed, but not the whole device.
	 */
	if (whole != bdev &&
	    whole->bd_holder && whole->bd_holder != bd_may_claim)
		return false;
	return true;
}

/**
 * bd_prepare_to_claim - claim a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops.
 *
 * Claim @bdev. This function fails if @bdev is already claimed by another
 * holder and waits if another claiming is in progress. On successful return,
 * the caller has ownership of bd_claiming and bd_holder[s].
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
int bd_prepare_to_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	if (WARN_ON_ONCE(!holder))
		return -EINVAL;
retry:
	mutex_lock(&bdev_lock);
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, holder, hops)) {
		mutex_unlock(&bdev_lock);
		return -EBUSY;
	}

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = __var_waitqueue(&whole->bd_claiming);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		goto retry;
	}

	/* yay, all mine */
	whole->bd_claiming = holder;
	mutex_unlock(&bdev_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */

static void bd_clear_claiming(struct block_device *whole, void *holder)
{
	lockdep_assert_held(&bdev_lock);
	/* tell others that we're done */
	BUG_ON(whole->bd_claiming != holder);
	whole->bd_claiming = NULL;
	wake_up_var(&whole->bd_claiming);
}

/**
 * bd_finish_claiming - finish claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 * @hops: block device holder operations
 *
 * Finish exclusive open of a block device. Mark the device as exclusively
 * open by the holder and wake up all waiters for exclusive open to finish.
 */
static void bd_finish_claiming(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	mutex_lock(&bdev_lock);
	BUG_ON(!bd_may_claim(bdev, holder, hops));
	/*
	 * Note that for a whole device bd_holders will be incremented twice,
	 * and bd_holder will be set to bd_may_claim before being set to holder.
	 */
	whole->bd_holders++;
	whole->bd_holder = bd_may_claim;
	bdev->bd_holders++;
	mutex_lock(&bdev->bd_holder_lock);
	bdev->bd_holder = holder;
	bdev->bd_holder_ops = hops;
	mutex_unlock(&bdev->bd_holder_lock);
	bd_clear_claiming(whole, holder);
	mutex_unlock(&bdev_lock);
}

/**
 * bd_abort_claiming - abort claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 *
 * Abort claiming of a block device when the exclusive open failed.
 * This can also be used when exclusive open is not actually desired and we
 * just needed to block other exclusive openers for a while.
 */
void bd_abort_claiming(struct block_device *bdev, void *holder)
{
	mutex_lock(&bdev_lock);
	bd_clear_claiming(bdev_whole(bdev), holder);
	mutex_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);

static void bd_end_claim(struct block_device *bdev, void *holder)
{
	struct block_device *whole = bdev_whole(bdev);
	bool unblock = false;

	/*
	 * Release a claim on the device. The holder fields are protected with
	 * bdev_lock. open_mutex is used to synchronize disk_holder unlinking.
	 */
	mutex_lock(&bdev_lock);
	WARN_ON_ONCE(bdev->bd_holder != holder);
	WARN_ON_ONCE(--bdev->bd_holders < 0);
	WARN_ON_ONCE(--whole->bd_holders < 0);
	if (!bdev->bd_holders) {
		mutex_lock(&bdev->bd_holder_lock);
		bdev->bd_holder = NULL;
		bdev->bd_holder_ops = NULL;
		mutex_unlock(&bdev->bd_holder_lock);
		if (bdev_test_flag(bdev, BD_WRITE_HOLDER))
			unblock = true;
	}
	if (!whole->bd_holders)
		whole->bd_holder = NULL;
	mutex_unlock(&bdev_lock);

	/*
	 * If this was the last claim, remove the holder link and unblock
	 * event polling if it was a write holder.
	 */
	if (unblock) {
		disk_unblock_events(bdev->bd_disk);
		bdev_clear_flag(bdev, BD_WRITE_HOLDER);
	}
}

static void blkdev_flush_mapping(struct block_device *bdev)
{
	WARN_ON_ONCE(bdev->bd_holders);
	sync_blockdev(bdev);
	kill_bdev(bdev);
	bdev_write_inode(bdev);
}

static void blkdev_put_whole(struct block_device *bdev)
{
	if (atomic_dec_and_test(&bdev->bd_openers))
		blkdev_flush_mapping(bdev);
	if (bdev->bd_disk->fops->release)
		bdev->bd_disk->fops->release(bdev->bd_disk);
}

static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (disk->fops->open) {
		ret = disk->fops->open(disk, mode);
		if (ret) {
			/* avoid ghost partitions on a removed medium */
			if (ret == -ENOMEDIUM &&
			    test_bit(GD_NEED_PART_SCAN, &disk->state))
				bdev_disk_changed(disk, true);
			return ret;
		}
	}

	if (!atomic_read(&bdev->bd_openers))
		set_init_blocksize(bdev);
	atomic_inc(&bdev->bd_openers);
	if (test_bit(GD_NEED_PART_SCAN, &disk->state)) {
		/*
		 * Only return scanning errors if we are called from contexts
		 * that explicitly want them, e.g. the BLKRRPART ioctl.
		 */
		ret = bdev_disk_changed(disk, false);
		if (ret && (mode & BLK_OPEN_STRICT_SCAN)) {
			blkdev_put_whole(bdev);
			return ret;
		}
	}
	return 0;
}

static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
{
	struct gendisk *disk = part->bd_disk;
	int ret;

	ret = blkdev_get_whole(bdev_whole(part), mode);
	if (ret)
		return ret;

	ret = -ENXIO;
	if (!bdev_nr_sectors(part))
		goto out_blkdev_put;

	if (!atomic_read(&part->bd_openers)) {
		disk->open_partitions++;
		set_init_blocksize(part);
	}
	atomic_inc(&part->bd_openers);
	return 0;

out_blkdev_put:
	blkdev_put_whole(bdev_whole(part));
	return ret;
}

int bdev_permission(dev_t dev, blk_mode_t mode, void *holder)
{
	int ret;

	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
			MAJOR(dev), MINOR(dev),
			((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
			((mode & BLK_OPEN_WRITE) ?
			 DEVCG_ACC_WRITE : 0));
	if (ret)
		return ret;

	/* Blocking writes requires an exclusive opener */
	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
		return -EINVAL;

	/*
	 * We're using error pointers to indicate to ->release() when we
	 * failed to open that block device. An error pointer passed in as
	 * the holder therefore doesn't make sense.
	 */
	if (WARN_ON_ONCE(IS_ERR(holder)))
		return -EINVAL;

	return 0;
}

static void blkdev_put_part(struct block_device *part)
{
	struct block_device *whole = bdev_whole(part);

	if (atomic_dec_and_test(&part->bd_openers)) {
		blkdev_flush_mapping(part);
		whole->bd_disk->open_partitions--;
	}
	blkdev_put_whole(whole);
}

struct block_device *blkdev_get_no_open(dev_t dev, bool autoload)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = ilookup(blockdev_superblock, dev);
	if (!inode && autoload && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
		blk_request_module(dev);
		inode = ilookup(blockdev_superblock, dev);
		if (inode)
			pr_warn_ratelimited(
	"block device autoloading is deprecated and will be removed.\n");
	}
	if (!inode)
		return NULL;

	/* switch from the inode reference to a device model one: */
	bdev = &BDEV_I(inode)->bdev;
	if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
		bdev = NULL;
	iput(inode);
	return bdev;
}

void blkdev_put_no_open(struct block_device *bdev)
{
	put_device(&bdev->bd_device);
}

static bool bdev_writes_blocked(struct block_device *bdev)
{
	return bdev->bd_writers < 0;
}

static void bdev_block_writes(struct block_device *bdev)
{
	bdev->bd_writers--;
}

static void bdev_unblock_writes(struct block_device *bdev)
{
	bdev->bd_writers++;
}

static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return true;
	/* Writes blocked? */
	if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
		return false;
	if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
		return false;
	return true;
}

static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return;

	/* Claim exclusive or shared write access. */
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_block_writes(bdev);
	else if (mode & BLK_OPEN_WRITE)
		bdev->bd_writers++;
}

static inline bool bdev_unclaimed(const struct file *bdev_file)
{
	return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host);
}

static void bdev_yield_write_access(struct file *bdev_file)
{
	struct block_device *bdev;

	if (bdev_allow_write_mounted)
		return;

	if (bdev_unclaimed(bdev_file))
		return;

	bdev = file_bdev(bdev_file);

	if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED)
		bdev_unblock_writes(bdev);
	else if (bdev_file->f_mode & FMODE_WRITE)
		bdev->bd_writers--;
}

/**
 * bdev_open - open a block device
 * @bdev: block device to open
 * @mode: open mode (BLK_OPEN_*)
 * @holder: exclusive holder identifier
 * @hops: holder operations
 * @bdev_file: file for the block device
 *
 * Open the block device. If @holder is not %NULL, the block device is opened
 * with exclusive access. Exclusive opens may nest for the same @holder.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * zero on success, -errno on failure.
 */
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
	      const struct blk_holder_ops *hops, struct file *bdev_file)
{
	bool unblock_events = true;
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (holder) {
		mode |= BLK_OPEN_EXCL;
		ret = bd_prepare_to_claim(bdev, holder, hops);
		if (ret)
			return ret;
	} else {
		if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL))
			return -EIO;
	}

	disk_block_events(disk);

	mutex_lock(&disk->open_mutex);
	ret = -ENXIO;
	if (!disk_live(disk))
		goto abort_claiming;
	if (!try_module_get(disk->fops->owner))
		goto abort_claiming;
	ret = -EBUSY;
	if (!bdev_may_open(bdev, mode))
		goto put_module;
	if (bdev_is_partition(bdev))
		ret = blkdev_get_part(bdev, mode);
	else
		ret = blkdev_get_whole(bdev, mode);
	if (ret)
		goto put_module;
	bdev_claim_write_access(bdev, mode);
	if (holder) {
		bd_finish_claiming(bdev, holder, hops);

		/*
		 * Block event polling for write claims if requested. Any write
		 * holder makes the write_holder state stick until all are
		 * released. This is good enough and tracking individual
		 * writeable references is too fragile given the way @mode is
		 * used in blkdev_get/put().
		 */
		if ((mode & BLK_OPEN_WRITE) &&
		    !bdev_test_flag(bdev, BD_WRITE_HOLDER) &&
		    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
			bdev_set_flag(bdev, BD_WRITE_HOLDER);
			unblock_events = false;
		}
	}
	mutex_unlock(&disk->open_mutex);

	if (unblock_events)
		disk_unblock_events(disk);

	bdev_file->f_flags |= O_LARGEFILE;
	bdev_file->f_mode |= FMODE_CAN_ODIRECT;
	if (bdev_nowait(bdev))
		bdev_file->f_mode |= FMODE_NOWAIT;
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_file->f_mode |= FMODE_WRITE_RESTRICTED;
	bdev_file->f_mapping = bdev->bd_mapping;
	bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
	bdev_file->private_data = holder;

	return 0;
put_module:
	module_put(disk->fops->owner);
abort_claiming:
	if (holder)
		bd_abort_claiming(bdev, holder);
	mutex_unlock(&disk->open_mutex);
	disk_unblock_events(disk);
	return ret;
}

/*
 * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk
 * associated with the floppy driver where it has allowed ioctls if the
 * file was opened for writing, but does not allow reads or writes.
 * Make sure that this quirk is reflected in @f_flags.
 *
 * It can also happen if a block device is opened as O_RDWR | O_WRONLY.
 */
static unsigned blk_to_file_flags(blk_mode_t mode)
{
	unsigned int flags = 0;

	if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) ==
	    (BLK_OPEN_READ | BLK_OPEN_WRITE))
		flags |= O_RDWR;
	else if (mode & BLK_OPEN_WRITE_IOCTL)
		flags |= O_RDWR | O_WRONLY;
	else if (mode & BLK_OPEN_WRITE)
		flags |= O_WRONLY;
	else if (mode & BLK_OPEN_READ)
		flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */
	else
		WARN_ON_ONCE(true);

	if (mode & BLK_OPEN_NDELAY)
		flags |= O_NDELAY;

	return flags;
}

struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
				   const struct blk_holder_ops *hops)
{
	struct file *bdev_file;
	struct block_device *bdev;
	unsigned int flags;
	int ret;

	ret = bdev_permission(dev, mode, holder);
	if (ret)
		return ERR_PTR(ret);

	bdev = blkdev_get_no_open(dev, true);
	if (!bdev)
		return ERR_PTR(-ENXIO);

	flags = blk_to_file_flags(mode);
	bdev_file = alloc_file_pseudo_noaccount(BD_INODE(bdev),
			blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops);
	if (IS_ERR(bdev_file)) {
		blkdev_put_no_open(bdev);
		return bdev_file;
	}
	ihold(BD_INODE(bdev));

	ret = bdev_open(bdev, mode, holder, hops, bdev_file);
	if (ret) {
		/* We failed to open the block device. Let ->release() know. */
		bdev_file->private_data = ERR_PTR(ret);
		fput(bdev_file);
		return ERR_PTR(ret);
	}
	return bdev_file;
}
EXPORT_SYMBOL(bdev_file_open_by_dev);

struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
				    void *holder,
				    const struct blk_holder_ops *hops)
{
	struct file *file;
	dev_t dev;
	int error;

	error = lookup_bdev(path, &dev);
	if (error)
		return ERR_PTR(error);

	file = bdev_file_open_by_dev(dev, mode, holder, hops);
	if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) {
		if (bdev_read_only(file_bdev(file))) {
			fput(file);
			file = ERR_PTR(-EACCES);
		}
	}

	return file;
}
EXPORT_SYMBOL(bdev_file_open_by_path);

static inline void bd_yield_claim(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;

	lockdep_assert_held(&bdev->bd_disk->open_mutex);

	if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder)))
		return;

	if (!bdev_unclaimed(bdev_file))
		bd_end_claim(bdev, holder);
}

void bdev_release(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;
	struct gendisk *disk = bdev->bd_disk;

	/* We failed to open that block device. */
	if (IS_ERR(holder))
		goto put_no_open;

	/*
	 * Sync early if it looks like we're the last one. If someone else
	 * opens the block device between now and the decrement of bd_openers
	 * then we did a sync that we didn't need to, but that's not the end
	 * of the world and we want to avoid long (could be several minutes)
	 * syncs while holding the mutex.
	 */
	if (atomic_read(&bdev->bd_openers) == 1)
		sync_blockdev(bdev);

	mutex_lock(&disk->open_mutex);
	bdev_yield_write_access(bdev_file);

	if (holder)
		bd_yield_claim(bdev_file);

	/*
	 * Trigger event checking and tell drivers to flush the MEDIA_CHANGE
	 * event.
	 * This is to ensure detection of media removal commanded from
	 * userland - e.g. eject(1).
	 */
	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);

	if (bdev_is_partition(bdev))
		blkdev_put_part(bdev);
	else
		blkdev_put_whole(bdev);
	mutex_unlock(&disk->open_mutex);

	module_put(disk->fops->owner);
put_no_open:
	blkdev_put_no_open(bdev);
}

/**
 * bdev_fput - yield claim to the block device and put the file
 * @bdev_file: open block device
 *
 * Yield claim on the block device and put the file. Ensure that the
 * block device can be reclaimed before the file is closed, which is a
 * deferred operation.
 */
void bdev_fput(struct file *bdev_file)
{
	if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
		return;

	if (bdev_file->private_data) {
		struct block_device *bdev = file_bdev(bdev_file);
		struct gendisk *disk = bdev->bd_disk;

		mutex_lock(&disk->open_mutex);
		bdev_yield_write_access(bdev_file);
		bd_yield_claim(bdev_file);
		/*
		 * Tell ->release() that we already gave up our claim on the
		 * device and, if write restrictions are in effect, that we
		 * already gave up write access to the device.
		 */
		bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
		mutex_unlock(&disk->open_mutex);
	}

	fput(bdev_file);
}
EXPORT_SYMBOL(bdev_fput);

/**
 * lookup_bdev() - Look up a struct block_device by name.
 * @pathname: Name of the block device in the filesystem.
 * @dev: Pointer to the block device's dev_t, if found.
 *
 * Lookup the block device's dev_t at @pathname in the current
 * namespace if possible and return it in @dev.
 *
 * Context: May sleep.
 * Return: 0 if succeeded, negative errno otherwise.
 */
int lookup_bdev(const char *pathname, dev_t *dev)
{
	struct inode *inode;
	struct path path;
	int error;

	if (!pathname || !*pathname)
		return -EINVAL;

	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	if (error)
		return error;

	inode = d_backing_inode(path.dentry);
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto out_path_put;
	error = -EACCES;
	if (!may_open_dev(&path))
		goto out_path_put;

	*dev = inode->i_rdev;
	error = 0;
out_path_put:
	path_put(&path);
	return error;
}
EXPORT_SYMBOL(lookup_bdev);

/**
 * bdev_mark_dead - mark a block device as dead
 * @bdev: block device to operate on
 * @surprise: indicate a surprise removal
 *
 * Tell the file system that this device or media is dead. If @surprise is set
 * to %true the device or media is already gone, if not we are preparing for an
 * orderly removal.
 *
 * This calls into the file system, which then typically syncs out all dirty
 * data and writes back inodes and then invalidates any cached data in the
 * inodes on the file system. In addition we also invalidate the block device
 * mapping.
 */
void bdev_mark_dead(struct block_device *bdev, bool surprise)
{
	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
		bdev->bd_holder_ops->mark_dead(bdev, surprise);
	else {
		mutex_unlock(&bdev->bd_holder_lock);
		sync_blockdev(bdev);
	}

	invalidate_bdev(bdev);
}
/*
 * New drivers should not use this directly.
 * There are some drivers, however, that need this for historical reasons. For
 * example, the DASD driver has historically had a shutdown to offline mode
 * that doesn't actually remove the gendisk and that otherwise looks a lot
 * like a safe device removal.
 */
EXPORT_SYMBOL_GPL(bdev_mark_dead);

void sync_bdevs(bool wait)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;
		struct block_device *bdev;

		spin_lock(&inode->i_lock);
		if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&blockdev_superblock->s_inode_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from the s_inodes list while we dropped the
		 * s_inode_list_lock. We cannot iput the inode now as we may
		 * be holding the last reference and we cannot iput it under
		 * s_inode_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;
		bdev = I_BDEV(inode);

		mutex_lock(&bdev->bd_disk->open_mutex);
		if (!atomic_read(&bdev->bd_openers)) {
			; /* skip */
		} else if (wait) {
			/*
			 * We keep the error status of individual mappings so
			 * that applications can catch the writeback error
			 * using fsync(2). See filemap_fdatawait_keep_errors()
			 * for details.
			 */
			filemap_fdatawait_keep_errors(inode->i_mapping);
		} else {
			filemap_fdatawrite(inode->i_mapping);
		}
		mutex_unlock(&bdev->bd_disk->open_mutex);

		spin_lock(&blockdev_superblock->s_inode_list_lock);
	}
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	iput(old_inode);
}

/*
 * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices.
 */
void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask)
{
	struct block_device *bdev;

	/*
	 * Note that d_backing_inode() returns the block device node inode, not
	 * the block device's internal inode. Therefore it is *not* valid to
	 * use I_BDEV() here; the block device has to be looked up by i_rdev
	 * instead.
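	 * If the device has already gone away, blkdev_get_no_open() returns
	 * NULL and we simply report no extra attributes.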
	 */
	bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev, false);
	if (!bdev)
		return;

	if (request_mask & STATX_DIOALIGN) {
		stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
		stat->dio_offset_align = bdev_logical_block_size(bdev);
		stat->result_mask |= STATX_DIOALIGN;
	}

	if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) {
		struct request_queue *bd_queue = bdev->bd_queue;

		generic_fill_statx_atomic_writes(stat,
			queue_atomic_write_unit_min_bytes(bd_queue),
			queue_atomic_write_unit_max_bytes(bd_queue),
			0);
	}

	stat->blksize = bdev_io_min(bdev);

	blkdev_put_no_open(bdev);
}

bool disk_live(struct gendisk *disk)
{
	return !inode_unhashed(BD_INODE(disk->part0));
}
EXPORT_SYMBOL_GPL(disk_live);

unsigned int block_size(struct block_device *bdev)
{
	return 1 << BD_INODE(bdev)->i_blkbits;
}
EXPORT_SYMBOL_GPL(block_size);

static int __init setup_bdev_allow_write_mounted(char *str)
{
	if (kstrtobool(str, &bdev_allow_write_mounted))
		pr_warn("Invalid option string for bdev_allow_write_mounted:"
			" '%s'\n", str);
	return 1;
}
__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);