1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 5 * Copyright (C) 2016 - 2020 Christoph Hellwig 6 */ 7 8 #include <linux/init.h> 9 #include <linux/mm.h> 10 #include <linux/slab.h> 11 #include <linux/kmod.h> 12 #include <linux/major.h> 13 #include <linux/device_cgroup.h> 14 #include <linux/blkdev.h> 15 #include <linux/blk-integrity.h> 16 #include <linux/backing-dev.h> 17 #include <linux/module.h> 18 #include <linux/blkpg.h> 19 #include <linux/magic.h> 20 #include <linux/buffer_head.h> 21 #include <linux/swap.h> 22 #include <linux/writeback.h> 23 #include <linux/mount.h> 24 #include <linux/pseudo_fs.h> 25 #include <linux/uio.h> 26 #include <linux/namei.h> 27 #include <linux/security.h> 28 #include <linux/part_stat.h> 29 #include <linux/uaccess.h> 30 #include <linux/stat.h> 31 #include "../fs/internal.h" 32 #include "blk.h" 33 34 /* Should we allow writing to mounted block devices? */ 35 static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED); 36 37 struct bdev_inode { 38 struct block_device bdev; 39 struct inode vfs_inode; 40 }; 41 42 static inline struct bdev_inode *BDEV_I(struct inode *inode) 43 { 44 return container_of(inode, struct bdev_inode, vfs_inode); 45 } 46 47 static inline struct inode *BD_INODE(struct block_device *bdev) 48 { 49 return &container_of(bdev, struct bdev_inode, bdev)->vfs_inode; 50 } 51 52 struct block_device *I_BDEV(struct inode *inode) 53 { 54 return &BDEV_I(inode)->bdev; 55 } 56 EXPORT_SYMBOL(I_BDEV); 57 58 struct block_device *file_bdev(struct file *bdev_file) 59 { 60 return I_BDEV(bdev_file->f_mapping->host); 61 } 62 EXPORT_SYMBOL(file_bdev); 63 64 static void bdev_write_inode(struct block_device *bdev) 65 { 66 struct inode *inode = BD_INODE(bdev); 67 int ret; 68 69 spin_lock(&inode->i_lock); 70 while (inode_state_read(inode) & I_DIRTY) { 71 spin_unlock(&inode->i_lock); 72 ret = write_inode_now(inode, true); 73 if (ret) 74 pr_warn_ratelimited( 75 "VFS: Dirty inode writeback failed for block device %pg (err=%d).\n", 76 bdev, ret); 77 spin_lock(&inode->i_lock); 78 } 79 spin_unlock(&inode->i_lock); 80 } 81 82 /* Kill _all_ buffers and pagecache , dirty or not.. */ 83 static void kill_bdev(struct block_device *bdev) 84 { 85 struct address_space *mapping = bdev->bd_mapping; 86 87 if (mapping_empty(mapping)) 88 return; 89 90 invalidate_bh_lrus(); 91 truncate_inode_pages(mapping, 0); 92 } 93 94 /* Invalidate clean unused buffers and pagecache. */ 95 void invalidate_bdev(struct block_device *bdev) 96 { 97 struct address_space *mapping = bdev->bd_mapping; 98 99 if (mapping->nrpages) { 100 invalidate_bh_lrus(); 101 lru_add_drain_all(); /* make sure all lru add caches are flushed */ 102 invalidate_mapping_pages(mapping, 0, -1); 103 } 104 } 105 EXPORT_SYMBOL(invalidate_bdev); 106 107 /* 108 * Drop all buffers & page cache for given bdev range. This function bails 109 * with error if bdev has other exclusive owner (such as filesystem). 110 */ 111 int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, 112 loff_t lstart, loff_t lend) 113 { 114 /* 115 * If we don't hold exclusive handle for the device, upgrade to it 116 * while we discard the buffer cache to avoid discarding buffers 117 * under live filesystem. 118 */ 119 if (!(mode & BLK_OPEN_EXCL)) { 120 int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL); 121 if (err) 122 goto invalidate; 123 } 124 125 truncate_inode_pages_range(bdev->bd_mapping, lstart, lend); 126 if (!(mode & BLK_OPEN_EXCL)) 127 bd_abort_claiming(bdev, truncate_bdev_range); 128 return 0; 129 130 invalidate: 131 /* 132 * Someone else has handle exclusively open. Try invalidating instead. 133 * The 'end' argument is inclusive so the rounding is safe. 134 */ 135 return invalidate_inode_pages2_range(bdev->bd_mapping, 136 lstart >> PAGE_SHIFT, 137 lend >> PAGE_SHIFT); 138 } 139 140 static void set_init_blocksize(struct block_device *bdev) 141 { 142 unsigned int bsize = bdev_logical_block_size(bdev); 143 loff_t size = i_size_read(BD_INODE(bdev)); 144 145 while (bsize < PAGE_SIZE) { 146 if (size & bsize) 147 break; 148 bsize <<= 1; 149 } 150 BD_INODE(bdev)->i_blkbits = blksize_bits(bsize); 151 mapping_set_folio_min_order(BD_INODE(bdev)->i_mapping, 152 get_order(bsize)); 153 } 154 155 /** 156 * bdev_validate_blocksize - check that this block size is acceptable 157 * @bdev: blockdevice to check 158 * @block_size: block size to check 159 * 160 * For block device users that do not use buffer heads or the block device 161 * page cache, make sure that this block size can be used with the device. 162 * 163 * Return: On success zero is returned, negative error code on failure. 164 */ 165 int bdev_validate_blocksize(struct block_device *bdev, int block_size) 166 { 167 if (blk_validate_block_size(block_size)) 168 return -EINVAL; 169 170 /* Size cannot be smaller than the size supported by the device */ 171 if (block_size < bdev_logical_block_size(bdev)) 172 return -EINVAL; 173 174 return 0; 175 } 176 EXPORT_SYMBOL_GPL(bdev_validate_blocksize); 177 178 int set_blocksize(struct file *file, int size) 179 { 180 struct inode *inode = file->f_mapping->host; 181 struct block_device *bdev = I_BDEV(inode); 182 int ret; 183 184 ret = bdev_validate_blocksize(bdev, size); 185 if (ret) 186 return ret; 187 188 if (!file->private_data) 189 return -EINVAL; 190 191 /* Don't change the size if it is same as current */ 192 if (inode->i_blkbits != blksize_bits(size)) { 193 /* 194 * Flush and truncate the pagecache before we reconfigure the 195 * mapping geometry because folio sizes are variable now. If a 196 * reader has already allocated a folio whose size is smaller 197 * than the new min_order but invokes readahead after the new 198 * min_order becomes visible, readahead will think there are 199 * "zero" blocks per folio and crash. Take the inode and 200 * invalidation locks to avoid racing with 201 * read/write/fallocate. 202 */ 203 inode_lock(inode); 204 filemap_invalidate_lock(inode->i_mapping); 205 206 sync_blockdev(bdev); 207 kill_bdev(bdev); 208 209 inode->i_blkbits = blksize_bits(size); 210 mapping_set_folio_min_order(inode->i_mapping, get_order(size)); 211 filemap_invalidate_unlock(inode->i_mapping); 212 inode_unlock(inode); 213 } 214 return 0; 215 } 216 217 EXPORT_SYMBOL(set_blocksize); 218 219 static int sb_validate_large_blocksize(struct super_block *sb, int size) 220 { 221 const char *err_str = NULL; 222 223 if (!(sb->s_type->fs_flags & FS_LBS)) 224 err_str = "not supported by filesystem"; 225 else if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 226 err_str = "is only supported with CONFIG_TRANSPARENT_HUGEPAGE"; 227 228 if (!err_str) 229 return 0; 230 231 pr_warn_ratelimited("%s: block size(%d) > page size(%lu) %s\n", 232 sb->s_type->name, size, PAGE_SIZE, err_str); 233 return -EINVAL; 234 } 235 236 int sb_set_blocksize(struct super_block *sb, int size) 237 { 238 if (size > PAGE_SIZE && sb_validate_large_blocksize(sb, size)) 239 return 0; 240 if (set_blocksize(sb->s_bdev_file, size)) 241 return 0; 242 /* If we get here, we know size is validated */ 243 sb->s_blocksize = size; 244 sb->s_blocksize_bits = blksize_bits(size); 245 return sb->s_blocksize; 246 } 247 248 EXPORT_SYMBOL(sb_set_blocksize); 249 250 int __must_check sb_min_blocksize(struct super_block *sb, int size) 251 { 252 int minsize = bdev_logical_block_size(sb->s_bdev); 253 if (size < minsize) 254 size = minsize; 255 return sb_set_blocksize(sb, size); 256 } 257 258 EXPORT_SYMBOL(sb_min_blocksize); 259 260 int sync_blockdev_nowait(struct block_device *bdev) 261 { 262 if (!bdev) 263 return 0; 264 return filemap_flush(bdev->bd_mapping); 265 } 266 EXPORT_SYMBOL_GPL(sync_blockdev_nowait); 267 268 /* 269 * Write out and wait upon all the dirty data associated with a block 270 * device via its mapping. Does not take the superblock lock. 271 */ 272 int sync_blockdev(struct block_device *bdev) 273 { 274 if (!bdev) 275 return 0; 276 return filemap_write_and_wait(bdev->bd_mapping); 277 } 278 EXPORT_SYMBOL(sync_blockdev); 279 280 int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) 281 { 282 return filemap_write_and_wait_range(bdev->bd_mapping, 283 lstart, lend); 284 } 285 EXPORT_SYMBOL(sync_blockdev_range); 286 287 /** 288 * bdev_freeze - lock a filesystem and force it into a consistent state 289 * @bdev: blockdevice to lock 290 * 291 * If a superblock is found on this device, we take the s_umount semaphore 292 * on it to make sure nobody unmounts until the snapshot creation is done. 293 * The reference counter (bd_fsfreeze_count) guarantees that only the last 294 * unfreeze process can unfreeze the frozen filesystem actually when multiple 295 * freeze requests arrive simultaneously. It counts up in bdev_freeze() and 296 * count down in bdev_thaw(). When it becomes 0, thaw_bdev() will unfreeze 297 * actually. 298 * 299 * Return: On success zero is returned, negative error code on failure. 300 */ 301 int bdev_freeze(struct block_device *bdev) 302 { 303 int error = 0; 304 305 mutex_lock(&bdev->bd_fsfreeze_mutex); 306 307 if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { 308 mutex_unlock(&bdev->bd_fsfreeze_mutex); 309 return 0; 310 } 311 312 mutex_lock(&bdev->bd_holder_lock); 313 if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) { 314 error = bdev->bd_holder_ops->freeze(bdev); 315 lockdep_assert_not_held(&bdev->bd_holder_lock); 316 } else { 317 mutex_unlock(&bdev->bd_holder_lock); 318 error = sync_blockdev(bdev); 319 } 320 321 if (error) 322 atomic_dec(&bdev->bd_fsfreeze_count); 323 324 mutex_unlock(&bdev->bd_fsfreeze_mutex); 325 return error; 326 } 327 EXPORT_SYMBOL(bdev_freeze); 328 329 /** 330 * bdev_thaw - unlock filesystem 331 * @bdev: blockdevice to unlock 332 * 333 * Unlocks the filesystem and marks it writeable again after bdev_freeze(). 334 * 335 * Return: On success zero is returned, negative error code on failure. 336 */ 337 int bdev_thaw(struct block_device *bdev) 338 { 339 int error = -EINVAL, nr_freeze; 340 341 mutex_lock(&bdev->bd_fsfreeze_mutex); 342 343 /* 344 * If this returns < 0 it means that @bd_fsfreeze_count was 345 * already 0 and no decrement was performed. 346 */ 347 nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count); 348 if (nr_freeze < 0) 349 goto out; 350 351 error = 0; 352 if (nr_freeze > 0) 353 goto out; 354 355 mutex_lock(&bdev->bd_holder_lock); 356 if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) { 357 error = bdev->bd_holder_ops->thaw(bdev); 358 lockdep_assert_not_held(&bdev->bd_holder_lock); 359 } else { 360 mutex_unlock(&bdev->bd_holder_lock); 361 } 362 363 if (error) 364 atomic_inc(&bdev->bd_fsfreeze_count); 365 out: 366 mutex_unlock(&bdev->bd_fsfreeze_mutex); 367 return error; 368 } 369 EXPORT_SYMBOL(bdev_thaw); 370 371 /* 372 * pseudo-fs 373 */ 374 375 static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock); 376 static struct kmem_cache *bdev_cachep __ro_after_init; 377 378 static struct inode *bdev_alloc_inode(struct super_block *sb) 379 { 380 struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL); 381 382 if (!ei) 383 return NULL; 384 memset(&ei->bdev, 0, sizeof(ei->bdev)); 385 386 if (security_bdev_alloc(&ei->bdev)) { 387 kmem_cache_free(bdev_cachep, ei); 388 return NULL; 389 } 390 return &ei->vfs_inode; 391 } 392 393 static void bdev_free_inode(struct inode *inode) 394 { 395 struct block_device *bdev = I_BDEV(inode); 396 397 free_percpu(bdev->bd_stats); 398 kfree(bdev->bd_meta_info); 399 security_bdev_free(bdev); 400 401 if (!bdev_is_partition(bdev)) { 402 if (bdev->bd_disk && bdev->bd_disk->bdi) 403 bdi_put(bdev->bd_disk->bdi); 404 kfree(bdev->bd_disk); 405 } 406 407 if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) 408 blk_free_ext_minor(MINOR(bdev->bd_dev)); 409 410 kmem_cache_free(bdev_cachep, BDEV_I(inode)); 411 } 412 413 static void init_once(void *data) 414 { 415 struct bdev_inode *ei = data; 416 417 inode_init_once(&ei->vfs_inode); 418 } 419 420 static const struct super_operations bdev_sops = { 421 .statfs = simple_statfs, 422 .alloc_inode = bdev_alloc_inode, 423 .free_inode = bdev_free_inode, 424 .drop_inode = inode_just_drop, 425 }; 426 427 static int bd_init_fs_context(struct fs_context *fc) 428 { 429 struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC); 430 if (!ctx) 431 return -ENOMEM; 432 fc->s_iflags |= SB_I_CGROUPWB; 433 ctx->ops = &bdev_sops; 434 return 0; 435 } 436 437 static struct file_system_type bd_type = { 438 .name = "bdev", 439 .init_fs_context = bd_init_fs_context, 440 .kill_sb = kill_anon_super, 441 }; 442 443 struct super_block *blockdev_superblock __ro_after_init; 444 static struct vfsmount *blockdev_mnt __ro_after_init; 445 EXPORT_SYMBOL_GPL(blockdev_superblock); 446 447 void __init bdev_cache_init(void) 448 { 449 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 450 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 451 SLAB_ACCOUNT|SLAB_PANIC), 452 init_once); 453 blockdev_mnt = kern_mount(&bd_type); 454 if (IS_ERR(blockdev_mnt)) 455 panic("Cannot create bdev pseudo-fs"); 456 blockdev_superblock = blockdev_mnt->mnt_sb; /* For writeback */ 457 } 458 459 struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) 460 { 461 struct block_device *bdev; 462 struct inode *inode; 463 464 inode = new_inode(blockdev_superblock); 465 if (!inode) 466 return NULL; 467 inode->i_mode = S_IFBLK; 468 inode->i_rdev = 0; 469 inode->i_data.a_ops = &def_blk_aops; 470 mapping_set_gfp_mask(&inode->i_data, GFP_USER); 471 472 bdev = I_BDEV(inode); 473 mutex_init(&bdev->bd_fsfreeze_mutex); 474 spin_lock_init(&bdev->bd_size_lock); 475 mutex_init(&bdev->bd_holder_lock); 476 atomic_set(&bdev->__bd_flags, partno); 477 bdev->bd_mapping = &inode->i_data; 478 bdev->bd_queue = disk->queue; 479 if (partno && bdev_test_flag(disk->part0, BD_HAS_SUBMIT_BIO)) 480 bdev_set_flag(bdev, BD_HAS_SUBMIT_BIO); 481 bdev->bd_stats = alloc_percpu(struct disk_stats); 482 if (!bdev->bd_stats) { 483 iput(inode); 484 return NULL; 485 } 486 bdev->bd_disk = disk; 487 return bdev; 488 } 489 490 void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) 491 { 492 spin_lock(&bdev->bd_size_lock); 493 i_size_write(BD_INODE(bdev), (loff_t)sectors << SECTOR_SHIFT); 494 bdev->bd_nr_sectors = sectors; 495 spin_unlock(&bdev->bd_size_lock); 496 } 497 498 void bdev_add(struct block_device *bdev, dev_t dev) 499 { 500 struct inode *inode = BD_INODE(bdev); 501 if (bdev_stable_writes(bdev)) 502 mapping_set_stable_writes(bdev->bd_mapping); 503 bdev->bd_dev = dev; 504 inode->i_rdev = dev; 505 inode->i_ino = dev; 506 insert_inode_hash(inode); 507 } 508 509 void bdev_unhash(struct block_device *bdev) 510 { 511 remove_inode_hash(BD_INODE(bdev)); 512 } 513 514 void bdev_drop(struct block_device *bdev) 515 { 516 iput(BD_INODE(bdev)); 517 } 518 519 long nr_blockdev_pages(void) 520 { 521 struct inode *inode; 522 long ret = 0; 523 524 spin_lock(&blockdev_superblock->s_inode_list_lock); 525 list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) 526 ret += inode->i_mapping->nrpages; 527 spin_unlock(&blockdev_superblock->s_inode_list_lock); 528 529 return ret; 530 } 531 532 /** 533 * bd_may_claim - test whether a block device can be claimed 534 * @bdev: block device of interest 535 * @holder: holder trying to claim @bdev 536 * @hops: holder ops 537 * 538 * Test whether @bdev can be claimed by @holder. 539 * 540 * RETURNS: 541 * %true if @bdev can be claimed, %false otherwise. 542 */ 543 static bool bd_may_claim(struct block_device *bdev, void *holder, 544 const struct blk_holder_ops *hops) 545 { 546 struct block_device *whole = bdev_whole(bdev); 547 548 lockdep_assert_held(&bdev_lock); 549 550 if (bdev->bd_holder) { 551 /* 552 * The same holder can always re-claim. 553 */ 554 if (bdev->bd_holder == holder) { 555 if (WARN_ON_ONCE(bdev->bd_holder_ops != hops)) 556 return false; 557 return true; 558 } 559 return false; 560 } 561 562 /* 563 * If the whole devices holder is set to bd_may_claim, a partition on 564 * the device is claimed, but not the whole device. 565 */ 566 if (whole != bdev && 567 whole->bd_holder && whole->bd_holder != bd_may_claim) 568 return false; 569 return true; 570 } 571 572 /** 573 * bd_prepare_to_claim - claim a block device 574 * @bdev: block device of interest 575 * @holder: holder trying to claim @bdev 576 * @hops: holder ops. 577 * 578 * Claim @bdev. This function fails if @bdev is already claimed by another 579 * holder and waits if another claiming is in progress. return, the caller 580 * has ownership of bd_claiming and bd_holder[s]. 581 * 582 * RETURNS: 583 * 0 if @bdev can be claimed, -EBUSY otherwise. 584 */ 585 int bd_prepare_to_claim(struct block_device *bdev, void *holder, 586 const struct blk_holder_ops *hops) 587 { 588 struct block_device *whole = bdev_whole(bdev); 589 590 if (WARN_ON_ONCE(!holder)) 591 return -EINVAL; 592 retry: 593 mutex_lock(&bdev_lock); 594 /* if someone else claimed, fail */ 595 if (!bd_may_claim(bdev, holder, hops)) { 596 mutex_unlock(&bdev_lock); 597 return -EBUSY; 598 } 599 600 /* if claiming is already in progress, wait for it to finish */ 601 if (whole->bd_claiming) { 602 wait_queue_head_t *wq = __var_waitqueue(&whole->bd_claiming); 603 DEFINE_WAIT(wait); 604 605 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); 606 mutex_unlock(&bdev_lock); 607 schedule(); 608 finish_wait(wq, &wait); 609 goto retry; 610 } 611 612 /* yay, all mine */ 613 whole->bd_claiming = holder; 614 mutex_unlock(&bdev_lock); 615 return 0; 616 } 617 EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ 618 619 static void bd_clear_claiming(struct block_device *whole, void *holder) 620 { 621 lockdep_assert_held(&bdev_lock); 622 /* tell others that we're done */ 623 BUG_ON(whole->bd_claiming != holder); 624 whole->bd_claiming = NULL; 625 wake_up_var(&whole->bd_claiming); 626 } 627 628 /** 629 * bd_finish_claiming - finish claiming of a block device 630 * @bdev: block device of interest 631 * @holder: holder that has claimed @bdev 632 * @hops: block device holder operations 633 * 634 * Finish exclusive open of a block device. Mark the device as exlusively 635 * open by the holder and wake up all waiters for exclusive open to finish. 636 */ 637 static void bd_finish_claiming(struct block_device *bdev, void *holder, 638 const struct blk_holder_ops *hops) 639 { 640 struct block_device *whole = bdev_whole(bdev); 641 642 mutex_lock(&bdev_lock); 643 BUG_ON(!bd_may_claim(bdev, holder, hops)); 644 /* 645 * Note that for a whole device bd_holders will be incremented twice, 646 * and bd_holder will be set to bd_may_claim before being set to holder 647 */ 648 whole->bd_holders++; 649 whole->bd_holder = bd_may_claim; 650 bdev->bd_holders++; 651 mutex_lock(&bdev->bd_holder_lock); 652 bdev->bd_holder = holder; 653 bdev->bd_holder_ops = hops; 654 mutex_unlock(&bdev->bd_holder_lock); 655 bd_clear_claiming(whole, holder); 656 mutex_unlock(&bdev_lock); 657 } 658 659 /** 660 * bd_abort_claiming - abort claiming of a block device 661 * @bdev: block device of interest 662 * @holder: holder that has claimed @bdev 663 * 664 * Abort claiming of a block device when the exclusive open failed. This can be 665 * also used when exclusive open is not actually desired and we just needed 666 * to block other exclusive openers for a while. 667 */ 668 void bd_abort_claiming(struct block_device *bdev, void *holder) 669 { 670 mutex_lock(&bdev_lock); 671 bd_clear_claiming(bdev_whole(bdev), holder); 672 mutex_unlock(&bdev_lock); 673 } 674 EXPORT_SYMBOL(bd_abort_claiming); 675 676 static void bd_end_claim(struct block_device *bdev, void *holder) 677 { 678 struct block_device *whole = bdev_whole(bdev); 679 bool unblock = false; 680 681 /* 682 * Release a claim on the device. The holder fields are protected with 683 * bdev_lock. open_mutex is used to synchronize disk_holder unlinking. 684 */ 685 mutex_lock(&bdev_lock); 686 WARN_ON_ONCE(bdev->bd_holder != holder); 687 WARN_ON_ONCE(--bdev->bd_holders < 0); 688 WARN_ON_ONCE(--whole->bd_holders < 0); 689 if (!bdev->bd_holders) { 690 mutex_lock(&bdev->bd_holder_lock); 691 bdev->bd_holder = NULL; 692 bdev->bd_holder_ops = NULL; 693 mutex_unlock(&bdev->bd_holder_lock); 694 if (bdev_test_flag(bdev, BD_WRITE_HOLDER)) 695 unblock = true; 696 } 697 if (!whole->bd_holders) 698 whole->bd_holder = NULL; 699 mutex_unlock(&bdev_lock); 700 701 /* 702 * If this was the last claim, remove holder link and unblock evpoll if 703 * it was a write holder. 704 */ 705 if (unblock) { 706 disk_unblock_events(bdev->bd_disk); 707 bdev_clear_flag(bdev, BD_WRITE_HOLDER); 708 } 709 } 710 711 static void blkdev_flush_mapping(struct block_device *bdev) 712 { 713 WARN_ON_ONCE(bdev->bd_holders); 714 sync_blockdev(bdev); 715 kill_bdev(bdev); 716 bdev_write_inode(bdev); 717 } 718 719 static void blkdev_put_whole(struct block_device *bdev) 720 { 721 if (atomic_dec_and_test(&bdev->bd_openers)) 722 blkdev_flush_mapping(bdev); 723 if (bdev->bd_disk->fops->release) 724 bdev->bd_disk->fops->release(bdev->bd_disk); 725 } 726 727 static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) 728 { 729 struct gendisk *disk = bdev->bd_disk; 730 int ret; 731 732 if (disk->fops->open) { 733 ret = disk->fops->open(disk, mode); 734 if (ret) { 735 /* avoid ghost partitions on a removed medium */ 736 if (ret == -ENOMEDIUM && 737 test_bit(GD_NEED_PART_SCAN, &disk->state)) 738 bdev_disk_changed(disk, true); 739 return ret; 740 } 741 } 742 743 if (!atomic_read(&bdev->bd_openers)) 744 set_init_blocksize(bdev); 745 atomic_inc(&bdev->bd_openers); 746 if (test_bit(GD_NEED_PART_SCAN, &disk->state)) { 747 /* 748 * Only return scanning errors if we are called from contexts 749 * that explicitly want them, e.g. the BLKRRPART ioctl. 750 */ 751 ret = bdev_disk_changed(disk, false); 752 if (ret && (mode & BLK_OPEN_STRICT_SCAN)) { 753 blkdev_put_whole(bdev); 754 return ret; 755 } 756 } 757 return 0; 758 } 759 760 static int blkdev_get_part(struct block_device *part, blk_mode_t mode) 761 { 762 struct gendisk *disk = part->bd_disk; 763 int ret; 764 765 ret = blkdev_get_whole(bdev_whole(part), mode); 766 if (ret) 767 return ret; 768 769 ret = -ENXIO; 770 if (!bdev_nr_sectors(part)) 771 goto out_blkdev_put; 772 773 if (!atomic_read(&part->bd_openers)) { 774 disk->open_partitions++; 775 set_init_blocksize(part); 776 } 777 atomic_inc(&part->bd_openers); 778 return 0; 779 780 out_blkdev_put: 781 blkdev_put_whole(bdev_whole(part)); 782 return ret; 783 } 784 785 int bdev_permission(dev_t dev, blk_mode_t mode, void *holder) 786 { 787 int ret; 788 789 ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, 790 MAJOR(dev), MINOR(dev), 791 ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | 792 ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); 793 if (ret) 794 return ret; 795 796 /* Blocking writes requires exclusive opener */ 797 if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) 798 return -EINVAL; 799 800 /* 801 * We're using error pointers to indicate to ->release() when we 802 * failed to open that block device. Also this doesn't make sense. 803 */ 804 if (WARN_ON_ONCE(IS_ERR(holder))) 805 return -EINVAL; 806 807 return 0; 808 } 809 810 static void blkdev_put_part(struct block_device *part) 811 { 812 struct block_device *whole = bdev_whole(part); 813 814 if (atomic_dec_and_test(&part->bd_openers)) { 815 blkdev_flush_mapping(part); 816 whole->bd_disk->open_partitions--; 817 } 818 blkdev_put_whole(whole); 819 } 820 821 struct block_device *blkdev_get_no_open(dev_t dev, bool autoload) 822 { 823 struct block_device *bdev; 824 struct inode *inode; 825 826 inode = ilookup(blockdev_superblock, dev); 827 if (!inode && autoload && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) { 828 blk_request_module(dev); 829 inode = ilookup(blockdev_superblock, dev); 830 if (inode) 831 pr_warn_ratelimited( 832 "block device autoloading is deprecated and will be removed.\n"); 833 } 834 if (!inode) 835 return NULL; 836 837 /* switch from the inode reference to a device mode one: */ 838 bdev = &BDEV_I(inode)->bdev; 839 if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) 840 bdev = NULL; 841 iput(inode); 842 return bdev; 843 } 844 845 void blkdev_put_no_open(struct block_device *bdev) 846 { 847 put_device(&bdev->bd_device); 848 } 849 850 static bool bdev_writes_blocked(struct block_device *bdev) 851 { 852 return bdev->bd_writers < 0; 853 } 854 855 static void bdev_block_writes(struct block_device *bdev) 856 { 857 bdev->bd_writers--; 858 } 859 860 static void bdev_unblock_writes(struct block_device *bdev) 861 { 862 bdev->bd_writers++; 863 } 864 865 static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode) 866 { 867 if (bdev_allow_write_mounted) 868 return true; 869 /* Writes blocked? */ 870 if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev)) 871 return false; 872 if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0) 873 return false; 874 return true; 875 } 876 877 static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) 878 { 879 if (bdev_allow_write_mounted) 880 return; 881 882 /* Claim exclusive or shared write access. */ 883 if (mode & BLK_OPEN_RESTRICT_WRITES) 884 bdev_block_writes(bdev); 885 else if (mode & BLK_OPEN_WRITE) 886 bdev->bd_writers++; 887 } 888 889 static inline bool bdev_unclaimed(const struct file *bdev_file) 890 { 891 return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host); 892 } 893 894 static void bdev_yield_write_access(struct file *bdev_file) 895 { 896 struct block_device *bdev; 897 898 if (bdev_allow_write_mounted) 899 return; 900 901 if (bdev_unclaimed(bdev_file)) 902 return; 903 904 bdev = file_bdev(bdev_file); 905 906 if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED) 907 bdev_unblock_writes(bdev); 908 else if (bdev_file->f_mode & FMODE_WRITE) 909 bdev->bd_writers--; 910 } 911 912 /** 913 * bdev_open - open a block device 914 * @bdev: block device to open 915 * @mode: open mode (BLK_OPEN_*) 916 * @holder: exclusive holder identifier 917 * @hops: holder operations 918 * @bdev_file: file for the block device 919 * 920 * Open the block device. If @holder is not %NULL, the block device is opened 921 * with exclusive access. Exclusive opens may nest for the same @holder. 922 * 923 * CONTEXT: 924 * Might sleep. 925 * 926 * RETURNS: 927 * zero on success, -errno on failure. 928 */ 929 int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, 930 const struct blk_holder_ops *hops, struct file *bdev_file) 931 { 932 bool unblock_events = true; 933 struct gendisk *disk = bdev->bd_disk; 934 int ret; 935 936 if (holder) { 937 mode |= BLK_OPEN_EXCL; 938 ret = bd_prepare_to_claim(bdev, holder, hops); 939 if (ret) 940 return ret; 941 } else { 942 if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) 943 return -EIO; 944 } 945 946 disk_block_events(disk); 947 948 mutex_lock(&disk->open_mutex); 949 ret = -ENXIO; 950 if (!disk_live(disk)) 951 goto abort_claiming; 952 if (!try_module_get(disk->fops->owner)) 953 goto abort_claiming; 954 ret = -EBUSY; 955 if (!bdev_may_open(bdev, mode)) 956 goto put_module; 957 if (bdev_is_partition(bdev)) 958 ret = blkdev_get_part(bdev, mode); 959 else 960 ret = blkdev_get_whole(bdev, mode); 961 if (ret) 962 goto put_module; 963 bdev_claim_write_access(bdev, mode); 964 if (holder) { 965 bd_finish_claiming(bdev, holder, hops); 966 967 /* 968 * Block event polling for write claims if requested. Any write 969 * holder makes the write_holder state stick until all are 970 * released. This is good enough and tracking individual 971 * writeable reference is too fragile given the way @mode is 972 * used in blkdev_get/put(). 973 */ 974 if ((mode & BLK_OPEN_WRITE) && 975 !bdev_test_flag(bdev, BD_WRITE_HOLDER) && 976 (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) { 977 bdev_set_flag(bdev, BD_WRITE_HOLDER); 978 unblock_events = false; 979 } 980 } 981 mutex_unlock(&disk->open_mutex); 982 983 if (unblock_events) 984 disk_unblock_events(disk); 985 986 bdev_file->f_flags |= O_LARGEFILE; 987 bdev_file->f_mode |= FMODE_CAN_ODIRECT; 988 if (bdev_nowait(bdev)) 989 bdev_file->f_mode |= FMODE_NOWAIT; 990 if (mode & BLK_OPEN_RESTRICT_WRITES) 991 bdev_file->f_mode |= FMODE_WRITE_RESTRICTED; 992 bdev_file->f_mapping = bdev->bd_mapping; 993 bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); 994 bdev_file->private_data = holder; 995 996 return 0; 997 put_module: 998 module_put(disk->fops->owner); 999 abort_claiming: 1000 if (holder) 1001 bd_abort_claiming(bdev, holder); 1002 mutex_unlock(&disk->open_mutex); 1003 disk_unblock_events(disk); 1004 return ret; 1005 } 1006 1007 /* 1008 * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk 1009 * associated with the floppy driver where it has allowed ioctls if the 1010 * file was opened for writing, but does not allow reads or writes. 1011 * Make sure that this quirk is reflected in @f_flags. 1012 * 1013 * It can also happen if a block device is opened as O_RDWR | O_WRONLY. 1014 */ 1015 static unsigned blk_to_file_flags(blk_mode_t mode) 1016 { 1017 unsigned int flags = 0; 1018 1019 if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) == 1020 (BLK_OPEN_READ | BLK_OPEN_WRITE)) 1021 flags |= O_RDWR; 1022 else if (mode & BLK_OPEN_WRITE_IOCTL) 1023 flags |= O_RDWR | O_WRONLY; 1024 else if (mode & BLK_OPEN_WRITE) 1025 flags |= O_WRONLY; 1026 else if (mode & BLK_OPEN_READ) 1027 flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */ 1028 else 1029 WARN_ON_ONCE(true); 1030 1031 if (mode & BLK_OPEN_NDELAY) 1032 flags |= O_NDELAY; 1033 1034 return flags; 1035 } 1036 1037 struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, 1038 const struct blk_holder_ops *hops) 1039 { 1040 struct file *bdev_file; 1041 struct block_device *bdev; 1042 unsigned int flags; 1043 int ret; 1044 1045 ret = bdev_permission(dev, mode, holder); 1046 if (ret) 1047 return ERR_PTR(ret); 1048 1049 bdev = blkdev_get_no_open(dev, true); 1050 if (!bdev) 1051 return ERR_PTR(-ENXIO); 1052 1053 flags = blk_to_file_flags(mode); 1054 bdev_file = alloc_file_pseudo_noaccount(BD_INODE(bdev), 1055 blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops); 1056 if (IS_ERR(bdev_file)) { 1057 blkdev_put_no_open(bdev); 1058 return bdev_file; 1059 } 1060 ihold(BD_INODE(bdev)); 1061 1062 ret = bdev_open(bdev, mode, holder, hops, bdev_file); 1063 if (ret) { 1064 /* We failed to open the block device. Let ->release() know. */ 1065 bdev_file->private_data = ERR_PTR(ret); 1066 fput(bdev_file); 1067 return ERR_PTR(ret); 1068 } 1069 return bdev_file; 1070 } 1071 EXPORT_SYMBOL(bdev_file_open_by_dev); 1072 1073 struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, 1074 void *holder, 1075 const struct blk_holder_ops *hops) 1076 { 1077 struct file *file; 1078 dev_t dev; 1079 int error; 1080 1081 error = lookup_bdev(path, &dev); 1082 if (error) 1083 return ERR_PTR(error); 1084 1085 file = bdev_file_open_by_dev(dev, mode, holder, hops); 1086 if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) { 1087 if (bdev_read_only(file_bdev(file))) { 1088 fput(file); 1089 file = ERR_PTR(-EACCES); 1090 } 1091 } 1092 1093 return file; 1094 } 1095 EXPORT_SYMBOL(bdev_file_open_by_path); 1096 1097 static inline void bd_yield_claim(struct file *bdev_file) 1098 { 1099 struct block_device *bdev = file_bdev(bdev_file); 1100 void *holder = bdev_file->private_data; 1101 1102 lockdep_assert_held(&bdev->bd_disk->open_mutex); 1103 1104 if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder))) 1105 return; 1106 1107 if (!bdev_unclaimed(bdev_file)) 1108 bd_end_claim(bdev, holder); 1109 } 1110 1111 void bdev_release(struct file *bdev_file) 1112 { 1113 struct block_device *bdev = file_bdev(bdev_file); 1114 void *holder = bdev_file->private_data; 1115 struct gendisk *disk = bdev->bd_disk; 1116 1117 /* We failed to open that block device. */ 1118 if (IS_ERR(holder)) 1119 goto put_no_open; 1120 1121 /* 1122 * Sync early if it looks like we're the last one. If someone else 1123 * opens the block device between now and the decrement of bd_openers 1124 * then we did a sync that we didn't need to, but that's not the end 1125 * of the world and we want to avoid long (could be several minute) 1126 * syncs while holding the mutex. 1127 */ 1128 if (atomic_read(&bdev->bd_openers) == 1) 1129 sync_blockdev(bdev); 1130 1131 mutex_lock(&disk->open_mutex); 1132 bdev_yield_write_access(bdev_file); 1133 1134 if (holder) 1135 bd_yield_claim(bdev_file); 1136 1137 /* 1138 * Trigger event checking and tell drivers to flush MEDIA_CHANGE 1139 * event. This is to ensure detection of media removal commanded 1140 * from userland - e.g. eject(1). 1141 */ 1142 disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); 1143 1144 if (bdev_is_partition(bdev)) 1145 blkdev_put_part(bdev); 1146 else 1147 blkdev_put_whole(bdev); 1148 mutex_unlock(&disk->open_mutex); 1149 1150 module_put(disk->fops->owner); 1151 put_no_open: 1152 blkdev_put_no_open(bdev); 1153 } 1154 1155 /** 1156 * bdev_fput - yield claim to the block device and put the file 1157 * @bdev_file: open block device 1158 * 1159 * Yield claim on the block device and put the file. Ensure that the 1160 * block device can be reclaimed before the file is closed which is a 1161 * deferred operation. 1162 */ 1163 void bdev_fput(struct file *bdev_file) 1164 { 1165 if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops)) 1166 return; 1167 1168 if (bdev_file->private_data) { 1169 struct block_device *bdev = file_bdev(bdev_file); 1170 struct gendisk *disk = bdev->bd_disk; 1171 1172 mutex_lock(&disk->open_mutex); 1173 bdev_yield_write_access(bdev_file); 1174 bd_yield_claim(bdev_file); 1175 /* 1176 * Tell release we already gave up our hold on the 1177 * device and if write restrictions are available that 1178 * we already gave up write access to the device. 1179 */ 1180 bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host); 1181 mutex_unlock(&disk->open_mutex); 1182 } 1183 1184 fput(bdev_file); 1185 } 1186 EXPORT_SYMBOL(bdev_fput); 1187 1188 /** 1189 * lookup_bdev() - Look up a struct block_device by name. 1190 * @pathname: Name of the block device in the filesystem. 1191 * @dev: Pointer to the block device's dev_t, if found. 1192 * 1193 * Lookup the block device's dev_t at @pathname in the current 1194 * namespace if possible and return it in @dev. 1195 * 1196 * Context: May sleep. 1197 * Return: 0 if succeeded, negative errno otherwise. 1198 */ 1199 int lookup_bdev(const char *pathname, dev_t *dev) 1200 { 1201 struct inode *inode; 1202 struct path path; 1203 int error; 1204 1205 if (!pathname || !*pathname) 1206 return -EINVAL; 1207 1208 error = kern_path(pathname, LOOKUP_FOLLOW, &path); 1209 if (error) 1210 return error; 1211 1212 inode = d_backing_inode(path.dentry); 1213 error = -ENOTBLK; 1214 if (!S_ISBLK(inode->i_mode)) 1215 goto out_path_put; 1216 error = -EACCES; 1217 if (!may_open_dev(&path)) 1218 goto out_path_put; 1219 1220 *dev = inode->i_rdev; 1221 error = 0; 1222 out_path_put: 1223 path_put(&path); 1224 return error; 1225 } 1226 EXPORT_SYMBOL(lookup_bdev); 1227 1228 /** 1229 * bdev_mark_dead - mark a block device as dead 1230 * @bdev: block device to operate on 1231 * @surprise: indicate a surprise removal 1232 * 1233 * Tell the file system that this devices or media is dead. If @surprise is set 1234 * to %true the device or media is already gone, if not we are preparing for an 1235 * orderly removal. 1236 * 1237 * This calls into the file system, which then typicall syncs out all dirty data 1238 * and writes back inodes and then invalidates any cached data in the inodes on 1239 * the file system. In addition we also invalidate the block device mapping. 1240 */ 1241 void bdev_mark_dead(struct block_device *bdev, bool surprise) 1242 { 1243 mutex_lock(&bdev->bd_holder_lock); 1244 if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead) 1245 bdev->bd_holder_ops->mark_dead(bdev, surprise); 1246 else { 1247 mutex_unlock(&bdev->bd_holder_lock); 1248 /* 1249 * On surprise removal the device is already gone; syncing is 1250 * futile and can hang forever waiting on I/O that will never 1251 * complete. Match fs_bdev_mark_dead(), which also skips it. 1252 */ 1253 if (!surprise) 1254 sync_blockdev(bdev); 1255 } 1256 1257 invalidate_bdev(bdev); 1258 } 1259 /* 1260 * New drivers should not use this directly. There are some drivers however 1261 * that needs this for historical reasons. For example, the DASD driver has 1262 * historically had a shutdown to offline mode that doesn't actually remove the 1263 * gendisk that otherwise looks a lot like a safe device removal. 1264 */ 1265 EXPORT_SYMBOL_GPL(bdev_mark_dead); 1266 1267 void sync_bdevs(bool wait) 1268 { 1269 struct inode *inode, *old_inode = NULL; 1270 1271 spin_lock(&blockdev_superblock->s_inode_list_lock); 1272 list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { 1273 struct address_space *mapping = inode->i_mapping; 1274 struct block_device *bdev; 1275 1276 spin_lock(&inode->i_lock); 1277 if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW) || 1278 mapping->nrpages == 0) { 1279 spin_unlock(&inode->i_lock); 1280 continue; 1281 } 1282 __iget(inode); 1283 spin_unlock(&inode->i_lock); 1284 spin_unlock(&blockdev_superblock->s_inode_list_lock); 1285 /* 1286 * We hold a reference to 'inode' so it couldn't have been 1287 * removed from s_inodes list while we dropped the 1288 * s_inode_list_lock We cannot iput the inode now as we can 1289 * be holding the last reference and we cannot iput it under 1290 * s_inode_list_lock. So we keep the reference and iput it 1291 * later. 1292 */ 1293 iput(old_inode); 1294 old_inode = inode; 1295 bdev = I_BDEV(inode); 1296 1297 mutex_lock(&bdev->bd_disk->open_mutex); 1298 if (!atomic_read(&bdev->bd_openers)) { 1299 ; /* skip */ 1300 } else if (wait) { 1301 /* 1302 * We keep the error status of individual mapping so 1303 * that applications can catch the writeback error using 1304 * fsync(2). See filemap_fdatawait_keep_errors() for 1305 * details. 1306 */ 1307 filemap_fdatawait_keep_errors(inode->i_mapping); 1308 } else { 1309 filemap_fdatawrite(inode->i_mapping); 1310 } 1311 mutex_unlock(&bdev->bd_disk->open_mutex); 1312 1313 spin_lock(&blockdev_superblock->s_inode_list_lock); 1314 } 1315 spin_unlock(&blockdev_superblock->s_inode_list_lock); 1316 iput(old_inode); 1317 } 1318 1319 /* 1320 * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices. 1321 */ 1322 void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask) 1323 { 1324 struct block_device *bdev; 1325 1326 /* 1327 * Note that d_backing_inode() returns the block device node inode, not 1328 * the block device's internal inode. Therefore it is *not* valid to 1329 * use I_BDEV() here; the block device has to be looked up by i_rdev 1330 * instead. 1331 */ 1332 bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev, false); 1333 if (!bdev) 1334 return; 1335 1336 if (request_mask & STATX_DIOALIGN) { 1337 stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; 1338 stat->dio_offset_align = bdev_logical_block_size(bdev); 1339 stat->result_mask |= STATX_DIOALIGN; 1340 } 1341 1342 if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) { 1343 struct request_queue *bd_queue = bdev->bd_queue; 1344 1345 generic_fill_statx_atomic_writes(stat, 1346 queue_atomic_write_unit_min_bytes(bd_queue), 1347 queue_atomic_write_unit_max_bytes(bd_queue), 1348 0); 1349 } 1350 1351 stat->blksize = bdev_io_min(bdev); 1352 1353 blkdev_put_no_open(bdev); 1354 } 1355 1356 bool disk_live(struct gendisk *disk) 1357 { 1358 return !inode_unhashed(BD_INODE(disk->part0)); 1359 } 1360 EXPORT_SYMBOL_GPL(disk_live); 1361 1362 unsigned int block_size(struct block_device *bdev) 1363 { 1364 return 1 << BD_INODE(bdev)->i_blkbits; 1365 } 1366 EXPORT_SYMBOL_GPL(block_size); 1367 1368 static int __init setup_bdev_allow_write_mounted(char *str) 1369 { 1370 if (kstrtobool(str, &bdev_allow_write_mounted)) 1371 pr_warn("Invalid option string for bdev_allow_write_mounted:" 1372 " '%s'\n", str); 1373 return 1; 1374 } 1375 __setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted); 1376