1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 5 * Copyright (C) 2016 - 2020 Christoph Hellwig 6 */ 7 8 #include <linux/init.h> 9 #include <linux/mm.h> 10 #include <linux/slab.h> 11 #include <linux/kmod.h> 12 #include <linux/major.h> 13 #include <linux/device_cgroup.h> 14 #include <linux/blkdev.h> 15 #include <linux/blk-integrity.h> 16 #include <linux/backing-dev.h> 17 #include <linux/module.h> 18 #include <linux/blkpg.h> 19 #include <linux/magic.h> 20 #include <linux/buffer_head.h> 21 #include <linux/swap.h> 22 #include <linux/writeback.h> 23 #include <linux/mount.h> 24 #include <linux/pseudo_fs.h> 25 #include <linux/uio.h> 26 #include <linux/namei.h> 27 #include <linux/security.h> 28 #include <linux/part_stat.h> 29 #include <linux/uaccess.h> 30 #include <linux/stat.h> 31 #include "../fs/internal.h" 32 #include "blk.h" 33 34 /* Should we allow writing to mounted block devices? */ 35 static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED); 36 37 struct bdev_inode { 38 struct block_device bdev; 39 struct inode vfs_inode; 40 }; 41 42 static inline struct bdev_inode *BDEV_I(struct inode *inode) 43 { 44 return container_of(inode, struct bdev_inode, vfs_inode); 45 } 46 47 static inline struct inode *BD_INODE(struct block_device *bdev) 48 { 49 return &container_of(bdev, struct bdev_inode, bdev)->vfs_inode; 50 } 51 52 struct block_device *I_BDEV(struct inode *inode) 53 { 54 return &BDEV_I(inode)->bdev; 55 } 56 EXPORT_SYMBOL(I_BDEV); 57 58 struct block_device *file_bdev(struct file *bdev_file) 59 { 60 return I_BDEV(bdev_file->f_mapping->host); 61 } 62 EXPORT_SYMBOL(file_bdev); 63 64 static void bdev_write_inode(struct block_device *bdev) 65 { 66 struct inode *inode = BD_INODE(bdev); 67 int ret; 68 69 spin_lock(&inode->i_lock); 70 while (inode->i_state & I_DIRTY) { 71 spin_unlock(&inode->i_lock); 72 ret = write_inode_now(inode, true); 73 if 
(ret) 74 pr_warn_ratelimited( 75 "VFS: Dirty inode writeback failed for block device %pg (err=%d).\n", 76 bdev, ret); 77 spin_lock(&inode->i_lock); 78 } 79 spin_unlock(&inode->i_lock); 80 } 81 82 /* Kill _all_ buffers and pagecache , dirty or not.. */ 83 static void kill_bdev(struct block_device *bdev) 84 { 85 struct address_space *mapping = bdev->bd_mapping; 86 87 if (mapping_empty(mapping)) 88 return; 89 90 invalidate_bh_lrus(); 91 truncate_inode_pages(mapping, 0); 92 } 93 94 /* Invalidate clean unused buffers and pagecache. */ 95 void invalidate_bdev(struct block_device *bdev) 96 { 97 struct address_space *mapping = bdev->bd_mapping; 98 99 if (mapping->nrpages) { 100 invalidate_bh_lrus(); 101 lru_add_drain_all(); /* make sure all lru add caches are flushed */ 102 invalidate_mapping_pages(mapping, 0, -1); 103 } 104 } 105 EXPORT_SYMBOL(invalidate_bdev); 106 107 /* 108 * Drop all buffers & page cache for given bdev range. This function bails 109 * with error if bdev has other exclusive owner (such as filesystem). 110 */ 111 int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, 112 loff_t lstart, loff_t lend) 113 { 114 /* 115 * If we don't hold exclusive handle for the device, upgrade to it 116 * while we discard the buffer cache to avoid discarding buffers 117 * under live filesystem. 118 */ 119 if (!(mode & BLK_OPEN_EXCL)) { 120 int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL); 121 if (err) 122 goto invalidate; 123 } 124 125 truncate_inode_pages_range(bdev->bd_mapping, lstart, lend); 126 if (!(mode & BLK_OPEN_EXCL)) 127 bd_abort_claiming(bdev, truncate_bdev_range); 128 return 0; 129 130 invalidate: 131 /* 132 * Someone else has handle exclusively open. Try invalidating instead. 133 * The 'end' argument is inclusive so the rounding is safe. 
134 */ 135 return invalidate_inode_pages2_range(bdev->bd_mapping, 136 lstart >> PAGE_SHIFT, 137 lend >> PAGE_SHIFT); 138 } 139 140 static void set_init_blocksize(struct block_device *bdev) 141 { 142 unsigned int bsize = bdev_logical_block_size(bdev); 143 loff_t size = i_size_read(BD_INODE(bdev)); 144 145 while (bsize < PAGE_SIZE) { 146 if (size & bsize) 147 break; 148 bsize <<= 1; 149 } 150 BD_INODE(bdev)->i_blkbits = blksize_bits(bsize); 151 } 152 153 int set_blocksize(struct file *file, int size) 154 { 155 struct inode *inode = file->f_mapping->host; 156 struct block_device *bdev = I_BDEV(inode); 157 158 if (blk_validate_block_size(size)) 159 return -EINVAL; 160 161 /* Size cannot be smaller than the size supported by the device */ 162 if (size < bdev_logical_block_size(bdev)) 163 return -EINVAL; 164 165 if (!file->private_data) 166 return -EINVAL; 167 168 /* Don't change the size if it is same as current */ 169 if (inode->i_blkbits != blksize_bits(size)) { 170 sync_blockdev(bdev); 171 inode->i_blkbits = blksize_bits(size); 172 kill_bdev(bdev); 173 } 174 return 0; 175 } 176 177 EXPORT_SYMBOL(set_blocksize); 178 179 int sb_set_blocksize(struct super_block *sb, int size) 180 { 181 if (set_blocksize(sb->s_bdev_file, size)) 182 return 0; 183 /* If we get here, we know size is power of two 184 * and it's value is between 512 and PAGE_SIZE */ 185 sb->s_blocksize = size; 186 sb->s_blocksize_bits = blksize_bits(size); 187 return sb->s_blocksize; 188 } 189 190 EXPORT_SYMBOL(sb_set_blocksize); 191 192 int sb_min_blocksize(struct super_block *sb, int size) 193 { 194 int minsize = bdev_logical_block_size(sb->s_bdev); 195 if (size < minsize) 196 size = minsize; 197 return sb_set_blocksize(sb, size); 198 } 199 200 EXPORT_SYMBOL(sb_min_blocksize); 201 202 int sync_blockdev_nowait(struct block_device *bdev) 203 { 204 if (!bdev) 205 return 0; 206 return filemap_flush(bdev->bd_mapping); 207 } 208 EXPORT_SYMBOL_GPL(sync_blockdev_nowait); 209 210 /* 211 * Write out and wait upon all 
the dirty data associated with a block 212 * device via its mapping. Does not take the superblock lock. 213 */ 214 int sync_blockdev(struct block_device *bdev) 215 { 216 if (!bdev) 217 return 0; 218 return filemap_write_and_wait(bdev->bd_mapping); 219 } 220 EXPORT_SYMBOL(sync_blockdev); 221 222 int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) 223 { 224 return filemap_write_and_wait_range(bdev->bd_mapping, 225 lstart, lend); 226 } 227 EXPORT_SYMBOL(sync_blockdev_range); 228 229 /** 230 * bdev_freeze - lock a filesystem and force it into a consistent state 231 * @bdev: blockdevice to lock 232 * 233 * If a superblock is found on this device, we take the s_umount semaphore 234 * on it to make sure nobody unmounts until the snapshot creation is done. 235 * The reference counter (bd_fsfreeze_count) guarantees that only the last 236 * unfreeze process can unfreeze the frozen filesystem actually when multiple 237 * freeze requests arrive simultaneously. It counts up in bdev_freeze() and 238 * count down in bdev_thaw(). When it becomes 0, thaw_bdev() will unfreeze 239 * actually. 240 * 241 * Return: On success zero is returned, negative error code on failure. 
*/
int bdev_freeze(struct block_device *bdev)
{
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	/* Nested freeze: only the first caller does the actual work. */
	if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return 0;
	}

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
		error = bdev->bd_holder_ops->freeze(bdev);
		/* ->freeze() is expected to have dropped bd_holder_lock. */
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		/* No holder callback: just write out the device's cache. */
		mutex_unlock(&bdev->bd_holder_lock);
		error = sync_blockdev(bdev);
	}

	/* Undo the count bump taken above if freezing failed. */
	if (error)
		atomic_dec(&bdev->bd_fsfreeze_count);

	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_freeze);

/**
 * bdev_thaw - unlock filesystem
 * @bdev:	blockdevice to unlock
 *
 * Unlocks the filesystem and marks it writeable again after bdev_freeze().
 *
 * Return: On success zero is returned, negative error code on failure.
 * -EINVAL is returned when the device was not frozen.
 */
int bdev_thaw(struct block_device *bdev)
{
	int error = -EINVAL, nr_freeze;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	/*
	 * If this returns < 0 it means that @bd_fsfreeze_count was
	 * already 0 and no decrement was performed.
	 */
	nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
	if (nr_freeze < 0)
		goto out;

	/* Other freezers remain: the decrement is all there is to do. */
	error = 0;
	if (nr_freeze > 0)
		goto out;

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
		error = bdev->bd_holder_ops->thaw(bdev);
		/* ->thaw() is expected to have dropped bd_holder_lock. */
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		mutex_unlock(&bdev->bd_holder_lock);
	}

	/* Thaw failed: the device stays frozen, so restore the count. */
	if (error)
		atomic_inc(&bdev->bd_fsfreeze_count);
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_thaw);

/*
 * pseudo-fs
 */

/* Serialises the claim state (bd_holder, bd_holders, bd_claiming). */
static  __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
/* Slab cache for struct bdev_inode, created in bdev_cache_init(). */
static struct kmem_cache *bdev_cachep __ro_after_init;

/*
 * ->alloc_inode: allocate a bdev_inode, zero the embedded block_device and
 * set up its security state.  Returns the embedded VFS inode, or NULL if
 * either the allocation or security_bdev_alloc() fails.
 */
static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);

	if (!ei)
		return NULL;
	memset(&ei->bdev, 0, sizeof(ei->bdev));

	if (security_bdev_alloc(&ei->bdev)) {
		kmem_cache_free(bdev_cachep, ei);
		return NULL;
	}
	return &ei->vfs_inode;
}

/*
 * ->free_inode: release everything hanging off the block_device and then
 * the bdev_inode itself.
 */
static void bdev_free_inode(struct inode *inode)
{
	struct block_device *bdev = I_BDEV(inode);

	free_percpu(bdev->bd_stats);
	kfree(bdev->bd_meta_info);
	security_bdev_free(bdev);

	/* The whole-disk device also drops the bdi ref and frees the gendisk. */
	if (!bdev_is_partition(bdev)) {
		if (bdev->bd_disk && bdev->bd_disk->bdi)
			bdi_put(bdev->bd_disk->bdi);
		kfree(bdev->bd_disk);
	}

	/* Give back a minor that came from the extended major. */
	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(MINOR(bdev->bd_dev));

	kmem_cache_free(bdev_cachep, BDEV_I(inode));
}

/* Constructor handed to kmem_cache_create() in bdev_cache_init(). */
static void init_once(void *data)
{
	struct bdev_inode *ei = data;

	inode_init_once(&ei->vfs_inode);
}

/* ->evict_inode: drop the page cache and buffers, then clear the inode. */
static void bdev_evict_inode(struct inode *inode)
{
	truncate_inode_pages_final(&inode->i_data);
	invalidate_inode_buffers(inode); /* is it needed here? */
	clear_inode(inode);
}

/* Superblock operations of the bdev pseudo filesystem. */
static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.free_inode = bdev_free_inode,
	.drop_inode = generic_delete_inode,
	.evict_inode = bdev_evict_inode,
};

/*
 * Set up the pseudo-fs context: BDEVFS_MAGIC, cgroup writeback flag and
 * bdev_sops.  Returns -ENOMEM if init_pseudo() fails.
 */
static int bd_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	fc->s_iflags |= SB_I_CGROUPWB;
	ctx->ops = &bdev_sops;
	return 0;
}

static struct file_system_type bd_type = {
	.name		= "bdev",
	.init_fs_context = bd_init_fs_context,
	.kill_sb	= kill_anon_super,
};

/* Superblock and internal mount of the bdev pseudo-fs, set once at init. */
struct super_block *blockdev_superblock __ro_after_init;
static struct vfsmount *blockdev_mnt __ro_after_init;
EXPORT_SYMBOL_GPL(blockdev_superblock);

/*
 * Create the bdev_inode slab cache, register the "bdev" pseudo filesystem
 * and mount it internally.  Panics if registration or mounting fails.
 */
void __init bdev_cache_init(void)
{
	int err;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_ACCOUNT|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	blockdev_mnt = kern_mount(&bd_type);
	if (IS_ERR(blockdev_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = blockdev_mnt->mnt_sb;   /* For writeback */
}

/*
 * Allocate and initialise a block_device for partition @partno of @disk
 * (callers pass 0 for the whole device, presumably -- confirm with callers).
 * Returns the new block_device, or NULL if the inode or the per-cpu
 * statistics cannot be allocated.
 */
struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = new_inode(blockdev_superblock);
	if (!inode)
		return NULL;
	inode->i_mode = S_IFBLK;
	inode->i_rdev = 0;
	inode->i_data.a_ops = &def_blk_aops;
	mapping_set_gfp_mask(&inode->i_data, GFP_USER);

	bdev = I_BDEV(inode);
	mutex_init(&bdev->bd_fsfreeze_mutex);
	spin_lock_init(&bdev->bd_size_lock);
	mutex_init(&bdev->bd_holder_lock);
	/* The flags word starts out holding just the partition number. */
	atomic_set(&bdev->__bd_flags, partno);
	bdev->bd_mapping = &inode->i_data;
	bdev->bd_queue = disk->queue;
	/* Partitions inherit BD_HAS_SUBMIT_BIO from the whole device. */
	if (partno && bdev_test_flag(disk->part0, BD_HAS_SUBMIT_BIO))
		bdev_set_flag(bdev, BD_HAS_SUBMIT_BIO);
	bdev->bd_stats = alloc_percpu(struct disk_stats);
	if (!bdev->bd_stats) {
		iput(inode);
		return NULL;
	}
	bdev->bd_disk = disk;
	return bdev;
}

/*
 * Update the device size: both the inode size (in bytes) and bd_nr_sectors
 * are written under bd_size_lock.
 */
void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
	spin_lock(&bdev->bd_size_lock);
	i_size_write(BD_INODE(bdev), (loff_t)sectors << SECTOR_SHIFT);
	bdev->bd_nr_sectors = sectors;
	spin_unlock(&bdev->bd_size_lock);
}

/*
 * Publish @bdev under device number @dev: record @dev in the block_device
 * and its inode (also used as the inode number) and hash the inode.
 */
void bdev_add(struct block_device *bdev, dev_t dev)
{
	struct inode *inode = BD_INODE(bdev);
	if (bdev_stable_writes(bdev))
		mapping_set_stable_writes(bdev->bd_mapping);
	bdev->bd_dev = dev;
	inode->i_rdev = dev;
	inode->i_ino = dev;
	insert_inode_hash(inode);
}

/* Remove the block device inode from the inode hash. */
void bdev_unhash(struct block_device *bdev)
{
	remove_inode_hash(BD_INODE(bdev));
}

/* Drop a reference on the block device inode. */
void bdev_drop(struct block_device *bdev)
{
	iput(BD_INODE(bdev));
}

/* Sum of page cache pages over all inodes of the bdev superblock. */
long nr_blockdev_pages(void)
{
	struct inode *inode;
	long ret = 0;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
		ret += inode->i_mapping->nrpages;
	spin_unlock(&blockdev_superblock->s_inode_list_lock);

	return ret;
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	lockdep_assert_held(&bdev_lock);

	if (bdev->bd_holder) {
		/*
		 * The same holder can always re-claim.
		 */
		if (bdev->bd_holder == holder) {
			/* ...but only with the same holder ops as before. */
			if (WARN_ON_ONCE(bdev->bd_holder_ops != hops))
				return false;
			return true;
		}
		return false;
	}

	/*
	 * If the whole devices holder is set to bd_may_claim, a partition on
	 * the device is claimed, but not the whole device.
	 */
	if (whole != bdev &&
	    whole->bd_holder && whole->bd_holder != bd_may_claim)
		return false;
	return true;
}

/**
 * bd_prepare_to_claim - claim a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops.
 *
 * Claim @bdev.  This function fails if @bdev is already claimed by another
 * holder and waits if another claiming is in progress. On successful return,
 * the caller has ownership of bd_claiming and bd_holder[s].
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.  -EINVAL if @holder is NULL.
 */
int bd_prepare_to_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	if (WARN_ON_ONCE(!holder))
		return -EINVAL;
retry:
	mutex_lock(&bdev_lock);
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, holder, hops)) {
		mutex_unlock(&bdev_lock);
		return -EBUSY;
	}

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = __var_waitqueue(&whole->bd_claiming);
		DEFINE_WAIT(wait);

		/*
		 * Queue ourselves before dropping bdev_lock; the wakeup comes
		 * from wake_up_var() in bd_clear_claiming(), which runs under
		 * bdev_lock.
		 */
		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		goto retry;
	}

	/* yay, all mine */
	whole->bd_claiming = holder;
	mutex_unlock(&bdev_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */

/*
 * Reset bd_claiming on @whole and wake anybody waiting in
 * bd_prepare_to_claim().  Caller must hold bdev_lock and must be the
 * current claimer (BUG otherwise).
 */
static void bd_clear_claiming(struct block_device *whole, void *holder)
{
	lockdep_assert_held(&bdev_lock);
	/* tell others that we're done */
	BUG_ON(whole->bd_claiming != holder);
	whole->bd_claiming = NULL;
	wake_up_var(&whole->bd_claiming);
}

/**
 * bd_finish_claiming - finish claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 * @hops: block device holder operations
 *
 * Finish exclusive open of a block device. Mark the device as exclusively
 * open by the holder and wake up all waiters for exclusive open to finish.
 */
static void bd_finish_claiming(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	mutex_lock(&bdev_lock);
	BUG_ON(!bd_may_claim(bdev, holder, hops));
	/*
	 * Note that for a whole device bd_holders will be incremented twice,
	 * and bd_holder will be set to bd_may_claim before being set to holder
	 */
	whole->bd_holders++;
	whole->bd_holder = bd_may_claim;
	bdev->bd_holders++;
	/* bd_holder and bd_holder_ops are updated under bd_holder_lock. */
	mutex_lock(&bdev->bd_holder_lock);
	bdev->bd_holder = holder;
	bdev->bd_holder_ops = hops;
	mutex_unlock(&bdev->bd_holder_lock);
	bd_clear_claiming(whole, holder);
	mutex_unlock(&bdev_lock);
}

/**
 * bd_abort_claiming - abort claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 *
 * Abort claiming of a block device when the exclusive open failed. This can be
 * also used when exclusive open is not actually desired and we just needed
 * to block other exclusive openers for a while.
*/
void bd_abort_claiming(struct block_device *bdev, void *holder)
{
	mutex_lock(&bdev_lock);
	bd_clear_claiming(bdev_whole(bdev), holder);
	mutex_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);

/*
 * Drop one claim of @holder on @bdev (and on the whole device).  When the
 * last claim on @bdev goes away the holder fields are cleared and, if the
 * device was flagged BD_WRITE_HOLDER, disk events are unblocked again.
 */
static void bd_end_claim(struct block_device *bdev, void *holder)
{
	struct block_device *whole = bdev_whole(bdev);
	bool unblock = false;

	/*
	 * Release a claim on the device.  The holder fields are protected with
	 * bdev_lock.  open_mutex is used to synchronize disk_holder unlinking.
	 */
	mutex_lock(&bdev_lock);
	WARN_ON_ONCE(bdev->bd_holder != holder);
	WARN_ON_ONCE(--bdev->bd_holders < 0);
	WARN_ON_ONCE(--whole->bd_holders < 0);
	if (!bdev->bd_holders) {
		mutex_lock(&bdev->bd_holder_lock);
		bdev->bd_holder = NULL;
		bdev->bd_holder_ops = NULL;
		mutex_unlock(&bdev->bd_holder_lock);
		if (bdev_test_flag(bdev, BD_WRITE_HOLDER))
			unblock = true;
	}
	if (!whole->bd_holders)
		whole->bd_holder = NULL;
	mutex_unlock(&bdev_lock);

	/*
	 * If this was the last claim, remove holder link and unblock event
	 * polling if it was a write holder.
	 */
	if (unblock) {
		disk_unblock_events(bdev->bd_disk);
		bdev_clear_flag(bdev, BD_WRITE_HOLDER);
	}
}

/*
 * Called when the last opener goes away: write everything out, drop the
 * whole cache and write back the inode.  Warns if holders remain.
 */
static void blkdev_flush_mapping(struct block_device *bdev)
{
	WARN_ON_ONCE(bdev->bd_holders);
	sync_blockdev(bdev);
	kill_bdev(bdev);
	bdev_write_inode(bdev);
}

/*
 * Drop one opener of a whole device; the last one flushes the mapping.
 * The driver's ->release(), if any, is called on every put.
 */
static void blkdev_put_whole(struct block_device *bdev)
{
	if (atomic_dec_and_test(&bdev->bd_openers))
		blkdev_flush_mapping(bdev);
	if (bdev->bd_disk->fops->release)
		bdev->bd_disk->fops->release(bdev->bd_disk);
}

/*
 * Open the whole device: call the driver's ->open(), account the opener
 * and rescan partitions if GD_NEED_PART_SCAN is set.
 * Returns 0 or a negative errno.
 */
static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (disk->fops->open) {
		ret = disk->fops->open(disk, mode);
		if (ret) {
			/* avoid ghost partitions on a removed medium */
			if (ret == -ENOMEDIUM &&
			     test_bit(GD_NEED_PART_SCAN, &disk->state))
				bdev_disk_changed(disk, true);
			return ret;
		}
	}

	/* First opener picks the initial block size. */
	if (!atomic_read(&bdev->bd_openers))
		set_init_blocksize(bdev);
	atomic_inc(&bdev->bd_openers);
	if (test_bit(GD_NEED_PART_SCAN, &disk->state)) {
		/*
		 * Only return scanning errors if we are called from contexts
		 * that explicitly want them, e.g. the BLKRRPART ioctl.
		 */
		ret = bdev_disk_changed(disk, false);
		if (ret && (mode & BLK_OPEN_STRICT_SCAN)) {
			blkdev_put_whole(bdev);
			return ret;
		}
	}
	return 0;
}

/*
 * Open a partition.  The whole device is opened first; a partition with
 * zero sectors fails with -ENXIO and the whole-device open is undone.
 */
static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
{
	struct gendisk *disk = part->bd_disk;
	int ret;

	ret = blkdev_get_whole(bdev_whole(part), mode);
	if (ret)
		return ret;

	ret = -ENXIO;
	if (!bdev_nr_sectors(part))
		goto out_blkdev_put;

	/* First opener of this partition. */
	if (!atomic_read(&part->bd_openers)) {
		disk->open_partitions++;
		set_init_blocksize(part);
	}
	atomic_inc(&part->bd_openers);
	return 0;

out_blkdev_put:
	blkdev_put_whole(bdev_whole(part));
	return ret;
}

/*
 * Check whether opening @dev with @mode and @holder is allowed.
 * Returns the device cgroup's verdict if that is non-zero, -EINVAL for
 * BLK_OPEN_RESTRICT_WRITES without a holder or for an error-pointer
 * holder, and 0 otherwise.
 */
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder)
{
	int ret;

	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
			MAJOR(dev), MINOR(dev),
			((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
			((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
	if (ret)
		return ret;

	/* Blocking writes requires exclusive opener */
	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
		return -EINVAL;

	/*
	 * We're using error pointers to indicate to ->release() when we
	 * failed to open that block device. Also this doesn't make sense.
	 */
	if (WARN_ON_ONCE(IS_ERR(holder)))
		return -EINVAL;

	return 0;
}

/*
 * Drop one opener of a partition (the last one flushes its mapping), then
 * drop the matching opener of the whole device.
 */
static void blkdev_put_part(struct block_device *part)
{
	struct block_device *whole = bdev_whole(part);

	if (atomic_dec_and_test(&part->bd_openers)) {
		blkdev_flush_mapping(part);
		whole->bd_disk->open_partitions--;
	}
	blkdev_put_whole(whole);
}

/*
 * Look up the block device for @dev without opening it.  With
 * CONFIG_BLOCK_LEGACY_AUTOLOAD a failed lookup triggers a module request
 * and one retry.  Returns the device with a bd_device reference held
 * (drop it with blkdev_put_no_open()), or NULL.
 */
struct block_device *blkdev_get_no_open(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = ilookup(blockdev_superblock, dev);
	if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
		blk_request_module(dev);
		inode = ilookup(blockdev_superblock, dev);
		if (inode)
			pr_warn_ratelimited(
"block device autoloading is deprecated and will be removed.\n");
	}
	if (!inode)
		return NULL;

	/* switch from the inode reference to a device mode one: */
	bdev = &BDEV_I(inode)->bdev;
	if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
		bdev = NULL;
	iput(inode);
	return bdev;
}

/* Drop the device reference taken by blkdev_get_no_open(). */
void blkdev_put_no_open(struct block_device *bdev)
{
	put_device(&bdev->bd_device);
}

/*
 * bd_writers accounting: each plain writer counts +1, each opener that
 * restricts writes counts -1, so a negative value means writes are blocked.
 */
static bool bdev_writes_blocked(struct block_device *bdev)
{
	return bdev->bd_writers < 0;
}

static void bdev_block_writes(struct block_device *bdev)
{
	bdev->bd_writers--;
}

static void bdev_unblock_writes(struct block_device *bdev)
{
	bdev->bd_writers++;
}

/*
 * Would an open with @mode conflict with the current write restrictions?
 * Always true when writing to mounted devices is allowed.
 */
static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return true;
	/* Writes blocked? */
	if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
		return false;
	/* Cannot start restricting writes while writers exist. */
	if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
		return false;
	return true;
}

/* Account a successful open in bd_writers (see bdev_writes_blocked()). */
static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return;

	/* Claim exclusive or shared write access. */
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_block_writes(bdev);
	else if (mode & BLK_OPEN_WRITE)
		bdev->bd_writers++;
}

/*
 * True once bdev_fput() has replaced private_data with the bdev_inode
 * marker, i.e. claim and write access were already given up.
 */
static inline bool bdev_unclaimed(const struct file *bdev_file)
{
	return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host);
}

/* Undo bdev_claim_write_access() for @bdev_file, at most once. */
static void bdev_yield_write_access(struct file *bdev_file)
{
	struct block_device *bdev;

	if (bdev_allow_write_mounted)
		return;

	/* Already yielded by bdev_fput(). */
	if (bdev_unclaimed(bdev_file))
		return;

	bdev = file_bdev(bdev_file);

	if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED)
		bdev_unblock_writes(bdev);
	else if (bdev_file->f_mode & FMODE_WRITE)
		bdev->bd_writers--;
}

/**
 * bdev_open - open a block device
 * @bdev: block device to open
 * @mode: open mode (BLK_OPEN_*)
 * @holder: exclusive holder identifier
 * @hops: holder operations
 * @bdev_file: file for the block device
 *
 * Open the block device. If @holder is not %NULL, the block device is opened
 * with exclusive access.  Exclusive opens may nest for the same @holder.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * zero on success, -errno on failure.
*/
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
	      const struct blk_holder_ops *hops, struct file *bdev_file)
{
	/* Cleared only when this open becomes a sticky write holder below. */
	bool unblock_events = true;
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (holder) {
		/* A holder implies an exclusive open: start claiming now. */
		mode |= BLK_OPEN_EXCL;
		ret = bd_prepare_to_claim(bdev, holder, hops);
		if (ret)
			return ret;
	} else {
		/* Exclusive mode without a holder is a caller bug. */
		if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL))
			return -EIO;
	}

	disk_block_events(disk);

	mutex_lock(&disk->open_mutex);
	ret = -ENXIO;
	if (!disk_live(disk))
		goto abort_claiming;
	/* This module reference is dropped again in bdev_release(). */
	if (!try_module_get(disk->fops->owner))
		goto abort_claiming;
	ret = -EBUSY;
	if (!bdev_may_open(bdev, mode))
		goto put_module;
	if (bdev_is_partition(bdev))
		ret = blkdev_get_part(bdev, mode);
	else
		ret = blkdev_get_whole(bdev, mode);
	if (ret)
		goto put_module;
	bdev_claim_write_access(bdev, mode);
	if (holder) {
		bd_finish_claiming(bdev, holder, hops);

		/*
		 * Block event polling for write claims if requested.  Any write
		 * holder makes the write_holder state stick until all are
		 * released.  This is good enough and tracking individual
		 * writeable reference is too fragile given the way @mode is
		 * used in blkdev_get/put().
		 */
		if ((mode & BLK_OPEN_WRITE) &&
		    !bdev_test_flag(bdev, BD_WRITE_HOLDER) &&
		    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
			bdev_set_flag(bdev, BD_WRITE_HOLDER);
			unblock_events = false;
		}
	}
	mutex_unlock(&disk->open_mutex);

	if (unblock_events)
		disk_unblock_events(disk);

	/* Wire the file up to the device's mapping and record the holder. */
	bdev_file->f_flags |= O_LARGEFILE;
	bdev_file->f_mode |= FMODE_CAN_ODIRECT;
	if (bdev_nowait(bdev))
		bdev_file->f_mode |= FMODE_NOWAIT;
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_file->f_mode |= FMODE_WRITE_RESTRICTED;
	bdev_file->f_mapping = bdev->bd_mapping;
	bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
	bdev_file->private_data = holder;

	return 0;
put_module:
	module_put(disk->fops->owner);
abort_claiming:
	/* Undo bd_prepare_to_claim() and the event blocking from above. */
	if (holder)
		bd_abort_claiming(bdev, holder);
	mutex_unlock(&disk->open_mutex);
	disk_unblock_events(disk);
	return ret;
}

/*
 * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk
 * associated with the floppy driver where it has allowed ioctls if the
 * file was opened for writing, but does not allow reads or writes.
 * Make sure that this quirk is reflected in @f_flags.
 *
 * It can also happen if a block device is opened as O_RDWR | O_WRONLY.
969 */ 970 static unsigned blk_to_file_flags(blk_mode_t mode) 971 { 972 unsigned int flags = 0; 973 974 if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) == 975 (BLK_OPEN_READ | BLK_OPEN_WRITE)) 976 flags |= O_RDWR; 977 else if (mode & BLK_OPEN_WRITE_IOCTL) 978 flags |= O_RDWR | O_WRONLY; 979 else if (mode & BLK_OPEN_WRITE) 980 flags |= O_WRONLY; 981 else if (mode & BLK_OPEN_READ) 982 flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */ 983 else 984 WARN_ON_ONCE(true); 985 986 if (mode & BLK_OPEN_NDELAY) 987 flags |= O_NDELAY; 988 989 return flags; 990 } 991 992 struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, 993 const struct blk_holder_ops *hops) 994 { 995 struct file *bdev_file; 996 struct block_device *bdev; 997 unsigned int flags; 998 int ret; 999 1000 ret = bdev_permission(dev, mode, holder); 1001 if (ret) 1002 return ERR_PTR(ret); 1003 1004 bdev = blkdev_get_no_open(dev); 1005 if (!bdev) 1006 return ERR_PTR(-ENXIO); 1007 1008 flags = blk_to_file_flags(mode); 1009 bdev_file = alloc_file_pseudo_noaccount(BD_INODE(bdev), 1010 blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops); 1011 if (IS_ERR(bdev_file)) { 1012 blkdev_put_no_open(bdev); 1013 return bdev_file; 1014 } 1015 ihold(BD_INODE(bdev)); 1016 1017 ret = bdev_open(bdev, mode, holder, hops, bdev_file); 1018 if (ret) { 1019 /* We failed to open the block device. Let ->release() know. 
*/
		bdev_file->private_data = ERR_PTR(ret);
		fput(bdev_file);
		return ERR_PTR(ret);
	}
	return bdev_file;
}
EXPORT_SYMBOL(bdev_file_open_by_dev);

/**
 * bdev_file_open_by_path - open a block device by the path of its device node
 * @path: path name that is resolved to a dev_t with lookup_bdev()
 * @mode: open mode, passed on to bdev_file_open_by_dev()
 * @holder: holder, passed on to bdev_file_open_by_dev()
 * @hops: holder operations, passed on to bdev_file_open_by_dev()
 *
 * Resolve @path to a device number and open that device with
 * bdev_file_open_by_dev().  If @mode contains %BLK_OPEN_WRITE and
 * bdev_read_only() reports the opened device as read-only, the file is put
 * again and the open fails.
 *
 * Return: the file returned by bdev_file_open_by_dev() on success.  Otherwise
 * an ERR_PTR(): the lookup_bdev() error, the error pointer returned by
 * bdev_file_open_by_dev(), or -EACCES for a write open of a read-only device.
 */
struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
				    void *holder,
				    const struct blk_holder_ops *hops)
{
	struct file *file;
	dev_t dev;
	int error;

	error = lookup_bdev(path, &dev);
	if (error)
		return ERR_PTR(error);

	file = bdev_file_open_by_dev(dev, mode, holder, hops);
	/* The read-only check is only done after a successful write open. */
	if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) {
		if (bdev_read_only(file_bdev(file))) {
			fput(file);
			file = ERR_PTR(-EACCES);
		}
	}

	return file;
}
EXPORT_SYMBOL(bdev_file_open_by_path);

/*
 * Give up the claim that @bdev_file has on its block device.
 *
 * The holder is read from ->private_data.  A NULL or ERR_PTR() value there
 * triggers a one-time warning and nothing else is done.  bd_end_claim() is
 * skipped when bdev_unclaimed() returns true for the file.
 *
 * The caller must hold the open_mutex of the device's disk; this is asserted
 * through lockdep.
 */
static inline void bd_yield_claim(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;

	lockdep_assert_held(&bdev->bd_disk->open_mutex);

	if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder)))
		return;

	if (!bdev_unclaimed(bdev_file))
		bd_end_claim(bdev, holder);
}

/*
 * bdev_release - release the block device behind @bdev_file
 * @bdev_file: block device file that is going away
 *
 * ->private_data selects what has to be undone:
 *  - an ERR_PTR() means the open failed, so only blkdev_put_no_open() is
 *    called;
 *  - NULL means bd_yield_claim() is not called;
 *  - any other value is handed to bd_yield_claim(), which itself checks
 *    bdev_unclaimed() before ending the claim.
 *
 * Apart from the early sync, all work is done under the disk's open_mutex.
 */
void bdev_release(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;
	struct gendisk *disk = bdev->bd_disk;

	/* We failed to open that block device. */
	if (IS_ERR(holder))
		goto put_no_open;

	/*
	 * Sync early if it looks like we're the last one.  If someone else
	 * opens the block device between now and the decrement of bd_openers
	 * then we did a sync that we didn't need to, but that's not the end
	 * of the world and we want to avoid long (could be several minute)
	 * syncs while holding the mutex.
	 */
	if (atomic_read(&bdev->bd_openers) == 1)
		sync_blockdev(bdev);

	mutex_lock(&disk->open_mutex);
	bdev_yield_write_access(bdev_file);

	if (holder)
		bd_yield_claim(bdev_file);

	/*
	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
	 * event.  This is to ensure detection of media removal commanded
	 * from userland - e.g. eject(1).
	 */
	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);

	if (bdev_is_partition(bdev))
		blkdev_put_part(bdev);
	else
		blkdev_put_whole(bdev);
	mutex_unlock(&disk->open_mutex);

	/*
	 * NOTE(review): presumably pairs with a module reference taken when
	 * the device was opened - the matching get is not visible here.
	 */
	module_put(disk->fops->owner);
put_no_open:
	/* Reached on both the normal and the failed-open path. */
	blkdev_put_no_open(bdev);
}

/**
 * bdev_fput - yield claim to the block device and put the file
 * @bdev_file: open block device
 *
 * Yield claim on the block device and put the file. Ensure that the
 * block device can be reclaimed before the file is closed which is a
 * deferred operation.
 *
 * A file whose ->f_op is not &def_blk_fops triggers a one-time warning and
 * is returned untouched, i.e. it is not put.  If ->private_data is NULL the
 * file is put without yielding anything.
 */
void bdev_fput(struct file *bdev_file)
{
	if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
		return;

	if (bdev_file->private_data) {
		struct block_device *bdev = file_bdev(bdev_file);
		struct gendisk *disk = bdev->bd_disk;

		mutex_lock(&disk->open_mutex);
		bdev_yield_write_access(bdev_file);
		bd_yield_claim(bdev_file);
		/*
		 * Tell release we already gave up our hold on the
		 * device and if write restrictions are available that
		 * we already gave up write access to the device.
		 */
		bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
		mutex_unlock(&disk->open_mutex);
	}

	fput(bdev_file);
}
EXPORT_SYMBOL(bdev_fput);

/**
 * lookup_bdev() - Look up a struct block_device by name.
 * @pathname: Name of the block device in the filesystem.
 * @dev: Pointer to the block device's dev_t, if found.
 *
 * Lookup the block device's dev_t at @pathname in the current
 * namespace if possible and return it in @dev.
 *
 * @dev is only written on success.
 *
 * Context: May sleep.
 * Return: 0 if succeeded, negative errno otherwise: -EINVAL if @pathname is
 * NULL or empty, the kern_path() error if the lookup fails, -ENOTBLK if the
 * path does not refer to a block special file, -EACCES if may_open_dev()
 * rejects the path.
 */
int lookup_bdev(const char *pathname, dev_t *dev)
{
	struct inode *inode;
	struct path path;
	int error;

	if (!pathname || !*pathname)
		return -EINVAL;

	/* Symbolic links are followed (LOOKUP_FOLLOW). */
	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	if (error)
		return error;

	/* From here on the path reference must be dropped on every exit. */
	inode = d_backing_inode(path.dentry);
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto out_path_put;
	error = -EACCES;
	if (!may_open_dev(&path))
		goto out_path_put;

	*dev = inode->i_rdev;
	error = 0;
out_path_put:
	path_put(&path);
	return error;
}
EXPORT_SYMBOL(lookup_bdev);

/**
 * bdev_mark_dead - mark a block device as dead
 * @bdev: block device to operate on
 * @surprise: indicate a surprise removal
 *
 * Tell the file system that this device or media is dead.  If @surprise is set
 * to %true the device or media is already gone, if not we are preparing for an
 * orderly removal.
 *
 * This calls into the file system, which then typically syncs out all dirty
 * data and writes back inodes and then invalidates any cached data in the
 * inodes on the file system.  In addition we also invalidate the block device
 * mapping.
 *
 * If the holder has no ->mark_dead operation, the block device is synced with
 * sync_blockdev() instead.
 */
void bdev_mark_dead(struct block_device *bdev, bool surprise)
{
	mutex_lock(&bdev->bd_holder_lock);
	/*
	 * bd_holder_lock is only released in this function when there is no
	 * ->mark_dead callback.
	 * NOTE(review): the callback is presumably expected to drop
	 * bd_holder_lock itself - confirm against the blk_holder_ops contract.
	 */
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
		bdev->bd_holder_ops->mark_dead(bdev, surprise);
	else {
		mutex_unlock(&bdev->bd_holder_lock);
		sync_blockdev(bdev);
	}

	invalidate_bdev(bdev);
}
/*
 * New drivers should not use this directly.  There are some drivers however
 * that need this for historical reasons.  For example, the DASD driver has
 * historically had a shutdown to offline mode that doesn't actually remove the
 * gendisk that otherwise looks a lot like a safe device removal.
 */
EXPORT_SYMBOL_GPL(bdev_mark_dead);

/*
 * sync_bdevs - run writeback helpers on the mappings of all open block devices
 * @wait: if true, call filemap_fdatawait_keep_errors() on each mapping,
 *        otherwise call filemap_fdatawrite()
 *
 * Walks every inode on the block device superblock.  Inodes that are new or
 * being freed, or whose mapping has no pages, are skipped, as are block
 * devices whose bd_openers count is zero.
 */
void sync_bdevs(bool wait)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;
		struct block_device *bdev;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		/* Pin the inode before the list lock is dropped below. */
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&blockdev_superblock->s_inode_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from s_inodes list while we dropped the
		 * s_inode_list_lock.  We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * s_inode_list_lock.  So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;
		bdev = I_BDEV(inode);

		/* bd_openers is checked under the disk's open_mutex. */
		mutex_lock(&bdev->bd_disk->open_mutex);
		if (!atomic_read(&bdev->bd_openers)) {
			; /* skip */
		} else if (wait) {
			/*
			 * We keep the error status of individual mapping so
			 * that applications can catch the writeback error using
			 * fsync(2).  See filemap_fdatawait_keep_errors() for
			 * details.
			 */
			filemap_fdatawait_keep_errors(inode->i_mapping);
		} else {
			filemap_fdatawrite(inode->i_mapping);
		}
		mutex_unlock(&bdev->bd_disk->open_mutex);

		/* Retake the list lock before the iteration continues. */
		spin_lock(&blockdev_superblock->s_inode_list_lock);
	}
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	/* Drop the reference kept on the last inode that was processed. */
	iput(old_inode);
}

/*
 * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices.
 *
 * @path: path of the block device node that is being stat'ed
 * @stat: kstat to fill in
 * @request_mask: STATX_* bits that were requested
 *
 * Does nothing unless at least one of the two bits is requested, and returns
 * with @stat untouched if blkdev_get_no_open() finds no block device for the
 * node's i_rdev.
 */
void bdev_statx(struct path *path, struct kstat *stat,
		u32 request_mask)
{
	struct inode *backing_inode;
	struct block_device *bdev;

	if (!(request_mask & (STATX_DIOALIGN | STATX_WRITE_ATOMIC)))
		return;

	backing_inode = d_backing_inode(path->dentry);

	/*
	 * Note that backing_inode is the inode of a block device node file,
	 * not the block device's internal inode.  Therefore it is *not* valid
	 * to use I_BDEV() here; the block device has to be looked up by i_rdev
	 * instead.
	 */
	bdev = blkdev_get_no_open(backing_inode->i_rdev);
	if (!bdev)
		return;

	if (request_mask & STATX_DIOALIGN) {
		/*
		 * NOTE(review): the "+ 1" suggests bdev_dma_alignment()
		 * returns an alignment mask rather than a size - confirm.
		 */
		stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
		stat->dio_offset_align = bdev_logical_block_size(bdev);
		stat->result_mask |= STATX_DIOALIGN;
	}

	if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) {
		struct request_queue *bd_queue = bdev->bd_queue;

		generic_fill_statx_atomic_writes(stat,
			queue_atomic_write_unit_min_bytes(bd_queue),
			queue_atomic_write_unit_max_bytes(bd_queue));
	}

	/* Drop the reference obtained from blkdev_get_no_open(). */
	blkdev_put_no_open(bdev);
}

/*
 * disk_live - report whether the inode behind @disk->part0 is still hashed
 * @disk: disk to check
 *
 * Return: true if the inode of @disk->part0 is hashed, false otherwise.
 */
bool disk_live(struct gendisk *disk)
{
	return !inode_unhashed(BD_INODE(disk->part0));
}
EXPORT_SYMBOL_GPL(disk_live);

/*
 * block_size - block size of a block device
 * @bdev: block device to query
 *
 * Return: 1 << i_blkbits of the inode that backs @bdev.
 */
unsigned int block_size(struct block_device *bdev)
{
	return 1 << BD_INODE(bdev)->i_blkbits;
}
EXPORT_SYMBOL_GPL(block_size);

/*
 * Parse the "bdev_allow_write_mounted=" boot parameter into
 * bdev_allow_write_mounted using kstrtobool().  A string that kstrtobool()
 * rejects only produces a warning.
 *
 * Always returns 1.
 * NOTE(review): for __setup() handlers a return of 1 conventionally marks the
 * option as handled - confirm against the __setup() documentation.
 */
static int __init setup_bdev_allow_write_mounted(char *str)
{
	if (kstrtobool(str, &bdev_allow_write_mounted))
		pr_warn("Invalid option string for bdev_allow_write_mounted:"
			" '%s'\n", str);
	return 1;
}
__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);