1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. 23 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 24 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. 25 * LLNL-CODE-403049. 26 * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 27 */ 28 29 #include <sys/zfs_context.h> 30 #include <sys/spa_impl.h> 31 #include <sys/vdev_disk.h> 32 #include <sys/vdev_impl.h> 33 #include <sys/vdev_trim.h> 34 #include <sys/abd.h> 35 #include <sys/fs/zfs.h> 36 #include <sys/zio.h> 37 #include <linux/blkpg.h> 38 #include <linux/msdos_fs.h> 39 #include <linux/vfs_compat.h> 40 41 typedef struct vdev_disk { 42 struct block_device *vd_bdev; 43 krwlock_t vd_lock; 44 } vdev_disk_t; 45 46 /* 47 * Unique identifier for the exclusive vdev holder. 48 */ 49 static void *zfs_vdev_holder = VDEV_HOLDER; 50 51 /* 52 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the 53 * device is missing. The missing path may be transient since the links 54 * can be briefly removed and recreated in response to udev events. 55 */ 56 static unsigned zfs_vdev_open_timeout_ms = 1000; 57 58 /* 59 * Size of the "reserved" partition, in blocks. 60 */ 61 #define EFI_MIN_RESV_SIZE (16 * 1024) 62 63 /* 64 * Virtual device vector for disks. 65 */ 66 typedef struct dio_request { 67 zio_t *dr_zio; /* Parent ZIO */ 68 atomic_t dr_ref; /* References */ 69 int dr_error; /* Bio error */ 70 int dr_bio_count; /* Count of bio's */ 71 struct bio *dr_bio[0]; /* Attached bio's */ 72 } dio_request_t; 73 74 static fmode_t 75 vdev_bdev_mode(spa_mode_t spa_mode) 76 { 77 fmode_t mode = 0; 78 79 if (spa_mode & SPA_MODE_READ) 80 mode |= FMODE_READ; 81 82 if (spa_mode & SPA_MODE_WRITE) 83 mode |= FMODE_WRITE; 84 85 return (mode); 86 } 87 88 /* 89 * Returns the usable capacity (in bytes) for the partition or disk. 90 */ 91 static uint64_t 92 bdev_capacity(struct block_device *bdev) 93 { 94 return (i_size_read(bdev->bd_inode)); 95 } 96 97 #if !defined(HAVE_BDEV_WHOLE) 98 static inline struct block_device * 99 bdev_whole(struct block_device *bdev) 100 { 101 return (bdev->bd_contains); 102 } 103 #endif 104 105 /* 106 * Returns the maximum expansion capacity of the block device (in bytes). 107 * 108 * It is possible to expand a vdev when it has been created as a wholedisk 109 * and the containing block device has increased in capacity. Or when the 110 * partition containing the pool has been manually increased in size. 111 * 112 * This function is only responsible for calculating the potential expansion 113 * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is 114 * responsible for verifying the expected partition layout in the wholedisk 115 * case, and updating the partition table if appropriate. Once the partition 116 * size has been increased the additional capacity will be visible using 117 * bdev_capacity(). 118 * 119 * The returned maximum expansion capacity is always expected to be larger, or 120 * at the very least equal, to its usable capacity to prevent overestimating 121 * the pool expandsize. 122 */ 123 static uint64_t 124 bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) 125 { 126 uint64_t psize; 127 int64_t available; 128 129 if (wholedisk && bdev != bdev_whole(bdev)) { 130 /* 131 * When reporting maximum expansion capacity for a wholedisk 132 * deduct any capacity which is expected to be lost due to 133 * alignment restrictions. Over reporting this value isn't 134 * harmful and would only result in slightly less capacity 135 * than expected post expansion. 136 * The estimated available space may be slightly smaller than 137 * bdev_capacity() for devices where the number of sectors is 138 * not a multiple of the alignment size and the partition layout 139 * is keeping less than PARTITION_END_ALIGNMENT bytes after the 140 * "reserved" EFI partition: in such cases return the device 141 * usable capacity. 142 */ 143 available = i_size_read(bdev_whole(bdev)->bd_inode) - 144 ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + 145 PARTITION_END_ALIGNMENT) << SECTOR_BITS); 146 psize = MAX(available, bdev_capacity(bdev)); 147 } else { 148 psize = bdev_capacity(bdev); 149 } 150 151 return (psize); 152 } 153 154 static void 155 vdev_disk_error(zio_t *zio) 156 { 157 /* 158 * This function can be called in interrupt context, for instance while 159 * handling IRQs coming from a misbehaving disk device; use printk() 160 * which is safe from any context. 161 */ 162 printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " 163 "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa), 164 zio->io_vd->vdev_path, zio->io_error, zio->io_type, 165 (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, 166 zio->io_flags); 167 } 168 169 static int 170 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, 171 uint64_t *logical_ashift, uint64_t *physical_ashift) 172 { 173 struct block_device *bdev; 174 fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); 175 hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); 176 vdev_disk_t *vd; 177 178 /* Must have a pathname and it must be absolute. */ 179 if (v->vdev_path == NULL || v->vdev_path[0] != '/') { 180 v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 181 vdev_dbgmsg(v, "invalid vdev_path"); 182 return (SET_ERROR(EINVAL)); 183 } 184 185 /* 186 * Reopen the device if it is currently open. When expanding a 187 * partition force re-scanning the partition table if userland 188 * did not take care of this already. We need to do this while closed 189 * in order to get an accurate updated block device size. Then 190 * since udev may need to recreate the device links increase the 191 * open retry timeout before reporting the device as unavailable. 192 */ 193 vd = v->vdev_tsd; 194 if (vd) { 195 char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; 196 boolean_t reread_part = B_FALSE; 197 198 rw_enter(&vd->vd_lock, RW_WRITER); 199 bdev = vd->vd_bdev; 200 vd->vd_bdev = NULL; 201 202 if (bdev) { 203 if (v->vdev_expanding && bdev != bdev_whole(bdev)) { 204 bdevname(bdev_whole(bdev), disk_name + 5); 205 /* 206 * If userland has BLKPG_RESIZE_PARTITION, 207 * then it should have updated the partition 208 * table already. We can detect this by 209 * comparing our current physical size 210 * with that of the device. If they are 211 * the same, then we must not have 212 * BLKPG_RESIZE_PARTITION or it failed to 213 * update the partition table online. We 214 * fallback to rescanning the partition 215 * table from the kernel below. However, 216 * if the capacity already reflects the 217 * updated partition, then we skip 218 * rescanning the partition table here. 219 */ 220 if (v->vdev_psize == bdev_capacity(bdev)) 221 reread_part = B_TRUE; 222 } 223 224 blkdev_put(bdev, mode | FMODE_EXCL); 225 } 226 227 if (reread_part) { 228 bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL, 229 zfs_vdev_holder); 230 if (!IS_ERR(bdev)) { 231 int error = vdev_bdev_reread_part(bdev); 232 blkdev_put(bdev, mode | FMODE_EXCL); 233 if (error == 0) { 234 timeout = MSEC2NSEC( 235 zfs_vdev_open_timeout_ms * 2); 236 } 237 } 238 } 239 } else { 240 vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); 241 242 rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); 243 rw_enter(&vd->vd_lock, RW_WRITER); 244 } 245 246 /* 247 * Devices are always opened by the path provided at configuration 248 * time. This means that if the provided path is a udev by-id path 249 * then drives may be re-cabled without an issue. If the provided 250 * path is a udev by-path path, then the physical location information 251 * will be preserved. This can be critical for more complicated 252 * configurations where drives are located in specific physical 253 * locations to maximize the systems tolerance to component failure. 254 * 255 * Alternatively, you can provide your own udev rule to flexibly map 256 * the drives as you see fit. It is not advised that you use the 257 * /dev/[hd]d devices which may be reordered due to probing order. 258 * Devices in the wrong locations will be detected by the higher 259 * level vdev validation. 260 * 261 * The specified paths may be briefly removed and recreated in 262 * response to udev events. This should be exceptionally unlikely 263 * because the zpool command makes every effort to verify these paths 264 * have already settled prior to reaching this point. Therefore, 265 * a ENOENT failure at this point is highly likely to be transient 266 * and it is reasonable to sleep and retry before giving up. In 267 * practice delays have been observed to be on the order of 100ms. 268 */ 269 hrtime_t start = gethrtime(); 270 bdev = ERR_PTR(-ENXIO); 271 while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) { 272 bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL, 273 zfs_vdev_holder); 274 if (unlikely(PTR_ERR(bdev) == -ENOENT)) { 275 schedule_timeout(MSEC_TO_TICK(10)); 276 } else if (IS_ERR(bdev)) { 277 break; 278 } 279 } 280 281 if (IS_ERR(bdev)) { 282 int error = -PTR_ERR(bdev); 283 vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, 284 (u_longlong_t)(gethrtime() - start), 285 (u_longlong_t)timeout); 286 vd->vd_bdev = NULL; 287 v->vdev_tsd = vd; 288 rw_exit(&vd->vd_lock); 289 return (SET_ERROR(error)); 290 } else { 291 vd->vd_bdev = bdev; 292 v->vdev_tsd = vd; 293 rw_exit(&vd->vd_lock); 294 } 295 296 struct request_queue *q = bdev_get_queue(vd->vd_bdev); 297 298 /* Determine the physical block size */ 299 int physical_block_size = bdev_physical_block_size(vd->vd_bdev); 300 301 /* Determine the logical block size */ 302 int logical_block_size = bdev_logical_block_size(vd->vd_bdev); 303 304 /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ 305 v->vdev_nowritecache = B_FALSE; 306 307 /* Set when device reports it supports TRIM. */ 308 v->vdev_has_trim = !!blk_queue_discard(q); 309 310 /* Set when device reports it supports secure TRIM. */ 311 v->vdev_has_securetrim = !!blk_queue_discard_secure(q); 312 313 /* Inform the ZIO pipeline that we are non-rotational */ 314 v->vdev_nonrot = blk_queue_nonrot(q); 315 316 /* Physical volume size in bytes for the partition */ 317 *psize = bdev_capacity(vd->vd_bdev); 318 319 /* Physical volume size in bytes including possible expansion space */ 320 *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk); 321 322 /* Based on the minimum sector size set the block size */ 323 *physical_ashift = highbit64(MAX(physical_block_size, 324 SPA_MINBLOCKSIZE)) - 1; 325 326 *logical_ashift = highbit64(MAX(logical_block_size, 327 SPA_MINBLOCKSIZE)) - 1; 328 329 return (0); 330 } 331 332 static void 333 vdev_disk_close(vdev_t *v) 334 { 335 vdev_disk_t *vd = v->vdev_tsd; 336 337 if (v->vdev_reopening || vd == NULL) 338 return; 339 340 if (vd->vd_bdev != NULL) { 341 blkdev_put(vd->vd_bdev, 342 vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); 343 } 344 345 rw_destroy(&vd->vd_lock); 346 kmem_free(vd, sizeof (vdev_disk_t)); 347 v->vdev_tsd = NULL; 348 } 349 350 static dio_request_t * 351 vdev_disk_dio_alloc(int bio_count) 352 { 353 dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + 354 sizeof (struct bio *) * bio_count, KM_SLEEP); 355 atomic_set(&dr->dr_ref, 0); 356 dr->dr_bio_count = bio_count; 357 dr->dr_error = 0; 358 359 for (int i = 0; i < dr->dr_bio_count; i++) 360 dr->dr_bio[i] = NULL; 361 362 return (dr); 363 } 364 365 static void 366 vdev_disk_dio_free(dio_request_t *dr) 367 { 368 int i; 369 370 for (i = 0; i < dr->dr_bio_count; i++) 371 if (dr->dr_bio[i]) 372 bio_put(dr->dr_bio[i]); 373 374 kmem_free(dr, sizeof (dio_request_t) + 375 sizeof (struct bio *) * dr->dr_bio_count); 376 } 377 378 static void 379 vdev_disk_dio_get(dio_request_t *dr) 380 { 381 atomic_inc(&dr->dr_ref); 382 } 383 384 static int 385 vdev_disk_dio_put(dio_request_t *dr) 386 { 387 int rc = atomic_dec_return(&dr->dr_ref); 388 389 /* 390 * Free the dio_request when the last reference is dropped and 391 * ensure zio_interpret is called only once with the correct zio 392 */ 393 if (rc == 0) { 394 zio_t *zio = dr->dr_zio; 395 int error = dr->dr_error; 396 397 vdev_disk_dio_free(dr); 398 399 if (zio) { 400 zio->io_error = error; 401 ASSERT3S(zio->io_error, >=, 0); 402 if (zio->io_error) 403 vdev_disk_error(zio); 404 405 zio_delay_interrupt(zio); 406 } 407 } 408 409 return (rc); 410 } 411 412 BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) 413 { 414 dio_request_t *dr = bio->bi_private; 415 int rc; 416 417 if (dr->dr_error == 0) { 418 #ifdef HAVE_1ARG_BIO_END_IO_T 419 dr->dr_error = BIO_END_IO_ERROR(bio); 420 #else 421 if (error) 422 dr->dr_error = -(error); 423 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 424 dr->dr_error = EIO; 425 #endif 426 } 427 428 /* Drop reference acquired by __vdev_disk_physio */ 429 rc = vdev_disk_dio_put(dr); 430 } 431 432 static inline void 433 vdev_submit_bio_impl(struct bio *bio) 434 { 435 #ifdef HAVE_1ARG_SUBMIT_BIO 436 submit_bio(bio); 437 #else 438 submit_bio(0, bio); 439 #endif 440 } 441 442 /* 443 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so 444 * replace it with preempt_schedule under the following condition: 445 */ 446 #if defined(CONFIG_ARM64) && \ 447 defined(CONFIG_PREEMPTION) && \ 448 defined(CONFIG_BLK_CGROUP) 449 #define preempt_schedule_notrace(x) preempt_schedule(x) 450 #endif 451 452 #ifdef HAVE_BIO_SET_DEV 453 #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) 454 /* 455 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by 456 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). 457 * As a side effect the function was converted to GPL-only. Define our 458 * own version when needed which uses rcu_read_lock_sched(). 459 */ 460 #if defined(HAVE_BLKG_TRYGET_GPL_ONLY) 461 static inline bool 462 vdev_blkg_tryget(struct blkcg_gq *blkg) 463 { 464 struct percpu_ref *ref = &blkg->refcnt; 465 unsigned long __percpu *count; 466 bool rc; 467 468 rcu_read_lock_sched(); 469 470 if (__ref_is_percpu(ref, &count)) { 471 this_cpu_inc(*count); 472 rc = true; 473 } else { 474 #ifdef ZFS_PERCPU_REF_COUNT_IN_DATA 475 rc = atomic_long_inc_not_zero(&ref->data->count); 476 #else 477 rc = atomic_long_inc_not_zero(&ref->count); 478 #endif 479 } 480 481 rcu_read_unlock_sched(); 482 483 return (rc); 484 } 485 #elif defined(HAVE_BLKG_TRYGET) 486 #define vdev_blkg_tryget(bg) blkg_tryget(bg) 487 #endif 488 /* 489 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the 490 * GPL-only bio_associate_blkg() symbol thus inadvertently converting 491 * the entire macro. Provide a minimal version which always assigns the 492 * request queue's root_blkg to the bio. 493 */ 494 static inline void 495 vdev_bio_associate_blkg(struct bio *bio) 496 { 497 #if defined(HAVE_BIO_BDEV_DISK) 498 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 499 #else 500 struct request_queue *q = bio->bi_disk->queue; 501 #endif 502 503 ASSERT3P(q, !=, NULL); 504 ASSERT3P(bio->bi_blkg, ==, NULL); 505 506 if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) 507 bio->bi_blkg = q->root_blkg; 508 } 509 #define bio_associate_blkg vdev_bio_associate_blkg 510 #endif 511 #else 512 /* 513 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels. 514 */ 515 static inline void 516 bio_set_dev(struct bio *bio, struct block_device *bdev) 517 { 518 bio->bi_bdev = bdev; 519 } 520 #endif /* HAVE_BIO_SET_DEV */ 521 522 static inline void 523 vdev_submit_bio(struct bio *bio) 524 { 525 struct bio_list *bio_list = current->bio_list; 526 current->bio_list = NULL; 527 vdev_submit_bio_impl(bio); 528 current->bio_list = bio_list; 529 } 530 531 static int 532 __vdev_disk_physio(struct block_device *bdev, zio_t *zio, 533 size_t io_size, uint64_t io_offset, int rw, int flags) 534 { 535 dio_request_t *dr; 536 uint64_t abd_offset; 537 uint64_t bio_offset; 538 int bio_size; 539 int bio_count = 16; 540 int error = 0; 541 struct blk_plug plug; 542 543 /* 544 * Accessing outside the block device is never allowed. 545 */ 546 if (io_offset + io_size > bdev->bd_inode->i_size) { 547 vdev_dbgmsg(zio->io_vd, 548 "Illegal access %llu size %llu, device size %llu", 549 (u_longlong_t)io_offset, 550 (u_longlong_t)io_size, 551 (u_longlong_t)i_size_read(bdev->bd_inode)); 552 return (SET_ERROR(EIO)); 553 } 554 555 retry: 556 dr = vdev_disk_dio_alloc(bio_count); 557 558 if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) 559 bio_set_flags_failfast(bdev, &flags); 560 561 dr->dr_zio = zio; 562 563 /* 564 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which 565 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio 566 * can cover at least 128KB and at most 1MB. When the required number 567 * of iovec's exceeds this, we are forced to break the IO in multiple 568 * bio's and wait for them all to complete. This is likely if the 569 * recordsize property is increased beyond 1MB. The default 570 * bio_count=16 should typically accommodate the maximum-size zio of 571 * 16MB. 572 */ 573 574 abd_offset = 0; 575 bio_offset = io_offset; 576 bio_size = io_size; 577 for (int i = 0; i <= dr->dr_bio_count; i++) { 578 579 /* Finished constructing bio's for given buffer */ 580 if (bio_size <= 0) 581 break; 582 583 /* 584 * If additional bio's are required, we have to retry, but 585 * this should be rare - see the comment above. 586 */ 587 if (dr->dr_bio_count == i) { 588 vdev_disk_dio_free(dr); 589 bio_count *= 2; 590 goto retry; 591 } 592 593 /* bio_alloc() with __GFP_WAIT never returns NULL */ 594 #ifdef HAVE_BIO_MAX_SEGS 595 dr->dr_bio[i] = bio_alloc(GFP_NOIO, bio_max_segs( 596 abd_nr_pages_off(zio->io_abd, bio_size, abd_offset))); 597 #else 598 dr->dr_bio[i] = bio_alloc(GFP_NOIO, 599 MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset), 600 BIO_MAX_PAGES)); 601 #endif 602 if (unlikely(dr->dr_bio[i] == NULL)) { 603 vdev_disk_dio_free(dr); 604 return (SET_ERROR(ENOMEM)); 605 } 606 607 /* Matching put called by vdev_disk_physio_completion */ 608 vdev_disk_dio_get(dr); 609 610 bio_set_dev(dr->dr_bio[i], bdev); 611 BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; 612 dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; 613 dr->dr_bio[i]->bi_private = dr; 614 bio_set_op_attrs(dr->dr_bio[i], rw, flags); 615 616 /* Remaining size is returned to become the new size */ 617 bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, 618 bio_size, abd_offset); 619 620 /* Advance in buffer and construct another bio if needed */ 621 abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); 622 bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); 623 } 624 625 /* Extra reference to protect dio_request during vdev_submit_bio */ 626 vdev_disk_dio_get(dr); 627 628 if (dr->dr_bio_count > 1) 629 blk_start_plug(&plug); 630 631 /* Submit all bio's associated with this dio */ 632 for (int i = 0; i < dr->dr_bio_count; i++) { 633 if (dr->dr_bio[i]) 634 vdev_submit_bio(dr->dr_bio[i]); 635 } 636 637 if (dr->dr_bio_count > 1) 638 blk_finish_plug(&plug); 639 640 (void) vdev_disk_dio_put(dr); 641 642 return (error); 643 } 644 645 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) 646 { 647 zio_t *zio = bio->bi_private; 648 #ifdef HAVE_1ARG_BIO_END_IO_T 649 zio->io_error = BIO_END_IO_ERROR(bio); 650 #else 651 zio->io_error = -error; 652 #endif 653 654 if (zio->io_error && (zio->io_error == EOPNOTSUPP)) 655 zio->io_vd->vdev_nowritecache = B_TRUE; 656 657 bio_put(bio); 658 ASSERT3S(zio->io_error, >=, 0); 659 if (zio->io_error) 660 vdev_disk_error(zio); 661 zio_interrupt(zio); 662 } 663 664 static int 665 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) 666 { 667 struct request_queue *q; 668 struct bio *bio; 669 670 q = bdev_get_queue(bdev); 671 if (!q) 672 return (SET_ERROR(ENXIO)); 673 674 bio = bio_alloc(GFP_NOIO, 0); 675 /* bio_alloc() with __GFP_WAIT never returns NULL */ 676 if (unlikely(bio == NULL)) 677 return (SET_ERROR(ENOMEM)); 678 679 bio->bi_end_io = vdev_disk_io_flush_completion; 680 bio->bi_private = zio; 681 bio_set_dev(bio, bdev); 682 bio_set_flush(bio); 683 vdev_submit_bio(bio); 684 invalidate_bdev(bdev); 685 686 return (0); 687 } 688 689 static void 690 vdev_disk_io_start(zio_t *zio) 691 { 692 vdev_t *v = zio->io_vd; 693 vdev_disk_t *vd = v->vdev_tsd; 694 unsigned long trim_flags = 0; 695 int rw, error; 696 697 /* 698 * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 699 * Nothing to be done here but return failure. 700 */ 701 if (vd == NULL) { 702 zio->io_error = ENXIO; 703 zio_interrupt(zio); 704 return; 705 } 706 707 rw_enter(&vd->vd_lock, RW_READER); 708 709 /* 710 * If the vdev is closed, it's likely due to a failed reopen and is 711 * in the UNAVAIL state. Nothing to be done here but return failure. 712 */ 713 if (vd->vd_bdev == NULL) { 714 rw_exit(&vd->vd_lock); 715 zio->io_error = ENXIO; 716 zio_interrupt(zio); 717 return; 718 } 719 720 switch (zio->io_type) { 721 case ZIO_TYPE_IOCTL: 722 723 if (!vdev_readable(v)) { 724 rw_exit(&vd->vd_lock); 725 zio->io_error = SET_ERROR(ENXIO); 726 zio_interrupt(zio); 727 return; 728 } 729 730 switch (zio->io_cmd) { 731 case DKIOCFLUSHWRITECACHE: 732 733 if (zfs_nocacheflush) 734 break; 735 736 if (v->vdev_nowritecache) { 737 zio->io_error = SET_ERROR(ENOTSUP); 738 break; 739 } 740 741 error = vdev_disk_io_flush(vd->vd_bdev, zio); 742 if (error == 0) { 743 rw_exit(&vd->vd_lock); 744 return; 745 } 746 747 zio->io_error = error; 748 749 break; 750 751 default: 752 zio->io_error = SET_ERROR(ENOTSUP); 753 } 754 755 rw_exit(&vd->vd_lock); 756 zio_execute(zio); 757 return; 758 case ZIO_TYPE_WRITE: 759 rw = WRITE; 760 break; 761 762 case ZIO_TYPE_READ: 763 rw = READ; 764 break; 765 766 case ZIO_TYPE_TRIM: 767 #if defined(BLKDEV_DISCARD_SECURE) 768 if (zio->io_trim_flags & ZIO_TRIM_SECURE) 769 trim_flags |= BLKDEV_DISCARD_SECURE; 770 #endif 771 zio->io_error = -blkdev_issue_discard(vd->vd_bdev, 772 zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, 773 trim_flags); 774 775 rw_exit(&vd->vd_lock); 776 zio_interrupt(zio); 777 return; 778 779 default: 780 rw_exit(&vd->vd_lock); 781 zio->io_error = SET_ERROR(ENOTSUP); 782 zio_interrupt(zio); 783 return; 784 } 785 786 zio->io_target_timestamp = zio_handle_io_delay(zio); 787 error = __vdev_disk_physio(vd->vd_bdev, zio, 788 zio->io_size, zio->io_offset, rw, 0); 789 rw_exit(&vd->vd_lock); 790 791 if (error) { 792 zio->io_error = error; 793 zio_interrupt(zio); 794 return; 795 } 796 } 797 798 static void 799 vdev_disk_io_done(zio_t *zio) 800 { 801 /* 802 * If the device returned EIO, we revalidate the media. If it is 803 * determined the media has changed this triggers the asynchronous 804 * removal of the device from the configuration. 805 */ 806 if (zio->io_error == EIO) { 807 vdev_t *v = zio->io_vd; 808 vdev_disk_t *vd = v->vdev_tsd; 809 810 if (zfs_check_media_change(vd->vd_bdev)) { 811 invalidate_bdev(vd->vd_bdev); 812 v->vdev_remove_wanted = B_TRUE; 813 spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); 814 } 815 } 816 } 817 818 static void 819 vdev_disk_hold(vdev_t *vd) 820 { 821 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 822 823 /* We must have a pathname, and it must be absolute. */ 824 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') 825 return; 826 827 /* 828 * Only prefetch path and devid info if the device has 829 * never been opened. 830 */ 831 if (vd->vdev_tsd != NULL) 832 return; 833 834 } 835 836 static void 837 vdev_disk_rele(vdev_t *vd) 838 { 839 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 840 841 /* XXX: Implement me as a vnode rele for the device */ 842 } 843 844 vdev_ops_t vdev_disk_ops = { 845 .vdev_op_init = NULL, 846 .vdev_op_fini = NULL, 847 .vdev_op_open = vdev_disk_open, 848 .vdev_op_close = vdev_disk_close, 849 .vdev_op_asize = vdev_default_asize, 850 .vdev_op_min_asize = vdev_default_min_asize, 851 .vdev_op_min_alloc = NULL, 852 .vdev_op_io_start = vdev_disk_io_start, 853 .vdev_op_io_done = vdev_disk_io_done, 854 .vdev_op_state_change = NULL, 855 .vdev_op_need_resilver = NULL, 856 .vdev_op_hold = vdev_disk_hold, 857 .vdev_op_rele = vdev_disk_rele, 858 .vdev_op_remap = NULL, 859 .vdev_op_xlate = vdev_default_xlate, 860 .vdev_op_rebuild_asize = NULL, 861 .vdev_op_metaslab_init = NULL, 862 .vdev_op_config_generate = NULL, 863 .vdev_op_nparity = NULL, 864 .vdev_op_ndisks = NULL, 865 .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ 866 .vdev_op_leaf = B_TRUE /* leaf vdev */ 867 }; 868 869 /* 870 * The zfs_vdev_scheduler module option has been deprecated. Setting this 871 * value no longer has any effect. It has not yet been entirely removed 872 * to allow the module to be loaded if this option is specified in the 873 * /etc/modprobe.d/zfs.conf file. The following warning will be logged. 874 */ 875 static int 876 param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) 877 { 878 int error = param_set_charp(val, kp); 879 if (error == 0) { 880 printk(KERN_INFO "The 'zfs_vdev_scheduler' module option " 881 "is not supported.\n"); 882 } 883 884 return (error); 885 } 886 887 char *zfs_vdev_scheduler = "unused"; 888 module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, 889 param_get_charp, &zfs_vdev_scheduler, 0644); 890 MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); 891 892 int 893 param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) 894 { 895 uint64_t val; 896 int error; 897 898 error = kstrtoull(buf, 0, &val); 899 if (error < 0) 900 return (SET_ERROR(error)); 901 902 if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) 903 return (SET_ERROR(-EINVAL)); 904 905 error = param_set_ulong(buf, kp); 906 if (error < 0) 907 return (SET_ERROR(error)); 908 909 return (0); 910 } 911 912 int 913 param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) 914 { 915 uint64_t val; 916 int error; 917 918 error = kstrtoull(buf, 0, &val); 919 if (error < 0) 920 return (SET_ERROR(error)); 921 922 if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) 923 return (SET_ERROR(-EINVAL)); 924 925 error = param_set_ulong(buf, kp); 926 if (error < 0) 927 return (SET_ERROR(error)); 928 929 return (0); 930 } 931