/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>

typedef struct vdev_disk {
	struct block_device *vd_bdev;
	krwlock_t vd_lock;
} vdev_disk_t;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static unsigned zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[0];	/* Attached bio's */
} dio_request_t;

static fmode_t
vdev_bdev_mode(spa_mode_t spa_mode)
{
	fmode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= FMODE_READ;

	if (spa_mode & SPA_MODE_WRITE)
		mode |= FMODE_WRITE;

	return (mode);
}

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity.  Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate.
 * Once the partition size has been increased the additional capacity will be
 * visible using bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, its usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions.  Over reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition
		 * layout is keeping less than PARTITION_END_ALIGNMENT bytes
		 * after the "reserved" EFI partition: in such cases return
		 * the device usable capacity.
		 */
		available = i_size_read(bdev->bd_contains->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open.  When expanding a
	 * partition, force re-scanning the partition table if userland did
	 * not take care of this already.  We need to do this while closed
	 * in order to get an accurate updated block device size.  Then,
	 * since udev may need to recreate the device links, increase the
	 * open retry timeout before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev->bd_contains) {
				bdevname(bdev->bd_contains, disk_name + 5);
				/*
				 * If userland has BLKPG_RESIZE_PARTITION,
				 * then it should have updated the partition
				 * table already.  We can detect this by
				 * comparing our current physical size
				 * with that of the device.
				 * If they are the same, then we must not
				 * have BLKPG_RESIZE_PARTITION or it failed
				 * to update the partition table online.  We
				 * fall back to rescanning the partition
				 * table from the kernel below.  However,
				 * if the capacity already reflects the
				 * updated partition, then we skip
				 * rescanning the partition table here.
				 */
				if (v->vdev_psize == bdev_capacity(bdev))
					reread_part = B_TRUE;
			}

			blkdev_put(bdev, mode | FMODE_EXCL);
		}

		if (reread_part) {
			bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL,
			    zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				blkdev_put(bdev, mode | FMODE_EXCL);
				if (error == 0) {
					timeout = MSEC2NSEC(
					    zfs_vdev_open_timeout_ms * 2);
				}
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	hrtime_t start = gethrtime();
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
		bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
		    zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			schedule_timeout(MSEC_TO_TICK(10));
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
		    (u_longlong_t)(gethrtime() - start),
		    (u_longlong_t)timeout);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	struct request_queue *q = bdev_get_queue(vd->vd_bdev);

	/* Determine the physical block size */
	int physical_block_size = bdev_physical_block_size(vd->vd_bdev);

	/* Determine the logical block size */
	int logical_block_size = bdev_logical_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = !!blk_queue_discard(q);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = !!blk_queue_discard_secure(q);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(q);

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*physical_ashift = highbit64(MAX(physical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	*logical_ashift = highbit64(MAX(logical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		blkdev_put(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL);
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (dr) {
		atomic_set(&dr->dr_ref, 0);
		dr->dr_bio_count = bio_count;
		dr->dr_error = 0;

		for (i = 0; i < dr->dr_bio_count; i++)
			dr->dr_bio[i] = NULL;
	}

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
	defined(CONFIG_PREEMPTION) && \
	defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only.  Define our
 * own version when needed which uses rcu_read_lock_sched().
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
		rc = atomic_long_inc_not_zero(&ref->count);
	}

	rcu_read_unlock_sched();

	return (rc);
}
#elif defined(HAVE_BLKG_TRYGET)
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro.  Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
	struct request_queue *q = bio->bi_disk->queue;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_associate_blkg vdev_bio_associate_blkg
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */

static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
}

static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0, error = 0;
	struct blk_plug plug;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    io_offset, io_size, i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (SET_ERROR(ENOMEM));

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO into multiple bio's and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
		    BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		bio_set_dev(dr->dr_bio[i], bdev);
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

	(void) vdev_disk_dio_put(dr);

	return (error);
}

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_dev(bio, bdev);
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	unsigned long trim_flags = 0;
	int rw, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state.
	 * Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
		break;

	case ZIO_TYPE_READ:
		rw = READ;
		break;

	case ZIO_TYPE_TRIM:
#if defined(BLKDEV_DISCARD_SECURE)
		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
			trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
		zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
		    trim_flags);

		rw_exit(&vd->vd_lock);
		zio_interrupt(zio);
		return;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, 0);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			invalidate_bdev(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE			/* leaf vdev */
};

/*
 * The zfs_vdev_scheduler module option has been deprecated.
 * Setting this value no longer has any effect.  It has not yet been entirely
 * removed to allow the module to be loaded if this option is specified in
 * the /etc/modprobe.d/zfs.conf file.  The following warning will be logged.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint64_t val;
	int error;

	error = kstrtoull(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_ulong(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint64_t val;
	int error;

	error = kstrtoull(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_ulong(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}