/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>

typedef struct vdev_disk {
	struct block_device *vd_bdev;
	krwlock_t vd_lock;
} vdev_disk_t;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static unsigned zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[0];	/* Attached bio's */
} dio_request_t;
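
/*
 * Translate the SPA open mode (SPA_MODE_READ/SPA_MODE_WRITE) into the
 * fmode_t flags used when opening and releasing the backing block device.
 */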
static fmode_t
vdev_bdev_mode(spa_mode_t spa_mode)
{
	fmode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= FMODE_READ;

	if (spa_mode & SPA_MODE_WRITE)
		mode |= FMODE_WRITE;

	return (mode);
}

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity, or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'.  efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate.  Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, the usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions.  Over-reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition layout
		 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
		 * "reserved" EFI partition: in such cases return the device
		 * usable capacity.
		 */
		available = i_size_read(bdev->bd_contains->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}
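
/*
 * Open the backing block device for a leaf vdev.  The device is always
 * opened exclusively by path; a transient ENOENT (for example, udev
 * briefly removing and recreating the device links) is retried until the
 * open timeout expires.  On success the usable and maximum expansion
 * sizes and the logical/physical ashift values are reported back to the
 * caller.
 */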
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open.  When expanding a
	 * partition force re-scanning the partition table while closed
	 * in order to get an accurate updated block device size.  Then
	 * since udev may need to recreate the device links increase the
	 * open retry timeout before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev->bd_contains) {
				bdevname(bdev->bd_contains, disk_name + 5);
				reread_part = B_TRUE;
			}

			blkdev_put(bdev, mode | FMODE_EXCL);
		}

		if (reread_part) {
			bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL,
			    zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				blkdev_put(bdev, mode | FMODE_EXCL);
				if (error == 0) {
					timeout = MSEC2NSEC(
					    zfs_vdev_open_timeout_ms * 2);
				}
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	hrtime_t start = gethrtime();
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
		bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
		    zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			schedule_timeout(MSEC_TO_TICK(10));
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
		    (u_longlong_t)(gethrtime() - start),
		    (u_longlong_t)timeout);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	struct request_queue *q = bdev_get_queue(vd->vd_bdev);

	/* Determine the physical block size */
	int physical_block_size = bdev_physical_block_size(vd->vd_bdev);

	/* Determine the logical block size */
	int logical_block_size = bdev_logical_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = !!blk_queue_discard(q);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = !!blk_queue_discard_secure(q);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(q);

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*physical_ashift = highbit64(MAX(physical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	*logical_ashift = highbit64(MAX(logical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	return (0);
}
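
/*
 * Release the block device acquired by vdev_disk_open() and free the
 * per-vdev vdev_disk_t, unless the vdev is only being reopened.
 */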
static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		blkdev_put(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL);
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}
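
/*
 * A dio_request_t tracks all of the bio's issued on behalf of a single
 * zio.  It is reference counted; each attached bio holds a reference and
 * the request (and its parent zio) is completed only when the last
 * reference is dropped in vdev_disk_dio_put().
 */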
static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	if (dr) {
		atomic_set(&dr->dr_ref, 0);
		dr->dr_bio_count = bio_count;
		dr->dr_error = 0;

		for (i = 0; i < dr->dr_bio_count; i++)
			dr->dr_bio[i] = NULL;
	}

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only.  Define our
 * own version when needed which uses rcu_read_lock_sched().
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
		rc = atomic_long_inc_not_zero(&ref->count);
	}

	rcu_read_unlock_sched();

	return (rc);
}
#elif defined(HAVE_BLKG_TRYGET)
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro.  Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
	struct request_queue *q = bio->bi_disk->queue;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_associate_blkg vdev_bio_associate_blkg
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */

static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
}
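
/*
 * Map the zio's ABD buffer onto one or more bio's and submit them to the
 * block device.  If the initial guess at the number of required bio's is
 * too small, the dio is reallocated with twice as many slots and the
 * mapping is retried.
 */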
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0, error = 0;
	struct blk_plug plug;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    io_offset, io_size, i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (SET_ERROR(ENOMEM));

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO in multiple bio's and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
		    BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		bio_set_dev(dr->dr_bio[i], bdev);
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

	(void) vdev_disk_dio_put(dr);

	return (error);
}

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_dev(bio, bdev);
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}
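
/*
 * Entry point for the ZIO pipeline.  Cache flush (ioctl) and TRIM
 * requests are dispatched here directly; reads and writes are passed to
 * __vdev_disk_physio() and completed asynchronously by the bio callbacks.
 */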
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	unsigned long trim_flags = 0;
	int rw, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state.  Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
		break;

	case ZIO_TYPE_READ:
		rw = READ;
		break;

	case ZIO_TYPE_TRIM:
#if defined(BLKDEV_DISCARD_SECURE)
		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
			trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
		zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
		    trim_flags);

		rw_exit(&vd->vd_lock);
		zio_interrupt(zio);
		return;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, 0);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}
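
/*
 * Called by the ZIO pipeline when the I/O completes.  After an EIO,
 * check whether the media has changed and, if so, request asynchronous
 * removal of the device from the configuration.
 */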
static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			invalidate_bdev(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE			/* leaf vdev */
};

/*
 * The zfs_vdev_scheduler module option has been deprecated.  Setting this
 * value no longer has any effect.  It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file.  The following warning will be logged.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint64_t val;
	int error;

	error = kstrtoull(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_ulong(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint64_t val;
	int error;

	error = kstrtoull(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_ulong(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}