/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>
#ifdef HAVE_LINUX_BLK_CGROUP_HEADER
#include <linux/blk-cgroup.h>
#endif

typedef struct vdev_disk {
	struct block_device *vd_bdev;
	krwlock_t vd_lock;
} vdev_disk_t;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static uint_t zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[];	/* Attached bio's */
} dio_request_t;

/*
 * BIO request failfast mask.
 */

static unsigned int zfs_vdev_failfast_mask = 1;

static fmode_t
vdev_bdev_mode(spa_mode_t spa_mode)
{
	fmode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= FMODE_READ;

	if (spa_mode & SPA_MODE_WRITE)
		mode |= FMODE_WRITE;

	return (mode);
}

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

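/*
 * Compatibility helpers: older kernels expose the containing whole-disk
 * device as bd_contains and provide bdevname(); newer kernels provide
 * bdev_whole() and print the device name with the "%pg" format instead.
 */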
#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
	return (bdev->bd_contains);
}
#endif

#if defined(HAVE_BDEVNAME)
#define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
#else
static inline void
vdev_bdevname(struct block_device *bdev, char *name)
{
	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
}
#endif

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity, or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate. Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, the usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev != bdev_whole(bdev)) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions.  Over reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition
		 * layout is keeping less than PARTITION_END_ALIGNMENT bytes
		 * after the "reserved" EFI partition: in such cases return
		 * the device usable capacity.
		 */
		available = i_size_read(bdev_whole(bdev)->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

static void
vdev_disk_kobj_evt_post(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;
	if (vd && vd->vd_bdev) {
		spl_signal_kobj_evt(vd->vd_bdev);
	} else {
		vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
		    v->vdev_path);
	}
}

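/*
 * Open the backing block device exclusively, re-reading the partition table
 * when a wholedisk vdev is being expanded and retrying transient open
 * failures while udev settles.  On success the logical/physical block sizes,
 * usable and maximum capacities, and TRIM capabilities are reported back to
 * the common vdev code.
 */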
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open.  When expanding a
	 * partition, force re-scanning of the partition table if userland
	 * did not take care of this already.  We need to do this while
	 * closed in order to get an accurate updated block device size.
	 * Then, since udev may need to recreate the device links, increase
	 * the open retry timeout before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
				vdev_bdevname(bdev_whole(bdev), disk_name + 5);
				/*
				 * If userland has BLKPG_RESIZE_PARTITION,
				 * then it should have updated the partition
				 * table already. We can detect this by
				 * comparing our current physical size
				 * with that of the device. If they are
				 * the same, then we must not have
				 * BLKPG_RESIZE_PARTITION or it failed to
				 * update the partition table online. We
				 * fallback to rescanning the partition
				 * table from the kernel below. However,
				 * if the capacity already reflects the
				 * updated partition, then we skip
				 * rescanning the partition table here.
				 */
				if (v->vdev_psize == bdev_capacity(bdev))
					reread_part = B_TRUE;
			}

			blkdev_put(bdev, mode | FMODE_EXCL);
		}

		if (reread_part) {
			bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL,
			    zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				blkdev_put(bdev, mode | FMODE_EXCL);
				if (error == 0) {
					timeout = MSEC2NSEC(
					    zfs_vdev_open_timeout_ms * 2);
				}
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the systems tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 *
	 * When ERESTARTSYS is returned it indicates the block device is
	 * a zvol which could not be opened due to the deadlock detection
	 * logic in zvol_open().  Extend the timeout and retry the open;
	 * subsequent attempts are expected to eventually succeed.
	 */
	hrtime_t start = gethrtime();
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
		bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
		    zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			/*
			 * There is no point in waiting since the device
			 * was removed explicitly.
			 */
			if (v->vdev_removed)
				break;

			schedule_timeout(MSEC_TO_TICK(10));
		} else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) {
			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
			continue;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
		    (u_longlong_t)(gethrtime() - start),
		    (u_longlong_t)timeout);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	/* Determine the physical block size */
	int physical_block_size = bdev_physical_block_size(vd->vd_bdev);

	/* Determine the logical block size */
	int logical_block_size = bdev_logical_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = bdev_discard_supported(vd->vd_bdev);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = bdev_secure_discard_supported(vd->vd_bdev);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*physical_ashift = highbit64(MAX(physical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	*logical_ashift = highbit64(MAX(logical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	return (0);
}

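/*
 * Drop the exclusive block device reference taken in vdev_disk_open() and
 * free the per-vdev state, unless the vdev is in the process of being
 * reopened.
 */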
static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		blkdev_put(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL);
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	atomic_set(&dr->dr_ref, 0);
	dr->dr_bio_count = bio_count;
	dr->dr_error = 0;

	for (int i = 0; i < dr->dr_bio_count; i++)
		dr->dr_bio[i] = NULL;

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static void
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interrupt is called only once with the correct zio.
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}
}

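/*
 * Per-bio completion callback.  The first error observed across the dio's
 * bios is recorded, and the reference taken when the bio was constructed is
 * dropped; the parent zio is completed from vdev_disk_dio_put() once the
 * final reference goes away.
 */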
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	vdev_disk_dio_put(dr);
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio);
#else
	(void) submit_bio(bio_data_dir(bio), bio);
#endif
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
    defined(CONFIG_PREEMPTION) && \
    defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif

/*
 * As of the Linux 5.18 kernel, bio_alloc() expects a block_device struct
 * as an argument, removing the need to set it with bio_set_dev().  This
 * removes the need for all of the following compatibility code.
 */
#if !defined(HAVE_BIO_ALLOC_4ARG)

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only.  Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
 * part, moving blkg_tryget into the private one. Define our own version.
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
#else
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro.  Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}

#define	bio_associate_blkg vdev_bio_associate_blkg
#else
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_set_dev		vdev_bio_set_dev
#endif
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */
#endif /* !HAVE_BIO_ALLOC_4ARG */

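/*
 * Submit a bio to the block layer with the task's current bio_list
 * temporarily set aside, so the bio is dispatched immediately rather than
 * being queued on the caller's in-flight list.
 */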
static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
}

static inline struct bio *
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
    unsigned short nr_vecs)
{
	struct bio *bio;

#ifdef HAVE_BIO_ALLOC_4ARG
	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
#else
	bio = bio_alloc(gfp_mask, nr_vecs);
	if (likely(bio != NULL))
		bio_set_dev(bio, bdev);
#endif

	return (bio);
}

static inline unsigned int
vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
	    bio_size, abd_offset);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(nr_segs));
#else
	return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}

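/*
 * Map the zio's ABD buffer onto one or more bios and submit them to the
 * block device.  Returns 0 once all bios have been submitted; the zio is
 * completed asynchronously by vdev_disk_physio_completion() when the last
 * bio finishes.
 */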
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size;
	int bio_count = 16;
	int error = 0;
	struct blk_plug plug;
	unsigned short nr_vecs;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)io_offset,
		    (u_longlong_t)io_size,
		    (u_longlong_t)i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    zio->io_vd->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	dr->dr_zio = zio;

	/*
	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
	 * can cover at least 128KB and at most 1MB.  When the required number
	 * of iovec's exceeds this, we are forced to break the IO in multiple
	 * bio's and wait for them all to complete.  This is likely if the
	 * recordsize property is increased beyond 1MB.  The default
	 * bio_count=16 should typically accommodate the maximum-size zio of
	 * 16MB.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (int i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * If additional bio's are required, we have to retry, but
		 * this should be rare - see the comment above.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

	/* Submit all bio's associated with this dio */
	for (int i = 0; i < dr->dr_bio_count; i++) {
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);
	}

	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

	vdev_disk_dio_put(dr);

	return (error);
}

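/*
 * Completion callback for the cache-flush bio issued by vdev_disk_io_flush().
 * An EOPNOTSUPP result marks the vdev as having no write cache so further
 * flushes are not attempted.
 */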
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

static int
vdev_disk_io_trim(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;

#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
	if (zio->io_trim_flags & ZIO_TRIM_SECURE) {
		return (-blkdev_issue_secure_erase(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
	} else {
		return (-blkdev_issue_discard(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
	}
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
	unsigned long trim_flags = 0;
#if defined(BLKDEV_DISCARD_SECURE)
	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
		trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
	return (-blkdev_issue_discard(vd->vd_bdev,
	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags));
#else
#error "Unsupported kernel"
#endif
}

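/*
 * I/O dispatch entry point for the disk vdev.  Flush ioctls, reads, writes
 * and TRIM requests are issued to the backing block device while holding
 * vd_lock as reader; anything else completes immediately with ENOTSUP.
 */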
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state.  Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
		break;

	case ZIO_TYPE_READ:
		rw = READ;
		break;

	case ZIO_TYPE_TRIM:
		zio->io_error = vdev_disk_io_trim(zio);
		rw_exit(&vd->vd_lock);
		zio_interrupt(zio);
		return;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, 0);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (!zfs_check_disk_status(vd->vd_bdev)) {
			invalidate_bdev(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

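/*
 * vdev_disk_hold() and vdev_disk_rele() are effectively placeholders on
 * Linux; the block device reference is managed entirely by vdev_disk_open()
 * and vdev_disk_close().
 */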
static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};

/*
 * The zfs_vdev_scheduler module option has been deprecated.  Setting this
 * value no longer has any effect.  It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file.  The following warning will be logged.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

static const char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
	"Timeout before determining that a device is missing");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
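/*
 * Illustrative usage (example values, not recommendations): both parameters
 * above may be set at module load time via /etc/modprobe.d/zfs.conf, e.g.
 *
 *   options zfs zfs_vdev_open_timeout_ms=3000 zfs_vdev_failfast_mask=3
 *
 * or adjusted at runtime through /sys/module/zfs/parameters/ since they are
 * declared ZMOD_RW.
 */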