/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>
#ifdef HAVE_LINUX_BLK_CGROUP_HEADER
#include <linux/blk-cgroup.h>
#endif

typedef struct vdev_disk {
	struct block_device *vd_bdev;
	krwlock_t vd_lock;
} vdev_disk_t;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static uint_t zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[0];	/* Attached bio's */
} dio_request_t;

/*
 * BIO request failfast mask.
 */

static unsigned int zfs_vdev_failfast_mask = 1;

static fmode_t
vdev_bdev_mode(spa_mode_t spa_mode)
{
	fmode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= FMODE_READ;

	if (spa_mode & SPA_MODE_WRITE)
		mode |= FMODE_WRITE;

	return (mode);
}

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
	return (bdev->bd_contains);
}
#endif

#if defined(HAVE_BDEVNAME)
#define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
#else
static inline void
vdev_bdevname(struct block_device *bdev, char *name)
{
	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
}
#endif

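/*
 * Note: on kernels without bdevname() the "%pg" printk format specifier
 * used above expands to the short block device name (e.g. "sda" or
 * "sda1"), matching what bdevname() returned, so callers can still build
 * paths such as "/dev/sda1" from the result.
 */
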
/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity.  Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate.  Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, the usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev != bdev_whole(bdev)) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions.  Over-reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition
		 * layout is keeping less than PARTITION_END_ALIGNMENT bytes
		 * after the "reserved" EFI partition: in such cases return
		 * the device usable capacity.
		 */
		available = i_size_read(bdev_whole(bdev)->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

static void
vdev_disk_kobj_evt_post(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;
	if (vd && vd->vd_bdev) {
		spl_signal_kobj_evt(vd->vd_bdev);
	} else {
		vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
		    v->vdev_path);
	}
}

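/*
 * All opens in this file pass FMODE_EXCL together with zfs_vdev_holder,
 * so the block layer rejects any other exclusive opener (for example
 * another pool or a mkfs run) while the vdev holds the device.  The
 * matching blkdev_put() calls must pass the same mode flags.
 */
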
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open.  When expanding a
	 * partition force re-scanning the partition table if userland
	 * did not take care of this already.  We need to do this while closed
	 * in order to get an accurate updated block device size.  Then
	 * since udev may need to recreate the device links increase the
	 * open retry timeout before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
				vdev_bdevname(bdev_whole(bdev), disk_name + 5);
				/*
				 * If userland has BLKPG_RESIZE_PARTITION,
				 * then it should have updated the partition
				 * table already.  We can detect this by
				 * comparing our current physical size
				 * with that of the device.  If they are
				 * the same, then we must not have
				 * BLKPG_RESIZE_PARTITION or it failed to
				 * update the partition table online.  We
				 * fall back to rescanning the partition
				 * table from the kernel below.  However,
				 * if the capacity already reflects the
				 * updated partition, then we skip
				 * rescanning the partition table here.
				 */
				if (v->vdev_psize == bdev_capacity(bdev))
					reread_part = B_TRUE;
			}

			blkdev_put(bdev, mode | FMODE_EXCL);
		}

		if (reread_part) {
			bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL,
			    zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				blkdev_put(bdev, mode | FMODE_EXCL);
				if (error == 0) {
					timeout = MSEC2NSEC(
					    zfs_vdev_open_timeout_ms * 2);
				}
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the systems tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 *
	 * When ERESTARTSYS is returned it indicates the block device is
	 * a zvol which could not be opened due to the deadlock detection
	 * logic in zvol_open().  Extend the timeout and retry the open;
	 * subsequent attempts are expected to eventually succeed.
	 */
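	/*
	 * Illustrative timing (assumption, not a guarantee): with the default
	 * zfs_vdev_open_timeout_ms of 1000 the loop below retries a missing
	 * path roughly every 10ms for up to about one second before the open
	 * is finally failed.
	 */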
	hrtime_t start = gethrtime();
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
		bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
		    zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			/*
			 * There is no point in waiting since the device
			 * has been removed explicitly.
			 */
			if (v->vdev_removed)
				break;

			schedule_timeout(MSEC_TO_TICK(10));
		} else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) {
			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
			continue;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
		    (u_longlong_t)(gethrtime() - start),
		    (u_longlong_t)timeout);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	/* Determine the physical block size */
	int physical_block_size = bdev_physical_block_size(vd->vd_bdev);

	/* Determine the logical block size */
	int logical_block_size = bdev_logical_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = bdev_discard_supported(vd->vd_bdev);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = bdev_secure_discard_supported(vd->vd_bdev);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*physical_ashift = highbit64(MAX(physical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	*logical_ashift = highbit64(MAX(logical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		blkdev_put(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL);
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	atomic_set(&dr->dr_ref, 0);
	dr->dr_bio_count = bio_count;
	dr->dr_error = 0;

	for (int i = 0; i < dr->dr_bio_count; i++)
		dr->dr_bio[i] = NULL;

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

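/*
 * Reference counting sketch: __vdev_disk_physio() takes one reference per
 * attached bio (released by vdev_disk_physio_completion()) plus one extra
 * reference that it drops itself after submission, so the dio_request is
 * freed by whichever vdev_disk_dio_put() call observes the count reaching
 * zero, regardless of completion order.
 */
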
static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio);
#else
	(void) submit_bio(bio_data_dir(bio), bio);
#endif
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
	defined(CONFIG_PREEMPTION) && \
	defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif

/*
 * As of the Linux 5.18 kernel bio_alloc() expects a block_device struct
 * as an argument, removing the need to set it with bio_set_dev().  This
 * removes the need for all of the following compatibility code.
 */
#if !defined(HAVE_BIO_ALLOC_4ARG)

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only.  Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
 * part, moving blkg_tryget into the private one.  Define our own version.
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
#else
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
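/*
 * Note: when the open-coded vdev_blkg_tryget() above is used it reaches
 * into percpu_ref internals (__ref_is_percpu() and the embedded atomic
 * count), so it has to track the kernel's struct percpu_ref layout; the
 * ZFS_PERCPU_REF_COUNT_IN_DATA case covers kernels where the count lives
 * in the separate percpu_ref_data structure.
 */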
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro to GPL-only.  Provide a minimal version which always
 * assigns the request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}

#define	bio_associate_blkg vdev_bio_associate_blkg
#else
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_set_dev		vdev_bio_set_dev
#endif
#endif
#else
/*
 * Provide a bio_set_dev() helper function for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */
#endif /* !HAVE_BIO_ALLOC_4ARG */

static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
}

static inline struct bio *
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
    unsigned short nr_vecs)
{
	struct bio *bio;

#ifdef HAVE_BIO_ALLOC_4ARG
	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
#else
	bio = bio_alloc(gfp_mask, nr_vecs);
	if (likely(bio != NULL))
		bio_set_dev(bio, bdev);
#endif

	return (bio);
}

static inline unsigned int
vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
	    bio_size, abd_offset);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(nr_segs));
#else
	return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}

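/*
 * Common entry point for read and write zios: map the zio's abd buffer
 * onto one or more bios, attach them to a shared dio_request, and submit
 * them to the underlying block device.  Completion is reported
 * asynchronously via vdev_disk_physio_completion() and the dio_request
 * reference counting above.
 */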
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size;
	int bio_count = 16;
	int error = 0;
	struct blk_plug plug;
	unsigned short nr_vecs;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)io_offset,
		    (u_longlong_t)io_size,
		    (u_longlong_t)i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    zio->io_vd->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	dr->dr_zio = zio;

	/*
	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
	 * can cover at least 128KB and at most 1MB.  When the required number
	 * of iovec's exceeds this, we are forced to break the IO in multiple
	 * bio's and wait for them all to complete.  This is likely if the
	 * recordsize property is increased beyond 1MB.  The default
	 * bio_count=16 should typically accommodate the maximum-size zio of
	 * 16MB.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (int i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * If additional bio's are required, we have to retry, but
		 * this should be rare - see the comment above.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

	/* Submit all bio's associated with this dio */
	for (int i = 0; i < dr->dr_bio_count; i++) {
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);
	}

	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

	(void) vdev_disk_dio_put(dr);

	return (error);
}

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

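/*
 * Issue an empty flush bio to the device to force its volatile write
 * cache to stable storage.  The request completes asynchronously in
 * vdev_disk_io_flush_completion() above; an EOPNOTSUPP completion marks
 * the vdev as having no write cache so later flush requests fail fast
 * with ENOTSUP instead of being reissued.
 */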
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

static int
vdev_disk_io_trim(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;

#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
	if (zio->io_trim_flags & ZIO_TRIM_SECURE) {
		return (-blkdev_issue_secure_erase(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
	} else {
		return (-blkdev_issue_discard(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
	}
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
	unsigned long trim_flags = 0;
#if defined(BLKDEV_DISCARD_SECURE)
	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
		trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
	return (-blkdev_issue_discard(vd->vd_bdev,
	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags));
#else
#error "Unsupported kernel"
#endif
}

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state.  Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
		break;

	case ZIO_TYPE_READ:
		rw = READ;
		break;

	case ZIO_TYPE_TRIM:
		zio->io_error = vdev_disk_io_trim(zio);
		rw_exit(&vd->vd_lock);
		zio_interrupt(zio);
		return;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, 0);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

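/*
 * Read and write zios handed to __vdev_disk_physio() above complete
 * asynchronously through the bio end_io callbacks; only setup failures
 * (for example an out-of-bounds access or ENOMEM while building the bios)
 * are reported synchronously via zio_interrupt() in vdev_disk_io_start().
 */
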
static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed, this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (!zfs_check_disk_status(vd->vd_bdev)) {
			invalidate_bdev(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};

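/*
 * The ops table above is what ties this file into the generic vdev code:
 * a leaf vdev whose type is "disk" (VDEV_TYPE_DISK) is dispatched through
 * these callbacks.  Entries left NULL are optional hooks this vdev type
 * does not implement.
 */
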
/*
 * The zfs_vdev_scheduler module option has been deprecated.  Setting this
 * value no longer has any effect.  It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file.  The following warning will be logged.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

static const char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
	"Timeout before determining that a device is missing");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
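
/*
 * Illustrative usage (assuming the usual module parameter sysfs layout of
 * a Linux build): both tunables above are writable at runtime, e.g.
 *
 *	echo 3000 > /sys/module/zfs/parameters/zfs_vdev_open_timeout_ms
 *	echo 7 > /sys/module/zfs/parameters/zfs_vdev_failfast_mask
 *
 * where the failfast mask bits select device (1), transport (2) and
 * driver (4) failfast behavior as described above.
 */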