/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>
#ifdef HAVE_LINUX_BLK_CGROUP_HEADER
#include <linux/blk-cgroup.h>
#endif

typedef struct vdev_disk {
	struct block_device	*vd_bdev;
	krwlock_t		vd_lock;
} vdev_disk_t;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static uint_t zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t			*dr_zio;	/* Parent ZIO */
	atomic_t		dr_ref;		/* References */
	int			dr_error;	/* Bio error */
	int			dr_bio_count;	/* Count of bio's */
	struct bio		*dr_bio[];	/* Attached bio's */
} dio_request_t;

/*
 * BIO request failfast mask.
 */

static unsigned int zfs_vdev_failfast_mask = 1;

#ifdef HAVE_BLK_MODE_T
static blk_mode_t
#else
static fmode_t
#endif
vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive)
{
#ifdef HAVE_BLK_MODE_T
	blk_mode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= BLK_OPEN_READ;

	if (spa_mode & SPA_MODE_WRITE)
		mode |= BLK_OPEN_WRITE;

	if (exclusive)
		mode |= BLK_OPEN_EXCL;
#else
	fmode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= FMODE_READ;

	if (spa_mode & SPA_MODE_WRITE)
		mode |= FMODE_WRITE;

	if (exclusive)
		mode |= FMODE_EXCL;
#endif

	return (mode);
}
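
/*
 * Illustrative note (not part of the original driver): vdev_bdev_mode()
 * simply translates ZFS open modes into the kernel's block-open flags.
 * For example, on a kernel new enough to provide blk_mode_t
 * (HAVE_BLK_MODE_T), opening a leaf vdev read-write and exclusively:
 *
 *	blk_mode_t m = vdev_bdev_mode(SPA_MODE_READ | SPA_MODE_WRITE, B_TRUE);
 *	// m == (BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_EXCL)
 *
 * On older kernels the same call yields FMODE_READ | FMODE_WRITE |
 * FMODE_EXCL instead.
 */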

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
	return (bdev->bd_contains);
}
#endif

#if defined(HAVE_BDEVNAME)
#define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
#else
static inline void
vdev_bdevname(struct block_device *bdev, char *name)
{
	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
}
#endif

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity. Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate. Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, the usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev != bdev_whole(bdev)) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions. Over reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition
		 * layout is keeping less than PARTITION_END_ALIGNMENT bytes
		 * after the "reserved" EFI partition: in such cases return
		 * the device usable capacity.
		 */
		available = i_size_read(bdev_whole(bdev)->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}
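
/*
 * Illustrative note (not part of the original driver): the deduction above
 * is a worst-case bound, in bytes, on the space consumed by the partition
 * layout of a wholedisk vdev. Assuming 512-byte sectors (SECTOR_BITS == 9),
 * the reserved EFI partition alone accounts for
 *
 *	EFI_MIN_RESV_SIZE << SECTOR_BITS == 16384 * 512 == 8 MiB
 *
 * with NEW_START_BLOCK and PARTITION_END_ALIGNMENT (defined by the EFI
 * support code) covering the space ahead of the first partition and the
 * end-of-disk alignment slack. Because the result is only used to report a
 * potential 'zpool list' expandsize, slightly over-deducting is harmless.
 */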

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

static void
vdev_disk_kobj_evt_post(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;
	if (vd && vd->vd_bdev) {
		spl_signal_kobj_evt(vd->vd_bdev);
	} else {
		vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
		    v->vdev_path);
	}
}

#if !defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
/*
 * Define a dummy struct blk_holder_ops for kernel versions
 * prior to 6.5.
 */
struct blk_holder_ops {};
#endif

static struct block_device *
vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder,
    const struct blk_holder_ops *hops)
{
#ifdef HAVE_BLKDEV_GET_BY_PATH_4ARG
	return (blkdev_get_by_path(path,
	    vdev_bdev_mode(mode, B_TRUE), holder, hops));
#else
	return (blkdev_get_by_path(path,
	    vdev_bdev_mode(mode, B_TRUE), holder));
#endif
}

static void
vdev_blkdev_put(struct block_device *bdev, spa_mode_t mode, void *holder)
{
#ifdef HAVE_BLKDEV_PUT_HOLDER
	return (blkdev_put(bdev, holder));
#else
	return (blkdev_put(bdev, vdev_bdev_mode(mode, B_TRUE)));
#endif
}

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct block_device *bdev;
#ifdef HAVE_BLK_MODE_T
	blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
#else
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
#endif
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open. When expanding a
	 * partition force re-scanning the partition table if userland
	 * did not take care of this already. We need to do this while closed
	 * in order to get an accurate updated block device size. Then,
	 * since udev may need to recreate the device links, increase the
	 * open retry timeout before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
				vdev_bdevname(bdev_whole(bdev), disk_name + 5);
				/*
				 * If userland has BLKPG_RESIZE_PARTITION,
				 * then it should have updated the partition
				 * table already. We can detect this by
				 * comparing our current physical size
				 * with that of the device. If they are
				 * the same, then we must not have
				 * BLKPG_RESIZE_PARTITION or it failed to
				 * update the partition table online. We
				 * fallback to rescanning the partition
				 * table from the kernel below. However,
				 * if the capacity already reflects the
				 * updated partition, then we skip
				 * rescanning the partition table here.
				 */
				if (v->vdev_psize == bdev_capacity(bdev))
					reread_part = B_TRUE;
			}

			vdev_blkdev_put(bdev, mode, zfs_vdev_holder);
		}

		if (reread_part) {
			bdev = vdev_blkdev_get_by_path(disk_name, mode,
			    zfs_vdev_holder, NULL);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				vdev_blkdev_put(bdev, mode, zfs_vdev_holder);
				if (error == 0) {
					timeout = MSEC2NSEC(
					    zfs_vdev_open_timeout_ms * 2);
				}
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time. This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue. If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved. This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit. It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events. This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point. Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up. In
	 * practice delays have been observed to be on the order of 100ms.
	 *
	 * When ERESTARTSYS is returned it indicates the block device is
	 * a zvol which could not be opened due to the deadlock detection
	 * logic in zvol_open(). Extend the timeout and retry the open;
	 * subsequent attempts are expected to eventually succeed.
	 */
	hrtime_t start = gethrtime();
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
		bdev = vdev_blkdev_get_by_path(v->vdev_path, mode,
		    zfs_vdev_holder, NULL);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			/*
			 * There is no point in waiting since the device
			 * was removed explicitly.
			 */
			if (v->vdev_removed)
				break;

			schedule_timeout(MSEC_TO_TICK(10));
		} else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) {
			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
			continue;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
		    (u_longlong_t)(gethrtime() - start),
		    (u_longlong_t)timeout);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	/* Determine the physical block size */
	int physical_block_size = bdev_physical_block_size(vd->vd_bdev);

	/* Determine the logical block size */
	int logical_block_size = bdev_logical_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = bdev_discard_supported(vd->vd_bdev);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = bdev_secure_discard_supported(vd->vd_bdev);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*physical_ashift = highbit64(MAX(physical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	*logical_ashift = highbit64(MAX(logical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	return (0);
}
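
/*
 * Illustrative note (not part of the original driver): the ashift values
 * reported above are simply log2 of the device's block sizes, floored at
 * SPA_MINBLOCKSIZE (512 bytes). For example:
 *
 *	highbit64(MAX(512, SPA_MINBLOCKSIZE)) - 1 == 9    (512-byte sectors)
 *	highbit64(MAX(4096, SPA_MINBLOCKSIZE)) - 1 == 12  (4 KiB sectors)
 *
 * so a 512e/4Kn drive typically reports a logical ashift of 9 and a
 * physical ashift of 12.
 */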

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		vdev_blkdev_put(vd->vd_bdev, spa_mode(v->vdev_spa),
		    zfs_vdev_holder);
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	atomic_set(&dr->dr_ref, 0);
	dr->dr_bio_count = bio_count;
	dr->dr_error = 0;

	for (int i = 0; i < dr->dr_bio_count; i++)
		dr->dr_bio[i] = NULL;

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static void
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}
}
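
/*
 * Illustrative note (not part of the original driver): the dr_ref counter
 * follows a simple protocol. __vdev_disk_physio() takes one reference per
 * bio it attaches (released by the bio's completion callback below) plus
 * one extra reference held across submission so the dio_request cannot be
 * freed while bios are still being issued:
 *
 *	vdev_disk_dio_get(dr);		// per-bio, before submit
 *	...
 *	vdev_disk_dio_get(dr);		// extra submission reference
 *	vdev_submit_bio(...);		// each completion calls _dio_put()
 *	vdev_disk_dio_put(dr);		// drop submission reference
 *
 * Whichever _put() drops the count to zero frees the request and completes
 * the parent zio exactly once.
 */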

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	vdev_disk_dio_put(dr);
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio);
#else
	(void) submit_bio(bio_data_dir(bio), bio);
#endif
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
	defined(CONFIG_PREEMPTION) && \
	defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x)	preempt_schedule(x)
#endif

/*
 * As of the Linux 5.18 kernel, bio_alloc() expects a block_device struct
 * as an argument, removing the need to set it with bio_set_dev(). This
 * removes the need for all of the following compatibility code.
 */
#if !defined(HAVE_BIO_ALLOC_4ARG)

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only. Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
 * part, moving blkg_tryget into the private one. Define our own version.
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
#else
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro. Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}

#define	bio_associate_blkg	vdev_bio_associate_blkg
#else
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_set_dev		vdev_bio_set_dev
#endif
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */
#endif /* !HAVE_BIO_ALLOC_4ARG */

static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
}

static inline struct bio *
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
    unsigned short nr_vecs)
{
	struct bio *bio;

#ifdef HAVE_BIO_ALLOC_4ARG
	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
#else
	bio = bio_alloc(gfp_mask, nr_vecs);
	if (likely(bio != NULL))
		bio_set_dev(bio, bdev);
#endif

	return (bio);
}

static inline unsigned int
vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
	    bio_size, abd_offset);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(nr_segs));
#else
	return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}

static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size;
	int bio_count = 16;
	int error = 0;
	struct blk_plug plug;
	unsigned short nr_vecs;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)io_offset,
		    (u_longlong_t)io_size,
		    (u_longlong_t)i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    zio->io_vd->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	dr->dr_zio = zio;

	/*
	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
	 * can cover at least 128KB and at most 1MB. When the required number
	 * of iovec's exceeds this, we are forced to break the IO in multiple
	 * bio's and wait for them all to complete. This is likely if the
	 * recordsize property is increased beyond 1MB. The default
	 * bio_count=16 should typically accommodate the maximum-size zio of
	 * 16MB.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (int i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * If additional bio's are required, we have to retry, but
		 * this should be rare - see the comment above.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

	/* Submit all bio's associated with this dio */
	for (int i = 0; i < dr->dr_bio_count; i++) {
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);
	}

	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

	vdev_disk_dio_put(dr);

	return (error);
}

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

static int
vdev_disk_io_trim(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;

#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
	if (zio->io_trim_flags & ZIO_TRIM_SECURE) {
		return (-blkdev_issue_secure_erase(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
	} else {
		return (-blkdev_issue_discard(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
	}
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
	unsigned long trim_flags = 0;
#if defined(BLKDEV_DISCARD_SECURE)
	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
		trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
	return (-blkdev_issue_discard(vd->vd_bdev,
	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags));
#else
#error "Unsupported kernel"
#endif
}

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state. Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
		break;

	case ZIO_TYPE_READ:
		rw = READ;
		break;

	case ZIO_TYPE_TRIM:
		zio->io_error = vdev_disk_io_trim(zio);
		rw_exit(&vd->vd_lock);
		zio_interrupt(zio);
		return;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, 0);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media. If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (!zfs_check_disk_status(vd->vd_bdev)) {
			invalidate_bdev(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};

/*
 * The zfs_vdev_scheduler module option has been deprecated. Setting this
 * value no longer has any effect. It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file. The following warning will be logged.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

static const char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
	param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
	"Timeout before determining that a device is missing");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");