/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>
#ifdef HAVE_LINUX_BLK_CGROUP_HEADER
#include <linux/blk-cgroup.h>
#endif

typedef struct vdev_disk {
	struct block_device *vd_bdev;
	krwlock_t vd_lock;
} vdev_disk_t;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static uint_t zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t *dr_zio;			/* Parent ZIO */
	atomic_t dr_ref;		/* References */
	int dr_error;			/* Bio error */
	int dr_bio_count;		/* Count of bio's */
	struct bio *dr_bio[];		/* Attached bio's */
} dio_request_t;

/*
 * BIO request failfast mask.
 */

static unsigned int zfs_vdev_failfast_mask = 1;
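
/*
 * Translate the SPA open mode into the open mode expected by the kernel
 * block layer.  Kernels which define HAVE_BLK_MODE_T take a blk_mode_t
 * (BLK_OPEN_*), while older kernels use an fmode_t (FMODE_*).
 */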
#ifdef HAVE_BLK_MODE_T
static blk_mode_t
#else
static fmode_t
#endif
vdev_bdev_mode(spa_mode_t spa_mode)
{
#ifdef HAVE_BLK_MODE_T
	blk_mode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= BLK_OPEN_READ;

	if (spa_mode & SPA_MODE_WRITE)
		mode |= BLK_OPEN_WRITE;
#else
	fmode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= FMODE_READ;

	if (spa_mode & SPA_MODE_WRITE)
		mode |= FMODE_WRITE;
#endif

	return (mode);
}

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
	return (bdev->bd_contains);
}
#endif

#if defined(HAVE_BDEVNAME)
#define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
#else
static inline void
vdev_bdevname(struct block_device *bdev, char *name)
{
	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
}
#endif

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity, or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'.  efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate.  Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, the usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev != bdev_whole(bdev)) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions.  Over-reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition
		 * layout is keeping less than PARTITION_END_ALIGNMENT bytes
		 * after the "reserved" EFI partition: in such cases return
		 * the device usable capacity.
		 */
		available = i_size_read(bdev_whole(bdev)->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

static void
vdev_disk_kobj_evt_post(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;
	if (vd && vd->vd_bdev) {
		spl_signal_kobj_evt(vd->vd_bdev);
	} else {
		vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
		    v->vdev_path);
	}
}

#if !defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
/*
 * Define a dummy struct blk_holder_ops for kernel versions
 * prior to 6.5.
 */
struct blk_holder_ops {};
#endif

static struct block_device *
vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder,
    const struct blk_holder_ops *hops)
{
#ifdef HAVE_BLKDEV_GET_BY_PATH_4ARG
	return (blkdev_get_by_path(path,
	    vdev_bdev_mode(mode) | BLK_OPEN_EXCL, holder, hops));
#else
	return (blkdev_get_by_path(path,
	    vdev_bdev_mode(mode) | FMODE_EXCL, holder));
#endif
}

static void
vdev_blkdev_put(struct block_device *bdev, spa_mode_t mode, void *holder)
{
#ifdef HAVE_BLKDEV_PUT_HOLDER
	return (blkdev_put(bdev, holder));
#else
	return (blkdev_put(bdev, vdev_bdev_mode(mode) | FMODE_EXCL));
#endif
}

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct block_device *bdev;
#ifdef HAVE_BLK_MODE_T
	blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
#else
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
#endif
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open.  When expanding a
	 * partition force re-scanning the partition table if userland
	 * did not take care of this already.  We need to do this while closed
	 * in order to get an accurate updated block device size.  Then
	 * since udev may need to recreate the device links increase the
	 * open retry timeout before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
				vdev_bdevname(bdev_whole(bdev), disk_name + 5);
				/*
				 * If userland has BLKPG_RESIZE_PARTITION,
				 * then it should have updated the partition
				 * table already.  We can detect this by
				 * comparing our current physical size
				 * with that of the device.  If they are
				 * the same, then we must not have
				 * BLKPG_RESIZE_PARTITION or it failed to
				 * update the partition table online.  We
				 * fall back to rescanning the partition
				 * table from the kernel below.  However,
				 * if the capacity already reflects the
				 * updated partition, then we skip
				 * rescanning the partition table here.
				 */
				if (v->vdev_psize == bdev_capacity(bdev))
					reread_part = B_TRUE;
			}

			vdev_blkdev_put(bdev, mode, zfs_vdev_holder);
		}

		if (reread_part) {
			bdev = vdev_blkdev_get_by_path(disk_name, mode,
			    zfs_vdev_holder, NULL);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				vdev_blkdev_put(bdev, mode, zfs_vdev_holder);
				if (error == 0) {
					timeout = MSEC2NSEC(
					    zfs_vdev_open_timeout_ms * 2);
				}
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 *
	 * When ERESTARTSYS is returned it indicates the block device is
	 * a zvol which could not be opened due to the deadlock detection
	 * logic in zvol_open().  Extend the timeout and retry the open;
	 * subsequent attempts are expected to eventually succeed.
	 */
	hrtime_t start = gethrtime();
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
		bdev = vdev_blkdev_get_by_path(v->vdev_path, mode,
		    zfs_vdev_holder, NULL);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			/*
			 * There is no point in waiting since the device
			 * was explicitly removed.
			 */
			if (v->vdev_removed)
				break;

			schedule_timeout(MSEC_TO_TICK(10));
		} else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) {
			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
			continue;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
		    (u_longlong_t)(gethrtime() - start),
		    (u_longlong_t)timeout);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	/* Determine the physical block size */
	int physical_block_size = bdev_physical_block_size(vd->vd_bdev);

	/* Determine the logical block size */
	int logical_block_size = bdev_logical_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = bdev_discard_supported(vd->vd_bdev);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = bdev_secure_discard_supported(vd->vd_bdev);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*physical_ashift = highbit64(MAX(physical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	*logical_ashift = highbit64(MAX(logical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		vdev_blkdev_put(vd->vd_bdev, spa_mode(v->vdev_spa),
		    zfs_vdev_holder);
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	atomic_set(&dr->dr_ref, 0);
	dr->dr_bio_count = bio_count;
	dr->dr_error = 0;

	for (int i = 0; i < dr->dr_bio_count; i++)
		dr->dr_bio[i] = NULL;

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}
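
/*
 * Reference counting for a dio_request: __vdev_disk_physio() takes one
 * reference for each attached bio plus an extra one for itself while
 * submitting.  Each bio completion drops a reference in
 * vdev_disk_physio_completion(), and the final vdev_disk_dio_put() frees
 * the request and completes the parent zio.
 */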
static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static void
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_delay_interrupt() is called only once with the
	 * correct zio.
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	vdev_disk_dio_put(dr);
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio);
#else
	(void) submit_bio(bio_data_dir(bio), bio);
#endif
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
	defined(CONFIG_PREEMPTION) && \
	defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif

/*
 * As of the Linux 5.18 kernel, bio_alloc() expects a block_device struct
 * as an argument, removing the need to set it with bio_set_dev().  This
 * removes the need for all of the following compatibility code.
 */
#if !defined(HAVE_BIO_ALLOC_4ARG)

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only.  Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
 * part, moving blkg_tryget into the private one.  Define our own version.
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
#else
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro.  Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}

#define	bio_associate_blkg	vdev_bio_associate_blkg
#else
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_set_dev		vdev_bio_set_dev
#endif
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */
#endif /* !HAVE_BIO_ALLOC_4ARG */
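
/*
 * Submit a bio with the calling task's bio_list temporarily cleared.
 * When the kernel is already inside submit_bio() on this task, recursively
 * submitted bios are normally queued on current->bio_list and dispatched
 * later by the outer loop; clearing the list here causes this bio to be
 * issued immediately instead of being deferred.
 */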
static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
}

static inline struct bio *
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
    unsigned short nr_vecs)
{
	struct bio *bio;

#ifdef HAVE_BIO_ALLOC_4ARG
	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
#else
	bio = bio_alloc(gfp_mask, nr_vecs);
	if (likely(bio != NULL))
		bio_set_dev(bio, bdev);
#endif

	return (bio);
}

static inline unsigned int
vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
	    bio_size, abd_offset);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(nr_segs));
#else
	return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}

static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size;
	int bio_count = 16;
	int error = 0;
	struct blk_plug plug;
	unsigned short nr_vecs;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)io_offset,
		    (u_longlong_t)io_size,
		    (u_longlong_t)i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    zio->io_vd->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags,
		    zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2,
		    zfs_vdev_failfast_mask & 4);
	}

	dr->dr_zio = zio;

	/*
	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
	 * can cover at least 128KB and at most 1MB.  When the required number
	 * of iovec's exceeds this, we are forced to break the IO into
	 * multiple bio's and wait for them all to complete.  This is likely
	 * if the recordsize property is increased beyond 1MB.  The default
	 * bio_count=16 should typically accommodate the maximum-size zio of
	 * 16MB.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (int i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * If additional bio's are required, we have to retry, but
		 * this should be rare - see the comment above.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

	/* Submit all bio's associated with this dio */
	for (int i = 0; i < dr->dr_bio_count; i++) {
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);
	}

	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

	vdev_disk_dio_put(dr);

	return (error);
}

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

static int
vdev_disk_io_trim(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;

#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
	if (zio->io_trim_flags & ZIO_TRIM_SECURE) {
		return (-blkdev_issue_secure_erase(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
	} else {
		return (-blkdev_issue_discard(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
	}
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
	unsigned long trim_flags = 0;
#if defined(BLKDEV_DISCARD_SECURE)
	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
		trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
	return (-blkdev_issue_discard(vd->vd_bdev,
	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags));
#else
#error "Unsupported kernel"
#endif
}
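
/*
 * I/O dispatch takes vd_lock as reader for the duration of the submission
 * path below, while vdev_disk_open() takes it as writer when swapping
 * vd_bdev, so an in-progress reopen cannot pull the block device out from
 * under an outstanding submission.
 */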
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state.  Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
		break;

	case ZIO_TYPE_READ:
		rw = READ;
		break;

	case ZIO_TYPE_TRIM:
		zio->io_error = vdev_disk_io_trim(zio);
		rw_exit(&vd->vd_lock);
		zio_interrupt(zio);
		return;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, 0);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (!zfs_check_disk_status(vd->vd_bdev)) {
			invalidate_bdev(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};

/*
 * The zfs_vdev_scheduler module option has been deprecated.  Setting this
 * value no longer has any effect.  It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file.  The following warning will be logged.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

static const char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
	"Timeout before determining that a device is missing");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");