1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. 24 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 25 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. 26 * LLNL-CODE-403049. 27 * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 28 * Copyright (c) 2023, 2024, 2025, Klara, Inc. 29 */ 30 31 #include <sys/zfs_context.h> 32 #include <sys/spa_impl.h> 33 #include <sys/vdev_disk.h> 34 #include <sys/vdev_impl.h> 35 #include <sys/vdev_trim.h> 36 #include <sys/abd.h> 37 #include <sys/fs/zfs.h> 38 #include <sys/zio.h> 39 #include <linux/blkpg.h> 40 #include <linux/msdos_fs.h> 41 #include <linux/vfs_compat.h> 42 #include <linux/blk-cgroup.h> 43 44 /* 45 * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying 46 * block_device. Since it carries the block_device inside, its convenient to 47 * just use the handle as a proxy. 48 * 49 * Linux 6.9.x uses a file for the same purpose. 50 * 51 * For pre-6.8, we just emulate this with a cast, since we don't need any of 52 * the other fields inside the handle. 
 */
#if defined(HAVE_BDEV_OPEN_BY_PATH)
/* The handle is a struct bdev_handle; the block_device is its ->bdev. */
typedef struct bdev_handle zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((bdh)->bdev)
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
/* The handle is a struct file; file_bdev() yields the block_device. */
typedef struct file zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		(file_bdev(bdh))
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#else
/*
 * No separate handle type: the "handle" is the struct block_device pointer
 * itself, so every accessor goes through a cast.
 */
typedef void zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((struct block_device *)bdh)
#define	BDH_IS_ERR(bdh)		(IS_ERR(BDH_BDEV(bdh)))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(BDH_BDEV(bdh)))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#endif

/*
 * Per-vdev private state, stored in vdev_t's vdev_tsd by vdev_disk_open().
 */
typedef struct vdev_disk {
	/* Open device handle; NULL while the device is not open. */
	zfs_bdev_handle_t		*vd_bdh;
	/*
	 * Held as writer by vdev_disk_open()/vdev_disk_close() while vd_bdh
	 * changes, and taken as reader by vdev_disk_io_start().
	 */
	krwlock_t			vd_lock;
} vdev_disk_t;

/*
 * Maximum number of segments to add to a bio (min 4). If this is higher than
 * the maximum allowed by the device queue or the kernel itself, it will be
 * clamped. Setting it to zero will cause the kernel's ideal size to be used.
 */
uint_t zfs_vdev_disk_max_segs = 0;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static uint_t zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * BIO request failfast mask. vdev_disk_io_rw() tests bits 1, 2 and 4
 * individually and passes each result as a separate argument to
 * bio_set_flags_failfast().
 */

static unsigned int zfs_vdev_failfast_mask = 1;

/*
 * Convert SPA mode flags into bdev open mode flags.
 */
#ifdef HAVE_BLK_MODE_T
/* Kernels that provide blk_mode_t use the BLK_OPEN_* flag names. */
typedef blk_mode_t vdev_bdev_mode_t;
#define	VDEV_BDEV_MODE_READ	BLK_OPEN_READ
#define	VDEV_BDEV_MODE_WRITE	BLK_OPEN_WRITE
#define	VDEV_BDEV_MODE_EXCL	BLK_OPEN_EXCL
#define	VDEV_BDEV_MODE_MASK	(BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL)
#else
/* Otherwise fall back to fmode_t and the FMODE_* flag names. */
typedef fmode_t vdev_bdev_mode_t;
#define	VDEV_BDEV_MODE_READ	FMODE_READ
#define	VDEV_BDEV_MODE_WRITE	FMODE_WRITE
#define	VDEV_BDEV_MODE_EXCL	FMODE_EXCL
#define	VDEV_BDEV_MODE_MASK	(FMODE_READ|FMODE_WRITE|FMODE_EXCL)
#endif

/*
 * Translate an SPA open mode into the block layer's open mode. The result
 * always carries the exclusive flag; the read and write flags are added
 * according to which of SPA_MODE_READ / SPA_MODE_WRITE are set in smode.
 * smode must be initialised and contain no other bits (asserted).
 */
static vdev_bdev_mode_t
vdev_bdev_mode(spa_mode_t smode)
{
	ASSERT3U(smode, !=, SPA_MODE_UNINIT);
	ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE));

	vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL;

	if (smode & SPA_MODE_READ)
		bmode |= VDEV_BDEV_MODE_READ;

	if (smode & SPA_MODE_WRITE)
		bmode |= VDEV_BDEV_MODE_WRITE;

	/* Sanity: at least one known bit set, and no unknown bits. */
	ASSERT(bmode & VDEV_BDEV_MODE_MASK);
	ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK);

	return (bmode);
}

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
#ifdef HAVE_BDEV_NR_BYTES
	return (bdev_nr_bytes(bdev));
#else
	/* No bdev_nr_bytes(): read the size from the bdev's inode. */
	return (i_size_read(bdev->bd_inode));
#endif
}

#if !defined(HAVE_BDEV_WHOLE)
/*
 * Fallback for kernels without bdev_whole(): return the containing
 * block device recorded in bd_contains.
 */
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
	return (bdev->bd_contains);
}
#endif

/*
 * vdev_bdevname() writes the device's name into 'name'. The fallback
 * formats at most BDEVNAME_SIZE bytes, so 'name' must be at least that big.
 */
#if defined(HAVE_BDEVNAME)
#define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
#else
static inline void
vdev_bdevname(struct block_device *bdev, char *name)
{
	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
}
#endif

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity.
Or when the 182 * partition containing the pool has been manually increased in size. 183 * 184 * This function is only responsible for calculating the potential expansion 185 * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is 186 * responsible for verifying the expected partition layout in the wholedisk 187 * case, and updating the partition table if appropriate. Once the partition 188 * size has been increased the additional capacity will be visible using 189 * bdev_capacity(). 190 * 191 * The returned maximum expansion capacity is always expected to be larger, or 192 * at the very least equal, to its usable capacity to prevent overestimating 193 * the pool expandsize. 194 */ 195 static uint64_t 196 bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) 197 { 198 uint64_t psize; 199 int64_t available; 200 201 if (wholedisk && bdev != bdev_whole(bdev)) { 202 /* 203 * When reporting maximum expansion capacity for a wholedisk 204 * deduct any capacity which is expected to be lost due to 205 * alignment restrictions. Over reporting this value isn't 206 * harmful and would only result in slightly less capacity 207 * than expected post expansion. 208 * The estimated available space may be slightly smaller than 209 * bdev_capacity() for devices where the number of sectors is 210 * not a multiple of the alignment size and the partition layout 211 * is keeping less than PARTITION_END_ALIGNMENT bytes after the 212 * "reserved" EFI partition: in such cases return the device 213 * usable capacity. 
214 */ 215 available = bdev_capacity(bdev_whole(bdev)) - 216 ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + 217 PARTITION_END_ALIGNMENT) << SECTOR_BITS); 218 psize = MAX(available, bdev_capacity(bdev)); 219 } else { 220 psize = bdev_capacity(bdev); 221 } 222 223 return (psize); 224 } 225 226 static void 227 vdev_disk_error(zio_t *zio) 228 { 229 /* 230 * This function can be called in interrupt context, for instance while 231 * handling IRQs coming from a misbehaving disk device; use printk() 232 * which is safe from any context. 233 */ 234 printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " 235 "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa), 236 zio->io_vd->vdev_path, zio->io_error, zio->io_type, 237 (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, 238 zio->io_flags); 239 } 240 241 static void 242 vdev_disk_kobj_evt_post(vdev_t *v) 243 { 244 vdev_disk_t *vd = v->vdev_tsd; 245 if (vd && vd->vd_bdh) { 246 spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh)); 247 } else { 248 vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n", 249 v->vdev_path); 250 } 251 } 252 253 static zfs_bdev_handle_t * 254 vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder) 255 { 256 vdev_bdev_mode_t bmode = vdev_bdev_mode(smode); 257 258 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) 259 return (bdev_file_open_by_path(path, bmode, holder, NULL)); 260 #elif defined(HAVE_BDEV_OPEN_BY_PATH) 261 return (bdev_open_by_path(path, bmode, holder, NULL)); 262 #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) 263 return (blkdev_get_by_path(path, bmode, holder, NULL)); 264 #else 265 return (blkdev_get_by_path(path, bmode, holder)); 266 #endif 267 } 268 269 static void 270 vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder) 271 { 272 #if defined(HAVE_BDEV_RELEASE) 273 return (bdev_release(bdh)); 274 #elif defined(HAVE_BLKDEV_PUT_HOLDER) 275 return (blkdev_put(BDH_BDEV(bdh), holder)); 276 #elif defined(HAVE_BLKDEV_PUT) 277 return (blkdev_put(BDH_BDEV(bdh), 
vdev_bdev_mode(smode))); 278 #else 279 fput(bdh); 280 #endif 281 } 282 283 static int 284 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, 285 uint64_t *logical_ashift, uint64_t *physical_ashift) 286 { 287 zfs_bdev_handle_t *bdh; 288 spa_mode_t smode = spa_mode(v->vdev_spa); 289 hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); 290 vdev_disk_t *vd; 291 292 /* Must have a pathname and it must be absolute. */ 293 if (v->vdev_path == NULL || v->vdev_path[0] != '/') { 294 v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 295 vdev_dbgmsg(v, "invalid vdev_path"); 296 return (SET_ERROR(EINVAL)); 297 } 298 299 /* 300 * Reopen the device if it is currently open. When expanding a 301 * partition force re-scanning the partition table if userland 302 * did not take care of this already. We need to do this while closed 303 * in order to get an accurate updated block device size. Then 304 * since udev may need to recreate the device links increase the 305 * open retry timeout before reporting the device as unavailable. 306 */ 307 vd = v->vdev_tsd; 308 if (vd) { 309 char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; 310 boolean_t reread_part = B_FALSE; 311 312 rw_enter(&vd->vd_lock, RW_WRITER); 313 bdh = vd->vd_bdh; 314 vd->vd_bdh = NULL; 315 316 if (bdh) { 317 struct block_device *bdev = BDH_BDEV(bdh); 318 if (v->vdev_expanding && bdev != bdev_whole(bdev)) { 319 vdev_bdevname(bdev_whole(bdev), disk_name + 5); 320 /* 321 * If userland has BLKPG_RESIZE_PARTITION, 322 * then it should have updated the partition 323 * table already. We can detect this by 324 * comparing our current physical size 325 * with that of the device. If they are 326 * the same, then we must not have 327 * BLKPG_RESIZE_PARTITION or it failed to 328 * update the partition table online. We 329 * fallback to rescanning the partition 330 * table from the kernel below. However, 331 * if the capacity already reflects the 332 * updated partition, then we skip 333 * rescanning the partition table here. 
334 */ 335 if (v->vdev_psize == bdev_capacity(bdev)) 336 reread_part = B_TRUE; 337 } 338 339 vdev_blkdev_put(bdh, smode, zfs_vdev_holder); 340 } 341 342 if (reread_part) { 343 bdh = vdev_blkdev_get_by_path(disk_name, smode, 344 zfs_vdev_holder); 345 if (!BDH_IS_ERR(bdh)) { 346 int error = 347 vdev_bdev_reread_part(BDH_BDEV(bdh)); 348 vdev_blkdev_put(bdh, smode, zfs_vdev_holder); 349 if (error == 0) { 350 timeout = MSEC2NSEC( 351 zfs_vdev_open_timeout_ms * 2); 352 } 353 } 354 } 355 } else { 356 vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); 357 358 rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); 359 rw_enter(&vd->vd_lock, RW_WRITER); 360 } 361 362 /* 363 * Devices are always opened by the path provided at configuration 364 * time. This means that if the provided path is a udev by-id path 365 * then drives may be re-cabled without an issue. If the provided 366 * path is a udev by-path path, then the physical location information 367 * will be preserved. This can be critical for more complicated 368 * configurations where drives are located in specific physical 369 * locations to maximize the systems tolerance to component failure. 370 * 371 * Alternatively, you can provide your own udev rule to flexibly map 372 * the drives as you see fit. It is not advised that you use the 373 * /dev/[hd]d devices which may be reordered due to probing order. 374 * Devices in the wrong locations will be detected by the higher 375 * level vdev validation. 376 * 377 * The specified paths may be briefly removed and recreated in 378 * response to udev events. This should be exceptionally unlikely 379 * because the zpool command makes every effort to verify these paths 380 * have already settled prior to reaching this point. Therefore, 381 * a ENOENT failure at this point is highly likely to be transient 382 * and it is reasonable to sleep and retry before giving up. In 383 * practice delays have been observed to be on the order of 100ms. 
384 * 385 * When ERESTARTSYS is returned it indicates the block device is 386 * a zvol which could not be opened due to the deadlock detection 387 * logic in zvol_open(). Extend the timeout and retry the open 388 * subsequent attempts are expected to eventually succeed. 389 */ 390 hrtime_t start = gethrtime(); 391 bdh = BDH_ERR_PTR(-ENXIO); 392 while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) { 393 bdh = vdev_blkdev_get_by_path(v->vdev_path, smode, 394 zfs_vdev_holder); 395 if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) { 396 /* 397 * There is no point of waiting since device is removed 398 * explicitly 399 */ 400 if (v->vdev_removed) 401 break; 402 403 schedule_timeout_interruptible(MSEC_TO_TICK(10)); 404 } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) { 405 timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); 406 continue; 407 } else if (BDH_IS_ERR(bdh)) { 408 break; 409 } 410 } 411 412 if (BDH_IS_ERR(bdh)) { 413 int error = -BDH_PTR_ERR(bdh); 414 vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, 415 (u_longlong_t)(gethrtime() - start), 416 (u_longlong_t)timeout); 417 vd->vd_bdh = NULL; 418 v->vdev_tsd = vd; 419 rw_exit(&vd->vd_lock); 420 return (SET_ERROR(error)); 421 } else { 422 vd->vd_bdh = bdh; 423 v->vdev_tsd = vd; 424 rw_exit(&vd->vd_lock); 425 } 426 427 struct block_device *bdev = BDH_BDEV(vd->vd_bdh); 428 429 /* Determine the physical block size */ 430 int physical_block_size = bdev_physical_block_size(bdev); 431 432 /* Determine the logical block size */ 433 int logical_block_size = bdev_logical_block_size(bdev); 434 435 /* 436 * If the device has a write cache, clear the nowritecache flag, 437 * so that we start issuing flush requests again. 438 */ 439 v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev); 440 441 /* Set when device reports it supports TRIM. */ 442 v->vdev_has_trim = bdev_discard_supported(bdev); 443 444 /* Set when device reports it supports secure TRIM. 
*/ 445 v->vdev_has_securetrim = bdev_secure_discard_supported(bdev); 446 447 /* Inform the ZIO pipeline that we are non-rotational */ 448 #ifdef HAVE_BLK_QUEUE_ROT 449 v->vdev_nonrot = !blk_queue_rot(bdev_get_queue(bdev)); 450 #else 451 v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev)); 452 #endif 453 454 /* Is backed by a block device. */ 455 v->vdev_is_blkdev = B_TRUE; 456 457 /* Physical volume size in bytes for the partition */ 458 *psize = bdev_capacity(bdev); 459 460 /* Physical volume size in bytes including possible expansion space */ 461 *max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk); 462 463 /* Based on the minimum sector size set the block size */ 464 *physical_ashift = highbit64(MAX(physical_block_size, 465 SPA_MINBLOCKSIZE)) - 1; 466 467 *logical_ashift = highbit64(MAX(logical_block_size, 468 SPA_MINBLOCKSIZE)) - 1; 469 470 return (0); 471 } 472 473 static void 474 vdev_disk_close(vdev_t *v) 475 { 476 vdev_disk_t *vd = v->vdev_tsd; 477 478 if (v->vdev_reopening || vd == NULL) 479 return; 480 481 rw_enter(&vd->vd_lock, RW_WRITER); 482 483 if (vd->vd_bdh != NULL) 484 vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), 485 zfs_vdev_holder); 486 487 v->vdev_tsd = NULL; 488 489 rw_exit(&vd->vd_lock); 490 rw_destroy(&vd->vd_lock); 491 kmem_free(vd, sizeof (vdev_disk_t)); 492 } 493 494 /* 495 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so 496 * replace it with preempt_schedule under the following condition: 497 */ 498 #if defined(CONFIG_ARM64) && \ 499 defined(CONFIG_PREEMPTION) && \ 500 defined(CONFIG_BLK_CGROUP) 501 #define preempt_schedule_notrace(x) preempt_schedule(x) 502 #endif 503 504 /* 505 * As for the Linux 5.18 kernel bio_alloc() expects a block_device struct 506 * as an argument removing the need to set it with bio_set_dev(). This 507 * removes the need for all of the following compatibility code. 
 */
#if !defined(HAVE_BIO_ALLOC_4ARG)

#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only.  Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
 * part, moving blkg_tryget into the private one. Define our own version.
 *
 * Returns true if a reference on blkg was taken, false otherwise.
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		/* Per-CPU mode: bump this CPU's counter; always succeeds. */
		this_cpu_inc(*count);
		rc = true;
	} else {
		/* Atomic mode: only succeeds while the count is non-zero. */
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
#else
/* The kernel's own blkg_tryget() is usable; just alias it. */
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro.  Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif

	ASSERT3P(q, !=, NULL);
	ASSERT0P(bio->bi_blkg);

	/* Only attach root_blkg if a reference on it could be taken. */
	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}

/* Redirect the kernel macro's internal call to the version above. */
#define	bio_associate_blkg vdev_bio_associate_blkg
#else
/*
 * Replacement for bio_set_dev(): point the bio at 'bdev', clearing
 * BIO_REMAPPED (and BIO_THROTTLED if the device changed), then attach the
 * request queue's root_blkg when a reference on it can be taken.
 */
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT0P(bio->bi_blkg);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_set_dev		vdev_bio_set_dev
#endif
#endif
#endif /* !HAVE_BIO_ALLOC_4ARG */

/*
 * Submit a bio with current->bio_list temporarily cleared, restoring the
 * saved value afterwards.
 * NOTE(review): presumably this makes submit_bio() dispatch immediately
 * rather than queue the bio on the caller's list -- confirm against the
 * kernel's submit_bio() implementation.
 */
static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	(void) submit_bio(bio);
	current->bio_list = bio_list;
}

/*
 * Allocate a bio with room for nr_vecs segments, bound to bdev.  Returns
 * NULL if the underlying bio_alloc() fails.
 */
static inline struct bio *
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
    unsigned short nr_vecs)
{
	struct bio *bio;

#ifdef HAVE_BIO_ALLOC_4ARG
	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
#else
	/* Older bio_alloc() takes no bdev; bind it in a second step. */
	bio = bio_alloc(gfp_mask, nr_vecs);
	if (likely(bio != NULL))
		bio_set_dev(bio, bdev);
#endif

	return (bio);
}

/*
 * Number of segments to request per bio for this device.
 */
static inline uint_t
vdev_bio_max_segs(struct block_device *bdev)
{
	/*
	 * Smallest of the device max segs and the tunable max segs. Minimum
	 * 4, so there's room to finish split pages if they come up.
	 *
	 * NOTE(review): the floor of 4 is applied to the tunable only; a
	 * device limit below 4 still wins in the MIN() below.
	 */
	const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
	const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
	    MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
	const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(max_segs));
#else
	return (MIN(max_segs, BIO_MAX_PAGES));
#endif
}

/*
 * Largest bio payload for this device in bytes: the queue's max sector
 * count converted from 512-byte sectors (<< 9).
 */
static inline uint_t
vdev_bio_max_bytes(struct block_device *bdev)
{
	return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
}


/*
 * Virtual block IO object (VBIO)
 *
 * Linux block IO (BIO) objects have a limit on how many data segments (pages)
 * they can hold. Depending on how they're allocated and structured, a large
 * ZIO can require more than one BIO to be submitted to the kernel, which then
 * all have to complete before we can return the completed ZIO back to ZFS.
 *
 * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
 * translate a ZIO down into the kernel block layer and back again.
 *
 * Note that these are only used for data ZIOs (read/write). Meta-operations
 * (flush/trim) don't need multiple BIOs and so can just make the call
 * directly.
664 */ 665 typedef struct { 666 zio_t *vbio_zio; /* parent zio */ 667 668 struct block_device *vbio_bdev; /* blockdev to submit bios to */ 669 670 abd_t *vbio_abd; /* abd carrying borrowed linear buf */ 671 672 uint_t vbio_max_segs; /* max segs per bio */ 673 674 uint_t vbio_max_bytes; /* max bytes per bio */ 675 uint_t vbio_lbs_mask; /* logical block size mask */ 676 677 uint64_t vbio_offset; /* start offset of next bio */ 678 679 struct bio *vbio_bio; /* pointer to the current bio */ 680 int vbio_flags; /* bio flags */ 681 } vbio_t; 682 683 static vbio_t * 684 vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) 685 { 686 vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); 687 688 vbio->vbio_zio = zio; 689 vbio->vbio_bdev = bdev; 690 vbio->vbio_abd = NULL; 691 vbio->vbio_max_segs = vdev_bio_max_segs(bdev); 692 vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); 693 vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); 694 vbio->vbio_offset = zio->io_offset; 695 vbio->vbio_bio = NULL; 696 vbio->vbio_flags = flags; 697 698 return (vbio); 699 } 700 701 static void vbio_completion(struct bio *bio); 702 703 static int 704 vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) 705 { 706 struct bio *bio = vbio->vbio_bio; 707 uint_t ssize; 708 709 while (size > 0) { 710 if (bio == NULL) { 711 /* New BIO, allocate and set up */ 712 bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, 713 vbio->vbio_max_segs); 714 VERIFY(bio); 715 716 BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; 717 bio_set_op_attrs(bio, 718 vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? 719 WRITE : READ, vbio->vbio_flags); 720 721 if (vbio->vbio_bio) { 722 bio_chain(vbio->vbio_bio, bio); 723 vdev_submit_bio(vbio->vbio_bio); 724 } 725 vbio->vbio_bio = bio; 726 } 727 728 /* 729 * Only load as much of the current page data as will fit in 730 * the space left in the BIO, respecting lbs alignment. 
Older 731 * kernels will error if we try to overfill the BIO, while 732 * newer ones will accept it and split the BIO. This ensures 733 * everything works on older kernels, and avoids an additional 734 * overhead on the new. 735 */ 736 ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) & 737 vbio->vbio_lbs_mask); 738 if (ssize > 0 && 739 bio_add_page(bio, page, ssize, offset) == ssize) { 740 /* Accepted, adjust and load any remaining. */ 741 size -= ssize; 742 offset += ssize; 743 continue; 744 } 745 746 /* No room, set up for a new BIO and loop */ 747 vbio->vbio_offset += BIO_BI_SIZE(bio); 748 749 /* Signal new BIO allocation wanted */ 750 bio = NULL; 751 } 752 753 return (0); 754 } 755 756 /* Iterator callback to submit ABD pages to the vbio. */ 757 static int 758 vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) 759 { 760 vbio_t *vbio = priv; 761 return (vbio_add_page(vbio, page, len, off)); 762 } 763 764 /* Create some BIOs, fill them with data and submit them */ 765 static void 766 vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) 767 { 768 /* 769 * We plug so we can submit the BIOs as we go and only unplug them when 770 * they are fully created and submitted. This is important; if we don't 771 * plug, then the kernel may start executing earlier BIOs while we're 772 * still creating and executing later ones, and if the device goes 773 * away while that's happening, older kernels can get confused and 774 * trample memory. 775 */ 776 struct blk_plug plug; 777 blk_start_plug(&plug); 778 779 (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio); 780 ASSERT(vbio->vbio_bio); 781 782 vbio->vbio_bio->bi_end_io = vbio_completion; 783 vbio->vbio_bio->bi_private = vbio; 784 785 /* 786 * Once submitted, vbio_bio now owns vbio (through bi_private) and we 787 * can't touch it again. 
The bio may complete and vbio_completion() be 788 * called and free the vbio before this task is run again, so we must 789 * consider it invalid from this point. 790 */ 791 vdev_submit_bio(vbio->vbio_bio); 792 793 blk_finish_plug(&plug); 794 } 795 796 /* IO completion callback */ 797 static void 798 vbio_completion(struct bio *bio) 799 { 800 vbio_t *vbio = bio->bi_private; 801 zio_t *zio = vbio->vbio_zio; 802 803 ASSERT(zio); 804 805 /* Capture and log any errors */ 806 zio->io_error = bi_status_to_errno(bio->bi_status); 807 ASSERT3U(zio->io_error, >=, 0); 808 809 if (zio->io_error) 810 vdev_disk_error(zio); 811 812 /* Return the BIO to the kernel */ 813 bio_put(bio); 814 815 /* 816 * We're likely in an interrupt context so we can't do ABD/memory work 817 * here; instead we stash vbio on the zio and take care of it in the 818 * done callback. 819 */ 820 ASSERT0P(zio->io_bio); 821 zio->io_bio = vbio; 822 823 zio_delay_interrupt(zio); 824 } 825 826 /* 827 * Iterator callback to count ABD pages and check their size & alignment. 828 * 829 * On Linux, each BIO segment can take a page pointer, and an offset+length of 830 * the data within that page. A page can be arbitrarily large ("compound" 831 * pages) but we still have to ensure the data portion is correctly sized and 832 * aligned to the logical block size, to ensure that if the kernel wants to 833 * split the BIO, the two halves will still be properly aligned. 834 * 835 * NOTE: if you change this function, change the copy in 836 * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test 837 * data there to validate the change you're making. 
 */
typedef struct {
	size_t	blocksize;	/* logical block size to check against */
	int	seen_first;	/* set once the first page has been checked */
	int	seen_last;	/* set when a page ended off a page boundary */
} vdev_disk_check_alignment_t;

/*
 * Returns 0 if this page's data span is acceptable, or 1 to stop the
 * iteration because the ABD cannot be submitted as-is.
 */
static int
vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len,
    void *priv)
{
	(void) page;
	vdev_disk_check_alignment_t *s = priv;

	/*
	 * The cardinal rule: a single on-disk block must never cross a
	 * physical (order-0) page boundary, as the kernel expects to be able
	 * to split at both LBS and page boundaries.
	 *
	 * This implies various alignment rules for the blocks in this
	 * (possibly compound) page, which we can check for.
	 */

	/*
	 * If the previous page did not end on a page boundary, then we
	 * can't proceed without creating a hole.
	 */
	if (s->seen_last)
		return (1);

	/* This page must contain only whole LBS-sized blocks. */
	if (!IS_P2ALIGNED(len, s->blocksize))
		return (1);

	/*
	 * If this is not the first page in the ABD, then the data must start
	 * on a page-aligned boundary (so the kernel can split on page
	 * boundaries without having to deal with a hole). If it is, then
	 * it can start on LBS-alignment.
	 */
	if (s->seen_first) {
		if (!IS_P2ALIGNED(off, PAGESIZE))
			return (1);
	} else {
		if (!IS_P2ALIGNED(off, s->blocksize))
			return (1);
		s->seen_first = 1;
	}

	/*
	 * If this data does not end on a page-aligned boundary, then this
	 * must be the last page in the ABD, for the same reason.
	 */
	s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE);

	return (0);
}

/*
 * Check if we can submit the pages in this ABD to the kernel as-is. Returns
 * B_TRUE if so, or B_FALSE if it can't be submitted like this.
899 */ 900 static boolean_t 901 vdev_disk_check_alignment(abd_t *abd, uint64_t size, struct block_device *bdev) 902 { 903 vdev_disk_check_alignment_t s = { 904 .blocksize = bdev_logical_block_size(bdev), 905 }; 906 907 if (abd_iterate_page_func(abd, 0, size, 908 vdev_disk_check_alignment_cb, &s)) 909 return (B_FALSE); 910 911 return (B_TRUE); 912 } 913 914 static int 915 vdev_disk_io_rw(zio_t *zio) 916 { 917 vdev_t *v = zio->io_vd; 918 vdev_disk_t *vd = v->vdev_tsd; 919 struct block_device *bdev = BDH_BDEV(vd->vd_bdh); 920 int flags = 0; 921 922 /* 923 * Accessing outside the block device is never allowed. 924 */ 925 if (zio->io_offset + zio->io_size > bdev_capacity(bdev)) { 926 vdev_dbgmsg(zio->io_vd, 927 "Illegal access %llu size %llu, device size %llu", 928 (u_longlong_t)zio->io_offset, 929 (u_longlong_t)zio->io_size, 930 (u_longlong_t)bdev_capacity(bdev)); 931 return (SET_ERROR(EIO)); 932 } 933 934 if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && 935 v->vdev_failfast == B_TRUE) { 936 bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, 937 zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); 938 } 939 940 /* 941 * Check alignment of the incoming ABD. If any part of it would require 942 * submitting a page that is not aligned to both the logical block size 943 * and the page size, then we take a copy into a new memory region with 944 * correct alignment. This should be impossible on a 512b LBS. On 945 * larger blocks, this can happen at least when a small number of 946 * blocks (usually 1) are allocated from a shared slab, or when 947 * abnormally-small data regions (eg gang headers) are mixed into the 948 * same ABD as larger allocations (eg aggregations). 
949 */ 950 abd_t *abd = zio->io_abd; 951 if (!vdev_disk_check_alignment(abd, zio->io_size, bdev)) { 952 /* Allocate a new memory region with guaranteed alignment */ 953 abd = abd_alloc_for_io(zio->io_size, 954 zio->io_abd->abd_flags & ABD_FLAG_META); 955 956 /* If we're writing copy our data into it */ 957 if (zio->io_type == ZIO_TYPE_WRITE) 958 abd_copy(abd, zio->io_abd, zio->io_size); 959 960 /* 961 * False here would mean the new allocation has an invalid 962 * alignment too, which would mean that abd_alloc() is not 963 * guaranteeing this, or our logic in 964 * vdev_disk_check_alignment() is wrong. In either case, 965 * something in seriously wrong and its not safe to continue. 966 */ 967 VERIFY(vdev_disk_check_alignment(abd, zio->io_size, bdev)); 968 } 969 970 /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ 971 vbio_t *vbio = vbio_alloc(zio, bdev, flags); 972 if (abd != zio->io_abd) 973 vbio->vbio_abd = abd; 974 975 /* Fill it with data pages and submit it to the kernel */ 976 vbio_submit(vbio, abd, zio->io_size); 977 return (0); 978 } 979 980 static void 981 vdev_disk_io_flush_completion(struct bio *bio) 982 { 983 zio_t *zio = bio->bi_private; 984 zio->io_error = bi_status_to_errno(bio->bi_status); 985 if (zio->io_error == EOPNOTSUPP || zio->io_error == ENOTTY) 986 zio->io_error = SET_ERROR(ENOTSUP); 987 988 bio_put(bio); 989 ASSERT3S(zio->io_error, >=, 0); 990 if (zio->io_error) 991 vdev_disk_error(zio); 992 zio_interrupt(zio); 993 } 994 995 static int 996 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) 997 { 998 struct request_queue *q; 999 struct bio *bio; 1000 1001 q = bdev_get_queue(bdev); 1002 if (!q) 1003 return (SET_ERROR(ENXIO)); 1004 1005 bio = vdev_bio_alloc(bdev, GFP_NOIO, 0); 1006 if (unlikely(bio == NULL)) 1007 return (SET_ERROR(ENOMEM)); 1008 1009 bio->bi_end_io = vdev_disk_io_flush_completion; 1010 bio->bi_private = zio; 1011 bio_set_flush(bio); 1012 vdev_submit_bio(bio); 1013 invalidate_bdev(bdev); 1014 1015 
return (0); 1016 } 1017 /* bi_end_io callback installed by vdev_disk_io_trim() for asynchronous discard and secure erase bios. Stores the translated bio status in zio->io_error, drops the bio, calls vdev_disk_error() on failure and passes the zio to zio_interrupt(). */ 1018 static void 1019 vdev_disk_discard_end_io(struct bio *bio) 1020 { 1021 zio_t *zio = bio->bi_private; 1022 zio->io_error = bi_status_to_errno(bio->bi_status); 1023 1024 bio_put(bio); 1025 if (zio->io_error) 1026 vdev_disk_error(zio); 1027 zio_interrupt(zio); 1028 } 1029 1030 /* 1031 * Wrappers for the different secure erase and discard APIs. We use async 1032 * when available; in this case, *biop is set to the last bio in the chain. * With the synchronous APIs *biop is left at the NULL it is initialised to. * The value returned is whatever the selected kernel helper returned. 1033 */ 1034 static int 1035 vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector, 1036 sector_t nsect, struct bio **biop) 1037 { 1038 *biop = NULL; 1039 int error; 1040 1041 #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) 1042 error = blkdev_issue_secure_erase(BDH_BDEV(bdh), 1043 sector, nsect, GFP_NOFS); 1044 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) 1045 error = __blkdev_issue_discard(BDH_BDEV(bdh), 1046 sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop); 1047 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) 1048 error = blkdev_issue_discard(BDH_BDEV(bdh), 1049 sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE); 1050 #else 1051 #error "unsupported kernel" 1052 #endif 1053 1054 return (error); 1055 } 1056 /* Plain (non-secure) discard wrapper; same *biop convention as the secure erase wrapper above. */ 1057 static int 1058 vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector, 1059 sector_t nsect, struct bio **biop) 1060 { 1061 *biop = NULL; 1062 int error; 1063 1064 #if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) 1065 error = __blkdev_issue_discard(BDH_BDEV(bdh), 1066 sector, nsect, GFP_NOFS, 0, biop); 1067 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS) 1068 error = __blkdev_issue_discard(BDH_BDEV(bdh), 1069 sector, nsect, GFP_NOFS, biop); 1070 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) 1071 error = blkdev_issue_discard(BDH_BDEV(bdh), 1072 sector, nsect, GFP_NOFS, 0); 1073 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS) 1074 error = blkdev_issue_discard(BDH_BDEV(bdh), 1075 sector, nsect, GFP_NOFS); 1076 #else 1077 #error "unsupported kernel" 1078 #endif 
1079 1080 return (error); 1081 } 1082 1083 /* 1084 * Entry point for TRIM ops. This calls the right wrapper for secure erase or 1085 * discard, and then does the appropriate finishing work for error vs success 1086 * and async vs sync. * * Returns 0 when the zio has either been completed here (sync) or had a * completion callback armed (async). On wrapper failure the negated wrapper * error is returned and the zio is left for the caller to finish. 1087 */ 1088 static int 1089 vdev_disk_io_trim(zio_t *zio) 1090 { 1091 int error; 1092 struct bio *bio; 1093 /* io_offset and io_size are shifted right by 9 (divided by 512) to form the sector arguments */ 1094 zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh; 1095 sector_t sector = zio->io_offset >> 9; 1096 sector_t nsects = zio->io_size >> 9; 1097 1098 if (zio->io_trim_flags & ZIO_TRIM_SECURE) 1099 error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio); 1100 else 1101 error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio); 1102 /* The kernel discard helpers report failure as a negative errno, hence the sign flip */ 1103 if (error != 0) 1104 return (SET_ERROR(-error)); 1105 1106 if (bio == NULL) { 1107 /* 1108 * This was a synchronous op that completed successfully, so 1109 * return it to ZFS immediately. 1110 */ 1111 zio_interrupt(zio); 1112 } else { 1113 /* 1114 * This was an asynchronous op; set up completion callback and 1115 * issue it. 1116 */ 1117 bio->bi_private = zio; 1118 bio->bi_end_io = vdev_disk_discard_end_io; 1119 vdev_submit_bio(bio); 1120 } 1121 1122 return (0); 1123 } 1124 /* vdev_op_io_start handler (see vdev_disk_ops). Fails the zio with ENXIO if the vdev has no private data or no open block device; otherwise dispatches on io_type to the flush, trim or read/write path while holding vd_lock as reader. */ 1125 static void 1126 vdev_disk_io_start(zio_t *zio) 1127 { 1128 vdev_t *v = zio->io_vd; 1129 vdev_disk_t *vd = v->vdev_tsd; 1130 int error; 1131 1132 /* 1133 * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 1134 * Nothing to be done here but return failure. 1135 */ 1136 if (vd == NULL) { 1137 zio->io_error = ENXIO; 1138 zio_interrupt(zio); 1139 return; 1140 } 1141 /* Every path below drops vd_lock before it returns */ 1142 rw_enter(&vd->vd_lock, RW_READER); 1143 1144 /* 1145 * If the vdev is closed, it's likely due to a failed reopen and is 1146 * in the UNAVAIL state. Nothing to be done here but return failure. 
1147 */ 1148 if (vd->vd_bdh == NULL) { 1149 rw_exit(&vd->vd_lock); 1150 zio->io_error = ENXIO; 1151 zio_interrupt(zio); 1152 return; 1153 } 1154 1155 switch (zio->io_type) { 1156 case ZIO_TYPE_FLUSH: 1157 1158 if (!vdev_readable(v)) { 1159 /* Drive not there, can't flush */ 1160 error = SET_ERROR(ENXIO); 1161 } else if (zfs_nocacheflush) { 1162 /* Flushing disabled by operator, declare success */ 1163 error = 0; 1164 } else if (v->vdev_nowritecache) { 1165 /* This vdev is not capable of flushing */ 1166 error = SET_ERROR(ENOTSUP); 1167 } else { 1168 /* 1169 * Issue the flush. If successful, the response will 1170 * be handled in the completion callback, so we're done. 1171 */ 1172 error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio); 1173 if (error == 0) { 1174 rw_exit(&vd->vd_lock); 1175 return; 1176 } 1177 } 1178 1179 /* No flush bio is in flight (it failed to issue or was skipped); record the result, which is 0 when flushing is disabled by the operator, and finish the zio here */ 1180 rw_exit(&vd->vd_lock); 1181 zio->io_error = error; 1182 zio_execute(zio); 1183 return; 1184 1185 case ZIO_TYPE_TRIM: /* On success vdev_disk_io_trim() has already completed the zio or armed its completion callback, so only failures are finished here */ 1186 error = vdev_disk_io_trim(zio); 1187 rw_exit(&vd->vd_lock); 1188 if (error) { 1189 zio->io_error = error; 1190 zio_execute(zio); 1191 } 1192 return; 1193 1194 case ZIO_TYPE_READ: 1195 case ZIO_TYPE_WRITE: /* A zero return means the IO was handed to vbio_submit(); only a failure to submit is finished here */ 1196 zio->io_target_timestamp = zio_handle_io_delay(zio); 1197 error = vdev_disk_io_rw(zio); 1198 rw_exit(&vd->vd_lock); 1199 if (error) { 1200 zio->io_error = error; 1201 zio_interrupt(zio); 1202 } 1203 return; 1204 1205 default: 1206 /* 1207 * Getting here means our parent vdev has made a very strange 1208 * request of us, and shouldn't happen. Assert here to force a 1209 * crash in dev builds, but in production return the IO 1210 * unhandled. The pool will likely suspend anyway but that's 1211 * nicer than crashing the kernel. 
1212 */ 1213 ASSERT3S(zio->io_type, ==, -1); 1214 1215 rw_exit(&vd->vd_lock); 1216 zio->io_error = SET_ERROR(ENOTSUP); 1217 zio_interrupt(zio); 1218 return; 1219 } 1220 /* Every switch arm returns, so control never gets here */ 1221 __builtin_unreachable(); 1222 } 1223 /* vdev_op_io_done handler (see vdev_disk_ops). Frees the vbio attached to the zio, including any aligned copy of the ABD, and on EIO checks the disk status and requests asynchronous removal of the vdev if that check fails. */ 1224 static void 1225 vdev_disk_io_done(zio_t *zio) 1226 { 1227 /* If this was a read or write, we need to clean up the vbio */ 1228 if (zio->io_bio != NULL) { 1229 vbio_t *vbio = zio->io_bio; 1230 zio->io_bio = NULL; 1231 1232 /* 1233 * If we copied the ABD before issuing it, copy the data back 1234 * into the original ABD for reads, then free the copy. 1235 */ 1236 if (vbio->vbio_abd != NULL) { 1237 if (zio->io_type == ZIO_TYPE_READ) 1238 abd_copy(zio->io_abd, vbio->vbio_abd, 1239 zio->io_size); 1240 1241 abd_free(vbio->vbio_abd); 1242 vbio->vbio_abd = NULL; 1243 } 1244 1245 /* Final cleanup */ 1246 kmem_free(vbio, sizeof (vbio_t)); 1247 } 1248 1249 /* 1250 * If the device returned EIO, we revalidate the media. If it is 1251 * determined the media has changed this triggers the asynchronous 1252 * removal of the device from the configuration. 1253 */ 1254 if (zio->io_error == EIO) { 1255 vdev_t *v = zio->io_vd; 1256 vdev_disk_t *vd = v->vdev_tsd; 1257 1258 if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) { 1259 invalidate_bdev(BDH_BDEV(vd->vd_bdh)); 1260 v->vdev_remove_wanted = B_TRUE; 1261 spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); 1262 } 1263 } 1264 } 1265 /* vdev_op_hold handler (see vdev_disk_ops). As written it only checks its preconditions and returns; no hold is taken. */ 1266 static void 1267 vdev_disk_hold(vdev_t *vd) 1268 { 1269 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 1270 1271 /* We must have a pathname, and it must be absolute. */ 1272 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') 1273 return; 1274 1275 /* 1276 * Only prefetch path and devid info if the device has 1277 * never been opened. 
1278 */ 1279 if (vd->vdev_tsd != NULL) 1280 return; 1281 /* Nothing further is done here */ 1282 } 1283 /* vdev_op_rele handler (see vdev_disk_ops). Only asserts that SCL_STATE is held as writer; the release itself is not implemented. */ 1284 static void 1285 vdev_disk_rele(vdev_t *vd) 1286 { 1287 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 1288 1289 /* XXX: Implement me as a vnode rele for the device */ 1290 } 1291 /* Operations vector for disk vdevs; entries set to NULL have no disk-specific handler. */ 1292 vdev_ops_t vdev_disk_ops = { 1293 .vdev_op_init = NULL, 1294 .vdev_op_fini = NULL, 1295 .vdev_op_open = vdev_disk_open, 1296 .vdev_op_close = vdev_disk_close, 1297 .vdev_op_asize_to_psize = vdev_default_psize, 1298 .vdev_op_psize_to_asize = vdev_default_asize, 1299 .vdev_op_min_asize = vdev_default_min_asize, 1300 .vdev_op_min_alloc = NULL, 1301 .vdev_op_io_start = vdev_disk_io_start, 1302 .vdev_op_io_done = vdev_disk_io_done, 1303 .vdev_op_state_change = NULL, 1304 .vdev_op_need_resilver = NULL, 1305 .vdev_op_hold = vdev_disk_hold, 1306 .vdev_op_rele = vdev_disk_rele, 1307 .vdev_op_remap = NULL, 1308 .vdev_op_xlate = vdev_default_xlate, 1309 .vdev_op_rebuild_asize = NULL, 1310 .vdev_op_metaslab_init = NULL, 1311 .vdev_op_config_generate = NULL, 1312 .vdev_op_nparity = NULL, 1313 .vdev_op_ndisks = NULL, 1314 .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ 1315 .vdev_op_leaf = B_TRUE, /* leaf vdev */ 1316 .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post 1317 }; 1318 /* Parameter setter that validates a new minimum auto-ashift value (presumably backing zfs_vdev_min_auto_ashift - confirm against the parameter declaration). buf is parsed as an unsigned integer; values below ASHIFT_MIN or above the current zfs_vdev_max_auto_ashift are rejected with -EINVAL, otherwise the value is stored via param_set_uint(). Returns 0 on success or a negative errno. */ 1319 int 1320 param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) 1321 { 1322 uint_t val; 1323 int error; 1324 1325 error = kstrtouint(buf, 0, &val); 1326 if (error < 0) 1327 return (SET_ERROR(error)); 1328 1329 if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) 1330 return (SET_ERROR(-EINVAL)); 1331 1332 error = param_set_uint(buf, kp); 1333 if (error < 0) 1334 return (SET_ERROR(error)); 1335 1336 return (0); 1337 } 1338 /* Parameter setter that validates a new maximum auto-ashift value (presumably backing zfs_vdev_max_auto_ashift - confirm against the parameter declaration). buf is parsed as an unsigned integer; values above ASHIFT_MAX or below the current zfs_vdev_min_auto_ashift are rejected with -EINVAL, otherwise the value is stored via param_set_uint(). Returns 0 on success or a negative errno. */ 1339 int 1340 param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) 1341 { 1342 uint_t val; 1343 int error; 1344 1345 error = kstrtouint(buf, 0, &val); 1346 if (error < 0) 1347 return (SET_ERROR(error)); 1348 1349 if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) 1350 
return (SET_ERROR(-EINVAL)); 1351 /* The value passed the range check; hand buf to param_set_uint() to store it through kp */ 1352 error = param_set_uint(buf, kp); 1353 if (error < 0) 1354 return (SET_ERROR(error)); 1355 1356 return (0); 1357 } 1358 /* Module parameters for the zfs_vdev_open_timeout_ms, zfs_vdev_failfast_mask and zfs_vdev_disk_max_segs tunables defined near the top of this file */ 1359 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, 1360 "Timeout before determining that a device is missing"); 1361 1362 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, 1363 "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); 1364 1365 ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, 1366 "Maximum number of data segments to add to an IO request (min 4)"); 1367