/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>
#ifdef HAVE_LINUX_BLK_CGROUP_HEADER
#include <linux/blk-cgroup.h>
#endif

/*
 * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
 * block_device. Since it carries the block_device inside, it's convenient to
 * just use the handle as a proxy. For pre-6.8, we just emulate this with
 * a cast, since we don't need any of the other fields inside the handle.
 */
#ifdef HAVE_BDEV_OPEN_BY_PATH
typedef struct bdev_handle zfs_bdev_handle_t;
#define BDH_BDEV(bdh)		((bdh)->bdev)
#define BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define BDH_ERR_PTR(err)	(ERR_PTR(err))
#else
typedef void zfs_bdev_handle_t;
#define BDH_BDEV(bdh)		((struct block_device *)bdh)
#define BDH_IS_ERR(bdh)		(IS_ERR(BDH_BDEV(bdh)))
#define BDH_PTR_ERR(bdh)	(PTR_ERR(BDH_BDEV(bdh)))
#define BDH_ERR_PTR(err)	(ERR_PTR(err))
#endif

typedef struct vdev_disk {
    zfs_bdev_handle_t	*vd_bdh;
    krwlock_t		vd_lock;
} vdev_disk_t;

/*
 * Maximum number of segments to add to a bio (min 4). If this is higher than
 * the maximum allowed by the device queue or the kernel itself, it will be
 * clamped. Setting it to zero will cause the kernel's ideal size to be used.
 */
uint_t zfs_vdev_disk_max_segs = 0;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static uint_t zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * BIO request failfast mask.
 */

static unsigned int zfs_vdev_failfast_mask = 1;

/*
 * Convert SPA mode flags into bdev open mode flags.
 */
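/*
 * For illustration only (not part of the mapping code): a pool opened
 * read-write, i.e. smode == (SPA_MODE_READ | SPA_MODE_WRITE), is expected
 * to translate to
 *
 *     bmode == (BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_EXCL)
 *
 * on kernels that provide blk_mode_t, or to the equivalent FMODE_* flags on
 * older kernels. The EXCL flag is always set so the vdev claims the device
 * exclusively via zfs_vdev_holder.
 */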
#ifdef HAVE_BLK_MODE_T
typedef blk_mode_t vdev_bdev_mode_t;
#define VDEV_BDEV_MODE_READ	BLK_OPEN_READ
#define VDEV_BDEV_MODE_WRITE	BLK_OPEN_WRITE
#define VDEV_BDEV_MODE_EXCL	BLK_OPEN_EXCL
#define VDEV_BDEV_MODE_MASK	(BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL)
#else
typedef fmode_t vdev_bdev_mode_t;
#define VDEV_BDEV_MODE_READ	FMODE_READ
#define VDEV_BDEV_MODE_WRITE	FMODE_WRITE
#define VDEV_BDEV_MODE_EXCL	FMODE_EXCL
#define VDEV_BDEV_MODE_MASK	(FMODE_READ|FMODE_WRITE|FMODE_EXCL)
#endif

static vdev_bdev_mode_t
vdev_bdev_mode(spa_mode_t smode)
{
    ASSERT3U(smode, !=, SPA_MODE_UNINIT);
    ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE));

    vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL;

    if (smode & SPA_MODE_READ)
        bmode |= VDEV_BDEV_MODE_READ;

    if (smode & SPA_MODE_WRITE)
        bmode |= VDEV_BDEV_MODE_WRITE;

    ASSERT(bmode & VDEV_BDEV_MODE_MASK);
    ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK);

    return (bmode);
}

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
    return (i_size_read(bdev->bd_inode));
}

#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
    return (bdev->bd_contains);
}
#endif

#if defined(HAVE_BDEVNAME)
#define vdev_bdevname(bdev, name)	bdevname(bdev, name)
#else
static inline void
vdev_bdevname(struct block_device *bdev, char *name)
{
    snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
}
#endif

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity, or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'. The efi_use_whole_disk()
 * function is responsible for verifying the expected partition layout in the
 * wholedisk case, and updating the partition table if appropriate. Once the
 * partition size has been increased the additional capacity will be visible
 * using bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, its usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
    uint64_t psize;
    int64_t available;

    if (wholedisk && bdev != bdev_whole(bdev)) {
        /*
         * When reporting maximum expansion capacity for a wholedisk
         * deduct any capacity which is expected to be lost due to
         * alignment restrictions. Over reporting this value isn't
         * harmful and would only result in slightly less capacity
         * than expected post expansion.
         * The estimated available space may be slightly smaller than
         * bdev_capacity() for devices where the number of sectors is
         * not a multiple of the alignment size and the partition layout
         * is keeping less than PARTITION_END_ALIGNMENT bytes after the
         * "reserved" EFI partition: in such cases return the device
         * usable capacity.
         */
        available = i_size_read(bdev_whole(bdev)->bd_inode) -
            ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
            PARTITION_END_ALIGNMENT) << SECTOR_BITS);
        psize = MAX(available, bdev_capacity(bdev));
    } else {
        psize = bdev_capacity(bdev);
    }

    return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
    /*
     * This function can be called in interrupt context, for instance while
     * handling IRQs coming from a misbehaving disk device; use printk()
     * which is safe from any context.
     */
    printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
        "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
        zio->io_vd->vdev_path, zio->io_error, zio->io_type,
        (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
        zio->io_flags);
}

static void
vdev_disk_kobj_evt_post(vdev_t *v)
{
    vdev_disk_t *vd = v->vdev_tsd;
    if (vd && vd->vd_bdh) {
        spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh));
    } else {
        vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
            v->vdev_path);
    }
}

static zfs_bdev_handle_t *
vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder)
{
    vdev_bdev_mode_t bmode = vdev_bdev_mode(smode);

#if defined(HAVE_BDEV_OPEN_BY_PATH)
    return (bdev_open_by_path(path, bmode, holder, NULL));
#elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
    return (blkdev_get_by_path(path, bmode, holder, NULL));
#else
    return (blkdev_get_by_path(path, bmode, holder));
#endif
}

static void
vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
{
#if defined(HAVE_BDEV_RELEASE)
    return (bdev_release(bdh));
#elif defined(HAVE_BLKDEV_PUT_HOLDER)
    return (blkdev_put(BDH_BDEV(bdh), holder));
#else
    return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode)));
#endif
}

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
    zfs_bdev_handle_t *bdh;
    spa_mode_t smode = spa_mode(v->vdev_spa);
    hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
    vdev_disk_t *vd;

    /* Must have a pathname and it must be absolute. */
    if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
        v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
        vdev_dbgmsg(v, "invalid vdev_path");
        return (SET_ERROR(EINVAL));
    }

    /*
     * Reopen the device if it is currently open. When expanding a
     * partition, force re-scanning the partition table if userland
     * did not take care of this already. We need to do this while closed
     * in order to get an accurate updated block device size. Then,
     * since udev may need to recreate the device links, increase the
     * open retry timeout before reporting the device as unavailable.
     */
    vd = v->vdev_tsd;
    if (vd) {
        char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
        boolean_t reread_part = B_FALSE;

        rw_enter(&vd->vd_lock, RW_WRITER);
        bdh = vd->vd_bdh;
        vd->vd_bdh = NULL;

        if (bdh) {
            struct block_device *bdev = BDH_BDEV(bdh);
            if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
                vdev_bdevname(bdev_whole(bdev), disk_name + 5);
                /*
                 * If userland has BLKPG_RESIZE_PARTITION,
                 * then it should have updated the partition
                 * table already. We can detect this by
                 * comparing our current physical size
                 * with that of the device.
                 * If they are the same, then we must not have
                 * BLKPG_RESIZE_PARTITION or it failed to
                 * update the partition table online. We
                 * fall back to rescanning the partition
                 * table from the kernel below. However,
                 * if the capacity already reflects the
                 * updated partition, then we skip
                 * rescanning the partition table here.
                 */
                if (v->vdev_psize == bdev_capacity(bdev))
                    reread_part = B_TRUE;
            }

            vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
        }

        if (reread_part) {
            bdh = vdev_blkdev_get_by_path(disk_name, smode,
                zfs_vdev_holder);
            if (!BDH_IS_ERR(bdh)) {
                int error =
                    vdev_bdev_reread_part(BDH_BDEV(bdh));
                vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
                if (error == 0) {
                    timeout = MSEC2NSEC(
                        zfs_vdev_open_timeout_ms * 2);
                }
            }
        }
    } else {
        vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

        rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
        rw_enter(&vd->vd_lock, RW_WRITER);
    }

    /*
     * Devices are always opened by the path provided at configuration
     * time. This means that if the provided path is a udev by-id path
     * then drives may be re-cabled without an issue. If the provided
     * path is a udev by-path path, then the physical location information
     * will be preserved. This can be critical for more complicated
     * configurations where drives are located in specific physical
     * locations to maximize the system's tolerance to component failure.
     *
     * Alternatively, you can provide your own udev rule to flexibly map
     * the drives as you see fit. It is not advised that you use the
     * /dev/[hd]d devices which may be reordered due to probing order.
     * Devices in the wrong locations will be detected by the higher
     * level vdev validation.
     *
     * The specified paths may be briefly removed and recreated in
     * response to udev events. This should be exceptionally unlikely
     * because the zpool command makes every effort to verify these paths
     * have already settled prior to reaching this point. Therefore,
     * an ENOENT failure at this point is highly likely to be transient
     * and it is reasonable to sleep and retry before giving up. In
     * practice delays have been observed to be on the order of 100ms.
     *
     * When ERESTARTSYS is returned it indicates the block device is
     * a zvol which could not be opened due to the deadlock detection
     * logic in zvol_open(). Extend the timeout and retry the open;
     * subsequent attempts are expected to eventually succeed.
     */
    hrtime_t start = gethrtime();
    bdh = BDH_ERR_PTR(-ENXIO);
    while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) {
        bdh = vdev_blkdev_get_by_path(v->vdev_path, smode,
            zfs_vdev_holder);
        if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) {
            /*
             * There is no point in waiting since the device was
             * removed explicitly.
             */
            if (v->vdev_removed)
                break;

            schedule_timeout(MSEC_TO_TICK(10));
        } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) {
            timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
            continue;
        } else if (BDH_IS_ERR(bdh)) {
            break;
        }
    }

    if (BDH_IS_ERR(bdh)) {
        int error = -BDH_PTR_ERR(bdh);
        vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
            (u_longlong_t)(gethrtime() - start),
            (u_longlong_t)timeout);
        vd->vd_bdh = NULL;
        v->vdev_tsd = vd;
        rw_exit(&vd->vd_lock);
        return (SET_ERROR(error));
    } else {
        vd->vd_bdh = bdh;
        v->vdev_tsd = vd;
        rw_exit(&vd->vd_lock);
    }

    struct block_device *bdev = BDH_BDEV(vd->vd_bdh);

    /* Determine the physical block size */
    int physical_block_size = bdev_physical_block_size(bdev);

    /* Determine the logical block size */
    int logical_block_size = bdev_logical_block_size(bdev);

    /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
    v->vdev_nowritecache = B_FALSE;

    /* Set when device reports it supports TRIM. */
    v->vdev_has_trim = bdev_discard_supported(bdev);

    /* Set when device reports it supports secure TRIM. */
    v->vdev_has_securetrim = bdev_secure_discard_supported(bdev);

    /* Inform the ZIO pipeline that we are non-rotational */
    v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));

    /* Physical volume size in bytes for the partition */
    *psize = bdev_capacity(bdev);

    /* Physical volume size in bytes including possible expansion space */
    *max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk);

    /* Based on the minimum sector size set the block size */
    *physical_ashift = highbit64(MAX(physical_block_size,
        SPA_MINBLOCKSIZE)) - 1;

    *logical_ashift = highbit64(MAX(logical_block_size,
        SPA_MINBLOCKSIZE)) - 1;

    return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
    vdev_disk_t *vd = v->vdev_tsd;

    if (v->vdev_reopening || vd == NULL)
        return;

    if (vd->vd_bdh != NULL)
        vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
            zfs_vdev_holder);

    rw_destroy(&vd->vd_lock);
    kmem_free(vd, sizeof (vdev_disk_t));
    v->vdev_tsd = NULL;
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
    (void) submit_bio(bio);
#else
    (void) submit_bio(bio_data_dir(bio), bio);
#endif
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
    defined(CONFIG_PREEMPTION) && \
    defined(CONFIG_BLK_CGROUP)
#define preempt_schedule_notrace(x) preempt_schedule(x)
#endif

/*
 * As of the Linux 5.18 kernel bio_alloc() expects a block_device struct
 * as an argument, removing the need to set it with bio_set_dev(). This
 * removes the need for all of the following compatibility code.
 */
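/*
 * For reference (illustrative sketch only; see vdev_bio_alloc() below for
 * the actual usage): on 5.18+ kernels a bio is allocated and bound to its
 * device in a single call,
 *
 *     bio = bio_alloc(bdev, nr_vecs, 0, GFP_NOIO);
 *
 * while older kernels require two steps,
 *
 *     bio = bio_alloc(GFP_NOIO, nr_vecs);
 *     bio_set_dev(bio, bdev);
 */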
#if !defined(HAVE_BIO_ALLOC_4ARG)

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only. Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
 * part, moving blkg_tryget into the private one. Define our own version.
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
    struct percpu_ref *ref = &blkg->refcnt;
    unsigned long __percpu *count;
    bool rc;

    rcu_read_lock_sched();

    if (__ref_is_percpu(ref, &count)) {
        this_cpu_inc(*count);
        rc = true;
    } else {
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
        rc = atomic_long_inc_not_zero(&ref->data->count);
#else
        rc = atomic_long_inc_not_zero(&ref->count);
#endif
    }

    rcu_read_unlock_sched();

    return (rc);
}
#else
#define vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol, thus inadvertently converting the
 * entire macro to be GPL-only. Provide a minimal version which always
 * assigns the request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
    struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
    struct request_queue *q = bio->bi_disk->queue;
#endif

    ASSERT3P(q, !=, NULL);
    ASSERT3P(bio->bi_blkg, ==, NULL);

    if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
        bio->bi_blkg = q->root_blkg;
}

#define bio_associate_blkg	vdev_bio_associate_blkg
#else
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
    struct request_queue *q = bdev->bd_disk->queue;
#else
    struct request_queue *q = bio->bi_disk->queue;
#endif
    bio_clear_flag(bio, BIO_REMAPPED);
    if (bio->bi_bdev != bdev)
        bio_clear_flag(bio, BIO_THROTTLED);
    bio->bi_bdev = bdev;

    ASSERT3P(q, !=, NULL);
    ASSERT3P(bio->bi_blkg, ==, NULL);

    if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
        bio->bi_blkg = q->root_blkg;
}
#define bio_set_dev		vdev_bio_set_dev
#endif
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
    bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */
#endif /* !HAVE_BIO_ALLOC_4ARG */

static inline void
vdev_submit_bio(struct bio *bio)
{
    struct bio_list *bio_list = current->bio_list;
    current->bio_list = NULL;
    vdev_submit_bio_impl(bio);
    current->bio_list = bio_list;
}

static inline struct bio *
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
    unsigned short nr_vecs)
{
    struct bio *bio;

#ifdef HAVE_BIO_ALLOC_4ARG
    bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
#else
    bio = bio_alloc(gfp_mask, nr_vecs);
    if (likely(bio != NULL))
        bio_set_dev(bio, bdev);
#endif

    return (bio);
}

static inline uint_t
vdev_bio_max_segs(struct block_device *bdev)
{
    /*
     * Smallest of the device max segs and the tuneable max segs. Minimum
     * 4, so there's room to finish split pages if they come up.
     */
    const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
    const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
        MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
    const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);

#ifdef HAVE_BIO_MAX_SEGS
    return (bio_max_segs(max_segs));
#else
    return (MIN(max_segs, BIO_MAX_PAGES));
#endif
}

static inline uint_t
vdev_bio_max_bytes(struct block_device *bdev)
{
    return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
}


/*
 * Virtual block IO object (VBIO)
 *
 * Linux block IO (BIO) objects have a limit on how many data segments (pages)
 * they can hold. Depending on how they're allocated and structured, a large
 * ZIO can require more than one BIO to be submitted to the kernel, which then
 * all have to complete before we can return the completed ZIO back to ZFS.
 *
 * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
 * translate a ZIO down into the kernel block layer and back again.
 *
 * Note that these are only used for data ZIOs (read/write). Meta-operations
 * (flush/trim) don't need multiple BIOs and so can just make the call
 * directly.
 */
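/*
 * A rough sketch of how the functions below fit together (for orientation
 * only; the code itself is authoritative):
 *
 *     vbio = vbio_alloc(zio, bdev, flags);
 *     vbio_submit(vbio, abd, size);
 *         -> iterates the ABD pages via vbio_fill_cb()/vbio_add_page(),
 *            chaining and submitting BIOs as each one fills up
 *     vbio_completion()
 *         -> runs once the whole chain has completed, records any error,
 *            frees the vbio and hands the ZIO back to the pipeline
 */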
typedef struct {
    zio_t		*vbio_zio;	/* parent zio */

    struct block_device	*vbio_bdev;	/* blockdev to submit bios to */

    abd_t		*vbio_abd;	/* abd carrying borrowed linear buf */

    uint_t		vbio_max_segs;	/* max segs per bio */

    uint_t		vbio_max_bytes;	/* max bytes per bio */
    uint_t		vbio_lbs_mask;	/* logical block size mask */

    uint64_t		vbio_offset;	/* start offset of next bio */

    struct bio		*vbio_bio;	/* pointer to the current bio */
    int			vbio_flags;	/* bio flags */
} vbio_t;

static vbio_t *
vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
{
    vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);

    vbio->vbio_zio = zio;
    vbio->vbio_bdev = bdev;
    vbio->vbio_abd = NULL;
    vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
    vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
    vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
    vbio->vbio_offset = zio->io_offset;
    vbio->vbio_bio = NULL;
    vbio->vbio_flags = flags;

    return (vbio);
}

BIO_END_IO_PROTO(vbio_completion, bio, error);

static int
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
{
    struct bio *bio = vbio->vbio_bio;
    uint_t ssize;

    while (size > 0) {
        if (bio == NULL) {
            /* New BIO, allocate and set up */
            bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
                vbio->vbio_max_segs);
            VERIFY(bio);

            BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
            bio_set_op_attrs(bio,
                vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
                WRITE : READ, vbio->vbio_flags);

            if (vbio->vbio_bio) {
                bio_chain(vbio->vbio_bio, bio);
                vdev_submit_bio(vbio->vbio_bio);
            }
            vbio->vbio_bio = bio;
        }

        /*
         * Only load as much of the current page data as will fit in
         * the space left in the BIO, respecting lbs alignment. Older
         * kernels will error if we try to overfill the BIO, while
         * newer ones will accept it and split the BIO. This ensures
         * everything works on older kernels, and avoids an additional
         * overhead on the new.
         */
        ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
            vbio->vbio_lbs_mask);
        if (ssize > 0 &&
            bio_add_page(bio, page, ssize, offset) == ssize) {
            /* Accepted, adjust and load any remaining. */
            size -= ssize;
            offset += ssize;
            continue;
        }

        /* No room, set up for a new BIO and loop */
        vbio->vbio_offset += BIO_BI_SIZE(bio);

        /* Signal new BIO allocation wanted */
        bio = NULL;
    }

    return (0);
}

/* Iterator callback to submit ABD pages to the vbio. */
static int
vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
{
    vbio_t *vbio = priv;
    return (vbio_add_page(vbio, page, len, off));
}

/* Create some BIOs, fill them with data and submit them */
static void
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
{
    ASSERT(vbio->vbio_bdev);

    /*
     * We plug so we can submit the BIOs as we go and only unplug them when
     * they are fully created and submitted. This is important; if we don't
     * plug, then the kernel may start executing earlier BIOs while we're
     * still creating and executing later ones, and if the device goes
     * away while that's happening, older kernels can get confused and
     * trample memory.
     */
    struct blk_plug plug;
    blk_start_plug(&plug);

    (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
    ASSERT(vbio->vbio_bio);

    vbio->vbio_bio->bi_end_io = vbio_completion;
    vbio->vbio_bio->bi_private = vbio;

    vdev_submit_bio(vbio->vbio_bio);

    blk_finish_plug(&plug);

    vbio->vbio_bio = NULL;
    vbio->vbio_bdev = NULL;
}

/* IO completion callback */
BIO_END_IO_PROTO(vbio_completion, bio, error)
{
    vbio_t *vbio = bio->bi_private;
    zio_t *zio = vbio->vbio_zio;

    ASSERT(zio);

    /* Capture and log any errors */
#ifdef HAVE_1ARG_BIO_END_IO_T
    zio->io_error = BIO_END_IO_ERROR(bio);
#else
    zio->io_error = 0;
    if (error)
        zio->io_error = -(error);
    else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
        zio->io_error = EIO;
#endif
    ASSERT3U(zio->io_error, >=, 0);

    if (zio->io_error)
        vdev_disk_error(zio);

    /* Return the BIO to the kernel */
    bio_put(bio);

    /*
     * If we copied the ABD before issuing it, clean up and return the copy
     * to the ABD, with changes if appropriate.
     */
    if (vbio->vbio_abd != NULL) {
        void *buf = abd_to_buf(vbio->vbio_abd);
        abd_free(vbio->vbio_abd);
        vbio->vbio_abd = NULL;

        if (zio->io_type == ZIO_TYPE_READ)
            abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
        else
            abd_return_buf(zio->io_abd, buf, zio->io_size);
    }

    /* Final cleanup */
    kmem_free(vbio, sizeof (vbio_t));

    /* All done, submit for processing */
    zio_delay_interrupt(zio);
}

/*
 * Iterator callback to count ABD pages and check their size & alignment.
 *
 * On Linux, each BIO segment can take a page pointer, and an offset+length of
 * the data within that page. A page can be arbitrarily large ("compound"
 * pages) but we still have to ensure the data portion is correctly sized and
 * aligned to the logical block size, to ensure that if the kernel wants to
 * split the BIO, the two halves will still be properly aligned.
 */
typedef struct {
    uint_t	bmask;
    uint_t	npages;
    uint_t	end;
} vdev_disk_check_pages_t;

static int
vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
{
    vdev_disk_check_pages_t *s = priv;

    /*
     * If we didn't finish on a block size boundary last time, then there
     * would be a gap if we tried to use this ABD as-is, so abort.
     */
    if (s->end != 0)
        return (1);

    /*
     * Note if we're taking less than a full block, so we can check it
     * above on the next call.
     */
    s->end = len & s->bmask;

    /* All blocks after the first must start on a block size boundary. */
    if (s->npages != 0 && (off & s->bmask) != 0)
        return (1);

    s->npages++;
    return (0);
}

/*
 * Check if we can submit the pages in this ABD to the kernel as-is. Returns
 * B_TRUE if so, or B_FALSE if the ABD cannot be submitted without copying.
 */
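/*
 * Illustrative example (not exhaustive): with a 4K logical block size, an
 * ABD whose first segment is a 512-byte chunk followed by full 4K pages
 * fails this check, because the first segment ends mid-block and the next
 * one would leave a gap; vdev_disk_io_rw() then copies such an ABD into a
 * borrowed linear buffer before submission. With a 512-byte logical block
 * size every page-aligned ABD passes.
 */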
static boolean_t
vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
{
    vdev_disk_check_pages_t s = {
        .bmask = bdev_logical_block_size(bdev)-1,
        .npages = 0,
        .end = 0,
    };

    if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
        return (B_FALSE);

    return (B_TRUE);
}

static int
vdev_disk_io_rw(zio_t *zio)
{
    vdev_t *v = zio->io_vd;
    vdev_disk_t *vd = v->vdev_tsd;
    struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
    int flags = 0;

    /*
     * Accessing outside the block device is never allowed.
     */
    if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
        vdev_dbgmsg(zio->io_vd,
            "Illegal access %llu size %llu, device size %llu",
            (u_longlong_t)zio->io_offset,
            (u_longlong_t)zio->io_size,
            (u_longlong_t)i_size_read(bdev->bd_inode));
        return (SET_ERROR(EIO));
    }

    if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
        v->vdev_failfast == B_TRUE) {
        bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
            zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
    }

    /*
     * Check alignment of the incoming ABD. If any part of it would require
     * submitting a page that is not aligned to the logical block size,
     * then we take a copy into a linear buffer and submit that instead.
     * This should be impossible on a 512b LBS, and fairly rare on 4K,
     * usually requiring abnormally-small data blocks (eg gang blocks)
     * mixed into the same ABD as larger ones (eg aggregated).
     */
    abd_t *abd = zio->io_abd;
    if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
        void *buf;
        if (zio->io_type == ZIO_TYPE_READ)
            buf = abd_borrow_buf(zio->io_abd, zio->io_size);
        else
            buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);

        /*
         * Wrap the copy in an abd_t, so we can use the same iterators
         * to count and fill the vbio later.
         */
        abd = abd_get_from_buf(buf, zio->io_size);

        /*
         * False here would mean the borrowed copy has an invalid
         * alignment too, which would mean we've somehow been passed a
         * linear ABD with an interior page that has a non-zero offset
         * or a size not a multiple of PAGE_SIZE. This is not possible.
         * It would mean either zio_buf_alloc() or its underlying
         * allocators have done something extremely strange, or our
         * math in vdev_disk_check_pages() is wrong. In either case,
         * something is seriously wrong and it's not safe to continue.
         */
        VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
    }

    /* Allocate vbio, with a pointer to the borrowed ABD if necessary */
    vbio_t *vbio = vbio_alloc(zio, bdev, flags);
    if (abd != zio->io_abd)
        vbio->vbio_abd = abd;

    /* Fill it with data pages and submit it to the kernel */
    vbio_submit(vbio, abd, zio->io_size);
    return (0);
}

/* ========== */

/*
 * This is the classic, battle-tested BIO submission code. Until we're totally
 * sure that the new code is safe and correct in all cases, this will remain
 * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
 * load time.
 *
 * These functions have been renamed to vdev_classic_* to make it clear what
 * they belong to, but their implementations are unchanged.
 */

/*
 * Virtual device vector for disks.
 */
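/*
 * Rough shape of the classic path below (orientation only): a dio_request_t
 * is allocated with room for a fixed number of BIO pointers, each BIO is
 * mapped over a slice of the ZIO's ABD and submitted, and a reference count
 * ensures vdev_classic_physio_completion() only hands the ZIO back to the
 * pipeline once every attached BIO has completed. If the buffer needs more
 * BIOs than were allocated, the request is freed and retried with double
 * the count.
 */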
typedef struct dio_request {
    zio_t		*dr_zio;	/* Parent ZIO */
    atomic_t		dr_ref;		/* References */
    int			dr_error;	/* Bio error */
    int			dr_bio_count;	/* Count of bio's */
    struct bio		*dr_bio[];	/* Attached bio's */
} dio_request_t;

static dio_request_t *
vdev_classic_dio_alloc(int bio_count)
{
    dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
        sizeof (struct bio *) * bio_count, KM_SLEEP);
    atomic_set(&dr->dr_ref, 0);
    dr->dr_bio_count = bio_count;
    dr->dr_error = 0;

    for (int i = 0; i < dr->dr_bio_count; i++)
        dr->dr_bio[i] = NULL;

    return (dr);
}

static void
vdev_classic_dio_free(dio_request_t *dr)
{
    int i;

    for (i = 0; i < dr->dr_bio_count; i++)
        if (dr->dr_bio[i])
            bio_put(dr->dr_bio[i]);

    kmem_free(dr, sizeof (dio_request_t) +
        sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_classic_dio_get(dio_request_t *dr)
{
    atomic_inc(&dr->dr_ref);
}

static void
vdev_classic_dio_put(dio_request_t *dr)
{
    int rc = atomic_dec_return(&dr->dr_ref);

    /*
     * Free the dio_request when the last reference is dropped and
     * ensure zio_interpret is called only once with the correct zio
     */
    if (rc == 0) {
        zio_t *zio = dr->dr_zio;
        int error = dr->dr_error;

        vdev_classic_dio_free(dr);

        if (zio) {
            zio->io_error = error;
            ASSERT3S(zio->io_error, >=, 0);
            if (zio->io_error)
                vdev_disk_error(zio);

            zio_delay_interrupt(zio);
        }
    }
}

BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
{
    dio_request_t *dr = bio->bi_private;

    if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
        dr->dr_error = BIO_END_IO_ERROR(bio);
#else
        if (error)
            dr->dr_error = -(error);
        else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
            dr->dr_error = EIO;
#endif
    }

    /* Drop reference acquired by vdev_classic_physio */
    vdev_classic_dio_put(dr);
}

static inline unsigned int
vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
    unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
        bio_size, abd_offset);

#ifdef HAVE_BIO_MAX_SEGS
    return (bio_max_segs(nr_segs));
#else
    return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}

static int
vdev_classic_physio(zio_t *zio)
{
    vdev_t *v = zio->io_vd;
    vdev_disk_t *vd = v->vdev_tsd;
    struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
    size_t io_size = zio->io_size;
    uint64_t io_offset = zio->io_offset;
    int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
    int flags = 0;

    dio_request_t *dr;
    uint64_t abd_offset;
    uint64_t bio_offset;
    int bio_size;
    int bio_count = 16;
    int error = 0;
    struct blk_plug plug;
    unsigned short nr_vecs;

    /*
     * Accessing outside the block device is never allowed.
     */
    if (io_offset + io_size > bdev->bd_inode->i_size) {
        vdev_dbgmsg(zio->io_vd,
            "Illegal access %llu size %llu, device size %llu",
            (u_longlong_t)io_offset,
            (u_longlong_t)io_size,
            (u_longlong_t)i_size_read(bdev->bd_inode));
        return (SET_ERROR(EIO));
    }

retry:
    dr = vdev_classic_dio_alloc(bio_count);

    if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
        zio->io_vd->vdev_failfast == B_TRUE) {
        bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
            zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
    }

    dr->dr_zio = zio;

    /*
     * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
     * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
     * can cover at least 128KB and at most 1MB. When the required number
     * of iovec's exceeds this, we are forced to break the IO into multiple
     * bio's and wait for them all to complete. This is likely if the
     * recordsize property is increased beyond 1MB. The default
     * bio_count=16 should typically accommodate the maximum-size zio of
     * 16MB.
     */

    abd_offset = 0;
    bio_offset = io_offset;
    bio_size = io_size;
    for (int i = 0; i <= dr->dr_bio_count; i++) {

        /* Finished constructing bio's for given buffer */
        if (bio_size <= 0)
            break;

        /*
         * If additional bio's are required, we have to retry, but
         * this should be rare - see the comment above.
         */
        if (dr->dr_bio_count == i) {
            vdev_classic_dio_free(dr);
            bio_count *= 2;
            goto retry;
        }

        nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
        dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
        if (unlikely(dr->dr_bio[i] == NULL)) {
            vdev_classic_dio_free(dr);
            return (SET_ERROR(ENOMEM));
        }

        /* Matching put called by vdev_classic_physio_completion */
        vdev_classic_dio_get(dr);

        BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
        dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
        dr->dr_bio[i]->bi_private = dr;
        bio_set_op_attrs(dr->dr_bio[i], rw, flags);

        /* Remaining size is returned to become the new size */
        bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
            bio_size, abd_offset);

        /* Advance in buffer and construct another bio if needed */
        abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
        bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
    }

    /* Extra reference to protect dio_request during vdev_submit_bio */
    vdev_classic_dio_get(dr);

    if (dr->dr_bio_count > 1)
        blk_start_plug(&plug);

    /* Submit all bio's associated with this dio */
    for (int i = 0; i < dr->dr_bio_count; i++) {
        if (dr->dr_bio[i])
            vdev_submit_bio(dr->dr_bio[i]);
    }

    if (dr->dr_bio_count > 1)
        blk_finish_plug(&plug);

    vdev_classic_dio_put(dr);

    return (error);
}

/* ========== */

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
    zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
    zio->io_error = BIO_END_IO_ERROR(bio);
#else
    zio->io_error = -error;
#endif

    if (zio->io_error && (zio->io_error == EOPNOTSUPP))
        zio->io_vd->vdev_nowritecache = B_TRUE;

    bio_put(bio);
    ASSERT3S(zio->io_error, >=, 0);
    if (zio->io_error)
        vdev_disk_error(zio);
    zio_interrupt(zio);
}

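/*
 * A cache-flush request is simply an empty BIO carrying the flush flag.
 * As a hedged sketch of what vdev_disk_io_flush() below builds (the
 * bio_set_flush() compat wrapper is expected to translate to a REQ_PREFLUSH
 * write with no payload on modern kernels, or the older WRITE_FLUSH-style
 * flags on earlier ones):
 *
 *     bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
 *     bio_set_flush(bio);
 *     vdev_submit_bio(bio);
 */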
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
    struct request_queue *q;
    struct bio *bio;

    q = bdev_get_queue(bdev);
    if (!q)
        return (SET_ERROR(ENXIO));

    bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
    if (unlikely(bio == NULL))
        return (SET_ERROR(ENOMEM));

    bio->bi_end_io = vdev_disk_io_flush_completion;
    bio->bi_private = zio;
    bio_set_flush(bio);
    vdev_submit_bio(bio);
    invalidate_bdev(bdev);

    return (0);
}

#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \
    defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC)
BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error)
{
    zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
    zio->io_error = BIO_END_IO_ERROR(bio);
#else
    zio->io_error = -error;
#endif
    bio_put(bio);
    if (zio->io_error)
        vdev_disk_error(zio);
    zio_interrupt(zio);
}

static int
vdev_issue_discard_trim(zio_t *zio, unsigned long flags)
{
    int ret;
    struct bio *bio = NULL;

#if defined(BLKDEV_DISCARD_SECURE)
    ret = - __blkdev_issue_discard(
        BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
        zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio);
#else
    (void) flags;
    ret = - __blkdev_issue_discard(
        BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
        zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio);
#endif
    if (!ret && bio) {
        bio->bi_private = zio;
        bio->bi_end_io = vdev_disk_discard_end_io;
        vdev_submit_bio(bio);
    }
    return (ret);
}
#endif

static int
vdev_disk_io_trim(zio_t *zio)
{
    unsigned long trim_flags = 0;
    if (zio->io_trim_flags & ZIO_TRIM_SECURE) {
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
        return (-blkdev_issue_secure_erase(
            BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
            zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
#elif defined(BLKDEV_DISCARD_SECURE)
        trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
    }
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \
    defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC)
    return (vdev_issue_discard_trim(zio, trim_flags));
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
    return (-blkdev_issue_discard(
        BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
        zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags));
#else
#error "Unsupported kernel"
#endif
}

int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;

static void
vdev_disk_io_start(zio_t *zio)
{
    vdev_t *v = zio->io_vd;
    vdev_disk_t *vd = v->vdev_tsd;
    int error;

    /*
     * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
     * Nothing to be done here but return failure.
     */
    if (vd == NULL) {
        zio->io_error = ENXIO;
        zio_interrupt(zio);
        return;
    }

    rw_enter(&vd->vd_lock, RW_READER);

    /*
     * If the vdev is closed, it's likely due to a failed reopen and is
     * in the UNAVAIL state. Nothing to be done here but return failure.
     */
    if (vd->vd_bdh == NULL) {
        rw_exit(&vd->vd_lock);
        zio->io_error = ENXIO;
        zio_interrupt(zio);
        return;
    }

    switch (zio->io_type) {
    case ZIO_TYPE_IOCTL:

        if (!vdev_readable(v)) {
            rw_exit(&vd->vd_lock);
            zio->io_error = SET_ERROR(ENXIO);
            zio_interrupt(zio);
            return;
        }

        switch (zio->io_cmd) {
        case DKIOCFLUSHWRITECACHE:

            if (zfs_nocacheflush)
                break;

            if (v->vdev_nowritecache) {
                zio->io_error = SET_ERROR(ENOTSUP);
                break;
            }

            error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
            if (error == 0) {
                rw_exit(&vd->vd_lock);
                return;
            }

            zio->io_error = error;

            break;

        default:
            zio->io_error = SET_ERROR(ENOTSUP);
        }

        rw_exit(&vd->vd_lock);
        zio_execute(zio);
        return;

    case ZIO_TYPE_TRIM:
        zio->io_error = vdev_disk_io_trim(zio);
        rw_exit(&vd->vd_lock);
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
        if (zio->io_trim_flags & ZIO_TRIM_SECURE)
            zio_interrupt(zio);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
        zio_interrupt(zio);
#endif
        return;

    case ZIO_TYPE_READ:
    case ZIO_TYPE_WRITE:
        zio->io_target_timestamp = zio_handle_io_delay(zio);
        error = vdev_disk_io_rw_fn(zio);
        rw_exit(&vd->vd_lock);
        if (error) {
            zio->io_error = error;
            zio_interrupt(zio);
        }
        return;

    default:
        /*
         * Getting here means our parent vdev has made a very strange
         * request of us, and shouldn't happen. Assert here to force a
         * crash in dev builds, but in production return the IO
         * unhandled. The pool will likely suspend anyway but that's
         * nicer than crashing the kernel.
         */
        ASSERT3S(zio->io_type, ==, -1);

        rw_exit(&vd->vd_lock);
        zio->io_error = SET_ERROR(ENOTSUP);
        zio_interrupt(zio);
        return;
    }

    __builtin_unreachable();
}

static void
vdev_disk_io_done(zio_t *zio)
{
    /*
     * If the device returned EIO, we revalidate the media. If it is
     * determined the media has changed this triggers the asynchronous
     * removal of the device from the configuration.
     */
    if (zio->io_error == EIO) {
        vdev_t *v = zio->io_vd;
        vdev_disk_t *vd = v->vdev_tsd;

        if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) {
            invalidate_bdev(BDH_BDEV(vd->vd_bdh));
            v->vdev_remove_wanted = B_TRUE;
            spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
        }
    }
}

static void
vdev_disk_hold(vdev_t *vd)
{
    ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

    /* We must have a pathname, and it must be absolute. */
    if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
        return;

    /*
     * Only prefetch path and devid info if the device has
     * never been opened.
     */
    if (vd->vdev_tsd != NULL)
        return;

}

static void
vdev_disk_rele(vdev_t *vd)
{
    ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

    /* XXX: Implement me as a vnode rele for the device */
}

/*
 * BIO submission method. See comment above about vdev_classic.
 * Set zfs_vdev_disk_classic=0 for new, =1 for classic
 */
static uint_t zfs_vdev_disk_classic = 0;	/* default new */

/* Set submission function from module parameter */
static int
vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
{
    int err = param_set_uint(buf, kp);
    if (err < 0)
        return (SET_ERROR(err));

    vdev_disk_io_rw_fn =
        zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;

    printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
        zfs_vdev_disk_classic ? "classic" : "new");

    return (0);
}

/*
 * At first vdev use, set the submission function from the default value if
 * it hasn't been set already.
 */
static int
vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
    (void) spa;
    (void) nv;
    (void) tsd;

    if (vdev_disk_io_rw_fn == NULL)
        vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
            vdev_classic_physio : vdev_disk_io_rw;

    return (0);
}

vdev_ops_t vdev_disk_ops = {
    .vdev_op_init = vdev_disk_init,
    .vdev_op_fini = NULL,
    .vdev_op_open = vdev_disk_open,
    .vdev_op_close = vdev_disk_close,
    .vdev_op_asize = vdev_default_asize,
    .vdev_op_min_asize = vdev_default_min_asize,
    .vdev_op_min_alloc = NULL,
    .vdev_op_io_start = vdev_disk_io_start,
    .vdev_op_io_done = vdev_disk_io_done,
    .vdev_op_state_change = NULL,
    .vdev_op_need_resilver = NULL,
    .vdev_op_hold = vdev_disk_hold,
    .vdev_op_rele = vdev_disk_rele,
    .vdev_op_remap = NULL,
    .vdev_op_xlate = vdev_default_xlate,
    .vdev_op_rebuild_asize = NULL,
    .vdev_op_metaslab_init = NULL,
    .vdev_op_config_generate = NULL,
    .vdev_op_nparity = NULL,
    .vdev_op_ndisks = NULL,
    .vdev_op_type = VDEV_TYPE_DISK,	/* name of this vdev type */
    .vdev_op_leaf = B_TRUE,		/* leaf vdev */
    .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};

/*
 * The zfs_vdev_scheduler module option has been deprecated. Setting this
 * value no longer has any effect. It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file. The following warning will be logged.
 */
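/*
 * For example, a leftover line such as the following in
 * /etc/modprobe.d/zfs.conf is still accepted at module load time, but only
 * produces the warning below and has no other effect:
 *
 *     options zfs zfs_vdev_scheduler=noop
 */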
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
    int error = param_set_charp(val, kp);
    if (error == 0) {
        printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
            "is not supported.\n");
    }

    return (error);
}

static const char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
    uint_t val;
    int error;

    error = kstrtouint(buf, 0, &val);
    if (error < 0)
        return (SET_ERROR(error));

    if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
        return (SET_ERROR(-EINVAL));

    error = param_set_uint(buf, kp);
    if (error < 0)
        return (SET_ERROR(error));

    return (0);
}

int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
    uint_t val;
    int error;

    error = kstrtouint(buf, 0, &val);
    if (error < 0)
        return (SET_ERROR(error));

    if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
        return (SET_ERROR(-EINVAL));

    error = param_set_uint(buf, kp);
    if (error < 0)
        return (SET_ERROR(error));

    return (0);
}

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
    "Timeout before determining that a device is missing");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
    "Defines failfast mask: 1 - device, 2 - transport, 4 - driver");

ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
    "Maximum number of data segments to add to an IO request (min 4)");

ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
    vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
    "Use classic BIO submission method");
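
/*
 * Usage note (illustrative, not part of the build): the ZMOD_RW parameters
 * above can be adjusted at runtime through sysfs, e.g.
 *
 *     echo 16 > /sys/module/zfs/parameters/zfs_vdev_disk_max_segs
 *
 * while zfs_vdev_disk_classic is ZMOD_RD and must be chosen at module load
 * time, e.g. with "options zfs zfs_vdev_disk_classic=1" in
 * /etc/modprobe.d/zfs.conf.
 */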