/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>
#ifdef HAVE_LINUX_BLK_CGROUP_HEADER
#include <linux/blk-cgroup.h>
#endif

/*
 * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
 * block_device. Since it carries the block_device inside, it's convenient to
 * just use the handle as a proxy.
 *
 * Linux 6.9.x uses a file for the same purpose.
 *
 * For pre-6.8, we just emulate this with a cast, since we don't need any of
 * the other fields inside the handle.
 */
#if defined(HAVE_BDEV_OPEN_BY_PATH)
typedef struct bdev_handle zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((bdh)->bdev)
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
typedef struct file zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		(file_bdev(bdh))
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#else
typedef void zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((struct block_device *)bdh)
#define	BDH_IS_ERR(bdh)		(IS_ERR(BDH_BDEV(bdh)))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(BDH_BDEV(bdh)))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#endif

typedef struct vdev_disk {
	zfs_bdev_handle_t *vd_bdh;
	krwlock_t vd_lock;
} vdev_disk_t;

/*
 * Maximum number of segments to add to a bio (min 4). If this is higher than
 * the maximum allowed by the device queue or the kernel itself, it will be
 * clamped. Setting it to zero will cause the kernel's ideal size to be used.
 */
uint_t zfs_vdev_disk_max_segs = 0;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static uint_t zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * BIO request failfast mask: 1 - device, 2 - transport, 4 - driver.
 */

static unsigned int zfs_vdev_failfast_mask = 1;

/*
 * Convert SPA mode flags into bdev open mode flags.
 */
#ifdef HAVE_BLK_MODE_T
typedef blk_mode_t vdev_bdev_mode_t;
#define	VDEV_BDEV_MODE_READ	BLK_OPEN_READ
#define	VDEV_BDEV_MODE_WRITE	BLK_OPEN_WRITE
#define	VDEV_BDEV_MODE_EXCL	BLK_OPEN_EXCL
#define	VDEV_BDEV_MODE_MASK	(BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL)
#else
typedef fmode_t vdev_bdev_mode_t;
#define	VDEV_BDEV_MODE_READ	FMODE_READ
#define	VDEV_BDEV_MODE_WRITE	FMODE_WRITE
#define	VDEV_BDEV_MODE_EXCL	FMODE_EXCL
#define	VDEV_BDEV_MODE_MASK	(FMODE_READ|FMODE_WRITE|FMODE_EXCL)
#endif

static vdev_bdev_mode_t
vdev_bdev_mode(spa_mode_t smode)
{
	ASSERT3U(smode, !=, SPA_MODE_UNINIT);
	ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE));

	vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL;

	if (smode & SPA_MODE_READ)
		bmode |= VDEV_BDEV_MODE_READ;

	if (smode & SPA_MODE_WRITE)
		bmode |= VDEV_BDEV_MODE_WRITE;

	ASSERT(bmode & VDEV_BDEV_MODE_MASK);
	ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK);

	return (bmode);
}

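/*
 * For example, a pool imported read-write maps SPA_MODE_READ|SPA_MODE_WRITE
 * to VDEV_BDEV_MODE_READ|VDEV_BDEV_MODE_WRITE. The exclusive bit is always
 * added, so the kernel associates the open with zfs_vdev_holder and refuses
 * competing exclusive claims on the same block device.
 */
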
/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
#ifdef HAVE_BDEV_NR_BYTES
	return (bdev_nr_bytes(bdev));
#else
	return (i_size_read(bdev->bd_inode));
#endif
}

#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
	return (bdev->bd_contains);
}
#endif

#if defined(HAVE_BDEVNAME)
#define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
#else
static inline void
vdev_bdevname(struct block_device *bdev, char *name)
{
	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
}
#endif

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity. Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate. Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, its usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev != bdev_whole(bdev)) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions. Over-reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition
		 * layout is keeping less than PARTITION_END_ALIGNMENT bytes
		 * after the "reserved" EFI partition: in such cases return
		 * the device usable capacity.
		 */
		available = bdev_capacity(bdev_whole(bdev)) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

static void
vdev_disk_kobj_evt_post(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;
	if (vd && vd->vd_bdh) {
		spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh));
	} else {
		vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
		    v->vdev_path);
	}
}

static zfs_bdev_handle_t *
vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder)
{
	vdev_bdev_mode_t bmode = vdev_bdev_mode(smode);

#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
	return (bdev_file_open_by_path(path, bmode, holder, NULL));
#elif defined(HAVE_BDEV_OPEN_BY_PATH)
	return (bdev_open_by_path(path, bmode, holder, NULL));
#elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
	return (blkdev_get_by_path(path, bmode, holder, NULL));
#else
	return (blkdev_get_by_path(path, bmode, holder));
#endif
}

static void
vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
{
#if defined(HAVE_BDEV_RELEASE)
	return (bdev_release(bdh));
#elif defined(HAVE_BLKDEV_PUT_HOLDER)
	return (blkdev_put(BDH_BDEV(bdh), holder));
#elif defined(HAVE_BLKDEV_PUT)
	return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode)));
#else
	fput(bdh);
#endif
}

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	zfs_bdev_handle_t *bdh;
	spa_mode_t smode = spa_mode(v->vdev_spa);
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open. When expanding a
	 * partition force re-scanning the partition table if userland
	 * did not take care of this already. We need to do this while closed
	 * in order to get an accurate updated block device size. Then,
	 * since udev may need to recreate the device links, increase the
	 * open retry timeout before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdh = vd->vd_bdh;
		vd->vd_bdh = NULL;

		if (bdh) {
			struct block_device *bdev = BDH_BDEV(bdh);
			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
				vdev_bdevname(bdev_whole(bdev), disk_name + 5);
				/*
				 * If userland has BLKPG_RESIZE_PARTITION,
				 * then it should have updated the partition
				 * table already. We can detect this by
				 * comparing our current physical size
				 * with that of the device. If they are
				 * the same, then we must not have
				 * BLKPG_RESIZE_PARTITION or it failed to
				 * update the partition table online. We
				 * fallback to rescanning the partition
				 * table from the kernel below. However,
				 * if the capacity already reflects the
				 * updated partition, then we skip
				 * rescanning the partition table here.
				 */
				if (v->vdev_psize == bdev_capacity(bdev))
					reread_part = B_TRUE;
			}

			vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
		}

		if (reread_part) {
			bdh = vdev_blkdev_get_by_path(disk_name, smode,
			    zfs_vdev_holder);
			if (!BDH_IS_ERR(bdh)) {
				int error =
				    vdev_bdev_reread_part(BDH_BDEV(bdh));
				vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
				if (error == 0) {
					timeout = MSEC2NSEC(
					    zfs_vdev_open_timeout_ms * 2);
				}
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time. This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue. If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved. This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit. It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events. This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point. Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up. In
	 * practice delays have been observed to be on the order of 100ms.
	 *
	 * When ERESTARTSYS is returned it indicates the block device is
	 * a zvol which could not be opened due to the deadlock detection
	 * logic in zvol_open(). Extend the timeout and retry the open;
	 * subsequent attempts are expected to eventually succeed.
	 */
	hrtime_t start = gethrtime();
	bdh = BDH_ERR_PTR(-ENXIO);
	while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) {
		bdh = vdev_blkdev_get_by_path(v->vdev_path, smode,
		    zfs_vdev_holder);
		if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) {
			/*
			 * There is no point in waiting since the device
			 * was removed explicitly.
			 */
			if (v->vdev_removed)
				break;

			schedule_timeout_interruptible(MSEC_TO_TICK(10));
		} else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) {
			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
			continue;
		} else if (BDH_IS_ERR(bdh)) {
			break;
		}
	}

	if (BDH_IS_ERR(bdh)) {
		int error = -BDH_PTR_ERR(bdh);
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
		    (u_longlong_t)(gethrtime() - start),
		    (u_longlong_t)timeout);
		vd->vd_bdh = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdh = bdh;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);

	/* Determine the physical block size */
	int physical_block_size = bdev_physical_block_size(bdev);

	/* Determine the logical block size */
	int logical_block_size = bdev_logical_block_size(bdev);

	/*
	 * If the device has a write cache, clear the nowritecache flag,
	 * so that we start issuing flush requests again.
	 */
	v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev);

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = bdev_discard_supported(bdev);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = bdev_secure_discard_supported(bdev);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*physical_ashift = highbit64(MAX(physical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	*logical_ashift = highbit64(MAX(logical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdh != NULL)
		vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
		    zfs_vdev_holder);

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio);
#else
	(void) submit_bio(bio_data_dir(bio), bio);
#endif
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
	defined(CONFIG_PREEMPTION) && \
	defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif

/*
 * As of the Linux 5.18 kernel, bio_alloc() expects a block_device struct
 * as an argument, removing the need to set it with bio_set_dev(). This
 * removes the need for all of the following compatibility code.
 */
#if !defined(HAVE_BIO_ALLOC_4ARG)

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only. Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
 * part, moving blkg_tryget into the private one. Define our own version.
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
#else
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol, thus inadvertently making the
 * entire macro GPL-only. Provide a minimal version which always assigns
 * the request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}

#define	bio_associate_blkg vdev_bio_associate_blkg
#else
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_set_dev		vdev_bio_set_dev
#endif
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */
#endif /* !HAVE_BIO_ALLOC_4ARG */

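/*
 * Submit a BIO to the kernel outside of any bio_list the current task may be
 * batching. Clearing current->bio_list means the request is dispatched
 * immediately rather than queued behind an in-flight BIO on this task, which
 * matters when we are called from within the block layer itself; the saved
 * list is restored once submit_bio() returns.
 */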
static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
}

static inline struct bio *
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
    unsigned short nr_vecs)
{
	struct bio *bio;

#ifdef HAVE_BIO_ALLOC_4ARG
	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
#else
	bio = bio_alloc(gfp_mask, nr_vecs);
	if (likely(bio != NULL))
		bio_set_dev(bio, bdev);
#endif

	return (bio);
}

static inline uint_t
vdev_bio_max_segs(struct block_device *bdev)
{
	/*
	 * Smallest of the device max segs and the tuneable max segs. Minimum
	 * 4, so there's room to finish split pages if they come up.
	 */
	const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
	const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
	    MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
	const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(max_segs));
#else
	return (MIN(max_segs, BIO_MAX_PAGES));
#endif
}

static inline uint_t
vdev_bio_max_bytes(struct block_device *bdev)
{
	return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
}


/*
 * Virtual block IO object (VBIO)
 *
 * Linux block IO (BIO) objects have a limit on how many data segments (pages)
 * they can hold. Depending on how they're allocated and structured, a large
 * ZIO can require more than one BIO to be submitted to the kernel, which then
 * all have to complete before we can return the completed ZIO back to ZFS.
 *
 * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
 * translate a ZIO down into the kernel block layer and back again.
 *
 * Note that these are only used for data ZIOs (read/write). Meta-operations
 * (flush/trim) don't need multiple BIOs and so can just make the call
 * directly.
 */
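/*
 * Lifecycle: a vbio is allocated per data zio in vdev_disk_io_rw(), filled by
 * walking the ABD's pages in vbio_add_page() (chaining and submitting BIOs as
 * each one fills), and finally handed to the kernel in vbio_submit(). Once the
 * last chained BIO completes, vbio_completion() records the result on the zio
 * and frees the vbio.
 */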
typedef struct {
	zio_t		*vbio_zio;	/* parent zio */

	struct block_device *vbio_bdev;	/* blockdev to submit bios to */

	abd_t		*vbio_abd;	/* abd carrying borrowed linear buf */

	uint_t		vbio_max_segs;	/* max segs per bio */

	uint_t		vbio_max_bytes;	/* max bytes per bio */
	uint_t		vbio_lbs_mask;	/* logical block size mask */

	uint64_t	vbio_offset;	/* start offset of next bio */

	struct bio	*vbio_bio;	/* pointer to the current bio */
	int		vbio_flags;	/* bio flags */
} vbio_t;

static vbio_t *
vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
{
	vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);

	vbio->vbio_zio = zio;
	vbio->vbio_bdev = bdev;
	vbio->vbio_abd = NULL;
	vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
	vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
	vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
	vbio->vbio_offset = zio->io_offset;
	vbio->vbio_bio = NULL;
	vbio->vbio_flags = flags;

	return (vbio);
}

BIO_END_IO_PROTO(vbio_completion, bio, error);

static int
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
{
	struct bio *bio = vbio->vbio_bio;
	uint_t ssize;

	while (size > 0) {
		if (bio == NULL) {
			/* New BIO, allocate and set up */
			bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
			    vbio->vbio_max_segs);
			VERIFY(bio);

			BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
			bio_set_op_attrs(bio,
			    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
			    WRITE : READ, vbio->vbio_flags);

			if (vbio->vbio_bio) {
				bio_chain(vbio->vbio_bio, bio);
				vdev_submit_bio(vbio->vbio_bio);
			}
			vbio->vbio_bio = bio;
		}

		/*
		 * Only load as much of the current page data as will fit in
		 * the space left in the BIO, respecting lbs alignment. Older
		 * kernels will error if we try to overfill the BIO, while
		 * newer ones will accept it and split the BIO. This ensures
		 * everything works on older kernels, and avoids an additional
		 * overhead on the new.
		 */
		ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
		    vbio->vbio_lbs_mask);
		if (ssize > 0 &&
		    bio_add_page(bio, page, ssize, offset) == ssize) {
			/* Accepted, adjust and load any remaining. */
			size -= ssize;
			offset += ssize;
			continue;
		}

		/* No room, set up for a new BIO and loop */
		vbio->vbio_offset += BIO_BI_SIZE(bio);

		/* Signal new BIO allocation wanted */
		bio = NULL;
	}

	return (0);
}

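/*
 * Note on completion: each time a BIO fills, it is chained to its replacement
 * with bio_chain() before being submitted, so only the final BIO (the one
 * left in vbio_bio when vbio_submit() runs) carries the end_io callback. The
 * kernel will not invoke that callback until every BIO in the chain has
 * completed.
 */
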
/* Iterator callback to submit ABD pages to the vbio. */
static int
vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
{
	vbio_t *vbio = priv;
	return (vbio_add_page(vbio, page, len, off));
}

/* Create some BIOs, fill them with data and submit them */
static void
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
{
	/*
	 * We plug so we can submit the BIOs as we go and only unplug them when
	 * they are fully created and submitted. This is important; if we don't
	 * plug, then the kernel may start executing earlier BIOs while we're
	 * still creating and executing later ones, and if the device goes
	 * away while that's happening, older kernels can get confused and
	 * trample memory.
	 */
	struct blk_plug plug;
	blk_start_plug(&plug);

	(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
	ASSERT(vbio->vbio_bio);

	vbio->vbio_bio->bi_end_io = vbio_completion;
	vbio->vbio_bio->bi_private = vbio;

	/*
	 * Once submitted, vbio_bio now owns vbio (through bi_private) and we
	 * can't touch it again. The bio may complete and vbio_completion() be
	 * called and free the vbio before this task is run again, so we must
	 * consider it invalid from this point.
	 */
	vdev_submit_bio(vbio->vbio_bio);

	blk_finish_plug(&plug);
}

/* IO completion callback */
BIO_END_IO_PROTO(vbio_completion, bio, error)
{
	vbio_t *vbio = bio->bi_private;
	zio_t *zio = vbio->vbio_zio;

	ASSERT(zio);

	/* Capture and log any errors */
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = 0;
	if (error)
		zio->io_error = -(error);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		zio->io_error = EIO;
#endif
	ASSERT3U(zio->io_error, >=, 0);

	if (zio->io_error)
		vdev_disk_error(zio);

	/* Return the BIO to the kernel */
	bio_put(bio);

	/*
	 * If we copied the ABD before issuing it, clean up and return the copy
	 * to the ABD, with changes if appropriate.
	 */
	if (vbio->vbio_abd != NULL) {
		void *buf = abd_to_buf(vbio->vbio_abd);
		abd_free(vbio->vbio_abd);
		vbio->vbio_abd = NULL;

		if (zio->io_type == ZIO_TYPE_READ)
			abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
		else
			abd_return_buf(zio->io_abd, buf, zio->io_size);
	}

	/* Final cleanup */
	kmem_free(vbio, sizeof (vbio_t));

	/* All done, submit for processing */
	zio_delay_interrupt(zio);
}

/*
 * Iterator callback to count ABD pages and check their size & alignment.
 *
 * On Linux, each BIO segment can take a page pointer, and an offset+length of
 * the data within that page. A page can be arbitrarily large ("compound"
 * pages) but we still have to ensure the data portion is correctly sized and
 * aligned to the logical block size, to ensure that if the kernel wants to
 * split the BIO, the two halves will still be properly aligned.
 *
 * NOTE: if you change this function, change the copy in
 * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
 * data there to validate the change you're making.
 */
typedef struct {
	uint_t	bmask;
	uint_t	npages;
	uint_t	end;
} vdev_disk_check_pages_t;

static int
vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
{
	(void) page;
	vdev_disk_check_pages_t *s = priv;

	/*
	 * If we didn't finish on a block size boundary last time, then there
	 * would be a gap if we tried to use this ABD as-is, so abort.
	 */
	if (s->end != 0)
		return (1);

	/*
	 * Note if we're taking less than a full block, so we can check it
	 * above on the next call.
	 */
	s->end = (off+len) & s->bmask;

	/* All blocks after the first must start on a block size boundary. */
	if (s->npages != 0 && (off & s->bmask) != 0)
		return (1);

	s->npages++;
	return (0);
}

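/*
 * For example, with a 4K logical block size (bmask = 0xfff), a run of pages
 * carrying [0,4K) then [0,4K) passes. A run carrying [0,2K) then [0,4K) does
 * not: the first call leaves s->end nonzero, so the next call aborts and the
 * caller falls back to the linear-copy path in vdev_disk_io_rw().
 */
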
/*
 * Check if we can submit the pages in this ABD to the kernel as-is. Returns
 * B_TRUE if so, or B_FALSE if it can't be submitted like this.
 */
static boolean_t
vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
{
	vdev_disk_check_pages_t s = {
		.bmask = bdev_logical_block_size(bdev)-1,
		.npages = 0,
		.end = 0,
	};

	if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
		return (B_FALSE);

	return (B_TRUE);
}

static int
vdev_disk_io_rw(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
	int flags = 0;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (zio->io_offset + zio->io_size > bdev_capacity(bdev)) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)zio->io_offset,
		    (u_longlong_t)zio->io_size,
		    (u_longlong_t)bdev_capacity(bdev));
		return (SET_ERROR(EIO));
	}

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    v->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	/*
	 * Check alignment of the incoming ABD. If any part of it would require
	 * submitting a page that is not aligned to the logical block size,
	 * then we take a copy into a linear buffer and submit that instead.
	 * This should be impossible on a 512b LBS, and fairly rare on 4K,
	 * usually requiring abnormally-small data blocks (eg gang blocks)
	 * mixed into the same ABD as larger ones (eg aggregated).
	 */
	abd_t *abd = zio->io_abd;
	if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
		void *buf;
		if (zio->io_type == ZIO_TYPE_READ)
			buf = abd_borrow_buf(zio->io_abd, zio->io_size);
		else
			buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);

		/*
		 * Wrap the copy in an abd_t, so we can use the same iterators
		 * to count and fill the vbio later.
		 */
		abd = abd_get_from_buf(buf, zio->io_size);

		/*
		 * False here would mean the borrowed copy has an invalid
		 * alignment too, which would mean we've somehow been passed a
		 * linear ABD with an interior page that has a non-zero offset
		 * or a size not a multiple of PAGE_SIZE. This is not possible.
		 * It would mean either zio_buf_alloc() or its underlying
		 * allocators have done something extremely strange, or our
		 * math in vdev_disk_check_pages() is wrong. In either case,
		 * something is seriously wrong and it's not safe to continue.
		 */
		VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
	}

	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
	vbio_t *vbio = vbio_alloc(zio, bdev, flags);
	if (abd != zio->io_abd)
		vbio->vbio_abd = abd;

	/* Fill it with data pages and submit it to the kernel */
	vbio_submit(vbio, abd, zio->io_size);
	return (0);
}

/* ========== */

/*
 * This is the classic, battle-tested BIO submission code. Until we're totally
 * sure that the new code is safe and correct in all cases, this will remain
 * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
 * load time.
 *
 * These functions have been renamed to vdev_classic_* to make it clear what
 * they belong to, but their implementations are unchanged.
 */
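/*
 * For example, the classic path can be selected persistently with an
 * "options zfs zfs_vdev_disk_classic=1" line in /etc/modprobe.d/zfs.conf
 * before the module is loaded.
 */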

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[];	/* Attached bio's */
} dio_request_t;

static dio_request_t *
vdev_classic_dio_alloc(int bio_count)
{
	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	atomic_set(&dr->dr_ref, 0);
	dr->dr_bio_count = bio_count;
	dr->dr_error = 0;

	for (int i = 0; i < dr->dr_bio_count; i++)
		dr->dr_bio[i] = NULL;

	return (dr);
}

static void
vdev_classic_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_classic_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static void
vdev_classic_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_delay_interrupt() is called only once with the
	 * correct zio.
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_classic_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}
}

BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by vdev_classic_physio */
	vdev_classic_dio_put(dr);
}

static inline unsigned int
vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
	    bio_size, abd_offset);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(nr_segs));
#else
	return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}

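/*
 * Build and submit one or more BIOs covering the zio's buffer. Each attached
 * BIO takes a reference on the dio_request and drops it from its completion
 * callback; an extra reference is held across submission so the dio_request
 * (and its zio) cannot be freed until the last BIO has been both issued and
 * completed.
 */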
static int
vdev_classic_physio(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
	size_t io_size = zio->io_size;
	uint64_t io_offset = zio->io_offset;
	int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
	int flags = 0;

	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size;
	int bio_count = 16;
	int error = 0;
	struct blk_plug plug;
	unsigned short nr_vecs;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev_capacity(bdev)) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)io_offset,
		    (u_longlong_t)io_size,
		    (u_longlong_t)bdev_capacity(bdev));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_classic_dio_alloc(bio_count);

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    zio->io_vd->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	dr->dr_zio = zio;

	/*
	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
	 * can cover at least 128KB and at most 1MB. When the required number
	 * of iovec's exceeds this, we are forced to break the IO in multiple
	 * bio's and wait for them all to complete. This is likely if the
	 * recordsize property is increased beyond 1MB. The default
	 * bio_count=16 should typically accommodate the maximum-size zio of
	 * 16MB.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (int i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * If additional bio's are required, we have to retry, but
		 * this should be rare - see the comment above.
		 */
		if (dr->dr_bio_count == i) {
			vdev_classic_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_classic_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_classic_physio_completion */
		vdev_classic_dio_get(dr);

		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_classic_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

	/* Submit all bio's associated with this dio */
	for (int i = 0; i < dr->dr_bio_count; i++) {
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);
	}

	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

	vdev_classic_dio_put(dr);

	return (error);
}

/* ========== */

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

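/*
 * Issue an empty flush BIO to push the device's volatile write cache to
 * stable storage. Completion is handled asynchronously by the callback above;
 * if the device reports EOPNOTSUPP, vdev_nowritecache is set so further
 * flushes are skipped in vdev_disk_io_start().
 */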
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif
	bio_put(bio);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

/*
 * Wrappers for the different secure erase and discard APIs. We use async
 * when available; in this case, *biop is set to the last bio in the chain.
 */
static int
vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector,
    sector_t nsect, struct bio **biop)
{
	*biop = NULL;
	int error;

#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
	error = blkdev_issue_secure_erase(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
	error = __blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
	error = blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE);
#else
#error "unsupported kernel"
#endif

	return (error);
}

static int
vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector,
    sector_t nsect, struct bio **biop)
{
	*biop = NULL;
	int error;

#if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
	error = __blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, 0, biop);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS)
	error = __blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, biop);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
	error = blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, 0);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS)
	error = blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS);
#else
#error "unsupported kernel"
#endif

	return (error);
}

/*
 * Entry point for TRIM ops. This calls the right wrapper for secure erase or
 * discard, and then does the appropriate finishing work for error vs success
 * and async vs sync.
 */
static int
vdev_disk_io_trim(zio_t *zio)
{
	int error;
	struct bio *bio;

	zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh;
	sector_t sector = zio->io_offset >> 9;
	sector_t nsects = zio->io_size >> 9;

	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
		error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio);
	else
		error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio);

	if (error != 0)
		return (SET_ERROR(-error));

	if (bio == NULL) {
		/*
		 * This was a synchronous op that completed successfully, so
		 * return it to ZFS immediately.
		 */
		zio_interrupt(zio);
	} else {
		/*
		 * This was an asynchronous op; set up completion callback and
		 * issue it.
		 */
		bio->bi_private = zio;
		bio->bi_end_io = vdev_disk_discard_end_io;
		vdev_submit_bio(bio);
	}

	return (0);
}

int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state. Nothing to be done here but return failure.
	 */
	if (vd->vd_bdh == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_FLUSH:

		if (!vdev_readable(v)) {
			/* Drive not there, can't flush */
			error = SET_ERROR(ENXIO);
		} else if (zfs_nocacheflush) {
			/* Flushing disabled by operator, declare success */
			error = 0;
		} else if (v->vdev_nowritecache) {
			/* This vdev not capable of flushing */
			error = SET_ERROR(ENOTSUP);
		} else {
			/*
			 * Issue the flush. If successful, the response will
			 * be handled in the completion callback, so we're done.
			 */
			error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}
		}

		/* Couldn't issue the flush, so set the error and return it */
		rw_exit(&vd->vd_lock);
		zio->io_error = error;
		zio_execute(zio);
		return;

	case ZIO_TYPE_TRIM:
		error = vdev_disk_io_trim(zio);
		rw_exit(&vd->vd_lock);
		if (error) {
			zio->io_error = error;
			zio_execute(zio);
		}
		return;

	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		error = vdev_disk_io_rw_fn(zio);
		rw_exit(&vd->vd_lock);
		if (error) {
			zio->io_error = error;
			zio_interrupt(zio);
		}
		return;

	default:
		/*
		 * Getting here means our parent vdev has made a very strange
		 * request of us, and shouldn't happen. Assert here to force a
		 * crash in dev builds, but in production return the IO
		 * unhandled. The pool will likely suspend anyway but that's
		 * nicer than crashing the kernel.
		 */
		ASSERT3S(zio->io_type, ==, -1);

		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	__builtin_unreachable();
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media. If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) {
			invalidate_bdev(BDH_BDEV(vd->vd_bdh));
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

/*
 * BIO submission method. See comment above about vdev_classic.
 * Set zfs_vdev_disk_classic=0 for new, =1 for classic
 */
static uint_t zfs_vdev_disk_classic = 0;	/* default new */

/* Set submission function from module parameter */
static int
vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
{
	int err = param_set_uint(buf, kp);
	if (err < 0)
		return (SET_ERROR(err));

	vdev_disk_io_rw_fn =
	    zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;

	printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
	    zfs_vdev_disk_classic ? "classic" : "new");

	return (0);
}

/*
 * At first vdev use, set the submission function from the default value if
 * it hasn't been set already.
 */
static int
vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	(void) spa;
	(void) nv;
	(void) tsd;

	if (vdev_disk_io_rw_fn == NULL)
		vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
		    vdev_classic_physio : vdev_disk_io_rw;

	return (0);
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = vdev_disk_init,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};

/*
 * The zfs_vdev_scheduler module option has been deprecated. Setting this
 * value no longer has any effect. It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file. The following warning will be logged.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

static const char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
	"Timeout before determining that a device is missing");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");

ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
	"Maximum number of data segments to add to an IO request (min 4)");

ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
	vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
	"Use classic BIO submission method");