/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>
#ifdef HAVE_LINUX_BLK_CGROUP_HEADER
#include <linux/blk-cgroup.h>
#endif

/*
 * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
 * block_device. Since it carries the block_device inside, it's convenient to
 * just use the handle as a proxy.
 *
 * Linux 6.9.x uses a file for the same purpose.
 *
 * For pre-6.8, we just emulate this with a cast, since we don't need any of
 * the other fields inside the handle.
 */
#if defined(HAVE_BDEV_OPEN_BY_PATH)
typedef struct bdev_handle zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((bdh)->bdev)
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
typedef struct file zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		(file_bdev(bdh))
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#else
typedef void zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((struct block_device *)bdh)
#define	BDH_IS_ERR(bdh)		(IS_ERR(BDH_BDEV(bdh)))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(BDH_BDEV(bdh)))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#endif

typedef struct vdev_disk {
	zfs_bdev_handle_t *vd_bdh;
	krwlock_t vd_lock;
} vdev_disk_t;

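/*
 * Illustrative sketch only (not additional build logic): regardless of which
 * kernel API was detected above, callers treat the handle uniformly through
 * the BDH_* macros, e.g.:
 *
 *	zfs_bdev_handle_t *bdh =
 *	    vdev_blkdev_get_by_path(path, smode, zfs_vdev_holder);
 *	if (BDH_IS_ERR(bdh))
 *		return (SET_ERROR(-BDH_PTR_ERR(bdh)));
 *	struct block_device *bdev = BDH_BDEV(bdh);
 *	...
 *	vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
 *
 * This mirrors the usage in vdev_disk_open() and vdev_disk_close() below.
 */
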
/*
 * Maximum number of segments to add to a bio (min 4). If this is higher than
 * the maximum allowed by the device queue or the kernel itself, it will be
 * clamped. Setting it to zero will cause the kernel's ideal size to be used.
 */
uint_t zfs_vdev_disk_max_segs = 0;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static uint_t zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * BIO request failfast mask.
 */

static unsigned int zfs_vdev_failfast_mask = 1;

/*
 * Convert SPA mode flags into bdev open mode flags.
 */
#ifdef HAVE_BLK_MODE_T
typedef blk_mode_t vdev_bdev_mode_t;
#define	VDEV_BDEV_MODE_READ	BLK_OPEN_READ
#define	VDEV_BDEV_MODE_WRITE	BLK_OPEN_WRITE
#define	VDEV_BDEV_MODE_EXCL	BLK_OPEN_EXCL
#define	VDEV_BDEV_MODE_MASK	(BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL)
#else
typedef fmode_t vdev_bdev_mode_t;
#define	VDEV_BDEV_MODE_READ	FMODE_READ
#define	VDEV_BDEV_MODE_WRITE	FMODE_WRITE
#define	VDEV_BDEV_MODE_EXCL	FMODE_EXCL
#define	VDEV_BDEV_MODE_MASK	(FMODE_READ|FMODE_WRITE|FMODE_EXCL)
#endif

static vdev_bdev_mode_t
vdev_bdev_mode(spa_mode_t smode)
{
	ASSERT3U(smode, !=, SPA_MODE_UNINIT);
	ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE));

	vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL;

	if (smode & SPA_MODE_READ)
		bmode |= VDEV_BDEV_MODE_READ;

	if (smode & SPA_MODE_WRITE)
		bmode |= VDEV_BDEV_MODE_WRITE;

	ASSERT(bmode & VDEV_BDEV_MODE_MASK);
	ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK);

	return (bmode);
}

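/*
 * For example (assuming HAVE_BLK_MODE_T is defined), a pool opened
 * read-write maps as:
 *
 *	vdev_bdev_mode(SPA_MODE_READ | SPA_MODE_WRITE)
 *	    == BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_EXCL
 *
 * while a read-only import drops BLK_OPEN_WRITE. The exclusive flag is
 * always set so the kernel enforces single-holder access to the device.
 */
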
/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
	return (bdev->bd_contains);
}
#endif

#if defined(HAVE_BDEVNAME)
#define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
#else
static inline void
vdev_bdevname(struct block_device *bdev, char *name)
{
	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
}
#endif

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity. Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate. Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger, or
 * at the very least equal, to its usable capacity to prevent overestimating
 * the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev != bdev_whole(bdev)) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions. Over-reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition
		 * layout is keeping less than PARTITION_END_ALIGNMENT bytes
		 * after the "reserved" EFI partition: in such cases return
		 * the device usable capacity.
		 */
		available = i_size_read(bdev_whole(bdev)->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

static void
vdev_disk_kobj_evt_post(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;
	if (vd && vd->vd_bdh) {
		spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh));
	} else {
		vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
		    v->vdev_path);
	}
}

static zfs_bdev_handle_t *
vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder)
{
	vdev_bdev_mode_t bmode = vdev_bdev_mode(smode);

#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
	return (bdev_file_open_by_path(path, bmode, holder, NULL));
#elif defined(HAVE_BDEV_OPEN_BY_PATH)
	return (bdev_open_by_path(path, bmode, holder, NULL));
#elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
	return (blkdev_get_by_path(path, bmode, holder, NULL));
#else
	return (blkdev_get_by_path(path, bmode, holder));
#endif
}

static void
vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
{
#if defined(HAVE_BDEV_RELEASE)
	return (bdev_release(bdh));
#elif defined(HAVE_BLKDEV_PUT_HOLDER)
	return (blkdev_put(BDH_BDEV(bdh), holder));
#elif defined(HAVE_BLKDEV_PUT)
	return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode)));
#else
	fput(bdh);
#endif
}

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	zfs_bdev_handle_t *bdh;
	spa_mode_t smode = spa_mode(v->vdev_spa);
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open. When expanding a
	 * partition force re-scanning the partition table if userland
	 * did not take care of this already. We need to do this while closed
	 * in order to get an accurate updated block device size. Then,
	 * since udev may need to recreate the device links, increase the
	 * open retry timeout before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdh = vd->vd_bdh;
		vd->vd_bdh = NULL;

		if (bdh) {
			struct block_device *bdev = BDH_BDEV(bdh);
			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
				vdev_bdevname(bdev_whole(bdev), disk_name + 5);
				/*
				 * If userland has BLKPG_RESIZE_PARTITION,
				 * then it should have updated the partition
				 * table already. We can detect this by
				 * comparing our current physical size
				 * with that of the device. If they are
				 * the same, then we must not have
				 * BLKPG_RESIZE_PARTITION or it failed to
				 * update the partition table online. We
				 * fall back to rescanning the partition
				 * table from the kernel below. However,
				 * if the capacity already reflects the
				 * updated partition, then we skip
				 * rescanning the partition table here.
				 */
				if (v->vdev_psize == bdev_capacity(bdev))
					reread_part = B_TRUE;
			}

			vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
		}

		if (reread_part) {
			bdh = vdev_blkdev_get_by_path(disk_name, smode,
			    zfs_vdev_holder);
			if (!BDH_IS_ERR(bdh)) {
				int error =
				    vdev_bdev_reread_part(BDH_BDEV(bdh));
				vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
				if (error == 0) {
					timeout = MSEC2NSEC(
					    zfs_vdev_open_timeout_ms * 2);
				}
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time. This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue. If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved. This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit. It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events. This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point. Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up. In
	 * practice delays have been observed to be on the order of 100ms.
	 *
	 * When ERESTARTSYS is returned it indicates the block device is
	 * a zvol which could not be opened due to the deadlock detection
	 * logic in zvol_open(). Extend the timeout and retry the open;
	 * subsequent attempts are expected to eventually succeed.
	 */
	hrtime_t start = gethrtime();
	bdh = BDH_ERR_PTR(-ENXIO);
	while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) {
		bdh = vdev_blkdev_get_by_path(v->vdev_path, smode,
		    zfs_vdev_holder);
		if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) {
			/*
			 * There is no point in waiting since the device
			 * was removed explicitly.
			 */
			if (v->vdev_removed)
				break;

			schedule_timeout(MSEC_TO_TICK(10));
		} else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) {
			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
			continue;
		} else if (BDH_IS_ERR(bdh)) {
			break;
		}
	}

	if (BDH_IS_ERR(bdh)) {
		int error = -BDH_PTR_ERR(bdh);
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
		    (u_longlong_t)(gethrtime() - start),
		    (u_longlong_t)timeout);
		vd->vd_bdh = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdh = bdh;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);

	/* Determine the physical block size */
	int physical_block_size = bdev_physical_block_size(bdev);

	/* Determine the logical block size */
	int logical_block_size = bdev_logical_block_size(bdev);

	/*
	 * If the device has a write cache, clear the nowritecache flag,
	 * so that we start issuing flush requests again.
	 */
	v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev);

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = bdev_discard_supported(bdev);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = bdev_secure_discard_supported(bdev);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*physical_ashift = highbit64(MAX(physical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	*logical_ashift = highbit64(MAX(logical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	return (0);
}

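/*
 * Worked example for the ashift values reported above (illustrative only):
 * they are just log2 of the block sizes, floored at SPA_MINBLOCKSIZE
 * (512 bytes). A device with a 512-byte logical and 4096-byte physical
 * sector size yields:
 *
 *	*logical_ashift  = highbit64(MAX(512, 512)) - 1  = 9
 *	*physical_ashift = highbit64(MAX(4096, 512)) - 1 = 12
 */
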
static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdh != NULL)
		vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
		    zfs_vdev_holder);

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio);
#else
	(void) submit_bio(bio_data_dir(bio), bio);
#endif
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
    defined(CONFIG_PREEMPTION) && \
    defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif

/*
 * As of the Linux 5.18 kernel, bio_alloc() expects a block_device struct
 * as an argument, removing the need to set it with bio_set_dev(). This
 * removes the need for all of the following compatibility code.
 */
#if !defined(HAVE_BIO_ALLOC_4ARG)

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only. Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
 * part, moving blkg_tryget into the private one. Define our own version.
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
#else
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro. Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}

#define	bio_associate_blkg vdev_bio_associate_blkg
#else
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_set_dev	vdev_bio_set_dev
#endif
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */
#endif /* !HAVE_BIO_ALLOC_4ARG */

static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
}

static inline struct bio *
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
    unsigned short nr_vecs)
{
	struct bio *bio;

#ifdef HAVE_BIO_ALLOC_4ARG
	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
#else
	bio = bio_alloc(gfp_mask, nr_vecs);
	if (likely(bio != NULL))
		bio_set_dev(bio, bdev);
#endif

	return (bio);
}

static inline uint_t
vdev_bio_max_segs(struct block_device *bdev)
{
	/*
	 * Smallest of the device max segs and the tuneable max segs. Minimum
	 * 4, so there's room to finish split pages if they come up.
	 */
	const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
	const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
	    MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
	const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(max_segs));
#else
	return (MIN(max_segs, BIO_MAX_PAGES));
#endif
}

static inline uint_t
vdev_bio_max_bytes(struct block_device *bdev)
{
	return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
}


/*
 * Virtual block IO object (VBIO)
 *
 * Linux block IO (BIO) objects have a limit on how many data segments (pages)
 * they can hold. Depending on how they're allocated and structured, a large
 * ZIO can require more than one BIO to be submitted to the kernel, which then
 * all have to complete before we can return the completed ZIO back to ZFS.
 *
 * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
 * translate a ZIO down into the kernel block layer and back again.
 *
 * Note that these are only used for data ZIOs (read/write). Meta-operations
 * (flush/trim) don't need multiple BIOs and so can just make the call
 * directly.
 */
typedef struct {
	zio_t		*vbio_zio;	/* parent zio */

	struct block_device *vbio_bdev;	/* blockdev to submit bios to */

	abd_t		*vbio_abd;	/* abd carrying borrowed linear buf */

	uint_t		vbio_max_segs;	/* max segs per bio */

	uint_t		vbio_max_bytes;	/* max bytes per bio */
	uint_t		vbio_lbs_mask;	/* logical block size mask */

	uint64_t	vbio_offset;	/* start offset of next bio */

	struct bio	*vbio_bio;	/* pointer to the current bio */
	int		vbio_flags;	/* bio flags */
} vbio_t;

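/*
 * Lifecycle sketch (for orientation only; the authoritative flow is the code
 * below): vdev_disk_io_rw() calls vbio_alloc() to capture the size limits of
 * the target bdev, abd_iterate_page_func() then feeds pages in through
 * vbio_fill_cb()/vbio_add_page(), and vbio_submit() hands the BIO chain to
 * the kernel. When the last chained BIO completes, vbio_completion() records
 * any error, frees the vbio, and signals the parent ZIO.
 */
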
static vbio_t *
vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
{
	vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);

	vbio->vbio_zio = zio;
	vbio->vbio_bdev = bdev;
	vbio->vbio_abd = NULL;
	vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
	vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
	vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
	vbio->vbio_offset = zio->io_offset;
	vbio->vbio_bio = NULL;
	vbio->vbio_flags = flags;

	return (vbio);
}

BIO_END_IO_PROTO(vbio_completion, bio, error);

static int
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
{
	struct bio *bio = vbio->vbio_bio;
	uint_t ssize;

	while (size > 0) {
		if (bio == NULL) {
			/* New BIO, allocate and set up */
			bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
			    vbio->vbio_max_segs);
			VERIFY(bio);

			BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
			bio_set_op_attrs(bio,
			    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
			    WRITE : READ, vbio->vbio_flags);

			if (vbio->vbio_bio) {
				bio_chain(vbio->vbio_bio, bio);
				vdev_submit_bio(vbio->vbio_bio);
			}
			vbio->vbio_bio = bio;
		}

		/*
		 * Only load as much of the current page data as will fit in
		 * the space left in the BIO, respecting lbs alignment. Older
		 * kernels will error if we try to overfill the BIO, while
		 * newer ones will accept it and split the BIO. This ensures
		 * everything works on older kernels, and avoids an additional
		 * overhead on the new.
		 */
		ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
		    vbio->vbio_lbs_mask);
		if (ssize > 0 &&
		    bio_add_page(bio, page, ssize, offset) == ssize) {
			/* Accepted, adjust and load any remaining. */
			size -= ssize;
			offset += ssize;
			continue;
		}

		/* No room, set up for a new BIO and loop */
		vbio->vbio_offset += BIO_BI_SIZE(bio);

		/* Signal new BIO allocation wanted */
		bio = NULL;
	}

	return (0);
}

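/*
 * Worked example of the size calculation above (illustrative numbers): with
 * a 4096-byte logical block size, vbio_lbs_mask is ~0xFFF. If the current
 * BIO has 5000 bytes of budget left, (5000 & ~0xFFF) == 4096, so at most
 * 4096 bytes of the page are added now and the remainder starts the next
 * (chained) BIO on a block-aligned boundary.
 */
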
/* Iterator callback to submit ABD pages to the vbio. */
static int
vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
{
	vbio_t *vbio = priv;
	return (vbio_add_page(vbio, page, len, off));
}

/* Create some BIOs, fill them with data and submit them */
static void
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
{
	/*
	 * We plug so we can submit the BIOs as we go and only unplug them when
	 * they are fully created and submitted. This is important; if we don't
	 * plug, then the kernel may start executing earlier BIOs while we're
	 * still creating and executing later ones, and if the device goes
	 * away while that's happening, older kernels can get confused and
	 * trample memory.
	 */
	struct blk_plug plug;
	blk_start_plug(&plug);

	(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
	ASSERT(vbio->vbio_bio);

	vbio->vbio_bio->bi_end_io = vbio_completion;
	vbio->vbio_bio->bi_private = vbio;

	/*
	 * Once submitted, vbio_bio now owns vbio (through bi_private) and we
	 * can't touch it again. The bio may complete and vbio_completion() be
	 * called and free the vbio before this task is run again, so we must
	 * consider it invalid from this point.
	 */
	vdev_submit_bio(vbio->vbio_bio);

	blk_finish_plug(&plug);
}

/* IO completion callback */
BIO_END_IO_PROTO(vbio_completion, bio, error)
{
	vbio_t *vbio = bio->bi_private;
	zio_t *zio = vbio->vbio_zio;

	ASSERT(zio);

	/* Capture and log any errors */
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = 0;
	if (error)
		zio->io_error = -(error);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		zio->io_error = EIO;
#endif
	ASSERT3U(zio->io_error, >=, 0);

	if (zio->io_error)
		vdev_disk_error(zio);

	/* Return the BIO to the kernel */
	bio_put(bio);

	/*
	 * If we copied the ABD before issuing it, clean up and return the copy
	 * to the ABD, with changes if appropriate.
	 */
	if (vbio->vbio_abd != NULL) {
		void *buf = abd_to_buf(vbio->vbio_abd);
		abd_free(vbio->vbio_abd);
		vbio->vbio_abd = NULL;

		if (zio->io_type == ZIO_TYPE_READ)
			abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
		else
			abd_return_buf(zio->io_abd, buf, zio->io_size);
	}

	/* Final cleanup */
	kmem_free(vbio, sizeof (vbio_t));

	/* All done, submit for processing */
	zio_delay_interrupt(zio);
}

/*
 * Iterator callback to count ABD pages and check their size & alignment.
 *
 * On Linux, each BIO segment can take a page pointer, and an offset+length of
 * the data within that page. A page can be arbitrarily large ("compound"
 * pages) but we still have to ensure the data portion is correctly sized and
 * aligned to the logical block size, to ensure that if the kernel wants to
 * split the BIO, the two halves will still be properly aligned.
 *
 * NOTE: if you change this function, change the copy in
 * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
 * data there to validate the change you're making.
 */
typedef struct {
	uint_t	bmask;
	uint_t	npages;
	uint_t	end;
} vdev_disk_check_pages_t;

static int
vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
{
	(void) page;
	vdev_disk_check_pages_t *s = priv;

	/*
	 * If we didn't finish on a block size boundary last time, then there
	 * would be a gap if we tried to use this ABD as-is, so abort.
	 */
	if (s->end != 0)
		return (1);

	/*
	 * Note if we're taking less than a full block, so we can check it
	 * above on the next call.
	 */
	s->end = (off+len) & s->bmask;

	/* All blocks after the first must start on a block size boundary. */
	if (s->npages != 0 && (off & s->bmask) != 0)
		return (1);

	s->npages++;
	return (0);
}

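/*
 * For instance (hypothetical layout), on a device with a 4096-byte logical
 * block size, an aggregated ABD whose first segment is a 512-byte gang
 * header leaves s->end == 512 after the first callback, so the next call
 * returns 1 and vdev_disk_check_pages() reports the ABD as unsuitable;
 * vdev_disk_io_rw() then falls back to the linear-copy path.
 */
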
/*
 * Check if we can submit the pages in this ABD to the kernel as-is. Returns
 * B_FALSE if they can't be submitted like this, in which case the caller
 * must first copy the data into a suitably aligned buffer.
 */
static boolean_t
vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
{
	vdev_disk_check_pages_t s = {
		.bmask = bdev_logical_block_size(bdev)-1,
		.npages = 0,
		.end = 0,
	};

	if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
		return (B_FALSE);

	return (B_TRUE);
}

static int
vdev_disk_io_rw(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
	int flags = 0;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)zio->io_offset,
		    (u_longlong_t)zio->io_size,
		    (u_longlong_t)i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    v->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	/*
	 * Check alignment of the incoming ABD. If any part of it would require
	 * submitting a page that is not aligned to the logical block size,
	 * then we take a copy into a linear buffer and submit that instead.
	 * This should be impossible on a 512b LBS, and fairly rare on 4K,
	 * usually requiring abnormally-small data blocks (eg gang blocks)
	 * mixed into the same ABD as larger ones (eg aggregated).
	 */
	abd_t *abd = zio->io_abd;
	if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
		void *buf;
		if (zio->io_type == ZIO_TYPE_READ)
			buf = abd_borrow_buf(zio->io_abd, zio->io_size);
		else
			buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);

		/*
		 * Wrap the copy in an abd_t, so we can use the same iterators
		 * to count and fill the vbio later.
		 */
		abd = abd_get_from_buf(buf, zio->io_size);

		/*
		 * False here would mean the borrowed copy has an invalid
		 * alignment too, which would mean we've somehow been passed a
		 * linear ABD with an interior page that has a non-zero offset
		 * or a size not a multiple of PAGE_SIZE. This is not possible.
		 * It would mean either zio_buf_alloc() or its underlying
		 * allocators have done something extremely strange, or our
		 * math in vdev_disk_check_pages() is wrong. In either case,
		 * something is seriously wrong and it's not safe to continue.
		 */
		VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
	}

	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
	vbio_t *vbio = vbio_alloc(zio, bdev, flags);
	if (abd != zio->io_abd)
		vbio->vbio_abd = abd;

	/* Fill it with data pages and submit it to the kernel */
	vbio_submit(vbio, abd, zio->io_size);
	return (0);
}

/* ========== */

/*
 * This is the classic, battle-tested BIO submission code. Until we're totally
 * sure that the new code is safe and correct in all cases, this will remain
 * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
 * load time.
 *
 * These functions have been renamed to vdev_classic_* to make it clear what
 * they belong to, but their implementations are unchanged.
 */

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[];	/* Attached bio's */
} dio_request_t;

static dio_request_t *
vdev_classic_dio_alloc(int bio_count)
{
	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	atomic_set(&dr->dr_ref, 0);
	dr->dr_bio_count = bio_count;
	dr->dr_error = 0;

	for (int i = 0; i < dr->dr_bio_count; i++)
		dr->dr_bio[i] = NULL;

	return (dr);
}

static void
vdev_classic_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_classic_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static void
vdev_classic_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_classic_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}
}

BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by vdev_classic_physio */
	vdev_classic_dio_put(dr);
}

static inline unsigned int
vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
	    bio_size, abd_offset);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(nr_segs));
#else
	return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}

static int
vdev_classic_physio(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
	size_t io_size = zio->io_size;
	uint64_t io_offset = zio->io_offset;
	int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
	int flags = 0;

	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size;
	int bio_count = 16;
	int error = 0;
	struct blk_plug plug;
	unsigned short nr_vecs;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)io_offset,
		    (u_longlong_t)io_size,
		    (u_longlong_t)i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_classic_dio_alloc(bio_count);

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    zio->io_vd->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	dr->dr_zio = zio;

	/*
	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
	 * can cover at least 128KB and at most 1MB. When the required number
	 * of iovec's exceeds this, we are forced to break the IO in multiple
	 * bio's and wait for them all to complete. This is likely if the
	 * recordsize property is increased beyond 1MB. The default
	 * bio_count=16 should typically accommodate the maximum-size zio of
	 * 16MB.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (int i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * If additional bio's are required, we have to retry, but
		 * this should be rare - see the comment above.
		 */
		if (dr->dr_bio_count == i) {
			vdev_classic_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_classic_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_classic_physio_completion */
		vdev_classic_dio_get(dr);

		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_classic_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

	/* Submit all bio's associated with this dio */
	for (int i = 0; i < dr->dr_bio_count; i++) {
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);
	}

	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

	vdev_classic_dio_put(dr);

	return (error);
}

/* ========== */

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif
	bio_put(bio);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

/*
 * Wrappers for the different secure erase and discard APIs. We use async
 * when available; in this case, *biop is set to the last bio in the chain.
 */
static int
vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector,
    sector_t nsect, struct bio **biop)
{
	*biop = NULL;
	int error;

#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
	error = blkdev_issue_secure_erase(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
	error = __blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
	error = blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE);
#else
#error "unsupported kernel"
#endif

	return (error);
}

static int
vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector,
    sector_t nsect, struct bio **biop)
{
	*biop = NULL;
	int error;

#if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
	error = __blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, 0, biop);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS)
	error = __blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, biop);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
	error = blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, 0);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS)
	error = blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS);
#else
#error "unsupported kernel"
#endif

	return (error);
}

/*
 * Entry point for TRIM ops. This calls the right wrapper for secure erase or
 * discard, and then does the appropriate finishing work for error vs success
 * and async vs sync.
 */
static int
vdev_disk_io_trim(zio_t *zio)
{
	int error;
	struct bio *bio;

	zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh;
	sector_t sector = zio->io_offset >> 9;
	sector_t nsects = zio->io_size >> 9;

	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
		error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio);
	else
		error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio);

	if (error != 0)
		return (SET_ERROR(-error));

	if (bio == NULL) {
		/*
		 * This was a synchronous op that completed successfully, so
		 * return it to ZFS immediately.
		 */
		zio_interrupt(zio);
	} else {
		/*
		 * This was an asynchronous op; set up completion callback and
		 * issue it.
		 */
		bio->bi_private = zio;
		bio->bi_end_io = vdev_disk_discard_end_io;
		vdev_submit_bio(bio);
	}

	return (0);
}

int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state. Nothing to be done here but return failure.
	 */
	if (vd->vd_bdh == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_FLUSH:

		if (!vdev_readable(v)) {
			/* Drive not there, can't flush */
			error = SET_ERROR(ENXIO);
		} else if (zfs_nocacheflush) {
			/* Flushing disabled by operator, declare success */
			error = 0;
		} else if (v->vdev_nowritecache) {
			/* This vdev not capable of flushing */
			error = SET_ERROR(ENOTSUP);
		} else {
			/*
			 * Issue the flush. If successful, the response will
			 * be handled in the completion callback, so we're done.
			 */
			error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}
		}

		/* Couldn't issue the flush, so set the error and return it */
		rw_exit(&vd->vd_lock);
		zio->io_error = error;
		zio_execute(zio);
		return;

	case ZIO_TYPE_TRIM:
		error = vdev_disk_io_trim(zio);
		rw_exit(&vd->vd_lock);
		if (error) {
			zio->io_error = error;
			zio_execute(zio);
		}
		return;

	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		error = vdev_disk_io_rw_fn(zio);
		rw_exit(&vd->vd_lock);
		if (error) {
			zio->io_error = error;
			zio_interrupt(zio);
		}
		return;

	default:
		/*
		 * Getting here means our parent vdev has made a very strange
		 * request of us, and shouldn't happen. Assert here to force a
		 * crash in dev builds, but in production return the IO
		 * unhandled. The pool will likely suspend anyway but that's
		 * nicer than crashing the kernel.
		 */
		ASSERT3S(zio->io_type, ==, -1);

		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	__builtin_unreachable();
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media. If it is
	 * determined the media has changed, this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) {
			invalidate_bdev(BDH_BDEV(vd->vd_bdh));
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

/*
 * BIO submission method. See comment above about vdev_classic.
 * Set zfs_vdev_disk_classic=0 for new, =1 for classic
 */
static uint_t zfs_vdev_disk_classic = 0;	/* default new */

/* Set submission function from module parameter */
static int
vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
{
	int err = param_set_uint(buf, kp);
	if (err < 0)
		return (SET_ERROR(err));

	vdev_disk_io_rw_fn =
	    zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;

	printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
	    zfs_vdev_disk_classic ? "classic" : "new");

	return (0);
}

/*
 * At first vdev use, set the submission function from the default value if
 * it hasn't been set already.
 */
static int
vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	(void) spa;
	(void) nv;
	(void) tsd;

	if (vdev_disk_io_rw_fn == NULL)
		vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
		    vdev_classic_physio : vdev_disk_io_rw;

	return (0);
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = vdev_disk_init,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};

/*
 * The zfs_vdev_scheduler module option has been deprecated. Setting this
 * value no longer has any effect. It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file. The following warning will be logged.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

static const char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
	"Timeout before determining that a device is missing");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");

ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
	"Maximum number of data segments to add to an IO request (min 4)");

ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
	vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
	"Use classic BIO submission method");
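
/*
 * Example usage (illustrative; exact paths depend on how the zfs module was
 * built and loaded): the tunables above are exposed as module parameters,
 * and the ZMOD_RW ones can typically be adjusted at runtime, e.g.
 *
 *	echo 8 > /sys/module/zfs/parameters/zfs_vdev_disk_max_segs
 *
 * zfs_vdev_disk_classic is ZMOD_RD and so must be set at module load time,
 * for example via /etc/modprobe.d/zfs.conf:
 *
 *	options zfs zfs_vdev_disk_classic=1
 */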