/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>
#include <linux/blk-cgroup.h>

/*
 * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
 * block_device. Since it carries the block_device inside, it's convenient to
 * just use the handle as a proxy.
 *
 * Linux 6.9.x uses a file for the same purpose.
 *
 * For pre-6.8, we just emulate this with a cast, since we don't need any of
 * the other fields inside the handle.
 */
#if defined(HAVE_BDEV_OPEN_BY_PATH)
typedef struct bdev_handle zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((bdh)->bdev)
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
typedef struct file zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		(file_bdev(bdh))
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#else
typedef void zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((struct block_device *)bdh)
#define	BDH_IS_ERR(bdh)		(IS_ERR(BDH_BDEV(bdh)))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(BDH_BDEV(bdh)))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#endif
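
/*
 * Per-vdev disk state. vd_lock serializes access to vd_bdh: vdev_disk_open()
 * takes it as a writer while the handle is being (re)opened, and
 * vdev_disk_io_start() holds it as a reader for the duration of I/O
 * submission.
 */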
typedef struct vdev_disk {
	zfs_bdev_handle_t *vd_bdh;
	krwlock_t vd_lock;
} vdev_disk_t;

/*
 * Maximum number of segments to add to a bio (min 4). If this is higher than
 * the maximum allowed by the device queue or the kernel itself, it will be
 * clamped. Setting it to zero will cause the kernel's ideal size to be used.
 */
uint_t zfs_vdev_disk_max_segs = 0;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static uint_t zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * BIO request failfast mask.
 */

static unsigned int zfs_vdev_failfast_mask = 1;

/*
 * Convert SPA mode flags into bdev open mode flags.
 */
#ifdef HAVE_BLK_MODE_T
typedef blk_mode_t vdev_bdev_mode_t;
#define	VDEV_BDEV_MODE_READ	BLK_OPEN_READ
#define	VDEV_BDEV_MODE_WRITE	BLK_OPEN_WRITE
#define	VDEV_BDEV_MODE_EXCL	BLK_OPEN_EXCL
#define	VDEV_BDEV_MODE_MASK	(BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL)
#else
typedef fmode_t vdev_bdev_mode_t;
#define	VDEV_BDEV_MODE_READ	FMODE_READ
#define	VDEV_BDEV_MODE_WRITE	FMODE_WRITE
#define	VDEV_BDEV_MODE_EXCL	FMODE_EXCL
#define	VDEV_BDEV_MODE_MASK	(FMODE_READ|FMODE_WRITE|FMODE_EXCL)
#endif

static vdev_bdev_mode_t
vdev_bdev_mode(spa_mode_t smode)
{
	ASSERT3U(smode, !=, SPA_MODE_UNINIT);
	ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE));

	vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL;

	if (smode & SPA_MODE_READ)
		bmode |= VDEV_BDEV_MODE_READ;

	if (smode & SPA_MODE_WRITE)
		bmode |= VDEV_BDEV_MODE_WRITE;

	ASSERT(bmode & VDEV_BDEV_MODE_MASK);
	ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK);

	return (bmode);
}

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
#ifdef HAVE_BDEV_NR_BYTES
	return (bdev_nr_bytes(bdev));
#else
	return (i_size_read(bdev->bd_inode));
#endif
}

#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
	return (bdev->bd_contains);
}
#endif

#if defined(HAVE_BDEVNAME)
#define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
#else
static inline void
vdev_bdevname(struct block_device *bdev, char *name)
{
	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
}
#endif

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity. Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate. Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, the usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev != bdev_whole(bdev)) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions. Over-reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition layout
		 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
		 * "reserved" EFI partition: in such cases return the device
		 * usable capacity.
		 */
		available = bdev_capacity(bdev_whole(bdev)) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

static void
vdev_disk_kobj_evt_post(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;
	if (vd && vd->vd_bdh) {
		spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh));
	} else {
		vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
		    v->vdev_path);
	}
}
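
/*
 * Thin wrappers over the kernel interfaces for opening and closing a block
 * device by path. The #ifdef chains below select between the file-based
 * (6.9+), bdev_handle-based (6.8) and older blkdev_get/blkdev_put style APIs
 * described at the top of this file.
 */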
static zfs_bdev_handle_t *
vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder)
{
	vdev_bdev_mode_t bmode = vdev_bdev_mode(smode);

#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
	return (bdev_file_open_by_path(path, bmode, holder, NULL));
#elif defined(HAVE_BDEV_OPEN_BY_PATH)
	return (bdev_open_by_path(path, bmode, holder, NULL));
#elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
	return (blkdev_get_by_path(path, bmode, holder, NULL));
#else
	return (blkdev_get_by_path(path, bmode, holder));
#endif
}

static void
vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
{
#if defined(HAVE_BDEV_RELEASE)
	return (bdev_release(bdh));
#elif defined(HAVE_BLKDEV_PUT_HOLDER)
	return (blkdev_put(BDH_BDEV(bdh), holder));
#elif defined(HAVE_BLKDEV_PUT)
	return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode)));
#else
	fput(bdh);
#endif
}

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	zfs_bdev_handle_t *bdh;
	spa_mode_t smode = spa_mode(v->vdev_spa);
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open. When expanding a
	 * partition, force re-scanning of the partition table if userland
	 * did not already take care of this. We need to do this while closed
	 * in order to get an accurate updated block device size. Then,
	 * since udev may need to recreate the device links, increase the
	 * open retry timeout before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdh = vd->vd_bdh;
		vd->vd_bdh = NULL;

		if (bdh) {
			struct block_device *bdev = BDH_BDEV(bdh);
			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
				vdev_bdevname(bdev_whole(bdev), disk_name + 5);
				/*
				 * If userland has BLKPG_RESIZE_PARTITION,
				 * then it should have updated the partition
				 * table already. We can detect this by
				 * comparing our current physical size
				 * with that of the device. If they are
				 * the same, then we must not have
				 * BLKPG_RESIZE_PARTITION or it failed to
				 * update the partition table online. We
				 * fallback to rescanning the partition
				 * table from the kernel below. However,
				 * if the capacity already reflects the
				 * updated partition, then we skip
				 * rescanning the partition table here.
				 */
				if (v->vdev_psize == bdev_capacity(bdev))
					reread_part = B_TRUE;
			}

			vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
		}

		if (reread_part) {
			bdh = vdev_blkdev_get_by_path(disk_name, smode,
			    zfs_vdev_holder);
			if (!BDH_IS_ERR(bdh)) {
				int error =
				    vdev_bdev_reread_part(BDH_BDEV(bdh));
				vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
				if (error == 0) {
					timeout = MSEC2NSEC(
					    zfs_vdev_open_timeout_ms * 2);
				}
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time. This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue. If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved. This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit. It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events. This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point. Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up. In
	 * practice delays have been observed to be on the order of 100ms.
	 *
	 * When ERESTARTSYS is returned it indicates the block device is
	 * a zvol which could not be opened due to the deadlock detection
	 * logic in zvol_open(). Extend the timeout and retry the open;
	 * subsequent attempts are expected to eventually succeed.
	 */
	hrtime_t start = gethrtime();
	bdh = BDH_ERR_PTR(-ENXIO);
	while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) {
		bdh = vdev_blkdev_get_by_path(v->vdev_path, smode,
		    zfs_vdev_holder);
		if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) {
			/*
			 * There is no point in waiting, since the device
			 * has been explicitly removed.
			 */
			if (v->vdev_removed)
				break;

			schedule_timeout_interruptible(MSEC_TO_TICK(10));
		} else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) {
			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
			continue;
		} else if (BDH_IS_ERR(bdh)) {
			break;
		}
	}

	if (BDH_IS_ERR(bdh)) {
		int error = -BDH_PTR_ERR(bdh);
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
		    (u_longlong_t)(gethrtime() - start),
		    (u_longlong_t)timeout);
		vd->vd_bdh = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdh = bdh;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);

	/* Determine the physical block size */
	int physical_block_size = bdev_physical_block_size(bdev);

	/* Determine the logical block size */
	int logical_block_size = bdev_logical_block_size(bdev);

	/*
	 * If the device has a write cache, clear the nowritecache flag,
	 * so that we start issuing flush requests again.
	 */
	v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev);

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = bdev_discard_supported(bdev);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = bdev_secure_discard_supported(bdev);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*physical_ashift = highbit64(MAX(physical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	*logical_ashift = highbit64(MAX(logical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	return (0);
}
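
/*
 * Close the device and free the vdev_disk_t. This is a no-op while the vdev
 * is being reopened, in which case vdev_disk_open() reuses the existing state.
 */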
static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdh != NULL)
		vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
		    zfs_vdev_holder);

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
    defined(CONFIG_PREEMPTION) && \
    defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif

/*
 * As of the Linux 5.18 kernel, bio_alloc() expects a block_device struct
 * as an argument, removing the need to set it with bio_set_dev(), and with
 * it the need for all of the following compatibility code.
 */
#if !defined(HAVE_BIO_ALLOC_4ARG)

#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only. Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
 * part, moving blkg_tryget into the private one. Define our own version.
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
#else
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro. Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}

#define	bio_associate_blkg vdev_bio_associate_blkg
#else
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_set_dev		vdev_bio_set_dev
#endif
#endif
#endif /* !HAVE_BIO_ALLOC_4ARG */
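
/*
 * Submit a BIO to the block layer. The current task's bio_list is saved and
 * temporarily cleared so that submit_bio() dispatches the BIO immediately,
 * rather than queueing it on the caller's per-task list (the block layer's
 * recursion-avoidance mechanism).
 */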
static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	(void) submit_bio(bio);
	current->bio_list = bio_list;
}

static inline struct bio *
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
    unsigned short nr_vecs)
{
	struct bio *bio;

#ifdef HAVE_BIO_ALLOC_4ARG
	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
#else
	bio = bio_alloc(gfp_mask, nr_vecs);
	if (likely(bio != NULL))
		bio_set_dev(bio, bdev);
#endif

	return (bio);
}

static inline uint_t
vdev_bio_max_segs(struct block_device *bdev)
{
	/*
	 * Smallest of the device max segs and the tuneable max segs. Minimum
	 * 4, so there's room to finish split pages if they come up.
	 */
	const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
	const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
	    MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
	const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(max_segs));
#else
	return (MIN(max_segs, BIO_MAX_PAGES));
#endif
}
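
/*
 * Maximum data bytes per BIO: the queue's max_sectors limit, converted from
 * 512-byte sectors to bytes.
 */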
static inline uint_t
vdev_bio_max_bytes(struct block_device *bdev)
{
	return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
}


/*
 * Virtual block IO object (VBIO)
 *
 * Linux block IO (BIO) objects have a limit on how many data segments (pages)
 * they can hold. Depending on how they're allocated and structured, a large
 * ZIO can require more than one BIO to be submitted to the kernel, which then
 * all have to complete before we can return the completed ZIO back to ZFS.
 *
 * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
 * translate a ZIO down into the kernel block layer and back again.
 *
 * Note that these are only used for data ZIOs (read/write). Meta-operations
 * (flush/trim) don't need multiple BIOs and so can just make the call
 * directly.
 */
typedef struct {
	zio_t		*vbio_zio;	/* parent zio */

	struct block_device *vbio_bdev;	/* blockdev to submit bios to */

	abd_t		*vbio_abd;	/* abd carrying borrowed linear buf */

	uint_t		vbio_max_segs;	/* max segs per bio */

	uint_t		vbio_max_bytes;	/* max bytes per bio */
	uint_t		vbio_lbs_mask;	/* logical block size mask */

	uint64_t	vbio_offset;	/* start offset of next bio */

	struct bio	*vbio_bio;	/* pointer to the current bio */
	int		vbio_flags;	/* bio flags */
} vbio_t;

static vbio_t *
vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
{
	vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);

	vbio->vbio_zio = zio;
	vbio->vbio_bdev = bdev;
	vbio->vbio_abd = NULL;
	vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
	vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
	vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
	vbio->vbio_offset = zio->io_offset;
	vbio->vbio_bio = NULL;
	vbio->vbio_flags = flags;

	return (vbio);
}

static void vbio_completion(struct bio *bio);

static int
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
{
	struct bio *bio = vbio->vbio_bio;
	uint_t ssize;

	while (size > 0) {
		if (bio == NULL) {
			/* New BIO, allocate and set up */
			bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
			    vbio->vbio_max_segs);
			VERIFY(bio);

			BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
			bio_set_op_attrs(bio,
			    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
			    WRITE : READ, vbio->vbio_flags);

			if (vbio->vbio_bio) {
				bio_chain(vbio->vbio_bio, bio);
				vdev_submit_bio(vbio->vbio_bio);
			}
			vbio->vbio_bio = bio;
		}

		/*
		 * Only load as much of the current page data as will fit in
		 * the space left in the BIO, respecting lbs alignment. Older
		 * kernels will error if we try to overfill the BIO, while
		 * newer ones will accept it and split the BIO. This ensures
		 * everything works on older kernels, and avoids an additional
		 * overhead on the new.
		 */
		ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
		    vbio->vbio_lbs_mask);
		if (ssize > 0 &&
		    bio_add_page(bio, page, ssize, offset) == ssize) {
			/* Accepted, adjust and load any remaining. */
			size -= ssize;
			offset += ssize;
			continue;
		}

		/* No room, set up for a new BIO and loop */
		vbio->vbio_offset += BIO_BI_SIZE(bio);

		/* Signal new BIO allocation wanted */
		bio = NULL;
	}

	return (0);
}

/* Iterator callback to submit ABD pages to the vbio. */
static int
vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
{
	vbio_t *vbio = priv;
	return (vbio_add_page(vbio, page, len, off));
}

/* Create some BIOs, fill them with data and submit them */
static void
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
{
	/*
	 * We plug so we can submit the BIOs as we go and only unplug them when
	 * they are fully created and submitted. This is important; if we don't
	 * plug, then the kernel may start executing earlier BIOs while we're
	 * still creating and executing later ones, and if the device goes
	 * away while that's happening, older kernels can get confused and
	 * trample memory.
	 */
	struct blk_plug plug;
	blk_start_plug(&plug);

	(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
	ASSERT(vbio->vbio_bio);
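
	/*
	 * The iterator has chained and submitted every BIO except the last
	 * one, which was held back so the completion callback can be attached
	 * before it is submitted below.
	 */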
	vbio->vbio_bio->bi_end_io = vbio_completion;
	vbio->vbio_bio->bi_private = vbio;

	/*
	 * Once submitted, vbio_bio now owns vbio (through bi_private) and we
	 * can't touch it again. The bio may complete and vbio_completion() be
	 * called and free the vbio before this task is run again, so we must
	 * consider it invalid from this point.
	 */
	vdev_submit_bio(vbio->vbio_bio);

	blk_finish_plug(&plug);
}

/* IO completion callback */
static void
vbio_completion(struct bio *bio)
{
	vbio_t *vbio = bio->bi_private;
	zio_t *zio = vbio->vbio_zio;

	ASSERT(zio);

	/* Capture and log any errors */
	zio->io_error = bi_status_to_errno(bio->bi_status);
	ASSERT3U(zio->io_error, >=, 0);

	if (zio->io_error)
		vdev_disk_error(zio);

	/* Return the BIO to the kernel */
	bio_put(bio);

	/*
	 * If we copied the ABD before issuing it, clean up and return the copy
	 * to the ABD, with changes if appropriate.
	 */
	if (vbio->vbio_abd != NULL) {
		void *buf = abd_to_buf(vbio->vbio_abd);
		abd_free(vbio->vbio_abd);
		vbio->vbio_abd = NULL;

		if (zio->io_type == ZIO_TYPE_READ)
			abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
		else
			abd_return_buf(zio->io_abd, buf, zio->io_size);
	}

	/* Final cleanup */
	kmem_free(vbio, sizeof (vbio_t));

	/* All done, submit for processing */
	zio_delay_interrupt(zio);
}

/*
 * Iterator callback to count ABD pages and check their size & alignment.
 *
 * On Linux, each BIO segment can take a page pointer, and an offset+length of
 * the data within that page. A page can be arbitrarily large ("compound"
 * pages) but we still have to ensure the data portion is correctly sized and
 * aligned to the logical block size, to ensure that if the kernel wants to
 * split the BIO, the two halves will still be properly aligned.
 *
 * NOTE: if you change this function, change the copy in
 * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
 * data there to validate the change you're making.
 */
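/*
 * Iterator state: bmask is the logical block size mask, npages counts the
 * pages seen so far, and end is non-zero when the previous page did not end
 * on a logical block boundary.
 */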
typedef struct {
	uint_t	bmask;
	uint_t	npages;
	uint_t	end;
} vdev_disk_check_pages_t;

static int
vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
{
	(void) page;
	vdev_disk_check_pages_t *s = priv;

	/*
	 * If we didn't finish on a block size boundary last time, then there
	 * would be a gap if we tried to use this ABD as-is, so abort.
	 */
	if (s->end != 0)
		return (1);

	/*
	 * Note if we're taking less than a full block, so we can check it
	 * above on the next call.
	 */
	s->end = (off+len) & s->bmask;

	/* All blocks after the first must start on a block size boundary. */
	if (s->npages != 0 && (off & s->bmask) != 0)
		return (1);

	s->npages++;
	return (0);
}

/*
 * Check if we can submit the pages in this ABD to the kernel as-is. Returns
 * B_TRUE if so, or B_FALSE if the ABD can't be submitted like this.
 */
static boolean_t
vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
{
	vdev_disk_check_pages_t s = {
		.bmask = bdev_logical_block_size(bdev)-1,
		.npages = 0,
		.end = 0,
	};

	if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
		return (B_FALSE);

	return (B_TRUE);
}

static int
vdev_disk_io_rw(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
	int flags = 0;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (zio->io_offset + zio->io_size > bdev_capacity(bdev)) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)zio->io_offset,
		    (u_longlong_t)zio->io_size,
		    (u_longlong_t)bdev_capacity(bdev));
		return (SET_ERROR(EIO));
	}

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    v->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	/*
	 * Check alignment of the incoming ABD. If any part of it would require
	 * submitting a page that is not aligned to the logical block size,
	 * then we take a copy into a linear buffer and submit that instead.
	 * This should be impossible on a 512b LBS, and fairly rare on 4K,
	 * usually requiring abnormally-small data blocks (eg gang blocks)
	 * mixed into the same ABD as larger ones (eg aggregated).
	 */
	abd_t *abd = zio->io_abd;
	if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
		void *buf;
		if (zio->io_type == ZIO_TYPE_READ)
			buf = abd_borrow_buf(zio->io_abd, zio->io_size);
		else
			buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);

		/*
		 * Wrap the copy in an abd_t, so we can use the same iterators
		 * to count and fill the vbio later.
		 */
		abd = abd_get_from_buf(buf, zio->io_size);

		/*
		 * False here would mean the borrowed copy has an invalid
		 * alignment too, which would mean we've somehow been passed a
		 * linear ABD with an interior page that has a non-zero offset
		 * or a size not a multiple of PAGE_SIZE. This is not possible.
		 * It would mean either zio_buf_alloc() or its underlying
		 * allocators have done something extremely strange, or our
		 * math in vdev_disk_check_pages() is wrong. In either case,
		 * something is seriously wrong and it's not safe to continue.
		 */
		VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
	}

	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
	vbio_t *vbio = vbio_alloc(zio, bdev, flags);
	if (abd != zio->io_abd)
		vbio->vbio_abd = abd;

	/* Fill it with data pages and submit it to the kernel */
	vbio_submit(vbio, abd, zio->io_size);
	return (0);
}

/* ========== */

/*
 * This is the classic, battle-tested BIO submission code. Until we're totally
 * sure that the new code is safe and correct in all cases, this will remain
 * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
 * load time.
 *
 * These functions have been renamed to vdev_classic_* to make it clear what
 * they belong to, but their implementations are unchanged.
 */

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[];	/* Attached bio's */
} dio_request_t;

static dio_request_t *
vdev_classic_dio_alloc(int bio_count)
{
	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	atomic_set(&dr->dr_ref, 0);
	dr->dr_bio_count = bio_count;
	dr->dr_error = 0;

	for (int i = 0; i < dr->dr_bio_count; i++)
		dr->dr_bio[i] = NULL;

	return (dr);
}

static void
vdev_classic_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_classic_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static void
vdev_classic_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_classic_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}
}

static void
vdev_classic_physio_completion(struct bio *bio)
{
	dio_request_t *dr = bio->bi_private;

	if (dr->dr_error == 0) {
		dr->dr_error = bi_status_to_errno(bio->bi_status);
	}

	/* Drop reference acquired by vdev_classic_physio */
	vdev_classic_dio_put(dr);
}

static inline unsigned int
vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
	    bio_size, abd_offset);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(nr_segs));
#else
	return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}
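
/*
 * Classic I/O submission: split the ZIO across however many BIOs are needed,
 * growing the dio_request's BIO array and retrying if the initial guess is
 * too small, then submit them all and complete the ZIO when the last
 * reference is dropped.
 */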
static int
vdev_classic_physio(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
	size_t io_size = zio->io_size;
	uint64_t io_offset = zio->io_offset;
	int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
	int flags = 0;

	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size;
	int bio_count = 16;
	int error = 0;
	struct blk_plug plug;
	unsigned short nr_vecs;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev_capacity(bdev)) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)io_offset,
		    (u_longlong_t)io_size,
		    (u_longlong_t)bdev_capacity(bdev));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_classic_dio_alloc(bio_count);

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    zio->io_vd->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	dr->dr_zio = zio;

	/*
	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
	 * can cover at least 128KB and at most 1MB. When the required number
	 * of iovec's exceeds this, we are forced to break the IO into multiple
	 * bio's and wait for them all to complete. This is likely if the
	 * recordsize property is increased beyond 1MB. The default
	 * bio_count=16 should typically accommodate the maximum-size zio of
	 * 16MB.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (int i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * If additional bio's are required, we have to retry, but
		 * this should be rare - see the comment above.
		 */
		if (dr->dr_bio_count == i) {
			vdev_classic_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_classic_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_classic_physio_completion */
		vdev_classic_dio_get(dr);

		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_classic_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

	/* Submit all bio's associated with this dio */
	for (int i = 0; i < dr->dr_bio_count; i++) {
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);
	}

	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

	vdev_classic_dio_put(dr);

	return (error);
}

/* ========== */
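
/*
 * Cache flush handling. The flush is issued as an empty BIO with the flush
 * flag set; the completion callback records the result and, on EOPNOTSUPP,
 * marks the vdev as having no write cache so further flushes are skipped.
 */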
static void
vdev_disk_io_flush_completion(struct bio *bio)
{
	zio_t *zio = bio->bi_private;
	zio->io_error = bi_status_to_errno(bio->bi_status);

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

static void
vdev_disk_discard_end_io(struct bio *bio)
{
	zio_t *zio = bio->bi_private;
	zio->io_error = bi_status_to_errno(bio->bi_status);

	bio_put(bio);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

/*
 * Wrappers for the different secure erase and discard APIs. We use async
 * when available; in this case, *biop is set to the last bio in the chain.
 */
static int
vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector,
    sector_t nsect, struct bio **biop)
{
	*biop = NULL;
	int error;

#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
	error = blkdev_issue_secure_erase(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
	error = __blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
	error = blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE);
#else
#error "unsupported kernel"
#endif

	return (error);
}

static int
vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector,
    sector_t nsect, struct bio **biop)
{
	*biop = NULL;
	int error;

#if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
	error = __blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, 0, biop);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS)
	error = __blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, biop);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
	error = blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS, 0);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS)
	error = blkdev_issue_discard(BDH_BDEV(bdh),
	    sector, nsect, GFP_NOFS);
#else
#error "unsupported kernel"
#endif

	return (error);
}

/*
 * Entry point for TRIM ops. This calls the right wrapper for secure erase or
 * discard, and then does the appropriate finishing work for error vs success
 * and async vs sync.
 */
static int
vdev_disk_io_trim(zio_t *zio)
{
	int error;
	struct bio *bio;

	zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh;
	sector_t sector = zio->io_offset >> 9;
	sector_t nsects = zio->io_size >> 9;

	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
		error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio);
	else
		error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio);

	if (error != 0)
		return (SET_ERROR(-error));

	if (bio == NULL) {
		/*
		 * This was a synchronous op that completed successfully, so
		 * return it to ZFS immediately.
		 */
		zio_interrupt(zio);
	} else {
		/*
		 * This was an asynchronous op; set up completion callback and
		 * issue it.
		 */
		bio->bi_private = zio;
		bio->bi_end_io = vdev_disk_discard_end_io;
		vdev_submit_bio(bio);
	}

	return (0);
}
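
/*
 * Read/write submission function, selected between vdev_disk_io_rw() (the
 * new vbio path) and vdev_classic_physio() based on the zfs_vdev_disk_classic
 * module parameter; see vdev_disk_init() and vdev_disk_param_set_classic().
 */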
int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state. Nothing to be done here but return failure.
	 */
	if (vd->vd_bdh == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_FLUSH:

		if (!vdev_readable(v)) {
			/* Drive not there, can't flush */
			error = SET_ERROR(ENXIO);
		} else if (zfs_nocacheflush) {
			/* Flushing disabled by operator, declare success */
			error = 0;
		} else if (v->vdev_nowritecache) {
			/* This vdev not capable of flushing */
			error = SET_ERROR(ENOTSUP);
		} else {
			/*
			 * Issue the flush. If successful, the response will
			 * be handled in the completion callback, so we're done.
			 */
			error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}
		}

		/* Couldn't issue the flush, so set the error and return it */
		rw_exit(&vd->vd_lock);
		zio->io_error = error;
		zio_execute(zio);
		return;

	case ZIO_TYPE_TRIM:
		error = vdev_disk_io_trim(zio);
		rw_exit(&vd->vd_lock);
		if (error) {
			zio->io_error = error;
			zio_execute(zio);
		}
		return;

	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		error = vdev_disk_io_rw_fn(zio);
		rw_exit(&vd->vd_lock);
		if (error) {
			zio->io_error = error;
			zio_interrupt(zio);
		}
		return;

	default:
		/*
		 * Getting here means our parent vdev has made a very strange
		 * request of us, and shouldn't happen. Assert here to force a
		 * crash in dev builds, but in production return the IO
		 * unhandled. The pool will likely suspend anyway but that's
		 * nicer than crashing the kernel.
		 */
		ASSERT3S(zio->io_type, ==, -1);

		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	__builtin_unreachable();
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media. If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) {
			invalidate_bdev(BDH_BDEV(vd->vd_bdh));
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

/*
 * BIO submission method. See comment above about vdev_classic.
 * Set zfs_vdev_disk_classic=0 for new, =1 for classic
 */
static uint_t zfs_vdev_disk_classic = 0;	/* default new */

/* Set submission function from module parameter */
static int
vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
{
	int err = param_set_uint(buf, kp);
	if (err < 0)
		return (SET_ERROR(err));

	vdev_disk_io_rw_fn =
	    zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;

	printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
	    zfs_vdev_disk_classic ? "classic" : "new");

	return (0);
}

/*
 * At first vdev use, set the submission function from the default value if
 * it hasn't been set already.
 */
static int
vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	(void) spa;
	(void) nv;
	(void) tsd;

	if (vdev_disk_io_rw_fn == NULL)
		vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
		    vdev_classic_physio : vdev_disk_io_rw;

	return (0);
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = vdev_disk_init,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};

/*
 * The zfs_vdev_scheduler module option has been deprecated. Setting this
 * value no longer has any effect. It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file. The following warning will be logged.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

static const char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
	"Timeout before determining that a device is missing");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");

ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
	"Maximum number of data segments to add to an IO request (min 4)");

ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
	vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
	"Use classic BIO submission method");