/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>
#ifdef HAVE_LINUX_BLK_CGROUP_HEADER
#include <linux/blk-cgroup.h>
#endif

/*
 * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
 * block_device. Since it carries the block_device inside, it's convenient to
 * just use the handle as a proxy. For pre-6.8, we just emulate this with
 * a cast, since we don't need any of the other fields inside the handle.
 */
#ifdef HAVE_BDEV_OPEN_BY_PATH
typedef struct bdev_handle zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((bdh)->bdev)
#define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#else
typedef void zfs_bdev_handle_t;
#define	BDH_BDEV(bdh)		((struct block_device *)bdh)
#define	BDH_IS_ERR(bdh)		(IS_ERR(BDH_BDEV(bdh)))
#define	BDH_PTR_ERR(bdh)	(PTR_ERR(BDH_BDEV(bdh)))
#define	BDH_ERR_PTR(err)	(ERR_PTR(err))
#endif

typedef struct vdev_disk {
	zfs_bdev_handle_t *vd_bdh;
	krwlock_t vd_lock;
} vdev_disk_t;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static uint_t zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t *dr_zio;		/* Parent ZIO */
	atomic_t dr_ref;	/* References */
	int dr_error;		/* Bio error */
	int dr_bio_count;	/* Count of bio's */
	struct bio *dr_bio[];	/* Attached bio's */
} dio_request_t;

/*
 * BIO request failfast mask.
 */

static unsigned int zfs_vdev_failfast_mask = 1;

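/*
 * Translate the SPA open mode into the open mode flags expected by the
 * kernel block layer: BLK_OPEN_* on kernels providing blk_mode_t,
 * FMODE_* otherwise.
 */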
#ifdef HAVE_BLK_MODE_T
static blk_mode_t
#else
static fmode_t
#endif
vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive)
{
#ifdef HAVE_BLK_MODE_T
	blk_mode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= BLK_OPEN_READ;

	if (spa_mode & SPA_MODE_WRITE)
		mode |= BLK_OPEN_WRITE;

	if (exclusive)
		mode |= BLK_OPEN_EXCL;
#else
	fmode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= FMODE_READ;

	if (spa_mode & SPA_MODE_WRITE)
		mode |= FMODE_WRITE;

	if (exclusive)
		mode |= FMODE_EXCL;
#endif

	return (mode);
}

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
	return (bdev->bd_contains);
}
#endif

#if defined(HAVE_BDEVNAME)
#define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
#else
static inline void
vdev_bdevname(struct block_device *bdev, char *name)
{
	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
}
#endif

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity. Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate. Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, the usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev != bdev_whole(bdev)) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions. Over reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition
		 * layout is keeping less than PARTITION_END_ALIGNMENT bytes
		 * after the "reserved" EFI partition: in such cases return
		 * the device usable capacity.
		 */
		available = i_size_read(bdev_whole(bdev)->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

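/*
 * Log the details of a failed zio to the kernel log.
 */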
static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

static void
vdev_disk_kobj_evt_post(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;
	if (vd && vd->vd_bdh) {
		spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh));
	} else {
		vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
		    v->vdev_path);
	}
}

static zfs_bdev_handle_t *
vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder)
{
#if defined(HAVE_BDEV_OPEN_BY_PATH)
	return (bdev_open_by_path(path,
	    vdev_bdev_mode(mode, B_TRUE), holder, NULL));
#elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
	return (blkdev_get_by_path(path,
	    vdev_bdev_mode(mode, B_TRUE), holder, NULL));
#else
	return (blkdev_get_by_path(path,
	    vdev_bdev_mode(mode, B_TRUE), holder));
#endif
}

static void
vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t mode, void *holder)
{
#if defined(HAVE_BDEV_RELEASE)
	return (bdev_release(bdh));
#elif defined(HAVE_BLKDEV_PUT_HOLDER)
	return (blkdev_put(BDH_BDEV(bdh), holder));
#else
	return (blkdev_put(BDH_BDEV(bdh),
	    vdev_bdev_mode(mode, B_TRUE)));
#endif
}

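/*
 * Open (or reopen) the backing block device for a vdev and report its
 * usable size, maximum expandable size, and logical/physical ashift.
 */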
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	zfs_bdev_handle_t *bdh;
#ifdef HAVE_BLK_MODE_T
	blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
#else
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
#endif
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open. When expanding a
	 * partition force re-scanning the partition table if userland
	 * did not take care of this already. We need to do this while closed
	 * in order to get an accurate updated block device size. Then
	 * since udev may need to recreate the device links increase the
	 * open retry timeout before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdh = vd->vd_bdh;
		vd->vd_bdh = NULL;

		if (bdh) {
			struct block_device *bdev = BDH_BDEV(bdh);
			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
				vdev_bdevname(bdev_whole(bdev), disk_name + 5);
				/*
				 * If userland has BLKPG_RESIZE_PARTITION,
				 * then it should have updated the partition
				 * table already. We can detect this by
				 * comparing our current physical size
				 * with that of the device. If they are
				 * the same, then we must not have
				 * BLKPG_RESIZE_PARTITION or it failed to
				 * update the partition table online. We
				 * fallback to rescanning the partition
				 * table from the kernel below. However,
				 * if the capacity already reflects the
				 * updated partition, then we skip
				 * rescanning the partition table here.
				 */
				if (v->vdev_psize == bdev_capacity(bdev))
					reread_part = B_TRUE;
			}

			vdev_blkdev_put(bdh, mode, zfs_vdev_holder);
		}

		if (reread_part) {
			bdh = vdev_blkdev_get_by_path(disk_name, mode,
			    zfs_vdev_holder);
			if (!BDH_IS_ERR(bdh)) {
				int error =
				    vdev_bdev_reread_part(BDH_BDEV(bdh));
				vdev_blkdev_put(bdh, mode, zfs_vdev_holder);
				if (error == 0) {
					timeout = MSEC2NSEC(
					    zfs_vdev_open_timeout_ms * 2);
				}
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time. This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue. If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved. This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit. It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events. This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point. Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up. In
	 * practice delays have been observed to be on the order of 100ms.
	 *
	 * When ERESTARTSYS is returned it indicates the block device is
	 * a zvol which could not be opened due to the deadlock detection
	 * logic in zvol_open(). Extend the timeout and retry the open;
	 * subsequent attempts are expected to eventually succeed.
	 */
	hrtime_t start = gethrtime();
	bdh = BDH_ERR_PTR(-ENXIO);
	while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) {
		bdh = vdev_blkdev_get_by_path(v->vdev_path, mode,
		    zfs_vdev_holder);
		if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) {
			/*
			 * There is no point in waiting since the device
			 * was removed explicitly.
			 */
			if (v->vdev_removed)
				break;

			schedule_timeout(MSEC_TO_TICK(10));
		} else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) {
			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
			continue;
		} else if (BDH_IS_ERR(bdh)) {
			break;
		}
	}

	if (BDH_IS_ERR(bdh)) {
		int error = -BDH_PTR_ERR(bdh);
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
		    (u_longlong_t)(gethrtime() - start),
		    (u_longlong_t)timeout);
		vd->vd_bdh = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdh = bdh;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);

	/* Determine the physical block size */
	int physical_block_size = bdev_physical_block_size(bdev);

	/* Determine the logical block size */
	int logical_block_size = bdev_logical_block_size(bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = bdev_discard_supported(bdev);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = bdev_secure_discard_supported(bdev);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*physical_ashift = highbit64(MAX(physical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	*logical_ashift = highbit64(MAX(logical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	return (0);
}

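/*
 * Close the backing block device and free the vdev_disk_t, unless the
 * vdev is in the process of being reopened.
 */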
static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdh != NULL) {
		vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
		    zfs_vdev_holder);
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	atomic_set(&dr->dr_ref, 0);
	dr->dr_bio_count = bio_count;
	dr->dr_error = 0;

	for (int i = 0; i < dr->dr_bio_count; i++)
		dr->dr_bio[i] = NULL;

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static void
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}
}

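/*
 * Per-bio completion callback: record the first error observed and drop
 * the dio_request reference taken when the bio was created.
 */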
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	vdev_disk_dio_put(dr);
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio);
#else
	(void) submit_bio(bio_data_dir(bio), bio);
#endif
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
	defined(CONFIG_PREEMPTION) && \
	defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif

/*
 * As of the Linux 5.18 kernel bio_alloc() expects a block_device struct
 * as an argument removing the need to set it with bio_set_dev(). This
 * removes the need for all of the following compatibility code.
 */
#if !defined(HAVE_BIO_ALLOC_4ARG)

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only. Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
 * part, moving blkg_tryget into the private one. Define our own version.
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
#else
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro. Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}

#define	bio_associate_blkg vdev_bio_associate_blkg
#else
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_set_dev		vdev_bio_set_dev
#endif
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */
#endif /* !HAVE_BIO_ALLOC_4ARG */

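/*
 * Submit a bio to the block layer. The caller's bio_list is temporarily
 * cleared so the bio is dispatched immediately rather than being appended
 * to current->bio_list when called from within an existing submission
 * context.
 */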
static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
}

static inline struct bio *
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
    unsigned short nr_vecs)
{
	struct bio *bio;

#ifdef HAVE_BIO_ALLOC_4ARG
	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
#else
	bio = bio_alloc(gfp_mask, nr_vecs);
	if (likely(bio != NULL))
		bio_set_dev(bio, bdev);
#endif

	return (bio);
}

static inline unsigned int
vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
	    bio_size, abd_offset);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(nr_segs));
#else
	return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}

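/*
 * Map a zio onto one or more bios for the backing block device and submit
 * them. If the initial dio_request cannot hold enough bios to cover the
 * entire buffer, the bio array is doubled and the request is rebuilt.
 */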
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size;
	int bio_count = 16;
	int error = 0;
	struct blk_plug plug;
	unsigned short nr_vecs;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)io_offset,
		    (u_longlong_t)io_size,
		    (u_longlong_t)i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    zio->io_vd->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	dr->dr_zio = zio;

	/*
	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
	 * can cover at least 128KB and at most 1MB. When the required number
	 * of iovec's exceeds this, we are forced to break the IO in multiple
	 * bio's and wait for them all to complete. This is likely if the
	 * recordsize property is increased beyond 1MB. The default
	 * bio_count=16 should typically accommodate the maximum-size zio of
	 * 16MB.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (int i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * If additional bio's are required, we have to retry, but
		 * this should be rare - see the comment above.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

	/* Submit all bio's associated with this dio */
	for (int i = 0; i < dr->dr_bio_count; i++) {
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);
	}

	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

	vdev_disk_dio_put(dr);

	return (error);
}

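/*
 * Completion callback for the write cache flush bio. An EOPNOTSUPP result
 * marks the vdev as having no usable write cache so future flushes are
 * skipped.
 */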
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \
	defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC)
BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif
	bio_put(bio);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_issue_discard_trim(zio_t *zio, unsigned long flags)
{
	int ret;
	struct bio *bio = NULL;

#if defined(BLKDEV_DISCARD_SECURE)
	ret = -__blkdev_issue_discard(
	    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio);
#else
	(void) flags;
	ret = -__blkdev_issue_discard(
	    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio);
#endif
	if (!ret && bio) {
		bio->bi_private = zio;
		bio->bi_end_io = vdev_disk_discard_end_io;
		vdev_submit_bio(bio);
	}
	return (ret);
}
#endif

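/*
 * Dispatch a TRIM zio as a secure erase or a discard, using whichever
 * interface the running kernel provides.
 */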
static int
vdev_disk_io_trim(zio_t *zio)
{
	unsigned long trim_flags = 0;
	if (zio->io_trim_flags & ZIO_TRIM_SECURE) {
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
		return (-blkdev_issue_secure_erase(
		    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
#elif defined(BLKDEV_DISCARD_SECURE)
		trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
	}
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \
	defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC)
	return (vdev_issue_discard_trim(zio, trim_flags));
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
	return (-blkdev_issue_discard(
	    BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh),
	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags));
#else
#error "Unsupported kernel"
#endif
}

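/*
 * Entry point from the ZIO pipeline: dispatch cache flush ioctls, reads,
 * writes, and TRIMs to the backing block device.
 */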
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state. Nothing to be done here but return failure.
	 */
	if (vd->vd_bdh == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;

	case ZIO_TYPE_WRITE:
		rw = WRITE;
		break;

	case ZIO_TYPE_READ:
		rw = READ;
		break;

	case ZIO_TYPE_TRIM:
		zio->io_error = vdev_disk_io_trim(zio);
		rw_exit(&vd->vd_lock);
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
		if (zio->io_trim_flags & ZIO_TRIM_SECURE)
			zio_interrupt(zio);
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
		zio_interrupt(zio);
#endif
		return;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio,
	    zio->io_size, zio->io_offset, rw, 0);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media. If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) {
			invalidate_bdev(BDH_BDEV(vd->vd_bdh));
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};

/*
 * The zfs_vdev_scheduler module option has been deprecated. Setting this
 * value no longer has any effect. It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file. The following warning will be logged.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

static const char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
	"Timeout before determining that a device is missing");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");