/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>
#ifdef HAVE_LINUX_BLK_CGROUP_HEADER
#include <linux/blk-cgroup.h>
#endif

typedef struct vdev_disk {
	struct block_device *vd_bdev;
	krwlock_t vd_lock;
} vdev_disk_t;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static unsigned zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[0];	/* Attached bio's */
} dio_request_t;

static fmode_t
vdev_bdev_mode(spa_mode_t spa_mode)
{
	fmode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= FMODE_READ;

	if (spa_mode & SPA_MODE_WRITE)
		mode |= FMODE_WRITE;

	return (mode);
}

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
	return (bdev->bd_contains);
}
#endif

#if defined(HAVE_BDEVNAME)
#define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
#else
static inline void
vdev_bdevname(struct block_device *bdev, char *name)
{
	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
}
#endif

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity, or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'.  efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate.  Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, the usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev != bdev_whole(bdev)) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions.  Over-reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition
		 * layout keeps less than PARTITION_END_ALIGNMENT bytes after
		 * the "reserved" EFI partition: in such cases return the
		 * device usable capacity.
		 */
		available = i_size_read(bdev_whole(bdev)->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}

static void
vdev_disk_kobj_evt_post(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;
	if (vd && vd->vd_bdev) {
		spl_signal_kobj_evt(vd->vd_bdev);
	} else {
		vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
		    v->vdev_path);
	}
}

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open.  When expanding a
	 * partition force re-scanning the partition table if userland
	 * did not take care of this already.  We need to do this while closed
	 * in order to get an accurate updated block device size.  Then
	 * since udev may need to recreate the device links increase the
	 * open retry timeout before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
				vdev_bdevname(bdev_whole(bdev), disk_name + 5);
				/*
				 * If userland has BLKPG_RESIZE_PARTITION,
				 * then it should have updated the partition
				 * table already.  We can detect this by
				 * comparing our current physical size
				 * with that of the device.  If they are
				 * the same, then we must not have
				 * BLKPG_RESIZE_PARTITION or it failed to
				 * update the partition table online.  We
				 * fall back to rescanning the partition
				 * table from the kernel below.  However,
				 * if the capacity already reflects the
				 * updated partition, then we skip
				 * rescanning the partition table here.
				 */
				if (v->vdev_psize == bdev_capacity(bdev))
					reread_part = B_TRUE;
			}

			blkdev_put(bdev, mode | FMODE_EXCL);
		}

		if (reread_part) {
			bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL,
			    zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				blkdev_put(bdev, mode | FMODE_EXCL);
				if (error == 0) {
					timeout = MSEC2NSEC(
					    zfs_vdev_open_timeout_ms * 2);
				}
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 *
	 * When ERESTARTSYS is returned it indicates the block device is
	 * a zvol which could not be opened due to the deadlock detection
	 * logic in zvol_open().  Extend the timeout and retry the open;
	 * subsequent attempts are expected to eventually succeed.
	 */
	hrtime_t start = gethrtime();
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
		bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
		    zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			/*
			 * There is no point in waiting since the device
			 * was explicitly removed.
			 */
			if (v->vdev_removed)
				break;

			schedule_timeout(MSEC_TO_TICK(10));
		} else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) {
			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
			continue;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
		    (u_longlong_t)(gethrtime() - start),
		    (u_longlong_t)timeout);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	/* Determine the physical block size */
	int physical_block_size = bdev_physical_block_size(vd->vd_bdev);

	/* Determine the logical block size */
	int logical_block_size = bdev_logical_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = bdev_discard_supported(vd->vd_bdev);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = bdev_secure_discard_supported(vd->vd_bdev);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*physical_ashift = highbit64(MAX(physical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	*logical_ashift = highbit64(MAX(logical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		blkdev_put(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL);
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	atomic_set(&dr->dr_ref, 0);
	dr->dr_bio_count = bio_count;
	dr->dr_error = 0;

	for (int i = 0; i < dr->dr_bio_count; i++)
		dr->dr_bio[i] = NULL;

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio.
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio);
#else
	(void) submit_bio(bio_data_dir(bio), bio);
#endif
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
	defined(CONFIG_PREEMPTION) && \
	defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif

/*
 * As of the Linux 5.18 kernel bio_alloc() expects a block_device struct
 * as an argument, removing the need to set it with bio_set_dev().  This
 * removes the need for all of the following compatibility code.
 */
#if !defined(HAVE_BIO_ALLOC_4ARG)

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only.  Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
 * part, moving blkg_tryget into the private one.  Define our own version.
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
#else
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro.  Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}

#define	bio_associate_blkg vdev_bio_associate_blkg
#else
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_set_dev		vdev_bio_set_dev
#endif
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */
#endif /* !HAVE_BIO_ALLOC_4ARG */

/*
 * Temporarily detach any bio_list accumulated on the current task so the
 * block layer submits this bio immediately rather than deferring it to
 * the caller's bio_list.
 */
static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
}

static inline struct bio *
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
    unsigned short nr_vecs)
{
	struct bio *bio;

#ifdef HAVE_BIO_ALLOC_4ARG
	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
#else
	bio = bio_alloc(gfp_mask, nr_vecs);
	if (likely(bio != NULL))
		bio_set_dev(bio, bdev);
#endif

	return (bio);
}

static inline unsigned int
vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
	    bio_size, abd_offset);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(nr_segs));
#else
	return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}

static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size;
	int bio_count = 16;
	int error = 0;
	struct blk_plug plug;
	unsigned short nr_vecs;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)io_offset,
		    (u_longlong_t)io_size,
		    (u_longlong_t)i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
	 * can cover at least 128KB and at most 1MB.  When the required number
	 * of iovec's exceeds this, we are forced to break the IO in multiple
	 * bio's and wait for them all to complete.  This is likely if the
	 * recordsize property is increased beyond 1MB.  The default
	 * bio_count=16 should typically accommodate the maximum-size zio of
	 * 16MB.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (int i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * If additional bio's are required, we have to retry, but
		 * this should be rare - see the comment above.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

	/* Submit all bio's associated with this dio */
	for (int i = 0; i < dr->dr_bio_count; i++) {
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);
	}

	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

	(void) vdev_disk_dio_put(dr);

	return (error);
}

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}

static int
vdev_disk_io_trim(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;

#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
	if (zio->io_trim_flags & ZIO_TRIM_SECURE) {
		return (-blkdev_issue_secure_erase(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
	} else {
		return (-blkdev_issue_discard(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
	}
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
	unsigned long trim_flags = 0;
#if defined(BLKDEV_DISCARD_SECURE)
	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
		trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
	return (-blkdev_issue_discard(vd->vd_bdev,
	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags));
#else
#error "Unsupported kernel"
#endif
}

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state.  Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;

	case ZIO_TYPE_WRITE:
		rw = WRITE;
		break;

	case ZIO_TYPE_READ:
		rw = READ;
		break;

	case ZIO_TYPE_TRIM:
		zio->io_error = vdev_disk_io_trim(zio);
		rw_exit(&vd->vd_lock);
		zio_interrupt(zio);
		return;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, 0);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed, this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (!zfs_check_disk_status(vd->vd_bdev)) {
			invalidate_bdev(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}

static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};

/*
 * The zfs_vdev_scheduler module option has been deprecated.  Setting this
 * value no longer has any effect.  It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file.  The following warning will be logged.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

static const char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint64_t val;
	int error;

	error = kstrtoull(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_ulong(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint64_t val;
	int error;

	error = kstrtoull(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_ulong(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}