/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>
#ifdef HAVE_LINUX_BLK_CGROUP_HEADER
#include <linux/blk-cgroup.h>
#endif

typedef struct vdev_disk {
	struct block_device	*vd_bdev;
	krwlock_t		vd_lock;
} vdev_disk_t;

/*
 * Unique identifier for the exclusive vdev holder.
 */
static void *zfs_vdev_holder = VDEV_HOLDER;

/*
 * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
 * device is missing. The missing path may be transient since the links
 * can be briefly removed and recreated in response to udev events.
 */
static unsigned zfs_vdev_open_timeout_ms = 1000;

/*
 * Size of the "reserved" partition, in blocks.
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[0];	/* Attached bio's */
} dio_request_t;

static fmode_t
vdev_bdev_mode(spa_mode_t spa_mode)
{
	fmode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= FMODE_READ;

	if (spa_mode & SPA_MODE_WRITE)
		mode |= FMODE_WRITE;

	return (mode);
}

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}
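/*
 * Kernels without bdev_whole() expose the whole-disk device through
 * bd_contains; provide an equivalent helper so the code below can use
 * bdev_whole() unconditionally.
 */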
#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
	return (bdev->bd_contains);
}
#endif

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity. Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate. Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger
 * than, or at the very least equal to, the usable capacity to prevent
 * overestimating the pool expandsize.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev != bdev_whole(bdev)) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions. Over-reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition
		 * layout is keeping less than PARTITION_END_ALIGNMENT bytes
		 * after the "reserved" EFI partition: in such cases return
		 * the device usable capacity.
		 */
		available = i_size_read(bdev_whole(bdev)->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		psize = MAX(available, bdev_capacity(bdev));
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance
	 * while handling IRQs coming from a misbehaving disk device; use
	 * printk() which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}
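/*
 * Open the vdev's backing block device exclusively and report its usable
 * capacity, maximum expansion capacity, and logical/physical block sizes
 * (as ashifts) back to the generic vdev layer.
 */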
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it is currently open. When expanding a
	 * partition force re-scanning the partition table if userland
	 * did not take care of this already. We need to do this while closed
	 * in order to get an accurate updated block device size. Then,
	 * since udev may need to recreate the device links, increase the
	 * open retry timeout before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
				bdevname(bdev_whole(bdev), disk_name + 5);
				/*
				 * If userland has BLKPG_RESIZE_PARTITION,
				 * then it should have updated the partition
				 * table already. We can detect this by
				 * comparing our current physical size
				 * with that of the device. If they are
				 * the same, then we must not have
				 * BLKPG_RESIZE_PARTITION or it failed to
				 * update the partition table online. We
				 * fallback to rescanning the partition
				 * table from the kernel below. However,
				 * if the capacity already reflects the
				 * updated partition, then we skip
				 * rescanning the partition table here.
				 */
				if (v->vdev_psize == bdev_capacity(bdev))
					reread_part = B_TRUE;
			}

			blkdev_put(bdev, mode | FMODE_EXCL);
		}

		if (reread_part) {
			bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL,
			    zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				blkdev_put(bdev, mode | FMODE_EXCL);
				if (error == 0) {
					timeout = MSEC2NSEC(
					    zfs_vdev_open_timeout_ms * 2);
				}
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}

	/*
	 * Devices are always opened by the path provided at configuration
	 * time. This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue. If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved. This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit. It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events. This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point. Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up. In
	 * practice delays have been observed to be on the order of 100ms.
	 *
	 * When ERESTARTSYS is returned it indicates the block device is
	 * a zvol which could not be opened due to the deadlock detection
	 * logic in zvol_open(). Extend the timeout and retry the open;
	 * subsequent attempts are expected to eventually succeed.
	 */
	hrtime_t start = gethrtime();
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
		bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
		    zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			schedule_timeout(MSEC_TO_TICK(10));
		} else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) {
			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
			continue;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
		    (u_longlong_t)(gethrtime() - start),
		    (u_longlong_t)timeout);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	} else {
		vd->vd_bdev = bdev;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
	}

	/* Determine the physical block size */
	int physical_block_size = bdev_physical_block_size(vd->vd_bdev);

	/* Determine the logical block size */
	int logical_block_size = bdev_logical_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, so vdev_reopen() will try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Set when device reports it supports TRIM. */
	v->vdev_has_trim = bdev_discard_supported(vd->vd_bdev);

	/* Set when device reports it supports secure TRIM. */
	v->vdev_has_securetrim = bdev_secure_discard_supported(vd->vd_bdev);

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*physical_ashift = highbit64(MAX(physical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	*logical_ashift = highbit64(MAX(logical_block_size,
	    SPA_MINBLOCKSIZE)) - 1;

	return (0);
}

static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		blkdev_put(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL);
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	atomic_set(&dr->dr_ref, 0);
	dr->dr_bio_count = bio_count;
	dr->dr_error = 0;

	for (int i = 0; i < dr->dr_bio_count; i++)
		dr->dr_bio[i] = NULL;

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}
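/*
 * Drop a reference on the dio_request. When the final reference is dropped
 * the attached bio's are released and the parent zio, if any, is completed
 * with the first error recorded by the bio completion handlers.
 */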
static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio);
#else
	(void) submit_bio(bio_data_dir(bio), bio);
#endif
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
	defined(CONFIG_PREEMPTION) && \
	defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif

/*
 * As of the Linux 5.18 kernel bio_alloc() expects a block_device struct
 * as an argument, removing the need to set it with bio_set_dev(). This
 * removes the need for all of the following compatibility code.
 */
#if !defined(HAVE_BIO_ALLOC_4ARG)

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only. Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a
 * public part, moving blkg_tryget into the private one. Define our own
 * version.
 */
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		this_cpu_inc(*count);
		rc = true;
	} else {
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
#else
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol, thus inadvertently converting
 * the entire macro to GPL-only. Provide a minimal version which always
 * assigns the request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}

#define	bio_associate_blkg vdev_bio_associate_blkg
#else
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_set_dev		vdev_bio_set_dev
#endif
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
 */
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* HAVE_BIO_SET_DEV */
#endif /* !HAVE_BIO_ALLOC_4ARG */

/*
 * Submit a bio to the block layer immediately. The kernel collects bios
 * submitted from within a bio submission context on current->bio_list;
 * temporarily clearing the list ensures this bio is dispatched right away
 * rather than deferred onto the caller's list.
 */
static inline void
vdev_submit_bio(struct bio *bio)
{
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
}

static inline struct bio *
vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
    unsigned short nr_vecs)
{
	struct bio *bio;

#ifdef HAVE_BIO_ALLOC_4ARG
	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
#else
	bio = bio_alloc(gfp_mask, nr_vecs);
	if (likely(bio != NULL))
		bio_set_dev(bio, bdev);
#endif

	return (bio);
}

static inline unsigned int
vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
	    bio_size, abd_offset);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(nr_segs));
#else
	return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}
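/*
 * Map the zio's data buffer (ABD) onto one or more bio's and submit them to
 * the block layer. The requested range must lie entirely within the device.
 */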
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size;
	int bio_count = 16;
	int error = 0;
	struct blk_plug plug;
	unsigned short nr_vecs;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)io_offset,
		    (u_longlong_t)io_size,
		    (u_longlong_t)i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;

	/*
	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
	 * can cover at least 128KB and at most 1MB. When the required number
	 * of iovec's exceeds this, we are forced to break the IO into
	 * multiple bio's and wait for them all to complete. This is likely
	 * if the recordsize property is increased beyond 1MB. The default
	 * bio_count=16 should typically accommodate the maximum-size zio of
	 * 16MB.
	 */

	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (int i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * If additional bio's are required, we have to retry, but
		 * this should be rare - see the comment above.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

	/* Submit all bio's associated with this dio */
	for (int i = 0; i < dr->dr_bio_count; i++) {
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);
	}

	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

	(void) vdev_disk_dio_put(dr);

	return (error);
}

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}
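/*
 * Translate a ZIO_TYPE_TRIM request into a discard (or secure erase when
 * requested and supported) of the same sector range, using whichever
 * discard interface the running kernel provides.
 */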
static int
vdev_disk_io_trim(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;

#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
	if (zio->io_trim_flags & ZIO_TRIM_SECURE) {
		return (-blkdev_issue_secure_erase(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
	} else {
		return (-blkdev_issue_discard(vd->vd_bdev,
		    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
	}
#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
	unsigned long trim_flags = 0;
#if defined(BLKDEV_DISCARD_SECURE)
	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
		trim_flags |= BLKDEV_DISCARD_SECURE;
#endif
	return (-blkdev_issue_discard(vd->vd_bdev,
	    zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags));
#else
#error "Unsupported kernel"
#endif
}

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state. Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;

	case ZIO_TYPE_WRITE:
		rw = WRITE;
		break;

	case ZIO_TYPE_READ:
		rw = READ;
		break;

	case ZIO_TYPE_TRIM:
		zio->io_error = vdev_disk_io_trim(zio);
		rw_exit(&vd->vd_lock);
		zio_interrupt(zio);
		return;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, 0);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media. If it is
	 * determined the media has changed this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (zfs_check_media_change(vd->vd_bdev)) {
			invalidate_bdev(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}
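/*
 * The hold and release callbacks are effectively no-ops on Linux; the
 * exclusive reference taken by blkdev_get_by_path() in vdev_disk_open()
 * is what keeps the underlying device pinned.
 */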
static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE			/* leaf vdev */
};

/*
 * The zfs_vdev_scheduler module option has been deprecated. Setting this
 * value no longer has any effect. It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file. The following warning will be logged.
 */
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

static const char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint64_t val;
	int error;

	error = kstrtoull(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_ulong(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint64_t val;
	int error;

	error = kstrtoull(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_ulong(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}