// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 *
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot and module load. No user command
 * needs to be run before opening and using a device.
 *
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2024, Klara, Inc.
 */

/*
 * Note on locking of zvol state structures.
 *
 * These structures are used to maintain internal state used to emulate block
 * devices on top of zvols. In particular, management of device minor number
 * operations - create, remove, rename, and set_snapdev - involves access to
 * these structures. The zvol_state_lock is primarily used to protect the
 * zvol_state_list. The zv->zv_state_lock is used to protect the contents
 * of the zvol_state_t structures, as well as to make sure that when the
 * time comes to remove the structure from the list, it is not in use, and
 * therefore, it can be taken off zvol_state_list and freed.
 *
 * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
 * e.g. for the duration of receive and rollback operations. This lock can be
 * held for significant periods of time. Given that it is undesirable to hold
 * mutexes for long periods of time, the following lock ordering applies:
 * - take zvol_state_lock if necessary, to protect zvol_state_list
 * - take zv_suspend_lock if necessary, by the code path in question
 * - take zv_state_lock to protect zvol_state_t
 *
 * The minor operations are issued to spa->spa_zvol_taskq queues, that are
 * single-threaded (to preserve order of minor operations), and are executed
 * through the zvol_task_cb that dispatches the specific operations. Therefore,
 * these operations are serialized per pool. Consequently, we can be certain
 * that for a given zvol, there is only one operation at a time in progress.
 * That is why one can be sure that first, zvol_state_t for a given zvol is
 * allocated and placed on zvol_state_list, and then other minor operations
 * for this zvol are going to proceed in the order of issue.
 */
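
/*
 * Illustrative sketch (not compiled) of the acquisition order described
 * above, as followed by lookup paths such as zvol_find_by_name_hash():
 *
 *	rw_enter(&zvol_state_lock, RW_READER);	(protects zvol_state_list)
 *	rw_enter(&zv->zv_suspend_lock, mode);	(only if suspending I/O)
 *	mutex_enter(&zv->zv_state_lock);	(protects zvol_state_t)
 *
 * zvol_state_lock may be dropped as soon as the entry is resolved; the
 * per-zvol locks are held for the duration of the operation.
 */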

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>

unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_prefetch_bytes = (128 * 1024);
unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
unsigned int zvol_threads = 0;
unsigned int zvol_num_taskqs = 0;
unsigned int zvol_request_sync = 0;

struct hlist_head *zvol_htable;
static list_t zvol_state_list;
krwlock_t zvol_state_lock;
extern int zfs_bclone_wait_dirty;
zv_taskq_t zvol_taskqs;

typedef enum {
	ZVOL_ASYNC_CREATE_MINORS,
	ZVOL_ASYNC_REMOVE_MINORS,
	ZVOL_ASYNC_RENAME_MINORS,
	ZVOL_ASYNC_SET_SNAPDEV,
	ZVOL_ASYNC_SET_VOLMODE,
	ZVOL_ASYNC_MAX
} zvol_async_op_t;

typedef struct {
	zvol_async_op_t zt_op;
	char zt_name1[MAXNAMELEN];
	char zt_name2[MAXNAMELEN];
	uint64_t zt_value;
	uint32_t zt_total;
	uint32_t zt_done;
	int32_t zt_status;
	int zt_error;
} zvol_task_t;

zv_request_task_t *
zv_request_task_create(zv_request_t zvr)
{
	zv_request_task_t *task;
	task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
	taskq_init_ent(&task->ent);
	task->zvr = zvr;
	return (task);
}

void
zv_request_task_free(zv_request_task_t *task)
{
	kmem_free(task, sizeof (*task));
}

uint64_t
zvol_name_hash(const char *name)
{
	uint64_t crc = -1ULL;
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (const uint8_t *p = (const uint8_t *)name; *p != 0; p++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF];
	return (crc);
}

/*
 * Find a zvol_state_t given the name and hash generated by zvol_name_hash.
 * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
 * return (NULL) without taking any locks. The zv_suspend_lock is always taken
 * before zv_state_lock. The mode argument indicates the mode (including none)
 * for zv_suspend_lock to be taken.
 */
zvol_state_t *
zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
{
	zvol_state_t *zv;
	struct hlist_node *p = NULL;

	rw_enter(&zvol_state_lock, RW_READER);
	hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
		zv = hlist_entry(p, zvol_state_t, zv_hlink);
		mutex_enter(&zv->zv_state_lock);
		if (zv->zv_hash == hash && strcmp(zv->zv_name, name) == 0) {
			/*
			 * this is the right zvol, take the locks in the
			 * right order
			 */
			if (mode != RW_NONE &&
			    !rw_tryenter(&zv->zv_suspend_lock, mode)) {
				mutex_exit(&zv->zv_state_lock);
				rw_enter(&zv->zv_suspend_lock, mode);
				mutex_enter(&zv->zv_state_lock);
				/*
				 * zvol cannot be renamed as we continue
				 * to hold zvol_state_lock
				 */
				ASSERT(zv->zv_hash == hash &&
				    strcmp(zv->zv_name, name) == 0);
			}
			rw_exit(&zvol_state_lock);
			return (zv);
		}
		mutex_exit(&zv->zv_state_lock);
	}
	rw_exit(&zvol_state_lock);

	return (NULL);
}

/*
 * Find a zvol_state_t given the name.
 * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
 * return (NULL) without taking any locks. The zv_suspend_lock is always taken
 * before zv_state_lock. The mode argument indicates the mode (including none)
 * for zv_suspend_lock to be taken.
 */
static zvol_state_t *
zvol_find_by_name(const char *name, int mode)
{
	return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode));
}
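
/*
 * Example (illustrative only) of the unlock discipline expected of
 * zvol_find_by_name() callers; compare zvol_set_volthreading() below:
 *
 *	zvol_state_t *zv = zvol_find_by_name(name, RW_NONE);
 *	if (zv == NULL)
 *		return (SET_ERROR(ENOENT));
 *	...update *zv under zv_state_lock...
 *	mutex_exit(&zv->zv_state_lock);
 *
 * With mode != RW_NONE the caller must also rw_exit(&zv->zv_suspend_lock).
 */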

/*
 * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
 */
void
zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
	zfs_creat_t *zct = arg;
	nvlist_t *nvprops = zct->zct_props;
	int error;
	uint64_t volblocksize, volsize;

	VERIFY(nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
	if (nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);

	/*
	 * These properties must be removed from the list so the generic
	 * property setting step won't apply to them.
	 */
	VERIFY(nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
	(void) nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));

	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
	ASSERT(error == 0);
}

/*
 * ZFS_IOC_OBJSET_STATS entry point.
 */
int
zvol_get_stats(objset_t *os, nvlist_t *nv)
{
	int error;
	dmu_object_info_t *doi;
	uint64_t val;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
	if (error)
		return (SET_ERROR(error));

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
	error = dmu_object_info(os, ZVOL_OBJ, doi);

	if (error == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
		    doi->doi_data_block_size);
	}

	kmem_free(doi, sizeof (dmu_object_info_t));

	return (SET_ERROR(error));
}

/*
 * Sanity check volume size.
 */
int
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
{
	if (volsize == 0)
		return (SET_ERROR(EINVAL));

	if (volsize % blocksize != 0)
		return (SET_ERROR(EINVAL));

#ifdef _ILP32
	if (volsize - 1 > SPEC_MAXOFFSET_T)
		return (SET_ERROR(EOVERFLOW));
#endif
	return (0);
}

/*
 * Ensure the zap is flushed then inform the VFS of the capacity change.
 */
static int
zvol_update_volsize(uint64_t volsize, objset_t *os)
{
	dmu_tx_t *tx;
	int error;
	uint64_t txg;

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (SET_ERROR(error));
	}
	txg = dmu_tx_get_txg(tx);

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
	    &volsize, tx);
	dmu_tx_commit(tx);

	txg_wait_synced(dmu_objset_pool(os), txg);

	if (error == 0)
		error = dmu_free_long_range(os,
		    ZVOL_OBJ, volsize, DMU_OBJECT_END);

	return (error);
}
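
/*
 * Illustrative sketch (not compiled) of the DMU transaction idiom used
 * throughout this file, as in zvol_update_volsize() above: declare the
 * writes, assign the tx to a txg, then either commit or abort.
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 *	error = dmu_tx_assign(tx, DMU_TX_WAIT);
 *	if (error) {
 *		dmu_tx_abort(tx);	(a tx that failed assignment)
 *		return (SET_ERROR(error));
 *	}
 *	...apply changes under tx...
 *	dmu_tx_commit(tx);
 */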

/*
 * ZFS_PROP_VOLSIZE set entry point. Note that modifying the volume
 * size will result in a udev "change" event being generated.
 */
int
zvol_set_volsize(const char *name, uint64_t volsize)
{
	objset_t *os = NULL;
	uint64_t readonly;
	int error;
	boolean_t owned = B_FALSE;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
	if (error != 0)
		return (SET_ERROR(error));
	if (readonly)
		return (SET_ERROR(EROFS));

	zvol_state_t *zv = zvol_find_by_name(name, RW_READER);

	ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
	    RW_READ_HELD(&zv->zv_suspend_lock)));

	if (zv == NULL || zv->zv_objset == NULL) {
		if (zv != NULL)
			rw_exit(&zv->zv_suspend_lock);
		if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE,
		    FTAG, &os)) != 0) {
			if (zv != NULL)
				mutex_exit(&zv->zv_state_lock);
			return (SET_ERROR(error));
		}
		owned = B_TRUE;
		if (zv != NULL)
			zv->zv_objset = os;
	} else {
		os = zv->zv_objset;
	}

	dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP);

	if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
	    (error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
		goto out;

	error = zvol_update_volsize(volsize, os);
	if (error == 0 && zv != NULL) {
		zv->zv_volsize = volsize;
		zv->zv_changed = 1;
	}
out:
	kmem_free(doi, sizeof (dmu_object_info_t));

	if (owned) {
		dmu_objset_disown(os, B_TRUE, FTAG);
		if (zv != NULL)
			zv->zv_objset = NULL;
	} else {
		rw_exit(&zv->zv_suspend_lock);
	}

	if (zv != NULL)
		mutex_exit(&zv->zv_state_lock);

	if (error == 0 && zv != NULL)
		zvol_os_update_volsize(zv, volsize);

	return (SET_ERROR(error));
}

/*
 * Update volthreading.
 */
int
zvol_set_volthreading(const char *name, boolean_t value)
{
	zvol_state_t *zv = zvol_find_by_name(name, RW_NONE);
	if (zv == NULL)
		return (ENOENT);
	zv->zv_threading = value;
	mutex_exit(&zv->zv_state_lock);
	return (0);
}

/*
 * Update zvol ro property.
 */
int
zvol_set_ro(const char *name, boolean_t value)
{
	zvol_state_t *zv = zvol_find_by_name(name, RW_NONE);
	if (zv == NULL)
		return (-1);
	if (value) {
		zvol_os_set_disk_ro(zv, 1);
		zv->zv_flags |= ZVOL_RDONLY;
	} else {
		zvol_os_set_disk_ro(zv, 0);
		zv->zv_flags &= ~ZVOL_RDONLY;
	}
	mutex_exit(&zv->zv_state_lock);
	return (0);
}

/*
 * Sanity check volume block size.
 */
int
zvol_check_volblocksize(const char *name, uint64_t volblocksize)
{
	/* Record sizes above 128k need the feature to be enabled */
	if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
		spa_t *spa;
		int error;

		if ((error = spa_open(name, &spa, FTAG)) != 0)
			return (error);

		if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
			spa_close(spa, FTAG);
			return (SET_ERROR(ENOTSUP));
		}

		/*
		 * We don't allow setting the property above 1MB,
		 * unless the tunable has been changed.
		 */
		if (volblocksize > zfs_max_recordsize) {
			spa_close(spa, FTAG);
			return (SET_ERROR(EDOM));
		}

		spa_close(spa, FTAG);
	}

	if (volblocksize < SPA_MINBLOCKSIZE ||
	    volblocksize > SPA_MAXBLOCKSIZE ||
	    !ISP2(volblocksize))
		return (SET_ERROR(EDOM));

	return (0);
}
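
/*
 * Worked example for zvol_check_volblocksize(): 8192 is accepted (a power
 * of two within [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE]); 12288 fails with
 * EDOM (not a power of two); 262144 requires the pool's large_blocks
 * feature, and anything above zfs_max_recordsize fails with EDOM.
 */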

/*
 * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we
 * implement DKIOCFREE/free-long-range.
 */
static int
zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
{
	zvol_state_t *zv = arg1;
	lr_truncate_t *lr = arg2;
	uint64_t offset, length;

	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	int error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		(void) zil_replaying(zv->zv_zilog, tx);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset,
		    length);
	}

	return (error);
}

/*
 * Replay a TX_WRITE ZIL transaction that didn't get committed
 * after a system failure
 */
static int
zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
	zvol_state_t *zv = arg1;
	lr_write_t *lr = arg2;
	objset_t *os = zv->zv_objset;
	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
	uint64_t offset, length;
	dmu_tx_t *tx;
	int error;

	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
	} else {
		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
		(void) zil_replaying(zv->zv_zilog, tx);
		dmu_tx_commit(tx);
	}

	return (error);
}
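
/*
 * Illustrative record layout (not compiled): a WR_COPIED TX_WRITE record,
 * as consumed by zvol_replay_write() above, carries its payload directly
 * after the header, which is why the data pointer is (char *)(lr + 1):
 *
 *	+--------------------+--------------------------+
 *	| lr_write_t header  | lr_length bytes of data  |
 *	+--------------------+--------------------------+
 *	^ lr                 ^ (char *)(lr + 1)
 */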

/*
 * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed
 * after a system failure
 */
static int
zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
{
	zvol_state_t *zv = arg1;
	lr_clone_range_t *lr = arg2;
	objset_t *os = zv->zv_objset;
	dmu_tx_t *tx;
	int error;
	uint64_t blksz;
	uint64_t off;
	uint64_t len;

	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
	ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t,
	    lr_bps[lr->lr_nbps]));

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ASSERT(spa_feature_is_enabled(dmu_objset_spa(os),
	    SPA_FEATURE_BLOCK_CLONING));

	off = lr->lr_offset;
	len = lr->lr_length;
	blksz = lr->lr_blksz;

	if ((off % blksz) != 0) {
		return (SET_ERROR(EINVAL));
	}

	error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn);
	if (error != 0 || !zv->zv_dn)
		return (error);
	tx = dmu_tx_create(os);
	dmu_tx_hold_clone_by_dnode(tx, zv->zv_dn, off, len, blksz);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		goto out;
	}
	error = dmu_brt_clone(zv->zv_objset, ZVOL_OBJ, off, len,
	    tx, lr->lr_bps, lr->lr_nbps);
	if (error != 0) {
		dmu_tx_commit(tx);
		goto out;
	}

	/*
	 * zil_replaying() not only checks if we are replaying ZIL, but also
	 * updates the ZIL header to record replay progress.
	 */
	VERIFY(zil_replaying(zv->zv_zilog, tx));
	dmu_tx_commit(tx);

out:
	dnode_rele(zv->zv_dn, zv);
	zv->zv_dn = NULL;
	return (error);
}

int
zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst,
    uint64_t outoff, uint64_t len)
{
	zilog_t *zilog_dst;
	zfs_locked_range_t *inlr, *outlr;
	objset_t *inos, *outos;
	dmu_tx_t *tx;
	blkptr_t *bps;
	size_t maxblocks;
	int error = EINVAL;

	rw_enter(&zv_dst->zv_suspend_lock, RW_READER);
	if (zv_dst->zv_zilog == NULL) {
		rw_exit(&zv_dst->zv_suspend_lock);
		rw_enter(&zv_dst->zv_suspend_lock, RW_WRITER);
		if (zv_dst->zv_zilog == NULL) {
			zv_dst->zv_zilog = zil_open(zv_dst->zv_objset,
			    zvol_get_data, &zv_dst->zv_kstat.dk_zil_sums);
			zv_dst->zv_flags |= ZVOL_WRITTEN_TO;
			VERIFY0((zv_dst->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED));
		}
		rw_downgrade(&zv_dst->zv_suspend_lock);
	}
	if (zv_src != zv_dst)
		rw_enter(&zv_src->zv_suspend_lock, RW_READER);

	inos = zv_src->zv_objset;
	outos = zv_dst->zv_objset;

	/*
	 * Sanity checks
	 */
	if (!spa_feature_is_enabled(dmu_objset_spa(outos),
	    SPA_FEATURE_BLOCK_CLONING)) {
		error = EOPNOTSUPP;
		goto out;
	}
	if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
		error = EXDEV;
		goto out;
	}
	if (inos->os_encrypted != outos->os_encrypted) {
		error = EXDEV;
		goto out;
	}
	if (zv_src->zv_volblocksize != zv_dst->zv_volblocksize) {
		error = EINVAL;
		goto out;
	}
	if (inoff >= zv_src->zv_volsize || outoff >= zv_dst->zv_volsize) {
		error = 0;
		goto out;
	}

	/*
	 * Do not read beyond boundary
	 */
	if (len > zv_src->zv_volsize - inoff)
		len = zv_src->zv_volsize - inoff;
	if (len > zv_dst->zv_volsize - outoff)
		len = zv_dst->zv_volsize - outoff;
	if (len == 0) {
		error = 0;
		goto out;
	}

	/*
	 * No overlapping if we are cloning within the same file
	 */
	if (zv_src == zv_dst) {
		if (inoff < outoff + len && outoff < inoff + len) {
			error = EINVAL;
			goto out;
		}
	}

	/*
	 * Offsets and length must be at block boundaries
	 */
	if ((inoff % zv_src->zv_volblocksize) != 0 ||
	    (outoff % zv_dst->zv_volblocksize) != 0) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Length must be multiple of block size
	 */
	if ((len % zv_src->zv_volblocksize) != 0) {
		error = EINVAL;
		goto out;
	}

	zilog_dst = zv_dst->zv_zilog;
	maxblocks = zil_max_log_data(zilog_dst, sizeof (lr_clone_range_t)) /
	    sizeof (bps[0]);
	bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);

	/*
	 * Maintain predictable lock order.
	 */
	if (zv_src < zv_dst || (zv_src == zv_dst && inoff < outoff)) {
		inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len,
		    RL_READER);
		outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len,
		    RL_WRITER);
	} else {
		outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len,
		    RL_WRITER);
		inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len,
		    RL_READER);
	}

	while (len > 0) {
		uint64_t size, last_synced_txg;
		size_t nbps = maxblocks;
		size = MIN(zv_src->zv_volblocksize * maxblocks, len);
		last_synced_txg = spa_last_synced_txg(
		    dmu_objset_spa(zv_src->zv_objset));
		error = dmu_read_l0_bps(zv_src->zv_objset, ZVOL_OBJ, inoff,
		    size, bps, &nbps);
		if (error != 0) {
			/*
			 * If we are trying to clone a block that was created
			 * in the current transaction group, the error will be
			 * EAGAIN here. Based on zfs_bclone_wait_dirty either
			 * return a shortened range to the caller so it can
			 * fall back, or wait for the next TXG and check again.
			 */
			if (error == EAGAIN && zfs_bclone_wait_dirty) {
				txg_wait_synced(dmu_objset_pool
				    (zv_src->zv_objset), last_synced_txg + 1);
				continue;
			}
			break;
		}

		tx = dmu_tx_create(zv_dst->zv_objset);
		dmu_tx_hold_clone_by_dnode(tx, zv_dst->zv_dn, outoff, size,
		    zv_src->zv_volblocksize);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_brt_clone(zv_dst->zv_objset, ZVOL_OBJ, outoff, size,
		    tx, bps, nbps);
		if (error != 0) {
			dmu_tx_commit(tx);
			break;
		}
		zvol_log_clone_range(zilog_dst, tx, TX_CLONE_RANGE, outoff,
		    size, zv_src->zv_volblocksize, bps, nbps);
		dmu_tx_commit(tx);
		inoff += size;
		outoff += size;
		len -= size;
	}
	vmem_free(bps, sizeof (bps[0]) * maxblocks);
	zfs_rangelock_exit(outlr);
	zfs_rangelock_exit(inlr);
	if (error == 0 && zv_dst->zv_objset->os_sync == ZFS_SYNC_ALWAYS) {
		zil_commit(zilog_dst, ZVOL_OBJ);
	}
out:
	if (zv_src != zv_dst)
		rw_exit(&zv_src->zv_suspend_lock);
	rw_exit(&zv_dst->zv_suspend_lock);
	return (SET_ERROR(error));
}
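
/*
 * Example (illustrative, hypothetical caller): an OS-level copy-offload
 * entry point would resolve source and destination to their zvol_state_t
 * and call, with offsets and length aligned to the shared volblocksize:
 *
 *	error = zvol_clone_range(zv_in, in_off, zv_out, out_off, nbytes);
 */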

/*
 * Handles TX_CLONE_RANGE transactions.
 */
void
zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off,
    uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps)
{
	itx_t *itx;
	lr_clone_range_t *lr;
	uint64_t partlen, max_log_data;
	size_t partnbps;

	if (zil_replaying(zilog, tx))
		return;

	max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t));

	while (nbps > 0) {
		partnbps = MIN(nbps, max_log_data / sizeof (bps[0]));
		partlen = partnbps * blksz;
		ASSERT3U(partlen, <, len + blksz);
		partlen = MIN(partlen, len);

		itx = zil_itx_create(txtype,
		    sizeof (*lr) + sizeof (bps[0]) * partnbps);
		lr = (lr_clone_range_t *)&itx->itx_lr;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = off;
		lr->lr_length = partlen;
		lr->lr_blksz = blksz;
		lr->lr_nbps = partnbps;
		memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps);

		zil_itx_assign(zilog, itx, tx);

		bps += partnbps;
		ASSERT3U(nbps, >=, partnbps);
		nbps -= partnbps;
		off += partlen;
		ASSERT3U(len, >=, partlen);
		len -= partlen;
	}
}

static int
zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
{
	(void) arg1, (void) arg2, (void) byteswap;
	return (SET_ERROR(ENOTSUP));
}

/*
 * Callback vectors for replaying records.
 * Only TX_WRITE, TX_TRUNCATE, and TX_CLONE_RANGE are needed for zvol.
 */
zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
	zvol_replay_err,	/* no such transaction type */
	zvol_replay_err,	/* TX_CREATE */
	zvol_replay_err,	/* TX_MKDIR */
	zvol_replay_err,	/* TX_MKXATTR */
	zvol_replay_err,	/* TX_SYMLINK */
	zvol_replay_err,	/* TX_REMOVE */
	zvol_replay_err,	/* TX_RMDIR */
	zvol_replay_err,	/* TX_LINK */
	zvol_replay_err,	/* TX_RENAME */
	zvol_replay_write,	/* TX_WRITE */
	zvol_replay_truncate,	/* TX_TRUNCATE */
	zvol_replay_err,	/* TX_SETATTR */
	zvol_replay_err,	/* TX_ACL_V0 */
	zvol_replay_err,	/* TX_ACL */
	zvol_replay_err,	/* TX_CREATE_ACL */
	zvol_replay_err,	/* TX_CREATE_ATTR */
	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL */
	zvol_replay_err,	/* TX_MKDIR_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
	zvol_replay_err,	/* TX_WRITE2 */
	zvol_replay_err,	/* TX_SETSAXATTR */
	zvol_replay_err,	/* TX_RENAME_EXCHANGE */
	zvol_replay_err,	/* TX_RENAME_WHITEOUT */
	zvol_replay_clone_range,	/* TX_CLONE_RANGE */
};
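
/*
 * Illustrative sketch (not compiled): at minor bring-up the OS-specific
 * code replays any outstanding log records through this vector, roughly:
 *
 *	if (spa_writeable(dmu_objset_spa(os)))
 *		zil_replay(os, zv, zvol_replay_vector);
 *
 * where each record's lrc_txtype indexes the table above.
 */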

/*
 * zvol_log_write() handles TX_WRITE transactions.
 */
void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
    uint64_t size, boolean_t commit)
{
	uint32_t blocksize = zv->zv_volblocksize;
	zilog_t *zilog = zv->zv_zilog;
	itx_wr_state_t write_state;
	uint64_t log_size = 0;

	if (zil_replaying(zilog, tx))
		return;

	write_state = zil_write_state(zilog, size, blocksize, B_FALSE, commit);

	while (size) {
		itx_t *itx;
		lr_write_t *lr;
		itx_wr_state_t wr_state = write_state;
		ssize_t len = size;

		if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog))
			wr_state = WR_NEED_COPY;
		else if (wr_state == WR_INDIRECT)
			len = MIN(blocksize - P2PHASE(offset, blocksize), size);

		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
		    (wr_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (wr_state == WR_COPIED &&
		    dmu_read_by_dnode(zv->zv_dn, offset, len, lr + 1,
		    DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			wr_state = WR_NEED_COPY;
		}

		log_size += itx->itx_size;
		if (wr_state == WR_NEED_COPY)
			log_size += len;

		itx->itx_wr_state = wr_state;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = offset;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zv;

		(void) zil_itx_assign(zilog, itx, tx);

		offset += len;
		size -= len;
	}

	dsl_pool_wrlog_count(zilog->zl_dmu_pool, log_size, tx->tx_txg);
}
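
/*
 * Illustrative pairing (not compiled): a synchronous datapath write logs
 * the itx, commits the tx, and then forces the log, roughly:
 *
 *	zvol_log_write(zv, tx, off, len, sync);
 *	dmu_tx_commit(tx);
 *	if (sync)
 *		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 */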

/*
 * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
 */
void
zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len)
{
	itx_t *itx;
	lr_truncate_t *lr;
	zilog_t *zilog = zv->zv_zilog;

	if (zil_replaying(zilog, tx))
		return;

	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
	lr = (lr_truncate_t *)&itx->itx_lr;
	lr->lr_foid = ZVOL_OBJ;
	lr->lr_offset = off;
	lr->lr_length = len;

	zil_itx_assign(zilog, itx, tx);
}

static void
zvol_get_done(zgd_t *zgd, int error)
{
	(void) error;
	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_rangelock_exit(zgd->zgd_lr);

	kmem_free(zgd, sizeof (zgd_t));
}

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
    struct lwb *lwb, zio_t *zio)
{
	zvol_state_t *zv = arg;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3U(size, !=, 0);

	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) {	/* immediate write */
		zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
		    size, RL_READER);
		error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
		    DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
	} else {	/* indirect write */
		ASSERT3P(zio, !=, NULL);
		/*
		 * Have to lock the whole block to ensure when it's written out
		 * and its checksum is being calculated that no one can change
		 * the data. Contrary to zfs_get_data we need not re-check
		 * blocksize after we get the lock because it cannot be changed.
		 */
		size = zv->zv_volblocksize;
		offset = P2ALIGN_TYPED(offset, size, uint64_t);
		zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
		    size, RL_READER);
		error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd,
		    &db);
		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db != NULL);
			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zvol_get_done, zgd);

			if (error == 0)
				return (0);
		}
	}

	zvol_get_done(zgd, error);

	return (SET_ERROR(error));
}

/*
 * The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
 */
void
zvol_insert(zvol_state_t *zv)
{
	ASSERT(RW_WRITE_HELD(&zvol_state_lock));
	list_insert_head(&zvol_state_list, zv);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
}

/*
 * Simply remove the zvol from the list of zvols.
 */
static void
zvol_remove(zvol_state_t *zv)
{
	ASSERT(RW_WRITE_HELD(&zvol_state_lock));
	list_remove(&zvol_state_list, zv);
	hlist_del(&zv->zv_hlink);
}

/*
 * Set up zv after we have just taken ownership of zv->zv_objset.
 */
static int
zvol_setup_zv(zvol_state_t *zv)
{
	uint64_t volsize;
	int error;
	uint64_t ro;
	objset_t *os = zv->zv_objset;

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));

	zv->zv_zilog = NULL;
	zv->zv_flags &= ~ZVOL_WRITTEN_TO;

	error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
	if (error)
		return (SET_ERROR(error));

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		return (SET_ERROR(error));

	error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn);
	if (error)
		return (SET_ERROR(error));

	zvol_os_set_capacity(zv, volsize >> 9);
	zv->zv_volsize = volsize;

	if (ro || dmu_objset_is_snapshot(os) ||
	    !spa_writeable(dmu_objset_spa(os))) {
		zvol_os_set_disk_ro(zv, 1);
		zv->zv_flags |= ZVOL_RDONLY;
	} else {
		zvol_os_set_disk_ro(zv, 0);
		zv->zv_flags &= ~ZVOL_RDONLY;
	}
	return (0);
}

/*
 * Shut down everything related to zv_objset, except zv_objset itself.
 * This is the reverse of zvol_setup_zv.
 */
static void
zvol_shutdown_zv(zvol_state_t *zv)
{
	ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
	    RW_LOCK_HELD(&zv->zv_suspend_lock));

	if (zv->zv_flags & ZVOL_WRITTEN_TO) {
		ASSERT(zv->zv_zilog != NULL);
		zil_close(zv->zv_zilog);
	}

	zv->zv_zilog = NULL;

	dnode_rele(zv->zv_dn, zv);
	zv->zv_dn = NULL;

	/*
	 * Evict cached data. We must write out any dirty data before
	 * disowning the dataset.
	 */
	if (zv->zv_flags & ZVOL_WRITTEN_TO)
		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	(void) dmu_objset_evict_dbufs(zv->zv_objset);
}

/*
 * return the proper tag for rollback and recv
 */
void *
zvol_tag(zvol_state_t *zv)
{
	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
	return (zv->zv_open_count > 0 ? zv : NULL);
}

/*
 * Suspend the zvol for recv and rollback.
 */
zvol_state_t *
zvol_suspend(const char *name)
{
	zvol_state_t *zv;

	zv = zvol_find_by_name(name, RW_WRITER);

	if (zv == NULL)
		return (NULL);

	/* block all I/O, release in zvol_resume. */
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));

	atomic_inc(&zv->zv_suspend_ref);

	if (zv->zv_open_count > 0)
		zvol_shutdown_zv(zv);

	/*
	 * do not hold zv_state_lock across suspend/resume to
	 * avoid locking up zvol lookups
	 */
	mutex_exit(&zv->zv_state_lock);

	/* zv_suspend_lock is released in zvol_resume() */
	return (zv);
}

int
zvol_resume(zvol_state_t *zv)
{
	int error = 0;

	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));

	mutex_enter(&zv->zv_state_lock);

	if (zv->zv_open_count > 0) {
		VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
		VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
		VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset));
		dmu_objset_rele(zv->zv_objset, zv);

		error = zvol_setup_zv(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	rw_exit(&zv->zv_suspend_lock);
	/*
	 * We need this because we don't hold zvol_state_lock while releasing
	 * zv_suspend_lock. zvol_remove_minors_impl thus cannot check
	 * zv_suspend_lock to determine it is safe to free because rwlock is
	 * not inherently atomic.
	 */
	atomic_dec(&zv->zv_suspend_ref);

	if (zv->zv_flags & ZVOL_REMOVING)
		cv_broadcast(&zv->zv_removing_cv);

	return (SET_ERROR(error));
}
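
/*
 * Example (illustrative only) of the suspend/resume bracket wrapped
 * around receive and rollback operations:
 *
 *	zvol_state_t *zv = zvol_suspend(name);
 *	...receive into / roll back the dataset...
 *	if (zv != NULL)
 *		error = zvol_resume(zv);
 */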

int
zvol_first_open(zvol_state_t *zv, boolean_t readonly)
{
	objset_t *os;
	int error;

	ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(mutex_owned(&spa_namespace_lock));

	boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
	if (error)
		return (SET_ERROR(error));

	zv->zv_objset = os;

	error = zvol_setup_zv(zv);
	if (error) {
		dmu_objset_disown(os, 1, zv);
		zv->zv_objset = NULL;
	}

	return (error);
}

void
zvol_last_close(zvol_state_t *zv)
{
	ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_flags & ZVOL_REMOVING)
		cv_broadcast(&zv->zv_removing_cv);

	zvol_shutdown_zv(zv);

	dmu_objset_disown(zv->zv_objset, 1, zv);
	zv->zv_objset = NULL;
}

typedef struct minors_job {
	list_t *list;
	list_node_t link;
	/* input */
	char *name;
	/* output */
	int error;
} minors_job_t;

/*
 * Prefetch zvol dnodes for the minors_job
 */
static void
zvol_prefetch_minors_impl(void *arg)
{
	minors_job_t *job = arg;
	char *dsname = job->name;
	objset_t *os = NULL;

	job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
	    FTAG, &os);
	if (job->error == 0) {
		dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ);
		dmu_objset_disown(os, B_TRUE, FTAG);
	}
}

/*
 * Mask errors to continue dmu_objset_find() traversal
 */
static int
zvol_create_snap_minor_cb(const char *dsname, void *arg)
{
	minors_job_t *j = arg;
	list_t *minors_list = j->list;
	const char *name = j->name;

	ASSERT0(MUTEX_HELD(&spa_namespace_lock));

	/* skip the designated dataset */
	if (name && strcmp(dsname, name) == 0)
		return (0);

	/* at this point, the dsname should name a snapshot */
	if (strchr(dsname, '@') == 0) {
		dprintf("zvol_create_snap_minor_cb(): "
		    "%s is not a snapshot name\n", dsname);
	} else {
		minors_job_t *job;
		char *n = kmem_strdup(dsname);
		if (n == NULL)
			return (0);

		job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
		job->name = n;
		job->list = minors_list;
		job->error = 0;
		list_insert_tail(minors_list, job);
		/* don't care if dispatch fails, because job->error is 0 */
		taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
		    TQ_SLEEP);
	}

	return (0);
}

/*
 * If spa_keystore_load_wkey() is called for an encrypted zvol,
 * we need to look for any clones also using the key. This function
 * is "best effort" - so we just skip over it if there are failures.
 */
static void
zvol_add_clones(const char *dsname, list_t *minors_list)
{
	/* Also check if it has clones */
	dsl_dir_t *dd = NULL;
	dsl_pool_t *dp = NULL;

	if (dsl_pool_hold(dsname, FTAG, &dp) != 0)
		return;

	if (!spa_feature_is_enabled(dp->dp_spa,
	    SPA_FEATURE_ENCRYPTION))
		goto out;

	if (dsl_dir_hold(dp, dsname, FTAG, &dd, NULL) != 0)
		goto out;

	if (dsl_dir_phys(dd)->dd_clones == 0)
		goto out;

	zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
	zap_attribute_t *za = zap_attribute_alloc();
	objset_t *mos = dd->dd_pool->dp_meta_objset;

	for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones);
	    zap_cursor_retrieve(zc, za) == 0;
	    zap_cursor_advance(zc)) {
		dsl_dataset_t *clone;
		minors_job_t *job;

		if (dsl_dataset_hold_obj(dd->dd_pool,
		    za->za_first_integer, FTAG, &clone) == 0) {

			char name[ZFS_MAX_DATASET_NAME_LEN];
			dsl_dataset_name(clone, name);

			char *n = kmem_strdup(name);
			job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
			job->name = n;
			job->list = minors_list;
			job->error = 0;
			list_insert_tail(minors_list, job);

			dsl_dataset_rele(clone, FTAG);
		}
	}
	zap_cursor_fini(zc);
	zap_attribute_free(za);
	kmem_free(zc, sizeof (zap_cursor_t));

out:
	if (dd != NULL)
		dsl_dir_rele(dd, FTAG);
	dsl_pool_rele(dp, FTAG);
}

/*
 * Mask errors to continue dmu_objset_find() traversal
 */
static int
zvol_create_minors_cb(const char *dsname, void *arg)
{
	uint64_t snapdev;
	int error;
	list_t *minors_list = arg;

	ASSERT0(MUTEX_HELD(&spa_namespace_lock));

	error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL);
	if (error)
		return (0);

	/*
	 * Given the name and the 'snapdev' property, create device minor nodes
	 * with the linkages to zvols/snapshots as needed.
	 * If the name represents a zvol, create a minor node for the zvol, then
	 * check if its snapshots are 'visible', and if so, iterate over the
	 * snapshots and create device minor nodes for those.
	 */
	if (strchr(dsname, '@') == 0) {
		minors_job_t *job;
		char *n = kmem_strdup(dsname);
		if (n == NULL)
			return (0);

		job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
		job->name = n;
		job->list = minors_list;
		job->error = 0;
		list_insert_tail(minors_list, job);
		/* don't care if dispatch fails, because job->error is 0 */
		taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
		    TQ_SLEEP);

		zvol_add_clones(dsname, minors_list);

		if (snapdev == ZFS_SNAPDEV_VISIBLE) {
			/*
			 * traverse snapshots only, do not traverse children,
			 * and skip the 'dsname'
			 */
			(void) dmu_objset_find(dsname,
			    zvol_create_snap_minor_cb, (void *)job,
			    DS_FIND_SNAPSHOTS);
		}
	} else {
		dprintf("zvol_create_minors_cb(): %s is not a zvol name\n",
		    dsname);
	}

	return (0);
}

static void
zvol_task_update_status(zvol_task_t *task, uint64_t total, uint64_t done,
    int error)
{
	task->zt_total += total;
	task->zt_done += done;
	if (task->zt_total != task->zt_done) {
		task->zt_status = -1;
		if (error)
			task->zt_error = error;
	}
}

static const char *
zvol_task_op_msg(zvol_async_op_t op)
{
	switch (op) {
	case ZVOL_ASYNC_CREATE_MINORS:
		return ("create");
	case ZVOL_ASYNC_REMOVE_MINORS:
		return ("remove");
	case ZVOL_ASYNC_RENAME_MINORS:
		return ("rename");
	case ZVOL_ASYNC_SET_SNAPDEV:
	case ZVOL_ASYNC_SET_VOLMODE:
		return ("set property");
	default:
		return ("unknown");
	}
}

static void
zvol_task_report_status(zvol_task_t *task)
{
	if (task->zt_status == 0)
		return;

	if (task->zt_error) {
		dprintf("The %s minors zvol task was not ok, last error %d\n",
		    zvol_task_op_msg(task->zt_op), task->zt_error);
	} else {
		dprintf("The %s minors zvol task was not ok\n",
		    zvol_task_op_msg(task->zt_op));
	}
}

/*
 * Create minors for the specified dataset, including children and snapshots.
 * Pay attention to the 'snapdev' property and iterate over the snapshots
 * only if they are 'visible'. This approach allows one to assure that the
 * snapshot metadata is read from disk only if it is needed.
 *
 * The name can represent a dataset to be recursively scanned for zvols and
 * their snapshots, or a single zvol snapshot. If the name represents a
 * dataset, the scan is performed in two nested stages:
 * - scan the dataset for zvols, and
 * - for each zvol, create a minor node, then check if the zvol's snapshots
 *   are 'visible', and only then iterate over the snapshots if needed
 *
 * If the name represents a snapshot, a check is performed if the snapshot is
 * 'visible' (which also verifies that the parent is a zvol), and if so,
 * a minor node for that snapshot is created.
 */
static void
zvol_create_minors_impl(zvol_task_t *task)
{
	const char *name = task->zt_name1;
	list_t minors_list;
	minors_job_t *job;
	uint64_t snapdev;
	int total = 0, done = 0, last_error = 0, error;

	/*
	 * Note: the dsl_pool_config_lock must not be held.
	 * Minor node creation needs to obtain the zvol_state_lock.
	 * zvol_open() obtains the zvol_state_lock and then the dsl pool
	 * config lock. Therefore, we can't have the config lock now if
	 * we are going to wait for the zvol_state_lock, because it
	 * would be a lock order inversion which could lead to deadlock.
	 */

	if (zvol_inhibit_dev) {
		return;
	}

	/*
	 * This is the list for prefetch jobs. Whenever we find a match
	 * during dmu_objset_find, we insert a minors_job to the list and
	 * taskq_dispatch to prefetch zvol dnodes in parallel. Note we don't
	 * need any lock because all list operations are done on the current
	 * thread.
	 *
	 * We will use this list to do zvol_os_create_minor after prefetch
	 * so we don't have to traverse using dmu_objset_find again.
	 */
	list_create(&minors_list, sizeof (minors_job_t),
	    offsetof(minors_job_t, link));

	if (strchr(name, '@') != NULL) {
		error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL);
		if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) {
			error = zvol_os_create_minor(name);
			if (error == 0) {
				done++;
			} else {
				last_error = error;
			}
			total++;
		}
	} else {
		fstrans_cookie_t cookie = spl_fstrans_mark();
		(void) dmu_objset_find(name, zvol_create_minors_cb,
		    &minors_list, DS_FIND_CHILDREN);
		spl_fstrans_unmark(cookie);
	}

	taskq_wait_outstanding(system_taskq, 0);

	/*
	 * Prefetch is completed, we can do zvol_os_create_minor
	 * sequentially.
	 */
	while ((job = list_remove_head(&minors_list)) != NULL) {
		if (!job->error) {
			error = zvol_os_create_minor(job->name);
			if (error == 0) {
				done++;
			} else {
				last_error = error;
			}
		} else if (job->error == EINVAL) {
			/*
			 * An objset with the name requested by the current
			 * job exists, but its type is different from zvol.
			 * Just ignore this sort of error.
			 */
			done++;
		} else {
			last_error = job->error;
		}
		total++;
		kmem_strfree(job->name);
		kmem_free(job, sizeof (minors_job_t));
	}

	list_destroy(&minors_list);
	zvol_task_update_status(task, total, done, last_error);
}

/*
 * Remove minors for specified dataset including children and snapshots.
 */

/*
 * Remove the minor for a given zvol. This will do it all:
 *  - flag the zvol for removal, so new requests are rejected
 *  - wait until outstanding requests are completed
 *  - remove it from lists
 *  - free it
 * It's also usable as a taskq task, and smells nice too.
 */
static void
zvol_remove_minor_task(void *arg)
{
	zvol_state_t *zv = (zvol_state_t *)arg;

	ASSERT(!RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));

	mutex_enter(&zv->zv_state_lock);
	while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
		zv->zv_flags |= ZVOL_REMOVING;
		cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
	}
	mutex_exit(&zv->zv_state_lock);

	rw_enter(&zvol_state_lock, RW_WRITER);
	mutex_enter(&zv->zv_state_lock);

	zvol_remove(zv);
	zvol_os_clear_private(zv);

	mutex_exit(&zv->zv_state_lock);
	rw_exit(&zvol_state_lock);

	zvol_os_free(zv);
}

static void
zvol_free_task(void *arg)
{
	zvol_os_free(arg);
}

static void
zvol_remove_minors_impl(zvol_task_t *task)
{
	zvol_state_t *zv, *zv_next;
	const char *name = task ? task->zt_name1 : NULL;
	int namelen = ((name) ? strlen(name) : 0);
	taskqid_t t;
	list_t delay_list, free_list;

	if (zvol_inhibit_dev)
		return;

	list_create(&delay_list, sizeof (zvol_state_t),
	    offsetof(zvol_state_t, zv_next));
	list_create(&free_list, sizeof (zvol_state_t),
	    offsetof(zvol_state_t, zv_next));

	rw_enter(&zvol_state_lock, RW_WRITER);
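
	/*
	 * Example (illustrative): with name == "pool/ds" the match below
	 * covers "pool/ds" itself, "pool/ds@snap" and "pool/ds/vol", but
	 * not "pool/ds2", since the prefix must be followed by '/' or '@'.
	 */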

	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
		zv_next = list_next(&zvol_state_list, zv);

		mutex_enter(&zv->zv_state_lock);
		if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
		    (strncmp(zv->zv_name, name, namelen) == 0 &&
		    (zv->zv_name[namelen] == '/' ||
		    zv->zv_name[namelen] == '@'))) {
			/*
			 * By holding zv_state_lock here, we guarantee that no
			 * one is currently using this zv
			 */

			/*
			 * If in use, try to throw everyone off and try again
			 * later.
			 */
			if (zv->zv_open_count > 0 ||
			    atomic_read(&zv->zv_suspend_ref)) {
				zv->zv_flags |= ZVOL_REMOVING;
				t = taskq_dispatch(
				    zv->zv_objset->os_spa->spa_zvol_taskq,
				    zvol_remove_minor_task, zv, TQ_SLEEP);
				if (t == TASKQID_INVALID) {
					/*
					 * Couldn't create the task, so we'll
					 * do it in place once the loop is
					 * finished.
					 */
					list_insert_head(&delay_list, zv);
				}
				mutex_exit(&zv->zv_state_lock);
				continue;
			}

			zvol_remove(zv);

			/*
			 * Cleared while holding zvol_state_lock as a writer
			 * which will prevent zvol_open() from opening it.
			 */
			zvol_os_clear_private(zv);

			/* Drop zv_state_lock before zvol_free() */
			mutex_exit(&zv->zv_state_lock);

			/* Try parallel zv_free, if that fails do it in place */
			t = taskq_dispatch(system_taskq, zvol_free_task, zv,
			    TQ_SLEEP);
			if (t == TASKQID_INVALID)
				list_insert_head(&free_list, zv);
		} else {
			mutex_exit(&zv->zv_state_lock);
		}
	}
	rw_exit(&zvol_state_lock);

	/* Wait for zvols that we couldn't create a remove task for */
	while ((zv = list_remove_head(&delay_list)) != NULL)
		zvol_remove_minor_task(zv);

	/* Free any that we couldn't free in parallel earlier */
	while ((zv = list_remove_head(&free_list)) != NULL)
		zvol_os_free(zv);
}

/* Remove minor for this specific volume only */
static int
zvol_remove_minor_impl(const char *name)
{
	zvol_state_t *zv = NULL, *zv_next;

	if (zvol_inhibit_dev)
		return (0);

	rw_enter(&zvol_state_lock, RW_WRITER);

	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
		zv_next = list_next(&zvol_state_list, zv);

		mutex_enter(&zv->zv_state_lock);
		if (strcmp(zv->zv_name, name) == 0)
			/* Found, leave the loop with zv_state_lock held */
			break;
		mutex_exit(&zv->zv_state_lock);
	}

	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (ENOENT);
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) {
		/*
		 * In use, so try to throw everyone off, then wait
		 * until finished.
		 */
		zv->zv_flags |= ZVOL_REMOVING;
		mutex_exit(&zv->zv_state_lock);
		rw_exit(&zvol_state_lock);
		zvol_remove_minor_task(zv);
		return (0);
	}

	zvol_remove(zv);
	zvol_os_clear_private(zv);

	mutex_exit(&zv->zv_state_lock);
	rw_exit(&zvol_state_lock);

	zvol_os_free(zv);

	return (0);
}

/*
 * Rename minors for specified dataset including children and snapshots.
 */
static void
zvol_rename_minors_impl(zvol_task_t *task)
{
	zvol_state_t *zv, *zv_next;
	const char *oldname = task->zt_name1;
	const char *newname = task->zt_name2;
	int total = 0, done = 0, last_error = 0, error, oldnamelen;

	if (zvol_inhibit_dev)
		return;

	oldnamelen = strlen(oldname);

	rw_enter(&zvol_state_lock, RW_READER);

	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
		zv_next = list_next(&zvol_state_list, zv);

		mutex_enter(&zv->zv_state_lock);

		error = 0;
		if (strcmp(zv->zv_name, oldname) == 0) {
			error = zvol_os_rename_minor(zv, newname);
		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
		    (zv->zv_name[oldnamelen] == '/' ||
		    zv->zv_name[oldnamelen] == '@')) {
			char *name = kmem_asprintf("%s%c%s", newname,
			    zv->zv_name[oldnamelen],
			    zv->zv_name + oldnamelen + 1);
			error = zvol_os_rename_minor(zv, name);
			kmem_strfree(name);
		}
		if (error) {
			last_error = error;
		} else {
			done++;
		}
		total++;
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);
	zvol_task_update_status(task, total, done, last_error);
}

typedef struct zvol_snapdev_cb_arg {
	zvol_task_t *task;
	uint64_t snapdev;
} zvol_snapdev_cb_arg_t;

static int
zvol_set_snapdev_cb(const char *dsname, void *param)
{
	zvol_snapdev_cb_arg_t *arg = param;
	int error = 0;

	if (strchr(dsname, '@') == NULL)
		return (0);

	switch (arg->snapdev) {
	case ZFS_SNAPDEV_VISIBLE:
		error = zvol_os_create_minor(dsname);
		break;
	case ZFS_SNAPDEV_HIDDEN:
		error = zvol_remove_minor_impl(dsname);
		break;
	}

	zvol_task_update_status(arg->task, 1, error == 0, error);
	return (0);
}

static void
zvol_set_snapdev_impl(zvol_task_t *task)
{
	const char *name = task->zt_name1;
	uint64_t snapdev = task->zt_value;

	zvol_snapdev_cb_arg_t arg = {task, snapdev};
	fstrans_cookie_t cookie = spl_fstrans_mark();
	/*
	 * The zvol_set_snapdev_sync() sets snapdev appropriately
	 * in the dataset hierarchy. Here, we only scan snapshots.
	 */
	dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS);
	spl_fstrans_unmark(cookie);
}
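
/*
 * Example (illustrative): "zfs set snapdev=visible pool/vol" eventually
 * dispatches a ZVOL_ASYNC_SET_SNAPDEV task, so zvol_set_snapdev_impl()
 * runs with zt_name1 == "pool/vol" and creates a minor for every
 * "pool/vol@..." snapshot; setting snapdev=hidden removes them again.
 */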

static void
zvol_set_volmode_impl(zvol_task_t *task)
{
	const char *name = task->zt_name1;
	uint64_t volmode = task->zt_value;
	fstrans_cookie_t cookie;
	uint64_t old_volmode;
	zvol_state_t *zv;
	int error;

	if (strchr(name, '@') != NULL)
		return;

	/*
	 * It's unfortunate we need to remove minors before we create new ones:
	 * this is necessary because our backing gendisk (zvol_state->zv_disk)
	 * could be different when we set, for instance, volmode from "geom"
	 * to "dev" (or vice versa).
	 */
	zv = zvol_find_by_name(name, RW_NONE);
	if (zv == NULL && volmode == ZFS_VOLMODE_NONE)
		return;
	if (zv != NULL) {
		old_volmode = zv->zv_volmode;
		mutex_exit(&zv->zv_state_lock);
		if (old_volmode == volmode)
			return;
		zvol_wait_close(zv);
	}
	cookie = spl_fstrans_mark();
	switch (volmode) {
	case ZFS_VOLMODE_NONE:
		error = zvol_remove_minor_impl(name);
		break;
	case ZFS_VOLMODE_GEOM:
	case ZFS_VOLMODE_DEV:
		error = zvol_remove_minor_impl(name);
		/*
		 * The remove minor call above might not be needed if
		 * volmode was switched from the 'none' value.
		 * Ignore the error in that case.
		 */
		if (error == ENOENT)
			error = 0;
		else if (error)
			break;
		error = zvol_os_create_minor(name);
		break;
	case ZFS_VOLMODE_DEFAULT:
		error = zvol_remove_minor_impl(name);
		if (zvol_volmode == ZFS_VOLMODE_NONE)
			break;
		else /* if zvol_volmode is invalid defaults to "geom" */
			error = zvol_os_create_minor(name);
		break;
	}
	zvol_task_update_status(task, 1, error == 0, error);
	spl_fstrans_unmark(cookie);
}

/*
 * The worker thread function performed asynchronously.
 */
static void
zvol_task_cb(void *arg)
{
	zvol_task_t *task = arg;

	switch (task->zt_op) {
	case ZVOL_ASYNC_CREATE_MINORS:
		zvol_create_minors_impl(task);
		break;
	case ZVOL_ASYNC_REMOVE_MINORS:
		zvol_remove_minors_impl(task);
		break;
	case ZVOL_ASYNC_RENAME_MINORS:
		zvol_rename_minors_impl(task);
		break;
	case ZVOL_ASYNC_SET_SNAPDEV:
		zvol_set_snapdev_impl(task);
		break;
	case ZVOL_ASYNC_SET_VOLMODE:
		zvol_set_volmode_impl(task);
		break;
	default:
		VERIFY(0);
		break;
	}

	zvol_task_report_status(task);
	kmem_free(task, sizeof (zvol_task_t));
}

typedef struct zvol_set_prop_int_arg {
	const char *zsda_name;
	uint64_t zsda_value;
	zprop_source_t zsda_source;
	zfs_prop_t zsda_prop;
} zvol_set_prop_int_arg_t;

/*
 * Sanity check the dataset for safe use by the sync task. No additional
 * conditions are imposed.
 */

static int
zvol_set_common_check(void *arg, dmu_tx_t *tx)
{
	zvol_set_prop_int_arg_t *zsda = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *dd;
	int error;

	error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
	if (error != 0)
		return (error);

	dsl_dir_rele(dd, FTAG);

	return (error);
}

static int
zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	zvol_set_prop_int_arg_t *zsda = arg;
	char dsname[ZFS_MAX_DATASET_NAME_LEN];
	zvol_task_t *task;
	uint64_t prop;

	const char *prop_name = zfs_prop_to_name(zsda->zsda_prop);
	dsl_dataset_name(ds, dsname);

	if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0)
		return (0);

	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
	if (zsda->zsda_prop == ZFS_PROP_VOLMODE) {
		task->zt_op = ZVOL_ASYNC_SET_VOLMODE;
	} else if (zsda->zsda_prop == ZFS_PROP_SNAPDEV) {
		task->zt_op = ZVOL_ASYNC_SET_SNAPDEV;
	} else {
		kmem_free(task, sizeof (zvol_task_t));
		return (0);
	}
	task->zt_value = prop;
	strlcpy(task->zt_name1, dsname, sizeof (task->zt_name1));
	(void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
	    task, TQ_SLEEP);
	return (0);
}

/*
 * Traverse all child datasets and apply the property appropriately.
 * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
 * dataset and read the effective "property" on every child in the callback
 * function: this is because the value is not guaranteed to be the same in the
 * whole dataset hierarchy.
 */
static void
zvol_set_common_sync(void *arg, dmu_tx_t *tx)
{
	zvol_set_prop_int_arg_t *zsda = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	int error;

	VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));

	error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
	if (error == 0) {
		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(zsda->zsda_prop),
		    zsda->zsda_source, sizeof (zsda->zsda_value), 1,
		    &zsda->zsda_value, tx);
		dsl_dataset_rele(ds, FTAG);
	}

	dmu_objset_find_dp(dp, dd->dd_object, zvol_set_common_sync_cb,
	    zsda, DS_FIND_CHILDREN);

	dsl_dir_rele(dd, FTAG);
}

int
zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source,
    uint64_t val)
{
	zvol_set_prop_int_arg_t zsda;

	zsda.zsda_name = ddname;
	zsda.zsda_source = source;
	zsda.zsda_value = val;
	zsda.zsda_prop = prop;

	return (dsl_sync_task(ddname, zvol_set_common_check,
	    zvol_set_common_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
}
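
/*
 * Example (illustrative): the property-set path applies "snapdev" with
 *
 *	error = zvol_set_common(dsname, ZFS_PROP_SNAPDEV, source, snapdev);
 *
 * The sync task stores the value on the top-level dataset only; the
 * per-dataset minor work is then dispatched asynchronously by
 * zvol_set_common_sync_cb() above.
 */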

void
zvol_create_minors(const char *name)
{
	spa_t *spa;
	zvol_task_t *task;
	taskqid_t id;

	if (spa_open(name, &spa, FTAG) != 0)
		return;

	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
	task->zt_op = ZVOL_ASYNC_CREATE_MINORS;
	strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
	if (id != TASKQID_INVALID)
		taskq_wait_id(spa->spa_zvol_taskq, id);

	spa_close(spa, FTAG);
}

void
zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
{
	zvol_task_t *task;
	taskqid_t id;

	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
	task->zt_op = ZVOL_ASYNC_REMOVE_MINORS;
	strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
	if ((async == B_FALSE) && (id != TASKQID_INVALID))
		taskq_wait_id(spa->spa_zvol_taskq, id);
}

void
zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
    boolean_t async)
{
	zvol_task_t *task;
	taskqid_t id;

	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
	task->zt_op = ZVOL_ASYNC_RENAME_MINORS;
	strlcpy(task->zt_name1, name1, sizeof (task->zt_name1));
	strlcpy(task->zt_name2, name2, sizeof (task->zt_name2));
	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
	if ((async == B_FALSE) && (id != TASKQID_INVALID))
		taskq_wait_id(spa->spa_zvol_taskq, id);
}

boolean_t
zvol_is_zvol(const char *name)
{
	return (zvol_os_is_zvol(name));
}

int
zvol_init_impl(void)
{
	int i;

	/*
	 * zvol_threads is the module param the user passes in.
	 *
	 * zvol_actual_threads is what we use internally, since the user can
	 * pass zvol_threads = 0 to mean "use all the CPUs" (the default).
	 */
	static unsigned int zvol_actual_threads;

	if (zvol_threads == 0) {
		/*
		 * See dde9380a1 for why 32 was chosen here. This should
		 * probably be refined to be some multiple of the number
		 * of CPUs.
		 */
		zvol_actual_threads = MAX(max_ncpus, 32);
	} else {
		zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
	}

	/*
	 * Use at least 32 zvol_threads, but for many-core systems prefer
	 * 6 threads per taskq, with no more taskqs than threads in them
	 * on large systems.
	 *
	 *                 taskq   total
	 * cpus    taskqs  threads threads
	 * ------- ------- ------- -------
	 * 1       1       32       32
	 * 2       1       32       32
	 * 4       1       32       32
	 * 8       2       16       32
	 * 16      3       11       33
	 * 32      5       7        35
	 * 64      8       8        64
	 * 128     11      12       132
	 * 256     16      16       256
	 */
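
	/*
	 * Worked example (matches the table above): with max_ncpus == 16
	 * and the defaults (zvol_threads == 0 -> 32 threads,
	 * zvol_num_taskqs == 0): num_tqs = 1 + 16 / 6 = 3 and
	 * per_tq_thread = 32 / 3 rounded up = 11, i.e. 3 taskqs of 11
	 * threads, 33 total.
	 */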

	zv_taskq_t *ztqs = &zvol_taskqs;
	int num_tqs = MIN(max_ncpus, zvol_num_taskqs);
	if (num_tqs == 0) {
		num_tqs = 1 + max_ncpus / 6;
		while (num_tqs * num_tqs > zvol_actual_threads)
			num_tqs--;
	}

	int per_tq_thread = zvol_actual_threads / num_tqs;
	if (per_tq_thread * num_tqs < zvol_actual_threads)
		per_tq_thread++;

	ztqs->tqs_cnt = num_tqs;
	ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);

	for (uint_t i = 0; i < num_tqs; i++) {
		char name[32];
		(void) snprintf(name, sizeof (name), "%s_tq-%u",
		    ZVOL_DRIVER, i);
		ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
		    maxclsyspri, per_tq_thread, INT_MAX,
		    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
		if (ztqs->tqs_taskq[i] == NULL) {
			for (int j = i - 1; j >= 0; j--)
				taskq_destroy(ztqs->tqs_taskq[j]);
			kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
			    sizeof (taskq_t *));
			ztqs->tqs_taskq = NULL;
			return (SET_ERROR(ENOMEM));
		}
	}

	list_create(&zvol_state_list, sizeof (zvol_state_t),
	    offsetof(zvol_state_t, zv_next));
	rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL);

	zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head),
	    KM_SLEEP);
	for (i = 0; i < ZVOL_HT_SIZE; i++)
		INIT_HLIST_HEAD(&zvol_htable[i]);

	return (0);
}

void
zvol_fini_impl(void)
{
	zv_taskq_t *ztqs = &zvol_taskqs;

	zvol_remove_minors_impl(NULL);

	/*
	 * The call to "zvol_remove_minors_impl" may dispatch entries to
	 * the system_taskq, but it doesn't wait for those entries to
	 * complete before it returns. Thus, we must wait for all of the
	 * removals to finish, before we can continue.
	 */
	taskq_wait_outstanding(system_taskq, 0);

	kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
	list_destroy(&zvol_state_list);
	rw_destroy(&zvol_state_lock);

	if (ztqs->tqs_taskq == NULL) {
		ASSERT3U(ztqs->tqs_cnt, ==, 0);
	} else {
		for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
			ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
			taskq_destroy(ztqs->tqs_taskq[i]);
		}
		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
		    sizeof (taskq_t *));
		ztqs->tqs_taskq = NULL;
	}
}

ZFS_MODULE_PARAM(zfs_vol, zvol_, inhibit_dev, UINT, ZMOD_RW,
	"Do not create zvol device nodes");
ZFS_MODULE_PARAM(zfs_vol, zvol_, prefetch_bytes, UINT, ZMOD_RW,
	"Prefetch N bytes at zvol start+end");
ZFS_MODULE_PARAM(zfs_vol, zvol_vol, mode, UINT, ZMOD_RW,
	"Default volmode property value");
ZFS_MODULE_PARAM(zfs_vol, zvol_, threads, UINT, ZMOD_RW,
	"Number of threads for I/O requests. Set to 0 to use all active CPUs");
ZFS_MODULE_PARAM(zfs_vol, zvol_, num_taskqs, UINT, ZMOD_RW,
	"Number of zvol taskqs");
ZFS_MODULE_PARAM(zfs_vol, zvol_, request_sync, UINT, ZMOD_RW,
	"Synchronously handle bio requests");