xref: /freebsd/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c (revision b670c9bafc0e31c7609969bf374b2e80bdc00211)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
24  * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
25  * Copyright (c) 2024, Klara, Inc.
26  */
27 
28 #include <sys/dataset_kstats.h>
29 #include <sys/dbuf.h>
30 #include <sys/dmu_traverse.h>
31 #include <sys/dsl_dataset.h>
32 #include <sys/dsl_prop.h>
33 #include <sys/dsl_dir.h>
34 #include <sys/zap.h>
35 #include <sys/zfeature.h>
36 #include <sys/zil_impl.h>
37 #include <sys/dmu_tx.h>
38 #include <sys/zio.h>
39 #include <sys/zfs_rlock.h>
40 #include <sys/spa_impl.h>
41 #include <sys/zvol.h>
42 #include <sys/zvol_impl.h>
43 #include <cityhash.h>
44 
45 #include <linux/blkdev_compat.h>
46 #include <linux/task_io_accounting_ops.h>
47 #include <linux/workqueue.h>
48 #include <linux/blk-mq.h>
49 
50 static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
51     struct request *rq, boolean_t force_sync);
52 
53 static unsigned int zvol_major = ZVOL_MAJOR;
54 static unsigned int zvol_prefetch_bytes = (128 * 1024);
55 static unsigned long zvol_max_discard_blocks = 16384;
56 
57 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
58 static unsigned int zvol_open_timeout_ms = 1000;
59 #endif
60 
61 static unsigned int zvol_blk_mq_threads = 0;
62 static unsigned int zvol_blk_mq_actual_threads;
63 static boolean_t zvol_use_blk_mq = B_FALSE;
64 
65 /*
66  * The maximum number of volblocksize blocks to process per thread.  Typically,
67  * write heavy workloads preform better with higher values here, and read
68  * heavy workloads preform better with lower values, but that's not a hard
69  * and fast rule.  It's basically a knob to tune between "less overhead with
70  * less parallelism" and "more overhead, but more parallelism".
71  *
72  * '8' was chosen as a reasonable, balanced, default based off of sequential
73  * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
74  */
75 static unsigned int zvol_blk_mq_blocks_per_thread = 8;
76 
77 #ifndef	BLKDEV_DEFAULT_RQ
78 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
79 #define	BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
80 #endif
81 
82 /*
83  * Finalize our BIO or request.
84  */
85 static inline void
86 zvol_end_io(struct bio *bio, struct request *rq, int error)
87 {
88 	if (bio) {
89 		bio->bi_status = errno_to_bi_status(-error);
90 		bio_endio(bio);
91 	} else {
92 		blk_mq_end_request(rq, errno_to_bi_status(error));
93 	}
94 }
95 
96 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
97 static unsigned int zvol_actual_blk_mq_queue_depth;
98 
99 struct zvol_state_os {
100 	struct gendisk		*zvo_disk;	/* generic disk */
101 	struct request_queue	*zvo_queue;	/* request queue */
102 	dev_t			zvo_dev;	/* device id */
103 
104 	struct blk_mq_tag_set tag_set;
105 
106 	/* Set from the global 'zvol_use_blk_mq' at zvol load */
107 	boolean_t use_blk_mq;
108 };
109 
110 static struct ida zvol_ida;
111 
112 /*
113  * This is called when a new block multiqueue request comes in.  A request
114  * contains one or more BIOs.
115  */
116 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
117     const struct blk_mq_queue_data *bd)
118 {
119 	struct request *rq = bd->rq;
120 	zvol_state_t *zv = rq->q->queuedata;
121 
122 	/* Tell the kernel that we are starting to process this request */
123 	blk_mq_start_request(rq);
124 
125 	if (blk_rq_is_passthrough(rq)) {
126 		/* Skip non filesystem request */
127 		blk_mq_end_request(rq, BLK_STS_IOERR);
128 		return (BLK_STS_IOERR);
129 	}
130 
131 	zvol_request_impl(zv, NULL, rq, 0);
132 
133 	/* Acknowledge to the kernel that we got this request */
134 	return (BLK_STS_OK);
135 }
136 
137 static struct blk_mq_ops zvol_blk_mq_queue_ops = {
138 	.queue_rq = zvol_mq_queue_rq,
139 };
140 
141 /* Initialize our blk-mq struct */
142 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
143 {
144 	struct zvol_state_os *zso = zv->zv_zso;
145 
146 	memset(&zso->tag_set, 0, sizeof (zso->tag_set));
147 
148 	/* Initialize tag set. */
149 	zso->tag_set.ops = &zvol_blk_mq_queue_ops;
150 	zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
151 	zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
152 	zso->tag_set.numa_node = NUMA_NO_NODE;
153 	zso->tag_set.cmd_size = 0;
154 
155 	/*
156 	 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
157 	 * zvol_request_impl()
158 	 */
159 	zso->tag_set.flags = BLK_MQ_F_BLOCKING;
160 
161 #ifdef BLK_MQ_F_SHOULD_MERGE
162 	/*
163 	 * Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit.
164 	 * For older kernels, we set it.
165 	 */
166 	zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE;
167 #endif
168 
169 	zso->tag_set.driver_data = zv;
170 
171 	return (blk_mq_alloc_tag_set(&zso->tag_set));
172 }
173 
174 /*
175  * Given a path, return TRUE if path is a ZVOL.
176  */
177 boolean_t
178 zvol_os_is_zvol(const char *path)
179 {
180 	dev_t dev = 0;
181 
182 	if (vdev_lookup_bdev(path, &dev) != 0)
183 		return (B_FALSE);
184 
185 	if (MAJOR(dev) == zvol_major)
186 		return (B_TRUE);
187 
188 	return (B_FALSE);
189 }
190 
191 static void
192 zvol_write(zv_request_t *zvr)
193 {
194 	struct bio *bio = zvr->bio;
195 	struct request *rq = zvr->rq;
196 	int error = 0;
197 	zfs_uio_t uio;
198 	zvol_state_t *zv = zvr->zv;
199 	struct request_queue *q;
200 	struct gendisk *disk;
201 	unsigned long start_time = 0;
202 	boolean_t acct = B_FALSE;
203 
204 	ASSERT3P(zv, !=, NULL);
205 	ASSERT3U(zv->zv_open_count, >, 0);
206 	ASSERT3P(zv->zv_zilog, !=, NULL);
207 
208 	q = zv->zv_zso->zvo_queue;
209 	disk = zv->zv_zso->zvo_disk;
210 
211 	/* bio marked as FLUSH need to flush before write */
212 	if (io_is_flush(bio, rq))
213 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
214 
215 	/* Some requests are just for flush and nothing else. */
216 	if (io_size(bio, rq) == 0) {
217 		rw_exit(&zv->zv_suspend_lock);
218 		zvol_end_io(bio, rq, 0);
219 		return;
220 	}
221 
222 	zfs_uio_bvec_init(&uio, bio, rq);
223 
224 	ssize_t start_resid = uio.uio_resid;
225 
226 	/*
227 	 * With use_blk_mq, accounting is done by blk_mq_start_request()
228 	 * and blk_mq_end_request(), so we can skip it here.
229 	 */
230 	if (bio) {
231 		acct = blk_queue_io_stat(q);
232 		if (acct) {
233 			start_time = blk_generic_start_io_acct(q, disk, WRITE,
234 			    bio);
235 		}
236 	}
237 
238 	boolean_t sync =
239 	    io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
240 
241 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
242 	    uio.uio_loffset, uio.uio_resid, RL_WRITER);
243 
244 	uint64_t volsize = zv->zv_volsize;
245 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
246 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
247 		uint64_t off = uio.uio_loffset;
248 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
249 
250 		if (bytes > volsize - off)	/* don't write past the end */
251 			bytes = volsize - off;
252 
253 		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
254 
255 		/* This will only fail for ENOSPC */
256 		error = dmu_tx_assign(tx, DMU_TX_WAIT);
257 		if (error) {
258 			dmu_tx_abort(tx);
259 			break;
260 		}
261 		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
262 		    DMU_READ_PREFETCH);
263 		if (error == 0) {
264 			zvol_log_write(zv, tx, off, bytes, sync);
265 		}
266 		dmu_tx_commit(tx);
267 
268 		if (error)
269 			break;
270 	}
271 	zfs_rangelock_exit(lr);
272 
273 	int64_t nwritten = start_resid - uio.uio_resid;
274 	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
275 	task_io_account_write(nwritten);
276 
277 	if (sync)
278 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
279 
280 	rw_exit(&zv->zv_suspend_lock);
281 
282 	if (bio && acct) {
283 		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
284 	}
285 
286 	zvol_end_io(bio, rq, -error);
287 }
288 
289 static void
290 zvol_write_task(void *arg)
291 {
292 	zv_request_task_t *task = arg;
293 	zvol_write(&task->zvr);
294 	zv_request_task_free(task);
295 }
296 
297 static void
298 zvol_discard(zv_request_t *zvr)
299 {
300 	struct bio *bio = zvr->bio;
301 	struct request *rq = zvr->rq;
302 	zvol_state_t *zv = zvr->zv;
303 	uint64_t start = io_offset(bio, rq);
304 	uint64_t size = io_size(bio, rq);
305 	uint64_t end = start + size;
306 	boolean_t sync;
307 	int error = 0;
308 	dmu_tx_t *tx;
309 	struct request_queue *q = zv->zv_zso->zvo_queue;
310 	struct gendisk *disk = zv->zv_zso->zvo_disk;
311 	unsigned long start_time = 0;
312 	boolean_t acct = B_FALSE;
313 
314 	ASSERT3P(zv, !=, NULL);
315 	ASSERT3U(zv->zv_open_count, >, 0);
316 	ASSERT3P(zv->zv_zilog, !=, NULL);
317 
318 	if (bio) {
319 		acct = blk_queue_io_stat(q);
320 		if (acct) {
321 			start_time = blk_generic_start_io_acct(q, disk, WRITE,
322 			    bio);
323 		}
324 	}
325 
326 	sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
327 
328 	if (end > zv->zv_volsize) {
329 		error = SET_ERROR(EIO);
330 		goto unlock;
331 	}
332 
333 	/*
334 	 * Align the request to volume block boundaries when a secure erase is
335 	 * not required.  This will prevent dnode_free_range() from zeroing out
336 	 * the unaligned parts which is slow (read-modify-write) and useless
337 	 * since we are not freeing any space by doing so.
338 	 */
339 	if (!io_is_secure_erase(bio, rq)) {
340 		start = P2ROUNDUP(start, zv->zv_volblocksize);
341 		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
342 		size = end - start;
343 	}
344 
345 	if (start >= end)
346 		goto unlock;
347 
348 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
349 	    start, size, RL_WRITER);
350 
351 	tx = dmu_tx_create(zv->zv_objset);
352 	dmu_tx_mark_netfree(tx);
353 	error = dmu_tx_assign(tx, DMU_TX_WAIT);
354 	if (error != 0) {
355 		dmu_tx_abort(tx);
356 	} else {
357 		zvol_log_truncate(zv, tx, start, size);
358 		dmu_tx_commit(tx);
359 		error = dmu_free_long_range(zv->zv_objset,
360 		    ZVOL_OBJ, start, size);
361 	}
362 	zfs_rangelock_exit(lr);
363 
364 	if (error == 0 && sync)
365 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
366 
367 unlock:
368 	rw_exit(&zv->zv_suspend_lock);
369 
370 	if (bio && acct) {
371 		blk_generic_end_io_acct(q, disk, WRITE, bio,
372 		    start_time);
373 	}
374 
375 	zvol_end_io(bio, rq, -error);
376 }
377 
378 static void
379 zvol_discard_task(void *arg)
380 {
381 	zv_request_task_t *task = arg;
382 	zvol_discard(&task->zvr);
383 	zv_request_task_free(task);
384 }
385 
386 static void
387 zvol_read(zv_request_t *zvr)
388 {
389 	struct bio *bio = zvr->bio;
390 	struct request *rq = zvr->rq;
391 	int error = 0;
392 	zfs_uio_t uio;
393 	boolean_t acct = B_FALSE;
394 	zvol_state_t *zv = zvr->zv;
395 	struct request_queue *q;
396 	struct gendisk *disk;
397 	unsigned long start_time = 0;
398 
399 	ASSERT3P(zv, !=, NULL);
400 	ASSERT3U(zv->zv_open_count, >, 0);
401 
402 	zfs_uio_bvec_init(&uio, bio, rq);
403 
404 	q = zv->zv_zso->zvo_queue;
405 	disk = zv->zv_zso->zvo_disk;
406 
407 	ssize_t start_resid = uio.uio_resid;
408 
409 	/*
410 	 * When blk-mq is being used, accounting is done by
411 	 * blk_mq_start_request() and blk_mq_end_request().
412 	 */
413 	if (bio) {
414 		acct = blk_queue_io_stat(q);
415 		if (acct)
416 			start_time = blk_generic_start_io_acct(q, disk, READ,
417 			    bio);
418 	}
419 
420 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
421 	    uio.uio_loffset, uio.uio_resid, RL_READER);
422 
423 	uint64_t volsize = zv->zv_volsize;
424 
425 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
426 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
427 
428 		/* don't read past the end */
429 		if (bytes > volsize - uio.uio_loffset)
430 			bytes = volsize - uio.uio_loffset;
431 
432 		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
433 		    DMU_READ_PREFETCH);
434 		if (error) {
435 			/* convert checksum errors into IO errors */
436 			if (error == ECKSUM)
437 				error = SET_ERROR(EIO);
438 			break;
439 		}
440 	}
441 	zfs_rangelock_exit(lr);
442 
443 	int64_t nread = start_resid - uio.uio_resid;
444 	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
445 	task_io_account_read(nread);
446 
447 	rw_exit(&zv->zv_suspend_lock);
448 
449 	if (bio && acct) {
450 		blk_generic_end_io_acct(q, disk, READ, bio, start_time);
451 	}
452 
453 	zvol_end_io(bio, rq, -error);
454 }
455 
456 static void
457 zvol_read_task(void *arg)
458 {
459 	zv_request_task_t *task = arg;
460 	zvol_read(&task->zvr);
461 	zv_request_task_free(task);
462 }
463 
464 
465 /*
466  * Process a BIO or request
467  *
468  * Either 'bio' or 'rq' should be set depending on if we are processing a
469  * bio or a request (both should not be set).
470  *
471  * force_sync:	Set to 0 to defer processing to a background taskq
472  *			Set to 1 to process data synchronously
473  */
474 static void
475 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
476     boolean_t force_sync)
477 {
478 	fstrans_cookie_t cookie = spl_fstrans_mark();
479 	uint64_t offset = io_offset(bio, rq);
480 	uint64_t size = io_size(bio, rq);
481 	int rw = io_data_dir(bio, rq);
482 
483 	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
484 		zvol_end_io(bio, rq, -SET_ERROR(ENXIO));
485 		goto out;
486 	}
487 
488 	if (zvol_request_sync || zv->zv_threading == B_FALSE)
489 		force_sync = 1;
490 
491 	zv_request_t zvr = {
492 		.zv = zv,
493 		.bio = bio,
494 		.rq = rq,
495 	};
496 
497 	if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
498 		printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
499 		    zv->zv_zso->zvo_disk->disk_name,
500 		    (long long unsigned)offset,
501 		    (long unsigned)size);
502 
503 		zvol_end_io(bio, rq, -SET_ERROR(EIO));
504 		goto out;
505 	}
506 
507 	zv_request_task_t *task;
508 	zv_taskq_t *ztqs = &zvol_taskqs;
509 	uint_t blk_mq_hw_queue = 0;
510 	uint_t tq_idx;
511 	uint_t taskq_hash;
512 	if (rq)
513 #ifdef HAVE_BLK_MQ_RQ_HCTX
514 		blk_mq_hw_queue = rq->mq_hctx->queue_num;
515 #else
516 		blk_mq_hw_queue =
517 		    rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
518 #endif
519 	taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
520 	    blk_mq_hw_queue);
521 	tq_idx = taskq_hash % ztqs->tqs_cnt;
522 
523 	if (rw == WRITE) {
524 		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
525 			zvol_end_io(bio, rq, -SET_ERROR(EROFS));
526 			goto out;
527 		}
528 
529 		/*
530 		 * Prevents the zvol from being suspended, or the ZIL being
531 		 * concurrently opened.  Will be released after the i/o
532 		 * completes.
533 		 */
534 		rw_enter(&zv->zv_suspend_lock, RW_READER);
535 
536 		/*
537 		 * Open a ZIL if this is the first time we have written to this
538 		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
539 		 * than zv_state_lock so that we don't need to acquire an
540 		 * additional lock in this path.
541 		 */
542 		if (zv->zv_zilog == NULL) {
543 			rw_exit(&zv->zv_suspend_lock);
544 			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
545 			if (zv->zv_zilog == NULL) {
546 				zv->zv_zilog = zil_open(zv->zv_objset,
547 				    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
548 				zv->zv_flags |= ZVOL_WRITTEN_TO;
549 				/* replay / destroy done in zvol_create_minor */
550 				VERIFY0((zv->zv_zilog->zl_header->zh_flags &
551 				    ZIL_REPLAY_NEEDED));
552 			}
553 			rw_downgrade(&zv->zv_suspend_lock);
554 		}
555 
556 		/*
557 		 * We don't want this thread to be blocked waiting for i/o to
558 		 * complete, so we instead wait from a taskq callback. The
559 		 * i/o may be a ZIL write (via zil_commit()), or a read of an
560 		 * indirect block, or a read of a data block (if this is a
561 		 * partial-block write).  We will indicate that the i/o is
562 		 * complete by calling END_IO() from the taskq callback.
563 		 *
564 		 * This design allows the calling thread to continue and
565 		 * initiate more concurrent operations by calling
566 		 * zvol_request() again. There are typically only a small
567 		 * number of threads available to call zvol_request() (e.g.
568 		 * one per iSCSI target), so keeping the latency of
569 		 * zvol_request() low is important for performance.
570 		 *
571 		 * The zvol_request_sync module parameter allows this
572 		 * behavior to be altered, for performance evaluation
573 		 * purposes.  If the callback blocks, setting
574 		 * zvol_request_sync=1 will result in much worse performance.
575 		 *
576 		 * We can have up to zvol_threads concurrent i/o's being
577 		 * processed for all zvols on the system.  This is typically
578 		 * a vast improvement over the zvol_request_sync=1 behavior
579 		 * of one i/o at a time per zvol.  However, an even better
580 		 * design would be for zvol_request() to initiate the zio
581 		 * directly, and then be notified by the zio_done callback,
582 		 * which would call END_IO().  Unfortunately, the DMU/ZIL
583 		 * interfaces lack this functionality (they block waiting for
584 		 * the i/o to complete).
585 		 */
586 		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
587 			if (force_sync) {
588 				zvol_discard(&zvr);
589 			} else {
590 				task = zv_request_task_create(zvr);
591 				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
592 				    zvol_discard_task, task, 0, &task->ent);
593 			}
594 		} else {
595 			if (force_sync) {
596 				zvol_write(&zvr);
597 			} else {
598 				task = zv_request_task_create(zvr);
599 				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
600 				    zvol_write_task, task, 0, &task->ent);
601 			}
602 		}
603 	} else {
604 		/*
605 		 * The SCST driver, and possibly others, may issue READ I/Os
606 		 * with a length of zero bytes.  These empty I/Os contain no
607 		 * data and require no additional handling.
608 		 */
609 		if (size == 0) {
610 			zvol_end_io(bio, rq, 0);
611 			goto out;
612 		}
613 
614 		rw_enter(&zv->zv_suspend_lock, RW_READER);
615 
616 		/* See comment in WRITE case above. */
617 		if (force_sync) {
618 			zvol_read(&zvr);
619 		} else {
620 			task = zv_request_task_create(zvr);
621 			taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
622 			    zvol_read_task, task, 0, &task->ent);
623 		}
624 	}
625 
626 out:
627 	spl_fstrans_unmark(cookie);
628 }
629 
630 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
631 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
632 static void
633 zvol_submit_bio(struct bio *bio)
634 #else
635 static blk_qc_t
636 zvol_submit_bio(struct bio *bio)
637 #endif
638 #else
639 static MAKE_REQUEST_FN_RET
640 zvol_request(struct request_queue *q, struct bio *bio)
641 #endif
642 {
643 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
644 #if defined(HAVE_BIO_BDEV_DISK)
645 	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
646 #else
647 	struct request_queue *q = bio->bi_disk->queue;
648 #endif
649 #endif
650 	zvol_state_t *zv = q->queuedata;
651 
652 	zvol_request_impl(zv, bio, NULL, 0);
653 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
654 	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
655 	!defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
656 	return (BLK_QC_T_NONE);
657 #endif
658 }
659 
660 static int
661 #ifdef HAVE_BLK_MODE_T
662 zvol_open(struct gendisk *disk, blk_mode_t flag)
663 #else
664 zvol_open(struct block_device *bdev, fmode_t flag)
665 #endif
666 {
667 	zvol_state_t *zv;
668 	int error = 0;
669 	boolean_t drop_suspend = B_FALSE;
670 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
671 	hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
672 	hrtime_t start = gethrtime();
673 
674 retry:
675 #endif
676 	rw_enter(&zvol_state_lock, RW_READER);
677 	/*
678 	 * Obtain a copy of private_data under the zvol_state_lock to make
679 	 * sure that either the result of zvol free code path setting
680 	 * disk->private_data to NULL is observed, or zvol_os_free()
681 	 * is not called on this zv because of the positive zv_open_count.
682 	 */
683 #ifdef HAVE_BLK_MODE_T
684 	zv = disk->private_data;
685 #else
686 	zv = bdev->bd_disk->private_data;
687 #endif
688 	if (zv == NULL) {
689 		rw_exit(&zvol_state_lock);
690 		return (-SET_ERROR(ENXIO));
691 	}
692 
693 	mutex_enter(&zv->zv_state_lock);
694 
695 	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
696 		mutex_exit(&zv->zv_state_lock);
697 		rw_exit(&zvol_state_lock);
698 		return (-SET_ERROR(ENXIO));
699 	}
700 
701 	/*
702 	 * Make sure zvol is not suspended during first open
703 	 * (hold zv_suspend_lock) and respect proper lock acquisition
704 	 * ordering - zv_suspend_lock before zv_state_lock
705 	 */
706 	if (zv->zv_open_count == 0) {
707 		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
708 			mutex_exit(&zv->zv_state_lock);
709 			rw_enter(&zv->zv_suspend_lock, RW_READER);
710 			mutex_enter(&zv->zv_state_lock);
711 			/* check to see if zv_suspend_lock is needed */
712 			if (zv->zv_open_count != 0) {
713 				rw_exit(&zv->zv_suspend_lock);
714 			} else {
715 				drop_suspend = B_TRUE;
716 			}
717 		} else {
718 			drop_suspend = B_TRUE;
719 		}
720 	}
721 	rw_exit(&zvol_state_lock);
722 
723 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
724 
725 	if (zv->zv_open_count == 0) {
726 		boolean_t drop_namespace = B_FALSE;
727 
728 		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
729 
730 		/*
731 		 * In all other call paths the spa_namespace_lock is taken
732 		 * before the bdev->bd_mutex lock.  However, on open(2)
733 		 * the __blkdev_get() function calls fops->open() with the
734 		 * bdev->bd_mutex lock held.  This can result in a deadlock
735 		 * when zvols from one pool are used as vdevs in another.
736 		 *
737 		 * To prevent a lock inversion deadlock we preemptively
738 		 * take the spa_namespace_lock.  Normally the lock will not
739 		 * be contended and this is safe because spa_open_common()
740 		 * handles the case where the caller already holds the
741 		 * spa_namespace_lock.
742 		 *
743 		 * When the lock cannot be aquired after multiple retries
744 		 * this must be the vdev on zvol deadlock case and we have
745 		 * no choice but to return an error.  For 5.12 and older
746 		 * kernels returning -ERESTARTSYS will result in the
747 		 * bdev->bd_mutex being dropped, then reacquired, and
748 		 * fops->open() being called again.  This process can be
749 		 * repeated safely until both locks are acquired.  For 5.13
750 		 * and newer the -ERESTARTSYS retry logic was removed from
751 		 * the kernel so the only option is to return the error for
752 		 * the caller to handle it.
753 		 */
754 		if (!mutex_owned(&spa_namespace_lock)) {
755 			if (!mutex_tryenter(&spa_namespace_lock)) {
756 				mutex_exit(&zv->zv_state_lock);
757 				rw_exit(&zv->zv_suspend_lock);
758 				drop_suspend = B_FALSE;
759 
760 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
761 				schedule();
762 				return (-SET_ERROR(ERESTARTSYS));
763 #else
764 				if ((gethrtime() - start) > timeout)
765 					return (-SET_ERROR(ERESTARTSYS));
766 
767 				schedule_timeout_interruptible(
768 					MSEC_TO_TICK(10));
769 				goto retry;
770 #endif
771 			} else {
772 				drop_namespace = B_TRUE;
773 			}
774 		}
775 
776 		error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));
777 
778 		if (drop_namespace)
779 			mutex_exit(&spa_namespace_lock);
780 	}
781 
782 	if (error == 0) {
783 		if ((blk_mode_is_open_write(flag)) &&
784 		    (zv->zv_flags & ZVOL_RDONLY)) {
785 			if (zv->zv_open_count == 0)
786 				zvol_last_close(zv);
787 
788 			error = -SET_ERROR(EROFS);
789 		} else {
790 			zv->zv_open_count++;
791 		}
792 	}
793 
794 	mutex_exit(&zv->zv_state_lock);
795 	if (drop_suspend)
796 		rw_exit(&zv->zv_suspend_lock);
797 
798 	if (error == 0)
799 #ifdef HAVE_BLK_MODE_T
800 		disk_check_media_change(disk);
801 #else
802 		zfs_check_media_change(bdev);
803 #endif
804 
805 	return (error);
806 }
807 
808 static void
809 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
810 zvol_release(struct gendisk *disk)
811 #else
812 zvol_release(struct gendisk *disk, fmode_t unused)
813 #endif
814 {
815 #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
816 	(void) unused;
817 #endif
818 	zvol_state_t *zv;
819 	boolean_t drop_suspend = B_TRUE;
820 
821 	rw_enter(&zvol_state_lock, RW_READER);
822 	zv = disk->private_data;
823 
824 	mutex_enter(&zv->zv_state_lock);
825 	ASSERT3U(zv->zv_open_count, >, 0);
826 	/*
827 	 * make sure zvol is not suspended during last close
828 	 * (hold zv_suspend_lock) and respect proper lock acquisition
829 	 * ordering - zv_suspend_lock before zv_state_lock
830 	 */
831 	if (zv->zv_open_count == 1) {
832 		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
833 			mutex_exit(&zv->zv_state_lock);
834 			rw_enter(&zv->zv_suspend_lock, RW_READER);
835 			mutex_enter(&zv->zv_state_lock);
836 			/* check to see if zv_suspend_lock is needed */
837 			if (zv->zv_open_count != 1) {
838 				rw_exit(&zv->zv_suspend_lock);
839 				drop_suspend = B_FALSE;
840 			}
841 		}
842 	} else {
843 		drop_suspend = B_FALSE;
844 	}
845 	rw_exit(&zvol_state_lock);
846 
847 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
848 
849 	zv->zv_open_count--;
850 	if (zv->zv_open_count == 0) {
851 		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
852 		zvol_last_close(zv);
853 	}
854 
855 	mutex_exit(&zv->zv_state_lock);
856 
857 	if (drop_suspend)
858 		rw_exit(&zv->zv_suspend_lock);
859 }
860 
861 static int
862 zvol_ioctl(struct block_device *bdev, fmode_t mode,
863     unsigned int cmd, unsigned long arg)
864 {
865 	zvol_state_t *zv = bdev->bd_disk->private_data;
866 	int error = 0;
867 
868 	ASSERT3U(zv->zv_open_count, >, 0);
869 
870 	switch (cmd) {
871 	case BLKFLSBUF:
872 #ifdef HAVE_FSYNC_BDEV
873 		fsync_bdev(bdev);
874 #elif defined(HAVE_SYNC_BLOCKDEV)
875 		sync_blockdev(bdev);
876 #else
877 #error "Neither fsync_bdev() nor sync_blockdev() found"
878 #endif
879 		invalidate_bdev(bdev);
880 		rw_enter(&zv->zv_suspend_lock, RW_READER);
881 
882 		if (!(zv->zv_flags & ZVOL_RDONLY))
883 			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
884 
885 		rw_exit(&zv->zv_suspend_lock);
886 		break;
887 
888 	case BLKZNAME:
889 		mutex_enter(&zv->zv_state_lock);
890 		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
891 		mutex_exit(&zv->zv_state_lock);
892 		break;
893 
894 	default:
895 		error = -ENOTTY;
896 		break;
897 	}
898 
899 	return (SET_ERROR(error));
900 }
901 
902 #ifdef CONFIG_COMPAT
903 static int
904 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
905     unsigned cmd, unsigned long arg)
906 {
907 	return (zvol_ioctl(bdev, mode, cmd, arg));
908 }
909 #else
910 #define	zvol_compat_ioctl	NULL
911 #endif
912 
913 static unsigned int
914 zvol_check_events(struct gendisk *disk, unsigned int clearing)
915 {
916 	unsigned int mask = 0;
917 
918 	rw_enter(&zvol_state_lock, RW_READER);
919 
920 	zvol_state_t *zv = disk->private_data;
921 	if (zv != NULL) {
922 		mutex_enter(&zv->zv_state_lock);
923 		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
924 		zv->zv_changed = 0;
925 		mutex_exit(&zv->zv_state_lock);
926 	}
927 
928 	rw_exit(&zvol_state_lock);
929 
930 	return (mask);
931 }
932 
933 static int
934 zvol_revalidate_disk(struct gendisk *disk)
935 {
936 	rw_enter(&zvol_state_lock, RW_READER);
937 
938 	zvol_state_t *zv = disk->private_data;
939 	if (zv != NULL) {
940 		mutex_enter(&zv->zv_state_lock);
941 		set_capacity(zv->zv_zso->zvo_disk,
942 		    zv->zv_volsize >> SECTOR_BITS);
943 		mutex_exit(&zv->zv_state_lock);
944 	}
945 
946 	rw_exit(&zvol_state_lock);
947 
948 	return (0);
949 }
950 
951 int
952 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
953 {
954 	struct gendisk *disk = zv->zv_zso->zvo_disk;
955 
956 #if defined(HAVE_REVALIDATE_DISK_SIZE)
957 	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
958 #elif defined(HAVE_REVALIDATE_DISK)
959 	revalidate_disk(disk);
960 #else
961 	zvol_revalidate_disk(disk);
962 #endif
963 	return (0);
964 }
965 
966 void
967 zvol_os_clear_private(zvol_state_t *zv)
968 {
969 	/*
970 	 * Cleared while holding zvol_state_lock as a writer
971 	 * which will prevent zvol_open() from opening it.
972 	 */
973 	zv->zv_zso->zvo_disk->private_data = NULL;
974 }
975 
976 /*
977  * Provide a simple virtual geometry for legacy compatibility.  For devices
978  * smaller than 1 MiB a small head and sector count is used to allow very
979  * tiny devices.  For devices over 1 Mib a standard head and sector count
980  * is used to keep the cylinders count reasonable.
981  */
982 static int
983 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
984 {
985 	zvol_state_t *zv = bdev->bd_disk->private_data;
986 	sector_t sectors;
987 
988 	ASSERT3U(zv->zv_open_count, >, 0);
989 
990 	sectors = get_capacity(zv->zv_zso->zvo_disk);
991 
992 	if (sectors > 2048) {
993 		geo->heads = 16;
994 		geo->sectors = 63;
995 	} else {
996 		geo->heads = 2;
997 		geo->sectors = 4;
998 	}
999 
1000 	geo->start = 0;
1001 	geo->cylinders = sectors / (geo->heads * geo->sectors);
1002 
1003 	return (0);
1004 }
1005 
1006 /*
1007  * Why have two separate block_device_operations structs?
1008  *
1009  * Normally we'd just have one, and assign 'submit_bio' as needed.  However,
1010  * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
1011  * can't just change submit_bio dynamically at runtime.  So just create two
1012  * separate structs to get around this.
1013  */
1014 static const struct block_device_operations zvol_ops_blk_mq = {
1015 	.open			= zvol_open,
1016 	.release		= zvol_release,
1017 	.ioctl			= zvol_ioctl,
1018 	.compat_ioctl		= zvol_compat_ioctl,
1019 	.check_events		= zvol_check_events,
1020 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
1021 	.revalidate_disk	= zvol_revalidate_disk,
1022 #endif
1023 	.getgeo			= zvol_getgeo,
1024 	.owner			= THIS_MODULE,
1025 };
1026 
1027 static const struct block_device_operations zvol_ops = {
1028 	.open			= zvol_open,
1029 	.release		= zvol_release,
1030 	.ioctl			= zvol_ioctl,
1031 	.compat_ioctl		= zvol_compat_ioctl,
1032 	.check_events		= zvol_check_events,
1033 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
1034 	.revalidate_disk	= zvol_revalidate_disk,
1035 #endif
1036 	.getgeo			= zvol_getgeo,
1037 	.owner			= THIS_MODULE,
1038 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
1039 	.submit_bio		= zvol_submit_bio,
1040 #endif
1041 };
1042 
1043 /*
1044  * Since 6.9, Linux has been removing queue limit setters in favour of an
1045  * initial queue_limits struct applied when the device is open. Since 6.11,
1046  * queue_limits is being extended to allow more things to be applied when the
1047  * device is open. Setters are also being removed for this.
1048  *
1049  * For OpenZFS, this means that depending on kernel version, some options may
1050  * be set up before the device is open, and some applied to an open device
1051  * (queue) after the fact.
1052  *
1053  * We manage this complexity by having our own limits struct,
1054  * zvol_queue_limits_t, in which we carry any queue config that we're
1055  * interested in setting. This structure is the same on all kernels.
1056  *
1057  * These limits are then applied to the queue at device open time by the most
1058  * appropriate method for the kernel.
1059  *
1060  * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
1061  * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
1062  * struct queue_limits, and passes it in. Any fields added in later kernels are
1063  * (obviously) not set up here.
1064  *
1065  * zvol_queue_limits_apply() is called on all kernel versions after the queue
1066  * is created, and applies any remaining config. Before 6.9 that will be
1067  * everything, via setter methods. After 6.9 that will be whatever couldn't be
1068  * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
1069  * will always be a no-op on the latest kernel we support).
1070  */
1071 typedef struct zvol_queue_limits {
1072 	unsigned int	zql_max_hw_sectors;
1073 	unsigned short	zql_max_segments;
1074 	unsigned int	zql_max_segment_size;
1075 	unsigned int	zql_io_opt;
1076 	unsigned int	zql_physical_block_size;
1077 	unsigned int	zql_max_discard_sectors;
1078 	unsigned int	zql_discard_granularity;
1079 } zvol_queue_limits_t;
1080 
1081 static void
1082 zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
1083     boolean_t use_blk_mq)
1084 {
1085 	limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;
1086 
1087 	if (use_blk_mq) {
1088 		/*
1089 		 * IO requests can be really big (1MB).  When an IO request
1090 		 * comes in, it is passed off to zvol_read() or zvol_write()
1091 		 * in a new thread, where it is chunked up into 'volblocksize'
1092 		 * sized pieces and processed.  So for example, if the request
1093 		 * is a 1MB write and your volblocksize is 128k, one zvol_write
1094 		 * thread will take that request and sequentially do ten 128k
1095 		 * IOs.  This is due to the fact that the thread needs to lock
1096 		 * each volblocksize sized block.  So you might be wondering:
1097 		 * "instead of passing the whole 1MB request to one thread,
1098 		 * why not pass ten individual 128k chunks to ten threads and
1099 		 * process the whole write in parallel?"  The short answer is
1100 		 * that there's a sweet spot number of chunks that balances
1101 		 * the greater parallelism with the added overhead of more
1102 		 * threads. The sweet spot can be different depending on if you
1103 		 * have a read or write  heavy workload.  Writes typically want
1104 		 * high chunk counts while reads typically want lower ones.  On
1105 		 * a test pool with 6 NVMe drives in a 3x 2-disk mirror
1106 		 * configuration, with volblocksize=8k, the sweet spot for good
1107 		 * sequential reads and writes was at 8 chunks.
1108 		 */
1109 
1110 		/*
1111 		 * Below we tell the kernel how big we want our requests
1112 		 * to be.  You would think that blk_queue_io_opt() would be
1113 		 * used to do this since it is used to "set optimal request
1114 		 * size for the queue", but that doesn't seem to do
1115 		 * anything - the kernel still gives you huge requests
1116 		 * with tons of little PAGE_SIZE segments contained within it.
1117 		 *
1118 		 * Knowing that the kernel will just give you PAGE_SIZE segments
1119 		 * no matter what, you can say "ok, I want PAGE_SIZE byte
1120 		 * segments, and I want 'N' of them per request", where N is
1121 		 * the correct number of segments for the volblocksize and
1122 		 * number of chunks you want.
1123 		 */
1124 		if (zvol_blk_mq_blocks_per_thread != 0) {
1125 			unsigned int chunks;
1126 			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
1127 
1128 			limits->zql_max_segment_size = PAGE_SIZE;
1129 			limits->zql_max_segments =
1130 			    (zv->zv_volblocksize * chunks) / PAGE_SIZE;
1131 		} else {
1132 			/*
1133 			 * Special case: zvol_blk_mq_blocks_per_thread = 0
1134 			 * Max everything out.
1135 			 */
1136 			limits->zql_max_segments = UINT16_MAX;
1137 			limits->zql_max_segment_size = UINT_MAX;
1138 		}
1139 	} else {
1140 		limits->zql_max_segments = UINT16_MAX;
1141 		limits->zql_max_segment_size = UINT_MAX;
1142 	}
1143 
1144 	limits->zql_io_opt = DMU_MAX_ACCESS / 2;
1145 
1146 	limits->zql_physical_block_size = zv->zv_volblocksize;
1147 	limits->zql_max_discard_sectors =
1148 	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
1149 	limits->zql_discard_granularity = zv->zv_volblocksize;
1150 }
1151 
1152 #ifdef HAVE_BLK_ALLOC_DISK_2ARG
1153 static void
1154 zvol_queue_limits_convert(zvol_queue_limits_t *limits,
1155     struct queue_limits *qlimits)
1156 {
1157 	memset(qlimits, 0, sizeof (struct queue_limits));
1158 	qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
1159 	qlimits->max_segments = limits->zql_max_segments;
1160 	qlimits->max_segment_size = limits->zql_max_segment_size;
1161 	qlimits->io_opt = limits->zql_io_opt;
1162 	qlimits->physical_block_size = limits->zql_physical_block_size;
1163 	qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
1164 	qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
1165 	qlimits->discard_granularity = limits->zql_discard_granularity;
1166 #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
1167 	qlimits->features =
1168 	    BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
1169 #endif
1170 }
1171 #endif
1172 
1173 static void
1174 zvol_queue_limits_apply(zvol_queue_limits_t *limits,
1175     struct request_queue *queue)
1176 {
1177 #ifndef HAVE_BLK_ALLOC_DISK_2ARG
1178 	blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
1179 	blk_queue_max_segments(queue, limits->zql_max_segments);
1180 	blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
1181 	blk_queue_io_opt(queue, limits->zql_io_opt);
1182 	blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
1183 	blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
1184 	blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
1185 #endif
1186 #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
1187 	blk_queue_set_write_cache(queue, B_TRUE);
1188 	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
1189 #endif
1190 }
1191 
1192 static int
1193 zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
1194 {
1195 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
1196 #if defined(HAVE_BLK_ALLOC_DISK)
1197 	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
1198 	if (zso->zvo_disk == NULL)
1199 		return (1);
1200 
1201 	zso->zvo_disk->minors = ZVOL_MINORS;
1202 	zso->zvo_queue = zso->zvo_disk->queue;
1203 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
1204 	struct queue_limits qlimits;
1205 	zvol_queue_limits_convert(limits, &qlimits);
1206 	struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
1207 	if (IS_ERR(disk)) {
1208 		zso->zvo_disk = NULL;
1209 		return (1);
1210 	}
1211 
1212 	zso->zvo_disk = disk;
1213 	zso->zvo_disk->minors = ZVOL_MINORS;
1214 	zso->zvo_queue = zso->zvo_disk->queue;
1215 
1216 #else
1217 	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
1218 	if (zso->zvo_queue == NULL)
1219 		return (1);
1220 
1221 	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
1222 	if (zso->zvo_disk == NULL) {
1223 		blk_cleanup_queue(zso->zvo_queue);
1224 		return (1);
1225 	}
1226 
1227 	zso->zvo_disk->queue = zso->zvo_queue;
1228 #endif /* HAVE_BLK_ALLOC_DISK */
1229 #else
1230 	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
1231 	if (zso->zvo_queue == NULL)
1232 		return (1);
1233 
1234 	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
1235 	if (zso->zvo_disk == NULL) {
1236 		blk_cleanup_queue(zso->zvo_queue);
1237 		return (1);
1238 	}
1239 
1240 	zso->zvo_disk->queue = zso->zvo_queue;
1241 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
1242 
1243 	zvol_queue_limits_apply(limits, zso->zvo_queue);
1244 
1245 	return (0);
1246 
1247 }
1248 
1249 static int
1250 zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
1251 {
1252 	struct zvol_state_os *zso = zv->zv_zso;
1253 
1254 	/* Allocate our blk-mq tag_set */
1255 	if (zvol_blk_mq_alloc_tag_set(zv) != 0)
1256 		return (1);
1257 
1258 #if defined(HAVE_BLK_ALLOC_DISK)
1259 	zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
1260 	if (zso->zvo_disk == NULL) {
1261 		blk_mq_free_tag_set(&zso->tag_set);
1262 		return (1);
1263 	}
1264 	zso->zvo_queue = zso->zvo_disk->queue;
1265 	zso->zvo_disk->minors = ZVOL_MINORS;
1266 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
1267 	struct queue_limits qlimits;
1268 	zvol_queue_limits_convert(limits, &qlimits);
1269 	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
1270 	if (IS_ERR(disk)) {
1271 		zso->zvo_disk = NULL;
1272 		blk_mq_free_tag_set(&zso->tag_set);
1273 		return (1);
1274 	}
1275 
1276 	zso->zvo_disk = disk;
1277 	zso->zvo_queue = zso->zvo_disk->queue;
1278 	zso->zvo_disk->minors = ZVOL_MINORS;
1279 #else
1280 	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
1281 	if (zso->zvo_disk == NULL) {
1282 		blk_cleanup_queue(zso->zvo_queue);
1283 		blk_mq_free_tag_set(&zso->tag_set);
1284 		return (1);
1285 	}
1286 	/* Allocate queue */
1287 	zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
1288 	if (IS_ERR(zso->zvo_queue)) {
1289 		blk_mq_free_tag_set(&zso->tag_set);
1290 		return (1);
1291 	}
1292 
1293 	/* Our queue is now created, assign it to our disk */
1294 	zso->zvo_disk->queue = zso->zvo_queue;
1295 #endif
1296 
1297 	zvol_queue_limits_apply(limits, zso->zvo_queue);
1298 
1299 	return (0);
1300 }
1301 
1302 /*
1303  * Allocate memory for a new zvol_state_t and setup the required
1304  * request queue and generic disk structures for the block device.
1305  */
1306 static zvol_state_t *
1307 zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
1308 {
1309 	zvol_state_t *zv;
1310 	struct zvol_state_os *zso;
1311 	uint64_t volmode;
1312 	int ret;
1313 
1314 	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
1315 		return (NULL);
1316 
1317 	if (volmode == ZFS_VOLMODE_DEFAULT)
1318 		volmode = zvol_volmode;
1319 
1320 	if (volmode == ZFS_VOLMODE_NONE)
1321 		return (NULL);
1322 
1323 	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
1324 	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
1325 	zv->zv_zso = zso;
1326 	zv->zv_volmode = volmode;
1327 	zv->zv_volblocksize = volblocksize;
1328 
1329 	list_link_init(&zv->zv_next);
1330 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
1331 	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
1332 
1333 	zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
1334 
1335 	zvol_queue_limits_t limits;
1336 	zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);
1337 
1338 	/*
1339 	 * The block layer has 3 interfaces for getting BIOs:
1340 	 *
1341 	 * 1. blk-mq request queues (new)
1342 	 * 2. submit_bio() (oldest)
1343 	 * 3. regular request queues (old).
1344 	 *
1345 	 * Each of those interfaces has two permutations:
1346 	 *
1347 	 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
1348 	 *    both the disk and its queue (5.14 kernel or newer)
1349 	 *
1350 	 * b) We don't have blk_*alloc_disk(), and have to allocate the
1351 	 *    disk and the queue separately. (5.13 kernel or older)
1352 	 */
1353 	if (zv->zv_zso->use_blk_mq) {
1354 		ret = zvol_alloc_blk_mq(zv, &limits);
1355 		if (ret != 0)
1356 			goto out_kmem;
1357 		zso->zvo_disk->fops = &zvol_ops_blk_mq;
1358 	} else {
1359 		ret = zvol_alloc_non_blk_mq(zso, &limits);
1360 		if (ret != 0)
1361 			goto out_kmem;
1362 		zso->zvo_disk->fops = &zvol_ops;
1363 	}
1364 
1365 	/* Limit read-ahead to a single page to prevent over-prefetching. */
1366 	blk_queue_set_read_ahead(zso->zvo_queue, 1);
1367 
1368 	if (!zv->zv_zso->use_blk_mq) {
1369 		/* Disable write merging in favor of the ZIO pipeline. */
1370 		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
1371 	}
1372 
1373 	zso->zvo_queue->queuedata = zv;
1374 	zso->zvo_dev = dev;
1375 	zv->zv_open_count = 0;
1376 	strlcpy(zv->zv_name, name, sizeof (zv->zv_name));
1377 
1378 	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
1379 	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
1380 
1381 	zso->zvo_disk->major = zvol_major;
1382 	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
1383 
1384 	/*
1385 	 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
1386 	 * This is accomplished by limiting the number of minors for the
1387 	 * device to one and explicitly disabling partition scanning.
1388 	 */
1389 	if (volmode == ZFS_VOLMODE_DEV) {
1390 		zso->zvo_disk->minors = 1;
1391 		zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
1392 		zso->zvo_disk->flags |= GENHD_FL_NO_PART;
1393 	}
1394 
1395 	zso->zvo_disk->first_minor = (dev & MINORMASK);
1396 	zso->zvo_disk->private_data = zv;
1397 	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
1398 	    ZVOL_DEV_NAME, (dev & MINORMASK));
1399 
1400 	return (zv);
1401 
1402 out_kmem:
1403 	kmem_free(zso, sizeof (struct zvol_state_os));
1404 	kmem_free(zv, sizeof (zvol_state_t));
1405 	return (NULL);
1406 }
1407 
1408 /*
1409  * Cleanup then free a zvol_state_t which was created by zvol_alloc().
1410  * At this time, the structure is not opened by anyone, is taken off
1411  * the zvol_state_list, and has its private data set to NULL.
1412  * The zvol_state_lock is dropped.
1413  *
1414  * This function may take many milliseconds to complete (e.g. we've seen
1415  * it take over 256ms), due to the calls to "blk_cleanup_queue" and
1416  * "del_gendisk". Thus, consumers need to be careful to account for this
1417  * latency when calling this function.
1418  */
1419 void
1420 zvol_os_free(zvol_state_t *zv)
1421 {
1422 
1423 	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1424 	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
1425 	ASSERT0(zv->zv_open_count);
1426 	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
1427 
1428 	rw_destroy(&zv->zv_suspend_lock);
1429 	zfs_rangelock_fini(&zv->zv_rangelock);
1430 
1431 	del_gendisk(zv->zv_zso->zvo_disk);
1432 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
1433 	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
1434 #if defined(HAVE_BLK_CLEANUP_DISK)
1435 	blk_cleanup_disk(zv->zv_zso->zvo_disk);
1436 #else
1437 	put_disk(zv->zv_zso->zvo_disk);
1438 #endif
1439 #else
1440 	blk_cleanup_queue(zv->zv_zso->zvo_queue);
1441 	put_disk(zv->zv_zso->zvo_disk);
1442 #endif
1443 
1444 	if (zv->zv_zso->use_blk_mq)
1445 		blk_mq_free_tag_set(&zv->zv_zso->tag_set);
1446 
1447 	ida_simple_remove(&zvol_ida,
1448 	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
1449 
1450 	cv_destroy(&zv->zv_removing_cv);
1451 	mutex_destroy(&zv->zv_state_lock);
1452 	dataset_kstats_destroy(&zv->zv_kstat);
1453 
1454 	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
1455 	kmem_free(zv, sizeof (zvol_state_t));
1456 }
1457 
1458 void
1459 zvol_wait_close(zvol_state_t *zv)
1460 {
1461 }
1462 
1463 struct add_disk_work {
1464 	struct delayed_work work;
1465 	struct gendisk *disk;
1466 	int error;
1467 };
1468 
1469 static int
1470 __zvol_os_add_disk(struct gendisk *disk)
1471 {
1472 	int error = 0;
1473 #ifdef HAVE_ADD_DISK_RET
1474 	error = add_disk(disk);
1475 #else
1476 	add_disk(disk);
1477 #endif
1478 	return (error);
1479 }
1480 
1481 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
1482 static void
1483 zvol_os_add_disk_work(struct work_struct *work)
1484 {
1485 	struct add_disk_work *add_disk_work;
1486 	add_disk_work = container_of(work, struct add_disk_work, work.work);
1487 	add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk);
1488 }
1489 #endif
1490 
1491 /*
1492  * SPECIAL CASE:
1493  *
1494  * This function basically calls add_disk() from a workqueue.   You may be
1495  * thinking: why not just call add_disk() directly?
1496  *
1497  * When you call add_disk(), the zvol appears to the world.  When this happens,
1498  * the kernel calls disk_scan_partitions() on the zvol, which behaves
1499  * differently on the 6.9+ kernels:
1500  *
1501  * - 6.8 and older kernels -
1502  * disk_scan_partitions()
1503  *	handle = bdev_open_by_dev(
1504  *		zvol_open()
1505  *	bdev_release(handle);
1506  *		zvol_release()
1507  *
1508  *
1509  * - 6.9+ kernels -
1510  * disk_scan_partitions()
1511  * 	file = bdev_file_open_by_dev()
1512  *		zvol_open()
1513  *	fput(file)
1514  *	< wait for return to userspace >
1515  *		zvol_release()
1516  *
1517  * The difference is that the bdev_release() from the 6.8 kernel is synchronous
1518  * while the fput() from the 6.9 kernel is async.  Or more specifically it's
1519  * async that has to wait until we return to userspace (since it adds the fput
1520  * into the caller's work queue with the TWA_RESUME flag set).  This is not the
1521  * behavior we want, since we want do things like create+destroy a zvol within
1522  * a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the
1523  * reference to the zvol while we're in the IOCTL, which can't wait until we
1524  * return to userspace.
1525  *
1526  * We can get around this since fput() has a special codepath for when it's
1527  * running in a kernel thread or interrupt.  In those cases, it just puts the
1528  * fput into the system workqueue, which we can force to run with
1529  * __flush_workqueue().  That is why we call add_disk() from a workqueue - so it
1530  * run from a kernel thread and "tricks" the fput() codepaths.
1531  *
1532  * Note that __flush_workqueue() is slowly getting deprecated.  This may be ok
1533  * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
1534  * fput) to happen, which it eventually, naturally, will from the system_wq
1535  * without us explicitly calling __flush_workqueue().
1536  */
1537 static int
1538 zvol_os_add_disk(struct gendisk *disk)
1539 {
1540 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)	/* 6.9+ kernel */
1541 	struct add_disk_work add_disk_work;
1542 
1543 	INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
1544 	add_disk_work.disk = disk;
1545 	add_disk_work.error = 0;
1546 
1547 	/* Use *_delayed_work functions since they're not GPL'd */
1548 	schedule_delayed_work(&add_disk_work.work, 0);
1549 	flush_delayed_work(&add_disk_work.work);
1550 
1551 	__flush_workqueue(system_wq);
1552 	return (add_disk_work.error);
1553 #else	/* <= 6.8 kernel */
1554 	return (__zvol_os_add_disk(disk));
1555 #endif
1556 }
1557 
1558 /*
1559  * Create a block device minor node and setup the linkage between it
1560  * and the specified volume.  Once this function returns the block
1561  * device is live and ready for use.
1562  */
1563 int
1564 zvol_os_create_minor(const char *name)
1565 {
1566 	zvol_state_t *zv;
1567 	objset_t *os;
1568 	dmu_object_info_t *doi;
1569 	uint64_t volsize;
1570 	uint64_t len;
1571 	unsigned minor = 0;
1572 	int error = 0;
1573 	int idx;
1574 	uint64_t hash = zvol_name_hash(name);
1575 	uint64_t volthreading;
1576 	bool replayed_zil = B_FALSE;
1577 
1578 	if (zvol_inhibit_dev)
1579 		return (0);
1580 
1581 	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
1582 	if (idx < 0)
1583 		return (SET_ERROR(-idx));
1584 	minor = idx << ZVOL_MINOR_BITS;
1585 	if (MINOR(minor) != minor) {
1586 		/* too many partitions can cause an overflow */
1587 		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
1588 		    name, minor, MINOR(minor));
1589 		ida_simple_remove(&zvol_ida, idx);
1590 		return (SET_ERROR(EINVAL));
1591 	}
1592 
1593 	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
1594 	if (zv) {
1595 		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1596 		mutex_exit(&zv->zv_state_lock);
1597 		ida_simple_remove(&zvol_ida, idx);
1598 		return (SET_ERROR(EEXIST));
1599 	}
1600 
1601 	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1602 
1603 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
1604 	if (error)
1605 		goto out_doi;
1606 
1607 	error = dmu_object_info(os, ZVOL_OBJ, doi);
1608 	if (error)
1609 		goto out_dmu_objset_disown;
1610 
1611 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1612 	if (error)
1613 		goto out_dmu_objset_disown;
1614 
1615 	zv = zvol_alloc(MKDEV(zvol_major, minor), name,
1616 	    doi->doi_data_block_size);
1617 	if (zv == NULL) {
1618 		error = SET_ERROR(EAGAIN);
1619 		goto out_dmu_objset_disown;
1620 	}
1621 	zv->zv_hash = hash;
1622 
1623 	if (dmu_objset_is_snapshot(os))
1624 		zv->zv_flags |= ZVOL_RDONLY;
1625 
1626 	zv->zv_volsize = volsize;
1627 	zv->zv_objset = os;
1628 
1629 	/* Default */
1630 	zv->zv_threading = B_TRUE;
1631 	if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
1632 	    == 0)
1633 		zv->zv_threading = volthreading;
1634 
1635 	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
1636 
1637 #ifdef QUEUE_FLAG_DISCARD
1638 	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
1639 #endif
1640 #ifdef QUEUE_FLAG_NONROT
1641 	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
1642 #endif
1643 #ifdef QUEUE_FLAG_ADD_RANDOM
1644 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
1645 #endif
1646 	/* This flag was introduced in kernel version 4.12. */
1647 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
1648 	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
1649 #endif
1650 
1651 	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
1652 	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1653 	if (error)
1654 		goto out_dmu_objset_disown;
1655 	ASSERT3P(zv->zv_zilog, ==, NULL);
1656 	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1657 	if (spa_writeable(dmu_objset_spa(os))) {
1658 		if (zil_replay_disable)
1659 			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
1660 		else
1661 			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1662 	}
1663 	if (replayed_zil)
1664 		zil_close(zv->zv_zilog);
1665 	zv->zv_zilog = NULL;
1666 
1667 	/*
1668 	 * When udev detects the addition of the device it will immediately
1669 	 * invoke blkid(8) to determine the type of content on the device.
1670 	 * Prefetching the blocks commonly scanned by blkid(8) will speed
1671 	 * up this process.
1672 	 */
1673 	len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
1674 	if (len > 0) {
1675 		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
1676 		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
1677 		    ZIO_PRIORITY_SYNC_READ);
1678 	}
1679 
1680 	zv->zv_objset = NULL;
1681 out_dmu_objset_disown:
1682 	dmu_objset_disown(os, B_TRUE, FTAG);
1683 out_doi:
1684 	kmem_free(doi, sizeof (dmu_object_info_t));
1685 
1686 	/*
1687 	 * Keep in mind that once add_disk() is called, the zvol is
1688 	 * announced to the world, and zvol_open()/zvol_release() can
1689 	 * be called at any time. Incidentally, add_disk() itself calls
1690 	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
1691 	 * directly as well.
1692 	 */
1693 	if (error == 0) {
1694 		rw_enter(&zvol_state_lock, RW_WRITER);
1695 		zvol_insert(zv);
1696 		rw_exit(&zvol_state_lock);
1697 		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
1698 	} else {
1699 		ida_simple_remove(&zvol_ida, idx);
1700 	}
1701 
1702 	return (error);
1703 }
1704 
1705 void
1706 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
1707 {
1708 	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
1709 
1710 	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1711 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1712 
1713 	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1714 
1715 	/* move to new hashtable entry  */
1716 	zv->zv_hash = zvol_name_hash(newname);
1717 	hlist_del(&zv->zv_hlink);
1718 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1719 
1720 	/*
1721 	 * The block device's read-only state is briefly changed causing
1722 	 * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
1723 	 * the name change and fixes the symlinks.  This does not change
1724 	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
1725 	 * changes.  This would normally be done using kobject_uevent() but
1726 	 * that is a GPL-only symbol which is why we need this workaround.
1727 	 */
1728 	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
1729 	set_disk_ro(zv->zv_zso->zvo_disk, readonly);
1730 
1731 	dataset_kstats_rename(&zv->zv_kstat, newname);
1732 }
1733 
1734 void
1735 zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
1736 {
1737 
1738 	set_disk_ro(zv->zv_zso->zvo_disk, flags);
1739 }
1740 
1741 void
1742 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
1743 {
1744 
1745 	set_capacity(zv->zv_zso->zvo_disk, capacity);
1746 }
1747 
1748 int
1749 zvol_init(void)
1750 {
1751 	int error;
1752 
1753 	error = zvol_init_impl();
1754 	if (error) {
1755 		printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error);
1756 		return (error);
1757 	}
1758 
1759 	error = register_blkdev(zvol_major, ZVOL_DRIVER);
1760 	if (error) {
1761 		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
1762 		return (error);
1763 	}
1764 
1765 	if (zvol_blk_mq_queue_depth == 0) {
1766 		zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
1767 	} else {
1768 		zvol_actual_blk_mq_queue_depth =
1769 		    MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
1770 	}
1771 
1772 	if (zvol_blk_mq_threads == 0) {
1773 		zvol_blk_mq_actual_threads = num_online_cpus();
1774 	} else {
1775 		zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
1776 		    1024);
1777 	}
1778 
1779 	ida_init(&zvol_ida);
1780 	return (0);
1781 }
1782 
1783 void
1784 zvol_fini(void)
1785 {
1786 	unregister_blkdev(zvol_major, ZVOL_DRIVER);
1787 
1788 	zvol_fini_impl();
1789 
1790 	ida_destroy(&zvol_ida);
1791 }
1792 
1793 module_param(zvol_major, uint, 0444);
1794 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
1795 
1796 module_param(zvol_max_discard_blocks, ulong, 0444);
1797 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
1798 
1799 module_param(zvol_prefetch_bytes, uint, 0644);
1800 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
1801 
1802 module_param(zvol_volmode, uint, 0644);
1803 MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
1804 
1805 module_param(zvol_blk_mq_queue_depth, uint, 0644);
1806 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
1807 
1808 module_param(zvol_use_blk_mq, uint, 0644);
1809 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
1810 
1811 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
1812 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
1813 	"Process volblocksize blocks per thread");
1814 
1815 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
1816 module_param(zvol_open_timeout_ms, uint, 0644);
1817 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
1818 #endif
1819