161145dc2SMartin Matuska // SPDX-License-Identifier: CDDL-1.0
2eda14cbcSMatt Macy /*
3eda14cbcSMatt Macy * CDDL HEADER START
4eda14cbcSMatt Macy *
5eda14cbcSMatt Macy * The contents of this file are subject to the terms of the
6eda14cbcSMatt Macy * Common Development and Distribution License (the "License").
7eda14cbcSMatt Macy * You may not use this file except in compliance with the License.
8eda14cbcSMatt Macy *
9eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0.
11eda14cbcSMatt Macy * See the License for the specific language governing permissions
12eda14cbcSMatt Macy * and limitations under the License.
13eda14cbcSMatt Macy *
14eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each
15eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the
17eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying
18eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner]
19eda14cbcSMatt Macy *
20eda14cbcSMatt Macy * CDDL HEADER END
21eda14cbcSMatt Macy */
22eda14cbcSMatt Macy /*
23eda14cbcSMatt Macy * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
24e6e941e6SMartin Matuska * Copyright (c) 2024, 2025, Rob Norris <robn@despairlabs.com>
2553a2e263SMartin Matuska * Copyright (c) 2024, 2025, Klara, Inc.
26eda14cbcSMatt Macy */
27eda14cbcSMatt Macy
28eda14cbcSMatt Macy #include <sys/dataset_kstats.h>
29eda14cbcSMatt Macy #include <sys/dbuf.h>
30eda14cbcSMatt Macy #include <sys/dmu_traverse.h>
31eda14cbcSMatt Macy #include <sys/dsl_dataset.h>
32eda14cbcSMatt Macy #include <sys/dsl_prop.h>
33eda14cbcSMatt Macy #include <sys/dsl_dir.h>
34eda14cbcSMatt Macy #include <sys/zap.h>
35eda14cbcSMatt Macy #include <sys/zfeature.h>
36eda14cbcSMatt Macy #include <sys/zil_impl.h>
37eda14cbcSMatt Macy #include <sys/dmu_tx.h>
38eda14cbcSMatt Macy #include <sys/zio.h>
39eda14cbcSMatt Macy #include <sys/zfs_rlock.h>
40eda14cbcSMatt Macy #include <sys/spa_impl.h>
41eda14cbcSMatt Macy #include <sys/zvol.h>
42eda14cbcSMatt Macy #include <sys/zvol_impl.h>
431719886fSMartin Matuska #include <cityhash.h>
44eda14cbcSMatt Macy
45eda14cbcSMatt Macy #include <linux/blkdev_compat.h>
46eda14cbcSMatt Macy #include <linux/task_io_accounting_ops.h>
4775e1fea6SMartin Matuska #include <linux/workqueue.h>
481f1e2261SMartin Matuska #include <linux/blk-mq.h>
491f1e2261SMartin Matuska
501f1e2261SMartin Matuska static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
511f1e2261SMartin Matuska struct request *rq, boolean_t force_sync);
521f1e2261SMartin Matuska
53e92ffd9bSMartin Matuska static unsigned int zvol_major = ZVOL_MAJOR;
54e92ffd9bSMartin Matuska static unsigned long zvol_max_discard_blocks = 16384;
55716fd348SMartin Matuska
56716fd348SMartin Matuska #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
570a97523dSMartin Matuska static unsigned int zvol_open_timeout_ms = 1000;
58716fd348SMartin Matuska #endif
59eda14cbcSMatt Macy
601f1e2261SMartin Matuska static unsigned int zvol_blk_mq_threads = 0;
611f1e2261SMartin Matuska static unsigned int zvol_blk_mq_actual_threads;
621f1e2261SMartin Matuska static boolean_t zvol_use_blk_mq = B_FALSE;
631f1e2261SMartin Matuska
641f1e2261SMartin Matuska /*
651f1e2261SMartin Matuska * The maximum number of volblocksize blocks to process per thread. Typically,
661f1e2261SMartin Matuska  * write-heavy workloads perform better with higher values here, and
671f1e2261SMartin Matuska  * read-heavy workloads perform better with lower values, but that's not a hard
681f1e2261SMartin Matuska * and fast rule. It's basically a knob to tune between "less overhead with
691f1e2261SMartin Matuska * less parallelism" and "more overhead, but more parallelism".
701f1e2261SMartin Matuska *
711f1e2261SMartin Matuska * '8' was chosen as a reasonable, balanced, default based off of sequential
721f1e2261SMartin Matuska * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
731f1e2261SMartin Matuska */
741f1e2261SMartin Matuska static unsigned int zvol_blk_mq_blocks_per_thread = 8;
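/*
 * Illustrative arithmetic (editorial note, assuming the default tunable):
 * with zvol_blk_mq_blocks_per_thread = 8 and a 16K volblocksize, one zvol
 * thread processes at most roughly 8 * 16K = 128K of data per blk-mq
 * request; a larger volblocksize raises that per-request ceiling
 * proportionally.
 */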
751f1e2261SMartin Matuska
761f1e2261SMartin Matuska #ifndef BLKDEV_DEFAULT_RQ
771f1e2261SMartin Matuska /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
781f1e2261SMartin Matuska #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
791f1e2261SMartin Matuska #endif
801f1e2261SMartin Matuska
811f1e2261SMartin Matuska /*
821f1e2261SMartin Matuska * Finalize our BIO or request.
831f1e2261SMartin Matuska */
847a7741afSMartin Matuska static inline void
857a7741afSMartin Matuska zvol_end_io(struct bio *bio, struct request *rq, int error)
867a7741afSMartin Matuska {
87d0abb9a6SMartin Matuska ASSERT3U(error, >=, 0);
887a7741afSMartin Matuska if (bio) {
89d0abb9a6SMartin Matuska bio->bi_status = errno_to_bi_status(error);
907a7741afSMartin Matuska bio_endio(bio);
917a7741afSMartin Matuska } else {
927a7741afSMartin Matuska blk_mq_end_request(rq, errno_to_bi_status(error));
937a7741afSMartin Matuska }
947a7741afSMartin Matuska }
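/*
 * Note (editorial): callers pass 0 or a positive errno; errno_to_bi_status()
 * translates it into the kernel's blk_status_t for both the BIO completion
 * path and the blk-mq completion path.
 */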
951f1e2261SMartin Matuska
961f1e2261SMartin Matuska static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
971f1e2261SMartin Matuska static unsigned int zvol_actual_blk_mq_queue_depth;
981f1e2261SMartin Matuska
99eda14cbcSMatt Macy struct zvol_state_os {
100eda14cbcSMatt Macy struct gendisk *zvo_disk; /* generic disk */
101eda14cbcSMatt Macy struct request_queue *zvo_queue; /* request queue */
102eda14cbcSMatt Macy dev_t zvo_dev; /* device id */
1031f1e2261SMartin Matuska
1041f1e2261SMartin Matuska struct blk_mq_tag_set tag_set;
1051f1e2261SMartin Matuska
1061f1e2261SMartin Matuska /* Set from the global 'zvol_use_blk_mq' at zvol load */
1071f1e2261SMartin Matuska boolean_t use_blk_mq;
108eda14cbcSMatt Macy };
109eda14cbcSMatt Macy
110eda14cbcSMatt Macy static struct ida zvol_ida;
111eda14cbcSMatt Macy
1121f1e2261SMartin Matuska /*
1131f1e2261SMartin Matuska * This is called when a new block multiqueue request comes in. A request
1141f1e2261SMartin Matuska * contains one or more BIOs.
1151f1e2261SMartin Matuska */
1161f1e2261SMartin Matuska static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
1171f1e2261SMartin Matuska const struct blk_mq_queue_data *bd)
1181f1e2261SMartin Matuska {
1191f1e2261SMartin Matuska struct request *rq = bd->rq;
1201f1e2261SMartin Matuska zvol_state_t *zv = rq->q->queuedata;
1211f1e2261SMartin Matuska
1221f1e2261SMartin Matuska /* Tell the kernel that we are starting to process this request */
1231f1e2261SMartin Matuska blk_mq_start_request(rq);
1241f1e2261SMartin Matuska
1251f1e2261SMartin Matuska if (blk_rq_is_passthrough(rq)) {
1261f1e2261SMartin Matuska 		/* Skip non-filesystem requests */
1271f1e2261SMartin Matuska blk_mq_end_request(rq, BLK_STS_IOERR);
1281f1e2261SMartin Matuska return (BLK_STS_IOERR);
1291f1e2261SMartin Matuska }
1301f1e2261SMartin Matuska
1311f1e2261SMartin Matuska zvol_request_impl(zv, NULL, rq, 0);
1321f1e2261SMartin Matuska
1331f1e2261SMartin Matuska /* Acknowledge to the kernel that we got this request */
1341f1e2261SMartin Matuska return (BLK_STS_OK);
1351f1e2261SMartin Matuska }
1361f1e2261SMartin Matuska
1371f1e2261SMartin Matuska static struct blk_mq_ops zvol_blk_mq_queue_ops = {
1381f1e2261SMartin Matuska .queue_rq = zvol_mq_queue_rq,
1391f1e2261SMartin Matuska };
1401f1e2261SMartin Matuska
1411f1e2261SMartin Matuska /* Initialize our blk-mq struct */
1421f1e2261SMartin Matuska static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
1431f1e2261SMartin Matuska {
1441f1e2261SMartin Matuska struct zvol_state_os *zso = zv->zv_zso;
1451f1e2261SMartin Matuska
1461f1e2261SMartin Matuska memset(&zso->tag_set, 0, sizeof (zso->tag_set));
1471f1e2261SMartin Matuska
1481f1e2261SMartin Matuska /* Initialize tag set. */
1491f1e2261SMartin Matuska zso->tag_set.ops = &zvol_blk_mq_queue_ops;
1501f1e2261SMartin Matuska zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
1511f1e2261SMartin Matuska zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
1521f1e2261SMartin Matuska zso->tag_set.numa_node = NUMA_NO_NODE;
1531f1e2261SMartin Matuska zso->tag_set.cmd_size = 0;
1541f1e2261SMartin Matuska
1551f1e2261SMartin Matuska /*
1561f1e2261SMartin Matuska * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
1571f1e2261SMartin Matuska * zvol_request_impl()
1581f1e2261SMartin Matuska */
159b59a0cdeSMartin Matuska zso->tag_set.flags = BLK_MQ_F_BLOCKING;
160b59a0cdeSMartin Matuska
161b59a0cdeSMartin Matuska #ifdef BLK_MQ_F_SHOULD_MERGE
162b59a0cdeSMartin Matuska /*
163b59a0cdeSMartin Matuska * Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit.
164b59a0cdeSMartin Matuska * For older kernels, we set it.
165b59a0cdeSMartin Matuska */
166b59a0cdeSMartin Matuska zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE;
167b59a0cdeSMartin Matuska #endif
168b59a0cdeSMartin Matuska
1691f1e2261SMartin Matuska zso->tag_set.driver_data = zv;
1701f1e2261SMartin Matuska
1711f1e2261SMartin Matuska return (blk_mq_alloc_tag_set(&zso->tag_set));
1721f1e2261SMartin Matuska }
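/*
 * Sketch of the expected call sequence (assumed from the surrounding driver,
 * not a fixed kernel contract): zvol_blk_mq_alloc_tag_set() is called once
 * per zvol before its gendisk/request queue is created (e.g. via
 * blk_mq_alloc_disk() on newer kernels), and the tag set is released again
 * with blk_mq_free_tag_set() when the volume is torn down.
 */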
1731f1e2261SMartin Matuska
174eda14cbcSMatt Macy /*
175eda14cbcSMatt Macy * Given a path, return TRUE if path is a ZVOL.
176eda14cbcSMatt Macy */
177c03c5b1cSMartin Matuska boolean_t
178c03c5b1cSMartin Matuska zvol_os_is_zvol(const char *path)
179eda14cbcSMatt Macy {
1807877fdebSMatt Macy dev_t dev = 0;
181eda14cbcSMatt Macy
1827877fdebSMatt Macy if (vdev_lookup_bdev(path, &dev) != 0)
183eda14cbcSMatt Macy return (B_FALSE);
184eda14cbcSMatt Macy
1857877fdebSMatt Macy if (MAJOR(dev) == zvol_major)
186eda14cbcSMatt Macy return (B_TRUE);
187eda14cbcSMatt Macy
188eda14cbcSMatt Macy return (B_FALSE);
189eda14cbcSMatt Macy }
190eda14cbcSMatt Macy
191eda14cbcSMatt Macy static void
1929db44a8eSMartin Matuska zvol_write(zv_request_t *zvr)
193eda14cbcSMatt Macy {
194eda14cbcSMatt Macy struct bio *bio = zvr->bio;
1951f1e2261SMartin Matuska struct request *rq = zvr->rq;
1967877fdebSMatt Macy int error = 0;
197184c1b94SMartin Matuska zfs_uio_t uio;
198eda14cbcSMatt Macy zvol_state_t *zv = zvr->zv;
1991f1e2261SMartin Matuska struct request_queue *q;
2001f1e2261SMartin Matuska struct gendisk *disk;
2011f1e2261SMartin Matuska unsigned long start_time = 0;
2021f1e2261SMartin Matuska boolean_t acct = B_FALSE;
2031f1e2261SMartin Matuska
2047877fdebSMatt Macy ASSERT3P(zv, !=, NULL);
2057877fdebSMatt Macy ASSERT3U(zv->zv_open_count, >, 0);
2067877fdebSMatt Macy ASSERT3P(zv->zv_zilog, !=, NULL);
207eda14cbcSMatt Macy
2081f1e2261SMartin Matuska q = zv->zv_zso->zvo_queue;
2091f1e2261SMartin Matuska disk = zv->zv_zso->zvo_disk;
2101f1e2261SMartin Matuska
211eda14cbcSMatt Macy 	/* A bio marked as FLUSH needs to flush before the write */
212d0abb9a6SMartin Matuska if (io_is_flush(bio, rq)) {
213d0abb9a6SMartin Matuska error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
214d0abb9a6SMartin Matuska if (error != 0) {
215d0abb9a6SMartin Matuska rw_exit(&zv->zv_suspend_lock);
216d0abb9a6SMartin Matuska zvol_end_io(bio, rq, -error);
217d0abb9a6SMartin Matuska return;
218d0abb9a6SMartin Matuska }
219d0abb9a6SMartin Matuska }
220eda14cbcSMatt Macy
221eda14cbcSMatt Macy /* Some requests are just for flush and nothing else. */
2221f1e2261SMartin Matuska if (io_size(bio, rq) == 0) {
223eda14cbcSMatt Macy rw_exit(&zv->zv_suspend_lock);
2247a7741afSMartin Matuska zvol_end_io(bio, rq, 0);
225eda14cbcSMatt Macy return;
226eda14cbcSMatt Macy }
227eda14cbcSMatt Macy
2281f1e2261SMartin Matuska zfs_uio_bvec_init(&uio, bio, rq);
2297877fdebSMatt Macy
2301f1e2261SMartin Matuska ssize_t start_resid = uio.uio_resid;
2311f1e2261SMartin Matuska
2321f1e2261SMartin Matuska /*
2331f1e2261SMartin Matuska * With use_blk_mq, accounting is done by blk_mq_start_request()
2341f1e2261SMartin Matuska * and blk_mq_end_request(), so we can skip it here.
2351f1e2261SMartin Matuska */
2361f1e2261SMartin Matuska if (bio) {
2371f1e2261SMartin Matuska acct = blk_queue_io_stat(q);
2381f1e2261SMartin Matuska if (acct) {
2391f1e2261SMartin Matuska start_time = blk_generic_start_io_acct(q, disk, WRITE,
2401f1e2261SMartin Matuska bio);
2411f1e2261SMartin Matuska }
2421f1e2261SMartin Matuska }
243eda14cbcSMatt Macy
244eda14cbcSMatt Macy boolean_t sync =
2451f1e2261SMartin Matuska io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
246eda14cbcSMatt Macy
247eda14cbcSMatt Macy zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
248eda14cbcSMatt Macy uio.uio_loffset, uio.uio_resid, RL_WRITER);
249eda14cbcSMatt Macy
250eda14cbcSMatt Macy uint64_t volsize = zv->zv_volsize;
251eda14cbcSMatt Macy while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
252eda14cbcSMatt Macy uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
253eda14cbcSMatt Macy uint64_t off = uio.uio_loffset;
254eda14cbcSMatt Macy dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
255eda14cbcSMatt Macy
256eda14cbcSMatt Macy if (bytes > volsize - off) /* don't write past the end */
257eda14cbcSMatt Macy bytes = volsize - off;
258eda14cbcSMatt Macy
259eda14cbcSMatt Macy dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
260eda14cbcSMatt Macy
261eda14cbcSMatt Macy /* This will only fail for ENOSPC */
26261145dc2SMartin Matuska error = dmu_tx_assign(tx, DMU_TX_WAIT);
263eda14cbcSMatt Macy if (error) {
264eda14cbcSMatt Macy dmu_tx_abort(tx);
265eda14cbcSMatt Macy break;
266eda14cbcSMatt Macy }
267b1c1ee44SMartin Matuska error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
268b1c1ee44SMartin Matuska DMU_READ_PREFETCH);
269eda14cbcSMatt Macy if (error == 0) {
270eda14cbcSMatt Macy zvol_log_write(zv, tx, off, bytes, sync);
271eda14cbcSMatt Macy }
272eda14cbcSMatt Macy dmu_tx_commit(tx);
273eda14cbcSMatt Macy
274eda14cbcSMatt Macy if (error)
275eda14cbcSMatt Macy break;
276eda14cbcSMatt Macy }
277eda14cbcSMatt Macy zfs_rangelock_exit(lr);
278eda14cbcSMatt Macy
279eda14cbcSMatt Macy int64_t nwritten = start_resid - uio.uio_resid;
280eda14cbcSMatt Macy dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
281eda14cbcSMatt Macy task_io_account_write(nwritten);
282eda14cbcSMatt Macy
283d0abb9a6SMartin Matuska if (error == 0 && sync)
284d0abb9a6SMartin Matuska error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
285eda14cbcSMatt Macy
286eda14cbcSMatt Macy rw_exit(&zv->zv_suspend_lock);
2877877fdebSMatt Macy
2881f1e2261SMartin Matuska if (bio && acct) {
2897877fdebSMatt Macy blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
2901f1e2261SMartin Matuska }
2917877fdebSMatt Macy
292d0abb9a6SMartin Matuska zvol_end_io(bio, rq, error);
293eda14cbcSMatt Macy }
294eda14cbcSMatt Macy
295eda14cbcSMatt Macy static void
2969db44a8eSMartin Matuska zvol_write_task(void *arg)
297eda14cbcSMatt Macy {
2989db44a8eSMartin Matuska zv_request_task_t *task = arg;
2999db44a8eSMartin Matuska zvol_write(&task->zvr);
3009db44a8eSMartin Matuska zv_request_task_free(task);
3019db44a8eSMartin Matuska }
3029db44a8eSMartin Matuska
3039db44a8eSMartin Matuska static void
3049db44a8eSMartin Matuska zvol_discard(zv_request_t *zvr)
3059db44a8eSMartin Matuska {
306eda14cbcSMatt Macy struct bio *bio = zvr->bio;
3071f1e2261SMartin Matuska struct request *rq = zvr->rq;
308eda14cbcSMatt Macy zvol_state_t *zv = zvr->zv;
3091f1e2261SMartin Matuska uint64_t start = io_offset(bio, rq);
3101f1e2261SMartin Matuska uint64_t size = io_size(bio, rq);
311eda14cbcSMatt Macy uint64_t end = start + size;
312eda14cbcSMatt Macy boolean_t sync;
313eda14cbcSMatt Macy int error = 0;
314eda14cbcSMatt Macy dmu_tx_t *tx;
3151f1e2261SMartin Matuska struct request_queue *q = zv->zv_zso->zvo_queue;
3161f1e2261SMartin Matuska struct gendisk *disk = zv->zv_zso->zvo_disk;
3171f1e2261SMartin Matuska unsigned long start_time = 0;
3182a58b312SMartin Matuska boolean_t acct = B_FALSE;
319eda14cbcSMatt Macy
3207877fdebSMatt Macy ASSERT3P(zv, !=, NULL);
3217877fdebSMatt Macy ASSERT3U(zv->zv_open_count, >, 0);
3227877fdebSMatt Macy ASSERT3P(zv->zv_zilog, !=, NULL);
323eda14cbcSMatt Macy
3241f1e2261SMartin Matuska if (bio) {
3251f1e2261SMartin Matuska acct = blk_queue_io_stat(q);
3261f1e2261SMartin Matuska if (acct) {
3271f1e2261SMartin Matuska start_time = blk_generic_start_io_acct(q, disk, WRITE,
3281f1e2261SMartin Matuska bio);
3291f1e2261SMartin Matuska }
3301f1e2261SMartin Matuska }
3317877fdebSMatt Macy
3321f1e2261SMartin Matuska sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
333eda14cbcSMatt Macy
334eda14cbcSMatt Macy if (end > zv->zv_volsize) {
335eda14cbcSMatt Macy error = SET_ERROR(EIO);
336eda14cbcSMatt Macy goto unlock;
337eda14cbcSMatt Macy }
338eda14cbcSMatt Macy
339eda14cbcSMatt Macy /*
3406e6cde8fSMartin Matuska * Align the request to volume block boundaries. This will prevent
3416e6cde8fSMartin Matuska * dnode_free_range() from zeroing out the unaligned parts which is
3426e6cde8fSMartin Matuska * slow (read-modify-write) and useless since we are not freeing any
3436e6cde8fSMartin Matuska * space by doing so.
344eda14cbcSMatt Macy */
345eda14cbcSMatt Macy start = P2ROUNDUP(start, zv->zv_volblocksize);
346aca928a5SMartin Matuska end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
347eda14cbcSMatt Macy size = end - start;
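	/*
	 * Worked example with illustrative numbers: for a 16K volblocksize,
	 * a discard covering bytes [4K, 60K) is trimmed to [16K, 48K), so
	 * only the two fully-covered blocks are freed and the partial head
	 * and tail are left intact.
	 */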
348eda14cbcSMatt Macy
349eda14cbcSMatt Macy if (start >= end)
350eda14cbcSMatt Macy goto unlock;
351eda14cbcSMatt Macy
352eda14cbcSMatt Macy zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
353eda14cbcSMatt Macy start, size, RL_WRITER);
354eda14cbcSMatt Macy
355eda14cbcSMatt Macy tx = dmu_tx_create(zv->zv_objset);
356eda14cbcSMatt Macy dmu_tx_mark_netfree(tx);
35761145dc2SMartin Matuska error = dmu_tx_assign(tx, DMU_TX_WAIT);
358eda14cbcSMatt Macy if (error != 0) {
359eda14cbcSMatt Macy dmu_tx_abort(tx);
360eda14cbcSMatt Macy } else {
361f8b1db88SMartin Matuska zvol_log_truncate(zv, tx, start, size);
362eda14cbcSMatt Macy dmu_tx_commit(tx);
363eda14cbcSMatt Macy error = dmu_free_long_range(zv->zv_objset,
364eda14cbcSMatt Macy ZVOL_OBJ, start, size);
365eda14cbcSMatt Macy }
366eda14cbcSMatt Macy zfs_rangelock_exit(lr);
367eda14cbcSMatt Macy
368eda14cbcSMatt Macy if (error == 0 && sync)
369d0abb9a6SMartin Matuska error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
370eda14cbcSMatt Macy
371eda14cbcSMatt Macy unlock:
372eda14cbcSMatt Macy rw_exit(&zv->zv_suspend_lock);
3737877fdebSMatt Macy
3741f1e2261SMartin Matuska if (bio && acct) {
3751f1e2261SMartin Matuska blk_generic_end_io_acct(q, disk, WRITE, bio,
3761f1e2261SMartin Matuska start_time);
3771f1e2261SMartin Matuska }
3787877fdebSMatt Macy
379d0abb9a6SMartin Matuska zvol_end_io(bio, rq, error);
380eda14cbcSMatt Macy }
381eda14cbcSMatt Macy
382eda14cbcSMatt Macy static void
3839db44a8eSMartin Matuska zvol_discard_task(void *arg)
384eda14cbcSMatt Macy {
3859db44a8eSMartin Matuska zv_request_task_t *task = arg;
3869db44a8eSMartin Matuska zvol_discard(&task->zvr);
3879db44a8eSMartin Matuska zv_request_task_free(task);
3889db44a8eSMartin Matuska }
3899db44a8eSMartin Matuska
3909db44a8eSMartin Matuska static void
3919db44a8eSMartin Matuska zvol_read(zv_request_t *zvr)
3929db44a8eSMartin Matuska {
393eda14cbcSMatt Macy struct bio *bio = zvr->bio;
3941f1e2261SMartin Matuska struct request *rq = zvr->rq;
3957877fdebSMatt Macy int error = 0;
396184c1b94SMartin Matuska zfs_uio_t uio;
3971f1e2261SMartin Matuska boolean_t acct = B_FALSE;
398eda14cbcSMatt Macy zvol_state_t *zv = zvr->zv;
3991f1e2261SMartin Matuska struct request_queue *q;
4001f1e2261SMartin Matuska struct gendisk *disk;
4011f1e2261SMartin Matuska unsigned long start_time = 0;
4021f1e2261SMartin Matuska
4037877fdebSMatt Macy ASSERT3P(zv, !=, NULL);
4047877fdebSMatt Macy ASSERT3U(zv->zv_open_count, >, 0);
405eda14cbcSMatt Macy
4061f1e2261SMartin Matuska zfs_uio_bvec_init(&uio, bio, rq);
4077877fdebSMatt Macy
4081f1e2261SMartin Matuska q = zv->zv_zso->zvo_queue;
4091f1e2261SMartin Matuska disk = zv->zv_zso->zvo_disk;
4101f1e2261SMartin Matuska
4111f1e2261SMartin Matuska ssize_t start_resid = uio.uio_resid;
4121f1e2261SMartin Matuska
4131f1e2261SMartin Matuska /*
4141f1e2261SMartin Matuska * When blk-mq is being used, accounting is done by
4151f1e2261SMartin Matuska * blk_mq_start_request() and blk_mq_end_request().
4161f1e2261SMartin Matuska */
4171f1e2261SMartin Matuska if (bio) {
4181f1e2261SMartin Matuska acct = blk_queue_io_stat(q);
4197877fdebSMatt Macy if (acct)
4201f1e2261SMartin Matuska start_time = blk_generic_start_io_acct(q, disk, READ,
4211f1e2261SMartin Matuska bio);
4221f1e2261SMartin Matuska }
423eda14cbcSMatt Macy
424eda14cbcSMatt Macy zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
425eda14cbcSMatt Macy uio.uio_loffset, uio.uio_resid, RL_READER);
426eda14cbcSMatt Macy
427eda14cbcSMatt Macy uint64_t volsize = zv->zv_volsize;
4281f1e2261SMartin Matuska
429eda14cbcSMatt Macy while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
430eda14cbcSMatt Macy uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
431eda14cbcSMatt Macy
432eda14cbcSMatt Macy /* don't read past the end */
433eda14cbcSMatt Macy if (bytes > volsize - uio.uio_loffset)
434eda14cbcSMatt Macy bytes = volsize - uio.uio_loffset;
435eda14cbcSMatt Macy
436b1c1ee44SMartin Matuska error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
437b1c1ee44SMartin Matuska DMU_READ_PREFETCH);
438eda14cbcSMatt Macy if (error) {
439eda14cbcSMatt Macy /* convert checksum errors into IO errors */
440eda14cbcSMatt Macy if (error == ECKSUM)
441eda14cbcSMatt Macy error = SET_ERROR(EIO);
442eda14cbcSMatt Macy break;
443eda14cbcSMatt Macy }
444eda14cbcSMatt Macy }
445eda14cbcSMatt Macy zfs_rangelock_exit(lr);
446eda14cbcSMatt Macy
447eda14cbcSMatt Macy int64_t nread = start_resid - uio.uio_resid;
448eda14cbcSMatt Macy dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
449eda14cbcSMatt Macy task_io_account_read(nread);
450eda14cbcSMatt Macy
451eda14cbcSMatt Macy rw_exit(&zv->zv_suspend_lock);
4527877fdebSMatt Macy
4531f1e2261SMartin Matuska if (bio && acct) {
4547877fdebSMatt Macy blk_generic_end_io_acct(q, disk, READ, bio, start_time);
4551f1e2261SMartin Matuska }
4567877fdebSMatt Macy
457d0abb9a6SMartin Matuska zvol_end_io(bio, rq, error);
4589db44a8eSMartin Matuska }
4599db44a8eSMartin Matuska
4609db44a8eSMartin Matuska static void
4619db44a8eSMartin Matuska zvol_read_task(void *arg)
4629db44a8eSMartin Matuska {
4639db44a8eSMartin Matuska zv_request_task_t *task = arg;
4649db44a8eSMartin Matuska zvol_read(&task->zvr);
4659db44a8eSMartin Matuska zv_request_task_free(task);
466eda14cbcSMatt Macy }
467eda14cbcSMatt Macy
4686e6cde8fSMartin Matuska /*
4696e6cde8fSMartin Matuska * Note:
4706e6cde8fSMartin Matuska *
4716e6cde8fSMartin Matuska * The kernel uses different enum names for the IO opcode, depending on the
4726e6cde8fSMartin Matuska * kernel version ('req_opf', 'req_op'). To sidestep this, use macros rather
4736e6cde8fSMartin Matuska * than inline functions for these checks.
4746e6cde8fSMartin Matuska */
4756e6cde8fSMartin Matuska /* Should this IO go down the zvol write path? */
4766e6cde8fSMartin Matuska #define ZVOL_OP_IS_WRITE(op) \
4776e6cde8fSMartin Matuska (op == REQ_OP_WRITE || \
4786e6cde8fSMartin Matuska op == REQ_OP_FLUSH || \
4796e6cde8fSMartin Matuska op == REQ_OP_DISCARD)
4806e6cde8fSMartin Matuska
4816e6cde8fSMartin Matuska /* Is this IO type supported by zvols? */
4826e6cde8fSMartin Matuska #define ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op))
4836e6cde8fSMartin Matuska
4846e6cde8fSMartin Matuska /* Get the IO opcode */
4856e6cde8fSMartin Matuska #define ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq))
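/*
 * Illustrative use of the macros above (editorial example): for a plain
 * write BIO, ZVOL_OP(bio, NULL) evaluates to REQ_OP_WRITE, so both
 * ZVOL_OP_IS_WRITE() and ZVOL_OP_IS_SUPPORTED() are true; an opcode outside
 * the list (for example REQ_OP_WRITE_ZEROES) fails ZVOL_OP_IS_SUPPORTED()
 * and is completed with ENOTSUPP in zvol_request_impl() below.
 */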
4861f1e2261SMartin Matuska
4871f1e2261SMartin Matuska /*
4881f1e2261SMartin Matuska * Process a BIO or request
4891f1e2261SMartin Matuska *
4901f1e2261SMartin Matuska * Either 'bio' or 'rq' should be set depending on if we are processing a
4911f1e2261SMartin Matuska * bio or a request (both should not be set).
4921f1e2261SMartin Matuska *
4931f1e2261SMartin Matuska * force_sync: Set to 0 to defer processing to a background taskq
4941f1e2261SMartin Matuska * Set to 1 to process data synchronously
4951f1e2261SMartin Matuska */
496681ce946SMartin Matuska static void
4971f1e2261SMartin Matuska zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
4981f1e2261SMartin Matuska boolean_t force_sync)
499eda14cbcSMatt Macy {
500eda14cbcSMatt Macy fstrans_cookie_t cookie = spl_fstrans_mark();
5011f1e2261SMartin Matuska uint64_t offset = io_offset(bio, rq);
5021f1e2261SMartin Matuska uint64_t size = io_size(bio, rq);
50336c970edSMartin Matuska int rw;
50436c970edSMartin Matuska
5056e6cde8fSMartin Matuska if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) {
5066e6cde8fSMartin Matuska zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x",
5076e6cde8fSMartin Matuska rq != NULL ? "request" : "BIO",
5086e6cde8fSMartin Matuska ZVOL_OP(bio, rq),
5096e6cde8fSMartin Matuska rq != NULL ? rq->cmd_flags : bio->bi_opf);
5106e6cde8fSMartin Matuska ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)));
5116e6cde8fSMartin Matuska zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP));
5126e6cde8fSMartin Matuska goto out;
5136e6cde8fSMartin Matuska }
5146e6cde8fSMartin Matuska
5156e6cde8fSMartin Matuska if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) {
51636c970edSMartin Matuska rw = WRITE;
51736c970edSMartin Matuska } else {
51836c970edSMartin Matuska rw = READ;
51936c970edSMartin Matuska }
5206e6cde8fSMartin Matuska
5216e6cde8fSMartin Matuska /*
5226e6cde8fSMartin Matuska * Sanity check
5236e6cde8fSMartin Matuska *
5246e6cde8fSMartin Matuska * If we're a BIO, check our rw matches the kernel's
5256e6cde8fSMartin Matuska * bio_data_dir(bio) rw. We need to check because we support fewer
5266e6cde8fSMartin Matuska * IO operations, and want to verify that what we think are reads and
5276e6cde8fSMartin Matuska * writes from those operations match what the kernel thinks.
5286e6cde8fSMartin Matuska */
5296e6cde8fSMartin Matuska ASSERT(rq != NULL || rw == bio_data_dir(bio));
530eda14cbcSMatt Macy
531ce4dcb97SMartin Matuska if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
532d0abb9a6SMartin Matuska zvol_end_io(bio, rq, SET_ERROR(ENXIO));
533ce4dcb97SMartin Matuska goto out;
534ce4dcb97SMartin Matuska }
535ce4dcb97SMartin Matuska
536f8b1db88SMartin Matuska if (zvol_request_sync || zv->zv_threading == B_FALSE)
5371f1e2261SMartin Matuska force_sync = 1;
538eda14cbcSMatt Macy
5399db44a8eSMartin Matuska zv_request_t zvr = {
5409db44a8eSMartin Matuska .zv = zv,
5419db44a8eSMartin Matuska .bio = bio,
5421f1e2261SMartin Matuska .rq = rq,
5439db44a8eSMartin Matuska };
5441f1e2261SMartin Matuska
5451f1e2261SMartin Matuska if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
5461f1e2261SMartin Matuska printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
5471f1e2261SMartin Matuska zv->zv_zso->zvo_disk->disk_name,
5481f1e2261SMartin Matuska (long long unsigned)offset,
5491f1e2261SMartin Matuska (long unsigned)size);
5501f1e2261SMartin Matuska
551d0abb9a6SMartin Matuska zvol_end_io(bio, rq, SET_ERROR(EIO));
5521f1e2261SMartin Matuska goto out;
5531f1e2261SMartin Matuska }
5541f1e2261SMartin Matuska
5559db44a8eSMartin Matuska zv_request_task_t *task;
5561719886fSMartin Matuska zv_taskq_t *ztqs = &zvol_taskqs;
5571719886fSMartin Matuska uint_t blk_mq_hw_queue = 0;
5581719886fSMartin Matuska uint_t tq_idx;
5591719886fSMartin Matuska uint_t taskq_hash;
5601719886fSMartin Matuska if (rq)
5611719886fSMartin Matuska #ifdef HAVE_BLK_MQ_RQ_HCTX
5621719886fSMartin Matuska blk_mq_hw_queue = rq->mq_hctx->queue_num;
5631719886fSMartin Matuska #else
564d0abb9a6SMartin Matuska blk_mq_hw_queue = rq->q->queue_hw_ctx[
565d0abb9a6SMartin Matuska rq->q->mq_map[raw_smp_processor_id()]]->queue_num;
5661719886fSMartin Matuska #endif
5677a7741afSMartin Matuska taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
5687a7741afSMartin Matuska blk_mq_hw_queue);
5691719886fSMartin Matuska tq_idx = taskq_hash % ztqs->tqs_cnt;
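	/*
	 * In other words (editorial note): requests are spread across the
	 * zvol taskqs by hashing the zvol pointer, the offset window
	 * (offset >> ZVOL_TASKQ_OFFSET_SHIFT) and the blk-mq hardware queue
	 * number, so I/Os to the same volume and nearby offsets tend to land
	 * on the same taskq while different volumes and regions fan out
	 * across all of them.
	 */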
5709db44a8eSMartin Matuska
571eda14cbcSMatt Macy if (rw == WRITE) {
572eda14cbcSMatt Macy if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
573d0abb9a6SMartin Matuska zvol_end_io(bio, rq, SET_ERROR(EROFS));
574eda14cbcSMatt Macy goto out;
575eda14cbcSMatt Macy }
576eda14cbcSMatt Macy
577eda14cbcSMatt Macy /*
578eda14cbcSMatt Macy * Prevents the zvol from being suspended, or the ZIL being
579eda14cbcSMatt Macy * concurrently opened. Will be released after the i/o
580eda14cbcSMatt Macy * completes.
581eda14cbcSMatt Macy */
582eda14cbcSMatt Macy rw_enter(&zv->zv_suspend_lock, RW_READER);
583eda14cbcSMatt Macy
584eda14cbcSMatt Macy /*
585eda14cbcSMatt Macy * Open a ZIL if this is the first time we have written to this
586eda14cbcSMatt Macy * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
587eda14cbcSMatt Macy * than zv_state_lock so that we don't need to acquire an
588eda14cbcSMatt Macy * additional lock in this path.
589eda14cbcSMatt Macy */
590eda14cbcSMatt Macy if (zv->zv_zilog == NULL) {
591eda14cbcSMatt Macy rw_exit(&zv->zv_suspend_lock);
592eda14cbcSMatt Macy rw_enter(&zv->zv_suspend_lock, RW_WRITER);
593eda14cbcSMatt Macy if (zv->zv_zilog == NULL) {
594eda14cbcSMatt Macy zv->zv_zilog = zil_open(zv->zv_objset,
595271171e0SMartin Matuska zvol_get_data, &zv->zv_kstat.dk_zil_sums);
596eda14cbcSMatt Macy zv->zv_flags |= ZVOL_WRITTEN_TO;
5979db44a8eSMartin Matuska /* replay / destroy done in zvol_create_minor */
5989db44a8eSMartin Matuska VERIFY0((zv->zv_zilog->zl_header->zh_flags &
5999db44a8eSMartin Matuska ZIL_REPLAY_NEEDED));
600eda14cbcSMatt Macy }
601eda14cbcSMatt Macy rw_downgrade(&zv->zv_suspend_lock);
602eda14cbcSMatt Macy }
603eda14cbcSMatt Macy
604eda14cbcSMatt Macy /*
605eda14cbcSMatt Macy * We don't want this thread to be blocked waiting for i/o to
606eda14cbcSMatt Macy * complete, so we instead wait from a taskq callback. The
607eda14cbcSMatt Macy * i/o may be a ZIL write (via zil_commit()), or a read of an
608eda14cbcSMatt Macy * indirect block, or a read of a data block (if this is a
609eda14cbcSMatt Macy * partial-block write). We will indicate that the i/o is
6101f1e2261SMartin Matuska 		 * complete by calling zvol_end_io() from the taskq callback.
611eda14cbcSMatt Macy *
612eda14cbcSMatt Macy * This design allows the calling thread to continue and
613eda14cbcSMatt Macy * initiate more concurrent operations by calling
614eda14cbcSMatt Macy * zvol_request() again. There are typically only a small
615eda14cbcSMatt Macy * number of threads available to call zvol_request() (e.g.
616eda14cbcSMatt Macy * one per iSCSI target), so keeping the latency of
617eda14cbcSMatt Macy * zvol_request() low is important for performance.
618eda14cbcSMatt Macy *
619eda14cbcSMatt Macy * The zvol_request_sync module parameter allows this
620eda14cbcSMatt Macy * behavior to be altered, for performance evaluation
621eda14cbcSMatt Macy * purposes. If the callback blocks, setting
622eda14cbcSMatt Macy * zvol_request_sync=1 will result in much worse performance.
623eda14cbcSMatt Macy *
624eda14cbcSMatt Macy * We can have up to zvol_threads concurrent i/o's being
625eda14cbcSMatt Macy * processed for all zvols on the system. This is typically
626eda14cbcSMatt Macy * a vast improvement over the zvol_request_sync=1 behavior
627eda14cbcSMatt Macy * of one i/o at a time per zvol. However, an even better
628eda14cbcSMatt Macy * design would be for zvol_request() to initiate the zio
629eda14cbcSMatt Macy * directly, and then be notified by the zio_done callback,
6301f1e2261SMartin Matuska 		 * which would call zvol_end_io(). Unfortunately, the DMU/ZIL
631eda14cbcSMatt Macy * interfaces lack this functionality (they block waiting for
632eda14cbcSMatt Macy * the i/o to complete).
633eda14cbcSMatt Macy */
6346e6cde8fSMartin Matuska if (io_is_discard(bio, rq)) {
6351f1e2261SMartin Matuska if (force_sync) {
6369db44a8eSMartin Matuska zvol_discard(&zvr);
637eda14cbcSMatt Macy } else {
6389db44a8eSMartin Matuska task = zv_request_task_create(zvr);
6391719886fSMartin Matuska taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
6409db44a8eSMartin Matuska zvol_discard_task, task, 0, &task->ent);
641eda14cbcSMatt Macy }
642eda14cbcSMatt Macy } else {
6431f1e2261SMartin Matuska if (force_sync) {
6449db44a8eSMartin Matuska zvol_write(&zvr);
645eda14cbcSMatt Macy } else {
6469db44a8eSMartin Matuska task = zv_request_task_create(zvr);
6471719886fSMartin Matuska taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
6489db44a8eSMartin Matuska zvol_write_task, task, 0, &task->ent);
649eda14cbcSMatt Macy }
650eda14cbcSMatt Macy }
651eda14cbcSMatt Macy } else {
652eda14cbcSMatt Macy /*
653eda14cbcSMatt Macy * The SCST driver, and possibly others, may issue READ I/Os
654eda14cbcSMatt Macy * with a length of zero bytes. These empty I/Os contain no
655eda14cbcSMatt Macy * data and require no additional handling.
656eda14cbcSMatt Macy */
657eda14cbcSMatt Macy if (size == 0) {
6587a7741afSMartin Matuska zvol_end_io(bio, rq, 0);
659eda14cbcSMatt Macy goto out;
660eda14cbcSMatt Macy }
661eda14cbcSMatt Macy
662eda14cbcSMatt Macy rw_enter(&zv->zv_suspend_lock, RW_READER);
663eda14cbcSMatt Macy
664eda14cbcSMatt Macy /* See comment in WRITE case above. */
6651f1e2261SMartin Matuska if (force_sync) {
6669db44a8eSMartin Matuska zvol_read(&zvr);
667eda14cbcSMatt Macy } else {
6689db44a8eSMartin Matuska task = zv_request_task_create(zvr);
6691719886fSMartin Matuska taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
6709db44a8eSMartin Matuska zvol_read_task, task, 0, &task->ent);
671eda14cbcSMatt Macy }
672eda14cbcSMatt Macy }
673eda14cbcSMatt Macy
674eda14cbcSMatt Macy out:
675eda14cbcSMatt Macy spl_fstrans_unmark(cookie);
6761f1e2261SMartin Matuska }
6771f1e2261SMartin Matuska
6781f1e2261SMartin Matuska #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
6791f1e2261SMartin Matuska #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
6801f1e2261SMartin Matuska static void
6811f1e2261SMartin Matuska zvol_submit_bio(struct bio *bio)
6821f1e2261SMartin Matuska #else
6831f1e2261SMartin Matuska static blk_qc_t
6841f1e2261SMartin Matuska zvol_submit_bio(struct bio *bio)
6851f1e2261SMartin Matuska #endif
6861f1e2261SMartin Matuska #else
6871f1e2261SMartin Matuska static MAKE_REQUEST_FN_RET
6881f1e2261SMartin Matuska zvol_request(struct request_queue *q, struct bio *bio)
6891f1e2261SMartin Matuska #endif
6901f1e2261SMartin Matuska {
6911f1e2261SMartin Matuska #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
6921f1e2261SMartin Matuska #if defined(HAVE_BIO_BDEV_DISK)
6931f1e2261SMartin Matuska struct request_queue *q = bio->bi_bdev->bd_disk->queue;
6941f1e2261SMartin Matuska #else
6951f1e2261SMartin Matuska struct request_queue *q = bio->bi_disk->queue;
6961f1e2261SMartin Matuska #endif
6971f1e2261SMartin Matuska #endif
6981f1e2261SMartin Matuska zvol_state_t *zv = q->queuedata;
6991f1e2261SMartin Matuska
7001f1e2261SMartin Matuska zvol_request_impl(zv, bio, NULL, 0);
7011f1e2261SMartin Matuska #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
7021f1e2261SMartin Matuska defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
703681ce946SMartin Matuska !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
704eda14cbcSMatt Macy return (BLK_QC_T_NONE);
705eda14cbcSMatt Macy #endif
706eda14cbcSMatt Macy }
707eda14cbcSMatt Macy
708eda14cbcSMatt Macy static int
709315ee00fSMartin Matuska #ifdef HAVE_BLK_MODE_T
710315ee00fSMartin Matuska zvol_open(struct gendisk *disk, blk_mode_t flag)
711315ee00fSMartin Matuska #else
712eda14cbcSMatt Macy zvol_open(struct block_device *bdev, fmode_t flag)
713315ee00fSMartin Matuska #endif
714eda14cbcSMatt Macy {
715eda14cbcSMatt Macy zvol_state_t *zv;
716eda14cbcSMatt Macy int error = 0;
717e92ffd9bSMartin Matuska boolean_t drop_suspend = B_FALSE;
718681ce946SMartin Matuska #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
719681ce946SMartin Matuska hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
720681ce946SMartin Matuska hrtime_t start = gethrtime();
721eda14cbcSMatt Macy
722681ce946SMartin Matuska retry:
723681ce946SMartin Matuska #endif
72453a2e263SMartin Matuska
725315ee00fSMartin Matuska #ifdef HAVE_BLK_MODE_T
72653a2e263SMartin Matuska zv = atomic_load_ptr(&disk->private_data);
727315ee00fSMartin Matuska #else
72853a2e263SMartin Matuska zv = atomic_load_ptr(&bdev->bd_disk->private_data);
729315ee00fSMartin Matuska #endif
730eda14cbcSMatt Macy if (zv == NULL) {
731ce4dcb97SMartin Matuska return (-SET_ERROR(ENXIO));
732eda14cbcSMatt Macy }
733eda14cbcSMatt Macy
734e92ffd9bSMartin Matuska mutex_enter(&zv->zv_state_lock);
735ce4dcb97SMartin Matuska if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
736ce4dcb97SMartin Matuska mutex_exit(&zv->zv_state_lock);
737ce4dcb97SMartin Matuska return (-SET_ERROR(ENXIO));
738ce4dcb97SMartin Matuska }
739ce4dcb97SMartin Matuska
740e92ffd9bSMartin Matuska /*
741e92ffd9bSMartin Matuska * Make sure zvol is not suspended during first open
742e92ffd9bSMartin Matuska * (hold zv_suspend_lock) and respect proper lock acquisition
743e92ffd9bSMartin Matuska * ordering - zv_suspend_lock before zv_state_lock
744e92ffd9bSMartin Matuska */
745e92ffd9bSMartin Matuska if (zv->zv_open_count == 0) {
746e92ffd9bSMartin Matuska if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
747e92ffd9bSMartin Matuska mutex_exit(&zv->zv_state_lock);
74853a2e263SMartin Matuska
74953a2e263SMartin Matuska /*
75053a2e263SMartin Matuska * Removal may happen while the locks are down, so
75153a2e263SMartin Matuska * we can't trust zv any longer; we have to start over.
75253a2e263SMartin Matuska */
75353a2e263SMartin Matuska #ifdef HAVE_BLK_MODE_T
75453a2e263SMartin Matuska zv = atomic_load_ptr(&disk->private_data);
75553a2e263SMartin Matuska #else
75653a2e263SMartin Matuska zv = atomic_load_ptr(&bdev->bd_disk->private_data);
75753a2e263SMartin Matuska #endif
75853a2e263SMartin Matuska if (zv == NULL)
75953a2e263SMartin Matuska return (-SET_ERROR(ENXIO));
76053a2e263SMartin Matuska
761e92ffd9bSMartin Matuska rw_enter(&zv->zv_suspend_lock, RW_READER);
762e92ffd9bSMartin Matuska mutex_enter(&zv->zv_state_lock);
76353a2e263SMartin Matuska
76453a2e263SMartin Matuska if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
76553a2e263SMartin Matuska mutex_exit(&zv->zv_state_lock);
76653a2e263SMartin Matuska rw_exit(&zv->zv_suspend_lock);
76753a2e263SMartin Matuska return (-SET_ERROR(ENXIO));
76853a2e263SMartin Matuska }
76953a2e263SMartin Matuska
770e92ffd9bSMartin Matuska /* check to see if zv_suspend_lock is needed */
771e92ffd9bSMartin Matuska if (zv->zv_open_count != 0) {
772e92ffd9bSMartin Matuska rw_exit(&zv->zv_suspend_lock);
773e92ffd9bSMartin Matuska } else {
774e92ffd9bSMartin Matuska drop_suspend = B_TRUE;
775e92ffd9bSMartin Matuska }
776e92ffd9bSMartin Matuska } else {
777e92ffd9bSMartin Matuska drop_suspend = B_TRUE;
778e92ffd9bSMartin Matuska }
779e92ffd9bSMartin Matuska }
780e92ffd9bSMartin Matuska
781e92ffd9bSMartin Matuska ASSERT(MUTEX_HELD(&zv->zv_state_lock));
782e92ffd9bSMartin Matuska
783e92ffd9bSMartin Matuska if (zv->zv_open_count == 0) {
784e92ffd9bSMartin Matuska boolean_t drop_namespace = B_FALSE;
785e92ffd9bSMartin Matuska
786e92ffd9bSMartin Matuska ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
787e92ffd9bSMartin Matuska
788681ce946SMartin Matuska /*
789681ce946SMartin Matuska * In all other call paths the spa_namespace_lock is taken
790681ce946SMartin Matuska * before the bdev->bd_mutex lock. However, on open(2)
791681ce946SMartin Matuska * the __blkdev_get() function calls fops->open() with the
792681ce946SMartin Matuska * bdev->bd_mutex lock held. This can result in a deadlock
793681ce946SMartin Matuska * when zvols from one pool are used as vdevs in another.
794681ce946SMartin Matuska *
795681ce946SMartin Matuska * To prevent a lock inversion deadlock we preemptively
796681ce946SMartin Matuska * take the spa_namespace_lock. Normally the lock will not
797681ce946SMartin Matuska * be contended and this is safe because spa_open_common()
798681ce946SMartin Matuska * handles the case where the caller already holds the
799681ce946SMartin Matuska * spa_namespace_lock.
800681ce946SMartin Matuska *
801681ce946SMartin Matuska 		 * When the lock cannot be acquired after multiple retries
802681ce946SMartin Matuska * this must be the vdev on zvol deadlock case and we have
803681ce946SMartin Matuska * no choice but to return an error. For 5.12 and older
804681ce946SMartin Matuska * kernels returning -ERESTARTSYS will result in the
805681ce946SMartin Matuska * bdev->bd_mutex being dropped, then reacquired, and
806681ce946SMartin Matuska * fops->open() being called again. This process can be
807681ce946SMartin Matuska * repeated safely until both locks are acquired. For 5.13
808681ce946SMartin Matuska * and newer the -ERESTARTSYS retry logic was removed from
809681ce946SMartin Matuska * the kernel so the only option is to return the error for
810681ce946SMartin Matuska * the caller to handle it.
811681ce946SMartin Matuska */
812*8ac904ceSMartin Matuska if (!spa_namespace_held()) {
813*8ac904ceSMartin Matuska if (!spa_namespace_tryenter(FTAG)) {
814e92ffd9bSMartin Matuska mutex_exit(&zv->zv_state_lock);
815e92ffd9bSMartin Matuska rw_exit(&zv->zv_suspend_lock);
81675e1fea6SMartin Matuska drop_suspend = B_FALSE;
817681ce946SMartin Matuska
818681ce946SMartin Matuska #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
819681ce946SMartin Matuska schedule();
820ce4dcb97SMartin Matuska return (-SET_ERROR(ERESTARTSYS));
821681ce946SMartin Matuska #else
822681ce946SMartin Matuska if ((gethrtime() - start) > timeout)
823ce4dcb97SMartin Matuska return (-SET_ERROR(ERESTARTSYS));
824681ce946SMartin Matuska
825aca928a5SMartin Matuska schedule_timeout_interruptible(
826aca928a5SMartin Matuska MSEC_TO_TICK(10));
827681ce946SMartin Matuska goto retry;
828681ce946SMartin Matuska #endif
829681ce946SMartin Matuska } else {
830681ce946SMartin Matuska drop_namespace = B_TRUE;
831681ce946SMartin Matuska }
832681ce946SMartin Matuska }
833681ce946SMartin Matuska
834315ee00fSMartin Matuska error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));
835eda14cbcSMatt Macy
836681ce946SMartin Matuska if (drop_namespace)
837*8ac904ceSMartin Matuska spa_namespace_exit(FTAG);
838e92ffd9bSMartin Matuska }
839eda14cbcSMatt Macy
840e92ffd9bSMartin Matuska if (error == 0) {
841315ee00fSMartin Matuska if ((blk_mode_is_open_write(flag)) &&
842315ee00fSMartin Matuska (zv->zv_flags & ZVOL_RDONLY)) {
843eda14cbcSMatt Macy if (zv->zv_open_count == 0)
844eda14cbcSMatt Macy zvol_last_close(zv);
845eda14cbcSMatt Macy
846ce4dcb97SMartin Matuska error = -SET_ERROR(EROFS);
847e92ffd9bSMartin Matuska } else {
848e92ffd9bSMartin Matuska zv->zv_open_count++;
849e92ffd9bSMartin Matuska }
850e92ffd9bSMartin Matuska }
851e92ffd9bSMartin Matuska
852eda14cbcSMatt Macy mutex_exit(&zv->zv_state_lock);
853eda14cbcSMatt Macy if (drop_suspend)
854eda14cbcSMatt Macy rw_exit(&zv->zv_suspend_lock);
855681ce946SMartin Matuska
856e92ffd9bSMartin Matuska if (error == 0)
857315ee00fSMartin Matuska #ifdef HAVE_BLK_MODE_T
858315ee00fSMartin Matuska disk_check_media_change(disk);
859315ee00fSMartin Matuska #else
860e92ffd9bSMartin Matuska zfs_check_media_change(bdev);
861315ee00fSMartin Matuska #endif
862e92ffd9bSMartin Matuska
863e92ffd9bSMartin Matuska return (error);
864eda14cbcSMatt Macy }
865eda14cbcSMatt Macy
866eda14cbcSMatt Macy static void
867315ee00fSMartin Matuska #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
868315ee00fSMartin Matuska zvol_release(struct gendisk *disk)
869315ee00fSMartin Matuska #else
870315ee00fSMartin Matuska zvol_release(struct gendisk *disk, fmode_t unused)
871315ee00fSMartin Matuska #endif
872eda14cbcSMatt Macy {
873315ee00fSMartin Matuska #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
874315ee00fSMartin Matuska (void) unused;
875315ee00fSMartin Matuska #endif
876eda14cbcSMatt Macy boolean_t drop_suspend = B_TRUE;
877eda14cbcSMatt Macy
87853a2e263SMartin Matuska zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
87953a2e263SMartin Matuska if (zv == NULL)
88053a2e263SMartin Matuska return;
881eda14cbcSMatt Macy
882eda14cbcSMatt Macy mutex_enter(&zv->zv_state_lock);
8837877fdebSMatt Macy ASSERT3U(zv->zv_open_count, >, 0);
884eda14cbcSMatt Macy /*
885eda14cbcSMatt Macy * make sure zvol is not suspended during last close
886eda14cbcSMatt Macy * (hold zv_suspend_lock) and respect proper lock acquisition
887eda14cbcSMatt Macy * ordering - zv_suspend_lock before zv_state_lock
888eda14cbcSMatt Macy */
889eda14cbcSMatt Macy if (zv->zv_open_count == 1) {
890eda14cbcSMatt Macy if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
891eda14cbcSMatt Macy mutex_exit(&zv->zv_state_lock);
892eda14cbcSMatt Macy rw_enter(&zv->zv_suspend_lock, RW_READER);
893eda14cbcSMatt Macy mutex_enter(&zv->zv_state_lock);
89453a2e263SMartin Matuska
89553a2e263SMartin Matuska /*
89653a2e263SMartin Matuska * Unlike in zvol_open(), we don't check if removal
89753a2e263SMartin Matuska * started here, because we might be one of the openers
89853a2e263SMartin Matuska * that needs to be thrown out! If we're the last, we
89953a2e263SMartin Matuska * need to call zvol_last_close() below to finish
90053a2e263SMartin Matuska * cleanup. So, no special treatment for us.
90153a2e263SMartin Matuska */
90253a2e263SMartin Matuska
903eda14cbcSMatt Macy /* check to see if zv_suspend_lock is needed */
904eda14cbcSMatt Macy if (zv->zv_open_count != 1) {
905eda14cbcSMatt Macy rw_exit(&zv->zv_suspend_lock);
906eda14cbcSMatt Macy drop_suspend = B_FALSE;
907eda14cbcSMatt Macy }
908eda14cbcSMatt Macy }
909eda14cbcSMatt Macy } else {
910eda14cbcSMatt Macy drop_suspend = B_FALSE;
911eda14cbcSMatt Macy }
912eda14cbcSMatt Macy
913eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&zv->zv_state_lock));
914eda14cbcSMatt Macy
915eda14cbcSMatt Macy zv->zv_open_count--;
9167877fdebSMatt Macy if (zv->zv_open_count == 0) {
9177877fdebSMatt Macy ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
918eda14cbcSMatt Macy zvol_last_close(zv);
9197877fdebSMatt Macy }
920eda14cbcSMatt Macy
921eda14cbcSMatt Macy mutex_exit(&zv->zv_state_lock);
922eda14cbcSMatt Macy
923eda14cbcSMatt Macy if (drop_suspend)
924eda14cbcSMatt Macy rw_exit(&zv->zv_suspend_lock);
925eda14cbcSMatt Macy }
926eda14cbcSMatt Macy
927eda14cbcSMatt Macy static int
928eda14cbcSMatt Macy zvol_ioctl(struct block_device *bdev, fmode_t mode,
929eda14cbcSMatt Macy unsigned int cmd, unsigned long arg)
930eda14cbcSMatt Macy {
931eda14cbcSMatt Macy int error = 0;
932eda14cbcSMatt Macy
93353a2e263SMartin Matuska zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
93453a2e263SMartin Matuska ASSERT3P(zv, !=, NULL);
935eda14cbcSMatt Macy ASSERT3U(zv->zv_open_count, >, 0);
936eda14cbcSMatt Macy
937eda14cbcSMatt Macy switch (cmd) {
938eda14cbcSMatt Macy case BLKFLSBUF:
939abcdc1b9SMartin Matuska #ifdef HAVE_FSYNC_BDEV
940eda14cbcSMatt Macy fsync_bdev(bdev);
941abcdc1b9SMartin Matuska #elif defined(HAVE_SYNC_BLOCKDEV)
942abcdc1b9SMartin Matuska sync_blockdev(bdev);
943abcdc1b9SMartin Matuska #else
944abcdc1b9SMartin Matuska #error "Neither fsync_bdev() nor sync_blockdev() found"
945abcdc1b9SMartin Matuska #endif
946eda14cbcSMatt Macy invalidate_bdev(bdev);
947eda14cbcSMatt Macy rw_enter(&zv->zv_suspend_lock, RW_READER);
948eda14cbcSMatt Macy
949eda14cbcSMatt Macy if (!(zv->zv_flags & ZVOL_RDONLY))
950eda14cbcSMatt Macy txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
951eda14cbcSMatt Macy
952eda14cbcSMatt Macy rw_exit(&zv->zv_suspend_lock);
953eda14cbcSMatt Macy break;
954eda14cbcSMatt Macy
955eda14cbcSMatt Macy case BLKZNAME:
956eda14cbcSMatt Macy mutex_enter(&zv->zv_state_lock);
957d0abb9a6SMartin Matuska error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
958eda14cbcSMatt Macy mutex_exit(&zv->zv_state_lock);
959d0abb9a6SMartin Matuska if (error)
960d0abb9a6SMartin Matuska error = SET_ERROR(error);
961eda14cbcSMatt Macy break;
962eda14cbcSMatt Macy
963eda14cbcSMatt Macy default:
964d0abb9a6SMartin Matuska error = SET_ERROR(ENOTTY);
965eda14cbcSMatt Macy break;
966eda14cbcSMatt Macy }
967eda14cbcSMatt Macy
968d0abb9a6SMartin Matuska return (-error);
969eda14cbcSMatt Macy }
970eda14cbcSMatt Macy
971eda14cbcSMatt Macy #ifdef CONFIG_COMPAT
972eda14cbcSMatt Macy static int
973eda14cbcSMatt Macy zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
974eda14cbcSMatt Macy unsigned cmd, unsigned long arg)
975eda14cbcSMatt Macy {
976eda14cbcSMatt Macy return (zvol_ioctl(bdev, mode, cmd, arg));
977eda14cbcSMatt Macy }
978eda14cbcSMatt Macy #else
979eda14cbcSMatt Macy #define zvol_compat_ioctl NULL
980eda14cbcSMatt Macy #endif
981eda14cbcSMatt Macy
982eda14cbcSMatt Macy static unsigned int
983eda14cbcSMatt Macy zvol_check_events(struct gendisk *disk, unsigned int clearing)
984eda14cbcSMatt Macy {
985eda14cbcSMatt Macy unsigned int mask = 0;
986eda14cbcSMatt Macy
98753a2e263SMartin Matuska zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
988eda14cbcSMatt Macy
989eda14cbcSMatt Macy if (zv != NULL) {
990eda14cbcSMatt Macy mutex_enter(&zv->zv_state_lock);
991eda14cbcSMatt Macy mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
992eda14cbcSMatt Macy zv->zv_changed = 0;
993eda14cbcSMatt Macy mutex_exit(&zv->zv_state_lock);
994eda14cbcSMatt Macy }
995eda14cbcSMatt Macy
996eda14cbcSMatt Macy return (mask);
997eda14cbcSMatt Macy }
998eda14cbcSMatt Macy
999eda14cbcSMatt Macy static int
1000eda14cbcSMatt Macy zvol_revalidate_disk(struct gendisk *disk)
1001eda14cbcSMatt Macy {
100253a2e263SMartin Matuska zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
1003eda14cbcSMatt Macy
1004eda14cbcSMatt Macy if (zv != NULL) {
1005eda14cbcSMatt Macy mutex_enter(&zv->zv_state_lock);
1006eda14cbcSMatt Macy set_capacity(zv->zv_zso->zvo_disk,
1007eda14cbcSMatt Macy zv->zv_volsize >> SECTOR_BITS);
1008eda14cbcSMatt Macy mutex_exit(&zv->zv_state_lock);
1009eda14cbcSMatt Macy }
1010eda14cbcSMatt Macy
1011eda14cbcSMatt Macy return (0);
1012eda14cbcSMatt Macy }
1013eda14cbcSMatt Macy
1014c03c5b1cSMartin Matuska int
1015c03c5b1cSMartin Matuska zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
1016eda14cbcSMatt Macy {
10177877fdebSMatt Macy struct gendisk *disk = zv->zv_zso->zvo_disk;
1018eda14cbcSMatt Macy
10197877fdebSMatt Macy #if defined(HAVE_REVALIDATE_DISK_SIZE)
10207877fdebSMatt Macy revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
10217877fdebSMatt Macy #elif defined(HAVE_REVALIDATE_DISK)
10227877fdebSMatt Macy revalidate_disk(disk);
10237877fdebSMatt Macy #else
10247877fdebSMatt Macy zvol_revalidate_disk(disk);
10257877fdebSMatt Macy #endif
1026eda14cbcSMatt Macy return (0);
1027eda14cbcSMatt Macy }
1028eda14cbcSMatt Macy
1029eda14cbcSMatt Macy /*
1030eda14cbcSMatt Macy * Provide a simple virtual geometry for legacy compatibility. For devices
1031eda14cbcSMatt Macy * smaller than 1 MiB a small head and sector count is used to allow very
1032eda14cbcSMatt Macy  * tiny devices. For devices over 1 MiB a standard head and sector count
1033eda14cbcSMatt Macy * is used to keep the cylinders count reasonable.
1034eda14cbcSMatt Macy */
1035e6e941e6SMartin Matuska static inline int
1036e6e941e6SMartin Matuska zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo)
1037eda14cbcSMatt Macy {
1038e6e941e6SMartin Matuska zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
1039eda14cbcSMatt Macy sector_t sectors;
1040eda14cbcSMatt Macy
104153a2e263SMartin Matuska ASSERT3P(zv, !=, NULL);
1042eda14cbcSMatt Macy ASSERT3U(zv->zv_open_count, >, 0);
1043eda14cbcSMatt Macy
1044eda14cbcSMatt Macy sectors = get_capacity(zv->zv_zso->zvo_disk);
1045eda14cbcSMatt Macy
1046eda14cbcSMatt Macy if (sectors > 2048) {
1047eda14cbcSMatt Macy geo->heads = 16;
1048eda14cbcSMatt Macy geo->sectors = 63;
1049eda14cbcSMatt Macy } else {
1050eda14cbcSMatt Macy geo->heads = 2;
1051eda14cbcSMatt Macy geo->sectors = 4;
1052eda14cbcSMatt Macy }
1053eda14cbcSMatt Macy
1054eda14cbcSMatt Macy geo->start = 0;
1055eda14cbcSMatt Macy geo->cylinders = sectors / (geo->heads * geo->sectors);
1056eda14cbcSMatt Macy
1057eda14cbcSMatt Macy return (0);
1058eda14cbcSMatt Macy }
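/*
 * Worked example (illustrative): a 1 GiB zvol has 2097152 512-byte sectors,
 * so it reports geo->heads = 16, geo->sectors = 63 and
 * geo->cylinders = 2097152 / (16 * 63) = 2080.
 */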
1059eda14cbcSMatt Macy
1060e6e941e6SMartin Matuska #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK
1061e6e941e6SMartin Matuska static int
1062e6e941e6SMartin Matuska zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo)
1063e6e941e6SMartin Matuska {
1064e6e941e6SMartin Matuska return (zvol_getgeo_impl(disk, geo));
1065e6e941e6SMartin Matuska }
1066e6e941e6SMartin Matuska #else
1067e6e941e6SMartin Matuska static int
1068e6e941e6SMartin Matuska zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1069e6e941e6SMartin Matuska {
1070e6e941e6SMartin Matuska return (zvol_getgeo_impl(bdev->bd_disk, geo));
1071e6e941e6SMartin Matuska }
1072e6e941e6SMartin Matuska #endif
1073e6e941e6SMartin Matuska
10741f1e2261SMartin Matuska /*
10751f1e2261SMartin Matuska * Why have two separate block_device_operations structs?
10761f1e2261SMartin Matuska *
10771f1e2261SMartin Matuska * Normally we'd just have one, and assign 'submit_bio' as needed. However,
10781f1e2261SMartin Matuska * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
10791f1e2261SMartin Matuska * can't just change submit_bio dynamically at runtime. So just create two
10801f1e2261SMartin Matuska * separate structs to get around this.
10811f1e2261SMartin Matuska */
10821f1e2261SMartin Matuska static const struct block_device_operations zvol_ops_blk_mq = {
10831f1e2261SMartin Matuska .open = zvol_open,
10841f1e2261SMartin Matuska .release = zvol_release,
10851f1e2261SMartin Matuska .ioctl = zvol_ioctl,
10861f1e2261SMartin Matuska .compat_ioctl = zvol_compat_ioctl,
10871f1e2261SMartin Matuska .check_events = zvol_check_events,
10881f1e2261SMartin Matuska #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
10891f1e2261SMartin Matuska .revalidate_disk = zvol_revalidate_disk,
10901f1e2261SMartin Matuska #endif
10911f1e2261SMartin Matuska .getgeo = zvol_getgeo,
10921f1e2261SMartin Matuska .owner = THIS_MODULE,
10931f1e2261SMartin Matuska };
10941f1e2261SMartin Matuska
1095e92ffd9bSMartin Matuska static const struct block_device_operations zvol_ops = {
1096eda14cbcSMatt Macy .open = zvol_open,
1097eda14cbcSMatt Macy .release = zvol_release,
1098eda14cbcSMatt Macy .ioctl = zvol_ioctl,
1099eda14cbcSMatt Macy .compat_ioctl = zvol_compat_ioctl,
1100eda14cbcSMatt Macy .check_events = zvol_check_events,
110116038816SMartin Matuska #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
1102eda14cbcSMatt Macy .revalidate_disk = zvol_revalidate_disk,
110316038816SMartin Matuska #endif
1104eda14cbcSMatt Macy .getgeo = zvol_getgeo,
1105eda14cbcSMatt Macy .owner = THIS_MODULE,
1106eda14cbcSMatt Macy #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
1107eda14cbcSMatt Macy .submit_bio = zvol_submit_bio,
1108eda14cbcSMatt Macy #endif
1109eda14cbcSMatt Macy };
1110eda14cbcSMatt Macy
111129dc9349SMartin Matuska /*
111229dc9349SMartin Matuska * Since 6.9, Linux has been removing queue limit setters in favour of an
111329dc9349SMartin Matuska * initial queue_limits struct applied when the device is open. Since 6.11,
111429dc9349SMartin Matuska * queue_limits is being extended to allow more things to be applied when the
111529dc9349SMartin Matuska * device is open. Setters are also being removed for this.
111629dc9349SMartin Matuska *
111729dc9349SMartin Matuska * For OpenZFS, this means that depending on kernel version, some options may
111829dc9349SMartin Matuska * be set up before the device is open, and some applied to an open device
111929dc9349SMartin Matuska * (queue) after the fact.
112029dc9349SMartin Matuska *
112129dc9349SMartin Matuska * We manage this complexity by having our own limits struct,
112229dc9349SMartin Matuska * zvol_queue_limits_t, in which we carry any queue config that we're
112329dc9349SMartin Matuska * interested in setting. This structure is the same on all kernels.
112429dc9349SMartin Matuska *
112529dc9349SMartin Matuska * These limits are then applied to the queue at device open time by the most
112629dc9349SMartin Matuska * appropriate method for the kernel.
112729dc9349SMartin Matuska *
112829dc9349SMartin Matuska * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
112929dc9349SMartin Matuska * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
113029dc9349SMartin Matuska * struct queue_limits, and passes it in. Any fields added in later kernels are
113129dc9349SMartin Matuska * (obviously) not set up here.
113229dc9349SMartin Matuska *
113329dc9349SMartin Matuska * zvol_queue_limits_apply() is called on all kernel versions after the queue
113429dc9349SMartin Matuska * is created, and applies any remaining config. Before 6.9 that will be
113529dc9349SMartin Matuska * everything, via setter methods. After 6.9 that will be whatever couldn't be
113629dc9349SMartin Matuska * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
113729dc9349SMartin Matuska * will always be a no-op on the latest kernel we support).
113829dc9349SMartin Matuska */
113975e1fea6SMartin Matuska typedef struct zvol_queue_limits {
114075e1fea6SMartin Matuska unsigned int zql_max_hw_sectors;
114175e1fea6SMartin Matuska unsigned short zql_max_segments;
114275e1fea6SMartin Matuska unsigned int zql_max_segment_size;
114375e1fea6SMartin Matuska unsigned int zql_io_opt;
114429dc9349SMartin Matuska unsigned int zql_physical_block_size;
114529dc9349SMartin Matuska unsigned int zql_max_discard_sectors;
114629dc9349SMartin Matuska unsigned int zql_discard_granularity;
114775e1fea6SMartin Matuska } zvol_queue_limits_t;
114875e1fea6SMartin Matuska
114975e1fea6SMartin Matuska static void
115075e1fea6SMartin Matuska zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
115175e1fea6SMartin Matuska boolean_t use_blk_mq)
115275e1fea6SMartin Matuska {
115375e1fea6SMartin Matuska limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;
115475e1fea6SMartin Matuska
115575e1fea6SMartin Matuska if (use_blk_mq) {
115675e1fea6SMartin Matuska /*
115775e1fea6SMartin Matuska * IO requests can be really big (1MB). When an IO request
115875e1fea6SMartin Matuska * comes in, it is passed off to zvol_read() or zvol_write()
115975e1fea6SMartin Matuska * in a new thread, where it is chunked up into 'volblocksize'
116075e1fea6SMartin Matuska * sized pieces and processed. So for example, if the request
116175e1fea6SMartin Matuska * is a 1MB write and your volblocksize is 128k, one zvol_write
116275e1fea6SMartin Matuska  * thread will take that request and sequentially do eight 128k
116375e1fea6SMartin Matuska * IOs. This is due to the fact that the thread needs to lock
116475e1fea6SMartin Matuska * each volblocksize sized block. So you might be wondering:
116575e1fea6SMartin Matuska * "instead of passing the whole 1MB request to one thread,
116675e1fea6SMartin Matuska * why not pass ten individual 128k chunks to ten threads and
116775e1fea6SMartin Matuska * process the whole write in parallel?" The short answer is
116875e1fea6SMartin Matuska * that there's a sweet spot number of chunks that balances
116975e1fea6SMartin Matuska * the greater parallelism with the added overhead of more
117075e1fea6SMartin Matuska * threads. The sweet spot can be different depending on if you
117175e1fea6SMartin Matuska * have a read or write heavy workload. Writes typically want
117275e1fea6SMartin Matuska * high chunk counts while reads typically want lower ones. On
117375e1fea6SMartin Matuska * a test pool with 6 NVMe drives in a 3x 2-disk mirror
117475e1fea6SMartin Matuska * configuration, with volblocksize=8k, the sweet spot for good
117575e1fea6SMartin Matuska * sequential reads and writes was at 8 chunks.
117675e1fea6SMartin Matuska */
117775e1fea6SMartin Matuska
117875e1fea6SMartin Matuska /*
117975e1fea6SMartin Matuska * Below we tell the kernel how big we want our requests
118075e1fea6SMartin Matuska * to be. You would think that blk_queue_io_opt() would be
118175e1fea6SMartin Matuska * used to do this since it is used to "set optimal request
118275e1fea6SMartin Matuska * size for the queue", but that doesn't seem to do
118375e1fea6SMartin Matuska * anything - the kernel still gives you huge requests
118475e1fea6SMartin Matuska * with tons of little PAGE_SIZE segments contained within it.
118575e1fea6SMartin Matuska *
118675e1fea6SMartin Matuska * Knowing that the kernel will just give you PAGE_SIZE segments
118775e1fea6SMartin Matuska * no matter what, you can say "ok, I want PAGE_SIZE byte
118875e1fea6SMartin Matuska * segments, and I want 'N' of them per request", where N is
118975e1fea6SMartin Matuska * the correct number of segments for the volblocksize and
119075e1fea6SMartin Matuska * number of chunks you want.
119175e1fea6SMartin Matuska */
119275e1fea6SMartin Matuska if (zvol_blk_mq_blocks_per_thread != 0) {
119375e1fea6SMartin Matuska unsigned int chunks;
119475e1fea6SMartin Matuska chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
119575e1fea6SMartin Matuska
119675e1fea6SMartin Matuska limits->zql_max_segment_size = PAGE_SIZE;
119775e1fea6SMartin Matuska limits->zql_max_segments =
119875e1fea6SMartin Matuska (zv->zv_volblocksize * chunks) / PAGE_SIZE;
119975e1fea6SMartin Matuska } else {
120075e1fea6SMartin Matuska /*
120175e1fea6SMartin Matuska * Special case: zvol_blk_mq_blocks_per_thread = 0
120275e1fea6SMartin Matuska * Max everything out.
120375e1fea6SMartin Matuska */
120475e1fea6SMartin Matuska limits->zql_max_segments = UINT16_MAX;
120575e1fea6SMartin Matuska limits->zql_max_segment_size = UINT_MAX;
120675e1fea6SMartin Matuska }
120775e1fea6SMartin Matuska } else {
120875e1fea6SMartin Matuska limits->zql_max_segments = UINT16_MAX;
120975e1fea6SMartin Matuska limits->zql_max_segment_size = UINT_MAX;
121075e1fea6SMartin Matuska }
121175e1fea6SMartin Matuska
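	/*
	 * The remaining limits apply to both the blk-mq and bio paths:
	 * advertise a large optimal I/O size, report volblocksize as the
	 * physical block size and discard granularity, and cap a single
	 * discard at zvol_max_discard_blocks blocks.
	 */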
1212718519f4SMartin Matuska limits->zql_io_opt = DMU_MAX_ACCESS / 2;
121329dc9349SMartin Matuska
121429dc9349SMartin Matuska limits->zql_physical_block_size = zv->zv_volblocksize;
121529dc9349SMartin Matuska limits->zql_max_discard_sectors =
121629dc9349SMartin Matuska (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
121729dc9349SMartin Matuska limits->zql_discard_granularity = zv->zv_volblocksize;
121875e1fea6SMartin Matuska }
121975e1fea6SMartin Matuska
122075e1fea6SMartin Matuska #ifdef HAVE_BLK_ALLOC_DISK_2ARG
122175e1fea6SMartin Matuska static void
122275e1fea6SMartin Matuska zvol_queue_limits_convert(zvol_queue_limits_t *limits,
122375e1fea6SMartin Matuska struct queue_limits *qlimits)
122475e1fea6SMartin Matuska {
122575e1fea6SMartin Matuska memset(qlimits, 0, sizeof (struct queue_limits));
122675e1fea6SMartin Matuska qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
122775e1fea6SMartin Matuska qlimits->max_segments = limits->zql_max_segments;
122875e1fea6SMartin Matuska qlimits->max_segment_size = limits->zql_max_segment_size;
122975e1fea6SMartin Matuska qlimits->io_opt = limits->zql_io_opt;
123029dc9349SMartin Matuska qlimits->physical_block_size = limits->zql_physical_block_size;
123129dc9349SMartin Matuska qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
1232e2df9bb4SMartin Matuska qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
123329dc9349SMartin Matuska qlimits->discard_granularity = limits->zql_discard_granularity;
123429dc9349SMartin Matuska #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
123529dc9349SMartin Matuska qlimits->features =
123629dc9349SMartin Matuska BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
123729dc9349SMartin Matuska #endif
123875e1fea6SMartin Matuska }
123929dc9349SMartin Matuska #endif
124029dc9349SMartin Matuska
124175e1fea6SMartin Matuska static void
124275e1fea6SMartin Matuska zvol_queue_limits_apply(zvol_queue_limits_t *limits,
124375e1fea6SMartin Matuska struct request_queue *queue)
124475e1fea6SMartin Matuska {
124529dc9349SMartin Matuska #ifndef HAVE_BLK_ALLOC_DISK_2ARG
124675e1fea6SMartin Matuska blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
124775e1fea6SMartin Matuska blk_queue_max_segments(queue, limits->zql_max_segments);
124875e1fea6SMartin Matuska blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
124975e1fea6SMartin Matuska blk_queue_io_opt(queue, limits->zql_io_opt);
125029dc9349SMartin Matuska blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
125129dc9349SMartin Matuska blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
125229dc9349SMartin Matuska blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
125375e1fea6SMartin Matuska #endif
125429dc9349SMartin Matuska #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
125529dc9349SMartin Matuska blk_queue_set_write_cache(queue, B_TRUE);
125629dc9349SMartin Matuska blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
125729dc9349SMartin Matuska #endif
125829dc9349SMartin Matuska }
125975e1fea6SMartin Matuska
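/*
 * Allocate the gendisk and request queue for the non-blk-mq (bio) path.
 * Depending on kernel version this is a single blk_alloc_disk() call (one-
 * or two-argument form) or separate disk and queue allocations; any
 * remaining queue limits are applied afterwards.
 */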
12601f1e2261SMartin Matuska static int
126175e1fea6SMartin Matuska zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
12621f1e2261SMartin Matuska {
12631f1e2261SMartin Matuska #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
12641f1e2261SMartin Matuska #if defined(HAVE_BLK_ALLOC_DISK)
12651f1e2261SMartin Matuska zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
12661f1e2261SMartin Matuska if (zso->zvo_disk == NULL)
12671f1e2261SMartin Matuska return (1);
12681f1e2261SMartin Matuska
12691f1e2261SMartin Matuska zso->zvo_disk->minors = ZVOL_MINORS;
12701f1e2261SMartin Matuska zso->zvo_queue = zso->zvo_disk->queue;
12711719886fSMartin Matuska #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
127275e1fea6SMartin Matuska struct queue_limits qlimits;
127375e1fea6SMartin Matuska zvol_queue_limits_convert(limits, &qlimits);
127475e1fea6SMartin Matuska struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
12751719886fSMartin Matuska if (IS_ERR(disk)) {
12761719886fSMartin Matuska zso->zvo_disk = NULL;
12771719886fSMartin Matuska return (1);
12781719886fSMartin Matuska }
12791719886fSMartin Matuska
12801719886fSMartin Matuska zso->zvo_disk = disk;
12811719886fSMartin Matuska zso->zvo_disk->minors = ZVOL_MINORS;
12821719886fSMartin Matuska zso->zvo_queue = zso->zvo_disk->queue;
1283e2df9bb4SMartin Matuska
12841f1e2261SMartin Matuska #else
12851f1e2261SMartin Matuska zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
12861f1e2261SMartin Matuska if (zso->zvo_queue == NULL)
12871f1e2261SMartin Matuska return (1);
12881f1e2261SMartin Matuska
12891f1e2261SMartin Matuska zso->zvo_disk = alloc_disk(ZVOL_MINORS);
12901f1e2261SMartin Matuska if (zso->zvo_disk == NULL) {
12911f1e2261SMartin Matuska blk_cleanup_queue(zso->zvo_queue);
12921f1e2261SMartin Matuska return (1);
12931f1e2261SMartin Matuska }
12941f1e2261SMartin Matuska
12951f1e2261SMartin Matuska zso->zvo_disk->queue = zso->zvo_queue;
12961f1e2261SMartin Matuska #endif /* HAVE_BLK_ALLOC_DISK */
12971f1e2261SMartin Matuska #else
12981f1e2261SMartin Matuska zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
12991f1e2261SMartin Matuska if (zso->zvo_queue == NULL)
13001f1e2261SMartin Matuska return (1);
13011f1e2261SMartin Matuska
13021f1e2261SMartin Matuska zso->zvo_disk = alloc_disk(ZVOL_MINORS);
13031f1e2261SMartin Matuska if (zso->zvo_disk == NULL) {
13041f1e2261SMartin Matuska blk_cleanup_queue(zso->zvo_queue);
13051f1e2261SMartin Matuska return (1);
13061f1e2261SMartin Matuska }
13071f1e2261SMartin Matuska
13081f1e2261SMartin Matuska zso->zvo_disk->queue = zso->zvo_queue;
13091f1e2261SMartin Matuska #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
131029dc9349SMartin Matuska
131129dc9349SMartin Matuska zvol_queue_limits_apply(limits, zso->zvo_queue);
131229dc9349SMartin Matuska
13131f1e2261SMartin Matuska return (0);
13141f1e2261SMartin Matuska
13151f1e2261SMartin Matuska }
13161f1e2261SMartin Matuska
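/*
 * Allocate the blk-mq tag set and the gendisk/request queue for the blk-mq
 * path, then apply whatever queue limits could not be passed in at
 * allocation time.
 */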
13171f1e2261SMartin Matuska static int
131875e1fea6SMartin Matuska zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
13191f1e2261SMartin Matuska {
13201f1e2261SMartin Matuska struct zvol_state_os *zso = zv->zv_zso;
13211f1e2261SMartin Matuska
13221f1e2261SMartin Matuska /* Allocate our blk-mq tag_set */
13231f1e2261SMartin Matuska if (zvol_blk_mq_alloc_tag_set(zv) != 0)
13241f1e2261SMartin Matuska return (1);
13251f1e2261SMartin Matuska
13261f1e2261SMartin Matuska #if defined(HAVE_BLK_ALLOC_DISK)
13271f1e2261SMartin Matuska zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
13281f1e2261SMartin Matuska if (zso->zvo_disk == NULL) {
13291f1e2261SMartin Matuska blk_mq_free_tag_set(&zso->tag_set);
13301f1e2261SMartin Matuska return (1);
13311f1e2261SMartin Matuska }
13321f1e2261SMartin Matuska zso->zvo_queue = zso->zvo_disk->queue;
13331f1e2261SMartin Matuska zso->zvo_disk->minors = ZVOL_MINORS;
13341719886fSMartin Matuska #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
133575e1fea6SMartin Matuska struct queue_limits qlimits;
133675e1fea6SMartin Matuska zvol_queue_limits_convert(limits, &qlimits);
133775e1fea6SMartin Matuska struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
13381719886fSMartin Matuska if (IS_ERR(disk)) {
13391719886fSMartin Matuska zso->zvo_disk = NULL;
13401719886fSMartin Matuska blk_mq_free_tag_set(&zso->tag_set);
13411719886fSMartin Matuska return (1);
13421719886fSMartin Matuska }
13431719886fSMartin Matuska
13441719886fSMartin Matuska zso->zvo_disk = disk;
13451719886fSMartin Matuska zso->zvo_queue = zso->zvo_disk->queue;
13461719886fSMartin Matuska zso->zvo_disk->minors = ZVOL_MINORS;
13471f1e2261SMartin Matuska #else
13481f1e2261SMartin Matuska zso->zvo_disk = alloc_disk(ZVOL_MINORS);
13491f1e2261SMartin Matuska if (zso->zvo_disk == NULL) {
13501f1e2261SMartin Matuska blk_cleanup_queue(zso->zvo_queue);
13511f1e2261SMartin Matuska blk_mq_free_tag_set(&zso->tag_set);
13521f1e2261SMartin Matuska return (1);
13531f1e2261SMartin Matuska }
13541f1e2261SMartin Matuska /* Allocate queue */
13551f1e2261SMartin Matuska zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
13561f1e2261SMartin Matuska if (IS_ERR(zso->zvo_queue)) {
13571f1e2261SMartin Matuska blk_mq_free_tag_set(&zso->tag_set);
13581f1e2261SMartin Matuska return (1);
13591f1e2261SMartin Matuska }
13601f1e2261SMartin Matuska
13611f1e2261SMartin Matuska /* Our queue is now created, assign it to our disk */
13621f1e2261SMartin Matuska zso->zvo_disk->queue = zso->zvo_queue;
136329dc9349SMartin Matuska #endif
13641f1e2261SMartin Matuska
136529dc9349SMartin Matuska zvol_queue_limits_apply(limits, zso->zvo_queue);
136629dc9349SMartin Matuska
13671f1e2261SMartin Matuska return (0);
13681f1e2261SMartin Matuska }
13691f1e2261SMartin Matuska
1370eda14cbcSMatt Macy /*
1371eda14cbcSMatt Macy * Allocate memory for a new zvol_state_t and setup the required
1372eda14cbcSMatt Macy * request queue and generic disk structures for the block device.
1373eda14cbcSMatt Macy */
1374df58e8b1SMartin Matuska static int
1375df58e8b1SMartin Matuska zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize,
1376df58e8b1SMartin Matuska zvol_state_t **zvp)
1377eda14cbcSMatt Macy {
1378eda14cbcSMatt Macy zvol_state_t *zv;
1379eda14cbcSMatt Macy struct zvol_state_os *zso;
1380eda14cbcSMatt Macy uint64_t volmode;
13811f1e2261SMartin Matuska int ret;
1382eda14cbcSMatt Macy
1383df58e8b1SMartin Matuska ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL);
1384df58e8b1SMartin Matuska if (ret)
1385df58e8b1SMartin Matuska return (ret);
1386eda14cbcSMatt Macy
1387eda14cbcSMatt Macy if (volmode == ZFS_VOLMODE_DEFAULT)
1388eda14cbcSMatt Macy volmode = zvol_volmode;
1389eda14cbcSMatt Macy
1390eda14cbcSMatt Macy if (volmode == ZFS_VOLMODE_NONE)
1391df58e8b1SMartin Matuska return (0);
1392eda14cbcSMatt Macy
1393eda14cbcSMatt Macy zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
1394eda14cbcSMatt Macy zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
1395eda14cbcSMatt Macy zv->zv_zso = zso;
13967877fdebSMatt Macy zv->zv_volmode = volmode;
1397df58e8b1SMartin Matuska zv->zv_volsize = volsize;
1398e2df9bb4SMartin Matuska zv->zv_volblocksize = volblocksize;
1399eda14cbcSMatt Macy
1400eda14cbcSMatt Macy list_link_init(&zv->zv_next);
1401eda14cbcSMatt Macy mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
1402ce4dcb97SMartin Matuska cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
1403eda14cbcSMatt Macy
14041f1e2261SMartin Matuska zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
14051f88aa09SMartin Matuska
140675e1fea6SMartin Matuska zvol_queue_limits_t limits;
140775e1fea6SMartin Matuska zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);
140875e1fea6SMartin Matuska
14091f1e2261SMartin Matuska /*
14101f1e2261SMartin Matuska * The block layer has 3 interfaces for getting BIOs:
14111f1e2261SMartin Matuska *
14121f1e2261SMartin Matuska * 1. blk-mq request queues (new)
14131f1e2261SMartin Matuska * 2. submit_bio() (oldest)
14141f1e2261SMartin Matuska * 3. regular request queues (old).
14151f1e2261SMartin Matuska *
14161f1e2261SMartin Matuska * Each of those interfaces has two permutations:
14171f1e2261SMartin Matuska *
14181f1e2261SMartin Matuska * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
14191f1e2261SMartin Matuska * both the disk and its queue (5.14 kernel or newer)
14201f1e2261SMartin Matuska *
14211f1e2261SMartin Matuska * b) We don't have blk_*alloc_disk(), and have to allocate the
14221f1e2261SMartin Matuska * disk and the queue separately. (5.13 kernel or older)
14231f1e2261SMartin Matuska */
14241f1e2261SMartin Matuska if (zv->zv_zso->use_blk_mq) {
142575e1fea6SMartin Matuska ret = zvol_alloc_blk_mq(zv, &limits);
14263747329bSMartin Matuska if (ret != 0)
14273747329bSMartin Matuska goto out_kmem;
14281f1e2261SMartin Matuska zso->zvo_disk->fops = &zvol_ops_blk_mq;
14291f1e2261SMartin Matuska } else {
143075e1fea6SMartin Matuska ret = zvol_alloc_non_blk_mq(zso, &limits);
14311f1e2261SMartin Matuska if (ret != 0)
14321f88aa09SMartin Matuska goto out_kmem;
14333747329bSMartin Matuska zso->zvo_disk->fops = &zvol_ops;
14343747329bSMartin Matuska }
14351f88aa09SMartin Matuska
1436eda14cbcSMatt Macy /* Limit read-ahead to a single page to prevent over-prefetching. */
1437eda14cbcSMatt Macy blk_queue_set_read_ahead(zso->zvo_queue, 1);
1438eda14cbcSMatt Macy
14391f1e2261SMartin Matuska if (!zv->zv_zso->use_blk_mq) {
1440eda14cbcSMatt Macy /* Disable write merging in favor of the ZIO pipeline. */
1441eda14cbcSMatt Macy blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
14421f1e2261SMartin Matuska }
1443eda14cbcSMatt Macy
1444eda14cbcSMatt Macy zso->zvo_queue->queuedata = zv;
1445eda14cbcSMatt Macy zso->zvo_dev = dev;
1446eda14cbcSMatt Macy zv->zv_open_count = 0;
14472276e539SMartin Matuska strlcpy(zv->zv_name, name, sizeof (zv->zv_name));
1448eda14cbcSMatt Macy
1449eda14cbcSMatt Macy zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
1450eda14cbcSMatt Macy rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
1451eda14cbcSMatt Macy
1452eda14cbcSMatt Macy zso->zvo_disk->major = zvol_major;
1453eda14cbcSMatt Macy zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
1454eda14cbcSMatt Macy
1455eda14cbcSMatt Macy /*
1456716fd348SMartin Matuska * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
1457716fd348SMartin Matuska * This is accomplished by limiting the number of minors for the
1458716fd348SMartin Matuska * device to one and explicitly disabling partition scanning.
1459eda14cbcSMatt Macy */
1460716fd348SMartin Matuska if (volmode == ZFS_VOLMODE_DEV) {
1461eda14cbcSMatt Macy zso->zvo_disk->minors = 1;
14627a7741afSMartin Matuska zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
14637a7741afSMartin Matuska zso->zvo_disk->flags |= GENHD_FL_NO_PART;
1464eda14cbcSMatt Macy }
1465716fd348SMartin Matuska
1466eda14cbcSMatt Macy zso->zvo_disk->first_minor = (dev & MINORMASK);
1467eda14cbcSMatt Macy zso->zvo_disk->private_data = zv;
1468eda14cbcSMatt Macy snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
1469eda14cbcSMatt Macy ZVOL_DEV_NAME, (dev & MINORMASK));
1470eda14cbcSMatt Macy
1471df58e8b1SMartin Matuska *zvp = zv;
1472df58e8b1SMartin Matuska return (ret);
1473eda14cbcSMatt Macy
1474eda14cbcSMatt Macy out_kmem:
1475eda14cbcSMatt Macy kmem_free(zso, sizeof (struct zvol_state_os));
1476eda14cbcSMatt Macy kmem_free(zv, sizeof (zvol_state_t));
1477df58e8b1SMartin Matuska return (ret);
1478eda14cbcSMatt Macy }
1479eda14cbcSMatt Macy
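/*
 * Remove the block device for a zvol that is being destroyed. Clearing
 * private_data turns away new callers, the state lock is dropped around
 * del_gendisk() to avoid deadlocking against them, and the disk, queue,
 * tag set and minor number are then released. The state lock is held on
 * entry and re-acquired before returning.
 */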
148053a2e263SMartin Matuska void
148153a2e263SMartin Matuska zvol_os_remove_minor(zvol_state_t *zv)
148253a2e263SMartin Matuska {
148353a2e263SMartin Matuska ASSERT(MUTEX_HELD(&zv->zv_state_lock));
148453a2e263SMartin Matuska ASSERT0(zv->zv_open_count);
148553a2e263SMartin Matuska ASSERT0(atomic_read(&zv->zv_suspend_ref));
148653a2e263SMartin Matuska ASSERT(zv->zv_flags & ZVOL_REMOVING);
148753a2e263SMartin Matuska
148853a2e263SMartin Matuska struct zvol_state_os *zso = zv->zv_zso;
148953a2e263SMartin Matuska zv->zv_zso = NULL;
149053a2e263SMartin Matuska
149153a2e263SMartin Matuska /* Clearing private_data will make new callers return immediately. */
149253a2e263SMartin Matuska atomic_store_ptr(&zso->zvo_disk->private_data, NULL);
149353a2e263SMartin Matuska
1494eda14cbcSMatt Macy /*
149553a2e263SMartin Matuska * Drop the state lock before calling del_gendisk(). There may be
149653a2e263SMartin Matuska * callers waiting to acquire it, but del_gendisk() will block until
149753a2e263SMartin Matuska * they exit, which would deadlock.
1498eda14cbcSMatt Macy */
149953a2e263SMartin Matuska mutex_exit(&zv->zv_state_lock);
150053a2e263SMartin Matuska
150153a2e263SMartin Matuska del_gendisk(zso->zvo_disk);
150253a2e263SMartin Matuska #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
150353a2e263SMartin Matuska (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
150453a2e263SMartin Matuska #if defined(HAVE_BLK_CLEANUP_DISK)
150553a2e263SMartin Matuska blk_cleanup_disk(zso->zvo_disk);
150653a2e263SMartin Matuska #else
150753a2e263SMartin Matuska put_disk(zso->zvo_disk);
150853a2e263SMartin Matuska #endif
150953a2e263SMartin Matuska #else
151053a2e263SMartin Matuska blk_cleanup_queue(zso->zvo_queue);
151153a2e263SMartin Matuska put_disk(zso->zvo_disk);
151253a2e263SMartin Matuska #endif
151353a2e263SMartin Matuska
151453a2e263SMartin Matuska if (zso->use_blk_mq)
151553a2e263SMartin Matuska blk_mq_free_tag_set(&zso->tag_set);
151653a2e263SMartin Matuska
1517e6e941e6SMartin Matuska ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);
151853a2e263SMartin Matuska
151953a2e263SMartin Matuska kmem_free(zso, sizeof (struct zvol_state_os));
152053a2e263SMartin Matuska
152153a2e263SMartin Matuska mutex_enter(&zv->zv_state_lock);
152253a2e263SMartin Matuska }
152353a2e263SMartin Matuska
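/*
 * Free the zvol_state_t itself. The minor must already have been removed
 * (zv_zso is NULL) and the volume must be closed and quiesced.
 */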
1524c03c5b1cSMartin Matuska void
1525c03c5b1cSMartin Matuska zvol_os_free(zvol_state_t *zv)
1526eda14cbcSMatt Macy {
1527eda14cbcSMatt Macy
1528eda14cbcSMatt Macy ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1529eda14cbcSMatt Macy ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
15307877fdebSMatt Macy ASSERT0(zv->zv_open_count);
153153a2e263SMartin Matuska ASSERT0P(zv->zv_zso);
153253a2e263SMartin Matuska
153353a2e263SMartin Matuska ASSERT0P(zv->zv_objset);
153453a2e263SMartin Matuska ASSERT0P(zv->zv_zilog);
153553a2e263SMartin Matuska ASSERT0P(zv->zv_dn);
1536eda14cbcSMatt Macy
1537eda14cbcSMatt Macy rw_destroy(&zv->zv_suspend_lock);
1538eda14cbcSMatt Macy zfs_rangelock_fini(&zv->zv_rangelock);
1539eda14cbcSMatt Macy
1540ce4dcb97SMartin Matuska cv_destroy(&zv->zv_removing_cv);
1541eda14cbcSMatt Macy mutex_destroy(&zv->zv_state_lock);
1542eda14cbcSMatt Macy dataset_kstats_destroy(&zv->zv_kstat);
1543eda14cbcSMatt Macy
1544eda14cbcSMatt Macy kmem_free(zv, sizeof (zvol_state_t));
1545eda14cbcSMatt Macy }
1546eda14cbcSMatt Macy
15477877fdebSMatt Macy void
15487877fdebSMatt Macy zvol_wait_close(zvol_state_t *zv)
15497877fdebSMatt Macy {
15507877fdebSMatt Macy }
15517877fdebSMatt Macy
155275e1fea6SMartin Matuska struct add_disk_work {
155375e1fea6SMartin Matuska struct delayed_work work;
155475e1fea6SMartin Matuska struct gendisk *disk;
155575e1fea6SMartin Matuska int error;
155675e1fea6SMartin Matuska };
155775e1fea6SMartin Matuska
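/*
 * Call add_disk() and normalize its result: on kernels where add_disk()
 * returns an error (HAVE_ADD_DISK_RET) the negative errno is flipped to a
 * positive ZFS error; on older kernels add_disk() returns void and success
 * is assumed.
 */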
155875e1fea6SMartin Matuska static int
155975e1fea6SMartin Matuska __zvol_os_add_disk(struct gendisk *disk)
156075e1fea6SMartin Matuska {
156175e1fea6SMartin Matuska int error = 0;
156275e1fea6SMartin Matuska #ifdef HAVE_ADD_DISK_RET
1563d0abb9a6SMartin Matuska error = -add_disk(disk);
1564d0abb9a6SMartin Matuska if (error)
1565d0abb9a6SMartin Matuska error = SET_ERROR(error);
156675e1fea6SMartin Matuska #else
156775e1fea6SMartin Matuska add_disk(disk);
156875e1fea6SMartin Matuska #endif
156975e1fea6SMartin Matuska return (error);
157075e1fea6SMartin Matuska }
157175e1fea6SMartin Matuska
157275e1fea6SMartin Matuska #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
157375e1fea6SMartin Matuska static void
157475e1fea6SMartin Matuska zvol_os_add_disk_work(struct work_struct *work)
157575e1fea6SMartin Matuska {
157675e1fea6SMartin Matuska struct add_disk_work *add_disk_work;
157775e1fea6SMartin Matuska add_disk_work = container_of(work, struct add_disk_work, work.work);
157875e1fea6SMartin Matuska add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk);
157975e1fea6SMartin Matuska }
158075e1fea6SMartin Matuska #endif
158175e1fea6SMartin Matuska
158275e1fea6SMartin Matuska /*
158375e1fea6SMartin Matuska * SPECIAL CASE:
158475e1fea6SMartin Matuska *
158575e1fea6SMartin Matuska * This function basically calls add_disk() from a workqueue. You may be
158675e1fea6SMartin Matuska * thinking: why not just call add_disk() directly?
158775e1fea6SMartin Matuska *
158875e1fea6SMartin Matuska * When you call add_disk(), the zvol appears to the world. When this happens,
158975e1fea6SMartin Matuska * the kernel calls disk_scan_partitions() on the zvol, which behaves
159075e1fea6SMartin Matuska * differently on the 6.9+ kernels:
159175e1fea6SMartin Matuska *
159275e1fea6SMartin Matuska * - 6.8 and older kernels -
159375e1fea6SMartin Matuska * disk_scan_partitions()
159475e1fea6SMartin Matuska * handle = bdev_open_by_dev(
159575e1fea6SMartin Matuska * zvol_open()
159675e1fea6SMartin Matuska * bdev_release(handle);
159775e1fea6SMartin Matuska * zvol_release()
159875e1fea6SMartin Matuska *
159975e1fea6SMartin Matuska *
160075e1fea6SMartin Matuska * - 6.9+ kernels -
160175e1fea6SMartin Matuska * disk_scan_partitions()
160275e1fea6SMartin Matuska * file = bdev_file_open_by_dev()
160375e1fea6SMartin Matuska * zvol_open()
160475e1fea6SMartin Matuska * fput(file)
160575e1fea6SMartin Matuska * < wait for return to userspace >
160675e1fea6SMartin Matuska * zvol_release()
160775e1fea6SMartin Matuska *
160875e1fea6SMartin Matuska * The difference is that the bdev_release() from the 6.8 kernel is synchronous
160975e1fea6SMartin Matuska * while the fput() from the 6.9 kernel is async. Or more specifically it's
161075e1fea6SMartin Matuska * async that has to wait until we return to userspace (since it adds the fput
161175e1fea6SMartin Matuska * into the caller's work queue with the TWA_RESUME flag set). This is not the
161275e1fea6SMartin Matuska  * behavior we want, since we want to do things like create+destroy a zvol within
161375e1fea6SMartin Matuska * a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the
161475e1fea6SMartin Matuska * reference to the zvol while we're in the IOCTL, which can't wait until we
161575e1fea6SMartin Matuska * return to userspace.
161675e1fea6SMartin Matuska *
161775e1fea6SMartin Matuska * We can get around this since fput() has a special codepath for when it's
161875e1fea6SMartin Matuska * running in a kernel thread or interrupt. In those cases, it just puts the
161975e1fea6SMartin Matuska * fput into the system workqueue, which we can force to run with
162075e1fea6SMartin Matuska * __flush_workqueue(). That is why we call add_disk() from a workqueue - so it
162175e1fea6SMartin Matuska  * runs from a kernel thread and "tricks" the fput() codepaths.
162275e1fea6SMartin Matuska *
162375e1fea6SMartin Matuska * Note that __flush_workqueue() is slowly getting deprecated. This may be ok
162475e1fea6SMartin Matuska * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
162575e1fea6SMartin Matuska * fput) to happen, which it eventually, naturally, will from the system_wq
162675e1fea6SMartin Matuska * without us explicitly calling __flush_workqueue().
162775e1fea6SMartin Matuska */
162875e1fea6SMartin Matuska static int
162975e1fea6SMartin Matuska zvol_os_add_disk(struct gendisk *disk)
163075e1fea6SMartin Matuska {
163175e1fea6SMartin Matuska #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */
163275e1fea6SMartin Matuska struct add_disk_work add_disk_work;
163375e1fea6SMartin Matuska
163475e1fea6SMartin Matuska INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
163575e1fea6SMartin Matuska add_disk_work.disk = disk;
163675e1fea6SMartin Matuska add_disk_work.error = 0;
163775e1fea6SMartin Matuska
163875e1fea6SMartin Matuska /* Use *_delayed_work functions since they're not GPL'd */
163975e1fea6SMartin Matuska schedule_delayed_work(&add_disk_work.work, 0);
164075e1fea6SMartin Matuska flush_delayed_work(&add_disk_work.work);
164175e1fea6SMartin Matuska
164275e1fea6SMartin Matuska __flush_workqueue(system_wq);
164375e1fea6SMartin Matuska return (add_disk_work.error);
164475e1fea6SMartin Matuska #else /* <= 6.8 kernel */
164575e1fea6SMartin Matuska return (__zvol_os_add_disk(disk));
164675e1fea6SMartin Matuska #endif
164775e1fea6SMartin Matuska }
164875e1fea6SMartin Matuska
1649eda14cbcSMatt Macy /*
1650eda14cbcSMatt Macy * Create a block device minor node and setup the linkage between it
1651eda14cbcSMatt Macy * and the specified volume. Once this function returns the block
1652eda14cbcSMatt Macy * device is live and ready for use.
1653eda14cbcSMatt Macy */
1654c03c5b1cSMartin Matuska int
1655eda14cbcSMatt Macy zvol_os_create_minor(const char *name)
1656eda14cbcSMatt Macy {
1657df58e8b1SMartin Matuska zvol_state_t *zv = NULL;
1658eda14cbcSMatt Macy objset_t *os;
1659eda14cbcSMatt Macy dmu_object_info_t *doi;
1660eda14cbcSMatt Macy uint64_t volsize;
1661eda14cbcSMatt Macy uint64_t len;
1662eda14cbcSMatt Macy unsigned minor = 0;
1663eda14cbcSMatt Macy int error = 0;
1664eda14cbcSMatt Macy int idx;
1665eda14cbcSMatt Macy uint64_t hash = zvol_name_hash(name);
1666f8b1db88SMartin Matuska uint64_t volthreading;
1667dbd5678dSMartin Matuska bool replayed_zil = B_FALSE;
1668eda14cbcSMatt Macy
1669eda14cbcSMatt Macy if (zvol_inhibit_dev)
1670eda14cbcSMatt Macy return (0);
1671eda14cbcSMatt Macy
1672e6e941e6SMartin Matuska idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP));
1673eda14cbcSMatt Macy if (idx < 0)
1674eda14cbcSMatt Macy return (SET_ERROR(-idx));
1675eda14cbcSMatt Macy minor = idx << ZVOL_MINOR_BITS;
1676783d3ff6SMartin Matuska if (MINOR(minor) != minor) {
1677783d3ff6SMartin Matuska /* too many partitions can cause an overflow */
1678783d3ff6SMartin Matuska zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
1679783d3ff6SMartin Matuska name, minor, MINOR(minor));
1680e6e941e6SMartin Matuska ida_free(&zvol_ida, idx);
1681783d3ff6SMartin Matuska return (SET_ERROR(EINVAL));
1682783d3ff6SMartin Matuska }
1683eda14cbcSMatt Macy
1684eda14cbcSMatt Macy zv = zvol_find_by_name_hash(name, hash, RW_NONE);
1685eda14cbcSMatt Macy if (zv) {
1686eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1687eda14cbcSMatt Macy mutex_exit(&zv->zv_state_lock);
1688e6e941e6SMartin Matuska ida_free(&zvol_ida, idx);
1689eda14cbcSMatt Macy return (SET_ERROR(EEXIST));
1690eda14cbcSMatt Macy }
1691eda14cbcSMatt Macy
1692eda14cbcSMatt Macy doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1693eda14cbcSMatt Macy
1694eda14cbcSMatt Macy error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
1695eda14cbcSMatt Macy if (error)
1696eda14cbcSMatt Macy goto out_doi;
1697eda14cbcSMatt Macy
1698eda14cbcSMatt Macy error = dmu_object_info(os, ZVOL_OBJ, doi);
1699eda14cbcSMatt Macy if (error)
1700eda14cbcSMatt Macy goto out_dmu_objset_disown;
1701eda14cbcSMatt Macy
1702eda14cbcSMatt Macy error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1703eda14cbcSMatt Macy if (error)
1704eda14cbcSMatt Macy goto out_dmu_objset_disown;
1705eda14cbcSMatt Macy
1706df58e8b1SMartin Matuska error = zvol_alloc(MKDEV(zvol_major, minor), name,
1707df58e8b1SMartin Matuska volsize, doi->doi_data_block_size, &zv);
1708df58e8b1SMartin Matuska if (error || zv == NULL)
1709eda14cbcSMatt Macy goto out_dmu_objset_disown;
1710df58e8b1SMartin Matuska
1711eda14cbcSMatt Macy zv->zv_hash = hash;
1712eda14cbcSMatt Macy
1713eda14cbcSMatt Macy if (dmu_objset_is_snapshot(os))
1714eda14cbcSMatt Macy zv->zv_flags |= ZVOL_RDONLY;
1715eda14cbcSMatt Macy
1716eda14cbcSMatt Macy zv->zv_objset = os;
1717eda14cbcSMatt Macy
1718f8b1db88SMartin Matuska /* Default */
1719f8b1db88SMartin Matuska zv->zv_threading = B_TRUE;
1720f8b1db88SMartin Matuska if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
1721f8b1db88SMartin Matuska == 0)
1722f8b1db88SMartin Matuska zv->zv_threading = volthreading;
1723f8b1db88SMartin Matuska
1724eda14cbcSMatt Macy set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
1725eda14cbcSMatt Macy
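	/*
	 * Advertise the zvol's characteristics through whichever queue flags
	 * exist on this kernel: discard support, non-rotational media, and
	 * no contribution to the entropy pool.
	 */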
1726e3aa18adSMartin Matuska #ifdef QUEUE_FLAG_DISCARD
1727eda14cbcSMatt Macy blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
1728e3aa18adSMartin Matuska #endif
1729eda14cbcSMatt Macy #ifdef QUEUE_FLAG_NONROT
1730eda14cbcSMatt Macy blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
1731eda14cbcSMatt Macy #endif
1732eda14cbcSMatt Macy #ifdef QUEUE_FLAG_ADD_RANDOM
1733eda14cbcSMatt Macy blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
1734eda14cbcSMatt Macy #endif
1735eda14cbcSMatt Macy /* This flag was introduced in kernel version 4.12. */
1736eda14cbcSMatt Macy #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
1737eda14cbcSMatt Macy blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
1738eda14cbcSMatt Macy #endif
1739eda14cbcSMatt Macy
1740d0abb9a6SMartin Matuska ASSERT0P(zv->zv_kstat.dk_kstats);
1741271171e0SMartin Matuska error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1742271171e0SMartin Matuska if (error)
1743271171e0SMartin Matuska goto out_dmu_objset_disown;
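	/*
	 * Open the intent log and, if the pool is writable, replay (or
	 * discard, when replay is disabled) any outstanding records,
	 * presumably so the volume presents consistent data before the
	 * device is published.
	 */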
1744d0abb9a6SMartin Matuska ASSERT0P(zv->zv_zilog);
1745271171e0SMartin Matuska zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1746eda14cbcSMatt Macy if (spa_writeable(dmu_objset_spa(os))) {
1747eda14cbcSMatt Macy if (zil_replay_disable)
1748dbd5678dSMartin Matuska replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
1749eda14cbcSMatt Macy else
1750dbd5678dSMartin Matuska replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1751eda14cbcSMatt Macy }
1752dbd5678dSMartin Matuska if (replayed_zil)
17539db44a8eSMartin Matuska zil_close(zv->zv_zilog);
17549db44a8eSMartin Matuska zv->zv_zilog = NULL;
1755eda14cbcSMatt Macy
1756eda14cbcSMatt Macy /*
1757eda14cbcSMatt Macy * When udev detects the addition of the device it will immediately
1758eda14cbcSMatt Macy * invoke blkid(8) to determine the type of content on the device.
1759eda14cbcSMatt Macy * Prefetching the blocks commonly scanned by blkid(8) will speed
1760eda14cbcSMatt Macy * up this process.
1761eda14cbcSMatt Macy */
1762be181ee2SMartin Matuska len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
1763eda14cbcSMatt Macy if (len > 0) {
1764eda14cbcSMatt Macy dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
1765eda14cbcSMatt Macy dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
1766eda14cbcSMatt Macy ZIO_PRIORITY_SYNC_READ);
1767eda14cbcSMatt Macy }
1768eda14cbcSMatt Macy
1769eda14cbcSMatt Macy zv->zv_objset = NULL;
1770eda14cbcSMatt Macy out_dmu_objset_disown:
1771eda14cbcSMatt Macy dmu_objset_disown(os, B_TRUE, FTAG);
1772eda14cbcSMatt Macy out_doi:
1773eda14cbcSMatt Macy kmem_free(doi, sizeof (dmu_object_info_t));
1774eda14cbcSMatt Macy
1775eda14cbcSMatt Macy /*
1776eda14cbcSMatt Macy * Keep in mind that once add_disk() is called, the zvol is
1777eda14cbcSMatt Macy * announced to the world, and zvol_open()/zvol_release() can
1778eda14cbcSMatt Macy * be called at any time. Incidentally, add_disk() itself calls
1779eda14cbcSMatt Macy * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
1780eda14cbcSMatt Macy * directly as well.
1781eda14cbcSMatt Macy */
1782df58e8b1SMartin Matuska if (error == 0 && zv) {
1783eda14cbcSMatt Macy rw_enter(&zvol_state_lock, RW_WRITER);
1784eda14cbcSMatt Macy zvol_insert(zv);
1785eda14cbcSMatt Macy rw_exit(&zvol_state_lock);
178675e1fea6SMartin Matuska error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
1787eda14cbcSMatt Macy } else {
1788e6e941e6SMartin Matuska ida_free(&zvol_ida, idx);
1789eda14cbcSMatt Macy }
1790eda14cbcSMatt Macy
1791eda14cbcSMatt Macy return (error);
1792eda14cbcSMatt Macy }
1793eda14cbcSMatt Macy
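/*
 * Update an existing minor for a renamed volume: refresh the cached name
 * and hash table linkage, nudge udev so device symlinks are regenerated
 * (see the read-only toggle below), and rename the dataset kstats.
 */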
1794df58e8b1SMartin Matuska int
1795c03c5b1cSMartin Matuska zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
1796eda14cbcSMatt Macy {
1797eda14cbcSMatt Macy int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
1798eda14cbcSMatt Macy
1799eda14cbcSMatt Macy ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1800eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1801eda14cbcSMatt Macy
1802eda14cbcSMatt Macy strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1803eda14cbcSMatt Macy
1804eda14cbcSMatt Macy /* move to new hashtable entry */
1805b985c9caSMartin Matuska zv->zv_hash = zvol_name_hash(newname);
1806eda14cbcSMatt Macy hlist_del(&zv->zv_hlink);
1807eda14cbcSMatt Macy hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1808eda14cbcSMatt Macy
1809eda14cbcSMatt Macy /*
1810eda14cbcSMatt Macy * The block device's read-only state is briefly changed causing
1811eda14cbcSMatt Macy * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
1812eda14cbcSMatt Macy * the name change and fixes the symlinks. This does not change
1813eda14cbcSMatt Macy * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
1814eda14cbcSMatt Macy * changes. This would normally be done using kobject_uevent() but
1815eda14cbcSMatt Macy * that is a GPL-only symbol which is why we need this workaround.
1816eda14cbcSMatt Macy */
1817eda14cbcSMatt Macy set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
1818eda14cbcSMatt Macy set_disk_ro(zv->zv_zso->zvo_disk, readonly);
181914c2e0a0SMartin Matuska
182014c2e0a0SMartin Matuska dataset_kstats_rename(&zv->zv_kstat, newname);
1821df58e8b1SMartin Matuska
1822df58e8b1SMartin Matuska return (0);
1823eda14cbcSMatt Macy }
1824eda14cbcSMatt Macy
1825c03c5b1cSMartin Matuska void
1826c03c5b1cSMartin Matuska zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
1827eda14cbcSMatt Macy {
1828eda14cbcSMatt Macy
1829eda14cbcSMatt Macy set_disk_ro(zv->zv_zso->zvo_disk, flags);
1830eda14cbcSMatt Macy }
1831eda14cbcSMatt Macy
1832c03c5b1cSMartin Matuska void
1833c03c5b1cSMartin Matuska zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
1834eda14cbcSMatt Macy {
1835eda14cbcSMatt Macy
1836eda14cbcSMatt Macy set_capacity(zv->zv_zso->zvo_disk, capacity);
1837eda14cbcSMatt Macy }
1838eda14cbcSMatt Macy
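/*
 * Module init: run the common zvol initialization, register the block
 * device major, derive the effective blk-mq queue depth and thread count
 * from their tunables, and initialize the minor number allocator.
 */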
1839eda14cbcSMatt Macy int
1840eda14cbcSMatt Macy zvol_init(void)
1841eda14cbcSMatt Macy {
1842eda14cbcSMatt Macy int error;
18431f1e2261SMartin Matuska
1844b1c1ee44SMartin Matuska error = zvol_init_impl();
1845b1c1ee44SMartin Matuska if (error) {
1846b1c1ee44SMartin Matuska printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error);
1847b1c1ee44SMartin Matuska return (error);
18481f1e2261SMartin Matuska }
1849eda14cbcSMatt Macy
1850d0abb9a6SMartin Matuska error = -register_blkdev(zvol_major, ZVOL_DRIVER);
1851eda14cbcSMatt Macy if (error) {
1852eda14cbcSMatt Macy printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
1853d0abb9a6SMartin Matuska return (SET_ERROR(error));
1854eda14cbcSMatt Macy }
18551f1e2261SMartin Matuska
18561f1e2261SMartin Matuska if (zvol_blk_mq_queue_depth == 0) {
18571f1e2261SMartin Matuska zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
18581f1e2261SMartin Matuska } else {
18591f1e2261SMartin Matuska zvol_actual_blk_mq_queue_depth =
18601f1e2261SMartin Matuska MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
18611f1e2261SMartin Matuska }
18621f1e2261SMartin Matuska
18631f1e2261SMartin Matuska if (zvol_blk_mq_threads == 0) {
18641f1e2261SMartin Matuska zvol_blk_mq_actual_threads = num_online_cpus();
18651f1e2261SMartin Matuska } else {
18661f1e2261SMartin Matuska zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
18671f1e2261SMartin Matuska 1024);
18681f1e2261SMartin Matuska }
18697a7741afSMartin Matuska
1870eda14cbcSMatt Macy ida_init(&zvol_ida);
1871eda14cbcSMatt Macy return (0);
1872eda14cbcSMatt Macy }
1873eda14cbcSMatt Macy
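/*
 * Module teardown: unregister the block device major, run the common
 * cleanup, and destroy the minor number allocator.
 */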
1874eda14cbcSMatt Macy void
1875eda14cbcSMatt Macy zvol_fini(void)
1876eda14cbcSMatt Macy {
1877eda14cbcSMatt Macy unregister_blkdev(zvol_major, ZVOL_DRIVER);
18781719886fSMartin Matuska
1879b1c1ee44SMartin Matuska zvol_fini_impl();
18801719886fSMartin Matuska
1881eda14cbcSMatt Macy ida_destroy(&zvol_ida);
1882eda14cbcSMatt Macy }
1883eda14cbcSMatt Macy
1884eda14cbcSMatt Macy module_param(zvol_major, uint, 0444);
1885eda14cbcSMatt Macy MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
1886eda14cbcSMatt Macy
1887eda14cbcSMatt Macy module_param(zvol_max_discard_blocks, ulong, 0444);
1888eda14cbcSMatt Macy MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
1889eda14cbcSMatt Macy
18906c1e79dfSMartin Matuska module_param(zvol_blk_mq_queue_depth, uint, 0644);
18916c1e79dfSMartin Matuska MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
18926c1e79dfSMartin Matuska
18936c1e79dfSMartin Matuska module_param(zvol_use_blk_mq, uint, 0644);
18946c1e79dfSMartin Matuska MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
18956c1e79dfSMartin Matuska
18966c1e79dfSMartin Matuska module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
18976c1e79dfSMartin Matuska MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
18986c1e79dfSMartin Matuska "Process volblocksize blocks per thread");
18996c1e79dfSMartin Matuska
19000a97523dSMartin Matuska #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
19010a97523dSMartin Matuska module_param(zvol_open_timeout_ms, uint, 0644);
19020a97523dSMartin Matuska MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
19030a97523dSMartin Matuska #endif
1904