1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 *
25 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
26 * All rights reserved.
27 *
28 * Portions Copyright 2010 Robert Milkowski
29 *
30 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
31 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
32 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
33 * Copyright (c) 2014 Integros [integros.com]
34 * Copyright (c) 2024, 2025, Klara, Inc.
35 */
36
37 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
38
39 /*
40 * ZFS volume emulation driver.
41 *
42 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
43 * Volumes are accessed through the symbolic links named:
44 *
45 * /dev/zvol/<pool_name>/<dataset_name>
46 *
47 * Volumes are persistent through reboot. No user command needs to be
48 * run before opening and using a device.
49 *
50 * On FreeBSD, ZVOLs are GEOM providers like any other storage device in
51 * the system, except when volmode=dev makes them plain character devices.
52 */
53
54 #include <sys/types.h>
55 #include <sys/param.h>
56 #include <sys/kernel.h>
57 #include <sys/errno.h>
58 #include <sys/uio.h>
59 #include <sys/bio.h>
60 #include <sys/buf.h>
61 #include <sys/kmem.h>
62 #include <sys/conf.h>
63 #include <sys/cmn_err.h>
64 #include <sys/stat.h>
65 #include <sys/proc.h>
66 #include <sys/zap.h>
67 #include <sys/spa.h>
68 #include <sys/spa_impl.h>
69 #include <sys/zio.h>
70 #include <sys/disk.h>
71 #include <sys/dmu_traverse.h>
72 #include <sys/dnode.h>
73 #include <sys/dsl_dataset.h>
74 #include <sys/dsl_prop.h>
75 #include <sys/dsl_dir.h>
76 #include <sys/byteorder.h>
77 #include <sys/sunddi.h>
78 #include <sys/dirent.h>
79 #include <sys/policy.h>
80 #include <sys/queue.h>
81 #include <sys/fs/zfs.h>
82 #include <sys/zfs_ioctl.h>
83 #include <sys/zil.h>
84 #include <sys/zfs_znode.h>
85 #include <sys/zfs_rlock.h>
86 #include <sys/vdev_impl.h>
87 #include <sys/vdev_raidz.h>
88 #include <sys/zvol.h>
89 #include <sys/zil_impl.h>
90 #include <sys/dataset_kstats.h>
91 #include <sys/dbuf.h>
92 #include <sys/dmu_tx.h>
93 #include <sys/zfeature.h>
94 #include <sys/zio_checksum.h>
96 #include <sys/filio.h>
97 #include <sys/freebsd_event.h>
98
99 #include <geom/geom.h>
101 #include <sys/zvol_impl.h>
102 #include <cityhash.h>
103
104 #include "zfs_namecheck.h"
105
106 #define ZVOL_DUMPSIZE "dumpsize"
107
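/*
 * With ZVOL_LOCK_DEBUG defined, "reader" acquisitions of zv_suspend_lock are
 * promoted to writer acquisitions, turning the *_READ_HELD assertions below
 * into strict exclusive-ownership checks for lock debugging.
 */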
108 #ifdef ZVOL_LOCK_DEBUG
109 #define ZVOL_RW_READER RW_WRITER
110 #define ZVOL_RW_READ_HELD RW_WRITE_HELD
111 #else
112 #define ZVOL_RW_READER RW_READER
113 #define ZVOL_RW_READ_HELD RW_READ_HELD
114 #endif
115
116 struct zvol_state_os {
117 #define zso_dev _zso_state._zso_dev
118 #define zso_geom _zso_state._zso_geom
119 union {
120 /* volmode=dev */
121 struct zvol_state_dev {
122 struct cdev *zsd_cdev;
123 struct selinfo zsd_selinfo;
124 } _zso_dev;
125
126 /* volmode=geom */
127 struct zvol_state_geom {
128 struct g_provider *zsg_provider;
129 } _zso_geom;
130 } _zso_state;
131 int zso_dying;
132 };
133
134 static uint32_t zvol_minors;
135
136 SYSCTL_DECL(_vfs_zfs);
137 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
138
139 static boolean_t zpool_on_zvol = B_FALSE;
140 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
141 "Allow zpools to use zvols as vdevs (DANGEROUS)");
142
143 /*
144 * Toggle unmap functionality.
145 */
146 boolean_t zvol_unmap_enabled = B_TRUE;
147
148 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
149 &zvol_unmap_enabled, 0, "Enable UNMAP functionality");
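/*
 * For example, "sysctl vfs.zfs.vol.unmap_enabled=0" turns the DIOCGDELETE
 * ioctl below into a no-op at runtime.
 */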
150
151 /*
152 * zvol maximum transfer in one DMU tx.
153 */
154 int zvol_maxphys = DMU_MAX_ACCESS / 2;
155
156 static void zvol_ensure_zilog(zvol_state_t *zv);
157
158 static d_open_t zvol_cdev_open;
159 static d_close_t zvol_cdev_close;
160 static d_ioctl_t zvol_cdev_ioctl;
161 static d_read_t zvol_cdev_read;
162 static d_write_t zvol_cdev_write;
163 static d_strategy_t zvol_cdev_bio_strategy;
164 static d_kqfilter_t zvol_cdev_kqfilter;
165
166 static struct cdevsw zvol_cdevsw = {
167 .d_name = "zvol",
168 .d_version = D_VERSION,
169 .d_flags = D_DISK | D_TRACKCLOSE,
170 .d_open = zvol_cdev_open,
171 .d_close = zvol_cdev_close,
172 .d_ioctl = zvol_cdev_ioctl,
173 .d_read = zvol_cdev_read,
174 .d_write = zvol_cdev_write,
175 .d_strategy = zvol_cdev_bio_strategy,
176 .d_kqfilter = zvol_cdev_kqfilter,
177 };
178
179 static void zvol_filter_detach(struct knote *kn);
180 static int zvol_filter_vnode(struct knote *kn, long hint);
181
182 static struct filterops zvol_filterops_vnode = {
183 .f_isfd = 1,
184 .f_detach = zvol_filter_detach,
185 .f_event = zvol_filter_vnode,
186 };
187
188 extern uint_t zfs_geom_probe_vdev_key;
189
190 struct g_class zfs_zvol_class = {
191 .name = "ZFS::ZVOL",
192 .version = G_VERSION,
193 };
194
195 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
196
197 static int zvol_geom_open(struct g_provider *pp, int flag, int count);
198 static int zvol_geom_close(struct g_provider *pp, int flag, int count);
199 static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
200 static void zvol_geom_bio_start(struct bio *bp);
201 static int zvol_geom_bio_getattr(struct bio *bp);
202 static void zvol_geom_bio_strategy(struct bio *bp, boolean_t sync);
203
204 /*
205 * GEOM mode implementation
206 */
207
208 static int
209 zvol_geom_open(struct g_provider *pp, int flag, int count)
210 {
211 zvol_state_t *zv;
212 int err = 0;
213 boolean_t drop_suspend = B_FALSE;
214
215 if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
216 /*
217 * If zfs_geom_probe_vdev_key is set, that means that zfs is
218 * attempting to probe geom providers while looking for a
219 * replacement for a missing VDEV. In this case, the
220 * spa_namespace_lock will not be held, but it is still illegal
221 * to use a zvol as a vdev. Deadlocks can result if another
222 * thread has spa_namespace_lock.
223 */
224 return (SET_ERROR(EOPNOTSUPP));
225 }
226
227 retry:
228 zv = atomic_load_ptr(&pp->private);
229 if (zv == NULL)
230 return (SET_ERROR(ENXIO));
231
232 mutex_enter(&zv->zv_state_lock);
233 if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
234 err = SET_ERROR(ENXIO);
235 goto out_locked;
236 }
237 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
238
239 /*
240 * Make sure zvol is not suspended during first open
241 * (hold zv_suspend_lock) and respect proper lock acquisition
242 * ordering - zv_suspend_lock before zv_state_lock.
243 */
244 if (zv->zv_open_count == 0) {
245 drop_suspend = B_TRUE;
246 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
247 mutex_exit(&zv->zv_state_lock);
248
249 /*
250 * Removal may happen while the locks are down, so
251 * we can't trust zv any longer; we have to start over.
252 */
253 zv = atomic_load_ptr(&pp->private);
254 if (zv == NULL)
255 return (SET_ERROR(ENXIO));
256
257 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
258 mutex_enter(&zv->zv_state_lock);
259
260 if (zv->zv_zso->zso_dying ||
261 zv->zv_flags & ZVOL_REMOVING) {
262 err = SET_ERROR(ENXIO);
263 goto out_locked;
264 }
265
266 /* Check to see if zv_suspend_lock is needed. */
267 if (zv->zv_open_count != 0) {
268 rw_exit(&zv->zv_suspend_lock);
269 drop_suspend = B_FALSE;
270 }
271 }
272 }
273
274 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
275
276 if (zv->zv_open_count == 0) {
277 boolean_t drop_namespace = B_FALSE;
278
279 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
280
281 /*
282 * Take spa_namespace_lock to prevent lock inversion when
283 * zvols from one pool are opened as vdevs in another.
284 */
285 if (!mutex_owned(&spa_namespace_lock)) {
286 if (!mutex_tryenter(&spa_namespace_lock)) {
287 mutex_exit(&zv->zv_state_lock);
288 rw_exit(&zv->zv_suspend_lock);
289 drop_suspend = B_FALSE;
290 kern_yield(PRI_USER);
291 goto retry;
292 } else {
293 drop_namespace = B_TRUE;
294 }
295 }
296 err = zvol_first_open(zv, !(flag & FWRITE));
297 if (drop_namespace)
298 mutex_exit(&spa_namespace_lock);
299 if (err)
300 goto out_locked;
301 pp->mediasize = zv->zv_volsize;
302 pp->stripeoffset = 0;
303 pp->stripesize = zv->zv_volblocksize;
304 }
305
306 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
307
308 /*
309 * Check for a bad on-disk format version now since we
310 * lied about owning the dataset readonly before.
311 */
312 if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
313 dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
314 err = SET_ERROR(EROFS);
315 goto out_opened;
316 }
317 if (zv->zv_flags & ZVOL_EXCL) {
318 err = SET_ERROR(EBUSY);
319 goto out_opened;
320 }
321 if (flag & O_EXCL) {
322 if (zv->zv_open_count != 0) {
323 err = SET_ERROR(EBUSY);
324 goto out_opened;
325 }
326 zv->zv_flags |= ZVOL_EXCL;
327 }
328
329 zv->zv_open_count += count;
330 out_opened:
331 if (zv->zv_open_count == 0) {
332 zvol_last_close(zv);
333 wakeup(zv);
334 }
335 out_locked:
336 mutex_exit(&zv->zv_state_lock);
337 if (drop_suspend)
338 rw_exit(&zv->zv_suspend_lock);
339 return (err);
340 }
341
342 static int
343 zvol_geom_close(struct g_provider *pp, int flag, int count)
344 {
345 (void) flag;
346 zvol_state_t *zv;
347 boolean_t drop_suspend = B_TRUE;
348 int new_open_count;
349
350 zv = atomic_load_ptr(&pp->private);
351 if (zv == NULL)
352 return (SET_ERROR(ENXIO));
353
354 mutex_enter(&zv->zv_state_lock);
355 if (zv->zv_flags & ZVOL_EXCL) {
356 ASSERT3U(zv->zv_open_count, ==, 1);
357 zv->zv_flags &= ~ZVOL_EXCL;
358 }
359
360 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
361
362 /*
363 * If the open count is zero, this is a spurious close.
364 * That indicates a bug in the kernel / DDI framework.
365 */
366 ASSERT3U(zv->zv_open_count, >, 0);
367
368 /*
369 * Make sure zvol is not suspended during last close
370 * (hold zv_suspend_lock) and respect proper lock acquisition
371 * ordering - zv_suspend_lock before zv_state_lock.
372 */
373 new_open_count = zv->zv_open_count - count;
374 if (new_open_count == 0) {
375 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
376 mutex_exit(&zv->zv_state_lock);
377 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
378 mutex_enter(&zv->zv_state_lock);
379
380 /*
381 * Unlike in zvol_geom_open(), we don't check if
382 * removal started here, because we might be one of the
383 * openers that needs to be thrown out! If we're the
384 * last, we need to call zvol_last_close() below to
385 * finish cleanup. So, no special treatment for us.
386 */
387
388 /* Check to see if zv_suspend_lock is needed. */
389 new_open_count = zv->zv_open_count - count;
390 if (new_open_count != 0) {
391 rw_exit(&zv->zv_suspend_lock);
392 drop_suspend = B_FALSE;
393 }
394 }
395 } else {
396 drop_suspend = B_FALSE;
397 }
398
399 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
400
401 /*
402 * You may get multiple opens, but only one close.
403 */
404 zv->zv_open_count = new_open_count;
405 if (zv->zv_open_count == 0) {
406 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
407 zvol_last_close(zv);
408 wakeup(zv);
409 }
410
411 mutex_exit(&zv->zv_state_lock);
412
413 if (drop_suspend)
414 rw_exit(&zv->zv_suspend_lock);
415 return (0);
416 }
417
418 void
419 zvol_wait_close(zvol_state_t *zv)
420 {
421
422 if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
423 return;
424 mutex_enter(&zv->zv_state_lock);
425 zv->zv_zso->zso_dying = B_TRUE;
426
427 if (zv->zv_open_count)
428 msleep(zv, &zv->zv_state_lock,
429 PRIBIO, "zvol:dying", 10*hz);
430 mutex_exit(&zv->zv_state_lock);
431 }
432
433
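/*
 * GEOM ->access() method: translate the read/write/exclusive reference count
 * deltas into a single zvol_geom_open() or zvol_geom_close() call carrying
 * the matching FREAD/FWRITE flags.
 */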
434 static int
435 zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
436 {
437 int count, error, flags;
438
439 g_topology_assert();
440
441 /*
442 * To make it easier we expect either open or close, but not both
443 * at the same time.
444 */
445 KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
446 (acr <= 0 && acw <= 0 && ace <= 0),
447 ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
448 pp->name, acr, acw, ace));
449
450 if (atomic_load_ptr(&pp->private) == NULL) {
451 if (acr <= 0 && acw <= 0 && ace <= 0)
452 return (0);
453 return (pp->error);
454 }
455
456 /*
457 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
458 * ace != 0, because GEOM already handles that and handles it a bit
459 * differently. GEOM allows for multiple read/exclusive consumers and
460 * ZFS allows only one exclusive consumer, no matter if it is reader or
461 * writer. I like better the way GEOM works so I'll leave it for GEOM
462 * to decide what to do.
463 */
464
465 count = acr + acw + ace;
466 if (count == 0)
467 return (0);
468
469 flags = 0;
470 if (acr != 0 || ace != 0)
471 flags |= FREAD;
472 if (acw != 0)
473 flags |= FWRITE;
474
475 g_topology_unlock();
476 if (count > 0)
477 error = zvol_geom_open(pp, flags, count);
478 else
479 error = zvol_geom_close(pp, flags, -count);
480 g_topology_lock();
481 return (error);
482 }
483
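/*
 * GEOM ->start() method: entry point for every bio sent to the provider.
 * BIO_GETATTR is answered in place; all other commands are passed to
 * zvol_geom_bio_strategy(), synchronously when the calling thread may sleep
 * and is not a GEOM thread, otherwise via a taskq.
 */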
484 static void
485 zvol_geom_bio_start(struct bio *bp)
486 {
487 zvol_state_t *zv = bp->bio_to->private;
488
489 if (zv == NULL) {
490 g_io_deliver(bp, ENXIO);
491 return;
492 }
493 if (bp->bio_cmd == BIO_GETATTR) {
494 if (zvol_geom_bio_getattr(bp))
495 g_io_deliver(bp, EOPNOTSUPP);
496 return;
497 }
498
499 zvol_geom_bio_strategy(bp, !g_is_geom_thread(curthread) &&
500 THREAD_CAN_SLEEP());
501 }
502
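/*
 * Answer BIO_GETATTR queries: GEOM::candelete plus the blocksavail,
 * blocksused, poolblocksavail and poolblocksused attributes, reported in
 * DEV_BSIZE units. Returns nonzero if the attribute is not handled here.
 */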
503 static int
504 zvol_geom_bio_getattr(struct bio *bp)
505 {
506 zvol_state_t *zv;
507
508 zv = bp->bio_to->private;
509 ASSERT3P(zv, !=, NULL);
510
511 spa_t *spa = dmu_objset_spa(zv->zv_objset);
512 uint64_t refd, avail, usedobjs, availobjs;
513
514 if (g_handleattr_int(bp, "GEOM::candelete", 1))
515 return (0);
516 if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
517 dmu_objset_space(zv->zv_objset, &refd, &avail,
518 &usedobjs, &availobjs);
519 if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
520 return (0);
521 } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
522 dmu_objset_space(zv->zv_objset, &refd, &avail,
523 &usedobjs, &availobjs);
524 if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
525 return (0);
526 } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
527 avail = metaslab_class_get_space(spa_normal_class(spa));
528 avail -= metaslab_class_get_alloc(spa_normal_class(spa));
529 if (g_handleattr_off_t(bp, "poolblocksavail",
530 avail / DEV_BSIZE))
531 return (0);
532 } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
533 refd = metaslab_class_get_alloc(spa_normal_class(spa));
534 if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
535 return (0);
536 }
537 return (1);
538 }
539
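/*
 * kqueue(2) support for volmode=dev devices. Only EVFILT_VNODE filters with
 * NOTE_ATTRIB are accepted; zvol_os_update_volsize() posts NOTE_ATTRIB when
 * such a volume is resized.
 */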
540 static void
541 zvol_filter_detach(struct knote *kn)
542 {
543 zvol_state_t *zv;
544 struct zvol_state_dev *zsd;
545
546 zv = kn->kn_hook;
547 zsd = &zv->zv_zso->zso_dev;
548
549 knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
550 }
551
552 static int
553 zvol_filter_vnode(struct knote *kn, long hint)
554 {
555 kn->kn_fflags |= kn->kn_sfflags & hint;
556
557 return (kn->kn_fflags != 0);
558 }
559
560 static int
561 zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
562 {
563 zvol_state_t *zv;
564 struct zvol_state_dev *zsd;
565
566 zv = dev->si_drv2;
567 zsd = &zv->zv_zso->zso_dev;
568
569 if (kn->kn_filter != EVFILT_VNODE)
570 return (EINVAL);
571
572 /* XXX: extend support for other NOTE_* events */
573 if (kn->kn_sfflags != NOTE_ATTRIB)
574 return (EINVAL);
575
576 kn->kn_fop = &zvol_filterops_vnode;
577 kn->kn_hook = zv;
578 knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);
579
580 return (0);
581 }
582
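/*
 * Handle one bio against the volume under the range lock: BIO_READ and
 * BIO_WRITE are split into chunks of at most zvol_maxphys bytes, BIO_DELETE
 * logs a truncate and frees the range, and BIO_FLUSH (as well as writes on
 * sync=always datasets) commits the ZIL.
 */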
583 static void
584 zvol_strategy_impl(zv_request_t *zvr)
585 {
586 zvol_state_t *zv;
587 struct bio *bp;
588 uint64_t off, volsize;
589 size_t resid;
590 char *addr;
591 objset_t *os;
592 zfs_locked_range_t *lr;
593 int error = 0;
594 boolean_t doread = B_FALSE;
595 boolean_t is_dumpified;
596 boolean_t commit;
597
598 bp = zvr->bio;
599 zv = zvr->zv;
600 if (zv == NULL) {
601 error = SET_ERROR(ENXIO);
602 goto out;
603 }
604
605 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
606
607 if (zv->zv_flags & ZVOL_REMOVING) {
608 error = SET_ERROR(ENXIO);
609 goto resume;
610 }
611
612 switch (bp->bio_cmd) {
613 case BIO_READ:
614 doread = B_TRUE;
615 break;
616 case BIO_WRITE:
617 case BIO_FLUSH:
618 case BIO_DELETE:
619 if (zv->zv_flags & ZVOL_RDONLY) {
620 error = SET_ERROR(EROFS);
621 goto resume;
622 }
623 zvol_ensure_zilog(zv);
624 if (bp->bio_cmd == BIO_FLUSH)
625 goto commit;
626 break;
627 default:
628 error = SET_ERROR(EOPNOTSUPP);
629 goto resume;
630 }
631
632 off = bp->bio_offset;
633 volsize = zv->zv_volsize;
634
635 os = zv->zv_objset;
636 ASSERT3P(os, !=, NULL);
637
638 addr = bp->bio_data;
639 resid = bp->bio_length;
640
641 if (resid > 0 && off >= volsize) {
642 error = SET_ERROR(EIO);
643 goto resume;
644 }
645
646 is_dumpified = B_FALSE;
647 commit = !doread && !is_dumpified &&
648 zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
649
650 /*
651 * There must be no buffer changes when doing a dmu_sync() because
652 * we can't change the data whilst calculating the checksum.
653 */
654 lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
655 doread ? RL_READER : RL_WRITER);
656
657 if (bp->bio_cmd == BIO_DELETE) {
658 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
659 error = dmu_tx_assign(tx, DMU_TX_WAIT);
660 if (error != 0) {
661 dmu_tx_abort(tx);
662 } else {
663 zvol_log_truncate(zv, tx, off, resid);
664 dmu_tx_commit(tx);
665 error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
666 off, resid);
667 resid = 0;
668 }
669 goto unlock;
670 }
671 while (resid != 0 && off < volsize) {
672 size_t size = MIN(resid, zvol_maxphys);
673 if (doread) {
674 error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
675 DMU_READ_PREFETCH);
676 } else {
677 dmu_tx_t *tx = dmu_tx_create(os);
678 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
679 error = dmu_tx_assign(tx, DMU_TX_WAIT);
680 if (error) {
681 dmu_tx_abort(tx);
682 } else {
683 dmu_write_by_dnode(zv->zv_dn, off, size, addr,
684 tx, DMU_READ_PREFETCH);
685 zvol_log_write(zv, tx, off, size, commit);
686 dmu_tx_commit(tx);
687 }
688 }
689 if (error) {
690 /* Convert checksum errors into IO errors. */
691 if (error == ECKSUM)
692 error = SET_ERROR(EIO);
693 break;
694 }
695 off += size;
696 addr += size;
697 resid -= size;
698 }
699 unlock:
700 zfs_rangelock_exit(lr);
701
702 bp->bio_completed = bp->bio_length - resid;
703 if (bp->bio_completed < bp->bio_length && off > volsize)
704 error = SET_ERROR(EINVAL);
705
706 switch (bp->bio_cmd) {
707 case BIO_FLUSH:
708 break;
709 case BIO_READ:
710 dataset_kstats_update_read_kstats(&zv->zv_kstat,
711 bp->bio_completed);
712 break;
713 case BIO_WRITE:
714 dataset_kstats_update_write_kstats(&zv->zv_kstat,
715 bp->bio_completed);
716 break;
717 case BIO_DELETE:
718 break;
719 default:
720 break;
721 }
722
723 if (error == 0 && commit) {
724 commit:
725 error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
726 }
727 resume:
728 rw_exit(&zv->zv_suspend_lock);
729 out:
730 if (bp->bio_to)
731 g_io_deliver(bp, error);
732 else
733 biofinish(bp, NULL, error);
734 }
735
736 static void
737 zvol_strategy_task(void *arg)
738 {
739 zv_request_task_t *task = arg;
740
741 zvol_strategy_impl(&task->zvr);
742 zv_request_task_free(task);
743 }
744
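/*
 * Dispatch a bio: run it inline when synchronous completion is required (or
 * zvol_request_sync is set), otherwise hash (zvol, cpu, offset) to pick one
 * of the zvol taskqs and complete it asynchronously.
 */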
745 static void
746 zvol_geom_bio_strategy(struct bio *bp, boolean_t sync)
747 {
748 zv_taskq_t *ztqs = &zvol_taskqs;
749 zv_request_task_t *task;
750 zvol_state_t *zv;
751 uint_t tq_idx;
752 uint_t taskq_hash;
753 int error;
754
755 if (bp->bio_to)
756 zv = bp->bio_to->private;
757 else
758 zv = bp->bio_dev->si_drv2;
759
760 if (zv == NULL) {
761 error = SET_ERROR(ENXIO);
762 if (bp->bio_to)
763 g_io_deliver(bp, error);
764 else
765 biofinish(bp, NULL, error);
766 return;
767 }
768
769 zv_request_t zvr = {
770 .zv = zv,
771 .bio = bp,
772 };
773
774 if (sync || zvol_request_sync) {
775 zvol_strategy_impl(&zvr);
776 return;
777 }
778
779 taskq_hash = cityhash3((uintptr_t)zv, curcpu, bp->bio_offset >>
780 ZVOL_TASKQ_OFFSET_SHIFT);
781 tq_idx = taskq_hash % ztqs->tqs_cnt;
782 task = zv_request_task_create(zvr);
783 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_strategy_task, task,
784 0, &task->ent);
785 }
786
787 static void
788 zvol_cdev_bio_strategy(struct bio *bp)
789 {
790 zvol_geom_bio_strategy(bp, B_FALSE);
791 }
792
793 /*
794 * Character device mode implementation
795 */
796
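/*
 * Read handler for volmode=dev: copy data from the DMU into the uio in
 * chunks of at most DMU_MAX_ACCESS / 2 bytes under a read range lock.
 */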
797 static int
798 zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
799 {
800 zvol_state_t *zv;
801 uint64_t volsize;
802 zfs_locked_range_t *lr;
803 int error = 0;
804 zfs_uio_t uio;
805
806 zfs_uio_init(&uio, uio_s);
807
808 zv = dev->si_drv2;
809
810 volsize = zv->zv_volsize;
811 /*
812 * uio_loffset == volsize isn't an error as
813 * it's required for EOF processing.
814 */
815 if (zfs_uio_resid(&uio) > 0 &&
816 (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
817 return (SET_ERROR(EIO));
818
819 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
820 ssize_t start_resid = zfs_uio_resid(&uio);
821 lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
822 zfs_uio_resid(&uio), RL_READER);
823 while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
824 uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
825
826 /* Don't read past the end. */
827 if (bytes > volsize - zfs_uio_offset(&uio))
828 bytes = volsize - zfs_uio_offset(&uio);
829
830 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
831 DMU_READ_PREFETCH);
832 if (error) {
833 /* Convert checksum errors into IO errors. */
834 if (error == ECKSUM)
835 error = SET_ERROR(EIO);
836 break;
837 }
838 }
839 zfs_rangelock_exit(lr);
840 int64_t nread = start_resid - zfs_uio_resid(&uio);
841 dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
842 rw_exit(&zv->zv_suspend_lock);
843
844 return (error);
845 }
846
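/*
 * Write handler for volmode=dev: copy data from the uio into the DMU in
 * chunks of at most DMU_MAX_ACCESS / 2 bytes under a write range lock,
 * logging each chunk and committing the ZIL for IO_SYNC or sync=always.
 */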
847 static int
848 zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
849 {
850 zvol_state_t *zv;
851 uint64_t volsize;
852 zfs_locked_range_t *lr;
853 int error = 0;
854 boolean_t commit;
855 zfs_uio_t uio;
856
857 zv = dev->si_drv2;
858
859 volsize = zv->zv_volsize;
860
861 zfs_uio_init(&uio, uio_s);
862
863 if (zfs_uio_resid(&uio) > 0 &&
864 (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
865 return (SET_ERROR(EIO));
866
867 ssize_t start_resid = zfs_uio_resid(&uio);
868 commit = (ioflag & IO_SYNC) ||
869 (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
870
871 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
872 zvol_ensure_zilog(zv);
873
874 lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
875 zfs_uio_resid(&uio), RL_WRITER);
876 while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
877 uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
878 uint64_t off = zfs_uio_offset(&uio);
879 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
880
881 if (bytes > volsize - off) /* Don't write past the end. */
882 bytes = volsize - off;
883
884 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
885 error = dmu_tx_assign(tx, DMU_TX_WAIT);
886 if (error) {
887 dmu_tx_abort(tx);
888 break;
889 }
890 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
891 DMU_READ_PREFETCH);
892 if (error == 0)
893 zvol_log_write(zv, tx, off, bytes, commit);
894 dmu_tx_commit(tx);
895
896 if (error)
897 break;
898 }
899 zfs_rangelock_exit(lr);
900 int64_t nwritten = start_resid - zfs_uio_resid(&uio);
901 dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
902 if (error == 0 && commit)
903 error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
904 rw_exit(&zv->zv_suspend_lock);
905
906 return (error);
907 }
908
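/*
 * Open handler for volmode=dev; follows the same suspend/namespace locking
 * protocol as zvol_geom_open(), but always bumps the open count by one.
 */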
909 static int
910 zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
911 {
912 zvol_state_t *zv;
913 int err = 0;
914 boolean_t drop_suspend = B_FALSE;
915
916 retry:
917 zv = atomic_load_ptr(&dev->si_drv2);
918 if (zv == NULL)
919 return (SET_ERROR(ENXIO));
920
921 mutex_enter(&zv->zv_state_lock);
922 if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
923 err = SET_ERROR(ENXIO);
924 goto out_locked;
925 }
926 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
927
928 /*
929 * Make sure zvol is not suspended during first open
930 * (hold zv_suspend_lock) and respect proper lock acquisition
931 * ordering - zv_suspend_lock before zv_state_lock.
932 */
933 if (zv->zv_open_count == 0) {
934 drop_suspend = B_TRUE;
935 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
936 mutex_exit(&zv->zv_state_lock);
937 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
938 mutex_enter(&zv->zv_state_lock);
939
940 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
941 /* Removal started while locks were down. */
942 err = SET_ERROR(ENXIO);
943 goto out_locked;
944 }
945
946 /* Check to see if zv_suspend_lock is needed. */
947 if (zv->zv_open_count != 0) {
948 rw_exit(&zv->zv_suspend_lock);
949 drop_suspend = B_FALSE;
950 }
951 }
952 }
953
954 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
955
956 if (zv->zv_open_count == 0) {
957 boolean_t drop_namespace = B_FALSE;
958
959 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
960
961 /*
962 * Take spa_namespace_lock to prevent lock inversion when
963 * zvols from one pool are opened as vdevs in another.
964 */
965 if (!mutex_owned(&spa_namespace_lock)) {
966 if (!mutex_tryenter(&spa_namespace_lock)) {
967 mutex_exit(&zv->zv_state_lock);
968 rw_exit(&zv->zv_suspend_lock);
969 drop_suspend = B_FALSE;
970 kern_yield(PRI_USER);
971 goto retry;
972 } else {
973 drop_namespace = B_TRUE;
974 }
975 }
976 err = zvol_first_open(zv, !(flags & FWRITE));
977 if (drop_namespace)
978 mutex_exit(&spa_namespace_lock);
979 if (err)
980 goto out_locked;
981 }
982
983 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
984
985 if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
986 err = SET_ERROR(EROFS);
987 goto out_opened;
988 }
989 if (zv->zv_flags & ZVOL_EXCL) {
990 err = SET_ERROR(EBUSY);
991 goto out_opened;
992 }
993 if (flags & O_EXCL) {
994 if (zv->zv_open_count != 0) {
995 err = SET_ERROR(EBUSY);
996 goto out_opened;
997 }
998 zv->zv_flags |= ZVOL_EXCL;
999 }
1000
1001 zv->zv_open_count++;
1002 out_opened:
1003 if (zv->zv_open_count == 0) {
1004 zvol_last_close(zv);
1005 wakeup(zv);
1006 }
1007 out_locked:
1008 mutex_exit(&zv->zv_state_lock);
1009 if (drop_suspend)
1010 rw_exit(&zv->zv_suspend_lock);
1011 return (err);
1012 }
1013
1014 static int
1015 zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
1016 {
1017 zvol_state_t *zv;
1018 boolean_t drop_suspend = B_TRUE;
1019
1020 zv = atomic_load_ptr(&dev->si_drv2);
1021 if (zv == NULL)
1022 return (SET_ERROR(ENXIO));
1023
1024 mutex_enter(&zv->zv_state_lock);
1025 if (zv->zv_flags & ZVOL_EXCL) {
1026 ASSERT3U(zv->zv_open_count, ==, 1);
1027 zv->zv_flags &= ~ZVOL_EXCL;
1028 }
1029
1030 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
1031
1032 /*
1033 * If the open count is zero, this is a spurious close.
1034 * That indicates a bug in the kernel / DDI framework.
1035 */
1036 ASSERT3U(zv->zv_open_count, >, 0);
1037 /*
1038 * Make sure zvol is not suspended during last close
1039 * (hold zv_suspend_lock) and respect proper lock acquisition
1040 * ordering - zv_suspend_lock before zv_state_lock.
1041 */
1042 if (zv->zv_open_count == 1) {
1043 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
1044 mutex_exit(&zv->zv_state_lock);
1045 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1046 mutex_enter(&zv->zv_state_lock);
1047
1048 /*
1049 * Unlike in zvol_cdev_open(), we don't check if
1050 * removal started here, because we might be one of the
1051 * openers that needs to be thrown out! If we're the
1052 * last, we need to call zvol_last_close() below to
1053 * finish cleanup. So, no special treatment for us.
1054 */
1055
1056 /* Check to see if zv_suspend_lock is needed. */
1057 if (zv->zv_open_count != 1) {
1058 rw_exit(&zv->zv_suspend_lock);
1059 drop_suspend = B_FALSE;
1060 }
1061 }
1062 } else {
1063 drop_suspend = B_FALSE;
1064 }
1065
1066 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1067
1068 /*
1069 * You may get multiple opens, but only one close.
1070 */
1071 zv->zv_open_count--;
1072
1073 if (zv->zv_open_count == 0) {
1074 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1075 zvol_last_close(zv);
1076 wakeup(zv);
1077 }
1078
1079 mutex_exit(&zv->zv_state_lock);
1080
1081 if (drop_suspend)
1082 rw_exit(&zv->zv_suspend_lock);
1083 return (0);
1084 }
1085
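/*
 * ioctl handler for volmode=dev. Implements the disk-style queries
 * (DIOCGSECTORSIZE, DIOCGMEDIASIZE, DIOCGSTRIPESIZE, DIOCGSTRIPEOFFSET,
 * DIOCGATTR), cache flushing (DIOCGFLUSH), range deletion (DIOCGDELETE) and
 * hole/data seeking (FIOSEEKHOLE/FIOSEEKDATA).
 */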
1086 static int
1087 zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
1088 int fflag, struct thread *td)
1089 {
1090 zvol_state_t *zv;
1091 zfs_locked_range_t *lr;
1092 off_t offset, length;
1093 int error;
1094 boolean_t sync;
1095
1096 zv = atomic_load_ptr(&dev->si_drv2);
1097 ASSERT3P(zv, !=, NULL);
1098
1099 error = 0;
1100 KASSERT(zv->zv_open_count > 0,
1101 ("Device with zero access count in %s", __func__));
1102
1103 switch (cmd) {
1104 case DIOCGSECTORSIZE:
1105 *(uint32_t *)data = DEV_BSIZE;
1106 break;
1107 case DIOCGMEDIASIZE:
1108 *(off_t *)data = zv->zv_volsize;
1109 break;
1110 case DIOCGFLUSH:
1111 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1112 if (zv->zv_zilog != NULL)
1113 error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
1114 rw_exit(&zv->zv_suspend_lock);
1115 break;
1116 case DIOCGDELETE:
1117 if (!zvol_unmap_enabled)
1118 break;
1119
1120 offset = ((off_t *)data)[0];
1121 length = ((off_t *)data)[1];
1122 if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
1123 offset < 0 || offset >= zv->zv_volsize ||
1124 length <= 0) {
1125 printf("%s: offset=%jd length=%jd\n", __func__, offset,
1126 length);
1127 error = SET_ERROR(EINVAL);
1128 break;
1129 }
1130 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1131 zvol_ensure_zilog(zv);
1132 lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
1133 RL_WRITER);
1134 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1135 error = dmu_tx_assign(tx, DMU_TX_WAIT);
1136 if (error != 0) {
1137 sync = FALSE;
1138 dmu_tx_abort(tx);
1139 } else {
1140 sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1141 zvol_log_truncate(zv, tx, offset, length);
1142 dmu_tx_commit(tx);
1143 error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1144 offset, length);
1145 }
1146 zfs_rangelock_exit(lr);
1147 if (sync)
1148 error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
1149 rw_exit(&zv->zv_suspend_lock);
1150 break;
1151 case DIOCGSTRIPESIZE:
1152 *(off_t *)data = zv->zv_volblocksize;
1153 break;
1154 case DIOCGSTRIPEOFFSET:
1155 *(off_t *)data = 0;
1156 break;
1157 case DIOCGATTR: {
1158 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1159 spa_t *spa = dmu_objset_spa(zv->zv_objset);
1160 struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
1161 uint64_t refd, avail, usedobjs, availobjs;
1162
1163 if (strcmp(arg->name, "GEOM::candelete") == 0)
1164 arg->value.i = 1;
1165 else if (strcmp(arg->name, "blocksavail") == 0) {
1166 dmu_objset_space(zv->zv_objset, &refd, &avail,
1167 &usedobjs, &availobjs);
1168 arg->value.off = avail / DEV_BSIZE;
1169 } else if (strcmp(arg->name, "blocksused") == 0) {
1170 dmu_objset_space(zv->zv_objset, &refd, &avail,
1171 &usedobjs, &availobjs);
1172 arg->value.off = refd / DEV_BSIZE;
1173 } else if (strcmp(arg->name, "poolblocksavail") == 0) {
1174 avail = metaslab_class_get_space(spa_normal_class(spa));
1175 avail -= metaslab_class_get_alloc(
1176 spa_normal_class(spa));
1177 arg->value.off = avail / DEV_BSIZE;
1178 } else if (strcmp(arg->name, "poolblocksused") == 0) {
1179 refd = metaslab_class_get_alloc(spa_normal_class(spa));
1180 arg->value.off = refd / DEV_BSIZE;
1181 } else
1182 error = SET_ERROR(ENOIOCTL);
1183 rw_exit(&zv->zv_suspend_lock);
1184 break;
1185 }
1186 case FIOSEEKHOLE:
1187 case FIOSEEKDATA: {
1188 off_t *off = (off_t *)data;
1189 uint64_t noff;
1190 boolean_t hole;
1191
1192 hole = (cmd == FIOSEEKHOLE);
1193 noff = *off;
1194 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1195 lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
1196 RL_READER);
1197 error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
1198 zfs_rangelock_exit(lr);
1199 rw_exit(&zv->zv_suspend_lock);
1200 *off = noff;
1201 break;
1202 }
1203 default:
1204 error = SET_ERROR(ENOIOCTL);
1205 }
1206
1207 return (error);
1208 }
1209
1210 /*
1211 * Misc. helpers
1212 */
1213
1214 static void
1215 zvol_ensure_zilog(zvol_state_t *zv)
1216 {
1217 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1218
1219 /*
1220 * Open a ZIL if this is the first time we have written to this
1221 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
1222 * than zv_state_lock so that we don't need to acquire an
1223 * additional lock in this path.
1224 */
1225 if (zv->zv_zilog == NULL) {
1226 if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
1227 rw_exit(&zv->zv_suspend_lock);
1228 rw_enter(&zv->zv_suspend_lock, RW_WRITER);
1229 }
1230 if (zv->zv_zilog == NULL) {
1231 zv->zv_zilog = zil_open(zv->zv_objset,
1232 zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1233 zv->zv_flags |= ZVOL_WRITTEN_TO;
1234 /* replay / destroy done in zvol_os_create_minor() */
1235 VERIFY0(zv->zv_zilog->zl_header->zh_flags &
1236 ZIL_REPLAY_NEEDED);
1237 }
1238 rw_downgrade(&zv->zv_suspend_lock);
1239 }
1240 }
1241
1242 boolean_t
1243 zvol_os_is_zvol(const char *device)
1244 {
1245 return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
1246 }
1247
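/*
 * Rename a minor: rehash the zvol state under the new name and re-create the
 * GEOM provider or character device node as /dev/zvol/<newname>.
 */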
1248 int
1249 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
1250 {
1251 int error = 0;
1252
1253 ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1254 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1255
1256 /* Move to a new hashtable entry. */
1257 zv->zv_hash = zvol_name_hash(newname);
1258 hlist_del(&zv->zv_hlink);
1259 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1260
1261 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1262 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1263 struct g_provider *pp = zsg->zsg_provider;
1264 struct g_geom *gp;
1265
1266 g_topology_lock();
1267 gp = pp->geom;
1268 ASSERT3P(gp, !=, NULL);
1269
1270 zsg->zsg_provider = NULL;
1271 g_wither_provider(pp, ENXIO);
1272
1273 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
1274 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1275 pp->sectorsize = DEV_BSIZE;
1276 pp->mediasize = zv->zv_volsize;
1277 pp->private = zv;
1278 zsg->zsg_provider = pp;
1279 g_error_provider(pp, 0);
1280 g_topology_unlock();
1281 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1282 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1283 struct cdev *dev;
1284 struct make_dev_args args;
1285
1286 dev = zsd->zsd_cdev;
1287 if (dev != NULL) {
1288 destroy_dev(dev);
1289 dev = zsd->zsd_cdev = NULL;
1290 if (zv->zv_open_count > 0) {
1291 zv->zv_flags &= ~ZVOL_EXCL;
1292 zv->zv_open_count = 0;
1293 /* XXX need suspend lock but lock order */
1294 zvol_last_close(zv);
1295 }
1296 }
1297
1298 make_dev_args_init(&args);
1299 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1300 args.mda_devsw = &zvol_cdevsw;
1301 args.mda_cr = NULL;
1302 args.mda_uid = UID_ROOT;
1303 args.mda_gid = GID_OPERATOR;
1304 args.mda_mode = 0640;
1305 args.mda_si_drv2 = zv;
1306 error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname);
1307 if (error == 0) {
1308 dev->si_iosize_max = maxphys;
1309 zsd->zsd_cdev = dev;
1310 }
1311 }
1312 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1313 dataset_kstats_rename(&zv->zv_kstat, newname);
1314
1315 return (error);
1316 }
1317
1318 /*
1319 * Allocate memory for a new zvol_state_t and set up the GEOM provider or
1320 * character device node used to expose the volume, depending on volmode.
1321 */
1322 static int
1323 zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
1324 zvol_state_t **zvp)
1325 {
1326 zvol_state_t *zv;
1327 uint64_t volmode;
1328 int error;
1329
1330 error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE),
1331 &volmode, NULL);
1332 if (error)
1333 return (error);
1334
1335 if (volmode == ZFS_VOLMODE_DEFAULT)
1336 volmode = zvol_volmode;
1337
1338 if (volmode == ZFS_VOLMODE_NONE)
1339 return (0);
1340
1341 zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
1342 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
1343 cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
1344 zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
1345 zv->zv_volmode = volmode;
1346 zv->zv_volsize = volsize;
1347 zv->zv_volblocksize = volblocksize;
1348 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1349 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1350 struct g_provider *pp;
1351 struct g_geom *gp;
1352
1353 g_topology_lock();
1354 gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
1355 gp->start = zvol_geom_bio_start;
1356 gp->access = zvol_geom_access;
1357 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
1358 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1359 pp->sectorsize = DEV_BSIZE;
1360 pp->mediasize = 0;
1361 pp->private = zv;
1362
1363 zsg->zsg_provider = pp;
1364 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1365 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1366 struct cdev *dev;
1367 struct make_dev_args args;
1368
1369 make_dev_args_init(&args);
1370 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1371 args.mda_devsw = &zvol_cdevsw;
1372 args.mda_cr = NULL;
1373 args.mda_uid = UID_ROOT;
1374 args.mda_gid = GID_OPERATOR;
1375 args.mda_mode = 0640;
1376 args.mda_si_drv2 = zv;
1377 error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
1378 if (error) {
1379 kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
1380 kmem_free(zv, sizeof (zvol_state_t));
1381 return (error);
1382 }
1383
1384 dev->si_iosize_max = maxphys;
1385 zsd->zsd_cdev = dev;
1386 knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock);
1387 }
1388 (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
1389 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
1390 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
1391
1392 *zvp = zv;
1393 return (error);
1394 }
1395
1396 /*
1397 * Remove minor node for the specified volume.
1398 */
1399 void
1400 zvol_os_remove_minor(zvol_state_t *zv)
1401 {
1402 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1403 ASSERT0(zv->zv_open_count);
1404 ASSERT0(atomic_read(&zv->zv_suspend_ref));
1405 ASSERT(zv->zv_flags & ZVOL_REMOVING);
1406
1407 struct zvol_state_os *zso = zv->zv_zso;
1408 zv->zv_zso = NULL;
1409
1410 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1411 struct zvol_state_geom *zsg = &zso->zso_geom;
1412 struct g_provider *pp = zsg->zsg_provider;
1413 atomic_store_ptr(&pp->private, NULL);
1414 mutex_exit(&zv->zv_state_lock);
1415
1416 g_topology_lock();
1417 g_wither_geom(pp->geom, ENXIO);
1418 g_topology_unlock();
1419 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1420 struct zvol_state_dev *zsd = &zso->zso_dev;
1421 struct cdev *dev = zsd->zsd_cdev;
1422
1423 if (dev != NULL)
1424 atomic_store_ptr(&dev->si_drv2, NULL);
1425 mutex_exit(&zv->zv_state_lock);
1426
1427 if (dev != NULL) {
1428 destroy_dev(dev);
1429 knlist_clear(&zsd->zsd_selinfo.si_note, 0);
1430 knlist_destroy(&zsd->zsd_selinfo.si_note);
1431 }
1432 }
1433
1434 kmem_free(zso, sizeof (struct zvol_state_os));
1435
1436 mutex_enter(&zv->zv_state_lock);
1437 }
1438
1439 void
1440 zvol_os_free(zvol_state_t *zv)
1441 {
1442 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1443 ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
1444 ASSERT0(zv->zv_open_count);
1445 ASSERT0P(zv->zv_zso);
1446
1447 ASSERT0P(zv->zv_objset);
1448 ASSERT0P(zv->zv_zilog);
1449 ASSERT0P(zv->zv_dn);
1450
1451 ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
1452
1453 rw_destroy(&zv->zv_suspend_lock);
1454 zfs_rangelock_fini(&zv->zv_rangelock);
1455
1456 mutex_destroy(&zv->zv_state_lock);
1457 cv_destroy(&zv->zv_removing_cv);
1458 dataset_kstats_destroy(&zv->zv_kstat);
1459 kmem_free(zv, sizeof (zvol_state_t));
1460 zvol_minors--;
1461 }
1462
1463 /*
1464 * Create a minor node (plus a whole lot more) for the specified volume.
1465 */
1466 int
1467 zvol_os_create_minor(const char *name)
1468 {
1469 zvol_state_t *zv = NULL;
1470 objset_t *os;
1471 dmu_object_info_t *doi;
1472 uint64_t volsize;
1473 uint64_t hash, len;
1474 int error;
1475 bool replayed_zil = B_FALSE;
1476
1477 if (zvol_inhibit_dev)
1478 return (0);
1479
1480 ZFS_LOG(1, "Creating ZVOL %s...", name);
1481 hash = zvol_name_hash(name);
1482 if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
1483 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1484 mutex_exit(&zv->zv_state_lock);
1485 return (SET_ERROR(EEXIST));
1486 }
1487
1488 DROP_GIANT();
1489
1490 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1491
1492 /* Lie and say we're read-only. */
1493 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
1494 if (error)
1495 goto out_doi;
1496
1497 error = dmu_object_info(os, ZVOL_OBJ, doi);
1498 if (error)
1499 goto out_dmu_objset_disown;
1500
1501 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1502 if (error)
1503 goto out_dmu_objset_disown;
1504
1505 error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv);
1506 if (error || zv == NULL)
1507 goto out_dmu_objset_disown;
1508
1509 zv->zv_hash = hash;
1510
1511 if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
1512 zv->zv_flags |= ZVOL_RDONLY;
1513
1514 zv->zv_objset = os;
1515
1516 ASSERT0P(zv->zv_kstat.dk_kstats);
1517 error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1518 if (error)
1519 goto out_dmu_objset_disown;
1520 ASSERT0P(zv->zv_zilog);
1521 zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1522 if (spa_writeable(dmu_objset_spa(os))) {
1523 if (zil_replay_disable)
1524 replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
1525 else
1526 replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1527 }
1528 if (replayed_zil)
1529 zil_close(zv->zv_zilog);
1530 zv->zv_zilog = NULL;
1531
1532 len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
1533 if (len > 0) {
1534 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_ASYNC_READ);
1535 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
1536 ZIO_PRIORITY_ASYNC_READ);
1537 }
1538
1539 zv->zv_objset = NULL;
1540 out_dmu_objset_disown:
1541 dmu_objset_disown(os, B_TRUE, FTAG);
1542
1543 if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1544 g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0);
1545 /* geom was locked inside zvol_alloc() function */
1546 g_topology_unlock();
1547 }
1548 out_doi:
1549 kmem_free(doi, sizeof (dmu_object_info_t));
1550 if (error == 0 && zv) {
1551 rw_enter(&zvol_state_lock, RW_WRITER);
1552 zvol_insert(zv);
1553 zvol_minors++;
1554 rw_exit(&zvol_state_lock);
1555 ZFS_LOG(1, "ZVOL %s created.", name);
1556 }
1557 PICKUP_GIANT();
1558 return (error);
1559 }
1560
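/*
 * Propagate a new volsize to consumers: resize the GEOM provider, or post
 * NOTE_ATTRIB to kqueue listeners for volmode=dev devices.
 */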
1561 int
1562 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
1563 {
1564 zv->zv_volsize = volsize;
1565 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1566 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1567 struct g_provider *pp = zsg->zsg_provider;
1568
1569 g_topology_lock();
1570
1571 if (pp->private == NULL) {
1572 g_topology_unlock();
1573 return (SET_ERROR(ENXIO));
1574 }
1575
1576 /*
1577 * Do not invoke a resize event when the initial size was zero.
1578 * The ZVOL initializes its size on first open, so this is not
1579 * a real resize.
1580 */
1581 if (pp->mediasize == 0)
1582 pp->mediasize = zv->zv_volsize;
1583 else
1584 g_resize_provider(pp, zv->zv_volsize);
1585
1586 g_topology_unlock();
1587 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1588 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1589
1590 KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
1591 }
1592 return (0);
1593 }
1594
1595 void
1596 zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
1597 {
1598 /*
1599 * The ro/rw ZVOL mode is switched using the zvol_set_ro() function, which
1600 * sets or clears the ZVOL_RDONLY flag. No additional FreeBSD-specific
1601 * actions are required when the readonly zfs property changes.
1602 */
1603 }
1604
1605 void
1606 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
1607 {
1608 /*
1609 * The ZVOL size/capacity is changed by the zvol_set_volsize() function.
1610 * Leave this method empty, because all of the required work is done by
1611 * the platform-specific zvol_os_update_volsize() function.
1612 */
1613 }
1614
1615 /*
1616 * Public interfaces
1617 */
1618
1619 int
1620 zvol_busy(void)
1621 {
1622 return (zvol_minors != 0);
1623 }
1624
1625 int
1626 zvol_init(void)
1627 {
1628 return (zvol_init_impl());
1629 }
1630
1631 void
1632 zvol_fini(void)
1633 {
1634 zvol_fini_impl();
1635 }
1636