1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 *
25 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
26 * All rights reserved.
27 *
28 * Portions Copyright 2010 Robert Milkowski
29 *
30 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
31 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
32 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
33 * Copyright (c) 2014 Integros [integros.com]
34 * Copyright (c) 2024, 2025, Klara, Inc.
35 */
36
37 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
38
39 /*
40 * ZFS volume emulation driver.
41 *
42 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
43 * Volumes are accessed through the symbolic links named:
44 *
45 * /dev/zvol/<pool_name>/<dataset_name>
46 *
47 * Volumes are persistent through reboot. No user command needs to be
48 * run before opening and using a device.
49 *
 * On FreeBSD, ZVOLs are GEOM providers like any other storage device in the
 * system, except when volmode=dev, in which case they are exposed as plain
 * character devices instead.
52 */
53
54 #include <sys/types.h>
55 #include <sys/param.h>
56 #include <sys/kernel.h>
57 #include <sys/errno.h>
58 #include <sys/uio.h>
59 #include <sys/bio.h>
60 #include <sys/buf.h>
61 #include <sys/kmem.h>
62 #include <sys/conf.h>
63 #include <sys/cmn_err.h>
64 #include <sys/stat.h>
65 #include <sys/proc.h>
66 #include <sys/zap.h>
67 #include <sys/spa.h>
68 #include <sys/spa_impl.h>
69 #include <sys/zio.h>
70 #include <sys/disk.h>
71 #include <sys/dmu_traverse.h>
72 #include <sys/dnode.h>
73 #include <sys/dsl_dataset.h>
74 #include <sys/dsl_prop.h>
75 #include <sys/dsl_dir.h>
76 #include <sys/byteorder.h>
77 #include <sys/sunddi.h>
78 #include <sys/dirent.h>
79 #include <sys/policy.h>
80 #include <sys/queue.h>
81 #include <sys/fs/zfs.h>
82 #include <sys/zfs_ioctl.h>
83 #include <sys/zil.h>
84 #include <sys/zfs_znode.h>
85 #include <sys/zfs_rlock.h>
86 #include <sys/vdev_impl.h>
87 #include <sys/vdev_raidz.h>
88 #include <sys/zvol.h>
89 #include <sys/zil_impl.h>
90 #include <sys/dataset_kstats.h>
91 #include <sys/dbuf.h>
92 #include <sys/dmu_tx.h>
93 #include <sys/zfeature.h>
94 #include <sys/zio_checksum.h>
95 #include <sys/zil_impl.h>
96 #include <sys/filio.h>
97 #include <sys/freebsd_event.h>
98
99 #include <geom/geom.h>
100 #include <sys/zvol.h>
101 #include <sys/zvol_impl.h>
102 #include <cityhash.h>
103
104 #include "zfs_namecheck.h"
105
106 #define ZVOL_DUMPSIZE "dumpsize"
107
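/*
 * With ZVOL_LOCK_DEBUG defined, reader acquisitions of zv_suspend_lock are
 * promoted to writer acquisitions (and the corresponding *_HELD assertions
 * to write-held checks), making the lock fully exclusive so that locking
 * mistakes are easier to provoke and diagnose during development.
 */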
108 #ifdef ZVOL_LOCK_DEBUG
109 #define ZVOL_RW_READER RW_WRITER
110 #define ZVOL_RW_READ_HELD RW_WRITE_HELD
111 #else
112 #define ZVOL_RW_READER RW_READER
113 #define ZVOL_RW_READ_HELD RW_READ_HELD
114 #endif
115
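/*
 * Per-OS zvol state.  Exactly one member of the union is valid for a given
 * zvol, selected by its volmode: a GEOM provider for volmode=geom, or a
 * character device plus kqueue notification state for volmode=dev.
 */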
116 struct zvol_state_os {
117 #define zso_dev _zso_state._zso_dev
118 #define zso_geom _zso_state._zso_geom
119 union {
120 /* volmode=dev */
121 struct zvol_state_dev {
122 struct cdev *zsd_cdev;
123 struct selinfo zsd_selinfo;
124 } _zso_dev;
125
126 /* volmode=geom */
127 struct zvol_state_geom {
128 struct g_provider *zsg_provider;
129 } _zso_geom;
130 } _zso_state;
131 int zso_dying;
132 };
133
134 static uint32_t zvol_minors;
135
136 SYSCTL_DECL(_vfs_zfs);
137 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
138
139 static boolean_t zpool_on_zvol = B_FALSE;
140 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
141 "Allow zpools to use zvols as vdevs (DANGEROUS)");
142
143 /*
144 * Toggle unmap functionality.
145 */
146 boolean_t zvol_unmap_enabled = B_TRUE;
147
148 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
149 &zvol_unmap_enabled, 0, "Enable UNMAP functionality");
150
151 /*
152 * zvol maximum transfer in one DMU tx.
153 */
154 int zvol_maxphys = DMU_MAX_ACCESS / 2;
155
156 static void zvol_ensure_zilog(zvol_state_t *zv);
157
158 static d_open_t zvol_cdev_open;
159 static d_close_t zvol_cdev_close;
160 static d_ioctl_t zvol_cdev_ioctl;
161 static d_read_t zvol_cdev_read;
162 static d_write_t zvol_cdev_write;
163 static d_strategy_t zvol_cdev_bio_strategy;
164 static d_kqfilter_t zvol_cdev_kqfilter;
165
166 static struct cdevsw zvol_cdevsw = {
167 .d_name = "zvol",
168 .d_version = D_VERSION,
169 .d_flags = D_DISK | D_TRACKCLOSE,
170 .d_open = zvol_cdev_open,
171 .d_close = zvol_cdev_close,
172 .d_ioctl = zvol_cdev_ioctl,
173 .d_read = zvol_cdev_read,
174 .d_write = zvol_cdev_write,
175 .d_strategy = zvol_cdev_bio_strategy,
176 .d_kqfilter = zvol_cdev_kqfilter,
177 };
178
179 static void zvol_filter_detach(struct knote *kn);
180 static int zvol_filter_vnode(struct knote *kn, long hint);
181
182 static struct filterops zvol_filterops_vnode = {
183 .f_isfd = 1,
184 .f_detach = zvol_filter_detach,
185 .f_event = zvol_filter_vnode,
186 .f_copy = knote_triv_copy,
187 };
188
189 extern uint_t zfs_geom_probe_vdev_key;
190
191 struct g_class zfs_zvol_class = {
192 .name = "ZFS::ZVOL",
193 .version = G_VERSION,
194 };
195
196 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
197
198 static int zvol_geom_open(struct g_provider *pp, int flag, int count);
199 static int zvol_geom_close(struct g_provider *pp, int flag, int count);
200 static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
201 static void zvol_geom_bio_start(struct bio *bp);
202 static int zvol_geom_bio_getattr(struct bio *bp);
203 static void zvol_geom_bio_strategy(struct bio *bp, boolean_t sync);
204
205 /*
206 * GEOM mode implementation
207 */
208
209 static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
211 {
212 zvol_state_t *zv;
213 int err = 0;
214 boolean_t drop_suspend = B_FALSE;
215
216 if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
217 /*
218 * If zfs_geom_probe_vdev_key is set, that means that zfs is
219 * attempting to probe geom providers while looking for a
220 * replacement for a missing VDEV. In this case, the
221 * spa_namespace_lock will not be held, but it is still illegal
222 * to use a zvol as a vdev. Deadlocks can result if another
223 * thread has spa_namespace_lock.
224 */
225 return (SET_ERROR(EOPNOTSUPP));
226 }
227
228 retry:
229 zv = atomic_load_ptr(&pp->private);
230 if (zv == NULL)
231 return (SET_ERROR(ENXIO));
232
233 mutex_enter(&zv->zv_state_lock);
234 if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
235 err = SET_ERROR(ENXIO);
236 goto out_locked;
237 }
238 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
239
240 /*
241 * Make sure zvol is not suspended during first open
242 * (hold zv_suspend_lock) and respect proper lock acquisition
243 * ordering - zv_suspend_lock before zv_state_lock.
244 */
245 if (zv->zv_open_count == 0) {
246 drop_suspend = B_TRUE;
247 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
248 mutex_exit(&zv->zv_state_lock);
249
250 /*
251 * Removal may happen while the locks are down, so
252 * we can't trust zv any longer; we have to start over.
253 */
254 zv = atomic_load_ptr(&pp->private);
255 if (zv == NULL)
256 return (SET_ERROR(ENXIO));
257
258 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
259 mutex_enter(&zv->zv_state_lock);
260
261 if (zv->zv_zso->zso_dying ||
262 zv->zv_flags & ZVOL_REMOVING) {
263 err = SET_ERROR(ENXIO);
264 goto out_locked;
265 }
266
267 /* Check to see if zv_suspend_lock is needed. */
268 if (zv->zv_open_count != 0) {
269 rw_exit(&zv->zv_suspend_lock);
270 drop_suspend = B_FALSE;
271 }
272 }
273 }
274
275 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
276
277 if (zv->zv_open_count == 0) {
278 boolean_t drop_namespace = B_FALSE;
279
280 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
281
282 /*
283 * Take spa_namespace_lock to prevent lock inversion when
284 * zvols from one pool are opened as vdevs in another.
285 */
286 if (!spa_namespace_held()) {
287 if (!spa_namespace_tryenter(FTAG)) {
288 mutex_exit(&zv->zv_state_lock);
289 rw_exit(&zv->zv_suspend_lock);
290 drop_suspend = B_FALSE;
291 kern_yield(PRI_USER);
292 goto retry;
293 } else {
294 drop_namespace = B_TRUE;
295 }
296 }
297 err = zvol_first_open(zv, !(flag & FWRITE));
298 if (drop_namespace)
299 spa_namespace_exit(FTAG);
300 if (err)
301 goto out_locked;
302 pp->mediasize = zv->zv_volsize;
303 pp->stripeoffset = 0;
304 pp->stripesize = zv->zv_volblocksize;
305 }
306
307 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
308
309 /*
310 * Check for a bad on-disk format version now since we
311 * lied about owning the dataset readonly before.
312 */
313 if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
314 dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
315 err = SET_ERROR(EROFS);
316 goto out_opened;
317 }
318 if (zv->zv_flags & ZVOL_EXCL) {
319 err = SET_ERROR(EBUSY);
320 goto out_opened;
321 }
322 if (flag & O_EXCL) {
323 if (zv->zv_open_count != 0) {
324 err = SET_ERROR(EBUSY);
325 goto out_opened;
326 }
327 zv->zv_flags |= ZVOL_EXCL;
328 }
329
330 zv->zv_open_count += count;
331 out_opened:
332 if (zv->zv_open_count == 0) {
333 zvol_last_close(zv);
334 wakeup(zv);
335 }
336 out_locked:
337 mutex_exit(&zv->zv_state_lock);
338 if (drop_suspend)
339 rw_exit(&zv->zv_suspend_lock);
340 return (err);
341 }
342
343 static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
345 {
346 (void) flag;
347 zvol_state_t *zv;
348 boolean_t drop_suspend = B_TRUE;
349 int new_open_count;
350
351 zv = atomic_load_ptr(&pp->private);
352 if (zv == NULL)
353 return (SET_ERROR(ENXIO));
354
355 mutex_enter(&zv->zv_state_lock);
356 if (zv->zv_flags & ZVOL_EXCL) {
357 ASSERT3U(zv->zv_open_count, ==, 1);
358 zv->zv_flags &= ~ZVOL_EXCL;
359 }
360
361 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
362
363 /*
364 * If the open count is zero, this is a spurious close.
365 * That indicates a bug in the kernel / DDI framework.
366 */
367 ASSERT3U(zv->zv_open_count, >, 0);
368
369 /*
370 * Make sure zvol is not suspended during last close
371 * (hold zv_suspend_lock) and respect proper lock acquisition
372 * ordering - zv_suspend_lock before zv_state_lock.
373 */
374 new_open_count = zv->zv_open_count - count;
375 if (new_open_count == 0) {
376 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
377 mutex_exit(&zv->zv_state_lock);
378 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
379 mutex_enter(&zv->zv_state_lock);
380
381 /*
382 * Unlike in zvol_geom_open(), we don't check if
383 * removal started here, because we might be one of the
384 * openers that needs to be thrown out! If we're the
385 * last, we need to call zvol_last_close() below to
386 * finish cleanup. So, no special treatment for us.
387 */
388
389 /* Check to see if zv_suspend_lock is needed. */
390 new_open_count = zv->zv_open_count - count;
391 if (new_open_count != 0) {
392 rw_exit(&zv->zv_suspend_lock);
393 drop_suspend = B_FALSE;
394 }
395 }
396 } else {
397 drop_suspend = B_FALSE;
398 }
399
400 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
401
402 /*
403 * You may get multiple opens, but only one close.
404 */
405 zv->zv_open_count = new_open_count;
406 if (zv->zv_open_count == 0) {
407 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
408 zvol_last_close(zv);
409 wakeup(zv);
410 }
411
412 mutex_exit(&zv->zv_state_lock);
413
414 if (drop_suspend)
415 rw_exit(&zv->zv_suspend_lock);
416 return (0);
417 }
418
419 void
zvol_wait_close(zvol_state_t *zv)
421 {
422
423 if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
424 return;
425 mutex_enter(&zv->zv_state_lock);
426 zv->zv_zso->zso_dying = B_TRUE;
427
428 if (zv->zv_open_count)
429 msleep(zv, &zv->zv_state_lock,
430 PRIBIO, "zvol:dying", 10*hz);
431 mutex_exit(&zv->zv_state_lock);
432 }
433
434
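/*
 * Translate a GEOM access-count change into zvol open/close calls.  GEOM
 * passes deltas for the read, write and exclusive counters; a positive sum
 * is treated as an open and a negative sum as a close.
 */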
435 static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
437 {
438 int count, error, flags;
439
440 g_topology_assert();
441
442 /*
443 * To make it easier we expect either open or close, but not both
444 * at the same time.
445 */
446 KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
447 (acr <= 0 && acw <= 0 && ace <= 0),
448 ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
449 pp->name, acr, acw, ace));
450
451 if (atomic_load_ptr(&pp->private) == NULL) {
452 if (acr <= 0 && acw <= 0 && ace <= 0)
453 return (0);
454 return (pp->error);
455 }
456
457 /*
458 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
459 * ace != 0, because GEOM already handles that and handles it a bit
460 * differently. GEOM allows for multiple read/exclusive consumers and
 * ZFS allows only one exclusive consumer, no matter whether it is a reader
 * or a writer.  The way GEOM handles this is preferable, so leave it to
 * GEOM to decide what to do.
464 */
465
466 count = acr + acw + ace;
467 if (count == 0)
468 return (0);
469
470 flags = 0;
471 if (acr != 0 || ace != 0)
472 flags |= FREAD;
473 if (acw != 0)
474 flags |= FWRITE;
475
476 g_topology_unlock();
477 if (count > 0)
478 error = zvol_geom_open(pp, flags, count);
479 else
480 error = zvol_geom_close(pp, flags, -count);
481 g_topology_lock();
482 return (error);
483 }
484
485 static void
zvol_geom_bio_start(struct bio *bp)
487 {
488 zvol_state_t *zv = bp->bio_to->private;
489
490 if (zv == NULL) {
491 g_io_deliver(bp, ENXIO);
492 return;
493 }
494 if (bp->bio_cmd == BIO_GETATTR) {
495 if (zvol_geom_bio_getattr(bp))
496 g_io_deliver(bp, EOPNOTSUPP);
497 return;
498 }
499
500 zvol_geom_bio_strategy(bp, !g_is_geom_thread(curthread) &&
501 THREAD_CAN_SLEEP());
502 }
503
504 static int
zvol_geom_bio_getattr(struct bio *bp)
506 {
507 zvol_state_t *zv;
508
509 zv = bp->bio_to->private;
510 ASSERT3P(zv, !=, NULL);
511
512 spa_t *spa = dmu_objset_spa(zv->zv_objset);
513 uint64_t refd, avail, usedobjs, availobjs;
514
515 if (g_handleattr_int(bp, "GEOM::candelete", 1))
516 return (0);
517 if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
518 dmu_objset_space(zv->zv_objset, &refd, &avail,
519 &usedobjs, &availobjs);
520 if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
521 return (0);
522 } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
523 dmu_objset_space(zv->zv_objset, &refd, &avail,
524 &usedobjs, &availobjs);
525 if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
526 return (0);
527 } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
528 avail = metaslab_class_get_space(spa_normal_class(spa));
529 avail -= metaslab_class_get_alloc(spa_normal_class(spa));
530 if (g_handleattr_off_t(bp, "poolblocksavail",
531 avail / DEV_BSIZE))
532 return (0);
533 } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
534 refd = metaslab_class_get_alloc(spa_normal_class(spa));
535 if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
536 return (0);
537 }
538 return (1);
539 }
540
541 static void
zvol_filter_detach(struct knote *kn)
543 {
544 zvol_state_t *zv;
545 struct zvol_state_dev *zsd;
546
547 zv = kn->kn_hook;
548 zsd = &zv->zv_zso->zso_dev;
549
550 knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
551 }
552
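/*
 * Accumulate the NOTE_* events (passed in the hint) that this knote is
 * interested in and report the knote as active if any have been seen.
 */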
553 static int
zvol_filter_vnode(struct knote *kn, long hint)
555 {
556 kn->kn_fflags |= kn->kn_sfflags & hint;
557
558 return (kn->kn_fflags != 0);
559 }
560
561 static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
563 {
564 zvol_state_t *zv;
565 struct zvol_state_dev *zsd;
566
567 zv = dev->si_drv2;
568 zsd = &zv->zv_zso->zso_dev;
569
570 if (kn->kn_filter != EVFILT_VNODE)
571 return (EINVAL);
572
573 /* XXX: extend support for other NOTE_* events */
574 if (kn->kn_sfflags != NOTE_ATTRIB)
575 return (EINVAL);
576
577 kn->kn_fop = &zvol_filterops_vnode;
578 kn->kn_hook = zv;
579 knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);
580
581 return (0);
582 }
583
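/*
 * Common bio handler shared by the GEOM and character device entry points:
 * bounds-check the request, take the range lock, and translate BIO_READ,
 * BIO_WRITE, BIO_DELETE and BIO_FLUSH into the corresponding DMU and ZIL
 * operations.
 */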
584 static void
zvol_strategy_impl(zv_request_t *zvr)
586 {
587 zvol_state_t *zv;
588 struct bio *bp;
589 uint64_t off, volsize;
590 size_t resid;
591 char *addr;
592 objset_t *os;
593 zfs_locked_range_t *lr;
594 int error = 0;
595 boolean_t doread = B_FALSE;
596 boolean_t is_dumpified;
597 boolean_t commit;
598
599 bp = zvr->bio;
600 zv = zvr->zv;
601 if (zv == NULL) {
602 error = SET_ERROR(ENXIO);
603 goto out;
604 }
605
606 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
607
608 if (zv->zv_flags & ZVOL_REMOVING) {
609 error = SET_ERROR(ENXIO);
610 goto resume;
611 }
612
613 switch (bp->bio_cmd) {
614 case BIO_READ:
615 doread = B_TRUE;
616 break;
617 case BIO_WRITE:
618 case BIO_FLUSH:
619 case BIO_DELETE:
620 if (zv->zv_flags & ZVOL_RDONLY) {
621 error = SET_ERROR(EROFS);
622 goto resume;
623 }
624 zvol_ensure_zilog(zv);
625 if (bp->bio_cmd == BIO_FLUSH)
626 goto commit;
627 break;
628 default:
629 error = SET_ERROR(EOPNOTSUPP);
630 goto resume;
631 }
632
633 off = bp->bio_offset;
634 volsize = zv->zv_volsize;
635
636 os = zv->zv_objset;
637 ASSERT3P(os, !=, NULL);
638
639 addr = bp->bio_data;
640 resid = bp->bio_length;
641
642 if (resid > 0 && off >= volsize) {
643 error = SET_ERROR(EIO);
644 goto resume;
645 }
646
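	/*
	 * Writes are committed to the ZIL before returning when the dataset
	 * uses sync=always; BIO_FLUSH requests jump straight to the commit
	 * at the end of this function.
	 */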
647 is_dumpified = B_FALSE;
648 commit = !doread && !is_dumpified &&
649 zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
650
651 /*
652 * There must be no buffer changes when doing a dmu_sync() because
653 * we can't change the data whilst calculating the checksum.
654 */
655 lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
656 doread ? RL_READER : RL_WRITER);
657
658 if (bp->bio_cmd == BIO_DELETE) {
659 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
660 error = dmu_tx_assign(tx, DMU_TX_WAIT);
661 if (error != 0) {
662 dmu_tx_abort(tx);
663 } else {
664 zvol_log_truncate(zv, tx, off, resid);
665 dmu_tx_commit(tx);
666 error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
667 off, resid);
668 resid = 0;
669 }
670 goto unlock;
671 }
672 while (resid != 0 && off < volsize) {
673 size_t size = MIN(resid, zvol_maxphys);
674 if (doread) {
675 error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
676 DMU_READ_PREFETCH);
677 } else {
678 dmu_tx_t *tx = dmu_tx_create(os);
679 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
680 error = dmu_tx_assign(tx, DMU_TX_WAIT);
681 if (error) {
682 dmu_tx_abort(tx);
683 } else {
684 dmu_write_by_dnode(zv->zv_dn, off, size, addr,
685 tx, DMU_READ_PREFETCH);
686 zvol_log_write(zv, tx, off, size, commit);
687 dmu_tx_commit(tx);
688 }
689 }
690 if (error) {
691 /* Convert checksum errors into IO errors. */
692 if (error == ECKSUM)
693 error = SET_ERROR(EIO);
694 break;
695 }
696 off += size;
697 addr += size;
698 resid -= size;
699 }
700 unlock:
701 zfs_rangelock_exit(lr);
702
703 bp->bio_completed = bp->bio_length - resid;
704 if (bp->bio_completed < bp->bio_length && off > volsize)
705 error = SET_ERROR(EINVAL);
706
707 switch (bp->bio_cmd) {
708 case BIO_FLUSH:
709 break;
710 case BIO_READ:
711 dataset_kstats_update_read_kstats(&zv->zv_kstat,
712 bp->bio_completed);
713 break;
714 case BIO_WRITE:
715 dataset_kstats_update_write_kstats(&zv->zv_kstat,
716 bp->bio_completed);
717 break;
718 case BIO_DELETE:
719 break;
720 default:
721 break;
722 }
723
724 if (error == 0 && commit) {
725 commit:
726 error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
727 }
728 resume:
729 rw_exit(&zv->zv_suspend_lock);
730 out:
731 if (bp->bio_to)
732 g_io_deliver(bp, error);
733 else
734 biofinish(bp, NULL, error);
735 }
736
737 static void
zvol_strategy_task(void *arg)
739 {
740 zv_request_task_t *task = arg;
741
742 zvol_strategy_impl(&task->zvr);
743 zv_request_task_free(task);
744 }
745
746 static void
zvol_geom_bio_strategy(struct bio *bp, boolean_t sync)
748 {
749 zv_taskq_t *ztqs = &zvol_taskqs;
750 zv_request_task_t *task;
751 zvol_state_t *zv;
752 uint_t tq_idx;
753 uint_t taskq_hash;
754 int error;
755
756 if (bp->bio_to)
757 zv = bp->bio_to->private;
758 else
759 zv = bp->bio_dev->si_drv2;
760
761 if (zv == NULL) {
762 error = SET_ERROR(ENXIO);
763 if (bp->bio_to)
764 g_io_deliver(bp, error);
765 else
766 biofinish(bp, NULL, error);
767 return;
768 }
769
770 zv_request_t zvr = {
771 .zv = zv,
772 .bio = bp,
773 };
774
775 if (sync || zvol_request_sync) {
776 zvol_strategy_impl(&zvr);
777 return;
778 }
779
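	/*
	 * Spread asynchronous requests across the zvol taskqs, hashing on the
	 * zvol, the current CPU and the block address so that requests for
	 * nearby offsets tend to be serviced by the same queue.
	 */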
780 taskq_hash = cityhash3((uintptr_t)zv, curcpu, bp->bio_offset >>
781 ZVOL_TASKQ_OFFSET_SHIFT);
782 tq_idx = taskq_hash % ztqs->tqs_cnt;
783 task = zv_request_task_create(zvr);
784 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_strategy_task, task,
785 0, &task->ent);
786 }
787
788 static void
zvol_cdev_bio_strategy(struct bio *bp)
790 {
791 zvol_geom_bio_strategy(bp, B_FALSE);
792 }
793
794 /*
795 * Character device mode implementation
796 */
797
798 static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
800 {
801 zvol_state_t *zv;
802 uint64_t volsize;
803 zfs_locked_range_t *lr;
804 int error = 0;
805 zfs_uio_t uio;
806
807 zfs_uio_init(&uio, uio_s);
808
809 zv = dev->si_drv2;
810
811 volsize = zv->zv_volsize;
812 /*
813 * uio_loffset == volsize isn't an error as
814 * it's required for EOF processing.
815 */
816 if (zfs_uio_resid(&uio) > 0 &&
817 (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
818 return (SET_ERROR(EIO));
819
820 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
821 ssize_t start_resid = zfs_uio_resid(&uio);
822 lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
823 zfs_uio_resid(&uio), RL_READER);
824 while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
825 uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
826
827 /* Don't read past the end. */
828 if (bytes > volsize - zfs_uio_offset(&uio))
829 bytes = volsize - zfs_uio_offset(&uio);
830
831 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
832 DMU_READ_PREFETCH);
833 if (error) {
834 /* Convert checksum errors into IO errors. */
835 if (error == ECKSUM)
836 error = SET_ERROR(EIO);
837 break;
838 }
839 }
840 zfs_rangelock_exit(lr);
841 int64_t nread = start_resid - zfs_uio_resid(&uio);
842 dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
843 rw_exit(&zv->zv_suspend_lock);
844
845 return (error);
846 }
847
848 static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
850 {
851 zvol_state_t *zv;
852 uint64_t volsize;
853 zfs_locked_range_t *lr;
854 int error = 0;
855 boolean_t commit;
856 zfs_uio_t uio;
857
858 zv = dev->si_drv2;
859
860 volsize = zv->zv_volsize;
861
862 zfs_uio_init(&uio, uio_s);
863
864 if (zfs_uio_resid(&uio) > 0 &&
865 (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
866 return (SET_ERROR(EIO));
867
868 ssize_t start_resid = zfs_uio_resid(&uio);
869 commit = (ioflag & IO_SYNC) ||
870 (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
871
872 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
873 zvol_ensure_zilog(zv);
874
875 lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
876 zfs_uio_resid(&uio), RL_WRITER);
877 while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
878 uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
879 uint64_t off = zfs_uio_offset(&uio);
880 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
881
882 if (bytes > volsize - off) /* Don't write past the end. */
883 bytes = volsize - off;
884
885 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
886 error = dmu_tx_assign(tx, DMU_TX_WAIT);
887 if (error) {
888 dmu_tx_abort(tx);
889 break;
890 }
891 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
892 DMU_READ_PREFETCH);
893 if (error == 0)
894 zvol_log_write(zv, tx, off, bytes, commit);
895 dmu_tx_commit(tx);
896
897 if (error)
898 break;
899 }
900 zfs_rangelock_exit(lr);
901 int64_t nwritten = start_resid - zfs_uio_resid(&uio);
902 dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
903 if (error == 0 && commit)
904 error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
905 rw_exit(&zv->zv_suspend_lock);
906
907 return (error);
908 }
909
910 static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
912 {
913 zvol_state_t *zv;
914 int err = 0;
915 boolean_t drop_suspend = B_FALSE;
916
917 retry:
918 zv = atomic_load_ptr(&dev->si_drv2);
919 if (zv == NULL)
920 return (SET_ERROR(ENXIO));
921
922 mutex_enter(&zv->zv_state_lock);
923 if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
924 err = SET_ERROR(ENXIO);
925 goto out_locked;
926 }
927 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
928
929 /*
930 * Make sure zvol is not suspended during first open
931 * (hold zv_suspend_lock) and respect proper lock acquisition
932 * ordering - zv_suspend_lock before zv_state_lock.
933 */
934 if (zv->zv_open_count == 0) {
935 drop_suspend = B_TRUE;
936 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
937 mutex_exit(&zv->zv_state_lock);
938 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
939 mutex_enter(&zv->zv_state_lock);
940
941 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
942 /* Removal started while locks were down. */
943 err = SET_ERROR(ENXIO);
944 goto out_locked;
945 }
946
947 /* Check to see if zv_suspend_lock is needed. */
948 if (zv->zv_open_count != 0) {
949 rw_exit(&zv->zv_suspend_lock);
950 drop_suspend = B_FALSE;
951 }
952 }
953 }
954
955 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
956
957 if (zv->zv_open_count == 0) {
958 boolean_t drop_namespace = B_FALSE;
959
960 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
961
962 /*
963 * Take spa_namespace_lock to prevent lock inversion when
964 * zvols from one pool are opened as vdevs in another.
965 */
966 if (!spa_namespace_held()) {
967 if (!spa_namespace_tryenter(FTAG)) {
968 mutex_exit(&zv->zv_state_lock);
969 rw_exit(&zv->zv_suspend_lock);
970 drop_suspend = B_FALSE;
971 kern_yield(PRI_USER);
972 goto retry;
973 } else {
974 drop_namespace = B_TRUE;
975 }
976 }
977 err = zvol_first_open(zv, !(flags & FWRITE));
978 if (drop_namespace)
979 spa_namespace_exit(FTAG);
980 if (err)
981 goto out_locked;
982 }
983
984 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
985
986 if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
987 err = SET_ERROR(EROFS);
988 goto out_opened;
989 }
990 if (zv->zv_flags & ZVOL_EXCL) {
991 err = SET_ERROR(EBUSY);
992 goto out_opened;
993 }
994 if (flags & O_EXCL) {
995 if (zv->zv_open_count != 0) {
996 err = SET_ERROR(EBUSY);
997 goto out_opened;
998 }
999 zv->zv_flags |= ZVOL_EXCL;
1000 }
1001
1002 zv->zv_open_count++;
1003 out_opened:
1004 if (zv->zv_open_count == 0) {
1005 zvol_last_close(zv);
1006 wakeup(zv);
1007 }
1008 out_locked:
1009 mutex_exit(&zv->zv_state_lock);
1010 if (drop_suspend)
1011 rw_exit(&zv->zv_suspend_lock);
1012 return (err);
1013 }
1014
1015 static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
1017 {
1018 zvol_state_t *zv;
1019 boolean_t drop_suspend = B_TRUE;
1020
1021 zv = atomic_load_ptr(&dev->si_drv2);
1022 if (zv == NULL)
1023 return (SET_ERROR(ENXIO));
1024
1025 mutex_enter(&zv->zv_state_lock);
1026 if (zv->zv_flags & ZVOL_EXCL) {
1027 ASSERT3U(zv->zv_open_count, ==, 1);
1028 zv->zv_flags &= ~ZVOL_EXCL;
1029 }
1030
1031 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
1032
1033 /*
1034 * If the open count is zero, this is a spurious close.
1035 * That indicates a bug in the kernel / DDI framework.
1036 */
1037 ASSERT3U(zv->zv_open_count, >, 0);
1038 /*
1039 * Make sure zvol is not suspended during last close
1040 * (hold zv_suspend_lock) and respect proper lock acquisition
1041 * ordering - zv_suspend_lock before zv_state_lock.
1042 */
1043 if (zv->zv_open_count == 1) {
1044 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
1045 mutex_exit(&zv->zv_state_lock);
1046 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1047 mutex_enter(&zv->zv_state_lock);
1048
1049 /*
1050 * Unlike in zvol_cdev_open(), we don't check if
1051 * removal started here, because we might be one of the
1052 * openers that needs to be thrown out! If we're the
1053 * last, we need to call zvol_last_close() below to
1054 * finish cleanup. So, no special treatment for us.
1055 */
1056
1057 /* Check to see if zv_suspend_lock is needed. */
1058 if (zv->zv_open_count != 1) {
1059 rw_exit(&zv->zv_suspend_lock);
1060 drop_suspend = B_FALSE;
1061 }
1062 }
1063 } else {
1064 drop_suspend = B_FALSE;
1065 }
1066
1067 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1068
1069 /*
1070 * You may get multiple opens, but only one close.
1071 */
1072 zv->zv_open_count--;
1073
1074 if (zv->zv_open_count == 0) {
1075 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1076 zvol_last_close(zv);
1077 wakeup(zv);
1078 }
1079
1080 mutex_exit(&zv->zv_state_lock);
1081
1082 if (drop_suspend)
1083 rw_exit(&zv->zv_suspend_lock);
1084 return (0);
1085 }
1086
1087 static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
1090 {
1091 zvol_state_t *zv;
1092 zfs_locked_range_t *lr;
1093 off_t offset, length;
1094 int error;
1095 boolean_t sync;
1096
1097 zv = atomic_load_ptr(&dev->si_drv2);
1098 ASSERT3P(zv, !=, NULL);
1099
1100 error = 0;
1101 KASSERT(zv->zv_open_count > 0,
1102 ("Device with zero access count in %s", __func__));
1103
1104 switch (cmd) {
1105 case DIOCGSECTORSIZE:
1106 *(uint32_t *)data = DEV_BSIZE;
1107 break;
1108 case DIOCGMEDIASIZE:
1109 *(off_t *)data = zv->zv_volsize;
1110 break;
1111 case DIOCGFLUSH:
1112 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1113 if (zv->zv_zilog != NULL)
1114 error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
1115 rw_exit(&zv->zv_suspend_lock);
1116 break;
1117 case DIOCGDELETE:
1118 if (!zvol_unmap_enabled)
1119 break;
1120
1121 offset = ((off_t *)data)[0];
1122 length = ((off_t *)data)[1];
1123 if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
1124 offset < 0 || offset >= zv->zv_volsize ||
1125 length <= 0) {
1126 printf("%s: offset=%jd length=%jd\n", __func__, offset,
1127 length);
1128 error = SET_ERROR(EINVAL);
1129 break;
1130 }
1131 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1132 zvol_ensure_zilog(zv);
1133 lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
1134 RL_WRITER);
1135 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1136 error = dmu_tx_assign(tx, DMU_TX_WAIT);
1137 if (error != 0) {
1138 sync = FALSE;
1139 dmu_tx_abort(tx);
1140 } else {
1141 sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1142 zvol_log_truncate(zv, tx, offset, length);
1143 dmu_tx_commit(tx);
1144 error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1145 offset, length);
1146 }
1147 zfs_rangelock_exit(lr);
1148 if (sync)
1149 error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
1150 rw_exit(&zv->zv_suspend_lock);
1151 break;
1152 case DIOCGSTRIPESIZE:
1153 *(off_t *)data = zv->zv_volblocksize;
1154 break;
1155 case DIOCGSTRIPEOFFSET:
1156 *(off_t *)data = 0;
1157 break;
1158 case DIOCGATTR: {
1159 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1160 spa_t *spa = dmu_objset_spa(zv->zv_objset);
1161 struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
1162 uint64_t refd, avail, usedobjs, availobjs;
1163
1164 if (strcmp(arg->name, "GEOM::candelete") == 0)
1165 arg->value.i = 1;
1166 else if (strcmp(arg->name, "blocksavail") == 0) {
1167 dmu_objset_space(zv->zv_objset, &refd, &avail,
1168 &usedobjs, &availobjs);
1169 arg->value.off = avail / DEV_BSIZE;
1170 } else if (strcmp(arg->name, "blocksused") == 0) {
1171 dmu_objset_space(zv->zv_objset, &refd, &avail,
1172 &usedobjs, &availobjs);
1173 arg->value.off = refd / DEV_BSIZE;
1174 } else if (strcmp(arg->name, "poolblocksavail") == 0) {
1175 avail = metaslab_class_get_space(spa_normal_class(spa));
1176 avail -= metaslab_class_get_alloc(
1177 spa_normal_class(spa));
1178 arg->value.off = avail / DEV_BSIZE;
1179 } else if (strcmp(arg->name, "poolblocksused") == 0) {
1180 refd = metaslab_class_get_alloc(spa_normal_class(spa));
1181 arg->value.off = refd / DEV_BSIZE;
1182 } else
1183 error = SET_ERROR(ENOIOCTL);
1184 rw_exit(&zv->zv_suspend_lock);
1185 break;
1186 }
1187 case FIOSEEKHOLE:
1188 case FIOSEEKDATA: {
1189 off_t *off = (off_t *)data;
1190 uint64_t noff;
1191 boolean_t hole;
1192
1193 hole = (cmd == FIOSEEKHOLE);
1194 noff = *off;
1195 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1196 lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
1197 RL_READER);
1198 error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
1199 zfs_rangelock_exit(lr);
1200 rw_exit(&zv->zv_suspend_lock);
1201 *off = noff;
1202 break;
1203 }
1204 default:
1205 error = SET_ERROR(ENOIOCTL);
1206 }
1207
1208 return (error);
1209 }
1210
1211 /*
1212 * Misc. helpers
1213 */
1214
1215 static void
zvol_ensure_zilog(zvol_state_t *zv)
1217 {
1218 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1219
1220 /*
1221 * Open a ZIL if this is the first time we have written to this
1222 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
1223 * than zv_state_lock so that we don't need to acquire an
1224 * additional lock in this path.
1225 */
1226 if (zv->zv_zilog == NULL) {
1227 if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
1228 rw_exit(&zv->zv_suspend_lock);
1229 rw_enter(&zv->zv_suspend_lock, RW_WRITER);
1230 }
1231 if (zv->zv_zilog == NULL) {
1232 zv->zv_zilog = zil_open(zv->zv_objset,
1233 zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1234 zv->zv_flags |= ZVOL_WRITTEN_TO;
1235 /* replay / destroy done in zvol_os_create_minor() */
1236 VERIFY0(zv->zv_zilog->zl_header->zh_flags &
1237 ZIL_REPLAY_NEEDED);
1238 }
1239 rw_downgrade(&zv->zv_suspend_lock);
1240 }
1241 }
1242
1243 boolean_t
zvol_os_is_zvol(const char *device)
1245 {
1246 return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
1247 }
1248
1249 int
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
1251 {
1252 int error = 0;
1253
1254 ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1255 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1256
1257 /* Move to a new hashtable entry. */
1258 zv->zv_hash = zvol_name_hash(newname);
1259 hlist_del(&zv->zv_hlink);
1260 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1261
1262 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1263 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1264 struct g_provider *pp = zsg->zsg_provider;
1265 struct g_geom *gp;
1266
1267 g_topology_lock();
1268 gp = pp->geom;
1269 ASSERT3P(gp, !=, NULL);
1270
1271 zsg->zsg_provider = NULL;
1272 g_wither_provider(pp, ENXIO);
1273
1274 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
1275 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1276 pp->sectorsize = DEV_BSIZE;
1277 pp->mediasize = zv->zv_volsize;
1278 pp->private = zv;
1279 zsg->zsg_provider = pp;
1280 g_error_provider(pp, 0);
1281 g_topology_unlock();
1282 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1283 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1284 struct cdev *dev;
1285 struct make_dev_args args;
1286
1287 dev = zsd->zsd_cdev;
1288 if (dev != NULL) {
1289 destroy_dev(dev);
1290 dev = zsd->zsd_cdev = NULL;
1291 if (zv->zv_open_count > 0) {
1292 zv->zv_flags &= ~ZVOL_EXCL;
1293 zv->zv_open_count = 0;
1294 /* XXX need suspend lock but lock order */
1295 zvol_last_close(zv);
1296 }
1297 }
1298
1299 make_dev_args_init(&args);
1300 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1301 args.mda_devsw = &zvol_cdevsw;
1302 args.mda_cr = NULL;
1303 args.mda_uid = UID_ROOT;
1304 args.mda_gid = GID_OPERATOR;
1305 args.mda_mode = 0640;
1306 args.mda_si_drv2 = zv;
1307 error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname);
1308 if (error == 0) {
1309 dev->si_iosize_max = maxphys;
1310 zsd->zsd_cdev = dev;
1311 }
1312 }
1313 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1314 dataset_kstats_rename(&zv->zv_kstat, newname);
1315
1316 return (error);
1317 }
1318
1319 /*
 * Allocate memory for a new zvol_state_t and set up the GEOM provider or
 * character device that will expose it, according to the volume's volmode.
1322 */
1323 static int
zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
    zvol_state_t **zvp)
1326 {
1327 zvol_state_t *zv;
1328 uint64_t volmode;
1329 int error;
1330
1331 error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE),
1332 &volmode, NULL);
1333 if (error)
1334 return (error);
1335
1336 if (volmode == ZFS_VOLMODE_DEFAULT)
1337 volmode = zvol_volmode;
1338
1339 if (volmode == ZFS_VOLMODE_NONE)
1340 return (0);
1341
1342 zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
1343 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
1344 cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
1345 zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
1346 zv->zv_volmode = volmode;
1347 zv->zv_volsize = volsize;
1348 zv->zv_volblocksize = volblocksize;
1349 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1350 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1351 struct g_provider *pp;
1352 struct g_geom *gp;
1353
1354 g_topology_lock();
1355 gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
1356 gp->start = zvol_geom_bio_start;
1357 gp->access = zvol_geom_access;
1358 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
1359 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1360 pp->sectorsize = DEV_BSIZE;
1361 pp->mediasize = 0;
1362 pp->private = zv;
1363
1364 zsg->zsg_provider = pp;
1365 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1366 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1367 struct cdev *dev;
1368 struct make_dev_args args;
1369
1370 make_dev_args_init(&args);
1371 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1372 args.mda_devsw = &zvol_cdevsw;
1373 args.mda_cr = NULL;
1374 args.mda_uid = UID_ROOT;
1375 args.mda_gid = GID_OPERATOR;
1376 args.mda_mode = 0640;
1377 args.mda_si_drv2 = zv;
1378 error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
1379 if (error) {
1380 kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
1381 kmem_free(zv, sizeof (zvol_state_t));
1382 return (error);
1383 }
1384
1385 dev->si_iosize_max = maxphys;
1386 zsd->zsd_cdev = dev;
1387 knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock);
1388 }
1389 (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
1390 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
1391 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
1392
1393 *zvp = zv;
1394 return (error);
1395 }
1396
1397 /*
1398 * Remove minor node for the specified volume.
1399 */
1400 void
zvol_os_remove_minor(zvol_state_t *zv)
1402 {
1403 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1404 ASSERT0(zv->zv_open_count);
1405 ASSERT0(atomic_read(&zv->zv_suspend_ref));
1406 ASSERT(zv->zv_flags & ZVOL_REMOVING);
1407
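	/*
	 * Detach the OS-specific state and clear the device's back pointer
	 * before dropping zv_state_lock; the GEOM/devfs teardown below may
	 * sleep and so cannot run with the state lock held.
	 */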
1408 struct zvol_state_os *zso = zv->zv_zso;
1409 zv->zv_zso = NULL;
1410
1411 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1412 struct zvol_state_geom *zsg = &zso->zso_geom;
1413 struct g_provider *pp = zsg->zsg_provider;
1414 atomic_store_ptr(&pp->private, NULL);
1415 mutex_exit(&zv->zv_state_lock);
1416
1417 g_topology_lock();
1418 g_wither_geom(pp->geom, ENXIO);
1419 g_topology_unlock();
1420 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1421 struct zvol_state_dev *zsd = &zso->zso_dev;
1422 struct cdev *dev = zsd->zsd_cdev;
1423
1424 if (dev != NULL)
1425 atomic_store_ptr(&dev->si_drv2, NULL);
1426 mutex_exit(&zv->zv_state_lock);
1427
1428 if (dev != NULL) {
1429 destroy_dev(dev);
1430 knlist_clear(&zsd->zsd_selinfo.si_note, 0);
1431 knlist_destroy(&zsd->zsd_selinfo.si_note);
1432 }
1433 }
1434
1435 kmem_free(zso, sizeof (struct zvol_state_os));
1436
1437 mutex_enter(&zv->zv_state_lock);
1438 }
1439
1440 void
zvol_os_free(zvol_state_t *zv)
1442 {
1443 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1444 ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
1445 ASSERT0(zv->zv_open_count);
1446 ASSERT0P(zv->zv_zso);
1447
1448 ASSERT0P(zv->zv_objset);
1449 ASSERT0P(zv->zv_zilog);
1450 ASSERT0P(zv->zv_dn);
1451
1452 ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
1453
1454 rw_destroy(&zv->zv_suspend_lock);
1455 zfs_rangelock_fini(&zv->zv_rangelock);
1456
1457 mutex_destroy(&zv->zv_state_lock);
1458 cv_destroy(&zv->zv_removing_cv);
1459 dataset_kstats_destroy(&zv->zv_kstat);
1460 kmem_free(zv, sizeof (zvol_state_t));
1461 zvol_minors--;
1462 }
1463
1464 /*
1465 * Create a minor node (plus a whole lot more) for the specified volume.
1466 */
1467 int
zvol_os_create_minor(const char *name)
1469 {
1470 zvol_state_t *zv = NULL;
1471 objset_t *os;
1472 dmu_object_info_t *doi;
1473 uint64_t volsize;
1474 uint64_t hash, len;
1475 int error;
1476 bool replayed_zil = B_FALSE;
1477
1478 if (zvol_inhibit_dev)
1479 return (0);
1480
1481 ZFS_LOG(1, "Creating ZVOL %s...", name);
1482 hash = zvol_name_hash(name);
1483 if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
1484 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1485 mutex_exit(&zv->zv_state_lock);
1486 return (SET_ERROR(EEXIST));
1487 }
1488
1489 DROP_GIANT();
1490
1491 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1492
1493 /* Lie and say we're read-only. */
1494 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
1495 if (error)
1496 goto out_doi;
1497
1498 error = dmu_object_info(os, ZVOL_OBJ, doi);
1499 if (error)
1500 goto out_dmu_objset_disown;
1501
1502 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1503 if (error)
1504 goto out_dmu_objset_disown;
1505
1506 error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv);
1507 if (error || zv == NULL)
1508 goto out_dmu_objset_disown;
1509
1510 zv->zv_hash = hash;
1511
1512 if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
1513 zv->zv_flags |= ZVOL_RDONLY;
1514
1515 zv->zv_objset = os;
1516
1517 ASSERT0P(zv->zv_kstat.dk_kstats);
1518 error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1519 if (error)
1520 goto out_dmu_objset_disown;
1521 ASSERT0P(zv->zv_zilog);
1522 zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1523 if (spa_writeable(dmu_objset_spa(os))) {
1524 if (zil_replay_disable)
1525 replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
1526 else
1527 replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1528 }
1529 if (replayed_zil)
1530 zil_close(zv->zv_zilog);
1531 zv->zv_zilog = NULL;
1532
1533 len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
1534 if (len > 0) {
1535 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_ASYNC_READ);
1536 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
1537 ZIO_PRIORITY_ASYNC_READ);
1538 }
1539
1540 zv->zv_objset = NULL;
1541 out_dmu_objset_disown:
1542 dmu_objset_disown(os, B_TRUE, FTAG);
1543
1544 if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1545 g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0);
1546 /* geom was locked inside zvol_alloc() function */
1547 g_topology_unlock();
1548 }
1549 out_doi:
1550 kmem_free(doi, sizeof (dmu_object_info_t));
1551 if (error == 0 && zv) {
1552 rw_enter(&zvol_state_lock, RW_WRITER);
1553 zvol_insert(zv);
1554 zvol_minors++;
1555 rw_exit(&zvol_state_lock);
1556 ZFS_LOG(1, "ZVOL %s created.", name);
1557 }
1558 PICKUP_GIANT();
1559 return (error);
1560 }
1561
1562 int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
1564 {
1565 zv->zv_volsize = volsize;
1566 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1567 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1568 struct g_provider *pp = zsg->zsg_provider;
1569
1570 g_topology_lock();
1571
1572 if (pp->private == NULL) {
1573 g_topology_unlock();
1574 return (SET_ERROR(ENXIO));
1575 }
1576
1577 /*
 * Do not invoke a resize event when the initial size was zero.
 * The ZVOL initializes its size on first open, so this is not
 * a real resize.
1581 */
1582 if (pp->mediasize == 0)
1583 pp->mediasize = zv->zv_volsize;
1584 else
1585 g_resize_provider(pp, zv->zv_volsize);
1586
1587 g_topology_unlock();
1588 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1589 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1590
1591 KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
1592 }
1593 return (0);
1594 }
1595
1596 void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
1598 {
1599 /*
 * The ro/rw ZVOL mode is switched by zvol_set_ro(), which sets or clears
 * the ZVOL_RDONLY flag.  No additional FreeBSD-specific action is required
 * when the readonly zfs property changes.
1603 */
1604 }
1605
1606 void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
1608 {
1609 /*
 * The ZVOL size/capacity is changed by zvol_set_volsize().  This method is
 * left empty because all of the required work is done by the
 * platform-specific zvol_os_update_volsize().
1613 */
1614 }
1615
1616 /*
1617 * Public interfaces
1618 */
1619
1620 int
zvol_busy(void)
1622 {
1623 return (zvol_minors != 0);
1624 }
1625
1626 int
zvol_init(void)
1628 {
1629 return (zvol_init_impl());
1630 }
1631
1632 void
zvol_fini(void)
1634 {
1635 zvol_fini_impl();
1636 }
1637