xref: /freebsd/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c (revision 53a2e2635ab2d17bed1de7b4e0d782dd23ceb6ea)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  *
25  * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
26  * All rights reserved.
27  *
28  * Portions Copyright 2010 Robert Milkowski
29  *
30  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
31  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
32  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
33  * Copyright (c) 2014 Integros [integros.com]
34  * Copyright (c) 2024, 2025, Klara, Inc.
35  */
36 
37 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
38 
39 /*
40  * ZFS volume emulation driver.
41  *
42  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
43  * Volumes are accessed through the symbolic links named:
44  *
45  * /dev/zvol/<pool_name>/<dataset_name>
46  *
47  * Volumes are persistent through reboot.  No user command needs to be
48  * run before opening and using a device.
49  *
50  * On FreeBSD, zvols are GEOM providers like any other storage device in
51  * the system, except when volmode=dev, when they are plain character devices.
52  */
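/*
 * Illustrative example (hypothetical pool/dataset names): a volume created
 * with "zfs create -V 10G tank/vol" shows up as /dev/zvol/tank/vol and, with
 * the default volmode=geom, is a GEOM provider that can be partitioned,
 * newfs'd, or used as a swap device like any other disk.
 */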
53 
54 #include <sys/types.h>
55 #include <sys/param.h>
56 #include <sys/kernel.h>
57 #include <sys/errno.h>
58 #include <sys/uio.h>
59 #include <sys/bio.h>
60 #include <sys/buf.h>
61 #include <sys/kmem.h>
62 #include <sys/conf.h>
63 #include <sys/cmn_err.h>
64 #include <sys/stat.h>
65 #include <sys/proc.h>
66 #include <sys/zap.h>
67 #include <sys/spa.h>
68 #include <sys/spa_impl.h>
69 #include <sys/zio.h>
70 #include <sys/disk.h>
71 #include <sys/dmu_traverse.h>
72 #include <sys/dnode.h>
73 #include <sys/dsl_dataset.h>
74 #include <sys/dsl_prop.h>
75 #include <sys/dsl_dir.h>
76 #include <sys/byteorder.h>
77 #include <sys/sunddi.h>
78 #include <sys/dirent.h>
79 #include <sys/policy.h>
80 #include <sys/queue.h>
81 #include <sys/fs/zfs.h>
82 #include <sys/zfs_ioctl.h>
83 #include <sys/zil.h>
84 #include <sys/zfs_znode.h>
85 #include <sys/zfs_rlock.h>
86 #include <sys/vdev_impl.h>
87 #include <sys/vdev_raidz.h>
88 #include <sys/zvol.h>
89 #include <sys/zil_impl.h>
90 #include <sys/dataset_kstats.h>
91 #include <sys/dbuf.h>
92 #include <sys/dmu_tx.h>
93 #include <sys/zfeature.h>
94 #include <sys/zio_checksum.h>
95 #include <sys/zil_impl.h>
96 #include <sys/filio.h>
97 #include <sys/freebsd_event.h>
98 
99 #include <geom/geom.h>
100 #include <sys/zvol.h>
101 #include <sys/zvol_impl.h>
102 #include <cityhash.h>
103 
104 #include "zfs_namecheck.h"
105 
106 #define	ZVOL_DUMPSIZE		"dumpsize"
107 
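/*
 * With ZVOL_LOCK_DEBUG defined, nominal reader acquisitions of
 * zv_suspend_lock below are taken (and asserted) as writer locks, which is
 * intended to make locking mistakes easier to catch during development.
 */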
108 #ifdef ZVOL_LOCK_DEBUG
109 #define	ZVOL_RW_READER		RW_WRITER
110 #define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
111 #else
112 #define	ZVOL_RW_READER		RW_READER
113 #define	ZVOL_RW_READ_HELD	RW_READ_HELD
114 #endif
115 
116 struct zvol_state_os {
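	/*
	 * Convenience macros so the rest of the file can say zso_dev or
	 * zso_geom instead of spelling out the _zso_state union member.
	 */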
117 #define	zso_dev		_zso_state._zso_dev
118 #define	zso_geom	_zso_state._zso_geom
119 	union {
120 		/* volmode=dev */
121 		struct zvol_state_dev {
122 			struct cdev *zsd_cdev;
123 			struct selinfo zsd_selinfo;
124 		} _zso_dev;
125 
126 		/* volmode=geom */
127 		struct zvol_state_geom {
128 			struct g_provider *zsg_provider;
129 		} _zso_geom;
130 	} _zso_state;
131 	int zso_dying;
132 };
133 
134 static uint32_t zvol_minors;
135 
136 SYSCTL_DECL(_vfs_zfs);
137 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
138 
139 static boolean_t zpool_on_zvol = B_FALSE;
140 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
141 	"Allow zpools to use zvols as vdevs (DANGEROUS)");
142 
143 /*
144  * Toggle unmap functionality.
145  */
146 boolean_t zvol_unmap_enabled = B_TRUE;
147 
148 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
149 	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");
150 
151 /*
152  * Maximum zvol transfer size in one DMU transaction.
153  */
154 int zvol_maxphys = DMU_MAX_ACCESS / 2;
155 
156 static void zvol_ensure_zilog(zvol_state_t *zv);
157 
158 static d_open_t		zvol_cdev_open;
159 static d_close_t	zvol_cdev_close;
160 static d_ioctl_t	zvol_cdev_ioctl;
161 static d_read_t		zvol_cdev_read;
162 static d_write_t	zvol_cdev_write;
163 static d_strategy_t	zvol_cdev_bio_strategy;
164 static d_kqfilter_t	zvol_cdev_kqfilter;
165 
166 static struct cdevsw zvol_cdevsw = {
167 	.d_name =	"zvol",
168 	.d_version =	D_VERSION,
169 	.d_flags =	D_DISK | D_TRACKCLOSE,
170 	.d_open =	zvol_cdev_open,
171 	.d_close =	zvol_cdev_close,
172 	.d_ioctl =	zvol_cdev_ioctl,
173 	.d_read =	zvol_cdev_read,
174 	.d_write =	zvol_cdev_write,
175 	.d_strategy =	zvol_cdev_bio_strategy,
176 	.d_kqfilter =	zvol_cdev_kqfilter,
177 };
178 
179 static void		zvol_filter_detach(struct knote *kn);
180 static int		zvol_filter_vnode(struct knote *kn, long hint);
181 
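/*
 * kqueue support for volmode=dev zvols: userland can register an
 * EVFILT_VNODE filter with NOTE_ATTRIB to be notified when the volume is
 * resized (see zvol_cdev_kqfilter() and zvol_os_update_volsize()).
 */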
182 static struct filterops zvol_filterops_vnode = {
183 	.f_isfd = 1,
184 	.f_detach = zvol_filter_detach,
185 	.f_event = zvol_filter_vnode,
186 };
187 
188 extern uint_t zfs_geom_probe_vdev_key;
189 
190 struct g_class zfs_zvol_class = {
191 	.name = "ZFS::ZVOL",
192 	.version = G_VERSION,
193 };
194 
195 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
196 
197 static int zvol_geom_open(struct g_provider *pp, int flag, int count);
198 static int zvol_geom_close(struct g_provider *pp, int flag, int count);
199 static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
200 static void zvol_geom_bio_start(struct bio *bp);
201 static int zvol_geom_bio_getattr(struct bio *bp);
202 static void zvol_geom_bio_strategy(struct bio *bp, boolean_t sync);
203 
204 /*
205  * GEOM mode implementation
206  */
207 
208 static int
209 zvol_geom_open(struct g_provider *pp, int flag, int count)
210 {
211 	zvol_state_t *zv;
212 	int err = 0;
213 	boolean_t drop_suspend = B_FALSE;
214 
215 	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
216 		/*
217 		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
218 		 * attempting to probe geom providers while looking for a
219 		 * replacement for a missing VDEV.  In this case, the
220 		 * spa_namespace_lock will not be held, but it is still illegal
221 		 * to use a zvol as a vdev.  Deadlocks can result if another
222 		 * thread has spa_namespace_lock.
223 		 */
224 		return (SET_ERROR(EOPNOTSUPP));
225 	}
226 
227 retry:
228 	zv = atomic_load_ptr(&pp->private);
229 	if (zv == NULL)
230 		return (SET_ERROR(ENXIO));
231 
232 	mutex_enter(&zv->zv_state_lock);
233 	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
234 		err = SET_ERROR(ENXIO);
235 		goto out_locked;
236 	}
237 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
238 
239 	/*
240 	 * Make sure zvol is not suspended during first open
241 	 * (hold zv_suspend_lock) and respect proper lock acquisition
242 	 * ordering - zv_suspend_lock before zv_state_lock.
243 	 */
244 	if (zv->zv_open_count == 0) {
245 		drop_suspend = B_TRUE;
246 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
247 			mutex_exit(&zv->zv_state_lock);
248 
249 			/*
250 			 * Removal may happen while the locks are down, so
251 			 * we can't trust zv any longer; we have to start over.
252 			 */
253 			zv = atomic_load_ptr(&pp->private);
254 			if (zv == NULL)
255 				return (SET_ERROR(ENXIO));
256 
257 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
258 			mutex_enter(&zv->zv_state_lock);
259 
260 			if (zv->zv_zso->zso_dying ||
261 			    zv->zv_flags & ZVOL_REMOVING) {
262 				err = SET_ERROR(ENXIO);
263 				goto out_locked;
264 			}
265 
266 			/* Check to see if zv_suspend_lock is needed. */
267 			if (zv->zv_open_count != 0) {
268 				rw_exit(&zv->zv_suspend_lock);
269 				drop_suspend = B_FALSE;
270 			}
271 		}
272 	}
273 
274 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
275 
276 	if (zv->zv_open_count == 0) {
277 		boolean_t drop_namespace = B_FALSE;
278 
279 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
280 
281 		/*
282 		 * Take spa_namespace_lock to prevent lock inversion when
283 		 * zvols from one pool are opened as vdevs in another.
284 		 */
285 		if (!mutex_owned(&spa_namespace_lock)) {
286 			if (!mutex_tryenter(&spa_namespace_lock)) {
287 				mutex_exit(&zv->zv_state_lock);
288 				rw_exit(&zv->zv_suspend_lock);
289 				drop_suspend = B_FALSE;
290 				kern_yield(PRI_USER);
291 				goto retry;
292 			} else {
293 				drop_namespace = B_TRUE;
294 			}
295 		}
296 		err = zvol_first_open(zv, !(flag & FWRITE));
297 		if (drop_namespace)
298 			mutex_exit(&spa_namespace_lock);
299 		if (err)
300 			goto out_locked;
301 		pp->mediasize = zv->zv_volsize;
302 		pp->stripeoffset = 0;
303 		pp->stripesize = zv->zv_volblocksize;
304 	}
305 
306 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
307 
308 	/*
309 	 * Check for a bad on-disk format version now, since we lied
310 	 * earlier about owning the dataset read-only.
311 	 */
312 	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
313 	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
314 		err = SET_ERROR(EROFS);
315 		goto out_opened;
316 	}
317 	if (zv->zv_flags & ZVOL_EXCL) {
318 		err = SET_ERROR(EBUSY);
319 		goto out_opened;
320 	}
321 	if (flag & O_EXCL) {
322 		if (zv->zv_open_count != 0) {
323 			err = SET_ERROR(EBUSY);
324 			goto out_opened;
325 		}
326 		zv->zv_flags |= ZVOL_EXCL;
327 	}
328 
329 	zv->zv_open_count += count;
330 out_opened:
331 	if (zv->zv_open_count == 0) {
332 		zvol_last_close(zv);
333 		wakeup(zv);
334 	}
335 out_locked:
336 	mutex_exit(&zv->zv_state_lock);
337 	if (drop_suspend)
338 		rw_exit(&zv->zv_suspend_lock);
339 	return (err);
340 }
341 
342 static int
343 zvol_geom_close(struct g_provider *pp, int flag, int count)
344 {
345 	(void) flag;
346 	zvol_state_t *zv;
347 	boolean_t drop_suspend = B_TRUE;
348 	int new_open_count;
349 
350 	zv = atomic_load_ptr(&pp->private);
351 	if (zv == NULL)
352 		return (SET_ERROR(ENXIO));
353 
354 	mutex_enter(&zv->zv_state_lock);
355 	if (zv->zv_flags & ZVOL_EXCL) {
356 		ASSERT3U(zv->zv_open_count, ==, 1);
357 		zv->zv_flags &= ~ZVOL_EXCL;
358 	}
359 
360 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
361 
362 	/*
363 	 * If the open count is zero, this is a spurious close.
364 	 * That indicates a bug in the kernel / DDI framework.
365 	 */
366 	ASSERT3U(zv->zv_open_count, >, 0);
367 
368 	/*
369 	 * Make sure zvol is not suspended during last close
370 	 * (hold zv_suspend_lock) and respect proper lock acquisition
371 	 * ordering - zv_suspend_lock before zv_state_lock.
372 	 */
373 	new_open_count = zv->zv_open_count - count;
374 	if (new_open_count == 0) {
375 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
376 			mutex_exit(&zv->zv_state_lock);
377 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
378 			mutex_enter(&zv->zv_state_lock);
379 
380 			/*
381 			 * Unlike in zvol_geom_open(), we don't check if
382 			 * removal started here, because we might be one of the
383 			 * openers that needs to be thrown out! If we're the
384 			 * last, we need to call zvol_last_close() below to
385 			 * finish cleanup. So, no special treatment for us.
386 			 */
387 
388 			/* Check to see if zv_suspend_lock is needed. */
389 			new_open_count = zv->zv_open_count - count;
390 			if (new_open_count != 0) {
391 				rw_exit(&zv->zv_suspend_lock);
392 				drop_suspend = B_FALSE;
393 			}
394 		}
395 	} else {
396 		drop_suspend = B_FALSE;
397 	}
398 
399 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
400 
401 	/*
402 	 * You may get multiple opens, but only one close.
403 	 */
404 	zv->zv_open_count = new_open_count;
405 	if (zv->zv_open_count == 0) {
406 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
407 		zvol_last_close(zv);
408 		wakeup(zv);
409 	}
410 
411 	mutex_exit(&zv->zv_state_lock);
412 
413 	if (drop_suspend)
414 		rw_exit(&zv->zv_suspend_lock);
415 	return (0);
416 }
417 
418 void
419 zvol_wait_close(zvol_state_t *zv)
420 {
421 
422 	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
423 		return;
424 	mutex_enter(&zv->zv_state_lock);
425 	zv->zv_zso->zso_dying = B_TRUE;
426 
427 	if (zv->zv_open_count)
428 		msleep(zv, &zv->zv_state_lock,
429 		    PRIBIO, "zvol:dying", 10*hz);
430 	mutex_exit(&zv->zv_state_lock);
431 }
432 
433 
434 static int
435 zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
436 {
437 	int count, error, flags;
438 
439 	g_topology_assert();
440 
441 	/*
442 	 * To keep things simple we expect either an open or a close, but
443 	 * not both at the same time.
444 	 */
445 	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
446 	    (acr <= 0 && acw <= 0 && ace <= 0),
447 	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
448 	    pp->name, acr, acw, ace));
449 
450 	if (atomic_load_ptr(&pp->private) == NULL) {
451 		if (acr <= 0 && acw <= 0 && ace <= 0)
452 			return (0);
453 		return (pp->error);
454 	}
455 
456 	/*
457 	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
458 	 * ace != 0, because GEOM already handles that and handles it a bit
459 	 * differently. GEOM allows for multiple read/exclusive consumers and
460 	 * ZFS allows only one exclusive consumer, no matter if it is reader or
461 	 * writer. I prefer the way GEOM works, so I'll leave it to GEOM
462 	 * to decide what to do.
463 	 */
464 
465 	count = acr + acw + ace;
466 	if (count == 0)
467 		return (0);
468 
469 	flags = 0;
470 	if (acr != 0 || ace != 0)
471 		flags |= FREAD;
472 	if (acw != 0)
473 		flags |= FWRITE;
474 
475 	g_topology_unlock();
476 	if (count > 0)
477 		error = zvol_geom_open(pp, flags, count);
478 	else
479 		error = zvol_geom_close(pp, flags, -count);
480 	g_topology_lock();
481 	return (error);
482 }
483 
484 static void
485 zvol_geom_bio_start(struct bio *bp)
486 {
487 	zvol_state_t *zv = bp->bio_to->private;
488 
489 	if (zv == NULL) {
490 		g_io_deliver(bp, ENXIO);
491 		return;
492 	}
493 	if (bp->bio_cmd == BIO_GETATTR) {
494 		if (zvol_geom_bio_getattr(bp))
495 			g_io_deliver(bp, EOPNOTSUPP);
496 		return;
497 	}
498 
499 	zvol_geom_bio_strategy(bp, !g_is_geom_thread(curthread) &&
500 	    THREAD_CAN_SLEEP());
501 }
502 
503 static int
504 zvol_geom_bio_getattr(struct bio *bp)
505 {
506 	zvol_state_t *zv;
507 
508 	zv = bp->bio_to->private;
509 	ASSERT3P(zv, !=, NULL);
510 
511 	spa_t *spa = dmu_objset_spa(zv->zv_objset);
512 	uint64_t refd, avail, usedobjs, availobjs;
513 
514 	if (g_handleattr_int(bp, "GEOM::candelete", 1))
515 		return (0);
516 	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
517 		dmu_objset_space(zv->zv_objset, &refd, &avail,
518 		    &usedobjs, &availobjs);
519 		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
520 			return (0);
521 	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
522 		dmu_objset_space(zv->zv_objset, &refd, &avail,
523 		    &usedobjs, &availobjs);
524 		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
525 			return (0);
526 	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
527 		avail = metaslab_class_get_space(spa_normal_class(spa));
528 		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
529 		if (g_handleattr_off_t(bp, "poolblocksavail",
530 		    avail / DEV_BSIZE))
531 			return (0);
532 	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
533 		refd = metaslab_class_get_alloc(spa_normal_class(spa));
534 		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
535 			return (0);
536 	}
537 	return (1);
538 }
539 
540 static void
541 zvol_filter_detach(struct knote *kn)
542 {
543 	zvol_state_t *zv;
544 	struct zvol_state_dev *zsd;
545 
546 	zv = kn->kn_hook;
547 	zsd = &zv->zv_zso->zso_dev;
548 
549 	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
550 }
551 
552 static int
553 zvol_filter_vnode(struct knote *kn, long hint)
554 {
555 	kn->kn_fflags |= kn->kn_sfflags & hint;
556 
557 	return (kn->kn_fflags != 0);
558 }
559 
560 static int
561 zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
562 {
563 	zvol_state_t *zv;
564 	struct zvol_state_dev *zsd;
565 
566 	zv = dev->si_drv2;
567 	zsd = &zv->zv_zso->zso_dev;
568 
569 	if (kn->kn_filter != EVFILT_VNODE)
570 		return (EINVAL);
571 
572 	/* XXX: extend support for other NOTE_* events */
573 	if (kn->kn_sfflags != NOTE_ATTRIB)
574 		return (EINVAL);
575 
576 	kn->kn_fop = &zvol_filterops_vnode;
577 	kn->kn_hook = zv;
578 	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);
579 
580 	return (0);
581 }
582 
583 static void
584 zvol_strategy_impl(zv_request_t *zvr)
585 {
586 	zvol_state_t *zv;
587 	struct bio *bp;
588 	uint64_t off, volsize;
589 	size_t resid;
590 	char *addr;
591 	objset_t *os;
592 	zfs_locked_range_t *lr;
593 	int error = 0;
594 	boolean_t doread = B_FALSE;
595 	boolean_t is_dumpified;
596 	boolean_t commit;
597 
598 	bp = zvr->bio;
599 	zv = zvr->zv;
600 	if (zv == NULL) {
601 		error = SET_ERROR(ENXIO);
602 		goto out;
603 	}
604 
605 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
606 
607 	if (zv->zv_flags & ZVOL_REMOVING) {
608 		error = SET_ERROR(ENXIO);
609 		goto resume;
610 	}
611 
612 	switch (bp->bio_cmd) {
613 	case BIO_READ:
614 		doread = B_TRUE;
615 		break;
616 	case BIO_WRITE:
617 	case BIO_FLUSH:
618 	case BIO_DELETE:
619 		if (zv->zv_flags & ZVOL_RDONLY) {
620 			error = SET_ERROR(EROFS);
621 			goto resume;
622 		}
623 		zvol_ensure_zilog(zv);
624 		if (bp->bio_cmd == BIO_FLUSH)
625 			goto commit;
626 		break;
627 	default:
628 		error = SET_ERROR(EOPNOTSUPP);
629 		goto resume;
630 	}
631 
632 	off = bp->bio_offset;
633 	volsize = zv->zv_volsize;
634 
635 	os = zv->zv_objset;
636 	ASSERT3P(os, !=, NULL);
637 
638 	addr = bp->bio_data;
639 	resid = bp->bio_length;
640 
641 	if (resid > 0 && off >= volsize) {
642 		error = SET_ERROR(EIO);
643 		goto resume;
644 	}
645 
646 	is_dumpified = B_FALSE;
647 	commit = !doread && !is_dumpified &&
648 	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
649 
650 	/*
651 	 * There must be no buffer changes when doing a dmu_sync() because
652 	 * we can't change the data whilst calculating the checksum.
653 	 */
654 	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
655 	    doread ? RL_READER : RL_WRITER);
656 
657 	if (bp->bio_cmd == BIO_DELETE) {
658 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
659 		error = dmu_tx_assign(tx, DMU_TX_WAIT);
660 		if (error != 0) {
661 			dmu_tx_abort(tx);
662 		} else {
663 			zvol_log_truncate(zv, tx, off, resid);
664 			dmu_tx_commit(tx);
665 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
666 			    off, resid);
667 			resid = 0;
668 		}
669 		goto unlock;
670 	}
671 	while (resid != 0 && off < volsize) {
672 		size_t size = MIN(resid, zvol_maxphys);
673 		if (doread) {
674 			error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
675 			    DMU_READ_PREFETCH);
676 		} else {
677 			dmu_tx_t *tx = dmu_tx_create(os);
678 			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
679 			error = dmu_tx_assign(tx, DMU_TX_WAIT);
680 			if (error) {
681 				dmu_tx_abort(tx);
682 			} else {
683 				dmu_write_by_dnode(zv->zv_dn, off, size, addr,
684 				    tx, DMU_READ_PREFETCH);
685 				zvol_log_write(zv, tx, off, size, commit);
686 				dmu_tx_commit(tx);
687 			}
688 		}
689 		if (error) {
690 			/* Convert checksum errors into IO errors. */
691 			if (error == ECKSUM)
692 				error = SET_ERROR(EIO);
693 			break;
694 		}
695 		off += size;
696 		addr += size;
697 		resid -= size;
698 	}
699 unlock:
700 	zfs_rangelock_exit(lr);
701 
702 	bp->bio_completed = bp->bio_length - resid;
703 	if (bp->bio_completed < bp->bio_length && off > volsize)
704 		error = SET_ERROR(EINVAL);
705 
706 	switch (bp->bio_cmd) {
707 	case BIO_FLUSH:
708 		break;
709 	case BIO_READ:
710 		dataset_kstats_update_read_kstats(&zv->zv_kstat,
711 		    bp->bio_completed);
712 		break;
713 	case BIO_WRITE:
714 		dataset_kstats_update_write_kstats(&zv->zv_kstat,
715 		    bp->bio_completed);
716 		break;
717 	case BIO_DELETE:
718 		break;
719 	default:
720 		break;
721 	}
722 
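	/*
	 * BIO_FLUSH requests jump straight to the commit: label below via the
	 * goto in the command switch above; all other commands only commit
	 * the ZIL when the I/O succeeded and a synchronous commit is needed.
	 */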
723 	if (error == 0 && commit) {
724 commit:
725 		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
726 	}
727 resume:
728 	rw_exit(&zv->zv_suspend_lock);
729 out:
730 	if (bp->bio_to)
731 		g_io_deliver(bp, error);
732 	else
733 		biofinish(bp, NULL, error);
734 }
735 
736 static void
737 zvol_strategy_task(void *arg)
738 {
739 	zv_request_task_t *task = arg;
740 
741 	zvol_strategy_impl(&task->zvr);
742 	zv_request_task_free(task);
743 }
744 
745 static void
746 zvol_geom_bio_strategy(struct bio *bp, boolean_t sync)
747 {
748 	zv_taskq_t *ztqs = &zvol_taskqs;
749 	zv_request_task_t *task;
750 	zvol_state_t *zv;
751 	uint_t tq_idx;
752 	uint_t taskq_hash;
753 	int error;
754 
755 	if (bp->bio_to)
756 		zv = bp->bio_to->private;
757 	else
758 		zv = bp->bio_dev->si_drv2;
759 
760 	if (zv == NULL) {
761 		error = SET_ERROR(ENXIO);
762 		if (bp->bio_to)
763 			g_io_deliver(bp, error);
764 		else
765 			biofinish(bp, NULL, error);
766 		return;
767 	}
768 
769 	zv_request_t zvr = {
770 		.zv = zv,
771 		.bio = bp,
772 	};
773 
774 	if (sync || zvol_request_sync) {
775 		zvol_strategy_impl(&zvr);
776 		return;
777 	}
778 
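	/*
	 * Spread asynchronous requests across the zvol taskqs: hashing the
	 * zvol pointer, the current CPU, and the (shifted) bio offset picks a
	 * queue, so nearby offsets on the same volume tend to share a taskq
	 * while other volumes and offsets spread across the rest.
	 */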
779 	taskq_hash = cityhash3((uintptr_t)zv, curcpu, bp->bio_offset >>
780 	    ZVOL_TASKQ_OFFSET_SHIFT);
781 	tq_idx = taskq_hash % ztqs->tqs_cnt;
782 	task = zv_request_task_create(zvr);
783 	taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_strategy_task, task,
784 	    0, &task->ent);
785 }
786 
787 static void
788 zvol_cdev_bio_strategy(struct bio *bp)
789 {
790 	zvol_geom_bio_strategy(bp, B_FALSE);
791 }
792 
793 /*
794  * Character device mode implementation
795  */
796 
797 static int
798 zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
799 {
800 	zvol_state_t *zv;
801 	uint64_t volsize;
802 	zfs_locked_range_t *lr;
803 	int error = 0;
804 	zfs_uio_t uio;
805 
806 	zfs_uio_init(&uio, uio_s);
807 
808 	zv = dev->si_drv2;
809 
810 	volsize = zv->zv_volsize;
811 	/*
812 	 * uio_loffset == volsize isn't an error as
813 	 * it's required for EOF processing.
814 	 */
815 	if (zfs_uio_resid(&uio) > 0 &&
816 	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
817 		return (SET_ERROR(EIO));
818 
819 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
820 	ssize_t start_resid = zfs_uio_resid(&uio);
821 	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
822 	    zfs_uio_resid(&uio), RL_READER);
823 	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
824 		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
825 
826 		/* Don't read past the end. */
827 		if (bytes > volsize - zfs_uio_offset(&uio))
828 			bytes = volsize - zfs_uio_offset(&uio);
829 
830 		error =  dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
831 		    DMU_READ_PREFETCH);
832 		if (error) {
833 			/* Convert checksum errors into IO errors. */
834 			if (error == ECKSUM)
835 				error = SET_ERROR(EIO);
836 			break;
837 		}
838 	}
839 	zfs_rangelock_exit(lr);
840 	int64_t nread = start_resid - zfs_uio_resid(&uio);
841 	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
842 	rw_exit(&zv->zv_suspend_lock);
843 
844 	return (error);
845 }
846 
847 static int
848 zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
849 {
850 	zvol_state_t *zv;
851 	uint64_t volsize;
852 	zfs_locked_range_t *lr;
853 	int error = 0;
854 	boolean_t commit;
855 	zfs_uio_t uio;
856 
857 	zv = dev->si_drv2;
858 
859 	volsize = zv->zv_volsize;
860 
861 	zfs_uio_init(&uio, uio_s);
862 
863 	if (zfs_uio_resid(&uio) > 0 &&
864 	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
865 		return (SET_ERROR(EIO));
866 
867 	ssize_t start_resid = zfs_uio_resid(&uio);
868 	commit = (ioflag & IO_SYNC) ||
869 	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
870 
871 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
872 	zvol_ensure_zilog(zv);
873 
874 	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
875 	    zfs_uio_resid(&uio), RL_WRITER);
876 	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
877 		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
878 		uint64_t off = zfs_uio_offset(&uio);
879 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
880 
881 		if (bytes > volsize - off)	/* Don't write past the end. */
882 			bytes = volsize - off;
883 
884 		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
885 		error = dmu_tx_assign(tx, DMU_TX_WAIT);
886 		if (error) {
887 			dmu_tx_abort(tx);
888 			break;
889 		}
890 		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
891 		    DMU_READ_PREFETCH);
892 		if (error == 0)
893 			zvol_log_write(zv, tx, off, bytes, commit);
894 		dmu_tx_commit(tx);
895 
896 		if (error)
897 			break;
898 	}
899 	zfs_rangelock_exit(lr);
900 	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
901 	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
902 	if (error == 0 && commit)
903 		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
904 	rw_exit(&zv->zv_suspend_lock);
905 
906 	return (error);
907 }
908 
909 static int
910 zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
911 {
912 	zvol_state_t *zv;
913 	int err = 0;
914 	boolean_t drop_suspend = B_FALSE;
915 
916 retry:
917 	zv = atomic_load_ptr(&dev->si_drv2);
918 	if (zv == NULL)
919 		return (SET_ERROR(ENXIO));
920 
921 	mutex_enter(&zv->zv_state_lock);
922 	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
923 		err = SET_ERROR(ENXIO);
924 		goto out_locked;
925 	}
926 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
927 
928 	/*
929 	 * Make sure zvol is not suspended during first open
930 	 * (hold zv_suspend_lock) and respect proper lock acquisition
931 	 * ordering - zv_suspend_lock before zv_state_lock.
932 	 */
933 	if (zv->zv_open_count == 0) {
934 		drop_suspend = B_TRUE;
935 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
936 			mutex_exit(&zv->zv_state_lock);
937 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
938 			mutex_enter(&zv->zv_state_lock);
939 
940 			if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
941 				/* Removal started while locks were down. */
942 				err = SET_ERROR(ENXIO);
943 				goto out_locked;
944 			}
945 
946 			/* Check to see if zv_suspend_lock is needed. */
947 			if (zv->zv_open_count != 0) {
948 				rw_exit(&zv->zv_suspend_lock);
949 				drop_suspend = B_FALSE;
950 			}
951 		}
952 	}
953 
954 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
955 
956 	if (zv->zv_open_count == 0) {
957 		boolean_t drop_namespace = B_FALSE;
958 
959 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
960 
961 		/*
962 		 * Take spa_namespace_lock to prevent lock inversion when
963 		 * zvols from one pool are opened as vdevs in another.
964 		 */
965 		if (!mutex_owned(&spa_namespace_lock)) {
966 			if (!mutex_tryenter(&spa_namespace_lock)) {
967 				mutex_exit(&zv->zv_state_lock);
968 				rw_exit(&zv->zv_suspend_lock);
969 				drop_suspend = B_FALSE;
970 				kern_yield(PRI_USER);
971 				goto retry;
972 			} else {
973 				drop_namespace = B_TRUE;
974 			}
975 		}
976 		err = zvol_first_open(zv, !(flags & FWRITE));
977 		if (drop_namespace)
978 			mutex_exit(&spa_namespace_lock);
979 		if (err)
980 			goto out_locked;
981 	}
982 
983 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
984 
985 	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
986 		err = SET_ERROR(EROFS);
987 		goto out_opened;
988 	}
989 	if (zv->zv_flags & ZVOL_EXCL) {
990 		err = SET_ERROR(EBUSY);
991 		goto out_opened;
992 	}
993 	if (flags & O_EXCL) {
994 		if (zv->zv_open_count != 0) {
995 			err = SET_ERROR(EBUSY);
996 			goto out_opened;
997 		}
998 		zv->zv_flags |= ZVOL_EXCL;
999 	}
1000 
1001 	zv->zv_open_count++;
1002 out_opened:
1003 	if (zv->zv_open_count == 0) {
1004 		zvol_last_close(zv);
1005 		wakeup(zv);
1006 	}
1007 out_locked:
1008 	mutex_exit(&zv->zv_state_lock);
1009 	if (drop_suspend)
1010 		rw_exit(&zv->zv_suspend_lock);
1011 	return (err);
1012 }
1013 
1014 static int
1015 zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
1016 {
1017 	zvol_state_t *zv;
1018 	boolean_t drop_suspend = B_TRUE;
1019 
1020 	zv = atomic_load_ptr(&dev->si_drv2);
1021 	if (zv == NULL)
1022 		return (SET_ERROR(ENXIO));
1023 
1024 	mutex_enter(&zv->zv_state_lock);
1025 	if (zv->zv_flags & ZVOL_EXCL) {
1026 		ASSERT3U(zv->zv_open_count, ==, 1);
1027 		zv->zv_flags &= ~ZVOL_EXCL;
1028 	}
1029 
1030 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
1031 
1032 	/*
1033 	 * If the open count is zero, this is a spurious close.
1034 	 * That indicates a bug in the kernel / DDI framework.
1035 	 */
1036 	ASSERT3U(zv->zv_open_count, >, 0);
1037 	/*
1038 	 * Make sure zvol is not suspended during last close
1039 	 * (hold zv_suspend_lock) and respect proper lock acquisition
1040 	 * ordering - zv_suspend_lock before zv_state_lock.
1041 	 */
1042 	if (zv->zv_open_count == 1) {
1043 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
1044 			mutex_exit(&zv->zv_state_lock);
1045 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1046 			mutex_enter(&zv->zv_state_lock);
1047 
1048 			/*
1049 			 * Unlike in zvol_cdev_open(), we don't check if
1050 			 * removal started here, because we might be one of the
1051 			 * openers that needs to be thrown out! If we're the
1052 			 * last, we need to call zvol_last_close() below to
1053 			 * finish cleanup. So, no special treatment for us.
1054 			 */
1055 
1056 			/* Check to see if zv_suspend_lock is needed. */
1057 			if (zv->zv_open_count != 1) {
1058 				rw_exit(&zv->zv_suspend_lock);
1059 				drop_suspend = B_FALSE;
1060 			}
1061 		}
1062 	} else {
1063 		drop_suspend = B_FALSE;
1064 	}
1065 
1066 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1067 
1068 	/*
1069 	 * You may get multiple opens, but only one close.
1070 	 */
1071 	zv->zv_open_count--;
1072 
1073 	if (zv->zv_open_count == 0) {
1074 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1075 		zvol_last_close(zv);
1076 		wakeup(zv);
1077 	}
1078 
1079 	mutex_exit(&zv->zv_state_lock);
1080 
1081 	if (drop_suspend)
1082 		rw_exit(&zv->zv_suspend_lock);
1083 	return (0);
1084 }
1085 
1086 static int
1087 zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
1088     int fflag, struct thread *td)
1089 {
1090 	zvol_state_t *zv;
1091 	zfs_locked_range_t *lr;
1092 	off_t offset, length;
1093 	int error;
1094 	boolean_t sync;
1095 
1096 	zv = atomic_load_ptr(&dev->si_drv2);
1097 	ASSERT3P(zv, !=, NULL);
1098 
1099 	error = 0;
1100 	KASSERT(zv->zv_open_count > 0,
1101 	    ("Device with zero access count in %s", __func__));
1102 
1103 	switch (cmd) {
1104 	case DIOCGSECTORSIZE:
1105 		*(uint32_t *)data = DEV_BSIZE;
1106 		break;
1107 	case DIOCGMEDIASIZE:
1108 		*(off_t *)data = zv->zv_volsize;
1109 		break;
1110 	case DIOCGFLUSH:
1111 		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1112 		if (zv->zv_zilog != NULL)
1113 			error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
1114 		rw_exit(&zv->zv_suspend_lock);
1115 		break;
1116 	case DIOCGDELETE:
1117 		if (!zvol_unmap_enabled)
1118 			break;
1119 
1120 		offset = ((off_t *)data)[0];
1121 		length = ((off_t *)data)[1];
1122 		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
1123 		    offset < 0 || offset >= zv->zv_volsize ||
1124 		    length <= 0) {
1125 			printf("%s: offset=%jd length=%jd\n", __func__, offset,
1126 			    length);
1127 			error = SET_ERROR(EINVAL);
1128 			break;
1129 		}
1130 		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1131 		zvol_ensure_zilog(zv);
1132 		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
1133 		    RL_WRITER);
1134 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1135 		error = dmu_tx_assign(tx, DMU_TX_WAIT);
1136 		if (error != 0) {
1137 			sync = FALSE;
1138 			dmu_tx_abort(tx);
1139 		} else {
1140 			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1141 			zvol_log_truncate(zv, tx, offset, length);
1142 			dmu_tx_commit(tx);
1143 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1144 			    offset, length);
1145 		}
1146 		zfs_rangelock_exit(lr);
1147 		if (sync)
1148 			error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
1149 		rw_exit(&zv->zv_suspend_lock);
1150 		break;
1151 	case DIOCGSTRIPESIZE:
1152 		*(off_t *)data = zv->zv_volblocksize;
1153 		break;
1154 	case DIOCGSTRIPEOFFSET:
1155 		*(off_t *)data = 0;
1156 		break;
1157 	case DIOCGATTR: {
1158 		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1159 		spa_t *spa = dmu_objset_spa(zv->zv_objset);
1160 		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
1161 		uint64_t refd, avail, usedobjs, availobjs;
1162 
1163 		if (strcmp(arg->name, "GEOM::candelete") == 0)
1164 			arg->value.i = 1;
1165 		else if (strcmp(arg->name, "blocksavail") == 0) {
1166 			dmu_objset_space(zv->zv_objset, &refd, &avail,
1167 			    &usedobjs, &availobjs);
1168 			arg->value.off = avail / DEV_BSIZE;
1169 		} else if (strcmp(arg->name, "blocksused") == 0) {
1170 			dmu_objset_space(zv->zv_objset, &refd, &avail,
1171 			    &usedobjs, &availobjs);
1172 			arg->value.off = refd / DEV_BSIZE;
1173 		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
1174 			avail = metaslab_class_get_space(spa_normal_class(spa));
1175 			avail -= metaslab_class_get_alloc(
1176 			    spa_normal_class(spa));
1177 			arg->value.off = avail / DEV_BSIZE;
1178 		} else if (strcmp(arg->name, "poolblocksused") == 0) {
1179 			refd = metaslab_class_get_alloc(spa_normal_class(spa));
1180 			arg->value.off = refd / DEV_BSIZE;
1181 		} else
1182 			error = SET_ERROR(ENOIOCTL);
1183 		rw_exit(&zv->zv_suspend_lock);
1184 		break;
1185 	}
1186 	case FIOSEEKHOLE:
1187 	case FIOSEEKDATA: {
1188 		off_t *off = (off_t *)data;
1189 		uint64_t noff;
1190 		boolean_t hole;
1191 
1192 		hole = (cmd == FIOSEEKHOLE);
1193 		noff = *off;
1194 		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1195 		lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
1196 		    RL_READER);
1197 		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
1198 		zfs_rangelock_exit(lr);
1199 		rw_exit(&zv->zv_suspend_lock);
1200 		*off = noff;
1201 		break;
1202 	}
1203 	default:
1204 		error = SET_ERROR(ENOIOCTL);
1205 	}
1206 
1207 	return (error);
1208 }
1209 
1210 /*
1211  * Misc. helpers
1212  */
1213 
1214 static void
1215 zvol_ensure_zilog(zvol_state_t *zv)
1216 {
1217 	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1218 
1219 	/*
1220 	 * Open a ZIL if this is the first time we have written to this
1221 	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
1222 	 * than zv_state_lock so that we don't need to acquire an
1223 	 * additional lock in this path.
1224 	 */
1225 	if (zv->zv_zilog == NULL) {
1226 		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
1227 			rw_exit(&zv->zv_suspend_lock);
1228 			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
1229 		}
1230 		if (zv->zv_zilog == NULL) {
1231 			zv->zv_zilog = zil_open(zv->zv_objset,
1232 			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1233 			zv->zv_flags |= ZVOL_WRITTEN_TO;
1234 			/* replay / destroy done in zvol_os_create_minor() */
1235 			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
1236 			    ZIL_REPLAY_NEEDED);
1237 		}
1238 		rw_downgrade(&zv->zv_suspend_lock);
1239 	}
1240 }
1241 
1242 boolean_t
1243 zvol_os_is_zvol(const char *device)
1244 {
1245 	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
1246 }
1247 
1248 int
1249 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
1250 {
1251 	int error = 0;
1252 
1253 	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1254 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1255 
1256 	/* Move to a new hashtable entry.  */
1257 	zv->zv_hash = zvol_name_hash(newname);
1258 	hlist_del(&zv->zv_hlink);
1259 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1260 
1261 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1262 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1263 		struct g_provider *pp = zsg->zsg_provider;
1264 		struct g_geom *gp;
1265 
1266 		g_topology_lock();
1267 		gp = pp->geom;
1268 		ASSERT3P(gp, !=, NULL);
1269 
1270 		zsg->zsg_provider = NULL;
1271 		g_wither_provider(pp, ENXIO);
1272 
1273 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
1274 		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1275 		pp->sectorsize = DEV_BSIZE;
1276 		pp->mediasize = zv->zv_volsize;
1277 		pp->private = zv;
1278 		zsg->zsg_provider = pp;
1279 		g_error_provider(pp, 0);
1280 		g_topology_unlock();
1281 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1282 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1283 		struct cdev *dev;
1284 		struct make_dev_args args;
1285 
1286 		dev = zsd->zsd_cdev;
1287 		if (dev != NULL) {
1288 			destroy_dev(dev);
1289 			dev = zsd->zsd_cdev = NULL;
1290 			if (zv->zv_open_count > 0) {
1291 				zv->zv_flags &= ~ZVOL_EXCL;
1292 				zv->zv_open_count = 0;
1293 				/* XXX  need suspend lock but lock order */
1294 				zvol_last_close(zv);
1295 			}
1296 		}
1297 
1298 		make_dev_args_init(&args);
1299 		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1300 		args.mda_devsw = &zvol_cdevsw;
1301 		args.mda_cr = NULL;
1302 		args.mda_uid = UID_ROOT;
1303 		args.mda_gid = GID_OPERATOR;
1304 		args.mda_mode = 0640;
1305 		args.mda_si_drv2 = zv;
1306 		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname);
1307 		if (error == 0) {
1308 			dev->si_iosize_max = maxphys;
1309 			zsd->zsd_cdev = dev;
1310 		}
1311 	}
1312 	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1313 	dataset_kstats_rename(&zv->zv_kstat, newname);
1314 
1315 	return (error);
1316 }
1317 
1318 /*
1319  * Allocate memory for a new zvol_state_t and set up the GEOM provider
1320  * or character device for the volume, depending on its volmode.
1321  */
1322 static int
1323 zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
1324     zvol_state_t **zvp)
1325 {
1326 	zvol_state_t *zv;
1327 	uint64_t volmode;
1328 	int error;
1329 
1330 	error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE),
1331 	    &volmode, NULL);
1332 	if (error)
1333 		return (error);
1334 
1335 	if (volmode == ZFS_VOLMODE_DEFAULT)
1336 		volmode = zvol_volmode;
1337 
1338 	if (volmode == ZFS_VOLMODE_NONE)
1339 		return (0);
1340 
1341 	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
1342 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
1343 	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
1344 	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
1345 	zv->zv_volmode = volmode;
1346 	zv->zv_volsize = volsize;
1347 	zv->zv_volblocksize = volblocksize;
1348 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1349 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1350 		struct g_provider *pp;
1351 		struct g_geom *gp;
1352 
1353 		g_topology_lock();
1354 		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
1355 		gp->start = zvol_geom_bio_start;
1356 		gp->access = zvol_geom_access;
1357 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
1358 		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1359 		pp->sectorsize = DEV_BSIZE;
1360 		pp->mediasize = 0;
1361 		pp->private = zv;
1362 
1363 		zsg->zsg_provider = pp;
1364 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1365 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1366 		struct cdev *dev;
1367 		struct make_dev_args args;
1368 
1369 		make_dev_args_init(&args);
1370 		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1371 		args.mda_devsw = &zvol_cdevsw;
1372 		args.mda_cr = NULL;
1373 		args.mda_uid = UID_ROOT;
1374 		args.mda_gid = GID_OPERATOR;
1375 		args.mda_mode = 0640;
1376 		args.mda_si_drv2 = zv;
1377 		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
1378 		if (error) {
1379 			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
1380 			kmem_free(zv, sizeof (zvol_state_t));
1381 			return (error);
1382 		}
1383 
1384 		dev->si_iosize_max = maxphys;
1385 		zsd->zsd_cdev = dev;
1386 		knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock);
1387 	}
1388 	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
1389 	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
1390 	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
1391 
1392 	*zvp = zv;
1393 	return (error);
1394 }
1395 
1396 /*
1397  * Remove minor node for the specified volume.
1398  */
1399 void
1400 zvol_os_remove_minor(zvol_state_t *zv)
1401 {
1402 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1403 	ASSERT0(zv->zv_open_count);
1404 	ASSERT0(atomic_read(&zv->zv_suspend_ref));
1405 	ASSERT(zv->zv_flags & ZVOL_REMOVING);
1406 
1407 	struct zvol_state_os *zso = zv->zv_zso;
1408 	zv->zv_zso = NULL;
1409 
1410 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1411 		struct zvol_state_geom *zsg = &zso->zso_geom;
1412 		struct g_provider *pp = zsg->zsg_provider;
1413 		atomic_store_ptr(&pp->private, NULL);
1414 		mutex_exit(&zv->zv_state_lock);
1415 
1416 		g_topology_lock();
1417 		g_wither_geom(pp->geom, ENXIO);
1418 		g_topology_unlock();
1419 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1420 		struct zvol_state_dev *zsd = &zso->zso_dev;
1421 		struct cdev *dev = zsd->zsd_cdev;
1422 
1423 		if (dev != NULL)
1424 			atomic_store_ptr(&dev->si_drv2, NULL);
1425 		mutex_exit(&zv->zv_state_lock);
1426 
1427 		if (dev != NULL) {
1428 			destroy_dev(dev);
1429 			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
1430 			knlist_destroy(&zsd->zsd_selinfo.si_note);
1431 		}
1432 	}
1433 
1434 	kmem_free(zso, sizeof (struct zvol_state_os));
1435 
1436 	mutex_enter(&zv->zv_state_lock);
1437 }
1438 
1439 void
1440 zvol_os_free(zvol_state_t *zv)
1441 {
1442 	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1443 	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
1444 	ASSERT0(zv->zv_open_count);
1445 	ASSERT0P(zv->zv_zso);
1446 
1447 	ASSERT0P(zv->zv_objset);
1448 	ASSERT0P(zv->zv_zilog);
1449 	ASSERT0P(zv->zv_dn);
1450 
1451 	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
1452 
1453 	rw_destroy(&zv->zv_suspend_lock);
1454 	zfs_rangelock_fini(&zv->zv_rangelock);
1455 
1456 	mutex_destroy(&zv->zv_state_lock);
1457 	cv_destroy(&zv->zv_removing_cv);
1458 	dataset_kstats_destroy(&zv->zv_kstat);
1459 	kmem_free(zv, sizeof (zvol_state_t));
1460 	zvol_minors--;
1461 }
1462 
1463 /*
1464  * Create a minor node (plus a whole lot more) for the specified volume.
1465  */
1466 int
1467 zvol_os_create_minor(const char *name)
1468 {
1469 	zvol_state_t *zv = NULL;
1470 	objset_t *os;
1471 	dmu_object_info_t *doi;
1472 	uint64_t volsize;
1473 	uint64_t hash, len;
1474 	int error;
1475 	bool replayed_zil = B_FALSE;
1476 
1477 	if (zvol_inhibit_dev)
1478 		return (0);
1479 
1480 	ZFS_LOG(1, "Creating ZVOL %s...", name);
1481 	hash = zvol_name_hash(name);
1482 	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
1483 		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1484 		mutex_exit(&zv->zv_state_lock);
1485 		return (SET_ERROR(EEXIST));
1486 	}
1487 
1488 	DROP_GIANT();
1489 
1490 	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1491 
1492 	/* Lie and say we're read-only. */
1493 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
1494 	if (error)
1495 		goto out_doi;
1496 
1497 	error = dmu_object_info(os, ZVOL_OBJ, doi);
1498 	if (error)
1499 		goto out_dmu_objset_disown;
1500 
1501 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1502 	if (error)
1503 		goto out_dmu_objset_disown;
1504 
1505 	error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv);
1506 	if (error || zv == NULL)
1507 		goto out_dmu_objset_disown;
1508 
1509 	zv->zv_hash = hash;
1510 
1511 	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
1512 		zv->zv_flags |= ZVOL_RDONLY;
1513 
1514 	zv->zv_objset = os;
1515 
1516 	ASSERT0P(zv->zv_kstat.dk_kstats);
1517 	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1518 	if (error)
1519 		goto out_dmu_objset_disown;
1520 	ASSERT0P(zv->zv_zilog);
1521 	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1522 	if (spa_writeable(dmu_objset_spa(os))) {
1523 		if (zil_replay_disable)
1524 			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
1525 		else
1526 			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1527 	}
1528 	if (replayed_zil)
1529 		zil_close(zv->zv_zilog);
1530 	zv->zv_zilog = NULL;
1531 
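	/*
	 * Prefetch the first and last portions of the volume, where partition
	 * tables and disk labels typically live, so the reads issued when the
	 * new minor is tasted are likely already cached.
	 */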
1532 	len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
1533 	if (len > 0) {
1534 		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_ASYNC_READ);
1535 		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
1536 		    ZIO_PRIORITY_ASYNC_READ);
1537 	}
1538 
1539 	zv->zv_objset = NULL;
1540 out_dmu_objset_disown:
1541 	dmu_objset_disown(os, B_TRUE, FTAG);
1542 
1543 	if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1544 		g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0);
1545 		/* The GEOM topology was locked inside zvol_alloc(). */
1546 		g_topology_unlock();
1547 	}
1548 out_doi:
1549 	kmem_free(doi, sizeof (dmu_object_info_t));
1550 	if (error == 0 && zv) {
1551 		rw_enter(&zvol_state_lock, RW_WRITER);
1552 		zvol_insert(zv);
1553 		zvol_minors++;
1554 		rw_exit(&zvol_state_lock);
1555 		ZFS_LOG(1, "ZVOL %s created.", name);
1556 	}
1557 	PICKUP_GIANT();
1558 	return (error);
1559 }
1560 
1561 int
1562 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
1563 {
1564 	zv->zv_volsize = volsize;
1565 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1566 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1567 		struct g_provider *pp = zsg->zsg_provider;
1568 
1569 		g_topology_lock();
1570 
1571 		if (pp->private == NULL) {
1572 			g_topology_unlock();
1573 			return (SET_ERROR(ENXIO));
1574 		}
1575 
1576 		/*
1577 		 * Do not invoke a resize event when the initial size was zero.
1578 		 * The zvol initializes its size on first open; this is not a
1579 		 * real resize.
1580 		 */
1581 		if (pp->mediasize == 0)
1582 			pp->mediasize = zv->zv_volsize;
1583 		else
1584 			g_resize_provider(pp, zv->zv_volsize);
1585 
1586 		g_topology_unlock();
1587 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1588 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1589 
1590 		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
1591 	}
1592 	return (0);
1593 }
1594 
1595 void
1596 zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
1597 {
1598 	/*
1599 	 * The ro/rw zvol mode is switched by zvol_set_ro(), which sets or
1600 	 * clears the ZVOL_RDONLY flag.  No additional FreeBSD-specific
1601 	 * action is required when the readonly ZFS property changes.
1602 	 */
1603 }
1604 
1605 void
1606 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
1607 {
1608 	/*
1609 	 * The zvol size/capacity is changed by zvol_set_volsize().  Leave
1610 	 * this method empty, because all of the required work is done by
1611 	 * the platform-specific zvol_os_update_volsize() function.
1612 	 */
1613 }
1614 
1615 /*
1616  * Public interfaces
1617  */
1618 
1619 int
1620 zvol_busy(void)
1621 {
1622 	return (zvol_minors != 0);
1623 }
1624 
1625 int
1626 zvol_init(void)
1627 {
1628 	return (zvol_init_impl());
1629 }
1630 
1631 void
1632 zvol_fini(void)
1633 {
1634 	zvol_fini_impl();
1635 }
1636