xref: /freebsd/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c (revision 8ac904ce090b1c2e355da8aa122ca2252183f4e1)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  *
25  * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
26  * All rights reserved.
27  *
28  * Portions Copyright 2010 Robert Milkowski
29  *
30  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
31  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
32  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
33  * Copyright (c) 2014 Integros [integros.com]
34  * Copyright (c) 2024, 2025, Klara, Inc.
35  */
36 
37 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
38 
39 /*
40  * ZFS volume emulation driver.
41  *
42  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
43  * Volumes are accessed through the symbolic links named:
44  *
45  * /dev/zvol/<pool_name>/<dataset_name>
46  *
47  * Volumes are persistent through reboot.  No user command needs to be
48  * run before opening and using a device.
49  *
50  * On FreeBSD, ZVOLs are GEOM providers like any other storage device in
51  * the system, except when they are plain character devices (volmode=dev).
52  */
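
/*
 * Example (hypothetical names): a volume created with
 * "zfs create -V 10g tank/vol" appears as /dev/zvol/tank/vol and can be
 * used like any other disk, e.g. "newfs /dev/zvol/tank/vol".
 */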
53 
54 #include <sys/types.h>
55 #include <sys/param.h>
56 #include <sys/kernel.h>
57 #include <sys/errno.h>
58 #include <sys/uio.h>
59 #include <sys/bio.h>
60 #include <sys/buf.h>
61 #include <sys/kmem.h>
62 #include <sys/conf.h>
63 #include <sys/cmn_err.h>
64 #include <sys/stat.h>
65 #include <sys/proc.h>
66 #include <sys/zap.h>
67 #include <sys/spa.h>
68 #include <sys/spa_impl.h>
69 #include <sys/zio.h>
70 #include <sys/disk.h>
71 #include <sys/dmu_traverse.h>
72 #include <sys/dnode.h>
73 #include <sys/dsl_dataset.h>
74 #include <sys/dsl_prop.h>
75 #include <sys/dsl_dir.h>
76 #include <sys/byteorder.h>
77 #include <sys/sunddi.h>
78 #include <sys/dirent.h>
79 #include <sys/policy.h>
80 #include <sys/queue.h>
81 #include <sys/fs/zfs.h>
82 #include <sys/zfs_ioctl.h>
83 #include <sys/zil.h>
84 #include <sys/zfs_znode.h>
85 #include <sys/zfs_rlock.h>
86 #include <sys/vdev_impl.h>
87 #include <sys/vdev_raidz.h>
88 #include <sys/zvol.h>
89 #include <sys/zil_impl.h>
90 #include <sys/dataset_kstats.h>
91 #include <sys/dbuf.h>
92 #include <sys/dmu_tx.h>
93 #include <sys/zfeature.h>
94 #include <sys/zio_checksum.h>
95 #include <sys/zil_impl.h>
96 #include <sys/filio.h>
97 #include <sys/freebsd_event.h>
98 
99 #include <geom/geom.h>
100 #include <sys/zvol.h>
101 #include <sys/zvol_impl.h>
102 #include <cityhash.h>
103 
104 #include "zfs_namecheck.h"
105 
106 #define	ZVOL_DUMPSIZE		"dumpsize"
107 
108 #ifdef ZVOL_LOCK_DEBUG
109 #define	ZVOL_RW_READER		RW_WRITER
110 #define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
111 #else
112 #define	ZVOL_RW_READER		RW_READER
113 #define	ZVOL_RW_READ_HELD	RW_READ_HELD
114 #endif
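
/*
 * With ZVOL_LOCK_DEBUG the "reader" acquisitions of zv_suspend_lock above
 * become writer acquisitions, so lock-ordering mistakes fail immediately
 * instead of hiding behind shared-reader concurrency.
 */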
115 
116 struct zvol_state_os {
117 #define	zso_dev		_zso_state._zso_dev
118 #define	zso_geom	_zso_state._zso_geom
119 	union {
120 		/* volmode=dev */
121 		struct zvol_state_dev {
122 			struct cdev *zsd_cdev;
123 			struct selinfo zsd_selinfo;
124 		} _zso_dev;
125 
126 		/* volmode=geom */
127 		struct zvol_state_geom {
128 			struct g_provider *zsg_provider;
129 		} _zso_geom;
130 	} _zso_state;
131 	int zso_dying;
132 };
133 
134 static uint32_t zvol_minors;
135 
136 SYSCTL_DECL(_vfs_zfs);
137 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
138 
139 static boolean_t zpool_on_zvol = B_FALSE;
140 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
141 	"Allow zpools to use zvols as vdevs (DANGEROUS)");
142 
143 /*
144  * Toggle unmap functionality.
145  */
146 boolean_t zvol_unmap_enabled = B_TRUE;
147 
148 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
149 	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");
150 
151 /*
152  * zvol maximum transfer in one DMU tx.
153  */
154 int zvol_maxphys = DMU_MAX_ACCESS / 2;
155 
156 static void zvol_ensure_zilog(zvol_state_t *zv);
157 
158 static d_open_t		zvol_cdev_open;
159 static d_close_t	zvol_cdev_close;
160 static d_ioctl_t	zvol_cdev_ioctl;
161 static d_read_t		zvol_cdev_read;
162 static d_write_t	zvol_cdev_write;
163 static d_strategy_t	zvol_cdev_bio_strategy;
164 static d_kqfilter_t	zvol_cdev_kqfilter;
165 
166 static struct cdevsw zvol_cdevsw = {
167 	.d_name =	"zvol",
168 	.d_version =	D_VERSION,
169 	.d_flags =	D_DISK | D_TRACKCLOSE,
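	/*
	 * D_TRACKCLOSE makes the kernel call d_close for every close(2),
	 * not just the last one, keeping zv_open_count balanced.
	 */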
170 	.d_open =	zvol_cdev_open,
171 	.d_close =	zvol_cdev_close,
172 	.d_ioctl =	zvol_cdev_ioctl,
173 	.d_read =	zvol_cdev_read,
174 	.d_write =	zvol_cdev_write,
175 	.d_strategy =	zvol_cdev_bio_strategy,
176 	.d_kqfilter =	zvol_cdev_kqfilter,
177 };
178 
179 static void		zvol_filter_detach(struct knote *kn);
180 static int		zvol_filter_vnode(struct knote *kn, long hint);
181 
182 static struct filterops zvol_filterops_vnode = {
183 	.f_isfd = 1,
184 	.f_detach = zvol_filter_detach,
185 	.f_event = zvol_filter_vnode,
186 	.f_copy = knote_triv_copy,
187 };
188 
189 extern uint_t zfs_geom_probe_vdev_key;
190 
191 struct g_class zfs_zvol_class = {
192 	.name = "ZFS::ZVOL",
193 	.version = G_VERSION,
194 };
195 
196 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
197 
198 static int zvol_geom_open(struct g_provider *pp, int flag, int count);
199 static int zvol_geom_close(struct g_provider *pp, int flag, int count);
200 static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
201 static void zvol_geom_bio_start(struct bio *bp);
202 static int zvol_geom_bio_getattr(struct bio *bp);
203 static void zvol_geom_bio_strategy(struct bio *bp, boolean_t sync);
204 
205 /*
206  * GEOM mode implementation
207  */
208 
209 static int
210 zvol_geom_open(struct g_provider *pp, int flag, int count)
211 {
212 	zvol_state_t *zv;
213 	int err = 0;
214 	boolean_t drop_suspend = B_FALSE;
215 
216 	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
217 		/*
218 		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
219 		 * attempting to probe geom providers while looking for a
220 		 * replacement for a missing VDEV.  In this case, the
221 		 * spa_namespace_lock will not be held, but it is still illegal
222 		 * to use a zvol as a vdev.  Deadlocks can result if another
223 		 * thread has spa_namespace_lock.
224 		 */
225 		return (SET_ERROR(EOPNOTSUPP));
226 	}
227 
228 retry:
229 	zv = atomic_load_ptr(&pp->private);
230 	if (zv == NULL)
231 		return (SET_ERROR(ENXIO));
232 
233 	mutex_enter(&zv->zv_state_lock);
234 	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
235 		err = SET_ERROR(ENXIO);
236 		goto out_locked;
237 	}
238 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
239 
240 	/*
241 	 * Make sure zvol is not suspended during first open
242 	 * (hold zv_suspend_lock) and respect proper lock acquisition
243 	 * ordering - zv_suspend_lock before zv_state_lock.
244 	 */
245 	if (zv->zv_open_count == 0) {
246 		drop_suspend = B_TRUE;
247 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
248 			mutex_exit(&zv->zv_state_lock);
249 
250 			/*
251 			 * Removal may happen while the locks are down, so
252 			 * we can't trust zv any longer; we have to start over.
253 			 */
254 			zv = atomic_load_ptr(&pp->private);
255 			if (zv == NULL)
256 				return (SET_ERROR(ENXIO));
257 
258 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
259 			mutex_enter(&zv->zv_state_lock);
260 
261 			if (zv->zv_zso->zso_dying ||
262 			    zv->zv_flags & ZVOL_REMOVING) {
263 				err = SET_ERROR(ENXIO);
264 				goto out_locked;
265 			}
266 
267 			/* Check to see if zv_suspend_lock is needed. */
268 			if (zv->zv_open_count != 0) {
269 				rw_exit(&zv->zv_suspend_lock);
270 				drop_suspend = B_FALSE;
271 			}
272 		}
273 	}
274 
275 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
276 
277 	if (zv->zv_open_count == 0) {
278 		boolean_t drop_namespace = B_FALSE;
279 
280 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
281 
282 		/*
283 		 * Take spa_namespace_lock to prevent lock inversion when
284 		 * zvols from one pool are opened as vdevs in another.
285 		 */
286 		if (!spa_namespace_held()) {
287 			if (!spa_namespace_tryenter(FTAG)) {
288 				mutex_exit(&zv->zv_state_lock);
289 				rw_exit(&zv->zv_suspend_lock);
290 				drop_suspend = B_FALSE;
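				/*
				 * Both locks are dropped here; yield so the
				 * namespace-lock holder can make progress,
				 * then restart the open from scratch.
				 */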
291 				kern_yield(PRI_USER);
292 				goto retry;
293 			} else {
294 				drop_namespace = B_TRUE;
295 			}
296 		}
297 		err = zvol_first_open(zv, !(flag & FWRITE));
298 		if (drop_namespace)
299 			spa_namespace_exit(FTAG);
300 		if (err)
301 			goto out_locked;
302 		pp->mediasize = zv->zv_volsize;
303 		pp->stripeoffset = 0;
304 		pp->stripesize = zv->zv_volblocksize;
305 	}
306 
307 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
308 
309 	/*
310 	 * Check for a bad on-disk format version now since we
311 	 * lied about owning the dataset readonly before.
312 	 */
313 	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
314 	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
315 		err = SET_ERROR(EROFS);
316 		goto out_opened;
317 	}
318 	if (zv->zv_flags & ZVOL_EXCL) {
319 		err = SET_ERROR(EBUSY);
320 		goto out_opened;
321 	}
322 	if (flag & O_EXCL) {
323 		if (zv->zv_open_count != 0) {
324 			err = SET_ERROR(EBUSY);
325 			goto out_opened;
326 		}
327 		zv->zv_flags |= ZVOL_EXCL;
328 	}
329 
330 	zv->zv_open_count += count;
331 out_opened:
332 	if (zv->zv_open_count == 0) {
333 		zvol_last_close(zv);
334 		wakeup(zv);
335 	}
336 out_locked:
337 	mutex_exit(&zv->zv_state_lock);
338 	if (drop_suspend)
339 		rw_exit(&zv->zv_suspend_lock);
340 	return (err);
341 }
342 
343 static int
344 zvol_geom_close(struct g_provider *pp, int flag, int count)
345 {
346 	(void) flag;
347 	zvol_state_t *zv;
348 	boolean_t drop_suspend = B_TRUE;
349 	int new_open_count;
350 
351 	zv = atomic_load_ptr(&pp->private);
352 	if (zv == NULL)
353 		return (SET_ERROR(ENXIO));
354 
355 	mutex_enter(&zv->zv_state_lock);
356 	if (zv->zv_flags & ZVOL_EXCL) {
357 		ASSERT3U(zv->zv_open_count, ==, 1);
358 		zv->zv_flags &= ~ZVOL_EXCL;
359 	}
360 
361 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
362 
363 	/*
364 	 * If the open count is zero, this is a spurious close.
365 	 * That indicates a bug in the kernel / DDI framework.
366 	 */
367 	ASSERT3U(zv->zv_open_count, >, 0);
368 
369 	/*
370 	 * Make sure zvol is not suspended during last close
371 	 * (hold zv_suspend_lock) and respect proper lock acquisition
372 	 * ordering - zv_suspend_lock before zv_state_lock.
373 	 */
374 	new_open_count = zv->zv_open_count - count;
375 	if (new_open_count == 0) {
376 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
377 			mutex_exit(&zv->zv_state_lock);
378 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
379 			mutex_enter(&zv->zv_state_lock);
380 
381 			/*
382 			 * Unlike in zvol_geom_open(), we don't check if
383 			 * removal started here, because we might be one of the
384 			 * openers that needs to be thrown out! If we're the
385 			 * last, we need to call zvol_last_close() below to
386 			 * finish cleanup. So, no special treatment for us.
387 			 */
388 
389 			/* Check to see if zv_suspend_lock is needed. */
390 			new_open_count = zv->zv_open_count - count;
391 			if (new_open_count != 0) {
392 				rw_exit(&zv->zv_suspend_lock);
393 				drop_suspend = B_FALSE;
394 			}
395 		}
396 	} else {
397 		drop_suspend = B_FALSE;
398 	}
399 
400 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
401 
402 	/*
403 	 * You may get multiple opens, but only one close.
404 	 */
405 	zv->zv_open_count = new_open_count;
406 	if (zv->zv_open_count == 0) {
407 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
408 		zvol_last_close(zv);
409 		wakeup(zv);
410 	}
411 
412 	mutex_exit(&zv->zv_state_lock);
413 
414 	if (drop_suspend)
415 		rw_exit(&zv->zv_suspend_lock);
416 	return (0);
417 }
418 
419 void
420 zvol_wait_close(zvol_state_t *zv)
421 {
422 
423 	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
424 		return;
425 	mutex_enter(&zv->zv_state_lock);
426 	zv->zv_zso->zso_dying = B_TRUE;
427 
428 	if (zv->zv_open_count)
429 		msleep(zv, &zv->zv_state_lock,
430 		    PRIBIO, "zvol:dying", 10*hz);
431 	mutex_exit(&zv->zv_state_lock);
432 }
433 
434 
435 static int
436 zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
437 {
438 	int count, error, flags;
439 
440 	g_topology_assert();
441 
442 	/*
443 	 * To make it easier we expect either open or close, but not both
444 	 * at the same time.
445 	 */
446 	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
447 	    (acr <= 0 && acw <= 0 && ace <= 0),
448 	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
449 	    pp->name, acr, acw, ace));
450 
451 	if (atomic_load_ptr(&pp->private) == NULL) {
452 		if (acr <= 0 && acw <= 0 && ace <= 0)
453 			return (0);
454 		return (pp->error);
455 	}
456 
457 	/*
458 	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
459 	 * ace != 0, because GEOM already handles that and handles it a bit
460 	 * differently. GEOM allows for multiple read/exclusive consumers and
461 	 * ZFS allows only one exclusive consumer, no matter if it is reader or
462 	 * writer. I like better the way GEOM works so I'll leave it for GEOM
463 	 * to decide what to do.
464 	 */
465 
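	/*
	 * A positive net access-count delta is an open acquiring that many
	 * references; a negative delta is a close releasing them.
	 */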
466 	count = acr + acw + ace;
467 	if (count == 0)
468 		return (0);
469 
470 	flags = 0;
471 	if (acr != 0 || ace != 0)
472 		flags |= FREAD;
473 	if (acw != 0)
474 		flags |= FWRITE;
475 
476 	g_topology_unlock();
477 	if (count > 0)
478 		error = zvol_geom_open(pp, flags, count);
479 	else
480 		error = zvol_geom_close(pp, flags, -count);
481 	g_topology_lock();
482 	return (error);
483 }
484 
485 static void
486 zvol_geom_bio_start(struct bio *bp)
487 {
488 	zvol_state_t *zv = bp->bio_to->private;
489 
490 	if (zv == NULL) {
491 		g_io_deliver(bp, ENXIO);
492 		return;
493 	}
494 	if (bp->bio_cmd == BIO_GETATTR) {
495 		if (zvol_geom_bio_getattr(bp))
496 			g_io_deliver(bp, EOPNOTSUPP);
497 		return;
498 	}
499 
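	/*
	 * Handle the bio synchronously only when we are not on a GEOM
	 * thread and the current thread may sleep; otherwise it is queued
	 * to a taskq inside zvol_geom_bio_strategy().
	 */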
500 	zvol_geom_bio_strategy(bp, !g_is_geom_thread(curthread) &&
501 	    THREAD_CAN_SLEEP());
502 }
503 
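/*
 * Answer GEOM::candelete and the block usage attributes.  Returns 0 once
 * the attribute has been handled and delivered, nonzero so the caller
 * completes the bio with EOPNOTSUPP.
 */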
504 static int
505 zvol_geom_bio_getattr(struct bio *bp)
506 {
507 	zvol_state_t *zv;
508 
509 	zv = bp->bio_to->private;
510 	ASSERT3P(zv, !=, NULL);
511 
512 	spa_t *spa = dmu_objset_spa(zv->zv_objset);
513 	uint64_t refd, avail, usedobjs, availobjs;
514 
515 	if (g_handleattr_int(bp, "GEOM::candelete", 1))
516 		return (0);
517 	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
518 		dmu_objset_space(zv->zv_objset, &refd, &avail,
519 		    &usedobjs, &availobjs);
520 		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
521 			return (0);
522 	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
523 		dmu_objset_space(zv->zv_objset, &refd, &avail,
524 		    &usedobjs, &availobjs);
525 		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
526 			return (0);
527 	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
528 		avail = metaslab_class_get_space(spa_normal_class(spa));
529 		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
530 		if (g_handleattr_off_t(bp, "poolblocksavail",
531 		    avail / DEV_BSIZE))
532 			return (0);
533 	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
534 		refd = metaslab_class_get_alloc(spa_normal_class(spa));
535 		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
536 			return (0);
537 	}
538 	return (1);
539 }
540 
541 static void
542 zvol_filter_detach(struct knote *kn)
543 {
544 	zvol_state_t *zv;
545 	struct zvol_state_dev *zsd;
546 
547 	zv = kn->kn_hook;
548 	zsd = &zv->zv_zso->zso_dev;
549 
550 	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
551 }
552 
553 static int
554 zvol_filter_vnode(struct knote *kn, long hint)
555 {
556 	kn->kn_fflags |= kn->kn_sfflags & hint;
557 
558 	return (kn->kn_fflags != 0);
559 }
560 
561 static int
562 zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
563 {
564 	zvol_state_t *zv;
565 	struct zvol_state_dev *zsd;
566 
567 	zv = dev->si_drv2;
568 	zsd = &zv->zv_zso->zso_dev;
569 
570 	if (kn->kn_filter != EVFILT_VNODE)
571 		return (EINVAL);
572 
573 	/* XXX: extend support for other NOTE_* events */
574 	if (kn->kn_sfflags != NOTE_ATTRIB)
575 		return (EINVAL);
576 
577 	kn->kn_fop = &zvol_filterops_vnode;
578 	kn->kn_hook = zv;
579 	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);
580 
581 	return (0);
582 }
583 
584 static void
585 zvol_strategy_impl(zv_request_t *zvr)
586 {
587 	zvol_state_t *zv;
588 	struct bio *bp;
589 	uint64_t off, volsize;
590 	size_t resid;
591 	char *addr;
592 	objset_t *os;
593 	zfs_locked_range_t *lr;
594 	int error = 0;
595 	boolean_t doread = B_FALSE;
596 	boolean_t is_dumpified;
597 	boolean_t commit;
598 
599 	bp = zvr->bio;
600 	zv = zvr->zv;
601 	if (zv == NULL) {
602 		error = SET_ERROR(ENXIO);
603 		goto out;
604 	}
605 
606 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
607 
608 	if (zv->zv_flags & ZVOL_REMOVING) {
609 		error = SET_ERROR(ENXIO);
610 		goto resume;
611 	}
612 
613 	switch (bp->bio_cmd) {
614 	case BIO_READ:
615 		doread = B_TRUE;
616 		break;
617 	case BIO_WRITE:
618 	case BIO_FLUSH:
619 	case BIO_DELETE:
620 		if (zv->zv_flags & ZVOL_RDONLY) {
621 			error = SET_ERROR(EROFS);
622 			goto resume;
623 		}
624 		zvol_ensure_zilog(zv);
625 		if (bp->bio_cmd == BIO_FLUSH)
626 			goto commit;
627 		break;
628 	default:
629 		error = SET_ERROR(EOPNOTSUPP);
630 		goto resume;
631 	}
632 
633 	off = bp->bio_offset;
634 	volsize = zv->zv_volsize;
635 
636 	os = zv->zv_objset;
637 	ASSERT3P(os, !=, NULL);
638 
639 	addr = bp->bio_data;
640 	resid = bp->bio_length;
641 
642 	if (resid > 0 && off >= volsize) {
643 		error = SET_ERROR(EIO);
644 		goto resume;
645 	}
646 
647 	is_dumpified = B_FALSE;
648 	commit = !doread && !is_dumpified &&
649 	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
650 
651 	/*
652 	 * There must be no buffer changes when doing a dmu_sync() because
653 	 * we can't change the data whilst calculating the checksum.
654 	 */
655 	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
656 	    doread ? RL_READER : RL_WRITER);
657 
658 	if (bp->bio_cmd == BIO_DELETE) {
659 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
660 		error = dmu_tx_assign(tx, DMU_TX_WAIT);
661 		if (error != 0) {
662 			dmu_tx_abort(tx);
663 		} else {
664 			zvol_log_truncate(zv, tx, off, resid);
665 			dmu_tx_commit(tx);
666 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
667 			    off, resid);
668 			resid = 0;
669 		}
670 		goto unlock;
671 	}
672 	while (resid != 0 && off < volsize) {
673 		size_t size = MIN(resid, zvol_maxphys);
674 		if (doread) {
675 			error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
676 			    DMU_READ_PREFETCH);
677 		} else {
678 			dmu_tx_t *tx = dmu_tx_create(os);
679 			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
680 			error = dmu_tx_assign(tx, DMU_TX_WAIT);
681 			if (error) {
682 				dmu_tx_abort(tx);
683 			} else {
684 				dmu_write_by_dnode(zv->zv_dn, off, size, addr,
685 				    tx, DMU_READ_PREFETCH);
686 				zvol_log_write(zv, tx, off, size, commit);
687 				dmu_tx_commit(tx);
688 			}
689 		}
690 		if (error) {
691 			/* Convert checksum errors into IO errors. */
692 			if (error == ECKSUM)
693 				error = SET_ERROR(EIO);
694 			break;
695 		}
696 		off += size;
697 		addr += size;
698 		resid -= size;
699 	}
700 unlock:
701 	zfs_rangelock_exit(lr);
702 
703 	bp->bio_completed = bp->bio_length - resid;
704 	if (bp->bio_completed < bp->bio_length && off > volsize)
705 		error = SET_ERROR(EINVAL);
706 
707 	switch (bp->bio_cmd) {
708 	case BIO_FLUSH:
709 		break;
710 	case BIO_READ:
711 		dataset_kstats_update_read_kstats(&zv->zv_kstat,
712 		    bp->bio_completed);
713 		break;
714 	case BIO_WRITE:
715 		dataset_kstats_update_write_kstats(&zv->zv_kstat,
716 		    bp->bio_completed);
717 		break;
718 	case BIO_DELETE:
719 		break;
720 	default:
721 		break;
722 	}
723 
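	/*
	 * BIO_FLUSH jumps directly to the commit label inside this block;
	 * writes fall into it only when the dataset requires synchronous
	 * semantics (sync=always).
	 */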
724 	if (error == 0 && commit) {
725 commit:
726 		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
727 	}
728 resume:
729 	rw_exit(&zv->zv_suspend_lock);
730 out:
731 	if (bp->bio_to)
732 		g_io_deliver(bp, error);
733 	else
734 		biofinish(bp, NULL, error);
735 }
736 
737 static void
738 zvol_strategy_task(void *arg)
739 {
740 	zv_request_task_t *task = arg;
741 
742 	zvol_strategy_impl(&task->zvr);
743 	zv_request_task_free(task);
744 }
745 
746 static void
747 zvol_geom_bio_strategy(struct bio *bp, boolean_t sync)
748 {
749 	zv_taskq_t *ztqs = &zvol_taskqs;
750 	zv_request_task_t *task;
751 	zvol_state_t *zv;
752 	uint_t tq_idx;
753 	uint_t taskq_hash;
754 	int error;
755 
756 	if (bp->bio_to)
757 		zv = bp->bio_to->private;
758 	else
759 		zv = bp->bio_dev->si_drv2;
760 
761 	if (zv == NULL) {
762 		error = SET_ERROR(ENXIO);
763 		if (bp->bio_to)
764 			g_io_deliver(bp, error);
765 		else
766 			biofinish(bp, NULL, error);
767 		return;
768 	}
769 
770 	zv_request_t zvr = {
771 		.zv = zv,
772 		.bio = bp,
773 	};
774 
775 	if (sync || zvol_request_sync) {
776 		zvol_strategy_impl(&zvr);
777 		return;
778 	}
779 
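	/*
	 * Pick a taskq by hashing the zvol, the current CPU, and the
	 * offset in ZVOL_TASKQ_OFFSET_SHIFT-sized chunks, spreading
	 * independent requests across queues.
	 */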
780 	taskq_hash = cityhash3((uintptr_t)zv, curcpu, bp->bio_offset >>
781 	    ZVOL_TASKQ_OFFSET_SHIFT);
782 	tq_idx = taskq_hash % ztqs->tqs_cnt;
783 	task = zv_request_task_create(zvr);
784 	taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_strategy_task, task,
785 	    0, &task->ent);
786 }
787 
788 static void
789 zvol_cdev_bio_strategy(struct bio *bp)
790 {
791 	zvol_geom_bio_strategy(bp, B_FALSE);
792 }
793 
794 /*
795  * Character device mode implementation
796  */
797 
798 static int
799 zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
800 {
801 	zvol_state_t *zv;
802 	uint64_t volsize;
803 	zfs_locked_range_t *lr;
804 	int error = 0;
805 	zfs_uio_t uio;
806 
807 	zfs_uio_init(&uio, uio_s);
808 
809 	zv = dev->si_drv2;
810 
811 	volsize = zv->zv_volsize;
812 	/*
813 	 * uio_loffset == volsize isn't an error as
814 	 * it's required for EOF processing.
815 	 */
816 	if (zfs_uio_resid(&uio) > 0 &&
817 	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
818 		return (SET_ERROR(EIO));
819 
820 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
821 	ssize_t start_resid = zfs_uio_resid(&uio);
822 	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
823 	    zfs_uio_resid(&uio), RL_READER);
824 	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
825 		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
826 
827 		/* Don't read past the end. */
828 		if (bytes > volsize - zfs_uio_offset(&uio))
829 			bytes = volsize - zfs_uio_offset(&uio);
830 
831 		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
832 		    DMU_READ_PREFETCH);
833 		if (error) {
834 			/* Convert checksum errors into IO errors. */
835 			if (error == ECKSUM)
836 				error = SET_ERROR(EIO);
837 			break;
838 		}
839 	}
840 	zfs_rangelock_exit(lr);
841 	int64_t nread = start_resid - zfs_uio_resid(&uio);
842 	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
843 	rw_exit(&zv->zv_suspend_lock);
844 
845 	return (error);
846 }
847 
848 static int
849 zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
850 {
851 	zvol_state_t *zv;
852 	uint64_t volsize;
853 	zfs_locked_range_t *lr;
854 	int error = 0;
855 	boolean_t commit;
856 	zfs_uio_t uio;
857 
858 	zv = dev->si_drv2;
859 
860 	volsize = zv->zv_volsize;
861 
862 	zfs_uio_init(&uio, uio_s);
863 
864 	if (zfs_uio_resid(&uio) > 0 &&
865 	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
866 		return (SET_ERROR(EIO));
867 
868 	ssize_t start_resid = zfs_uio_resid(&uio);
869 	commit = (ioflag & IO_SYNC) ||
870 	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
871 
872 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
873 	zvol_ensure_zilog(zv);
874 
875 	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
876 	    zfs_uio_resid(&uio), RL_WRITER);
877 	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
878 		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
879 		uint64_t off = zfs_uio_offset(&uio);
880 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
881 
882 		if (bytes > volsize - off)	/* Don't write past the end. */
883 			bytes = volsize - off;
884 
885 		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
886 		error = dmu_tx_assign(tx, DMU_TX_WAIT);
887 		if (error) {
888 			dmu_tx_abort(tx);
889 			break;
890 		}
891 		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
892 		    DMU_READ_PREFETCH);
893 		if (error == 0)
894 			zvol_log_write(zv, tx, off, bytes, commit);
895 		dmu_tx_commit(tx);
896 
897 		if (error)
898 			break;
899 	}
900 	zfs_rangelock_exit(lr);
901 	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
902 	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
903 	if (error == 0 && commit)
904 		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
905 	rw_exit(&zv->zv_suspend_lock);
906 
907 	return (error);
908 }
909 
910 static int
911 zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
912 {
913 	zvol_state_t *zv;
914 	int err = 0;
915 	boolean_t drop_suspend = B_FALSE;
916 
917 retry:
918 	zv = atomic_load_ptr(&dev->si_drv2);
919 	if (zv == NULL)
920 		return (SET_ERROR(ENXIO));
921 
922 	mutex_enter(&zv->zv_state_lock);
923 	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
924 		err = SET_ERROR(ENXIO);
925 		goto out_locked;
926 	}
927 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
928 
929 	/*
930 	 * Make sure zvol is not suspended during first open
931 	 * (hold zv_suspend_lock) and respect proper lock acquisition
932 	 * ordering - zv_suspend_lock before zv_state_lock.
933 	 */
934 	if (zv->zv_open_count == 0) {
935 		drop_suspend = B_TRUE;
936 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
937 			mutex_exit(&zv->zv_state_lock);
938 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
939 			mutex_enter(&zv->zv_state_lock);
940 
941 			if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
942 				/* Removal started while locks were down. */
943 				err = SET_ERROR(ENXIO);
944 				goto out_locked;
945 			}
946 
947 			/* Check to see if zv_suspend_lock is needed. */
948 			if (zv->zv_open_count != 0) {
949 				rw_exit(&zv->zv_suspend_lock);
950 				drop_suspend = B_FALSE;
951 			}
952 		}
953 	}
954 
955 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
956 
957 	if (zv->zv_open_count == 0) {
958 		boolean_t drop_namespace = B_FALSE;
959 
960 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
961 
962 		/*
963 		 * Take spa_namespace_lock to prevent lock inversion when
964 		 * zvols from one pool are opened as vdevs in another.
965 		 */
966 		if (!spa_namespace_held()) {
967 			if (!spa_namespace_tryenter(FTAG)) {
968 				mutex_exit(&zv->zv_state_lock);
969 				rw_exit(&zv->zv_suspend_lock);
970 				drop_suspend = B_FALSE;
971 				kern_yield(PRI_USER);
972 				goto retry;
973 			} else {
974 				drop_namespace = B_TRUE;
975 			}
976 		}
977 		err = zvol_first_open(zv, !(flags & FWRITE));
978 		if (drop_namespace)
979 			spa_namespace_exit(FTAG);
980 		if (err)
981 			goto out_locked;
982 	}
983 
984 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
985 
986 	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
987 		err = SET_ERROR(EROFS);
988 		goto out_opened;
989 	}
990 	if (zv->zv_flags & ZVOL_EXCL) {
991 		err = SET_ERROR(EBUSY);
992 		goto out_opened;
993 	}
994 	if (flags & O_EXCL) {
995 		if (zv->zv_open_count != 0) {
996 			err = SET_ERROR(EBUSY);
997 			goto out_opened;
998 		}
999 		zv->zv_flags |= ZVOL_EXCL;
1000 	}
1001 
1002 	zv->zv_open_count++;
1003 out_opened:
1004 	if (zv->zv_open_count == 0) {
1005 		zvol_last_close(zv);
1006 		wakeup(zv);
1007 	}
1008 out_locked:
1009 	mutex_exit(&zv->zv_state_lock);
1010 	if (drop_suspend)
1011 		rw_exit(&zv->zv_suspend_lock);
1012 	return (err);
1013 }
1014 
1015 static int
1016 zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
1017 {
1018 	zvol_state_t *zv;
1019 	boolean_t drop_suspend = B_TRUE;
1020 
1021 	zv = atomic_load_ptr(&dev->si_drv2);
1022 	if (zv == NULL)
1023 		return (SET_ERROR(ENXIO));
1024 
1025 	mutex_enter(&zv->zv_state_lock);
1026 	if (zv->zv_flags & ZVOL_EXCL) {
1027 		ASSERT3U(zv->zv_open_count, ==, 1);
1028 		zv->zv_flags &= ~ZVOL_EXCL;
1029 	}
1030 
1031 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
1032 
1033 	/*
1034 	 * If the open count is zero, this is a spurious close.
1035 	 * That indicates a bug in the kernel / DDI framework.
1036 	 */
1037 	ASSERT3U(zv->zv_open_count, >, 0);
1038 	/*
1039 	 * Make sure zvol is not suspended during last close
1040 	 * (hold zv_suspend_lock) and respect proper lock acquisition
1041 	 * ordering - zv_suspend_lock before zv_state_lock.
1042 	 */
1043 	if (zv->zv_open_count == 1) {
1044 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
1045 			mutex_exit(&zv->zv_state_lock);
1046 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1047 			mutex_enter(&zv->zv_state_lock);
1048 
1049 			/*
1050 			 * Unlike in zvol_cdev_open(), we don't check if
1051 			 * removal started here, because we might be one of the
1052 			 * openers that needs to be thrown out! If we're the
1053 			 * last, we need to call zvol_last_close() below to
1054 			 * finish cleanup. So, no special treatment for us.
1055 			 */
1056 
1057 			/* Check to see if zv_suspend_lock is needed. */
1058 			if (zv->zv_open_count != 1) {
1059 				rw_exit(&zv->zv_suspend_lock);
1060 				drop_suspend = B_FALSE;
1061 			}
1062 		}
1063 	} else {
1064 		drop_suspend = B_FALSE;
1065 	}
1066 
1067 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1068 
1069 	/*
1070 	 * You may get multiple opens, but only one close.
1071 	 */
1072 	zv->zv_open_count--;
1073 
1074 	if (zv->zv_open_count == 0) {
1075 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1076 		zvol_last_close(zv);
1077 		wakeup(zv);
1078 	}
1079 
1080 	mutex_exit(&zv->zv_state_lock);
1081 
1082 	if (drop_suspend)
1083 		rw_exit(&zv->zv_suspend_lock);
1084 	return (0);
1085 }
1086 
1087 static int
1088 zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
1089     int fflag, struct thread *td)
1090 {
1091 	zvol_state_t *zv;
1092 	zfs_locked_range_t *lr;
1093 	off_t offset, length;
1094 	int error;
1095 	boolean_t sync;
1096 
1097 	zv = atomic_load_ptr(&dev->si_drv2);
1098 	ASSERT3P(zv, !=, NULL);
1099 
1100 	error = 0;
1101 	KASSERT(zv->zv_open_count > 0,
1102 	    ("Device with zero access count in %s", __func__));
1103 
1104 	switch (cmd) {
1105 	case DIOCGSECTORSIZE:
1106 		*(uint32_t *)data = DEV_BSIZE;
1107 		break;
1108 	case DIOCGMEDIASIZE:
1109 		*(off_t *)data = zv->zv_volsize;
1110 		break;
1111 	case DIOCGFLUSH:
1112 		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1113 		if (zv->zv_zilog != NULL)
1114 			error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
1115 		rw_exit(&zv->zv_suspend_lock);
1116 		break;
1117 	case DIOCGDELETE:
1118 		if (!zvol_unmap_enabled)
1119 			break;
1120 
1121 		offset = ((off_t *)data)[0];
1122 		length = ((off_t *)data)[1];
1123 		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
1124 		    offset < 0 || offset >= zv->zv_volsize ||
1125 		    length <= 0) {
1126 			printf("%s: offset=%jd length=%jd\n", __func__, offset,
1127 			    length);
1128 			error = SET_ERROR(EINVAL);
1129 			break;
1130 		}
1131 		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1132 		zvol_ensure_zilog(zv);
1133 		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
1134 		    RL_WRITER);
1135 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1136 		error = dmu_tx_assign(tx, DMU_TX_WAIT);
1137 		if (error != 0) {
1138 			sync = FALSE;
1139 			dmu_tx_abort(tx);
1140 		} else {
1141 			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1142 			zvol_log_truncate(zv, tx, offset, length);
1143 			dmu_tx_commit(tx);
1144 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1145 			    offset, length);
1146 		}
1147 		zfs_rangelock_exit(lr);
1148 		if (sync)
1149 			error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
1150 		rw_exit(&zv->zv_suspend_lock);
1151 		break;
1152 	case DIOCGSTRIPESIZE:
1153 		*(off_t *)data = zv->zv_volblocksize;
1154 		break;
1155 	case DIOCGSTRIPEOFFSET:
1156 		*(off_t *)data = 0;
1157 		break;
1158 	case DIOCGATTR: {
1159 		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1160 		spa_t *spa = dmu_objset_spa(zv->zv_objset);
1161 		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
1162 		uint64_t refd, avail, usedobjs, availobjs;
1163 
1164 		if (strcmp(arg->name, "GEOM::candelete") == 0)
1165 			arg->value.i = 1;
1166 		else if (strcmp(arg->name, "blocksavail") == 0) {
1167 			dmu_objset_space(zv->zv_objset, &refd, &avail,
1168 			    &usedobjs, &availobjs);
1169 			arg->value.off = avail / DEV_BSIZE;
1170 		} else if (strcmp(arg->name, "blocksused") == 0) {
1171 			dmu_objset_space(zv->zv_objset, &refd, &avail,
1172 			    &usedobjs, &availobjs);
1173 			arg->value.off = refd / DEV_BSIZE;
1174 		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
1175 			avail = metaslab_class_get_space(spa_normal_class(spa));
1176 			avail -= metaslab_class_get_alloc(
1177 			    spa_normal_class(spa));
1178 			arg->value.off = avail / DEV_BSIZE;
1179 		} else if (strcmp(arg->name, "poolblocksused") == 0) {
1180 			refd = metaslab_class_get_alloc(spa_normal_class(spa));
1181 			arg->value.off = refd / DEV_BSIZE;
1182 		} else
1183 			error = SET_ERROR(ENOIOCTL);
1184 		rw_exit(&zv->zv_suspend_lock);
1185 		break;
1186 	}
1187 	case FIOSEEKHOLE:
1188 	case FIOSEEKDATA: {
1189 		off_t *off = (off_t *)data;
1190 		uint64_t noff;
1191 		boolean_t hole;
1192 
1193 		hole = (cmd == FIOSEEKHOLE);
1194 		noff = *off;
1195 		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1196 		lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
1197 		    RL_READER);
1198 		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
1199 		zfs_rangelock_exit(lr);
1200 		rw_exit(&zv->zv_suspend_lock);
1201 		*off = noff;
1202 		break;
1203 	}
1204 	default:
1205 		error = SET_ERROR(ENOIOCTL);
1206 	}
1207 
1208 	return (error);
1209 }
1210 
1211 /*
1212  * Misc. helpers
1213  */
1214 
1215 static void
1216 zvol_ensure_zilog(zvol_state_t *zv)
1217 {
1218 	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1219 
1220 	/*
1221 	 * Open a ZIL if this is the first time we have written to this
1222 	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
1223 	 * than zv_state_lock so that we don't need to acquire an
1224 	 * additional lock in this path.
1225 	 */
1226 	if (zv->zv_zilog == NULL) {
1227 		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
1228 			rw_exit(&zv->zv_suspend_lock);
1229 			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
1230 		}
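		/*
		 * The suspend lock may have been dropped while upgrading, so
		 * another thread could have opened the ZIL already; recheck
		 * before opening it ourselves.
		 */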
1231 		if (zv->zv_zilog == NULL) {
1232 			zv->zv_zilog = zil_open(zv->zv_objset,
1233 			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1234 			zv->zv_flags |= ZVOL_WRITTEN_TO;
1235 			/* replay / destroy done in zvol_os_create_minor() */
1236 			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
1237 			    ZIL_REPLAY_NEEDED);
1238 		}
1239 		rw_downgrade(&zv->zv_suspend_lock);
1240 	}
1241 }
1242 
1243 boolean_t
1244 zvol_os_is_zvol(const char *device)
1245 {
1246 	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
1247 }
1248 
1249 int
1250 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
1251 {
1252 	int error = 0;
1253 
1254 	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1255 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1256 
1257 	/* Move to a new hashtable entry.  */
1258 	zv->zv_hash = zvol_name_hash(newname);
1259 	hlist_del(&zv->zv_hlink);
1260 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1261 
1262 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1263 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1264 		struct g_provider *pp = zsg->zsg_provider;
1265 		struct g_geom *gp;
1266 
1267 		g_topology_lock();
1268 		gp = pp->geom;
1269 		ASSERT3P(gp, !=, NULL);
1270 
1271 		zsg->zsg_provider = NULL;
1272 		g_wither_provider(pp, ENXIO);
1273 
1274 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
1275 		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1276 		pp->sectorsize = DEV_BSIZE;
1277 		pp->mediasize = zv->zv_volsize;
1278 		pp->private = zv;
1279 		zsg->zsg_provider = pp;
1280 		g_error_provider(pp, 0);
1281 		g_topology_unlock();
1282 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1283 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1284 		struct cdev *dev;
1285 		struct make_dev_args args;
1286 
1287 		dev = zsd->zsd_cdev;
1288 		if (dev != NULL) {
1289 			destroy_dev(dev);
1290 			dev = zsd->zsd_cdev = NULL;
1291 			if (zv->zv_open_count > 0) {
1292 				zv->zv_flags &= ~ZVOL_EXCL;
1293 				zv->zv_open_count = 0;
1294 				/* XXX  need suspend lock but lock order */
1295 				zvol_last_close(zv);
1296 			}
1297 		}
1298 
1299 		make_dev_args_init(&args);
1300 		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1301 		args.mda_devsw = &zvol_cdevsw;
1302 		args.mda_cr = NULL;
1303 		args.mda_uid = UID_ROOT;
1304 		args.mda_gid = GID_OPERATOR;
1305 		args.mda_mode = 0640;
1306 		args.mda_si_drv2 = zv;
1307 		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname);
1308 		if (error == 0) {
1309 			dev->si_iosize_max = maxphys;
1310 			zsd->zsd_cdev = dev;
1311 		}
1312 	}
1313 	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1314 	dataset_kstats_rename(&zv->zv_kstat, newname);
1315 
1316 	return (error);
1317 }
1318 
1319 /*
1320  * Allocate memory for a new zvol_state_t and set up the required
1321  * request queue and generic disk structures for the block device.
1322  */
1323 static int
1324 zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize,
1325     zvol_state_t **zvp)
1326 {
1327 	zvol_state_t *zv;
1328 	uint64_t volmode;
1329 	int error;
1330 
1331 	error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE),
1332 	    &volmode, NULL);
1333 	if (error)
1334 		return (error);
1335 
1336 	if (volmode == ZFS_VOLMODE_DEFAULT)
1337 		volmode = zvol_volmode;
1338 
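	/*
	 * volmode=none means no device node at all: return success without
	 * setting *zvp, so callers must handle a NULL zvol_state_t.
	 */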
1339 	if (volmode == ZFS_VOLMODE_NONE)
1340 		return (0);
1341 
1342 	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
1343 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
1344 	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
1345 	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
1346 	zv->zv_volmode = volmode;
1347 	zv->zv_volsize = volsize;
1348 	zv->zv_volblocksize = volblocksize;
1349 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1350 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1351 		struct g_provider *pp;
1352 		struct g_geom *gp;
1353 
1354 		g_topology_lock();
1355 		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
1356 		gp->start = zvol_geom_bio_start;
1357 		gp->access = zvol_geom_access;
1358 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
1359 		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1360 		pp->sectorsize = DEV_BSIZE;
1361 		pp->mediasize = 0;
1362 		pp->private = zv;
1363 
1364 		zsg->zsg_provider = pp;
1365 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1366 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1367 		struct cdev *dev;
1368 		struct make_dev_args args;
1369 
1370 		make_dev_args_init(&args);
1371 		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1372 		args.mda_devsw = &zvol_cdevsw;
1373 		args.mda_cr = NULL;
1374 		args.mda_uid = UID_ROOT;
1375 		args.mda_gid = GID_OPERATOR;
1376 		args.mda_mode = 0640;
1377 		args.mda_si_drv2 = zv;
1378 		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
1379 		if (error) {
1380 			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
1381 			kmem_free(zv, sizeof (zvol_state_t));
1382 			return (error);
1383 		}
1384 
1385 		dev->si_iosize_max = maxphys;
1386 		zsd->zsd_cdev = dev;
1387 		knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock);
1388 	}
1389 	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
1390 	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
1391 	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
1392 
1393 	*zvp = zv;
1394 	return (error);
1395 }
1396 
1397 /*
1398  * Remove minor node for the specified volume.
1399  */
1400 void
1401 zvol_os_remove_minor(zvol_state_t *zv)
1402 {
1403 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1404 	ASSERT0(zv->zv_open_count);
1405 	ASSERT0(atomic_read(&zv->zv_suspend_ref));
1406 	ASSERT(zv->zv_flags & ZVOL_REMOVING);
1407 
1408 	struct zvol_state_os *zso = zv->zv_zso;
1409 	zv->zv_zso = NULL;
1410 
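	/*
	 * Clear the device's private pointer while zv_state_lock is still
	 * held so new opens fail with ENXIO, then drop the lock before the
	 * potentially sleeping GEOM/devfs teardown below.
	 */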
1411 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1412 		struct zvol_state_geom *zsg = &zso->zso_geom;
1413 		struct g_provider *pp = zsg->zsg_provider;
1414 		atomic_store_ptr(&pp->private, NULL);
1415 		mutex_exit(&zv->zv_state_lock);
1416 
1417 		g_topology_lock();
1418 		g_wither_geom(pp->geom, ENXIO);
1419 		g_topology_unlock();
1420 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1421 		struct zvol_state_dev *zsd = &zso->zso_dev;
1422 		struct cdev *dev = zsd->zsd_cdev;
1423 
1424 		if (dev != NULL)
1425 			atomic_store_ptr(&dev->si_drv2, NULL);
1426 		mutex_exit(&zv->zv_state_lock);
1427 
1428 		if (dev != NULL) {
1429 			destroy_dev(dev);
1430 			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
1431 			knlist_destroy(&zsd->zsd_selinfo.si_note);
1432 		}
1433 	}
1434 
1435 	kmem_free(zso, sizeof (struct zvol_state_os));
1436 
1437 	mutex_enter(&zv->zv_state_lock);
1438 }
1439 
1440 void
1441 zvol_os_free(zvol_state_t *zv)
1442 {
1443 	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1444 	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
1445 	ASSERT0(zv->zv_open_count);
1446 	ASSERT0P(zv->zv_zso);
1447 
1448 	ASSERT0P(zv->zv_objset);
1449 	ASSERT0P(zv->zv_zilog);
1450 	ASSERT0P(zv->zv_dn);
1451 
1452 	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
1453 
1454 	rw_destroy(&zv->zv_suspend_lock);
1455 	zfs_rangelock_fini(&zv->zv_rangelock);
1456 
1457 	mutex_destroy(&zv->zv_state_lock);
1458 	cv_destroy(&zv->zv_removing_cv);
1459 	dataset_kstats_destroy(&zv->zv_kstat);
1460 	kmem_free(zv, sizeof (zvol_state_t));
1461 	zvol_minors--;
1462 }
1463 
1464 /*
1465  * Create a minor node (plus a whole lot more) for the specified volume.
1466  */
1467 int
1468 zvol_os_create_minor(const char *name)
1469 {
1470 	zvol_state_t *zv = NULL;
1471 	objset_t *os;
1472 	dmu_object_info_t *doi;
1473 	uint64_t volsize;
1474 	uint64_t hash, len;
1475 	int error;
1476 	bool replayed_zil = B_FALSE;
1477 
1478 	if (zvol_inhibit_dev)
1479 		return (0);
1480 
1481 	ZFS_LOG(1, "Creating ZVOL %s...", name);
1482 	hash = zvol_name_hash(name);
1483 	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
1484 		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1485 		mutex_exit(&zv->zv_state_lock);
1486 		return (SET_ERROR(EEXIST));
1487 	}
1488 
1489 	DROP_GIANT();
1490 
1491 	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1492 
1493 	/* Lie and say we're read-only. */
1494 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
1495 	if (error)
1496 		goto out_doi;
1497 
1498 	error = dmu_object_info(os, ZVOL_OBJ, doi);
1499 	if (error)
1500 		goto out_dmu_objset_disown;
1501 
1502 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1503 	if (error)
1504 		goto out_dmu_objset_disown;
1505 
1506 	error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv);
1507 	if (error || zv == NULL)
1508 		goto out_dmu_objset_disown;
1509 
1510 	zv->zv_hash = hash;
1511 
1512 	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
1513 		zv->zv_flags |= ZVOL_RDONLY;
1514 
1515 	zv->zv_objset = os;
1516 
1517 	ASSERT0P(zv->zv_kstat.dk_kstats);
1518 	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1519 	if (error)
1520 		goto out_dmu_objset_disown;
1521 	ASSERT0P(zv->zv_zilog);
1522 	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1523 	if (spa_writeable(dmu_objset_spa(os))) {
1524 		if (zil_replay_disable)
1525 			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
1526 		else
1527 			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1528 	}
1529 	if (replayed_zil)
1530 		zil_close(zv->zv_zilog);
1531 	zv->zv_zilog = NULL;
1532 
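	/*
	 * Prefetch the head and tail of the volume, presumably so that the
	 * partition metadata read when the device is tasted is already
	 * cached.
	 */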
1533 	len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
1534 	if (len > 0) {
1535 		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_ASYNC_READ);
1536 		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
1537 		    ZIO_PRIORITY_ASYNC_READ);
1538 	}
1539 
1540 	zv->zv_objset = NULL;
1541 out_dmu_objset_disown:
1542 	dmu_objset_disown(os, B_TRUE, FTAG);
1543 
1544 	if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1545 		g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0);
1546 		/* geom was locked inside zvol_alloc() function */
1547 		g_topology_unlock();
1548 	}
1549 out_doi:
1550 	kmem_free(doi, sizeof (dmu_object_info_t));
1551 	if (error == 0 && zv) {
1552 		rw_enter(&zvol_state_lock, RW_WRITER);
1553 		zvol_insert(zv);
1554 		zvol_minors++;
1555 		rw_exit(&zvol_state_lock);
1556 		ZFS_LOG(1, "ZVOL %s created.", name);
1557 	}
1558 	PICKUP_GIANT();
1559 	return (error);
1560 }
1561 
1562 int
1563 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
1564 {
1565 	zv->zv_volsize = volsize;
1566 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1567 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1568 		struct g_provider *pp = zsg->zsg_provider;
1569 
1570 		g_topology_lock();
1571 
1572 		if (pp->private == NULL) {
1573 			g_topology_unlock();
1574 			return (SET_ERROR(ENXIO));
1575 		}
1576 
1577 		/*
1578 		 * Do not invoke a resize event when the initial size was
1579 		 * zero: ZVOL initializes the size on first open, so this
1580 		 * is not a real resize.
1581 		 */
1582 		if (pp->mediasize == 0)
1583 			pp->mediasize = zv->zv_volsize;
1584 		else
1585 			g_resize_provider(pp, zv->zv_volsize);
1586 
1587 		g_topology_unlock();
1588 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1589 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1590 
1591 		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
1592 	}
1593 	return (0);
1594 }
1595 
1596 void
1597 zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
1598 {
1599 	/*
1600 	 * The ro/rw ZVOL mode is switched by the zvol_set_ro() function,
1601 	 * which sets or clears the ZVOL_RDONLY flag.  No additional
1602 	 * FreeBSD-specific action is needed when the readonly property changes.
1603 	 */
1604 }
1605 
1606 void
1607 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
1608 {
1609 	/*
1610 	 * The ZVOL size/capacity is changed by the zvol_set_volsize()
1611 	 * function.  This method is left empty because all the required
1612 	 * work is done by the platform-specific zvol_os_update_volsize().
1613 	 */
1614 }
1615 
1616 /*
1617  * Public interfaces
1618  */
1619 
1620 int
1621 zvol_busy(void)
1622 {
1623 	return (zvol_minors != 0);
1624 }
1625 
1626 int
1627 zvol_init(void)
1628 {
1629 	return (zvol_init_impl());
1630 }
1631 
1632 void
1633 zvol_fini(void)
1634 {
1635 	zvol_fini_impl();
1636 }
1637