xref: /freebsd/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c (revision 7a7741af18d6c8a804cc643cb7ecda9d730c6aa6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  *
24  * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
25  * All rights reserved.
26  *
27  * Portions Copyright 2010 Robert Milkowski
28  *
29  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
30  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
31  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
32  * Copyright (c) 2014 Integros [integros.com]
33  * Copyright (c) 2024, Klara, Inc.
34  */
35 
36 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
37 
38 /*
39  * ZFS volume emulation driver.
40  *
41  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
42  * Volumes are accessed through the symbolic links named:
43  *
44  * /dev/zvol/<pool_name>/<dataset_name>
45  *
46  * Volumes are persistent through reboot.  No user command needs to be
47  * run before opening and using a device.
48  *
49  * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
50  * in the system. Except when they're simply character devices (volmode=dev).
51  */
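
/*
 * Illustrative example (not part of this driver): creating a volume and
 * finding its device node.  The pool and volume names are hypothetical.
 *
 *	# zfs create -V 10G tank/vol0
 *	# ls -l /dev/zvol/tank/vol0
 *
 * With volmode=geom the node is backed by the GEOM provider created below;
 * with volmode=dev it is a plain character device served by zvol_cdevsw.
 */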
52 
53 #include <sys/types.h>
54 #include <sys/param.h>
55 #include <sys/kernel.h>
56 #include <sys/errno.h>
57 #include <sys/uio.h>
58 #include <sys/bio.h>
59 #include <sys/buf.h>
60 #include <sys/kmem.h>
61 #include <sys/conf.h>
62 #include <sys/cmn_err.h>
63 #include <sys/stat.h>
64 #include <sys/proc.h>
65 #include <sys/zap.h>
66 #include <sys/spa.h>
67 #include <sys/spa_impl.h>
68 #include <sys/zio.h>
69 #include <sys/disk.h>
70 #include <sys/dmu_traverse.h>
71 #include <sys/dnode.h>
72 #include <sys/dsl_dataset.h>
73 #include <sys/dsl_prop.h>
74 #include <sys/dsl_dir.h>
75 #include <sys/byteorder.h>
76 #include <sys/sunddi.h>
77 #include <sys/dirent.h>
78 #include <sys/policy.h>
79 #include <sys/queue.h>
80 #include <sys/fs/zfs.h>
81 #include <sys/zfs_ioctl.h>
82 #include <sys/zil.h>
83 #include <sys/zfs_znode.h>
84 #include <sys/zfs_rlock.h>
85 #include <sys/vdev_impl.h>
86 #include <sys/vdev_raidz.h>
87 #include <sys/zvol.h>
88 #include <sys/zil_impl.h>
89 #include <sys/dataset_kstats.h>
90 #include <sys/dbuf.h>
91 #include <sys/dmu_tx.h>
92 #include <sys/zfeature.h>
93 #include <sys/zio_checksum.h>
94 #include <sys/zil_impl.h>
95 #include <sys/filio.h>
96 #include <sys/freebsd_event.h>
97 
98 #include <geom/geom.h>
99 #include <sys/zvol.h>
100 #include <sys/zvol_impl.h>
101 
102 #include "zfs_namecheck.h"
103 
104 #define	ZVOL_DUMPSIZE		"dumpsize"
105 
106 #ifdef ZVOL_LOCK_DEBUG
107 #define	ZVOL_RW_READER		RW_WRITER
108 #define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
109 #else
110 #define	ZVOL_RW_READER		RW_READER
111 #define	ZVOL_RW_READ_HELD	RW_READ_HELD
112 #endif
113 
114 enum zvol_geom_state {
115 	ZVOL_GEOM_UNINIT,
116 	ZVOL_GEOM_STOPPED,
117 	ZVOL_GEOM_RUNNING,
118 };
119 
120 struct zvol_state_os {
121 #define	zso_dev		_zso_state._zso_dev
122 #define	zso_geom	_zso_state._zso_geom
123 	union {
124 		/* volmode=dev */
125 		struct zvol_state_dev {
126 			struct cdev *zsd_cdev;
127 			struct selinfo zsd_selinfo;
128 		} _zso_dev;
129 
130 		/* volmode=geom */
131 		struct zvol_state_geom {
132 			struct g_provider *zsg_provider;
133 			struct bio_queue_head zsg_queue;
134 			struct mtx zsg_queue_mtx;
135 			enum zvol_geom_state zsg_state;
136 		} _zso_geom;
137 	} _zso_state;
138 	int zso_dying;
139 };
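
/*
 * Note: the zso_dev/zso_geom defines above let mode-specific code write
 * zv->zv_zso->zso_geom rather than spelling out the anonymous union path
 * (zv->zv_zso->_zso_state._zso_geom).  Only the arm matching zv_volmode
 * is ever valid for a given zvol.
 */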
140 
141 static uint32_t zvol_minors;
142 
143 SYSCTL_DECL(_vfs_zfs);
144 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
145 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
146 	"Expose as GEOM providers (1), device files (2) or neither");
147 static boolean_t zpool_on_zvol = B_FALSE;
148 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
149 	"Allow zpools to use zvols as vdevs (DANGEROUS)");
150 
151 /*
152  * Toggle unmap functionality.
153  */
154 boolean_t zvol_unmap_enabled = B_TRUE;
155 
156 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
157 	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");
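
/*
 * Illustrative tuning example: the knobs above live under vfs.zfs.vol and,
 * being CTLFLAG_RWTUN, can also be set as loader tunables:
 *
 *	# sysctl vfs.zfs.vol.mode=2
 *	# sysctl vfs.zfs.vol.unmap_enabled=0
 */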
158 
159 /*
160  * zvol maximum transfer in one DMU tx.
161  */
162 int zvol_maxphys = DMU_MAX_ACCESS / 2;
163 
164 static void zvol_ensure_zilog(zvol_state_t *zv);
165 
166 static d_open_t		zvol_cdev_open;
167 static d_close_t	zvol_cdev_close;
168 static d_ioctl_t	zvol_cdev_ioctl;
169 static d_read_t		zvol_cdev_read;
170 static d_write_t	zvol_cdev_write;
171 static d_strategy_t	zvol_geom_bio_strategy;
172 static d_kqfilter_t	zvol_cdev_kqfilter;
173 
174 static struct cdevsw zvol_cdevsw = {
175 	.d_name =	"zvol",
176 	.d_version =	D_VERSION,
177 	.d_flags =	D_DISK | D_TRACKCLOSE,
178 	.d_open =	zvol_cdev_open,
179 	.d_close =	zvol_cdev_close,
180 	.d_ioctl =	zvol_cdev_ioctl,
181 	.d_read =	zvol_cdev_read,
182 	.d_write =	zvol_cdev_write,
183 	.d_strategy =	zvol_geom_bio_strategy,
184 	.d_kqfilter =	zvol_cdev_kqfilter,
185 };
186 
187 static void		zvol_filter_detach(struct knote *kn);
188 static int		zvol_filter_vnode(struct knote *kn, long hint);
189 
190 static struct filterops zvol_filterops_vnode = {
191 	.f_isfd = 1,
192 	.f_detach = zvol_filter_detach,
193 	.f_event = zvol_filter_vnode,
194 };
195 
196 extern uint_t zfs_geom_probe_vdev_key;
197 
198 struct g_class zfs_zvol_class = {
199 	.name = "ZFS::ZVOL",
200 	.version = G_VERSION,
201 };
202 
203 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
204 
205 static int zvol_geom_open(struct g_provider *pp, int flag, int count);
206 static int zvol_geom_close(struct g_provider *pp, int flag, int count);
207 static void zvol_geom_run(zvol_state_t *zv);
208 static void zvol_geom_destroy(zvol_state_t *zv);
209 static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
210 static void zvol_geom_worker(void *arg);
211 static void zvol_geom_bio_start(struct bio *bp);
212 static int zvol_geom_bio_getattr(struct bio *bp);
213 /* static d_strategy_t	zvol_geom_bio_strategy; (declared elsewhere) */
214 
215 /*
216  * GEOM mode implementation
217  */
218 
219 static int
220 zvol_geom_open(struct g_provider *pp, int flag, int count)
221 {
222 	zvol_state_t *zv;
223 	int err = 0;
224 	boolean_t drop_suspend = B_FALSE;
225 
226 	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
227 		/*
228 		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
229 		 * attempting to probe geom providers while looking for a
230 		 * replacement for a missing VDEV.  In this case, the
231 		 * spa_namespace_lock will not be held, but it is still illegal
232 		 * to use a zvol as a vdev.  Deadlocks can result if another
233 		 * thread has spa_namespace_lock.
234 		 */
235 		return (SET_ERROR(EOPNOTSUPP));
236 	}
237 
238 retry:
239 	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
240 	/*
241 	 * Obtain a copy of private under zvol_state_lock to make sure either
242 	 * the result of zvol free code setting private to NULL is observed,
243 	 * or the zv is protected from being freed because of the positive
244 	 * zv_open_count.
245 	 */
246 	zv = pp->private;
247 	if (zv == NULL) {
248 		rw_exit(&zvol_state_lock);
249 		err = SET_ERROR(ENXIO);
250 		goto out_locked;
251 	}
252 
253 	mutex_enter(&zv->zv_state_lock);
254 	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
255 		rw_exit(&zvol_state_lock);
256 		err = SET_ERROR(ENXIO);
257 		goto out_zv_locked;
258 	}
259 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
260 
261 	/*
262 	 * Make sure zvol is not suspended during first open
263 	 * (hold zv_suspend_lock) and respect proper lock acquisition
264 	 * ordering - zv_suspend_lock before zv_state_lock.
265 	 */
266 	if (zv->zv_open_count == 0) {
267 		drop_suspend = B_TRUE;
268 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
269 			mutex_exit(&zv->zv_state_lock);
270 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
271 			mutex_enter(&zv->zv_state_lock);
272 			/* Check to see if zv_suspend_lock is needed. */
273 			if (zv->zv_open_count != 0) {
274 				rw_exit(&zv->zv_suspend_lock);
275 				drop_suspend = B_FALSE;
276 			}
277 		}
278 	}
279 	rw_exit(&zvol_state_lock);
280 
281 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
282 
283 	if (zv->zv_open_count == 0) {
284 		boolean_t drop_namespace = B_FALSE;
285 
286 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
287 
288 		/*
289 		 * Take spa_namespace_lock to prevent lock inversion when
290 		 * zvols from one pool are opened as vdevs in another.
291 		 */
292 		if (!mutex_owned(&spa_namespace_lock)) {
293 			if (!mutex_tryenter(&spa_namespace_lock)) {
294 				mutex_exit(&zv->zv_state_lock);
295 				rw_exit(&zv->zv_suspend_lock);
296 				drop_suspend = B_FALSE;
297 				kern_yield(PRI_USER);
298 				goto retry;
299 			} else {
300 				drop_namespace = B_TRUE;
301 			}
302 		}
303 		err = zvol_first_open(zv, !(flag & FWRITE));
304 		if (drop_namespace)
305 			mutex_exit(&spa_namespace_lock);
306 		if (err)
307 			goto out_zv_locked;
308 		pp->mediasize = zv->zv_volsize;
309 		pp->stripeoffset = 0;
310 		pp->stripesize = zv->zv_volblocksize;
311 	}
312 
313 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
314 
315 	/*
316 	 * Check for a bad on-disk format version now since we
317 	 * lied about owning the dataset readonly before.
318 	 */
319 	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
320 	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
321 		err = SET_ERROR(EROFS);
322 		goto out_opened;
323 	}
324 	if (zv->zv_flags & ZVOL_EXCL) {
325 		err = SET_ERROR(EBUSY);
326 		goto out_opened;
327 	}
328 	if (flag & O_EXCL) {
329 		if (zv->zv_open_count != 0) {
330 			err = SET_ERROR(EBUSY);
331 			goto out_opened;
332 		}
333 		zv->zv_flags |= ZVOL_EXCL;
334 	}
335 
336 	zv->zv_open_count += count;
337 out_opened:
338 	if (zv->zv_open_count == 0) {
339 		zvol_last_close(zv);
340 		wakeup(zv);
341 	}
342 out_zv_locked:
343 	mutex_exit(&zv->zv_state_lock);
344 out_locked:
345 	if (drop_suspend)
346 		rw_exit(&zv->zv_suspend_lock);
347 	return (err);
348 }
349 
350 static int
351 zvol_geom_close(struct g_provider *pp, int flag, int count)
352 {
353 	(void) flag;
354 	zvol_state_t *zv;
355 	boolean_t drop_suspend = B_TRUE;
356 	int new_open_count;
357 
358 	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
359 	zv = pp->private;
360 	if (zv == NULL) {
361 		rw_exit(&zvol_state_lock);
362 		return (SET_ERROR(ENXIO));
363 	}
364 
365 	mutex_enter(&zv->zv_state_lock);
366 	if (zv->zv_flags & ZVOL_EXCL) {
367 		ASSERT3U(zv->zv_open_count, ==, 1);
368 		zv->zv_flags &= ~ZVOL_EXCL;
369 	}
370 
371 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
372 
373 	/*
374 	 * If the open count is zero, this is a spurious close.
375 	 * That indicates a bug in the kernel / DDI framework.
376 	 */
377 	ASSERT3U(zv->zv_open_count, >, 0);
378 
379 	/*
380 	 * Make sure zvol is not suspended during last close
381 	 * (hold zv_suspend_lock) and respect proper lock acquisition
382 	 * ordering - zv_suspend_lock before zv_state_lock.
383 	 */
384 	new_open_count = zv->zv_open_count - count;
385 	if (new_open_count == 0) {
386 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
387 			mutex_exit(&zv->zv_state_lock);
388 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
389 			mutex_enter(&zv->zv_state_lock);
390 			/* Check to see if zv_suspend_lock is needed. */
391 			new_open_count = zv->zv_open_count - count;
392 			if (new_open_count != 0) {
393 				rw_exit(&zv->zv_suspend_lock);
394 				drop_suspend = B_FALSE;
395 			}
396 		}
397 	} else {
398 		drop_suspend = B_FALSE;
399 	}
400 	rw_exit(&zvol_state_lock);
401 
402 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
403 
404 	/*
405 	 * You may get multiple opens, but only one close.
406 	 */
407 	zv->zv_open_count = new_open_count;
408 	if (zv->zv_open_count == 0) {
409 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
410 		zvol_last_close(zv);
411 		wakeup(zv);
412 	}
413 
414 	mutex_exit(&zv->zv_state_lock);
415 
416 	if (drop_suspend)
417 		rw_exit(&zv->zv_suspend_lock);
418 	return (0);
419 }
420 
421 static void
422 zvol_geom_run(zvol_state_t *zv)
423 {
424 	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
425 	struct g_provider *pp = zsg->zsg_provider;
426 
427 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
428 
429 	g_error_provider(pp, 0);
430 
431 	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
432 	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
433 }
434 
435 static void
436 zvol_geom_destroy(zvol_state_t *zv)
437 {
438 	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
439 	struct g_provider *pp = zsg->zsg_provider;
440 
441 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
442 
443 	g_topology_assert();
444 
445 	mutex_enter(&zv->zv_state_lock);
446 	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
447 	mutex_exit(&zv->zv_state_lock);
448 	zsg->zsg_provider = NULL;
449 	g_wither_geom(pp->geom, ENXIO);
450 }
451 
452 void
453 zvol_wait_close(zvol_state_t *zv)
454 {
455 
456 	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
457 		return;
458 	mutex_enter(&zv->zv_state_lock);
459 	zv->zv_zso->zso_dying = B_TRUE;
460 
461 	if (zv->zv_open_count)
462 		msleep(zv, &zv->zv_state_lock,
463 		    PRIBIO, "zvol:dying", 10*hz);
464 	mutex_exit(&zv->zv_state_lock);
465 }
466 
467 
468 static int
469 zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
470 {
471 	int count, error, flags;
472 
473 	g_topology_assert();
474 
475 	/*
476 	 * To make it easier we expect either open or close, but not both
477 	 * at the same time.
478 	 */
479 	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
480 	    (acr <= 0 && acw <= 0 && ace <= 0),
481 	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
482 	    pp->name, acr, acw, ace));
483 
484 	if (pp->private == NULL) {
485 		if (acr <= 0 && acw <= 0 && ace <= 0)
486 			return (0);
487 		return (pp->error);
488 	}
489 
490 	/*
491 	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
492 	 * ace != 0, because GEOM already handles that and handles it a bit
493 	 * differently. GEOM allows for multiple read/exclusive consumers and
494 	 * ZFS allows only one exclusive consumer, no matter if it is reader or
495  * writer. I like the way GEOM works better, so I'll leave it to GEOM
496 	 * to decide what to do.
497 	 */
498 
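	/*
	 * Example: a consumer opening read-only passes acr=1, acw=0, ace=0,
	 * so count == 1 and we call zvol_geom_open(pp, FREAD, 1); closing it
	 * later passes acr=-1, giving count == -1 and
	 * zvol_geom_close(pp, FREAD, 1).
	 */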
499 	count = acr + acw + ace;
500 	if (count == 0)
501 		return (0);
502 
503 	flags = 0;
504 	if (acr != 0 || ace != 0)
505 		flags |= FREAD;
506 	if (acw != 0)
507 		flags |= FWRITE;
508 
509 	g_topology_unlock();
510 	if (count > 0)
511 		error = zvol_geom_open(pp, flags, count);
512 	else
513 		error = zvol_geom_close(pp, flags, -count);
514 	g_topology_lock();
515 	return (error);
516 }
517 
518 static void
519 zvol_geom_worker(void *arg)
520 {
521 	zvol_state_t *zv = arg;
522 	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
523 	struct bio *bp;
524 
525 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
526 
527 	thread_lock(curthread);
528 	sched_prio(curthread, PRIBIO);
529 	thread_unlock(curthread);
530 
531 	for (;;) {
532 		mtx_lock(&zsg->zsg_queue_mtx);
533 		bp = bioq_takefirst(&zsg->zsg_queue);
534 		if (bp == NULL) {
535 			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
536 				zsg->zsg_state = ZVOL_GEOM_RUNNING;
537 				wakeup(&zsg->zsg_state);
538 				mtx_unlock(&zsg->zsg_queue_mtx);
539 				kthread_exit();
540 			}
541 			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
542 			    PRIBIO | PDROP, "zvol:io", 0);
543 			continue;
544 		}
545 		mtx_unlock(&zsg->zsg_queue_mtx);
546 		zvol_geom_bio_strategy(bp);
547 	}
548 }
549 
550 static void
551 zvol_geom_bio_start(struct bio *bp)
552 {
553 	zvol_state_t *zv = bp->bio_to->private;
554 	struct zvol_state_geom *zsg;
555 	boolean_t first;
556 
557 	if (zv == NULL) {
558 		g_io_deliver(bp, ENXIO);
559 		return;
560 	}
561 	if (bp->bio_cmd == BIO_GETATTR) {
562 		if (zvol_geom_bio_getattr(bp))
563 			g_io_deliver(bp, EOPNOTSUPP);
564 		return;
565 	}
566 
567 	if (!THREAD_CAN_SLEEP()) {
568 		zsg = &zv->zv_zso->zso_geom;
569 		mtx_lock(&zsg->zsg_queue_mtx);
570 		first = (bioq_first(&zsg->zsg_queue) == NULL);
571 		bioq_insert_tail(&zsg->zsg_queue, bp);
572 		mtx_unlock(&zsg->zsg_queue_mtx);
573 		if (first)
574 			wakeup_one(&zsg->zsg_queue);
575 		return;
576 	}
577 
578 	zvol_geom_bio_strategy(bp);
579 }
580 
581 static int
582 zvol_geom_bio_getattr(struct bio *bp)
583 {
584 	zvol_state_t *zv;
585 
586 	zv = bp->bio_to->private;
587 	ASSERT3P(zv, !=, NULL);
588 
589 	spa_t *spa = dmu_objset_spa(zv->zv_objset);
590 	uint64_t refd, avail, usedobjs, availobjs;
591 
592 	if (g_handleattr_int(bp, "GEOM::candelete", 1))
593 		return (0);
594 	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
595 		dmu_objset_space(zv->zv_objset, &refd, &avail,
596 		    &usedobjs, &availobjs);
597 		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
598 			return (0);
599 	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
600 		dmu_objset_space(zv->zv_objset, &refd, &avail,
601 		    &usedobjs, &availobjs);
602 		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
603 			return (0);
604 	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
605 		avail = metaslab_class_get_space(spa_normal_class(spa));
606 		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
607 		if (g_handleattr_off_t(bp, "poolblocksavail",
608 		    avail / DEV_BSIZE))
609 			return (0);
610 	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
611 		refd = metaslab_class_get_alloc(spa_normal_class(spa));
612 		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
613 			return (0);
614 	}
615 	return (1);
616 }
617 
618 static void
619 zvol_filter_detach(struct knote *kn)
620 {
621 	zvol_state_t *zv;
622 	struct zvol_state_dev *zsd;
623 
624 	zv = kn->kn_hook;
625 	zsd = &zv->zv_zso->zso_dev;
626 
627 	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
628 }
629 
630 static int
631 zvol_filter_vnode(struct knote *kn, long hint)
632 {
633 	kn->kn_fflags |= kn->kn_sfflags & hint;
634 
635 	return (kn->kn_fflags != 0);
636 }
637 
638 static int
639 zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
640 {
641 	zvol_state_t *zv;
642 	struct zvol_state_dev *zsd;
643 
644 	zv = dev->si_drv2;
645 	zsd = &zv->zv_zso->zso_dev;
646 
647 	if (kn->kn_filter != EVFILT_VNODE)
648 		return (EINVAL);
649 
650 	/* XXX: extend support for other NOTE_* events */
651 	if (kn->kn_sfflags != NOTE_ATTRIB)
652 		return (EINVAL);
653 
654 	kn->kn_fop = &zvol_filterops_vnode;
655 	kn->kn_hook = zv;
656 	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);
657 
658 	return (0);
659 }
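
/*
 * Userland sketch (illustrative, error handling omitted): waiting for the
 * NOTE_ATTRIB that zvol_os_update_volsize() posts on a volmode=dev node.
 * The device path is hypothetical.
 *
 *	int fd = open("/dev/zvol/tank/vol0", O_RDONLY);
 *	int kq = kqueue();
 *	struct kevent ev;
 *	EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR, NOTE_ATTRIB, 0, 0);
 *	(void) kevent(kq, &ev, 1, NULL, 0, NULL);	(register the filter)
 *	(void) kevent(kq, NULL, 0, &ev, 1, NULL);	(blocks until a resize)
 */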
660 
661 static void
662 zvol_geom_bio_strategy(struct bio *bp)
663 {
664 	zvol_state_t *zv;
665 	uint64_t off, volsize;
666 	size_t resid;
667 	char *addr;
668 	objset_t *os;
669 	zfs_locked_range_t *lr;
670 	int error = 0;
671 	boolean_t doread = B_FALSE;
672 	boolean_t is_dumpified;
673 	boolean_t commit;
674 
675 	if (bp->bio_to)
676 		zv = bp->bio_to->private;
677 	else
678 		zv = bp->bio_dev->si_drv2;
679 
680 	if (zv == NULL) {
681 		error = SET_ERROR(ENXIO);
682 		goto out;
683 	}
684 
685 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
686 
687 	if (zv->zv_flags & ZVOL_REMOVING) {
688 		error = SET_ERROR(ENXIO);
689 		goto resume;
690 	}
691 
692 	switch (bp->bio_cmd) {
693 	case BIO_READ:
694 		doread = B_TRUE;
695 		break;
696 	case BIO_WRITE:
697 	case BIO_FLUSH:
698 	case BIO_DELETE:
699 		if (zv->zv_flags & ZVOL_RDONLY) {
700 			error = SET_ERROR(EROFS);
701 			goto resume;
702 		}
703 		zvol_ensure_zilog(zv);
704 		if (bp->bio_cmd == BIO_FLUSH)
705 			goto commit;
706 		break;
707 	default:
708 		error = SET_ERROR(EOPNOTSUPP);
709 		goto resume;
710 	}
711 
712 	off = bp->bio_offset;
713 	volsize = zv->zv_volsize;
714 
715 	os = zv->zv_objset;
716 	ASSERT3P(os, !=, NULL);
717 
718 	addr = bp->bio_data;
719 	resid = bp->bio_length;
720 
721 	if (resid > 0 && off >= volsize) {
722 		error = SET_ERROR(EIO);
723 		goto resume;
724 	}
725 
726 	is_dumpified = B_FALSE;
727 	commit = !doread && !is_dumpified &&
728 	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
729 
730 	/*
731 	 * There must be no buffer changes when doing a dmu_sync() because
732 	 * we can't change the data whilst calculating the checksum.
733 	 */
734 	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
735 	    doread ? RL_READER : RL_WRITER);
736 
737 	if (bp->bio_cmd == BIO_DELETE) {
738 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
739 		error = dmu_tx_assign(tx, TXG_WAIT);
740 		if (error != 0) {
741 			dmu_tx_abort(tx);
742 		} else {
743 			zvol_log_truncate(zv, tx, off, resid);
744 			dmu_tx_commit(tx);
745 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
746 			    off, resid);
747 			resid = 0;
748 		}
749 		goto unlock;
750 	}
751 	while (resid != 0 && off < volsize) {
752 		size_t size = MIN(resid, zvol_maxphys);
753 		if (doread) {
754 			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
755 			    DMU_READ_PREFETCH);
756 		} else {
757 			dmu_tx_t *tx = dmu_tx_create(os);
758 			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
759 			error = dmu_tx_assign(tx, TXG_WAIT);
760 			if (error) {
761 				dmu_tx_abort(tx);
762 			} else {
763 				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
764 				zvol_log_write(zv, tx, off, size, commit);
765 				dmu_tx_commit(tx);
766 			}
767 		}
768 		if (error) {
769 			/* Convert checksum errors into IO errors. */
770 			if (error == ECKSUM)
771 				error = SET_ERROR(EIO);
772 			break;
773 		}
774 		off += size;
775 		addr += size;
776 		resid -= size;
777 	}
778 unlock:
779 	zfs_rangelock_exit(lr);
780 
781 	bp->bio_completed = bp->bio_length - resid;
782 	if (bp->bio_completed < bp->bio_length && off > volsize)
783 		error = SET_ERROR(EINVAL);
784 
785 	switch (bp->bio_cmd) {
786 	case BIO_FLUSH:
787 		break;
788 	case BIO_READ:
789 		dataset_kstats_update_read_kstats(&zv->zv_kstat,
790 		    bp->bio_completed);
791 		break;
792 	case BIO_WRITE:
793 		dataset_kstats_update_write_kstats(&zv->zv_kstat,
794 		    bp->bio_completed);
795 		break;
796 	case BIO_DELETE:
797 		break;
798 	default:
799 		break;
800 	}
801 
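	/*
	 * BIO_FLUSH jumps to the commit label below via the goto above, so
	 * it forces a zil_commit() even though "commit" was never computed
	 * for it; every path needing durability funnels through this one
	 * call.
	 */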
802 	if (commit) {
803 commit:
804 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
805 	}
806 resume:
807 	rw_exit(&zv->zv_suspend_lock);
808 out:
809 	if (bp->bio_to)
810 		g_io_deliver(bp, error);
811 	else
812 		biofinish(bp, NULL, error);
813 }
814 
815 /*
816  * Character device mode implementation
817  */
818 
819 static int
820 zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
821 {
822 	zvol_state_t *zv;
823 	uint64_t volsize;
824 	zfs_locked_range_t *lr;
825 	int error = 0;
826 	zfs_uio_t uio;
827 
828 	zfs_uio_init(&uio, uio_s);
829 
830 	zv = dev->si_drv2;
831 
832 	volsize = zv->zv_volsize;
833 	/*
834 	 * uio_loffset == volsize isn't an error as
835 	 * it's required for EOF processing.
836 	 */
837 	if (zfs_uio_resid(&uio) > 0 &&
838 	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
839 		return (SET_ERROR(EIO));
840 
841 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
842 	ssize_t start_resid = zfs_uio_resid(&uio);
843 	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
844 	    zfs_uio_resid(&uio), RL_READER);
845 	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
846 		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
847 
848 		/* Don't read past the end. */
849 		if (bytes > volsize - zfs_uio_offset(&uio))
850 			bytes = volsize - zfs_uio_offset(&uio);
851 
852 		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
853 		if (error) {
854 			/* Convert checksum errors into IO errors. */
855 			if (error == ECKSUM)
856 				error = SET_ERROR(EIO);
857 			break;
858 		}
859 	}
860 	zfs_rangelock_exit(lr);
861 	int64_t nread = start_resid - zfs_uio_resid(&uio);
862 	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
863 	rw_exit(&zv->zv_suspend_lock);
864 
865 	return (error);
866 }
867 
868 static int
869 zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
870 {
871 	zvol_state_t *zv;
872 	uint64_t volsize;
873 	zfs_locked_range_t *lr;
874 	int error = 0;
875 	boolean_t commit;
876 	zfs_uio_t uio;
877 
878 	zv = dev->si_drv2;
879 
880 	volsize = zv->zv_volsize;
881 
882 	zfs_uio_init(&uio, uio_s);
883 
884 	if (zfs_uio_resid(&uio) > 0 &&
885 	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
886 		return (SET_ERROR(EIO));
887 
888 	ssize_t start_resid = zfs_uio_resid(&uio);
889 	commit = (ioflag & IO_SYNC) ||
890 	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
891 
892 	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
893 	zvol_ensure_zilog(zv);
894 
895 	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
896 	    zfs_uio_resid(&uio), RL_WRITER);
897 	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
898 		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
899 		uint64_t off = zfs_uio_offset(&uio);
900 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
901 
902 		if (bytes > volsize - off)	/* Don't write past the end. */
903 			bytes = volsize - off;
904 
905 		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
906 		error = dmu_tx_assign(tx, TXG_WAIT);
907 		if (error) {
908 			dmu_tx_abort(tx);
909 			break;
910 		}
911 		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
912 		if (error == 0)
913 			zvol_log_write(zv, tx, off, bytes, commit);
914 		dmu_tx_commit(tx);
915 
916 		if (error)
917 			break;
918 	}
919 	zfs_rangelock_exit(lr);
920 	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
921 	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
922 	if (commit)
923 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
924 	rw_exit(&zv->zv_suspend_lock);
925 
926 	return (error);
927 }
928 
929 static int
930 zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
931 {
932 	zvol_state_t *zv;
933 	int err = 0;
934 	boolean_t drop_suspend = B_FALSE;
935 
936 retry:
937 	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
938 	/*
939 	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
940 	 * the result of zvol free code setting si_drv2 to NULL is observed,
941 	 * or the zv is protected from being freed because of the positive
942 	 * zv_open_count.
943 	 */
944 	zv = dev->si_drv2;
945 	if (zv == NULL) {
946 		rw_exit(&zvol_state_lock);
947 		err = SET_ERROR(ENXIO);
948 		goto out_locked;
949 	}
950 
951 	mutex_enter(&zv->zv_state_lock);
952 	if (zv->zv_zso->zso_dying) {
953 		rw_exit(&zvol_state_lock);
954 		err = SET_ERROR(ENXIO);
955 		goto out_zv_locked;
956 	}
957 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
958 
959 	/*
960 	 * Make sure zvol is not suspended during first open
961 	 * (hold zv_suspend_lock) and respect proper lock acquisition
962 	 * ordering - zv_suspend_lock before zv_state_lock.
963 	 */
964 	if (zv->zv_open_count == 0) {
965 		drop_suspend = B_TRUE;
966 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
967 			mutex_exit(&zv->zv_state_lock);
968 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
969 			mutex_enter(&zv->zv_state_lock);
970 			/* Check to see if zv_suspend_lock is needed. */
971 			if (zv->zv_open_count != 0) {
972 				rw_exit(&zv->zv_suspend_lock);
973 				drop_suspend = B_FALSE;
974 			}
975 		}
976 	}
977 	rw_exit(&zvol_state_lock);
978 
979 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
980 
981 	if (zv->zv_open_count == 0) {
982 		boolean_t drop_namespace = B_FALSE;
983 
984 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
985 
986 		/*
987 		 * Take spa_namespace_lock to prevent lock inversion when
988 		 * zvols from one pool are opened as vdevs in another.
989 		 */
990 		if (!mutex_owned(&spa_namespace_lock)) {
991 			if (!mutex_tryenter(&spa_namespace_lock)) {
992 				mutex_exit(&zv->zv_state_lock);
993 				rw_exit(&zv->zv_suspend_lock);
994 				drop_suspend = B_FALSE;
995 				kern_yield(PRI_USER);
996 				goto retry;
997 			} else {
998 				drop_namespace = B_TRUE;
999 			}
1000 		}
1001 		err = zvol_first_open(zv, !(flags & FWRITE));
1002 		if (drop_namespace)
1003 			mutex_exit(&spa_namespace_lock);
1004 		if (err)
1005 			goto out_zv_locked;
1006 	}
1007 
1008 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1009 
1010 	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
1011 		err = SET_ERROR(EROFS);
1012 		goto out_opened;
1013 	}
1014 	if (zv->zv_flags & ZVOL_EXCL) {
1015 		err = SET_ERROR(EBUSY);
1016 		goto out_opened;
1017 	}
1018 	if (flags & O_EXCL) {
1019 		if (zv->zv_open_count != 0) {
1020 			err = SET_ERROR(EBUSY);
1021 			goto out_opened;
1022 		}
1023 		zv->zv_flags |= ZVOL_EXCL;
1024 	}
1025 
1026 	zv->zv_open_count++;
1027 out_opened:
1028 	if (zv->zv_open_count == 0) {
1029 		zvol_last_close(zv);
1030 		wakeup(zv);
1031 	}
1032 out_zv_locked:
1033 	mutex_exit(&zv->zv_state_lock);
1034 out_locked:
1035 	if (drop_suspend)
1036 		rw_exit(&zv->zv_suspend_lock);
1037 	return (err);
1038 }
1039 
1040 static int
1041 zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
1042 {
1043 	zvol_state_t *zv;
1044 	boolean_t drop_suspend = B_TRUE;
1045 
1046 	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
1047 	zv = dev->si_drv2;
1048 	if (zv == NULL) {
1049 		rw_exit(&zvol_state_lock);
1050 		return (SET_ERROR(ENXIO));
1051 	}
1052 
1053 	mutex_enter(&zv->zv_state_lock);
1054 	if (zv->zv_flags & ZVOL_EXCL) {
1055 		ASSERT3U(zv->zv_open_count, ==, 1);
1056 		zv->zv_flags &= ~ZVOL_EXCL;
1057 	}
1058 
1059 	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
1060 
1061 	/*
1062 	 * If the open count is zero, this is a spurious close.
1063 	 * That indicates a bug in the kernel / DDI framework.
1064 	 */
1065 	ASSERT3U(zv->zv_open_count, >, 0);
1066 	/*
1067 	 * Make sure zvol is not suspended during last close
1068 	 * (hold zv_suspend_lock) and respect proper lock acquisition
1069 	 * ordering - zv_suspend_lock before zv_state_lock.
1070 	 */
1071 	if (zv->zv_open_count == 1) {
1072 		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
1073 			mutex_exit(&zv->zv_state_lock);
1074 			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1075 			mutex_enter(&zv->zv_state_lock);
1076 			/* Check to see if zv_suspend_lock is needed. */
1077 			if (zv->zv_open_count != 1) {
1078 				rw_exit(&zv->zv_suspend_lock);
1079 				drop_suspend = B_FALSE;
1080 			}
1081 		}
1082 	} else {
1083 		drop_suspend = B_FALSE;
1084 	}
1085 	rw_exit(&zvol_state_lock);
1086 
1087 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1088 
1089 	/*
1090 	 * You may get multiple opens, but only one close.
1091 	 */
1092 	zv->zv_open_count--;
1093 
1094 	if (zv->zv_open_count == 0) {
1095 		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1096 		zvol_last_close(zv);
1097 		wakeup(zv);
1098 	}
1099 
1100 	mutex_exit(&zv->zv_state_lock);
1101 
1102 	if (drop_suspend)
1103 		rw_exit(&zv->zv_suspend_lock);
1104 	return (0);
1105 }
1106 
1107 static int
1108 zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
1109     int fflag, struct thread *td)
1110 {
1111 	zvol_state_t *zv;
1112 	zfs_locked_range_t *lr;
1113 	off_t offset, length;
1114 	int error;
1115 	boolean_t sync;
1116 
1117 	zv = dev->si_drv2;
1118 
1119 	error = 0;
1120 	KASSERT(zv->zv_open_count > 0,
1121 	    ("Device with zero access count in %s", __func__));
1122 
1123 	switch (cmd) {
1124 	case DIOCGSECTORSIZE:
1125 		*(uint32_t *)data = DEV_BSIZE;
1126 		break;
1127 	case DIOCGMEDIASIZE:
1128 		*(off_t *)data = zv->zv_volsize;
1129 		break;
1130 	case DIOCGFLUSH:
1131 		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1132 		if (zv->zv_zilog != NULL)
1133 			zil_commit(zv->zv_zilog, ZVOL_OBJ);
1134 		rw_exit(&zv->zv_suspend_lock);
1135 		break;
1136 	case DIOCGDELETE:
1137 		if (!zvol_unmap_enabled)
1138 			break;
1139 
1140 		offset = ((off_t *)data)[0];
1141 		length = ((off_t *)data)[1];
1142 		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
1143 		    offset < 0 || offset >= zv->zv_volsize ||
1144 		    length <= 0) {
1145 			printf("%s: offset=%jd length=%jd\n", __func__, offset,
1146 			    length);
1147 			error = SET_ERROR(EINVAL);
1148 			break;
1149 		}
1150 		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1151 		zvol_ensure_zilog(zv);
1152 		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
1153 		    RL_WRITER);
1154 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1155 		error = dmu_tx_assign(tx, TXG_WAIT);
1156 		if (error != 0) {
1157 			sync = FALSE;
1158 			dmu_tx_abort(tx);
1159 		} else {
1160 			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1161 			zvol_log_truncate(zv, tx, offset, length);
1162 			dmu_tx_commit(tx);
1163 			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1164 			    offset, length);
1165 		}
1166 		zfs_rangelock_exit(lr);
1167 		if (sync)
1168 			zil_commit(zv->zv_zilog, ZVOL_OBJ);
1169 		rw_exit(&zv->zv_suspend_lock);
1170 		break;
1171 	case DIOCGSTRIPESIZE:
1172 		*(off_t *)data = zv->zv_volblocksize;
1173 		break;
1174 	case DIOCGSTRIPEOFFSET:
1175 		*(off_t *)data = 0;
1176 		break;
1177 	case DIOCGATTR: {
1178 		spa_t *spa = dmu_objset_spa(zv->zv_objset);
1179 		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
1180 		uint64_t refd, avail, usedobjs, availobjs;
1181 
1182 		if (strcmp(arg->name, "GEOM::candelete") == 0)
1183 			arg->value.i = 1;
1184 		else if (strcmp(arg->name, "blocksavail") == 0) {
1185 			dmu_objset_space(zv->zv_objset, &refd, &avail,
1186 			    &usedobjs, &availobjs);
1187 			arg->value.off = avail / DEV_BSIZE;
1188 		} else if (strcmp(arg->name, "blocksused") == 0) {
1189 			dmu_objset_space(zv->zv_objset, &refd, &avail,
1190 			    &usedobjs, &availobjs);
1191 			arg->value.off = refd / DEV_BSIZE;
1192 		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
1193 			avail = metaslab_class_get_space(spa_normal_class(spa));
1194 			avail -= metaslab_class_get_alloc(
1195 			    spa_normal_class(spa));
1196 			arg->value.off = avail / DEV_BSIZE;
1197 		} else if (strcmp(arg->name, "poolblocksused") == 0) {
1198 			refd = metaslab_class_get_alloc(spa_normal_class(spa));
1199 			arg->value.off = refd / DEV_BSIZE;
1200 		} else
1201 			error = SET_ERROR(ENOIOCTL);
1202 		break;
1203 	}
1204 	case FIOSEEKHOLE:
1205 	case FIOSEEKDATA: {
1206 		off_t *off = (off_t *)data;
1207 		uint64_t noff;
1208 		boolean_t hole;
1209 
1210 		hole = (cmd == FIOSEEKHOLE);
1211 		noff = *off;
1212 		lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
1213 		    RL_READER);
1214 		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
1215 		zfs_rangelock_exit(lr);
1216 		*off = noff;
1217 		break;
1218 	}
1219 	default:
1220 		error = SET_ERROR(ENOIOCTL);
1221 	}
1222 
1223 	return (error);
1224 }
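
/*
 * Illustrative userland use of the ioctls handled above (the device path
 * is hypothetical):
 *
 *	int fd = open("/dev/zvol/tank/vol0", O_RDWR);
 *	off_t size;
 *	(void) ioctl(fd, DIOCGMEDIASIZE, &size);	(volume size in bytes)
 *	off_t range[2] = { 0, 1024 * 1024 };		(offset, length)
 *	(void) ioctl(fd, DIOCGDELETE, range);		(TRIM the first MiB)
 */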
1225 
1226 /*
1227  * Misc. helpers
1228  */
1229 
1230 static void
1231 zvol_ensure_zilog(zvol_state_t *zv)
1232 {
1233 	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1234 
1235 	/*
1236 	 * Open a ZIL if this is the first time we have written to this
1237 	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
1238 	 * than zv_state_lock so that we don't need to acquire an
1239 	 * additional lock in this path.
1240 	 */
1241 	if (zv->zv_zilog == NULL) {
1242 		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
1243 			rw_exit(&zv->zv_suspend_lock);
1244 			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
1245 		}
1246 		if (zv->zv_zilog == NULL) {
1247 			zv->zv_zilog = zil_open(zv->zv_objset,
1248 			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1249 			zv->zv_flags |= ZVOL_WRITTEN_TO;
1250 			/* replay / destroy done in zvol_os_create_minor() */
1251 			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
1252 			    ZIL_REPLAY_NEEDED);
1253 		}
1254 		rw_downgrade(&zv->zv_suspend_lock);
1255 	}
1256 }
1257 
1258 boolean_t
1259 zvol_os_is_zvol(const char *device)
1260 {
1261 	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
1262 }
1263 
1264 void
1265 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
1266 {
1267 	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1268 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1269 
1270 	/* Move to a new hashtable entry.  */
1271 	zv->zv_hash = zvol_name_hash(newname);
1272 	hlist_del(&zv->zv_hlink);
1273 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1274 
1275 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1276 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1277 		struct g_provider *pp = zsg->zsg_provider;
1278 		struct g_geom *gp;
1279 
1280 		g_topology_lock();
1281 		gp = pp->geom;
1282 		ASSERT3P(gp, !=, NULL);
1283 
1284 		zsg->zsg_provider = NULL;
1285 		g_wither_provider(pp, ENXIO);
1286 
1287 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
1288 		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1289 		pp->sectorsize = DEV_BSIZE;
1290 		pp->mediasize = zv->zv_volsize;
1291 		pp->private = zv;
1292 		zsg->zsg_provider = pp;
1293 		g_error_provider(pp, 0);
1294 		g_topology_unlock();
1295 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1296 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1297 		struct cdev *dev;
1298 		struct make_dev_args args;
1299 
1300 		dev = zsd->zsd_cdev;
1301 		if (dev != NULL) {
1302 			destroy_dev(dev);
1303 			dev = zsd->zsd_cdev = NULL;
1304 			if (zv->zv_open_count > 0) {
1305 				zv->zv_flags &= ~ZVOL_EXCL;
1306 				zv->zv_open_count = 0;
1307 				/* XXX  need suspend lock but lock order */
1308 				zvol_last_close(zv);
1309 			}
1310 		}
1311 
1312 		make_dev_args_init(&args);
1313 		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1314 		args.mda_devsw = &zvol_cdevsw;
1315 		args.mda_cr = NULL;
1316 		args.mda_uid = UID_ROOT;
1317 		args.mda_gid = GID_OPERATOR;
1318 		args.mda_mode = 0640;
1319 		args.mda_si_drv2 = zv;
1320 		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
1321 		    == 0) {
1322 			dev->si_iosize_max = maxphys;
1323 			zsd->zsd_cdev = dev;
1324 		}
1325 	}
1326 	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1327 	dataset_kstats_rename(&zv->zv_kstat, newname);
1328 }
1329 
1330 /*
1331  * Remove minor node for the specified volume.
1332  */
1333 void
1334 zvol_os_free(zvol_state_t *zv)
1335 {
1336 	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1337 	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
1338 	ASSERT0(zv->zv_open_count);
1339 
1340 	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
1341 
1342 	rw_destroy(&zv->zv_suspend_lock);
1343 	zfs_rangelock_fini(&zv->zv_rangelock);
1344 
1345 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1346 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1347 		struct g_provider *pp __maybe_unused = zsg->zsg_provider;
1348 
1349 		ASSERT3P(pp->private, ==, NULL);
1350 
1351 		g_topology_lock();
1352 		zvol_geom_destroy(zv);
1353 		g_topology_unlock();
1354 		mtx_destroy(&zsg->zsg_queue_mtx);
1355 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1356 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1357 		struct cdev *dev = zsd->zsd_cdev;
1358 
1359 		if (dev != NULL) {
1360 			ASSERT3P(dev->si_drv2, ==, NULL);
1361 			destroy_dev(dev);
1362 			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
1363 			knlist_destroy(&zsd->zsd_selinfo.si_note);
1364 		}
1365 	}
1366 
1367 	mutex_destroy(&zv->zv_state_lock);
1368 	cv_destroy(&zv->zv_removing_cv);
1369 	dataset_kstats_destroy(&zv->zv_kstat);
1370 	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
1371 	kmem_free(zv, sizeof (zvol_state_t));
1372 	zvol_minors--;
1373 }
1374 
1375 /*
1376  * Create a minor node (plus a whole lot more) for the specified volume.
1377  */
1378 int
1379 zvol_os_create_minor(const char *name)
1380 {
1381 	zvol_state_t *zv;
1382 	objset_t *os;
1383 	dmu_object_info_t *doi;
1384 	uint64_t volsize;
1385 	uint64_t volmode, hash;
1386 	int error;
1387 	bool replayed_zil = B_FALSE;
1388 
1389 	ZFS_LOG(1, "Creating ZVOL %s...", name);
1390 	hash = zvol_name_hash(name);
1391 	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
1392 		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1393 		mutex_exit(&zv->zv_state_lock);
1394 		return (SET_ERROR(EEXIST));
1395 	}
1396 
1397 	DROP_GIANT();
1398 
1399 	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1400 
1401 	/* Lie and say we're read-only. */
1402 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
1403 	if (error)
1404 		goto out_doi;
1405 
1406 	error = dmu_object_info(os, ZVOL_OBJ, doi);
1407 	if (error)
1408 		goto out_dmu_objset_disown;
1409 
1410 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1411 	if (error)
1412 		goto out_dmu_objset_disown;
1413 
1414 	error = dsl_prop_get_integer(name,
1415 	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
1416 	if (error || volmode == ZFS_VOLMODE_DEFAULT)
1417 		volmode = zvol_volmode;
1418 	error = 0;
1419 
1420 	/*
1421 	 * zvol_alloc equivalent ...
1422 	 */
1423 	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
1424 	zv->zv_hash = hash;
1425 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
1426 	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
1427 	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
1428 	zv->zv_volmode = volmode;
1429 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1430 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1431 		struct g_provider *pp;
1432 		struct g_geom *gp;
1433 
1434 		zsg->zsg_state = ZVOL_GEOM_UNINIT;
1435 		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);
1436 
1437 		g_topology_lock();
1438 		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
1439 		gp->start = zvol_geom_bio_start;
1440 		gp->access = zvol_geom_access;
1441 		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
1442 		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1443 		pp->sectorsize = DEV_BSIZE;
1444 		pp->mediasize = 0;
1445 		pp->private = zv;
1446 
1447 		zsg->zsg_provider = pp;
1448 		bioq_init(&zsg->zsg_queue);
1449 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1450 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1451 		struct cdev *dev;
1452 		struct make_dev_args args;
1453 
1454 		make_dev_args_init(&args);
1455 		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1456 		args.mda_devsw = &zvol_cdevsw;
1457 		args.mda_cr = NULL;
1458 		args.mda_uid = UID_ROOT;
1459 		args.mda_gid = GID_OPERATOR;
1460 		args.mda_mode = 0640;
1461 		args.mda_si_drv2 = zv;
1462 		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
1463 		    == 0) {
1464 			dev->si_iosize_max = maxphys;
1465 			zsd->zsd_cdev = dev;
1466 			knlist_init_sx(&zsd->zsd_selinfo.si_note,
1467 			    &zv->zv_state_lock);
1468 		}
1469 	}
1470 	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
1471 	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
1472 	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
1473 
1474 	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
1475 		zv->zv_flags |= ZVOL_RDONLY;
1476 
1477 	zv->zv_volblocksize = doi->doi_data_block_size;
1478 	zv->zv_volsize = volsize;
1479 	zv->zv_objset = os;
1480 
1481 	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
1482 	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1483 	if (error)
1484 		goto out_dmu_objset_disown;
1485 	ASSERT3P(zv->zv_zilog, ==, NULL);
1486 	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1487 	if (spa_writeable(dmu_objset_spa(os))) {
1488 		if (zil_replay_disable)
1489 			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
1490 		else
1491 			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
1492 	}
1493 	if (replayed_zil)
1494 		zil_close(zv->zv_zilog);
1495 	zv->zv_zilog = NULL;
1496 
1497 	/* TODO: prefetch for geom tasting */
1498 
1499 	zv->zv_objset = NULL;
1500 out_dmu_objset_disown:
1501 	dmu_objset_disown(os, B_TRUE, FTAG);
1502 
1503 	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
1504 		zvol_geom_run(zv);
1505 		g_topology_unlock();
1506 	}
1507 out_doi:
1508 	kmem_free(doi, sizeof (dmu_object_info_t));
1509 	if (error == 0) {
1510 		rw_enter(&zvol_state_lock, RW_WRITER);
1511 		zvol_insert(zv);
1512 		zvol_minors++;
1513 		rw_exit(&zvol_state_lock);
1514 		ZFS_LOG(1, "ZVOL %s created.", name);
1515 	}
1516 	PICKUP_GIANT();
1517 	return (error);
1518 }
1519 
1520 void
1521 zvol_os_clear_private(zvol_state_t *zv)
1522 {
1523 	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1524 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1525 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1526 		struct g_provider *pp = zsg->zsg_provider;
1527 
1528 		if (pp->private == NULL) /* already cleared */
1529 			return;
1530 
1531 		mtx_lock(&zsg->zsg_queue_mtx);
1532 		zsg->zsg_state = ZVOL_GEOM_STOPPED;
1533 		pp->private = NULL;
1534 		wakeup_one(&zsg->zsg_queue);
1535 		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
1536 			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
1537 			    0, "zvol:w", 0);
1538 		mtx_unlock(&zsg->zsg_queue_mtx);
1539 		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1540 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1541 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1542 		struct cdev *dev = zsd->zsd_cdev;
1543 
1544 		if (dev != NULL)
1545 			dev->si_drv2 = NULL;
1546 	}
1547 }
1548 
1549 int
1550 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
1551 {
1552 	zv->zv_volsize = volsize;
1553 	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1554 		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1555 		struct g_provider *pp = zsg->zsg_provider;
1556 
1557 		g_topology_lock();
1558 
1559 		if (pp->private == NULL) {
1560 			g_topology_unlock();
1561 			return (SET_ERROR(ENXIO));
1562 		}
1563 
1564 		/*
1565 		 * Do not invoke resize event when initial size was zero.
1566 		 * ZVOL initializes the size on first open; this is not
1567 		 * real resizing.
1568 		 */
1569 		if (pp->mediasize == 0)
1570 			pp->mediasize = zv->zv_volsize;
1571 		else
1572 			g_resize_provider(pp, zv->zv_volsize);
1573 
1574 		g_topology_unlock();
1575 	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1576 		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1577 
1578 		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
1579 	}
1580 	return (0);
1581 }
1582 
1583 void
1584 zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
1585 {
1586 	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
1587 }
1588 
1589 void
1590 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
1591 {
1592 	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
1593 }
1594 
1595 /*
1596  * Public interfaces
1597  */
1598 
1599 int
1600 zvol_busy(void)
1601 {
1602 	return (zvol_minors != 0);
1603 }
1604 
1605 int
1606 zvol_init(void)
1607 {
1608 	zvol_init_impl();
1609 	return (0);
1610 }
1611 
1612 void
1613 zvol_fini(void)
1614 {
1615 	zvol_fini_impl();
1616 }
1617