/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright (c) 2024, Klara, Inc.
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD, zvols are GEOM providers like any other storage device
 * in the system, except when the volmode property is set to "dev", in
 * which case they are exposed as plain character devices instead.
 */
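/*
 * For example (hypothetical pool and volume names):
 *
 *	# zfs create -V 10G tank/vol
 *	# newfs /dev/zvol/tank/vol
 */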

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>
#include <sys/freebsd_event.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define ZVOL_DUMPSIZE "dumpsize"

#ifdef ZVOL_LOCK_DEBUG
#define ZVOL_RW_READER RW_WRITER
#define ZVOL_RW_READ_HELD RW_WRITE_HELD
#else
#define ZVOL_RW_READER RW_READER
#define ZVOL_RW_READ_HELD RW_READ_HELD
#endif

enum zvol_geom_state {
        ZVOL_GEOM_UNINIT,
        ZVOL_GEOM_STOPPED,
        ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
#define zso_dev _zso_state._zso_dev
#define zso_geom _zso_state._zso_geom
        union {
                /* volmode=dev */
                struct zvol_state_dev {
                        struct cdev *zsd_cdev;
                        struct selinfo zsd_selinfo;
                } _zso_dev;

                /* volmode=geom */
                struct zvol_state_geom {
                        struct g_provider *zsg_provider;
                        struct bio_queue_head zsg_queue;
                        struct mtx zsg_queue_mtx;
                        enum zvol_geom_state zsg_state;
                } _zso_geom;
        } _zso_state;
        int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
    "Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
    "Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
    &zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t zvol_cdev_open;
static d_close_t zvol_cdev_close;
static d_ioctl_t zvol_cdev_ioctl;
static d_read_t zvol_cdev_read;
static d_write_t zvol_cdev_write;
static d_strategy_t zvol_geom_bio_strategy;
static d_kqfilter_t zvol_cdev_kqfilter;

static struct cdevsw zvol_cdevsw = {
        .d_name = "zvol",
        .d_version = D_VERSION,
        .d_flags = D_DISK | D_TRACKCLOSE,
        .d_open = zvol_cdev_open,
        .d_close = zvol_cdev_close,
        .d_ioctl = zvol_cdev_ioctl,
        .d_read = zvol_cdev_read,
        .d_write = zvol_cdev_write,
        .d_strategy = zvol_geom_bio_strategy,
        .d_kqfilter = zvol_cdev_kqfilter,
};

static void zvol_filter_detach(struct knote *kn);
static int zvol_filter_vnode(struct knote *kn, long hint);

static struct filterops zvol_filterops_vnode = {
        .f_isfd = 1,
        .f_detach = zvol_filter_detach,
        .f_event = zvol_filter_vnode,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
        .name = "ZFS::ZVOL",
        .version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

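/*
 * Open the GEOM provider.  Called (via zvol_geom_access()) with the GEOM
 * topology lock dropped; "count" is the number of access references being
 * added.  The first open also opens the underlying objset and publishes
 * the media and stripe sizes on the provider.
 */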
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
        zvol_state_t *zv;
        int err = 0;
        boolean_t drop_suspend = B_FALSE;

        if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
                /*
                 * If zfs_geom_probe_vdev_key is set, that means that zfs is
                 * attempting to probe geom providers while looking for a
                 * replacement for a missing VDEV.  In this case, the
                 * spa_namespace_lock will not be held, but it is still illegal
                 * to use a zvol as a vdev.  Deadlocks can result if another
                 * thread has spa_namespace_lock.
                 */
                return (SET_ERROR(EOPNOTSUPP));
        }

retry:
        rw_enter(&zvol_state_lock, ZVOL_RW_READER);
        /*
         * Obtain a copy of private under zvol_state_lock to make sure either
         * the result of zvol free code setting private to NULL is observed,
         * or the zv is protected from being freed because of the positive
         * zv_open_count.
         */
        zv = pp->private;
        if (zv == NULL) {
                rw_exit(&zvol_state_lock);
                err = SET_ERROR(ENXIO);
                goto out_locked;
        }

        mutex_enter(&zv->zv_state_lock);
        if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
                rw_exit(&zvol_state_lock);
                err = SET_ERROR(ENXIO);
                goto out_zv_locked;
        }
        ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

        /*
         * Make sure zvol is not suspended during first open
         * (hold zv_suspend_lock) and respect proper lock acquisition
         * ordering - zv_suspend_lock before zv_state_lock.
         */
        if (zv->zv_open_count == 0) {
                drop_suspend = B_TRUE;
                if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
                        mutex_exit(&zv->zv_state_lock);
                        rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
                        mutex_enter(&zv->zv_state_lock);
                        /* Check to see if zv_suspend_lock is needed. */
                        if (zv->zv_open_count != 0) {
                                rw_exit(&zv->zv_suspend_lock);
                                drop_suspend = B_FALSE;
                        }
                }
        }
        rw_exit(&zvol_state_lock);

        ASSERT(MUTEX_HELD(&zv->zv_state_lock));

        if (zv->zv_open_count == 0) {
                boolean_t drop_namespace = B_FALSE;

                ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

                /*
                 * Take spa_namespace_lock to prevent lock inversion when
                 * zvols from one pool are opened as vdevs in another.
                 */
                if (!mutex_owned(&spa_namespace_lock)) {
                        if (!mutex_tryenter(&spa_namespace_lock)) {
                                mutex_exit(&zv->zv_state_lock);
                                rw_exit(&zv->zv_suspend_lock);
                                drop_suspend = B_FALSE;
                                kern_yield(PRI_USER);
                                goto retry;
                        } else {
                                drop_namespace = B_TRUE;
                        }
                }
                err = zvol_first_open(zv, !(flag & FWRITE));
                if (drop_namespace)
                        mutex_exit(&spa_namespace_lock);
                if (err)
                        goto out_zv_locked;
                pp->mediasize = zv->zv_volsize;
                pp->stripeoffset = 0;
                pp->stripesize = zv->zv_volblocksize;
        }

        ASSERT(MUTEX_HELD(&zv->zv_state_lock));

        /*
         * Check for a bad on-disk format version now since we
         * lied about owning the dataset readonly before.
         */
        if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
            dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
                err = SET_ERROR(EROFS);
                goto out_opened;
        }
        if (zv->zv_flags & ZVOL_EXCL) {
                err = SET_ERROR(EBUSY);
                goto out_opened;
        }
        if (flag & O_EXCL) {
                if (zv->zv_open_count != 0) {
                        err = SET_ERROR(EBUSY);
                        goto out_opened;
                }
                zv->zv_flags |= ZVOL_EXCL;
        }

        zv->zv_open_count += count;
out_opened:
        if (zv->zv_open_count == 0) {
                zvol_last_close(zv);
                wakeup(zv);
        }
out_zv_locked:
        mutex_exit(&zv->zv_state_lock);
out_locked:
        if (drop_suspend)
                rw_exit(&zv->zv_suspend_lock);
        return (err);
}

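/*
 * Drop "count" access references from the GEOM provider; the last close
 * releases the underlying objset via zvol_last_close() and wakes any
 * thread sleeping in zvol_wait_close().
 */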
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
        (void) flag;
        zvol_state_t *zv;
        boolean_t drop_suspend = B_TRUE;
        int new_open_count;

        rw_enter(&zvol_state_lock, ZVOL_RW_READER);
        zv = pp->private;
        if (zv == NULL) {
                rw_exit(&zvol_state_lock);
                return (SET_ERROR(ENXIO));
        }

        mutex_enter(&zv->zv_state_lock);
        if (zv->zv_flags & ZVOL_EXCL) {
                ASSERT3U(zv->zv_open_count, ==, 1);
                zv->zv_flags &= ~ZVOL_EXCL;
        }

        ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

        /*
         * If the open count is zero, this is a spurious close.
         * That indicates a bug in the kernel / DDI framework.
         */
        ASSERT3U(zv->zv_open_count, >, 0);

        /*
         * Make sure zvol is not suspended during last close
         * (hold zv_suspend_lock) and respect proper lock acquisition
         * ordering - zv_suspend_lock before zv_state_lock.
         */
        new_open_count = zv->zv_open_count - count;
        if (new_open_count == 0) {
                if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
                        mutex_exit(&zv->zv_state_lock);
                        rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
                        mutex_enter(&zv->zv_state_lock);
                        /* Check to see if zv_suspend_lock is needed. */
                        new_open_count = zv->zv_open_count - count;
                        if (new_open_count != 0) {
                                rw_exit(&zv->zv_suspend_lock);
                                drop_suspend = B_FALSE;
                        }
                }
        } else {
                drop_suspend = B_FALSE;
        }
        rw_exit(&zvol_state_lock);

        ASSERT(MUTEX_HELD(&zv->zv_state_lock));

        /*
         * You may get multiple opens, but only one close.
         */
        zv->zv_open_count = new_open_count;
        if (zv->zv_open_count == 0) {
                ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
                zvol_last_close(zv);
                wakeup(zv);
        }

        mutex_exit(&zv->zv_state_lock);

        if (drop_suspend)
                rw_exit(&zv->zv_suspend_lock);
        return (0);
}

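/*
 * Announce the provider as ready and start the worker thread that services
 * bios which cannot be handled in the caller's context.
 */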
static void
zvol_geom_run(zvol_state_t *zv)
{
        struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
        struct g_provider *pp = zsg->zsg_provider;

        ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

        g_error_provider(pp, 0);

        kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
            "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

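/*
 * Tear down the GEOM provider and its geom.  Requires the topology lock;
 * the worker thread must already have exited (see zvol_os_clear_private()).
 */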
static void
zvol_geom_destroy(zvol_state_t *zv)
{
        struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
        struct g_provider *pp = zsg->zsg_provider;

        ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

        g_topology_assert();

        mutex_enter(&zv->zv_state_lock);
        VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
        mutex_exit(&zv->zv_state_lock);
        zsg->zsg_provider = NULL;
        g_wither_geom(pp->geom, ENXIO);
}

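/*
 * Mark a GEOM-mode zvol as dying and wait (with a 10 second timeout) for
 * the last close, so the provider is idle before it is destroyed.
 */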
void
zvol_wait_close(zvol_state_t *zv)
{
        if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
                return;
        mutex_enter(&zv->zv_state_lock);
        zv->zv_zso->zso_dying = B_TRUE;

        if (zv->zv_open_count)
                msleep(zv, &zv->zv_state_lock,
                    PRIBIO, "zvol:dying", 10*hz);
        mutex_exit(&zv->zv_state_lock);
}

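/*
 * GEOM ->access method: translates the read/write/exclusive reference
 * deltas into zvol_geom_open()/zvol_geom_close() calls.
 */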
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
        int count, error, flags;

        g_topology_assert();

        /*
         * To make it easier we expect either open or close, but not both
         * at the same time.
         */
        KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
            (acr <= 0 && acw <= 0 && ace <= 0),
            ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
            pp->name, acr, acw, ace));

        if (pp->private == NULL) {
                if (acr <= 0 && acw <= 0 && ace <= 0)
                        return (0);
                return (pp->error);
        }

        /*
         * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
         * ace != 0, because GEOM already handles that and handles it a bit
         * differently.  GEOM allows for multiple read/exclusive consumers and
         * ZFS allows only one exclusive consumer, no matter if it is reader or
         * writer.  I like better the way GEOM works so I'll leave it for GEOM
         * to decide what to do.
         */

        count = acr + acw + ace;
        if (count == 0)
                return (0);

        flags = 0;
        if (acr != 0 || ace != 0)
                flags |= FREAD;
        if (acw != 0)
                flags |= FWRITE;

        g_topology_unlock();
        if (count > 0)
                error = zvol_geom_open(pp, flags, count);
        else
                error = zvol_geom_close(pp, flags, -count);
        g_topology_lock();
        return (error);
}

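/*
 * Per-volume worker thread.  Drains the bio queue filled by
 * zvol_geom_bio_start() and exits once zvol_os_clear_private() moves the
 * state machine to ZVOL_GEOM_STOPPED.
 */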
static void
zvol_geom_worker(void *arg)
{
        zvol_state_t *zv = arg;
        struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
        struct bio *bp;

        ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

        thread_lock(curthread);
        sched_prio(curthread, PRIBIO);
        thread_unlock(curthread);

        for (;;) {
                mtx_lock(&zsg->zsg_queue_mtx);
                bp = bioq_takefirst(&zsg->zsg_queue);
                if (bp == NULL) {
                        if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
                                zsg->zsg_state = ZVOL_GEOM_RUNNING;
                                wakeup(&zsg->zsg_state);
                                mtx_unlock(&zsg->zsg_queue_mtx);
                                kthread_exit();
                        }
                        msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
                            PRIBIO | PDROP, "zvol:io", 0);
                        continue;
                }
                mtx_unlock(&zsg->zsg_queue_mtx);
                zvol_geom_bio_strategy(bp);
        }
}

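/*
 * GEOM ->start method: answers BIO_GETATTR inline, and otherwise runs the
 * bio directly when the calling thread may sleep, or queues it for the
 * worker when it may not.
 */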
static void
zvol_geom_bio_start(struct bio *bp)
{
        zvol_state_t *zv = bp->bio_to->private;
        struct zvol_state_geom *zsg;
        boolean_t first;

        if (zv == NULL) {
                g_io_deliver(bp, ENXIO);
                return;
        }
        if (bp->bio_cmd == BIO_GETATTR) {
                if (zvol_geom_bio_getattr(bp))
                        g_io_deliver(bp, EOPNOTSUPP);
                return;
        }

        if (!THREAD_CAN_SLEEP()) {
                zsg = &zv->zv_zso->zso_geom;
                mtx_lock(&zsg->zsg_queue_mtx);
                first = (bioq_first(&zsg->zsg_queue) == NULL);
                bioq_insert_tail(&zsg->zsg_queue, bp);
                mtx_unlock(&zsg->zsg_queue_mtx);
                if (first)
                        wakeup_one(&zsg->zsg_queue);
                return;
        }

        zvol_geom_bio_strategy(bp);
}

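/*
 * Handle BIO_GETATTR requests (candelete plus dataset and pool space
 * accounting).  Returns 0 if the attribute was handled, nonzero otherwise.
 */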
static int
zvol_geom_bio_getattr(struct bio *bp)
{
        zvol_state_t *zv;

        zv = bp->bio_to->private;
        ASSERT3P(zv, !=, NULL);

        spa_t *spa = dmu_objset_spa(zv->zv_objset);
        uint64_t refd, avail, usedobjs, availobjs;

        if (g_handleattr_int(bp, "GEOM::candelete", 1))
                return (0);
        if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
                dmu_objset_space(zv->zv_objset, &refd, &avail,
                    &usedobjs, &availobjs);
                if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
                        return (0);
        } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
                dmu_objset_space(zv->zv_objset, &refd, &avail,
                    &usedobjs, &availobjs);
                if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
                        return (0);
        } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
                avail = metaslab_class_get_space(spa_normal_class(spa));
                avail -= metaslab_class_get_alloc(spa_normal_class(spa));
                if (g_handleattr_off_t(bp, "poolblocksavail",
                    avail / DEV_BSIZE))
                        return (0);
        } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
                refd = metaslab_class_get_alloc(spa_normal_class(spa));
                if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
                        return (0);
        }
        return (1);
}

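/*
 * kqueue(2) support for volmode=dev: EVFILT_VNODE with NOTE_ATTRIB is
 * signalled from zvol_os_update_volsize() so consumers can watch for
 * volume size changes.
 */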
static void
zvol_filter_detach(struct knote *kn)
{
        zvol_state_t *zv;
        struct zvol_state_dev *zsd;

        zv = kn->kn_hook;
        zsd = &zv->zv_zso->zso_dev;

        knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
}

static int
zvol_filter_vnode(struct knote *kn, long hint)
{
        kn->kn_fflags |= kn->kn_sfflags & hint;

        return (kn->kn_fflags != 0);
}

static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
{
        zvol_state_t *zv;
        struct zvol_state_dev *zsd;

        zv = dev->si_drv2;
        zsd = &zv->zv_zso->zso_dev;

        if (kn->kn_filter != EVFILT_VNODE)
                return (EINVAL);

        /* XXX: extend support for other NOTE_* events */
        if (kn->kn_sfflags != NOTE_ATTRIB)
                return (EINVAL);

        kn->kn_fop = &zvol_filterops_vnode;
        kn->kn_hook = zv;
        knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);

        return (0);
}

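/*
 * Common I/O path for both GEOM bios and character-device transfers:
 * translates BIO_READ/BIO_WRITE/BIO_DELETE/BIO_FLUSH into DMU operations
 * under the range lock, logging writes and truncates to the ZIL and
 * committing it when synchronous semantics are required.
 */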
static void
zvol_geom_bio_strategy(struct bio *bp)
{
        zvol_state_t *zv;
        uint64_t off, volsize;
        size_t resid;
        char *addr;
        objset_t *os;
        zfs_locked_range_t *lr;
        int error = 0;
        boolean_t doread = B_FALSE;
        boolean_t is_dumpified;
        boolean_t commit;

        if (bp->bio_to)
                zv = bp->bio_to->private;
        else
                zv = bp->bio_dev->si_drv2;

        if (zv == NULL) {
                error = SET_ERROR(ENXIO);
                goto out;
        }

        rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

        if (zv->zv_flags & ZVOL_REMOVING) {
                error = SET_ERROR(ENXIO);
                goto resume;
        }

        switch (bp->bio_cmd) {
        case BIO_READ:
                doread = B_TRUE;
                break;
        case BIO_WRITE:
        case BIO_FLUSH:
        case BIO_DELETE:
                if (zv->zv_flags & ZVOL_RDONLY) {
                        error = SET_ERROR(EROFS);
                        goto resume;
                }
                zvol_ensure_zilog(zv);
                if (bp->bio_cmd == BIO_FLUSH)
                        goto commit;
                break;
        default:
                error = SET_ERROR(EOPNOTSUPP);
                goto resume;
        }

        off = bp->bio_offset;
        volsize = zv->zv_volsize;

        os = zv->zv_objset;
        ASSERT3P(os, !=, NULL);

        addr = bp->bio_data;
        resid = bp->bio_length;

        if (resid > 0 && off >= volsize) {
                error = SET_ERROR(EIO);
                goto resume;
        }

        is_dumpified = B_FALSE;
        commit = !doread && !is_dumpified &&
            zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

        /*
         * There must be no buffer changes when doing a dmu_sync() because
         * we can't change the data whilst calculating the checksum.
         */
        lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
            doread ? RL_READER : RL_WRITER);

        if (bp->bio_cmd == BIO_DELETE) {
                dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
                error = dmu_tx_assign(tx, TXG_WAIT);
                if (error != 0) {
                        dmu_tx_abort(tx);
                } else {
                        zvol_log_truncate(zv, tx, off, resid);
                        dmu_tx_commit(tx);
                        error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
                            off, resid);
                        resid = 0;
                }
                goto unlock;
        }
        while (resid != 0 && off < volsize) {
                size_t size = MIN(resid, zvol_maxphys);
                if (doread) {
                        error = dmu_read(os, ZVOL_OBJ, off, size, addr,
                            DMU_READ_PREFETCH);
                } else {
                        dmu_tx_t *tx = dmu_tx_create(os);
                        dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
                        error = dmu_tx_assign(tx, TXG_WAIT);
                        if (error) {
                                dmu_tx_abort(tx);
                        } else {
                                dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
                                zvol_log_write(zv, tx, off, size, commit);
                                dmu_tx_commit(tx);
                        }
                }
                if (error) {
                        /* Convert checksum errors into IO errors. */
                        if (error == ECKSUM)
                                error = SET_ERROR(EIO);
                        break;
                }
                off += size;
                addr += size;
                resid -= size;
        }
unlock:
        zfs_rangelock_exit(lr);

        bp->bio_completed = bp->bio_length - resid;
        if (bp->bio_completed < bp->bio_length && off > volsize)
                error = SET_ERROR(EINVAL);

        switch (bp->bio_cmd) {
        case BIO_FLUSH:
                break;
        case BIO_READ:
                dataset_kstats_update_read_kstats(&zv->zv_kstat,
                    bp->bio_completed);
                break;
        case BIO_WRITE:
                dataset_kstats_update_write_kstats(&zv->zv_kstat,
                    bp->bio_completed);
                break;
        case BIO_DELETE:
                break;
        default:
                break;
        }

        if (commit) {
commit:
                zil_commit(zv->zv_zilog, ZVOL_OBJ);
        }
resume:
        rw_exit(&zv->zv_suspend_lock);
out:
        if (bp->bio_to)
                g_io_deliver(bp, error);
        else
                biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

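/*
 * read(2) entry point: copies data from the DMU into the caller's buffers,
 * chunked to at most half of DMU_MAX_ACCESS per iteration, under a read
 * range lock.
 */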
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
        zvol_state_t *zv;
        uint64_t volsize;
        zfs_locked_range_t *lr;
        int error = 0;
        zfs_uio_t uio;

        zfs_uio_init(&uio, uio_s);

        zv = dev->si_drv2;

        volsize = zv->zv_volsize;
        /*
         * uio_loffset == volsize isn't an error as
         * it's required for EOF processing.
         */
        if (zfs_uio_resid(&uio) > 0 &&
            (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
                return (SET_ERROR(EIO));

        rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
        ssize_t start_resid = zfs_uio_resid(&uio);
        lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
            zfs_uio_resid(&uio), RL_READER);
        while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
                uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

                /* Don't read past the end. */
                if (bytes > volsize - zfs_uio_offset(&uio))
                        bytes = volsize - zfs_uio_offset(&uio);

                error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
                if (error) {
                        /* Convert checksum errors into IO errors. */
                        if (error == ECKSUM)
                                error = SET_ERROR(EIO);
                        break;
                }
        }
        zfs_rangelock_exit(lr);
        int64_t nread = start_resid - zfs_uio_resid(&uio);
        dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
        rw_exit(&zv->zv_suspend_lock);

        return (error);
}

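/*
 * write(2) entry point: copies the caller's data into the DMU one tx at a
 * time under a write range lock, logging each write and committing the
 * ZIL if IO_SYNC was requested or the volume is sync=always.
 */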
static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
        zvol_state_t *zv;
        uint64_t volsize;
        zfs_locked_range_t *lr;
        int error = 0;
        boolean_t commit;
        zfs_uio_t uio;

        zv = dev->si_drv2;

        volsize = zv->zv_volsize;

        zfs_uio_init(&uio, uio_s);

        if (zfs_uio_resid(&uio) > 0 &&
            (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
                return (SET_ERROR(EIO));

        ssize_t start_resid = zfs_uio_resid(&uio);
        commit = (ioflag & IO_SYNC) ||
            (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

        rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
        zvol_ensure_zilog(zv);

        lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
            zfs_uio_resid(&uio), RL_WRITER);
        while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
                uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
                uint64_t off = zfs_uio_offset(&uio);
                dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

                if (bytes > volsize - off)      /* Don't write past the end. */
                        bytes = volsize - off;

                dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
                error = dmu_tx_assign(tx, TXG_WAIT);
                if (error) {
                        dmu_tx_abort(tx);
                        break;
                }
                error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
                if (error == 0)
                        zvol_log_write(zv, tx, off, bytes, commit);
                dmu_tx_commit(tx);

                if (error)
                        break;
        }
        zfs_rangelock_exit(lr);
        int64_t nwritten = start_resid - zfs_uio_resid(&uio);
        dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
        if (commit)
                zil_commit(zv->zv_zilog, ZVOL_OBJ);
        rw_exit(&zv->zv_suspend_lock);

        return (error);
}

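/*
 * open(2) entry point for volmode=dev; mirrors zvol_geom_open() but adds
 * a single reference per call.
 */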
static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
        zvol_state_t *zv;
        int err = 0;
        boolean_t drop_suspend = B_FALSE;

retry:
        rw_enter(&zvol_state_lock, ZVOL_RW_READER);
        /*
         * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
         * the result of zvol free code setting si_drv2 to NULL is observed,
         * or the zv is protected from being freed because of the positive
         * zv_open_count.
         */
        zv = dev->si_drv2;
        if (zv == NULL) {
                rw_exit(&zvol_state_lock);
                err = SET_ERROR(ENXIO);
                goto out_locked;
        }

        mutex_enter(&zv->zv_state_lock);
        if (zv->zv_zso->zso_dying) {
                rw_exit(&zvol_state_lock);
                err = SET_ERROR(ENXIO);
                goto out_zv_locked;
        }
        ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

        /*
         * Make sure zvol is not suspended during first open
         * (hold zv_suspend_lock) and respect proper lock acquisition
         * ordering - zv_suspend_lock before zv_state_lock.
         */
        if (zv->zv_open_count == 0) {
                drop_suspend = B_TRUE;
                if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
                        mutex_exit(&zv->zv_state_lock);
                        rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
                        mutex_enter(&zv->zv_state_lock);
                        /* Check to see if zv_suspend_lock is needed. */
                        if (zv->zv_open_count != 0) {
                                rw_exit(&zv->zv_suspend_lock);
                                drop_suspend = B_FALSE;
                        }
                }
        }
        rw_exit(&zvol_state_lock);

        ASSERT(MUTEX_HELD(&zv->zv_state_lock));

        if (zv->zv_open_count == 0) {
                boolean_t drop_namespace = B_FALSE;

                ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

                /*
                 * Take spa_namespace_lock to prevent lock inversion when
                 * zvols from one pool are opened as vdevs in another.
                 */
                if (!mutex_owned(&spa_namespace_lock)) {
                        if (!mutex_tryenter(&spa_namespace_lock)) {
                                mutex_exit(&zv->zv_state_lock);
                                rw_exit(&zv->zv_suspend_lock);
                                drop_suspend = B_FALSE;
                                kern_yield(PRI_USER);
                                goto retry;
                        } else {
                                drop_namespace = B_TRUE;
                        }
                }
                err = zvol_first_open(zv, !(flags & FWRITE));
                if (drop_namespace)
                        mutex_exit(&spa_namespace_lock);
                if (err)
                        goto out_zv_locked;
        }

        ASSERT(MUTEX_HELD(&zv->zv_state_lock));

        if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
                err = SET_ERROR(EROFS);
                goto out_opened;
        }
        if (zv->zv_flags & ZVOL_EXCL) {
                err = SET_ERROR(EBUSY);
                goto out_opened;
        }
        if (flags & O_EXCL) {
                if (zv->zv_open_count != 0) {
                        err = SET_ERROR(EBUSY);
                        goto out_opened;
                }
                zv->zv_flags |= ZVOL_EXCL;
        }

        zv->zv_open_count++;
out_opened:
        if (zv->zv_open_count == 0) {
                zvol_last_close(zv);
                wakeup(zv);
        }
out_zv_locked:
        mutex_exit(&zv->zv_state_lock);
out_locked:
        if (drop_suspend)
                rw_exit(&zv->zv_suspend_lock);
        return (err);
}

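/*
 * close(2) entry point for volmode=dev; drops one reference and releases
 * the objset on last close (D_TRACKCLOSE ensures we see every close).
 */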
static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
        zvol_state_t *zv;
        boolean_t drop_suspend = B_TRUE;

        rw_enter(&zvol_state_lock, ZVOL_RW_READER);
        zv = dev->si_drv2;
        if (zv == NULL) {
                rw_exit(&zvol_state_lock);
                return (SET_ERROR(ENXIO));
        }

        mutex_enter(&zv->zv_state_lock);
        if (zv->zv_flags & ZVOL_EXCL) {
                ASSERT3U(zv->zv_open_count, ==, 1);
                zv->zv_flags &= ~ZVOL_EXCL;
        }

        ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

        /*
         * If the open count is zero, this is a spurious close.
         * That indicates a bug in the kernel / DDI framework.
         */
        ASSERT3U(zv->zv_open_count, >, 0);
        /*
         * Make sure zvol is not suspended during last close
         * (hold zv_suspend_lock) and respect proper lock acquisition
         * ordering - zv_suspend_lock before zv_state_lock.
         */
        if (zv->zv_open_count == 1) {
                if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
                        mutex_exit(&zv->zv_state_lock);
                        rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
                        mutex_enter(&zv->zv_state_lock);
                        /* Check to see if zv_suspend_lock is needed. */
                        if (zv->zv_open_count != 1) {
                                rw_exit(&zv->zv_suspend_lock);
                                drop_suspend = B_FALSE;
                        }
                }
        } else {
                drop_suspend = B_FALSE;
        }
        rw_exit(&zvol_state_lock);

        ASSERT(MUTEX_HELD(&zv->zv_state_lock));

        /*
         * You may get multiple opens, but only one close.
         */
        zv->zv_open_count--;

        if (zv->zv_open_count == 0) {
                ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
                zvol_last_close(zv);
                wakeup(zv);
        }

        mutex_exit(&zv->zv_state_lock);

        if (drop_suspend)
                rw_exit(&zv->zv_suspend_lock);
        return (0);
}

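/*
 * ioctl(2) entry point: implements the disk(4)-style ioctls (sector and
 * media size, flush, UNMAP via DIOCGDELETE, stripe parameters, GEOM
 * attributes) plus FIOSEEKHOLE/FIOSEEKDATA.
 */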
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
        zvol_state_t *zv;
        zfs_locked_range_t *lr;
        off_t offset, length;
        int error;
        boolean_t sync;

        zv = dev->si_drv2;

        error = 0;
        KASSERT(zv->zv_open_count > 0,
            ("Device with zero access count in %s", __func__));

        switch (cmd) {
        case DIOCGSECTORSIZE:
                *(uint32_t *)data = DEV_BSIZE;
                break;
        case DIOCGMEDIASIZE:
                *(off_t *)data = zv->zv_volsize;
                break;
        case DIOCGFLUSH:
                rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
                if (zv->zv_zilog != NULL)
                        zil_commit(zv->zv_zilog, ZVOL_OBJ);
                rw_exit(&zv->zv_suspend_lock);
                break;
        case DIOCGDELETE:
                if (!zvol_unmap_enabled)
                        break;

                offset = ((off_t *)data)[0];
                length = ((off_t *)data)[1];
                if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
                    offset < 0 || offset >= zv->zv_volsize ||
                    length <= 0) {
                        printf("%s: offset=%jd length=%jd\n", __func__, offset,
                            length);
                        error = SET_ERROR(EINVAL);
                        break;
                }
                rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
                zvol_ensure_zilog(zv);
                lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
                    RL_WRITER);
                dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
                error = dmu_tx_assign(tx, TXG_WAIT);
                if (error != 0) {
                        sync = B_FALSE;
                        dmu_tx_abort(tx);
                } else {
                        sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
                        zvol_log_truncate(zv, tx, offset, length);
                        dmu_tx_commit(tx);
                        error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
                            offset, length);
                }
                zfs_rangelock_exit(lr);
                if (sync)
                        zil_commit(zv->zv_zilog, ZVOL_OBJ);
                rw_exit(&zv->zv_suspend_lock);
                break;
        case DIOCGSTRIPESIZE:
                *(off_t *)data = zv->zv_volblocksize;
                break;
        case DIOCGSTRIPEOFFSET:
                *(off_t *)data = 0;
                break;
        case DIOCGATTR: {
                spa_t *spa = dmu_objset_spa(zv->zv_objset);
                struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
                uint64_t refd, avail, usedobjs, availobjs;

                if (strcmp(arg->name, "GEOM::candelete") == 0)
                        arg->value.i = 1;
                else if (strcmp(arg->name, "blocksavail") == 0) {
                        dmu_objset_space(zv->zv_objset, &refd, &avail,
                            &usedobjs, &availobjs);
                        arg->value.off = avail / DEV_BSIZE;
                } else if (strcmp(arg->name, "blocksused") == 0) {
                        dmu_objset_space(zv->zv_objset, &refd, &avail,
                            &usedobjs, &availobjs);
                        arg->value.off = refd / DEV_BSIZE;
                } else if (strcmp(arg->name, "poolblocksavail") == 0) {
                        avail = metaslab_class_get_space(spa_normal_class(spa));
                        avail -= metaslab_class_get_alloc(
                            spa_normal_class(spa));
                        arg->value.off = avail / DEV_BSIZE;
                } else if (strcmp(arg->name, "poolblocksused") == 0) {
                        refd = metaslab_class_get_alloc(spa_normal_class(spa));
                        arg->value.off = refd / DEV_BSIZE;
                } else
                        error = SET_ERROR(ENOIOCTL);
                break;
        }
        case FIOSEEKHOLE:
        case FIOSEEKDATA: {
                off_t *off = (off_t *)data;
                uint64_t noff;
                boolean_t hole;

                hole = (cmd == FIOSEEKHOLE);
                noff = *off;
                lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
                    RL_READER);
                error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
                zfs_rangelock_exit(lr);
                *off = noff;
                break;
        }
        default:
                error = SET_ERROR(ENOIOCTL);
        }

        return (error);
}

1225
1226 /*
1227 * Misc. helpers
1228 */
1229
1230 static void
zvol_ensure_zilog(zvol_state_t * zv)1231 zvol_ensure_zilog(zvol_state_t *zv)
1232 {
1233 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1234
1235 /*
1236 * Open a ZIL if this is the first time we have written to this
1237 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
1238 * than zv_state_lock so that we don't need to acquire an
1239 * additional lock in this path.
1240 */
1241 if (zv->zv_zilog == NULL) {
1242 if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
1243 rw_exit(&zv->zv_suspend_lock);
1244 rw_enter(&zv->zv_suspend_lock, RW_WRITER);
1245 }
1246 if (zv->zv_zilog == NULL) {
1247 zv->zv_zilog = zil_open(zv->zv_objset,
1248 zvol_get_data, &zv->zv_kstat.dk_zil_sums);
1249 zv->zv_flags |= ZVOL_WRITTEN_TO;
1250 /* replay / destroy done in zvol_os_create_minor() */
1251 VERIFY0(zv->zv_zilog->zl_header->zh_flags &
1252 ZIL_REPLAY_NEEDED);
1253 }
1254 rw_downgrade(&zv->zv_suspend_lock);
1255 }
1256 }
1257
1258 boolean_t
zvol_os_is_zvol(const char * device)1259 zvol_os_is_zvol(const char *device)
1260 {
1261 return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
1262 }
1263
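/*
 * Rename the minor: rehash the zvol state and recreate the GEOM provider
 * or character device under the new name.
 */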
void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
        ASSERT(RW_LOCK_HELD(&zvol_state_lock));
        ASSERT(MUTEX_HELD(&zv->zv_state_lock));

        /* Move to a new hashtable entry. */
        zv->zv_hash = zvol_name_hash(newname);
        hlist_del(&zv->zv_hlink);
        hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

        if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
                struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
                struct g_provider *pp = zsg->zsg_provider;
                struct g_geom *gp;

                g_topology_lock();
                gp = pp->geom;
                ASSERT3P(gp, !=, NULL);

                zsg->zsg_provider = NULL;
                g_wither_provider(pp, ENXIO);

                pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
                pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
                pp->sectorsize = DEV_BSIZE;
                pp->mediasize = zv->zv_volsize;
                pp->private = zv;
                zsg->zsg_provider = pp;
                g_error_provider(pp, 0);
                g_topology_unlock();
        } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
                struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
                struct cdev *dev;
                struct make_dev_args args;

                dev = zsd->zsd_cdev;
                if (dev != NULL) {
                        destroy_dev(dev);
                        dev = zsd->zsd_cdev = NULL;
                        if (zv->zv_open_count > 0) {
                                zv->zv_flags &= ~ZVOL_EXCL;
                                zv->zv_open_count = 0;
                                /* XXX need suspend lock but lock order */
                                zvol_last_close(zv);
                        }
                }

                make_dev_args_init(&args);
                args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
                args.mda_devsw = &zvol_cdevsw;
                args.mda_cr = NULL;
                args.mda_uid = UID_ROOT;
                args.mda_gid = GID_OPERATOR;
                args.mda_mode = 0640;
                args.mda_si_drv2 = zv;
                if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
                    == 0) {
                        dev->si_iosize_max = maxphys;
                        zsd->zsd_cdev = dev;
                }
        }
        strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
        dataset_kstats_rename(&zv->zv_kstat, newname);
}

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
        ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
        ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
        ASSERT0(zv->zv_open_count);

        ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

        rw_destroy(&zv->zv_suspend_lock);
        zfs_rangelock_fini(&zv->zv_rangelock);

        if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
                struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
                struct g_provider *pp __maybe_unused = zsg->zsg_provider;

                ASSERT3P(pp->private, ==, NULL);

                g_topology_lock();
                zvol_geom_destroy(zv);
                g_topology_unlock();
                mtx_destroy(&zsg->zsg_queue_mtx);
        } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
                struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
                struct cdev *dev = zsd->zsd_cdev;

                if (dev != NULL) {
                        ASSERT3P(dev->si_drv2, ==, NULL);
                        destroy_dev(dev);
                        knlist_clear(&zsd->zsd_selinfo.si_note, 0);
                        knlist_destroy(&zsd->zsd_selinfo.si_note);
                }
        }

        mutex_destroy(&zv->zv_state_lock);
        cv_destroy(&zv->zv_removing_cv);
        dataset_kstats_destroy(&zv->zv_kstat);
        kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
        kmem_free(zv, sizeof (zvol_state_t));
        zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
        zvol_state_t *zv;
        objset_t *os;
        dmu_object_info_t *doi;
        uint64_t volsize;
        uint64_t volmode, hash;
        int error;
        bool replayed_zil = B_FALSE;

        ZFS_LOG(1, "Creating ZVOL %s...", name);
        hash = zvol_name_hash(name);
        if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
                ASSERT(MUTEX_HELD(&zv->zv_state_lock));
                mutex_exit(&zv->zv_state_lock);
                return (SET_ERROR(EEXIST));
        }

        DROP_GIANT();

        doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

        /* Lie and say we're read-only. */
        error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
        if (error)
                goto out_doi;

        error = dmu_object_info(os, ZVOL_OBJ, doi);
        if (error)
                goto out_dmu_objset_disown;

        error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
        if (error)
                goto out_dmu_objset_disown;

        error = dsl_prop_get_integer(name,
            zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
        if (error || volmode == ZFS_VOLMODE_DEFAULT)
                volmode = zvol_volmode;
        error = 0;

        /*
         * zvol_alloc equivalent ...
         */
        zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
        zv->zv_hash = hash;
        mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
        zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
        zv->zv_volmode = volmode;
        if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
                struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
                struct g_provider *pp;
                struct g_geom *gp;

                zsg->zsg_state = ZVOL_GEOM_UNINIT;
                mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

                g_topology_lock();
                gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
                gp->start = zvol_geom_bio_start;
                gp->access = zvol_geom_access;
                pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
                pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
                pp->sectorsize = DEV_BSIZE;
                pp->mediasize = 0;
                pp->private = zv;

                zsg->zsg_provider = pp;
                bioq_init(&zsg->zsg_queue);
        } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
                struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
                struct cdev *dev;
                struct make_dev_args args;

                make_dev_args_init(&args);
                args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
                args.mda_devsw = &zvol_cdevsw;
                args.mda_cr = NULL;
                args.mda_uid = UID_ROOT;
                args.mda_gid = GID_OPERATOR;
                args.mda_mode = 0640;
                args.mda_si_drv2 = zv;
                if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
                    == 0) {
                        dev->si_iosize_max = maxphys;
                        zsd->zsd_cdev = dev;
                        knlist_init_sx(&zsd->zsd_selinfo.si_note,
                            &zv->zv_state_lock);
                }
        }
        (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
        rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
        zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

        if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
                zv->zv_flags |= ZVOL_RDONLY;

        zv->zv_volblocksize = doi->doi_data_block_size;
        zv->zv_volsize = volsize;
        zv->zv_objset = os;

        ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
        error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
        if (error)
                goto out_dmu_objset_disown;
        ASSERT3P(zv->zv_zilog, ==, NULL);
        zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
        if (spa_writeable(dmu_objset_spa(os))) {
                if (zil_replay_disable)
                        replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
                else
                        replayed_zil = zil_replay(os, zv, zvol_replay_vector);
        }
        if (replayed_zil)
                zil_close(zv->zv_zilog);
        zv->zv_zilog = NULL;

        /* TODO: prefetch for geom tasting */

        zv->zv_objset = NULL;
out_dmu_objset_disown:
        dmu_objset_disown(os, B_TRUE, FTAG);

        if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
                zvol_geom_run(zv);
                g_topology_unlock();
        }
out_doi:
        kmem_free(doi, sizeof (dmu_object_info_t));
        if (error == 0) {
                rw_enter(&zvol_state_lock, RW_WRITER);
                zvol_insert(zv);
                zvol_minors++;
                rw_exit(&zvol_state_lock);
                ZFS_LOG(1, "ZVOL %s created.", name);
        }
        PICKUP_GIANT();
        return (error);
}

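/*
 * Detach the zvol from its provider or device node so that new I/O fails
 * with ENXIO; in GEOM mode this also stops the worker thread.
 */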
void
zvol_os_clear_private(zvol_state_t *zv)
{
        ASSERT(RW_LOCK_HELD(&zvol_state_lock));
        if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
                struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
                struct g_provider *pp = zsg->zsg_provider;

                if (pp->private == NULL) /* already cleared */
                        return;

                mtx_lock(&zsg->zsg_queue_mtx);
                zsg->zsg_state = ZVOL_GEOM_STOPPED;
                pp->private = NULL;
                wakeup_one(&zsg->zsg_queue);
                while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
                        msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
                            0, "zvol:w", 0);
                mtx_unlock(&zsg->zsg_queue_mtx);
                ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
        } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
                struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
                struct cdev *dev = zsd->zsd_cdev;

                if (dev != NULL)
                        dev->si_drv2 = NULL;
        }
}

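/*
 * Propagate a volume size change to the GEOM provider, or notify
 * EVFILT_VNODE listeners with NOTE_ATTRIB in dev mode.
 */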
int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
        zv->zv_volsize = volsize;
        if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
                struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
                struct g_provider *pp = zsg->zsg_provider;

                g_topology_lock();

                if (pp->private == NULL) {
                        g_topology_unlock();
                        return (SET_ERROR(ENXIO));
                }

                /*
                 * Do not invoke resize event when initial size was zero.
                 * ZVOL initializes the size on first open, this is not
                 * real resizing.
                 */
                if (pp->mediasize == 0)
                        pp->mediasize = zv->zv_volsize;
                else
                        g_resize_provider(pp, zv->zv_volsize);

                g_topology_unlock();
        } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
                struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;

                KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
        }
        return (0);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
        // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
        // XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
        return (zvol_minors != 0);
}

int
zvol_init(void)
{
        zvol_init_impl();
        return (0);
}

void
zvol_fini(void)
{
        zvol_fini_impl();
}