xref: /freebsd/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c (revision 6132212808e8dccedc9e5d85fea4390c2f38059a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
23  */
24 
25 #include <sys/dataset_kstats.h>
26 #include <sys/dbuf.h>
27 #include <sys/dmu_traverse.h>
28 #include <sys/dsl_dataset.h>
29 #include <sys/dsl_prop.h>
30 #include <sys/dsl_dir.h>
31 #include <sys/zap.h>
32 #include <sys/zfeature.h>
33 #include <sys/zil_impl.h>
34 #include <sys/dmu_tx.h>
35 #include <sys/zio.h>
36 #include <sys/zfs_rlock.h>
37 #include <sys/spa_impl.h>
38 #include <sys/zvol.h>
39 #include <sys/zvol_impl.h>
40 
41 #include <linux/blkdev_compat.h>
42 #include <linux/task_io_accounting_ops.h>
43 
/*
 * Module tunables (exported through module_param() at the bottom of this
 * file).
 */
/* Major number registered for zvol block devices in zvol_init(). */
unsigned int zvol_major = ZVOL_MAJOR;
/* Non-zero: service each bio in the submitter's context, not via taskq. */
unsigned int zvol_request_sync = 0;
/* Bytes prefetched at the start and end of a volume when its minor is made. */
unsigned int zvol_prefetch_bytes = (128 * 1024);
/* Scaled by the volume block size to set the queue's max discard sectors. */
unsigned long zvol_max_discard_blocks = 16384;
/* Thread count for zvol_taskq; zvol_init() clamps it to [1, 1024]. */
unsigned int zvol_threads = 32;
49 
/*
 * Linux-specific portion of a zvol's state, reached through
 * zvol_state_t.zv_zso.  Allocated in zvol_alloc() and released in
 * zvol_free().
 */
struct zvol_state_os {
	struct gendisk		*zvo_disk;	/* generic disk */
	struct request_queue	*zvo_queue;	/* request queue */
	dev_t			zvo_dev;	/* device id */
};
55 
/* Taskq that runs zvol_read/zvol_write/zvol_discard; made in zvol_init(). */
taskq_t *zvol_taskq;
/* Allocator for minor-number indices (shifted by ZVOL_MINOR_BITS on use). */
static struct ida zvol_ida;
58 
/*
 * Per-bio work item handed to zvol_read(), zvol_write() or zvol_discard().
 * It is allocated in the request entry point and freed by the handler once
 * the bio has been completed.
 */
typedef struct zv_request {
	zvol_state_t	*zv;	/* volume the bio targets */
	struct bio	*bio;	/* the bio being serviced */
	taskq_ent_t	ent;	/* entry used with taskq_dispatch_ent() */
} zv_request_t;
64 
65 /*
66  * Given a path, return TRUE if path is a ZVOL.
67  */
68 static boolean_t
69 zvol_is_zvol_impl(const char *device)
70 {
71 	struct block_device *bdev;
72 	unsigned int major;
73 
74 	bdev = vdev_lookup_bdev(device);
75 	if (IS_ERR(bdev))
76 		return (B_FALSE);
77 
78 	major = MAJOR(bdev->bd_dev);
79 	bdput(bdev);
80 
81 	if (major == zvol_major)
82 		return (B_TRUE);
83 
84 	return (B_FALSE);
85 }
86 
87 static void
88 uio_from_bio(uio_t *uio, struct bio *bio)
89 {
90 	uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
91 	uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
92 	uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
93 	uio->uio_segflg = UIO_BVEC;
94 	uio->uio_limit = MAXOFFSET_T;
95 	uio->uio_resid = BIO_BI_SIZE(bio);
96 	uio->uio_skip = BIO_BI_SKIP(bio);
97 }
98 
99 static void
100 zvol_write(void *arg)
101 {
102 	int error = 0;
103 
104 	zv_request_t *zvr = arg;
105 	struct bio *bio = zvr->bio;
106 	uio_t uio = { { 0 }, 0 };
107 	uio_from_bio(&uio, bio);
108 
109 	zvol_state_t *zv = zvr->zv;
110 	ASSERT(zv && zv->zv_open_count > 0);
111 	ASSERT(zv->zv_zilog != NULL);
112 
113 	/* bio marked as FLUSH need to flush before write */
114 	if (bio_is_flush(bio))
115 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
116 
117 	/* Some requests are just for flush and nothing else. */
118 	if (uio.uio_resid == 0) {
119 		rw_exit(&zv->zv_suspend_lock);
120 		BIO_END_IO(bio, 0);
121 		kmem_free(zvr, sizeof (zv_request_t));
122 		return;
123 	}
124 
125 	ssize_t start_resid = uio.uio_resid;
126 	unsigned long start_jif = jiffies;
127 	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
128 	    bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);
129 
130 	boolean_t sync =
131 	    bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
132 
133 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
134 	    uio.uio_loffset, uio.uio_resid, RL_WRITER);
135 
136 	uint64_t volsize = zv->zv_volsize;
137 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
138 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
139 		uint64_t off = uio.uio_loffset;
140 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
141 
142 		if (bytes > volsize - off)	/* don't write past the end */
143 			bytes = volsize - off;
144 
145 		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
146 
147 		/* This will only fail for ENOSPC */
148 		error = dmu_tx_assign(tx, TXG_WAIT);
149 		if (error) {
150 			dmu_tx_abort(tx);
151 			break;
152 		}
153 		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
154 		if (error == 0) {
155 			zvol_log_write(zv, tx, off, bytes, sync);
156 		}
157 		dmu_tx_commit(tx);
158 
159 		if (error)
160 			break;
161 	}
162 	zfs_rangelock_exit(lr);
163 
164 	int64_t nwritten = start_resid - uio.uio_resid;
165 	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
166 	task_io_account_write(nwritten);
167 
168 	if (sync)
169 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
170 
171 	rw_exit(&zv->zv_suspend_lock);
172 	blk_generic_end_io_acct(zv->zv_zso->zvo_queue,
173 	    WRITE, &zv->zv_zso->zvo_disk->part0, start_jif);
174 	BIO_END_IO(bio, -error);
175 	kmem_free(zvr, sizeof (zv_request_t));
176 }
177 
178 static void
179 zvol_discard(void *arg)
180 {
181 	zv_request_t *zvr = arg;
182 	struct bio *bio = zvr->bio;
183 	zvol_state_t *zv = zvr->zv;
184 	uint64_t start = BIO_BI_SECTOR(bio) << 9;
185 	uint64_t size = BIO_BI_SIZE(bio);
186 	uint64_t end = start + size;
187 	boolean_t sync;
188 	int error = 0;
189 	dmu_tx_t *tx;
190 	unsigned long start_jif;
191 
192 	ASSERT(zv && zv->zv_open_count > 0);
193 	ASSERT(zv->zv_zilog != NULL);
194 
195 	start_jif = jiffies;
196 	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
197 	    bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);
198 
199 	sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
200 
201 	if (end > zv->zv_volsize) {
202 		error = SET_ERROR(EIO);
203 		goto unlock;
204 	}
205 
206 	/*
207 	 * Align the request to volume block boundaries when a secure erase is
208 	 * not required.  This will prevent dnode_free_range() from zeroing out
209 	 * the unaligned parts which is slow (read-modify-write) and useless
210 	 * since we are not freeing any space by doing so.
211 	 */
212 	if (!bio_is_secure_erase(bio)) {
213 		start = P2ROUNDUP(start, zv->zv_volblocksize);
214 		end = P2ALIGN(end, zv->zv_volblocksize);
215 		size = end - start;
216 	}
217 
218 	if (start >= end)
219 		goto unlock;
220 
221 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
222 	    start, size, RL_WRITER);
223 
224 	tx = dmu_tx_create(zv->zv_objset);
225 	dmu_tx_mark_netfree(tx);
226 	error = dmu_tx_assign(tx, TXG_WAIT);
227 	if (error != 0) {
228 		dmu_tx_abort(tx);
229 	} else {
230 		zvol_log_truncate(zv, tx, start, size, B_TRUE);
231 		dmu_tx_commit(tx);
232 		error = dmu_free_long_range(zv->zv_objset,
233 		    ZVOL_OBJ, start, size);
234 	}
235 	zfs_rangelock_exit(lr);
236 
237 	if (error == 0 && sync)
238 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
239 
240 unlock:
241 	rw_exit(&zv->zv_suspend_lock);
242 	blk_generic_end_io_acct(zv->zv_zso->zvo_queue, WRITE,
243 	    &zv->zv_zso->zvo_disk->part0, start_jif);
244 	BIO_END_IO(bio, -error);
245 	kmem_free(zvr, sizeof (zv_request_t));
246 }
247 
248 static void
249 zvol_read(void *arg)
250 {
251 	int error = 0;
252 
253 	zv_request_t *zvr = arg;
254 	struct bio *bio = zvr->bio;
255 	uio_t uio = { { 0 }, 0 };
256 	uio_from_bio(&uio, bio);
257 
258 	zvol_state_t *zv = zvr->zv;
259 	ASSERT(zv && zv->zv_open_count > 0);
260 
261 	ssize_t start_resid = uio.uio_resid;
262 	unsigned long start_jif = jiffies;
263 	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, READ, bio_sectors(bio),
264 	    &zv->zv_zso->zvo_disk->part0);
265 
266 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
267 	    uio.uio_loffset, uio.uio_resid, RL_READER);
268 
269 	uint64_t volsize = zv->zv_volsize;
270 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
271 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
272 
273 		/* don't read past the end */
274 		if (bytes > volsize - uio.uio_loffset)
275 			bytes = volsize - uio.uio_loffset;
276 
277 		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
278 		if (error) {
279 			/* convert checksum errors into IO errors */
280 			if (error == ECKSUM)
281 				error = SET_ERROR(EIO);
282 			break;
283 		}
284 	}
285 	zfs_rangelock_exit(lr);
286 
287 	int64_t nread = start_resid - uio.uio_resid;
288 	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
289 	task_io_account_read(nread);
290 
291 	rw_exit(&zv->zv_suspend_lock);
292 	blk_generic_end_io_acct(zv->zv_zso->zvo_queue, READ,
293 	    &zv->zv_zso->zvo_disk->part0, start_jif);
294 	BIO_END_IO(bio, -error);
295 	kmem_free(zvr, sizeof (zv_request_t));
296 }
297 
/*
 * Entry point for every bio submitted to a zvol.  Depending on
 * HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS it is built either as the
 * submit_bio block device operation (zvol_submit_bio) or as the queue's
 * make_request function (zvol_request).
 *
 * A data-carrying bio that extends past zv_volsize is completed with EIO,
 * and a write to a ZVOL_RDONLY volume is completed with EROFS.  Otherwise
 * zv_suspend_lock is taken as reader, the bio is wrapped in a zv_request_t
 * and handed to zvol_write(), zvol_discard() or zvol_read() -- inline when
 * zvol_request_sync is set, through zvol_taskq otherwise.  The handler is
 * responsible for dropping zv_suspend_lock, completing the bio and freeing
 * the zv_request_t.
 */
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	struct request_queue *q = bio->bi_disk->queue;
#endif
	zvol_state_t *zv = q->queuedata;
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	int rw = bio_data_dir(bio);
	zv_request_t *zvr;

	/* Reject data-carrying bios that run past the end of the volume. */
	if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO
		    "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		BIO_END_IO(bio, -SET_ERROR(EIO));
		goto out;
	}

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			BIO_END_IO(bio, -SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened.  Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
		 */
		if (zv->zv_zilog == NULL) {
			/*
			 * Retake the lock as writer, re-check zv_zilog
			 * (another thread may have opened it meanwhile),
			 * then downgrade back to reader.
			 */
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
		zvr->zv = zv;
		zvr->bio = bio;
		taskq_init_ent(&zvr->ent);

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write).  We will indicate that the i/o is
		 * complete by calling BIO_END_IO() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes.  If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system.  This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol.  However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call BIO_END_IO().  Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
		if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
			if (zvol_request_sync) {
				zvol_discard(zvr);
			} else {
				taskq_dispatch_ent(zvol_taskq,
				    zvol_discard, zvr, 0, &zvr->ent);
			}
		} else {
			if (zvol_request_sync) {
				zvol_write(zvr);
			} else {
				taskq_dispatch_ent(zvol_taskq,
				    zvol_write, zvr, 0, &zvr->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes.  These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			BIO_END_IO(bio, 0);
			goto out;
		}

		zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
		zvr->zv = zv;
		zvr->bio = bio;
		taskq_init_ent(&zvr->ent);

		/* Released by zvol_read() once the bio has completed. */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/* See comment in WRITE case above. */
		if (zvol_request_sync) {
			zvol_read(zvr);
		} else {
			taskq_dispatch_ent(zvol_taskq,
			    zvol_read, zvr, 0, &zvr->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
	return (BLK_QC_T_NONE);
#endif
}
441 
/*
 * Block device open handler.
 *
 * Looks up the zvol_state_t through bdev->bd_disk->private_data while
 * holding zvol_state_lock, runs zvol_first_open() when this is the first
 * opener, refuses write opens of ZVOL_RDONLY volumes with -EROFS, and
 * otherwise increments zv_open_count.
 *
 * Returns 0 on success or a negative errno.  -ENXIO is returned when
 * private_data has already been cleared; -EINTR is converted to
 * -ERESTARTSYS after a call to schedule().
 */
static int
zvol_open(struct block_device *bdev, fmode_t flag)
{
	zvol_state_t *zv;
	int error = 0;
	/* Whether this function holds zv_suspend_lock and must drop it. */
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	/*
	 * Obtain a copy of private_data under the zvol_state_lock to make
	 * sure that either the result of zvol free code path setting
	 * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
	 * is not called on this zv because of the positive zv_open_count.
	 */
	zv = bdev->bd_disk->private_data;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(-ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	/*
	 * make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			/*
			 * The try-lock failed, so back out of zv_state_lock
			 * and take both locks in the documented order.
			 */
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));

	if (zv->zv_open_count == 0) {
		/* zvol_first_open() reports a positive errno; negate it. */
		error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
		if (error)
			goto out_mutex;
	}

	if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		error = -EROFS;
		goto out_open_count;
	}

	zv->zv_open_count++;

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	check_disk_change(bdev);

	return (0);

out_open_count:
	/* Undo zvol_first_open() if nobody else has the volume open. */
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);

out_mutex:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	if (error == -EINTR) {
		error = -ERESTARTSYS;
		schedule();
	}
	return (SET_ERROR(error));
}
522 
/*
 * Block device release handler: decrements zv_open_count and calls
 * zvol_last_close() when the count reaches zero.  zv_suspend_lock is
 * held as reader across the last close.
 */
static void
zvol_release(struct gendisk *disk, fmode_t mode)
{
	zvol_state_t *zv;
	/* Whether this function holds zv_suspend_lock and must drop it. */
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT(zv->zv_open_count > 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			/*
			 * The try-lock failed, so back out of zv_state_lock
			 * and take both locks in the documented order.
			 */
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}
567 
568 static int
569 zvol_ioctl(struct block_device *bdev, fmode_t mode,
570     unsigned int cmd, unsigned long arg)
571 {
572 	zvol_state_t *zv = bdev->bd_disk->private_data;
573 	int error = 0;
574 
575 	ASSERT3U(zv->zv_open_count, >, 0);
576 
577 	switch (cmd) {
578 	case BLKFLSBUF:
579 		fsync_bdev(bdev);
580 		invalidate_bdev(bdev);
581 		rw_enter(&zv->zv_suspend_lock, RW_READER);
582 
583 		if (!(zv->zv_flags & ZVOL_RDONLY))
584 			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
585 
586 		rw_exit(&zv->zv_suspend_lock);
587 		break;
588 
589 	case BLKZNAME:
590 		mutex_enter(&zv->zv_state_lock);
591 		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
592 		mutex_exit(&zv->zv_state_lock);
593 		break;
594 
595 	default:
596 		error = -ENOTTY;
597 		break;
598 	}
599 
600 	return (SET_ERROR(error));
601 }
602 
/*
 * Compat ioctl entry point.  With CONFIG_COMPAT it forwards unchanged to
 * zvol_ioctl(); without it the name expands to NULL so that the
 * .compat_ioctl slot in zvol_ops is left empty.
 */
#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif
613 
614 static unsigned int
615 zvol_check_events(struct gendisk *disk, unsigned int clearing)
616 {
617 	unsigned int mask = 0;
618 
619 	rw_enter(&zvol_state_lock, RW_READER);
620 
621 	zvol_state_t *zv = disk->private_data;
622 	if (zv != NULL) {
623 		mutex_enter(&zv->zv_state_lock);
624 		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
625 		zv->zv_changed = 0;
626 		mutex_exit(&zv->zv_state_lock);
627 	}
628 
629 	rw_exit(&zvol_state_lock);
630 
631 	return (mask);
632 }
633 
634 static int
635 zvol_revalidate_disk(struct gendisk *disk)
636 {
637 	rw_enter(&zvol_state_lock, RW_READER);
638 
639 	zvol_state_t *zv = disk->private_data;
640 	if (zv != NULL) {
641 		mutex_enter(&zv->zv_state_lock);
642 		set_capacity(zv->zv_zso->zvo_disk,
643 		    zv->zv_volsize >> SECTOR_BITS);
644 		mutex_exit(&zv->zv_state_lock);
645 	}
646 
647 	rw_exit(&zvol_state_lock);
648 
649 	return (0);
650 }
651 
/*
 * Platform hook (zv_update_volsize) invoked after the volume size changed.
 * The volsize argument is not used here: revalidate_disk() is asked to
 * refresh the gendisk instead.  Always returns 0.
 */
static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{

	revalidate_disk(zv->zv_zso->zvo_disk);
	return (0);
}
659 
/*
 * Platform hook (zv_clear_private): detach the zvol_state_t from its
 * gendisk so that zvol_open() sees NULL and fails with -ENXIO.
 */
static void
zvol_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}
669 
670 /*
671  * Provide a simple virtual geometry for legacy compatibility.  For devices
672  * smaller than 1 MiB a small head and sector count is used to allow very
673  * tiny devices.  For devices over 1 Mib a standard head and sector count
674  * is used to keep the cylinders count reasonable.
675  */
676 static int
677 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
678 {
679 	zvol_state_t *zv = bdev->bd_disk->private_data;
680 	sector_t sectors;
681 
682 	ASSERT3U(zv->zv_open_count, >, 0);
683 
684 	sectors = get_capacity(zv->zv_zso->zvo_disk);
685 
686 	if (sectors > 2048) {
687 		geo->heads = 16;
688 		geo->sectors = 63;
689 	} else {
690 		geo->heads = 2;
691 		geo->sectors = 4;
692 	}
693 
694 	geo->start = 0;
695 	geo->cylinders = sectors / (geo->heads * geo->sectors);
696 
697 	return (0);
698 }
699 
700 /*
701  * Find a zvol_state_t given the full major+minor dev_t. If found,
702  * return with zv_state_lock taken, otherwise, return (NULL) without
703  * taking zv_state_lock.
704  */
705 static zvol_state_t *
706 zvol_find_by_dev(dev_t dev)
707 {
708 	zvol_state_t *zv;
709 
710 	rw_enter(&zvol_state_lock, RW_READER);
711 	for (zv = list_head(&zvol_state_list); zv != NULL;
712 	    zv = list_next(&zvol_state_list, zv)) {
713 		mutex_enter(&zv->zv_state_lock);
714 		if (zv->zv_zso->zvo_dev == dev) {
715 			rw_exit(&zvol_state_lock);
716 			return (zv);
717 		}
718 		mutex_exit(&zv->zv_state_lock);
719 	}
720 	rw_exit(&zvol_state_lock);
721 
722 	return (NULL);
723 }
724 
725 static struct kobject *
726 zvol_probe(dev_t dev, int *part, void *arg)
727 {
728 	zvol_state_t *zv;
729 	struct kobject *kobj;
730 
731 	zv = zvol_find_by_dev(dev);
732 	kobj = zv ? get_disk_and_module(zv->zv_zso->zvo_disk) : NULL;
733 	ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
734 	if (zv)
735 		mutex_exit(&zv->zv_state_lock);
736 
737 	return (kobj);
738 }
739 
740 static struct block_device_operations zvol_ops = {
741 	.open			= zvol_open,
742 	.release		= zvol_release,
743 	.ioctl			= zvol_ioctl,
744 	.compat_ioctl		= zvol_compat_ioctl,
745 	.check_events		= zvol_check_events,
746 	.revalidate_disk	= zvol_revalidate_disk,
747 	.getgeo			= zvol_getgeo,
748 	.owner			= THIS_MODULE,
749 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
750     .submit_bio		= zvol_submit_bio,
751 #endif
752 };
753 
754 /*
755  * Allocate memory for a new zvol_state_t and setup the required
756  * request queue and generic disk structures for the block device.
757  */
758 static zvol_state_t *
759 zvol_alloc(dev_t dev, const char *name)
760 {
761 	zvol_state_t *zv;
762 	struct zvol_state_os *zso;
763 	uint64_t volmode;
764 
765 	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
766 		return (NULL);
767 
768 	if (volmode == ZFS_VOLMODE_DEFAULT)
769 		volmode = zvol_volmode;
770 
771 	if (volmode == ZFS_VOLMODE_NONE)
772 		return (NULL);
773 
774 	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
775 	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
776 	zv->zv_zso = zso;
777 
778 	list_link_init(&zv->zv_next);
779 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
780 
781 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
782 	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
783 #else
784 	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
785 #endif
786 	if (zso->zvo_queue == NULL)
787 		goto out_kmem;
788 
789 	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
790 
791 	/* Limit read-ahead to a single page to prevent over-prefetching. */
792 	blk_queue_set_read_ahead(zso->zvo_queue, 1);
793 
794 	/* Disable write merging in favor of the ZIO pipeline. */
795 	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
796 
797 	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
798 	if (zso->zvo_disk == NULL)
799 		goto out_queue;
800 
801 	zso->zvo_queue->queuedata = zv;
802 	zso->zvo_dev = dev;
803 	zv->zv_open_count = 0;
804 	strlcpy(zv->zv_name, name, MAXNAMELEN);
805 
806 	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
807 	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
808 
809 	zso->zvo_disk->major = zvol_major;
810 	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
811 
812 	if (volmode == ZFS_VOLMODE_DEV) {
813 		/*
814 		 * ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set
815 		 * gendisk->minors = 1 as noted in include/linux/genhd.h.
816 		 * Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
817 		 * and suppresses partition scanning (GENHD_FL_NO_PART_SCAN)
818 		 * setting gendisk->flags accordingly.
819 		 */
820 		zso->zvo_disk->minors = 1;
821 #if defined(GENHD_FL_EXT_DEVT)
822 		zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
823 #endif
824 #if defined(GENHD_FL_NO_PART_SCAN)
825 		zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN;
826 #endif
827 	}
828 	zso->zvo_disk->first_minor = (dev & MINORMASK);
829 	zso->zvo_disk->fops = &zvol_ops;
830 	zso->zvo_disk->private_data = zv;
831 	zso->zvo_disk->queue = zso->zvo_queue;
832 	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
833 	    ZVOL_DEV_NAME, (dev & MINORMASK));
834 
835 	return (zv);
836 
837 out_queue:
838 	blk_cleanup_queue(zso->zvo_queue);
839 out_kmem:
840 	kmem_free(zso, sizeof (struct zvol_state_os));
841 	kmem_free(zv, sizeof (zvol_state_t));
842 	return (NULL);
843 }
844 
845 /*
846  * Cleanup then free a zvol_state_t which was created by zvol_alloc().
847  * At this time, the structure is not opened by anyone, is taken off
848  * the zvol_state_list, and has its private data set to NULL.
849  * The zvol_state_lock is dropped.
850  *
851  * This function may take many milliseconds to complete (e.g. we've seen
852  * it take over 256ms), due to the calls to "blk_cleanup_queue" and
853  * "del_gendisk". Thus, consumers need to be careful to account for this
854  * latency when calling this function.
855  */
856 static void
857 zvol_free(zvol_state_t *zv)
858 {
859 
860 	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
861 	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
862 	ASSERT(zv->zv_open_count == 0);
863 	ASSERT(zv->zv_zso->zvo_disk->private_data == NULL);
864 
865 	rw_destroy(&zv->zv_suspend_lock);
866 	zfs_rangelock_fini(&zv->zv_rangelock);
867 
868 	del_gendisk(zv->zv_zso->zvo_disk);
869 	blk_cleanup_queue(zv->zv_zso->zvo_queue);
870 	put_disk(zv->zv_zso->zvo_disk);
871 
872 	ida_simple_remove(&zvol_ida,
873 	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
874 
875 	mutex_destroy(&zv->zv_state_lock);
876 	dataset_kstats_destroy(&zv->zv_kstat);
877 
878 	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
879 	kmem_free(zv, sizeof (zvol_state_t));
880 }
881 
882 /*
883  * Create a block device minor node and setup the linkage between it
884  * and the specified volume.  Once this function returns the block
885  * device is live and ready for use.
886  */
887 static int
888 zvol_os_create_minor(const char *name)
889 {
890 	zvol_state_t *zv;
891 	objset_t *os;
892 	dmu_object_info_t *doi;
893 	uint64_t volsize;
894 	uint64_t len;
895 	unsigned minor = 0;
896 	int error = 0;
897 	int idx;
898 	uint64_t hash = zvol_name_hash(name);
899 
900 	if (zvol_inhibit_dev)
901 		return (0);
902 
903 	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
904 	if (idx < 0)
905 		return (SET_ERROR(-idx));
906 	minor = idx << ZVOL_MINOR_BITS;
907 
908 	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
909 	if (zv) {
910 		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
911 		mutex_exit(&zv->zv_state_lock);
912 		ida_simple_remove(&zvol_ida, idx);
913 		return (SET_ERROR(EEXIST));
914 	}
915 
916 	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
917 
918 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
919 	if (error)
920 		goto out_doi;
921 
922 	error = dmu_object_info(os, ZVOL_OBJ, doi);
923 	if (error)
924 		goto out_dmu_objset_disown;
925 
926 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
927 	if (error)
928 		goto out_dmu_objset_disown;
929 
930 	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
931 	if (zv == NULL) {
932 		error = SET_ERROR(EAGAIN);
933 		goto out_dmu_objset_disown;
934 	}
935 	zv->zv_hash = hash;
936 
937 	if (dmu_objset_is_snapshot(os))
938 		zv->zv_flags |= ZVOL_RDONLY;
939 
940 	zv->zv_volblocksize = doi->doi_data_block_size;
941 	zv->zv_volsize = volsize;
942 	zv->zv_objset = os;
943 
944 	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
945 
946 	blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
947 	    (DMU_MAX_ACCESS / 4) >> 9);
948 	blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
949 	blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
950 	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
951 	    zv->zv_volblocksize);
952 	blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
953 	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
954 	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
955 	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
956 	    zv->zv_volblocksize);
957 	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
958 #ifdef QUEUE_FLAG_NONROT
959 	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
960 #endif
961 #ifdef QUEUE_FLAG_ADD_RANDOM
962 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
963 #endif
964 	/* This flag was introduced in kernel version 4.12. */
965 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
966 	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
967 #endif
968 
969 	if (spa_writeable(dmu_objset_spa(os))) {
970 		if (zil_replay_disable)
971 			zil_destroy(dmu_objset_zil(os), B_FALSE);
972 		else
973 			zil_replay(os, zv, zvol_replay_vector);
974 	}
975 	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
976 	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
977 
978 	/*
979 	 * When udev detects the addition of the device it will immediately
980 	 * invoke blkid(8) to determine the type of content on the device.
981 	 * Prefetching the blocks commonly scanned by blkid(8) will speed
982 	 * up this process.
983 	 */
984 	len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
985 	if (len > 0) {
986 		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
987 		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
988 		    ZIO_PRIORITY_SYNC_READ);
989 	}
990 
991 	zv->zv_objset = NULL;
992 out_dmu_objset_disown:
993 	dmu_objset_disown(os, B_TRUE, FTAG);
994 out_doi:
995 	kmem_free(doi, sizeof (dmu_object_info_t));
996 
997 	/*
998 	 * Keep in mind that once add_disk() is called, the zvol is
999 	 * announced to the world, and zvol_open()/zvol_release() can
1000 	 * be called at any time. Incidentally, add_disk() itself calls
1001 	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
1002 	 * directly as well.
1003 	 */
1004 	if (error == 0) {
1005 		rw_enter(&zvol_state_lock, RW_WRITER);
1006 		zvol_insert(zv);
1007 		rw_exit(&zvol_state_lock);
1008 		add_disk(zv->zv_zso->zvo_disk);
1009 	} else {
1010 		ida_simple_remove(&zvol_ida, idx);
1011 	}
1012 
1013 	return (error);
1014 }
1015 
1016 static void
1017 zvol_rename_minor(zvol_state_t *zv, const char *newname)
1018 {
1019 	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
1020 
1021 	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1022 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1023 
1024 	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1025 
1026 	/* move to new hashtable entry  */
1027 	zv->zv_hash = zvol_name_hash(zv->zv_name);
1028 	hlist_del(&zv->zv_hlink);
1029 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1030 
1031 	/*
1032 	 * The block device's read-only state is briefly changed causing
1033 	 * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
1034 	 * the name change and fixes the symlinks.  This does not change
1035 	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
1036 	 * changes.  This would normally be done using kobject_uevent() but
1037 	 * that is a GPL-only symbol which is why we need this workaround.
1038 	 */
1039 	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
1040 	set_disk_ro(zv->zv_zso->zvo_disk, readonly);
1041 }
1042 
/*
 * Platform hook (zv_set_disk_ro): pass flags straight to set_disk_ro()
 * for the zvol's gendisk.
 */
static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{

	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}
1049 
/*
 * Platform hook (zv_set_capacity): pass capacity straight to
 * set_capacity() for the zvol's gendisk.
 */
static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{

	set_capacity(zv->zv_zso->zvo_disk, capacity);
}
1056 
1057 const static zvol_platform_ops_t zvol_linux_ops = {
1058 	.zv_free = zvol_free,
1059 	.zv_rename_minor = zvol_rename_minor,
1060 	.zv_create_minor = zvol_os_create_minor,
1061 	.zv_update_volsize = zvol_update_volsize,
1062 	.zv_clear_private = zvol_clear_private,
1063 	.zv_is_zvol = zvol_is_zvol_impl,
1064 	.zv_set_disk_ro = zvol_set_disk_ro_impl,
1065 	.zv_set_capacity = zvol_set_capacity_impl,
1066 };
1067 
1068 int
1069 zvol_init(void)
1070 {
1071 	int error;
1072 	int threads = MIN(MAX(zvol_threads, 1), 1024);
1073 
1074 	error = register_blkdev(zvol_major, ZVOL_DRIVER);
1075 	if (error) {
1076 		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
1077 		return (error);
1078 	}
1079 	zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
1080 	    threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
1081 	if (zvol_taskq == NULL) {
1082 		unregister_blkdev(zvol_major, ZVOL_DRIVER);
1083 		return (-ENOMEM);
1084 	}
1085 	zvol_init_impl();
1086 	blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
1087 	    THIS_MODULE, zvol_probe, NULL, NULL);
1088 
1089 	ida_init(&zvol_ida);
1090 	zvol_register_ops(&zvol_linux_ops);
1091 	return (0);
1092 }
1093 
/*
 * Module teardown for the Linux zvol layer: shut down the common zvol
 * code, then release the probe region, the block major, the I/O taskq
 * and the minor allocator set up by zvol_init().
 */
void
zvol_fini(void)
{
	zvol_fini_impl();
	blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
	unregister_blkdev(zvol_major, ZVOL_DRIVER);
	taskq_destroy(zvol_taskq);
	ida_destroy(&zvol_ida);
}
1103 
/*
 * Module parameter registrations.  Parameters registered with mode 0444
 * are read-only through sysfs; those with mode 0644 are also writable by
 * the file's owner.
 */
/* BEGIN CSTYLED */
module_param(zvol_inhibit_dev, uint, 0644);
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");

module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
/* END CSTYLED */
1126