/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>

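/*
 * Module tunables; their descriptions and permission bits are given by
 * the module_param()/MODULE_PARM_DESC() entries at the end of this file.
 */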
unsigned int zvol_major = ZVOL_MAJOR;
unsigned int zvol_request_sync = 0;
unsigned int zvol_prefetch_bytes = (128 * 1024);
unsigned long zvol_max_discard_blocks = 16384;
unsigned int zvol_threads = 32;

struct zvol_state_os {
	struct gendisk		*zvo_disk;	/* generic disk */
	struct request_queue	*zvo_queue;	/* request queue */
	dev_t			zvo_dev;	/* device id */
};

taskq_t *zvol_taskq;
static struct ida zvol_ida;

typedef struct zv_request_stack {
	zvol_state_t	*zv;
	struct bio	*bio;
} zv_request_t;

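/*
 * A taskq_ent_t is embedded in each task wrapper so that the request can
 * later be queued with taskq_dispatch_ent(), which uses the pre-initialized
 * entry instead of allocating one in the dispatch path; only the wrapper
 * itself is allocated here, with KM_SLEEP.
 */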
typedef struct zv_request_task {
	zv_request_t zvr;
	taskq_ent_t	ent;
} zv_request_task_t;

static zv_request_task_t *
zv_request_task_create(zv_request_t zvr)
{
	zv_request_task_t *task;
	task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
	taskq_init_ent(&task->ent);
	task->zvr = zvr;
	return (task);
}

static void
zv_request_task_free(zv_request_task_t *task)
{
	kmem_free(task, sizeof (*task));
}

/*
 * Given a path, return TRUE if path is a ZVOL.
 */
static boolean_t
zvol_is_zvol_impl(const char *path)
{
	dev_t dev = 0;

	if (vdev_lookup_bdev(path, &dev) != 0)
		return (B_FALSE);

	if (MAJOR(dev) == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}

static void
zvol_write(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_bvec_init(&uio, bio);

	zvol_state_t *zv = zvr->zv;
	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	/* A bio marked as FLUSH must first flush any pending writes. */
	if (bio_is_flush(bio))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else. */
	if (uio.uio_resid == 0) {
		rw_exit(&zv->zv_suspend_lock);
		BIO_END_IO(bio, 0);
		return;
	}

	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	ssize_t start_resid = uio.uio_resid;
	unsigned long start_time;

	boolean_t acct = blk_queue_io_stat(q);
	if (acct)
		start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);

	boolean_t sync =
	    bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

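	/*
	 * Copy the data in chunks of at most DMU_MAX_ACCESS / 2 bytes, so
	 * that each chunk gets its own transaction and the write holds stay
	 * within the DMU's per-tx limits.  A failure partway through leaves
	 * the chunks already committed in place.
	 */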
	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);

	if (acct)
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);

	BIO_END_IO(bio, -error);
}

static void
zvol_write_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_write(&task->zvr);
	zv_request_task_free(task);
}

static void
zvol_discard(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	unsigned long start_time;

	boolean_t acct = blk_queue_io_stat(q);
	if (acct)
		start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);

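	/*
	 * A FUA bio, or a volume with sync=always, requires the discard to
	 * be committed to the ZIL before the bio completes; see the
	 * zil_commit() call below.
	 */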
	sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required.  This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts, which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
	 */
	if (!bio_is_secure_erase(bio)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN(end, zv->zv_volblocksize);
		size = end - start;
	}

	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size, B_TRUE);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);

	if (acct)
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);

	BIO_END_IO(bio, -error);
}

static void
zvol_discard_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_discard(&task->zvr);
	zv_request_task_free(task);
}

static void
zvol_read(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_bvec_init(&uio, bio);

	zvol_state_t *zv = zvr->zv;
	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	ssize_t start_resid = uio.uio_resid;
	unsigned long start_time;

	boolean_t acct = blk_queue_io_stat(q);
	if (acct)
		start_time = blk_generic_start_io_acct(q, disk, READ, bio);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);

	if (acct)
		blk_generic_end_io_acct(q, disk, READ, bio, start_time);

	BIO_END_IO(bio, -error);
}

static void
zvol_read_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_read(&task->zvr);
	zv_request_task_free(task);
}

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
#endif
	zvol_state_t *zv = q->queuedata;
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	int rw = bio_data_dir(bio);

	if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO
		    "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		BIO_END_IO(bio, -SET_ERROR(EIO));
		goto out;
	}

	zv_request_t zvr = {
		.zv = zv,
		.bio = bio,
	};
	zv_request_task_t *task;

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			BIO_END_IO(bio, -SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened.  Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
		 */
		if (zv->zv_zilog == NULL) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
				/* replay / destroy done in zvol_create_minor */
				VERIFY0((zv->zv_zilog->zl_header->zh_flags &
				    ZIL_REPLAY_NEEDED));
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write).  We will indicate that the i/o is
		 * complete by calling BIO_END_IO() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes.  If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system.  This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol.  However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call BIO_END_IO().  Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
		if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
			if (zvol_request_sync) {
				zvol_discard(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(zvol_taskq,
				    zvol_discard_task, task, 0, &task->ent);
			}
		} else {
			if (zvol_request_sync) {
				zvol_write(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(zvol_taskq,
				    zvol_write_task, task, 0, &task->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes.  These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			BIO_END_IO(bio, 0);
			goto out;
		}

		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/* See comment in WRITE case above. */
		if (zvol_request_sync) {
			zvol_read(&zvr);
		} else {
			task = zv_request_task_create(zvr);
			taskq_dispatch_ent(zvol_taskq,
			    zvol_read_task, task, 0, &task->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
	return (BLK_QC_T_NONE);
#endif
}

static int
zvol_open(struct block_device *bdev, fmode_t flag)
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	/*
	 * Obtain a copy of private_data under the zvol_state_lock to make
	 * sure that either the result of the zvol free code path setting
	 * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
	 * is not called on this zv because of the positive zv_open_count.
	 */
	zv = bdev->bd_disk->private_data;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(-ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	/*
	 * make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
		if (error)
			goto out_mutex;
	}

	if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		error = -EROFS;
		goto out_open_count;
	}

	zv->zv_open_count++;

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	zfs_check_media_change(bdev);

	return (0);

out_open_count:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);

out_mutex:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
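	/*
	 * -EINTR is converted to -ERESTARTSYS so that the VFS transparently
	 * restarts the open(); the schedule() call presumably yields the CPU
	 * so the conflicting operation can make progress before the retry.
	 */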
	if (error == -EINTR) {
		error = -ERESTARTSYS;
		schedule();
	}
	return (SET_ERROR(error));
}

static void
zvol_release(struct gendisk *disk, fmode_t mode)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}

static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	ASSERT3U(zv->zv_open_count, >, 0);

	switch (cmd) {
	case BLKFLSBUF:
		fsync_bdev(bdev);
		invalidate_bdev(bdev);
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		if (!(zv->zv_flags & ZVOL_RDONLY))
			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

		rw_exit(&zv->zv_suspend_lock);
		break;

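	/*
	 * BLKZNAME is a ZFS-specific ioctl which copies the zvol's dataset
	 * name out to the caller.
	 */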
	case BLKZNAME:
		mutex_enter(&zv->zv_state_lock);
		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
		break;

	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif

static unsigned int
zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
	unsigned int mask = 0;

	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
		zv->zv_changed = 0;
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (mask);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		set_capacity(zv->zv_zso->zvo_disk,
		    zv->zv_volsize >> SECTOR_BITS);
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (0);
}

static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	struct gendisk *disk = zv->zv_zso->zvo_disk;

#if defined(HAVE_REVALIDATE_DISK_SIZE)
	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
#elif defined(HAVE_REVALIDATE_DISK)
	revalidate_disk(disk);
#else
	zvol_revalidate_disk(disk);
#endif
	return (0);
}

static void
zvol_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}

/*
 * Provide a simple virtual geometry for legacy compatibility.  For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices.  For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	sector_t sectors;

	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}

static struct block_device_operations zvol_ops = {
	.open			= zvol_open,
	.release		= zvol_release,
	.ioctl			= zvol_ioctl,
	.compat_ioctl		= zvol_compat_ioctl,
	.check_events		= zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk	= zvol_revalidate_disk,
#endif
	.getgeo			= zvol_getgeo,
	.owner			= THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio		= zvol_submit_bio,
#endif
};

/*
 * Allocate memory for a new zvol_state_t and setup the required
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;
	uint64_t volmode;

	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
		return (NULL);

	if (volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	if (volmode == ZFS_VOLMODE_NONE)
		return (NULL);

	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;
	zv->zv_volmode = volmode;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);

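	/*
	 * Three allocation paths follow, selected by configure-time kernel
	 * feature tests: combined gendisk/queue allocation via
	 * blk_alloc_disk(), a separate blk_alloc_queue() on kernels that
	 * take submit_bio in block_device_operations, and the legacy
	 * make_request_fn style via the blk_generic_alloc_queue() compat
	 * wrapper.
	 */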
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#ifdef HAVE_BLK_ALLOC_DISK
	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
	if (zso->zvo_disk == NULL)
		goto out_kmem;

	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
#else
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		goto out_kmem;

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		goto out_kmem;
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_BLK_ALLOC_DISK */
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		goto out_kmem;

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		goto out_kmem;
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */

	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

	/* Disable write merging in favor of the ZIO pipeline. */
	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);

	/* Enable /proc/diskstats */
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;
	strlcpy(zv->zv_name, name, MAXNAMELEN);

	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);

	zso->zvo_disk->major = zvol_major;
	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;

	if (volmode == ZFS_VOLMODE_DEV) {
		/*
		 * ZFS_VOLMODE_DEV disables partitioning on ZVOL devices: set
		 * gendisk->minors = 1 as noted in include/linux/genhd.h.
		 * Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
		 * and suppress partition scanning (GENHD_FL_NO_PART_SCAN) by
		 * setting gendisk->flags accordingly.
		 */
		zso->zvo_disk->minors = 1;
#if defined(GENHD_FL_EXT_DEVT)
		zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
#endif
#if defined(GENHD_FL_NO_PART_SCAN)
		zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN;
#endif
	}
	zso->zvo_disk->first_minor = (dev & MINORMASK);
	zso->zvo_disk->fops = &zvol_ops;
	zso->zvo_disk->private_data = zv;
	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
	    ZVOL_DEV_NAME, (dev & MINORMASK));

	return (zv);

out_kmem:
	kmem_free(zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	return (NULL);
}

/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk". Thus, consumers need to be careful to account for this
 * latency when calling this function.
 */
static void
zvol_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	del_gendisk(zv->zv_zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	defined(HAVE_BLK_ALLOC_DISK)
	blk_cleanup_disk(zv->zv_zso->zvo_disk);
#else
	blk_cleanup_queue(zv->zv_zso->zvo_queue);
	put_disk(zv->zv_zso->zvo_disk);
#endif

	ida_simple_remove(&zvol_ida,
	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}

void
zvol_wait_close(zvol_state_t *zv)
{
}

/*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume.  Once this function returns the block
 * device is live and ready for use.
 */
static int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;
	int idx;
	uint64_t hash = zvol_name_hash(name);

	if (zvol_inhibit_dev)
		return (0);

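	/*
	 * Each zvol is allotted a contiguous block of 2^ZVOL_MINOR_BITS
	 * minor numbers (for the device and its partitions), so the ida
	 * index is shifted left by ZVOL_MINOR_BITS to form the first minor
	 * of the range.
	 */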
	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
	if (idx < 0)
		return (SET_ERROR(-idx));
	minor = idx << ZVOL_MINOR_BITS;

	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
	if (zv) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EEXIST));
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;
	}
	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

	blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
	    (DMU_MAX_ACCESS / 4) >> 9);
	blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
	blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#ifdef QUEUE_FLAG_NONROT
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
#endif
	/* This flag was introduced in kernel version 4.12. */
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif

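	/*
	 * Open the ZIL once here so that any outstanding log records are
	 * replayed (or destroyed, when zil_replay_disable is set) while the
	 * objset is owned, then close it again; the ZIL is reopened lazily
	 * on the first write (see the WRITE path in zvol_request()).
	 */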
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(zv->zv_zilog, B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
	 */
	len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));

	/*
	 * Keep in mind that once add_disk() is called, the zvol is
	 * announced to the world, and zvol_open()/zvol_release() can
	 * be called at any time. Incidentally, add_disk() itself calls
	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
	 * directly as well.
	 */
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		rw_exit(&zvol_state_lock);
		add_disk(zv->zv_zso->zvo_disk);
	} else {
		ida_simple_remove(&zvol_ida, idx);
	}

	return (error);
}

static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);

	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	/*
	 * The block device's read-only state is briefly changed causing
	 * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
	 * the name change and fixes the symlinks.  This does not change
	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
	 * changes.  This would normally be done using kobject_uevent() but
	 * that is a GPL-only symbol which is why we need this workaround.
	 */
	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
	set_disk_ro(zv->zv_zso->zvo_disk, readonly);
}

static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{
	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{
	set_capacity(zv->zv_zso->zvo_disk, capacity);
}

static const zvol_platform_ops_t zvol_linux_ops = {
	.zv_free = zvol_free,
	.zv_rename_minor = zvol_rename_minor,
	.zv_create_minor = zvol_os_create_minor,
	.zv_update_volsize = zvol_update_volsize,
	.zv_clear_private = zvol_clear_private,
	.zv_is_zvol = zvol_is_zvol_impl,
	.zv_set_disk_ro = zvol_set_disk_ro_impl,
	.zv_set_capacity = zvol_set_capacity_impl,
};

int
zvol_init(void)
{
	int error;
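	/* Clamp the configured zvol_threads to a sane range of [1, 1024]. */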
	int threads = MIN(MAX(zvol_threads, 1), 1024);

	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (error);
	}
	zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
	    threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
	if (zvol_taskq == NULL) {
		unregister_blkdev(zvol_major, ZVOL_DRIVER);
		return (-ENOMEM);
	}
	zvol_init_impl();
	ida_init(&zvol_ida);
	zvol_register_ops(&zvol_linux_ops);
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
	unregister_blkdev(zvol_major, ZVOL_DRIVER);
	taskq_destroy(zvol_taskq);
	ida_destroy(&zvol_ida);
}

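/*
 * The parameters below with mode 0644 can also be changed at runtime via
 * /sys/module/zfs/parameters/; the 0444 ones are read-only after module
 * load and must be set on the module command line, e.g.
 * "modprobe zfs zvol_major=230 zvol_threads=64".
 */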
/* BEGIN CSTYLED */
module_param(zvol_inhibit_dev, uint, 0644);
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");

module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
/* END CSTYLED */