/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>

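/*
 * Tunables; exposed as module parameters via the module_param()
 * declarations at the bottom of this file.
 */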
unsigned int zvol_major = ZVOL_MAJOR;
unsigned int zvol_request_sync = 0;
unsigned int zvol_prefetch_bytes = (128 * 1024);
unsigned long zvol_max_discard_blocks = 16384;
unsigned int zvol_threads = 32;

struct zvol_state_os {
	struct gendisk		*zvo_disk;	/* generic disk */
	struct request_queue	*zvo_queue;	/* request queue */
	dev_t			zvo_dev;	/* device id */
};

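/*
 * Taskq used to service zvol I/O asynchronously; zvol_ida hands out the
 * minor-number indexes used for the /dev/zdN device nodes.
 */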
taskq_t *zvol_taskq;
static struct ida zvol_ida;

typedef struct zv_request_stack {
	zvol_state_t	*zv;
	struct bio	*bio;
} zv_request_t;

typedef struct zv_request_task {
	zv_request_t zvr;
	taskq_ent_t	ent;
} zv_request_task_t;

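/*
 * Wrap a zv_request_t in a heap-allocated task so it can be dispatched
 * to zvol_taskq; the taskq callbacks free it via zv_request_task_free().
 */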
static zv_request_task_t *
zv_request_task_create(zv_request_t zvr)
{
	zv_request_task_t *task;
	task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
	taskq_init_ent(&task->ent);
	task->zvr = zvr;
	return (task);
}

static void
zv_request_task_free(zv_request_task_t *task)
{
	kmem_free(task, sizeof (*task));
}

/*
 * Given a path, return TRUE if path is a ZVOL.
 */
static boolean_t
zvol_is_zvol_impl(const char *path)
{
	dev_t dev = 0;

	if (vdev_lookup_bdev(path, &dev) != 0)
		return (B_FALSE);

	if (MAJOR(dev) == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}

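/*
 * Process a write bio.  Called with zv_suspend_lock held as reader;
 * the lock is dropped before completion is signaled with BIO_END_IO().
 */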
static void
zvol_write(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_bvec_init(&uio, bio);

	zvol_state_t *zv = zvr->zv;
	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	/* bios marked as FLUSH must flush any pending data before the write */
	if (bio_is_flush(bio))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else. */
	if (uio.uio_resid == 0) {
		rw_exit(&zv->zv_suspend_lock);
		BIO_END_IO(bio, 0);
		return;
	}

	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	ssize_t start_resid = uio.uio_resid;
	unsigned long start_time;

	boolean_t acct = blk_queue_io_stat(q);
	if (acct)
		start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);

	boolean_t sync =
	    bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);

	if (acct)
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);

	BIO_END_IO(bio, -error);
}

static void
zvol_write_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_write(&task->zvr);
	zv_request_task_free(task);
}

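/*
 * Process a discard or secure-erase bio by freeing the backing DMU
 * range.  Called with zv_suspend_lock held as reader; the lock is
 * dropped before completion is signaled with BIO_END_IO().
 */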
static void
zvol_discard(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	unsigned long start_time;

	boolean_t acct = blk_queue_io_stat(q);
	if (acct)
		start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);

	sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required.  This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
	 */
	if (!bio_is_secure_erase(bio)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN(end, zv->zv_volblocksize);
		size = end - start;
	}

	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size, B_TRUE);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);

	if (acct)
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);

	BIO_END_IO(bio, -error);
}

static void
zvol_discard_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_discard(&task->zvr);
	zv_request_task_free(task);
}

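/*
 * Process a read bio.  Called with zv_suspend_lock held as reader;
 * the lock is dropped before completion is signaled with BIO_END_IO().
 */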
static void
zvol_read(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_bvec_init(&uio, bio);

	zvol_state_t *zv = zvr->zv;
	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	ssize_t start_resid = uio.uio_resid;
	unsigned long start_time;

	boolean_t acct = blk_queue_io_stat(q);
	if (acct)
		start_time = blk_generic_start_io_acct(q, disk, READ, bio);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);

	if (acct)
		blk_generic_end_io_acct(q, disk, READ, bio, start_time);

	BIO_END_IO(bio, -error);
}

static void
zvol_read_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_read(&task->zvr);
	zv_request_task_free(task);
}

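/*
 * Block layer entry point for zvol I/O.  Depending on the kernel
 * version this is registered either as the submit_bio member of
 * block_device_operations or as the queue's make_request function.
 */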
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
#endif
	zvol_state_t *zv = q->queuedata;
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	int rw = bio_data_dir(bio);

	if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO
		    "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		BIO_END_IO(bio, -SET_ERROR(EIO));
		goto out;
	}

	zv_request_t zvr = {
		.zv = zv,
		.bio = bio,
	};
	zv_request_task_t *task;

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			BIO_END_IO(bio, -SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened.  Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
		 */
		if (zv->zv_zilog == NULL) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
				/* replay / destroy done in zvol_create_minor */
				VERIFY0((zv->zv_zilog->zl_header->zh_flags &
				    ZIL_REPLAY_NEEDED));
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write).  We will indicate that the i/o is
		 * complete by calling BIO_END_IO() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes.  If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system.  This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol.  However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call BIO_END_IO().  Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
		if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
			if (zvol_request_sync) {
				zvol_discard(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(zvol_taskq,
				    zvol_discard_task, task, 0, &task->ent);
			}
		} else {
			if (zvol_request_sync) {
				zvol_write(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(zvol_taskq,
				    zvol_write_task, task, 0, &task->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes.  These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			BIO_END_IO(bio, 0);
			goto out;
		}

		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/* See comment in WRITE case above. */
		if (zvol_request_sync) {
			zvol_read(&zvr);
		} else {
			task = zv_request_task_create(zvr);
			taskq_dispatch_ent(zvol_taskq,
			    zvol_read_task, task, 0, &task->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
	return (BLK_QC_T_NONE);
#endif
}

static int
zvol_open(struct block_device *bdev, fmode_t flag)
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	/*
	 * Obtain a copy of private_data under the zvol_state_lock to make
	 * sure that either the result of zvol free code path setting
	 * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
	 * is not called on this zv because of the positive zv_open_count.
	 */
	zv = bdev->bd_disk->private_data;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(-ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	/*
	 * make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
		if (error)
			goto out_mutex;
	}

	if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		error = -EROFS;
		goto out_open_count;
	}

	zv->zv_open_count++;

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	zfs_check_media_change(bdev);

	return (0);

out_open_count:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);

out_mutex:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	if (error == -EINTR) {
		error = -ERESTARTSYS;
		schedule();
	}
	return (SET_ERROR(error));
}

static void
zvol_release(struct gendisk *disk, fmode_t mode)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}

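/*
 * Handle the ioctls a zvol accepts: BLKFLSBUF flushes dirty data out
 * to the pool and BLKZNAME copies out the name of the backing dataset.
 *
 * A hypothetical userspace sketch for BLKZNAME (the ioctl number comes
 * from the ZFS headers; the device path is only an example):
 *
 *	char name[MAXNAMELEN];
 *	int fd = open("/dev/zd0", O_RDONLY);
 *	if (fd >= 0 && ioctl(fd, BLKZNAME, name) == 0)
 *		printf("%s\n", name);
 */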
static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	ASSERT3U(zv->zv_open_count, >, 0);

	switch (cmd) {
	case BLKFLSBUF:
		fsync_bdev(bdev);
		invalidate_bdev(bdev);
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		if (!(zv->zv_flags & ZVOL_RDONLY))
			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

		rw_exit(&zv->zv_suspend_lock);
		break;

	case BLKZNAME:
		mutex_enter(&zv->zv_state_lock);
		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
		break;

	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif

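/*
 * Report (and clear) a pending media-change event so udev notices
 * changes such as a volume resize.
 */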
static unsigned int
zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
	unsigned int mask = 0;

	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
		zv->zv_changed = 0;
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (mask);
}

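/* Refresh the kernel's idea of the device capacity from zv_volsize. */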
static int
zvol_revalidate_disk(struct gendisk *disk)
{
	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		set_capacity(zv->zv_zso->zvo_disk,
		    zv->zv_volsize >> SECTOR_BITS);
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (0);
}

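/*
 * Propagate a volsize change to the kernel, using whichever disk
 * revalidation interface this kernel provides.
 */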
static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	struct gendisk *disk = zv->zv_zso->zvo_disk;

#if defined(HAVE_REVALIDATE_DISK_SIZE)
	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
#elif defined(HAVE_REVALIDATE_DISK)
	revalidate_disk(disk);
#else
	zvol_revalidate_disk(disk);
#endif
	return (0);
}

static void
zvol_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}

/*
 * Provide a simple virtual geometry for legacy compatibility.  For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices.  For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinder count reasonable.
 */
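/*
 * For example, a 1 GiB zvol has 2097152 512-byte sectors, so the
 * standard geometry yields 2097152 / (16 * 63) = 2080 cylinders.
 */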
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	sector_t sectors;

	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}

static struct block_device_operations zvol_ops = {
	.open			= zvol_open,
	.release		= zvol_release,
	.ioctl			= zvol_ioctl,
	.compat_ioctl		= zvol_compat_ioctl,
	.check_events		= zvol_check_events,
	.revalidate_disk	= zvol_revalidate_disk,
	.getgeo			= zvol_getgeo,
	.owner			= THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio		= zvol_submit_bio,
#endif
};

/*
 * Allocate memory for a new zvol_state_t and set up the required
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;
	uint64_t volmode;

	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
		return (NULL);

	if (volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	if (volmode == ZFS_VOLMODE_NONE)
		return (NULL);

	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;
	zv->zv_volmode = volmode;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
#endif
	if (zso->zvo_queue == NULL)
		goto out_kmem;

	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

	/* Disable write merging in favor of the ZIO pipeline. */
	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL)
		goto out_queue;

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;
	strlcpy(zv->zv_name, name, MAXNAMELEN);

	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);

	zso->zvo_disk->major = zvol_major;
	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;

	if (volmode == ZFS_VOLMODE_DEV) {
		/*
		 * ZFS_VOLMODE_DEV disables partitioning on ZVOL devices: set
		 * gendisk->minors = 1 as noted in include/linux/genhd.h.
		 * Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
		 * and suppress partition scanning (GENHD_FL_NO_PART_SCAN) by
		 * setting gendisk->flags accordingly.
		 */
		zso->zvo_disk->minors = 1;
#if defined(GENHD_FL_EXT_DEVT)
		zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
#endif
#if defined(GENHD_FL_NO_PART_SCAN)
		zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN;
#endif
	}
	zso->zvo_disk->first_minor = (dev & MINORMASK);
	zso->zvo_disk->fops = &zvol_ops;
	zso->zvo_disk->private_data = zv;
	zso->zvo_disk->queue = zso->zvo_queue;
	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
	    ZVOL_DEV_NAME, (dev & MINORMASK));

	return (zv);

out_queue:
	blk_cleanup_queue(zso->zvo_queue);
out_kmem:
	kmem_free(zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	return (NULL);
}

/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk". Thus, consumers need to be careful to account for this
 * latency when calling this function.
 */
static void
zvol_free(zvol_state_t *zv)
{

	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	del_gendisk(zv->zv_zso->zvo_disk);
	blk_cleanup_queue(zv->zv_zso->zvo_queue);
	put_disk(zv->zv_zso->zvo_disk);

	ida_simple_remove(&zvol_ida,
	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}

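/* No-op on Linux; stubbed out to satisfy the common zvol interface. */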
void
zvol_wait_close(zvol_state_t *zv)
{
}

/*
 * Create a block device minor node and set up the linkage between it
 * and the specified volume.  Once this function returns the block
 * device is live and ready for use.
 */
static int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;
	int idx;
	uint64_t hash = zvol_name_hash(name);

	if (zvol_inhibit_dev)
		return (0);

	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
	if (idx < 0)
		return (SET_ERROR(-idx));
	minor = idx << ZVOL_MINOR_BITS;

	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
	if (zv) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EEXIST));
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;
	}
	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

	blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
	    (DMU_MAX_ACCESS / 4) >> 9);
	blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
	blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#ifdef QUEUE_FLAG_NONROT
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
#endif
	/* This flag was introduced in kernel version 4.12. */
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif

	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(zv->zv_zilog, B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
	 */
	len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));

	/*
	 * Keep in mind that once add_disk() is called, the zvol is
	 * announced to the world, and zvol_open()/zvol_release() can
	 * be called at any time. Incidentally, add_disk() itself calls
	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
	 * directly as well.
	 */
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		rw_exit(&zvol_state_lock);
		add_disk(zv->zv_zso->zvo_disk);
	} else {
		ida_simple_remove(&zvol_ida, idx);
	}

	return (error);
}

static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);

	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	/*
	 * The block device's read-only state is briefly changed causing
	 * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
	 * the name change and fixes the symlinks.  This does not change
	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
	 * changes.  This would normally be done using kobject_uevent() but
	 * that is a GPL-only symbol which is why we need this workaround.
	 */
	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
	set_disk_ro(zv->zv_zso->zvo_disk, readonly);
}

static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{

	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{

	set_capacity(zv->zv_zso->zvo_disk, capacity);
}

static const zvol_platform_ops_t zvol_linux_ops = {
	.zv_free = zvol_free,
	.zv_rename_minor = zvol_rename_minor,
	.zv_create_minor = zvol_os_create_minor,
	.zv_update_volsize = zvol_update_volsize,
	.zv_clear_private = zvol_clear_private,
	.zv_is_zvol = zvol_is_zvol_impl,
	.zv_set_disk_ro = zvol_set_disk_ro_impl,
	.zv_set_capacity = zvol_set_capacity_impl,
};

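/*
 * Module load hook: register the zvol block major, create the taskq
 * that services I/O requests, and hand the Linux platform ops to the
 * common zvol code.
 */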
int
zvol_init(void)
{
	int error;
	int threads = MIN(MAX(zvol_threads, 1), 1024);

	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (error);
	}
	zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
	    threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
	if (zvol_taskq == NULL) {
		unregister_blkdev(zvol_major, ZVOL_DRIVER);
		return (-ENOMEM);
	}
	zvol_init_impl();
	ida_init(&zvol_ida);
	zvol_register_ops(&zvol_linux_ops);
	return (0);
}

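/* Module unload hook; undo everything done by zvol_init(). */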
void
zvol_fini(void)
{
	zvol_fini_impl();
	unregister_blkdev(zvol_major, ZVOL_DRIVER);
	taskq_destroy(zvol_taskq);
	ida_destroy(&zvol_ida);
}

/* BEGIN CSTYLED */
module_param(zvol_inhibit_dev, uint, 0644);
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");

module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
/* END CSTYLED */