xref: /titanic_41/usr/src/uts/common/avs/ns/sv/sv.c (revision b2514ea1e4c90e705852a2668ed730087a89f38c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
26  */
27 
28 /*
29  * Storage Volume Character and Block Driver (SV)
30  *
31  * This driver implements a simplistic /dev/{r}dsk/ interface to a
32  * specified disk volume that is otherwise managed by the Prism
33  * software.  The SV driver layers itself onto the underlying disk
34  * device driver by changing function pointers in the cb_ops
35  * structure.
36  *
37  * CONFIGURATION:
38  *
39  * 1. Configure the driver using the svadm utility.
40  * 2. Access the device as before through /dev/rdsk/c?t?d?s?
41  *
42  * LIMITATIONS:
43  *
44  * This driver should NOT be used to share a device between another
45  * DataServices user interface module (e.g., STE) and a user accessing
46  * the device through the block device in O_WRITE mode.  This is because
47  * writes through the block device are asynchronous (due to the page
48  * cache) and so consistency between the block device user and the
49  * STE user cannot be guaranteed.
50  *
51  * Data is copied between system struct buf(9s) and nsc_vec_t.  This is
52  * wasteful and slow.
53  */
54 
55 #include <sys/debug.h>
56 #include <sys/types.h>
57 
58 #include <sys/ksynch.h>
59 #include <sys/kmem.h>
60 #include <sys/errno.h>
61 #include <sys/varargs.h>
62 #include <sys/file.h>
63 #include <sys/open.h>
64 #include <sys/conf.h>
65 #include <sys/cred.h>
66 #include <sys/buf.h>
67 #include <sys/uio.h>
68 #ifndef DS_DDICT
69 #include <sys/pathname.h>
70 #endif
71 #include <sys/aio_req.h>
72 #include <sys/dkio.h>
73 #include <sys/vtoc.h>
74 #include <sys/cmn_err.h>
75 #include <sys/modctl.h>
76 #include <sys/ddi.h>
77 #include <sys/sunddi.h>
78 #include <sys/sunldi.h>
79 #include <sys/nsctl/nsvers.h>
80 
81 #include <sys/nsc_thread.h>
82 #include <sys/unistat/spcs_s.h>
83 #include <sys/unistat/spcs_s_k.h>
84 #include <sys/unistat/spcs_errors.h>
85 
86 #ifdef DS_DDICT
87 #include "../contract.h"
88 #endif
89 
90 #include "../nsctl.h"
91 
92 
93 #include <sys/sdt.h>		/* dtrace is S10 or later */
94 
95 #include "sv.h"
96 #include "sv_impl.h"
97 #include "sv_efi.h"
98 
99 #define	MAX_EINTR_COUNT 1000
100 
101 /*
102  * sv_mod_status
103  */
104 #define	SV_PREVENT_UNLOAD 1
105 #define	SV_ALLOW_UNLOAD	2
106 
107 static const int sv_major_rev = ISS_VERSION_MAJ;	/* Major number */
108 static const int sv_minor_rev = ISS_VERSION_MIN;	/* Minor number */
109 static const int sv_micro_rev = ISS_VERSION_MIC;	/* Micro number */
110 static const int sv_baseline_rev = ISS_VERSION_NUM;	/* Baseline number */
111 
112 #ifdef DKIOCPARTITION
113 /*
114  * CRC32 polynomial table needed for computing the checksums
115  * in an EFI vtoc.
116  */
117 static const uint32_t sv_crc32_table[256] = { CRC32_TABLE };
118 #endif
119 
120 static clock_t sv_config_time;		/* Time of successful {en,dis}able */
121 static int sv_debug;			/* Set non-zero for debug to syslog */
122 static int sv_mod_status;		/* Set to prevent modunload */
123 
124 static dev_info_t *sv_dip;		/* Single DIP for driver */
125 static kmutex_t sv_mutex;		/* Protect global lists, etc. */
126 
127 static nsc_mem_t	*sv_mem;	/* nsctl memory allocator token */
128 
129 
130 /*
131  * Per device and per major state.
132  */
133 
134 #ifndef _SunOS_5_6
135 #define	UNSAFE_ENTER()
136 #define	UNSAFE_EXIT()
137 #else
138 #define	UNSAFE_ENTER()	mutex_enter(&unsafe_driver)
139 #define	UNSAFE_EXIT()	mutex_exit(&unsafe_driver)
140 #endif
141 
142 					/* hash table of major dev structures */
143 static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0};
144 static sv_dev_t *sv_devs;		/* array of per device structures */
145 static int sv_max_devices;		/* SV version of nsc_max_devices() */
146 static int sv_ndevices;			/* number of SV enabled devices */
147 
148 /*
149  * Threading.
150  */
151 
152 int sv_threads_max = 1024;		/* maximum # to dynamically alloc */
153 int sv_threads = 32;			/* # to pre-allocate (see sv.conf) */
154 int sv_threads_extra = 0;		/* addl # we would have alloc'ed */
155 
156 static nstset_t *sv_tset;		/* the threadset pointer */
157 
158 static int sv_threads_hysteresis = 4;	/* hysteresis for threadset resizing */
159 static int sv_threads_dev = 2;		/* # of threads to alloc per device */
160 static int sv_threads_inc = 8;		/* increment for changing the set */
161 static int sv_threads_needed;		/* number of threads needed */
162 static int sv_no_threads;		/* number of nsc_create errors */
163 static int sv_max_nlive;		/* max number of threads running */
164 
165 
166 
167 /*
168  * nsctl fd callbacks.
169  */
170 
171 static int svattach_fd(blind_t);
172 static int svdetach_fd(blind_t);
173 
/*
 * nsctl callback table passed to nsc_open() in sv_enable().
 * svattach_fd() refreshes the cached device geometry (sv_nblocks,
 * sv_maxfbas); svdetach_fd() clears it.
 */
static nsc_def_t sv_fd_def[] = {
	{ "Attach",	(uintptr_t)svattach_fd, },
	{ "Detach",	(uintptr_t)svdetach_fd, },
	{ 0, 0, }	/* list terminator */
};
179 
180 /*
181  * cb_ops functions.
182  */
183 
184 static int svopen(dev_t *, int, int, cred_t *);
185 static int svclose(dev_t, int, int, cred_t *);
186 static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *);
187 static int svprint(dev_t, char *);
188 
189 /*
190  * These next functions are layered into the underlying driver's devops.
191  */
192 
193 static int sv_lyr_open(dev_t *, int, int, cred_t *);
194 static int sv_lyr_close(dev_t, int, int, cred_t *);
195 static int sv_lyr_strategy(struct buf *);
196 static int sv_lyr_read(dev_t, struct uio *, cred_t *);
197 static int sv_lyr_write(dev_t, struct uio *, cred_t *);
198 static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *);
199 static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *);
200 static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
201 
/*
 * cb_ops for the sv control device itself (svadm talks to this via
 * ioctl).  The sv_lyr_* routines are installed into *other* drivers'
 * cb_ops at enable time (see sv_enable()), not here.
 */
static struct cb_ops sv_cb_ops = {
	svopen,		/* open */
	svclose,	/* close */
	nulldev,	/* strategy */
	svprint,	/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	svioctl,	/* ioctl */
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* poll */
	ddi_prop_op,	/* prop_op */
	NULL,		/* NOT a stream */
	D_NEW | D_MP | D_64BIT,	/* driver compatibility flags */
	CB_REV,		/* cb_ops revision */
	nodev,		/* aread */
	nodev,		/* awrite */
};
222 
223 
224 /*
225  * dev_ops functions.
226  */
227 
228 static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
229 static int sv_attach(dev_info_t *, ddi_attach_cmd_t);
230 static int sv_detach(dev_info_t *, ddi_detach_cmd_t);
231 
/*
 * dev_ops for the sv pseudo driver (single pseudo instance).
 */
static struct dev_ops sv_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* refcnt */
	sv_getinfo,	/* getinfo */
	nulldev,	/* identify */
	nulldev,	/* probe */
	sv_attach,	/* attach */
	sv_detach,	/* detach */
	nodev,		/* reset */
	&sv_cb_ops,	/* cb_ops for the control device */
	(struct bus_ops *)0	/* not a nexus driver */
};
244 
245 /*
246  * Module linkage.
247  */
248 
249 extern struct mod_ops mod_driverops;
250 
/* Loadable-driver linkage; the name string carries the version. */
static struct modldrv modldrv = {
	&mod_driverops,
	"nws:Storage Volume:" ISS_VERSION_STR,
	&sv_ops
};
256 
static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	0	/* terminates the linkage list */
};
262 
263 
/*
 * Module load entry point: initialise the global sv_mutex, register
 * the driver with the kernel, then log the driver revision.
 * Returns 0 on success or the mod_install() error.
 */
int
_init(void)
{
	int error;

	mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL);

	if ((error = mod_install(&modlinkage)) != 0) {
		/* undo the mutex_init on failure */
		mutex_destroy(&sv_mutex);
		return (error);
	}

#ifdef DEBUG
	cmn_err(CE_CONT, "!sv (revision %d.%d.%d.%d, %s, %s)\n",
	    sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev,
	    ISS_VERSION_STR, BUILD_DATE_STR);
#else
	/* non-DEBUG: omit the micro number when it is zero */
	if (sv_micro_rev) {
		cmn_err(CE_CONT, "!sv (revision %d.%d.%d, %s, %s)\n",
		    sv_major_rev, sv_minor_rev, sv_micro_rev,
		    ISS_VERSION_STR, BUILD_DATE_STR);
	} else {
		cmn_err(CE_CONT, "!sv (revision %d.%d, %s, %s)\n",
		    sv_major_rev, sv_minor_rev,
		    ISS_VERSION_STR, BUILD_DATE_STR);
	}
#endif

	return (error);	/* 0 here */
}
294 
295 
/*
 * Module unload entry point: remove the driver registration and,
 * only if that succeeds, destroy the global mutex.
 */
int
_fini(void)
{
	int error;

	if ((error = mod_remove(&modlinkage)) != 0)
		return (error);

	mutex_destroy(&sv_mutex);

	return (error);	/* 0 here */
}
308 
309 
/*
 * Module info entry point.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
315 
316 
317 /*
318  * Locking & State.
319  *
320  * sv_mutex protects config information - sv_maj_t and sv_dev_t lists;
321  * threadset creation and sizing; sv_ndevices.
322  *
323  * If we need to hold both sv_mutex and sv_lock, then the sv_mutex
324  * must be acquired first.
325  *
326  * sv_lock protects the sv_dev_t structure for an individual device.
327  *
328  * sv_olock protects the otyp/open members of the sv_dev_t.  If we need
329  * to hold both sv_lock and sv_olock, then the sv_lock must be acquired
330  * first.
331  *
332  * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple
333  * I/O operations to a device simultaneously, as above.
334  *
335  * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur
336  * with sv_lock write-locked must be done with (sv_state == SV_PENDING)
337  * and (sv_pending == curthread) so that any recursion through
338  * sv_lyr_open/sv_lyr_close can be detected.
339  */
340 
341 
342 static int
sv_init_devs(void)343 sv_init_devs(void)
344 {
345 	int i;
346 
347 	ASSERT(MUTEX_HELD(&sv_mutex));
348 
349 	if (sv_max_devices > 0)
350 		return (0);
351 
352 	sv_max_devices = nsc_max_devices();
353 
354 	if (sv_max_devices <= 0) {
355 		/* nsctl is not attached (nskernd not running) */
356 		if (sv_debug > 0)
357 			cmn_err(CE_CONT, "!sv: nsc_max_devices = 0\n");
358 		return (EAGAIN);
359 	}
360 
361 	sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)),
362 	    KM_NOSLEEP, sv_mem);
363 
364 	if (sv_devs == NULL) {
365 		cmn_err(CE_WARN, "!sv: could not allocate sv_devs array");
366 		return (ENOMEM);
367 	}
368 
369 	for (i = 0; i < sv_max_devices; i++) {
370 		mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL);
371 		rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL);
372 	}
373 
374 	if (sv_debug > 0)
375 		cmn_err(CE_CONT, "!sv: sv_init_devs successful\n");
376 
377 	return (0);
378 }
379 
380 
/*
 * attach(9E): create the "sv" control minor node, register with the
 * nsctl memory allocator, set up the per-device array, and read the
 * "sv_threads" tunable from sv.conf.
 *
 * Returns DDI_SUCCESS or DDI_FAILURE; on failure sv_detach() is
 * called to unwind any partial setup.
 */
static int
sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int rc;

	switch (cmd) {

	case DDI_ATTACH:
		sv_dip = dip;

		if (ddi_create_minor_node(dip, "sv", S_IFCHR,
		    0, DDI_PSEUDO, 0) != DDI_SUCCESS)
			goto failed;

		mutex_enter(&sv_mutex);

		sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0);
		if (sv_mem == NULL) {
			mutex_exit(&sv_mutex);
			goto failed;
		}

		/* EAGAIN (nsctl not attached yet) is not fatal here */
		rc = sv_init_devs();
		if (rc != 0 && rc != EAGAIN) {
			mutex_exit(&sv_mutex);
			goto failed;
		}

		mutex_exit(&sv_mutex);


		ddi_report_dev(dip);

		/* allow sv.conf to override the pre-allocated thread count */
		sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
		    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
		    "sv_threads", sv_threads);

		if (sv_debug > 0)
			cmn_err(CE_CONT, "!sv: sv_threads=%d\n", sv_threads);

		if (sv_threads > sv_threads_max)
			sv_threads_max = sv_threads;

		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

failed:
	DTRACE_PROBE(sv_attach_failed);
	(void) sv_detach(dip, DDI_DETACH);
	return (DDI_FAILURE);
}
435 
436 
/*
 * detach(9E): refuse while module unload is prevented or while any
 * device is still SV enabled; otherwise tear down the per-device
 * array, the nsctl memory token, and the control minor node.
 */
static int
sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	sv_dev_t *svp;
	int i;

	switch (cmd) {

	case DDI_DETACH:

		/*
		 * Check that everything is disabled.
		 */

		mutex_enter(&sv_mutex);

		if (sv_mod_status == SV_PREVENT_UNLOAD) {
			mutex_exit(&sv_mutex);
			DTRACE_PROBE(sv_detach_err_prevent);
			return (DDI_FAILURE);
		}

		for (i = 0; sv_devs && i < sv_max_devices; i++) {
			svp = &sv_devs[i];

			if (svp->sv_state != SV_DISABLE) {
				mutex_exit(&sv_mutex);
				DTRACE_PROBE(sv_detach_err_busy);
				return (DDI_FAILURE);
			}
		}


		/* destroy the per-device locks set up in sv_init_devs() */
		for (i = 0; sv_devs && i < sv_max_devices; i++) {
			mutex_destroy(&sv_devs[i].sv_olock);
			rw_destroy(&sv_devs[i].sv_lock);
		}

		if (sv_devs) {
			nsc_kmem_free(sv_devs,
			    (sv_max_devices * sizeof (*sv_devs)));
			sv_devs = NULL;
		}
		sv_max_devices = 0;

		if (sv_mem) {
			nsc_unregister_mem(sv_mem);
			sv_mem = NULL;
		}

		mutex_exit(&sv_mutex);

		/*
		 * Remove all minor nodes.
		 */

		ddi_remove_minor_node(dip, NULL);
		sv_dip = NULL;

		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}
}
502 
503 static sv_maj_t *
sv_getmajor(const dev_t dev)504 sv_getmajor(const dev_t dev)
505 {
506 	sv_maj_t **insert, *maj;
507 	major_t umaj = getmajor(dev);
508 
509 	/*
510 	 * See if the hash table entry, or one of the hash chains
511 	 * is already allocated for this major number
512 	 */
513 	if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) {
514 		do {
515 			if (maj->sm_major == umaj)
516 				return (maj);
517 		} while ((maj = maj->sm_next) != 0);
518 	}
519 
520 	/*
521 	 * If the sv_mutex is held, there is design flaw, as the only non-mutex
522 	 * held callers can be sv_enable() or sv_dev_to_sv()
523 	 * Return an error, instead of panicing the system
524 	 */
525 	if (MUTEX_HELD(&sv_mutex)) {
526 		cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
527 		return (NULL);
528 	}
529 
530 	/*
531 	 * Determine where to allocate a new element in the hash table
532 	 */
533 	mutex_enter(&sv_mutex);
534 	insert = &(sv_majors[SV_MAJOR_HASH(umaj)]);
535 	for (maj = *insert; maj; maj = maj->sm_next) {
536 
537 		/* Did another thread beat us to it? */
538 		if (maj->sm_major == umaj)
539 			return (maj);
540 
541 		/* Find a NULL insert point? */
542 		if (maj->sm_next == NULL)
543 			insert = &maj->sm_next;
544 	}
545 
546 	/*
547 	 * Located the new insert point
548 	 */
549 	*insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem);
550 	if ((maj = *insert) != 0)
551 		maj->sm_major = umaj;
552 	else
553 		cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
554 
555 	mutex_exit(&sv_mutex);
556 
557 	return (maj);
558 }
559 
560 /* ARGSUSED */
561 
562 static int
sv_getinfo(dev_info_t * dip,ddi_info_cmd_t infocmd,void * arg,void ** result)563 sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
564 {
565 	int rc = DDI_FAILURE;
566 
567 	switch (infocmd) {
568 
569 	case DDI_INFO_DEVT2DEVINFO:
570 		*result = sv_dip;
571 		rc = DDI_SUCCESS;
572 		break;
573 
574 	case DDI_INFO_DEVT2INSTANCE:
575 		/*
576 		 * We only have a single instance.
577 		 */
578 		*result = 0;
579 		rc = DDI_SUCCESS;
580 		break;
581 
582 	default:
583 		break;
584 	}
585 
586 	return (rc);
587 }
588 
589 
590 /*
591  * Hashing of devices onto major device structures.
592  *
593  * Individual device structures are hashed onto one of the sm_hash[]
594  * buckets in the relevant major device structure.
595  *
596  * Hash insertion and deletion -must- be done with sv_mutex held.  Hash
597  * searching does not require the mutex because of the sm_seq member.
598  * sm_seq is incremented on each insertion (-after- hash chain pointer
599  * manipulation) and each deletion (-before- hash chain pointer
600  * manipulation).  When searching the hash chain, the seq number is
601  * checked before accessing each device structure, if the seq number has
602  * changed, then we restart the search from the top of the hash chain.
603  * If we restart more than SV_HASH_RETRY times, we take sv_mutex and search
604  * the hash chain (we are guaranteed that this search cannot be
605  * interrupted).
606  */
607 
608 #define	SV_HASH_RETRY	16
609 
/*
 * Look up the sv_dev_t for "dev", optionally returning the major
 * structure through *majpp.  Uses the lock-free sm_seq scheme
 * described above: if the chain changes mid-scan we restart, and
 * after SV_HASH_RETRY restarts we rescan under sv_mutex.
 * Returns NULL if the device is not SV enabled.
 */
static sv_dev_t *
sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp)
{
	minor_t umin = getminor(dev);
	sv_dev_t **hb, *next, *svp;
	sv_maj_t *maj;
	int seq;
	int try;

	/* Get major hash table */
	maj = sv_getmajor(dev);
	if (majpp)
		*majpp = maj;
	if (maj == NULL)
		return (NULL);

	if (maj->sm_inuse == 0) {
		/* no devices enabled on this major */
		DTRACE_PROBE1(
		    sv_dev_to_sv_end,
		    dev_t, dev);
		return (NULL);
	}

	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
	try = 0;

retry:
	/* too many lock-free restarts: fall back to a locked scan */
	if (try > SV_HASH_RETRY)
		mutex_enter(&sv_mutex);

	seq = maj->sm_seq;
	for (svp = *hb; svp; svp = next) {
		next = svp->sv_hash;

		nsc_membar_stld();	/* preserve register load order */

		if (maj->sm_seq != seq) {
			/* chain changed under us - restart the scan */
			DTRACE_PROBE1(sv_dev_to_sv_retry, dev_t, dev);
			try++;
			goto retry;
		}

		if (svp->sv_dev == dev)
			break;
	}

	if (try > SV_HASH_RETRY)
		mutex_exit(&sv_mutex);

	return (svp);
}
661 
662 
663 /*
664  * Must be called with sv_mutex held.
665  */
666 
667 static int
sv_get_state(const dev_t udev,sv_dev_t ** svpp)668 sv_get_state(const dev_t udev, sv_dev_t **svpp)
669 {
670 	sv_dev_t **hb, **insert, *svp;
671 	sv_maj_t *maj;
672 	minor_t umin;
673 	int i;
674 
675 	/* Get major hash table */
676 	if ((maj = sv_getmajor(udev)) == NULL)
677 		return (NULL);
678 
679 	/* Determine which minor hash table */
680 	umin = getminor(udev);
681 	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
682 
683 	/* look for clash */
684 
685 	insert = hb;
686 
687 	for (svp = *hb; svp; svp = svp->sv_hash) {
688 		if (svp->sv_dev == udev)
689 			break;
690 
691 		if (svp->sv_hash == NULL)
692 			insert = &svp->sv_hash;
693 	}
694 
695 	if (svp) {
696 		DTRACE_PROBE1(
697 		    sv_get_state_enabled,
698 		    dev_t, udev);
699 		return (SV_EENABLED);
700 	}
701 
702 	/* look for spare sv_devs slot */
703 
704 	for (i = 0; i < sv_max_devices; i++) {
705 		svp = &sv_devs[i];
706 
707 		if (svp->sv_state == SV_DISABLE)
708 			break;
709 	}
710 
711 	if (i >= sv_max_devices) {
712 		DTRACE_PROBE1(
713 		    sv_get_state_noslots,
714 		    dev_t, udev);
715 		return (SV_ENOSLOTS);
716 	}
717 
718 	svp->sv_state = SV_PENDING;
719 	svp->sv_pending = curthread;
720 
721 	*insert = svp;
722 	svp->sv_hash = NULL;
723 	maj->sm_seq++;		/* must be after the store to the hash chain */
724 
725 	*svpp = svp;
726 
727 	/*
728 	 * We do not know the size of the underlying device at
729 	 * this stage, so initialise "nblocks" property to
730 	 * zero, and update it whenever we succeed in
731 	 * nsc_reserve'ing the underlying nsc_fd_t.
732 	 */
733 
734 	svp->sv_nblocks = 0;
735 
736 	return (0);
737 }
738 
739 
/*
 * Remove a device structure from its hash chain.  sm_seq is
 * incremented -before- the pointer update so that lock-free readers
 * in sv_dev_to_sv() notice the change and restart their scan.
 * Must be called with sv_mutex held.
 */

static void
sv_rm_hash(sv_dev_t *svp)
{
	sv_dev_t **svpp;
	sv_maj_t *maj;

	/* Get major hash table */
	if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
		return;

	/* remove svp from hash chain */

	svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]);
	while (*svpp) {
		if (*svpp == svp) {
			/*
			 * increment of sm_seq must be before the
			 * removal from the hash chain
			 */
			maj->sm_seq++;
			*svpp = svp->sv_hash;
			break;
		}

		svpp = &(*svpp)->sv_hash;
	}

	svp->sv_hash = NULL;
}
774 
775 /*
776  * Free (disable) a device structure.
777  * Must be called with sv_lock(RW_WRITER) and sv_mutex held, and will
778  * perform the exits during its processing.
779  */
780 
781 static int
sv_free(sv_dev_t * svp,const int error)782 sv_free(sv_dev_t *svp, const int error)
783 {
784 	struct cb_ops *cb_ops;
785 	sv_maj_t *maj;
786 
787 	/* Get major hash table */
788 	if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
789 		return (NULL);
790 
791 	svp->sv_state = SV_PENDING;
792 	svp->sv_pending = curthread;
793 
794 	/*
795 	 * Close the fd's before removing from the hash or swapping
796 	 * back the cb_ops pointers so that the cache flushes before new
797 	 * io can come in.
798 	 */
799 
800 	if (svp->sv_fd) {
801 		(void) nsc_close(svp->sv_fd);
802 		svp->sv_fd = 0;
803 	}
804 
805 	sv_rm_hash(svp);
806 
807 	if (error != SV_ESDOPEN &&
808 	    error != SV_ELYROPEN && --maj->sm_inuse == 0) {
809 
810 		if (maj->sm_dev_ops)
811 			cb_ops = maj->sm_dev_ops->devo_cb_ops;
812 		else
813 			cb_ops = NULL;
814 
815 		if (cb_ops && maj->sm_strategy != NULL) {
816 			cb_ops->cb_strategy = maj->sm_strategy;
817 			cb_ops->cb_close = maj->sm_close;
818 			cb_ops->cb_ioctl = maj->sm_ioctl;
819 			cb_ops->cb_write = maj->sm_write;
820 			cb_ops->cb_open = maj->sm_open;
821 			cb_ops->cb_read = maj->sm_read;
822 			cb_ops->cb_flag = maj->sm_flag;
823 
824 			if (maj->sm_awrite)
825 				cb_ops->cb_awrite = maj->sm_awrite;
826 
827 			if (maj->sm_aread)
828 				cb_ops->cb_aread = maj->sm_aread;
829 
830 			/*
831 			 * corbin XXX
832 			 * Leave backing device ops in maj->sm_*
833 			 * to handle any requests that might come
834 			 * in during the disable.  This could be
835 			 * a problem however if the backing device
836 			 * driver is changed while we process these
837 			 * requests.
838 			 *
839 			 * maj->sm_strategy = 0;
840 			 * maj->sm_awrite = 0;
841 			 * maj->sm_write = 0;
842 			 * maj->sm_ioctl = 0;
843 			 * maj->sm_close = 0;
844 			 * maj->sm_aread = 0;
845 			 * maj->sm_read = 0;
846 			 * maj->sm_open = 0;
847 			 * maj->sm_flag = 0;
848 			 *
849 			 */
850 		}
851 
852 		if (maj->sm_dev_ops) {
853 			maj->sm_dev_ops = 0;
854 		}
855 	}
856 
857 	if (svp->sv_lh) {
858 		cred_t *crp = ddi_get_cred();
859 
860 		/*
861 		 * Close the protective layered driver open using the
862 		 * Sun Private layered driver i/f.
863 		 */
864 
865 		(void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp);
866 		svp->sv_lh = NULL;
867 	}
868 
869 	svp->sv_timestamp = nsc_lbolt();
870 	svp->sv_state = SV_DISABLE;
871 	svp->sv_pending = NULL;
872 	rw_exit(&svp->sv_lock);
873 	mutex_exit(&sv_mutex);
874 
875 	return (error);
876 }
877 
878 /*
879  * Reserve the device, taking into account the possibility that
880  * the reserve might have to be retried.
881  */
882 static int
sv_reserve(nsc_fd_t * fd,int flags)883 sv_reserve(nsc_fd_t *fd, int flags)
884 {
885 	int eintr_count;
886 	int rc;
887 
888 	eintr_count = 0;
889 	do {
890 		rc = nsc_reserve(fd, flags);
891 		if (rc == EINTR) {
892 			++eintr_count;
893 			delay(2);
894 		}
895 	} while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT));
896 
897 	return (rc);
898 }
899 
/*
 * Enable SV on the device "udev"/"path": allocate per-device state,
 * open it through nsctl and (protectively) through the LDI, and - for
 * the first device on a given major - swap the sv_lyr_* entry points
 * into that driver's cb_ops.
 *
 * "flag" may only contain NSC_CACHE|NSC_DEVICE.  Extra error detail
 * is added to "kstatus" when non-NULL.  Returns 0 or an SV_E* code.
 */
static int
sv_enable(const caddr_t path, const int flag,
    const dev_t udev, spcs_s_info_t kstatus)
{
	struct dev_ops *dev_ops;
	struct cb_ops *cb_ops;
	sv_dev_t *svp;
	sv_maj_t *maj;
	nsc_size_t nblocks;
	int rc;
	cred_t *crp;
	ldi_ident_t	li;

	if (udev == (dev_t)-1 || udev == 0) {
		DTRACE_PROBE1(
		    sv_enable_err_baddev,
		    dev_t, udev);
		return (SV_EBADDEV);
	}

	if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) {
		DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev);
		return (SV_EAMODE);
	}

	/* Get major hash table */
	if ((maj = sv_getmajor(udev)) == NULL)
		return (SV_EBADDEV);

	mutex_enter(&sv_mutex);

	/* allocate a slot and hash it in; leaves svp in SV_PENDING */
	rc = sv_get_state(udev, &svp);
	if (rc) {
		mutex_exit(&sv_mutex);
		DTRACE_PROBE1(sv_enable_err_state, dev_t, udev);
		return (rc);
	}

	rw_enter(&svp->sv_lock, RW_WRITER);

	/*
	 * Get real fd used for io
	 */

	svp->sv_dev = udev;
	svp->sv_flag = flag;

	/*
	 * OR in NSC_DEVICE to ensure that nskern grabs the real strategy
	 * function pointer before sv swaps them out.
	 */

	svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE),
	    sv_fd_def, (blind_t)udev, &rc);

	if (svp->sv_fd == NULL) {
		if (kstatus)
			spcs_s_add(kstatus, rc);
		DTRACE_PROBE1(sv_enable_err_fd, dev_t, udev);
		/* sv_free() releases sv_lock and sv_mutex */
		return (sv_free(svp, SV_ESDOPEN));
	}

	/*
	 * Perform a layered driver open using the Sun Private layered
	 * driver i/f to ensure that the cb_ops structure for the driver
	 * is not detached out from under us whilst sv is enabled.
	 *
	 */

	crp = ddi_get_cred();
	svp->sv_lh = NULL;

	if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) {
		rc = ldi_open_by_dev(&svp->sv_dev,
		    OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li);
	}

	if (rc != 0) {
		if (kstatus)
			spcs_s_add(kstatus, rc);
		DTRACE_PROBE1(sv_enable_err_lyr_open, dev_t, udev);
		return (sv_free(svp, SV_ELYROPEN));
	}

	/*
	 * Do layering if required - must happen after nsc_open().
	 */

	if (maj->sm_inuse++ == 0) {
		/* first sv device on this major: interpose on the driver */
		maj->sm_dev_ops = nsc_get_devops(getmajor(udev));

		if (maj->sm_dev_ops == NULL ||
		    maj->sm_dev_ops->devo_cb_ops == NULL) {
			DTRACE_PROBE1(sv_enable_err_load, dev_t, udev);
			return (sv_free(svp, SV_ELOAD));
		}

		dev_ops = maj->sm_dev_ops;
		cb_ops = dev_ops->devo_cb_ops;

		if (cb_ops->cb_strategy == NULL ||
		    cb_ops->cb_strategy == nodev ||
		    cb_ops->cb_strategy == nulldev) {
			DTRACE_PROBE1(sv_enable_err_nostrategy, dev_t, udev);
			return (sv_free(svp, SV_ELOAD));
		}

		if (cb_ops->cb_strategy == sv_lyr_strategy) {
			/* sv entry points are already installed */
			DTRACE_PROBE1(sv_enable_err_svstrategy, dev_t, udev);
			return (sv_free(svp, SV_ESTRATEGY));
		}

		/* save the original entry points for sv_free() to restore */
		maj->sm_strategy = cb_ops->cb_strategy;
		maj->sm_close = cb_ops->cb_close;
		maj->sm_ioctl = cb_ops->cb_ioctl;
		maj->sm_write = cb_ops->cb_write;
		maj->sm_open = cb_ops->cb_open;
		maj->sm_read = cb_ops->cb_read;
		maj->sm_flag = cb_ops->cb_flag;

		cb_ops->cb_flag = cb_ops->cb_flag | D_MP;
		cb_ops->cb_strategy = sv_lyr_strategy;
		cb_ops->cb_close = sv_lyr_close;
		cb_ops->cb_ioctl = sv_lyr_ioctl;
		cb_ops->cb_write = sv_lyr_write;
		cb_ops->cb_open = sv_lyr_open;
		cb_ops->cb_read = sv_lyr_read;

		/*
		 * Check that the driver has async I/O entry points
		 * before changing them.
		 */

		if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) {
			maj->sm_awrite = 0;
			maj->sm_aread = 0;
		} else {
			maj->sm_awrite = cb_ops->cb_awrite;
			maj->sm_aread = cb_ops->cb_aread;

			cb_ops->cb_awrite = sv_lyr_awrite;
			cb_ops->cb_aread = sv_lyr_aread;
		}

		/*
		 * Bug 4645743
		 *
		 * Prevent sv from ever unloading after it has interposed
		 * on a major device because there is a race between
		 * sv removing its layered entry points from the target
		 * dev_ops, a client coming in and accessing the driver,
		 * and the kernel modunloading the sv text.
		 *
		 * To allow unload, do svboot -u, which only happens in
		 * pkgrm time.
		 */
		ASSERT(MUTEX_HELD(&sv_mutex));
		sv_mod_status = SV_PREVENT_UNLOAD;
	}


	svp->sv_timestamp = nsc_lbolt();
	svp->sv_state = SV_ENABLE;
	svp->sv_pending = NULL;
	rw_exit(&svp->sv_lock);

	sv_ndevices++;
	mutex_exit(&sv_mutex);

	/* report the device size (left as 0 if the reserve fails) */
	nblocks = 0;
	if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) {
		nblocks = svp->sv_nblocks;
		nsc_release(svp->sv_fd);
	}

	cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n",
	    svp->sv_dev, nblocks);

	return (0);
}
1080 
1081 
1082 static int
sv_prepare_unload()1083 sv_prepare_unload()
1084 {
1085 	int rc = 0;
1086 
1087 	mutex_enter(&sv_mutex);
1088 
1089 	if (sv_mod_status == SV_PREVENT_UNLOAD) {
1090 		if ((sv_ndevices != 0) || (sv_tset != NULL)) {
1091 			rc = EBUSY;
1092 		} else {
1093 			sv_mod_status = SV_ALLOW_UNLOAD;
1094 			delay(SV_WAIT_UNLOAD * drv_usectohz(1000000));
1095 		}
1096 	}
1097 
1098 	mutex_exit(&sv_mutex);
1099 	return (rc);
1100 }
1101 
/*
 * nsctl "Attach" callback (see sv_fd_def): refresh the cached
 * partition size (sv_nblocks) and maximum transfer size (sv_maxfbas)
 * for the device.  On any failure the cached value is zeroed.
 * Always returns 0.
 */
static int
svattach_fd(blind_t arg)
{
	dev_t dev = (dev_t)arg;
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
	int rc;

	if (sv_debug > 0)
		cmn_err(CE_CONT, "!svattach_fd(%p, %p)\n", arg, (void *)svp);

	if (svp == NULL) {
		cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg);
		return (0);
	}

	if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) {
		cmn_err(CE_WARN,
		    "!svattach_fd: nsc_partsize() failed, rc %d", rc);
		svp->sv_nblocks = 0;
	}

	if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) {
		cmn_err(CE_WARN,
		    "!svattach_fd: nsc_maxfbas() failed, rc %d", rc);
		svp->sv_maxfbas = 0;
	}

	if (sv_debug > 0) {
		cmn_err(CE_CONT,
		    "!svattach_fd(%p): size %" NSC_SZFMT ", "
		    "maxfbas %" NSC_SZFMT "\n",
		    arg, svp->sv_nblocks, svp->sv_maxfbas);
	}

	return (0);
}
1138 
1139 
1140 static int
svdetach_fd(blind_t arg)1141 svdetach_fd(blind_t arg)
1142 {
1143 	dev_t dev = (dev_t)arg;
1144 	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1145 
1146 	if (sv_debug > 0)
1147 		cmn_err(CE_CONT, "!svdetach_fd(%p, %p)\n", arg, (void *)svp);
1148 
1149 	/* svp can be NULL during disable of an sv */
1150 	if (svp == NULL)
1151 		return (0);
1152 
1153 	svp->sv_maxfbas = 0;
1154 	svp->sv_nblocks = 0;
1155 	return (0);
1156 }
1157 
1158 
/*
 * NOTE(review): this comment referred to a "(guard != 0)" argument that no
 * longer exists.  In the current code sv_disable() itself acquires both
 * sv_mutex and the device's sv_lock(RW_WRITER), and sv_free() releases
 * them before returning.
 */
1163 
/* ARGSUSED */
/*
 * Disable (tear down) an SV-enabled device.  Takes sv_mutex and the
 * device's sv_lock(RW_WRITER); sv_free() releases both on success.
 * "kstatus" is currently unused.  Returns 0 or an SV_E* code.
 */
static int
sv_disable(dev_t dev, spcs_s_info_t kstatus)
{
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);

	if (svp == NULL) {

		DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp);
		return (SV_ENODEV);
	}

	mutex_enter(&sv_mutex);
	rw_enter(&svp->sv_lock, RW_WRITER);

	if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) {
		rw_exit(&svp->sv_lock);
		mutex_exit(&sv_mutex);

		DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp);
		return (SV_EDISABLED);
	}


	sv_ndevices--;
	/* sv_free() drops both sv_lock and sv_mutex */
	return (sv_free(svp, 0));
}
1191 
1192 
1193 
/*
 * Layered open entry point for an sv-controlled device.
 *
 * The open is passed through to the underlying driver's open routine
 * (saved in the sv_maj_t), with this device's sv_lock held as reader
 * across the call so the sv state cannot change underneath it.  If
 * the underlying open fails but the device is SV enabled and data can
 * be read through nsc, the open is faked as successful (see the
 * comment in the body).
 *
 * Returns 0 on success, or an errno value from the underlying driver.
 */
static int
sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp)
{
	nsc_buf_t *tmph;
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	dev_t odev;
	int ret;
	int rc;

	svp = sv_dev_to_sv(*devp, &maj);

	if (svp) {
		if (svp->sv_state == SV_PENDING &&
		    svp->sv_pending == curthread) {
			/*
			 * This is a recursive open from a call to
			 * ddi_lyr_open_by_devt and so we just want
			 * to pass it straight through to the
			 * underlying driver.
			 */
			DTRACE_PROBE2(sv_lyr_open_recursive,
			    sv_dev_t *, svp,
			    dev_t, *devp);
			svp = NULL;
		} else
			rw_enter(&svp->sv_lock, RW_READER);
	}

	/* remember the dev_t we looked up; a cloning open may change *devp */
	odev = *devp;

	if (maj && (fn = maj->sm_open) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			/* underlying driver is not MT-safe */
			UNSAFE_ENTER();
			ret = (*fn)(devp, flag, otyp, crp);
			UNSAFE_EXIT();
		} else {
			ret = (*fn)(devp, flag, otyp, crp);
		}

		if (ret == 0) {
			/*
			 * Re-acquire svp if the driver changed *devp.
			 */

			if (*devp != odev) {
				if (svp != NULL)
					rw_exit(&svp->sv_lock);

				svp = sv_dev_to_sv(*devp, NULL);

				if (svp) {
					rw_enter(&svp->sv_lock, RW_READER);
				}
			}
		}
	} else {
		ret = ENODEV;
	}

	if (svp && ret != 0 && svp->sv_state == SV_ENABLE) {
		/*
		 * Underlying DDI open failed, but we have this
		 * device SV enabled.  If we can read some data
		 * from the device, fake a successful open (this
		 * probably means that this device is RDC'd and we
		 * are getting the data from the secondary node).
		 *
		 * The reserve must be done with NSC_TRY|NSC_NOWAIT to
		 * ensure that it does not deadlock if this open is
		 * coming from nskernd:get_bsize().
		 */
		rc = sv_reserve(svp->sv_fd,
		    NSC_TRY | NSC_NOWAIT | NSC_MULTI | NSC_PCATCH);
		if (rc == 0) {
			tmph = NULL;

			/* probe: try to read the first FBA */
			rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph);
			if (rc <= 0) {
				/* success */
				ret = 0;
			}

			if (tmph) {
				(void) nsc_free_buf(tmph);
				tmph = NULL;
			}

			nsc_release(svp->sv_fd);

			/*
			 * Count the number of layered opens that we
			 * fake since we have to fake a matching number
			 * of closes (OTYP_LYR open/close calls must be
			 * paired).
			 */

			if (ret == 0 && otyp == OTYP_LYR) {
				mutex_enter(&svp->sv_olock);
				svp->sv_openlcnt++;
				mutex_exit(&svp->sv_olock);
			}
		}
	}

	if (svp) {
		rw_exit(&svp->sv_lock);
	}

	return (ret);
}
1306 
1307 
1308 static int
sv_lyr_close(dev_t dev,int flag,int otyp,cred_t * crp)1309 sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp)
1310 {
1311 	sv_dev_t *svp;
1312 	sv_maj_t *maj;
1313 	int (*fn)();
1314 	int ret;
1315 
1316 	svp = sv_dev_to_sv(dev, &maj);
1317 
1318 	if (svp &&
1319 	    svp->sv_state == SV_PENDING &&
1320 	    svp->sv_pending == curthread) {
1321 		/*
1322 		 * This is a recursive open from a call to
1323 		 * ddi_lyr_close and so we just want
1324 		 * to pass it straight through to the
1325 		 * underlying driver.
1326 		 */
1327 		DTRACE_PROBE2(sv_lyr_close_recursive, sv_dev_t *, svp,
1328 		    dev_t, dev);
1329 		svp = NULL;
1330 	}
1331 
1332 	if (svp) {
1333 		rw_enter(&svp->sv_lock, RW_READER);
1334 
1335 		if (otyp == OTYP_LYR) {
1336 			mutex_enter(&svp->sv_olock);
1337 
1338 			if (svp->sv_openlcnt) {
1339 				/*
1340 				 * Consume sufficient layered closes to
1341 				 * account for the opens that we faked
1342 				 * whilst the device was failed.
1343 				 */
1344 				svp->sv_openlcnt--;
1345 				mutex_exit(&svp->sv_olock);
1346 				rw_exit(&svp->sv_lock);
1347 
1348 				DTRACE_PROBE1(sv_lyr_close_end, dev_t, dev);
1349 
1350 				return (0);
1351 			}
1352 
1353 			mutex_exit(&svp->sv_olock);
1354 		}
1355 	}
1356 
1357 	if (maj && (fn = maj->sm_close) != 0) {
1358 		if (!(maj->sm_flag & D_MP)) {
1359 			UNSAFE_ENTER();
1360 			ret = (*fn)(dev, flag, otyp, crp);
1361 			UNSAFE_EXIT();
1362 		} else {
1363 			ret = (*fn)(dev, flag, otyp, crp);
1364 		}
1365 	} else {
1366 		ret = ENODEV;
1367 	}
1368 
1369 	if (svp) {
1370 		rw_exit(&svp->sv_lock);
1371 	}
1372 
1373 	return (ret);
1374 }
1375 
1376 
1377 /*
1378  * Convert the specified dev_t into a locked and enabled sv_dev_t, or
1379  * return NULL.
1380  */
1381 static sv_dev_t *
sv_find_enabled(const dev_t dev,sv_maj_t ** majpp)1382 sv_find_enabled(const dev_t dev, sv_maj_t **majpp)
1383 {
1384 	sv_dev_t *svp;
1385 
1386 	while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) {
1387 		rw_enter(&svp->sv_lock, RW_READER);
1388 
1389 		if (svp->sv_state == SV_ENABLE) {
1390 			/* locked and enabled */
1391 			break;
1392 		}
1393 
1394 		/*
1395 		 * State was changed while waiting on the lock.
1396 		 * Wait for a stable state.
1397 		 */
1398 		rw_exit(&svp->sv_lock);
1399 
1400 		DTRACE_PROBE1(sv_find_enabled_retry, dev_t, dev);
1401 
1402 		delay(2);
1403 	}
1404 
1405 	return (svp);
1406 }
1407 
1408 
1409 static int
sv_lyr_uio(dev_t dev,uio_t * uiop,cred_t * crp,int rw)1410 sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw)
1411 {
1412 	sv_dev_t *svp;
1413 	sv_maj_t *maj;
1414 	int (*fn)();
1415 	int rc;
1416 
1417 	svp = sv_find_enabled(dev, &maj);
1418 	if (svp == NULL) {
1419 		if (maj) {
1420 			if (rw == NSC_READ)
1421 				fn = maj->sm_read;
1422 			else
1423 				fn = maj->sm_write;
1424 
1425 			if (fn != 0) {
1426 				if (!(maj->sm_flag & D_MP)) {
1427 					UNSAFE_ENTER();
1428 					rc = (*fn)(dev, uiop, crp);
1429 					UNSAFE_EXIT();
1430 				} else {
1431 					rc = (*fn)(dev, uiop, crp);
1432 				}
1433 			}
1434 
1435 			return (rc);
1436 		} else {
1437 			return (ENODEV);
1438 		}
1439 	}
1440 
1441 	ASSERT(RW_READ_HELD(&svp->sv_lock));
1442 
1443 	if (svp->sv_flag == 0) {
1444 		/*
1445 		 * guard access mode
1446 		 * - prevent user level access to the device
1447 		 */
1448 		DTRACE_PROBE1(sv_lyr_uio_err_guard, uio_t *, uiop);
1449 		rc = EPERM;
1450 		goto out;
1451 	}
1452 
1453 	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
1454 		DTRACE_PROBE1(sv_lyr_uio_err_rsrv, uio_t *, uiop);
1455 		goto out;
1456 	}
1457 
1458 	if (rw == NSC_READ)
1459 		rc = nsc_uread(svp->sv_fd, uiop, crp);
1460 	else
1461 		rc = nsc_uwrite(svp->sv_fd, uiop, crp);
1462 
1463 	nsc_release(svp->sv_fd);
1464 
1465 out:
1466 	rw_exit(&svp->sv_lock);
1467 
1468 	return (rc);
1469 }
1470 
1471 
/* cb_ops read entry point: delegate to the common uio path as a read */
static int
sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp)
{
	return (sv_lyr_uio(dev, uiop, crp, NSC_READ));
}
1477 
1478 
/* cb_ops write entry point: delegate to the common uio path as a write */
static int
sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp)
{
	return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE));
}
1484 
1485 
/* ARGSUSED */

/*
 * cb_ops async read entry point: route the aio request through
 * sv_lyr_strategy() via aphysio().  crp is unused.
 */
static int
sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp)
{
	return (aphysio(sv_lyr_strategy,
	    anocancel, dev, B_READ, minphys, aio));
}
1494 
1495 
/* ARGSUSED */

/*
 * cb_ops async write entry point: route the aio request through
 * sv_lyr_strategy() via aphysio().  crp is unused.
 */
static int
sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp)
{
	return (aphysio(sv_lyr_strategy,
	    anocancel, dev, B_WRITE, minphys, aio));
}
1504 
1505 
1506 /*
1507  * Set up an array containing the list of raw path names
1508  * The array for the paths is svl and the size of the array is
1509  * in size.
1510  *
1511  * If there are more layered devices than will fit in the array,
1512  * the number of extra layered devices is returned.  Otherwise
1513  * zero is return.
1514  *
1515  * Input:
1516  *	svn	: array for paths
1517  *	size	: size of the array
1518  *
1519  * Output (extra):
1520  *	zero	: All paths fit in array
1521  *	>0	: Number of defined layered devices don't fit in array
1522  */
1523 
1524 static int
sv_list(void * ptr,const int size,int * extra,const int ilp32)1525 sv_list(void *ptr, const int size, int *extra, const int ilp32)
1526 {
1527 	sv_name32_t *svn32;
1528 	sv_name_t *svn;
1529 	sv_dev_t *svp;
1530 	int *mode, *nblocks;
1531 	int i, index;
1532 	char *path;
1533 
1534 	*extra = 0;
1535 	index = 0;
1536 
1537 	if (ilp32)
1538 		svn32 = ptr;
1539 	else
1540 		svn = ptr;
1541 
1542 	mutex_enter(&sv_mutex);
1543 	for (i = 0; i < sv_max_devices; i++) {
1544 		svp = &sv_devs[i];
1545 
1546 		rw_enter(&svp->sv_lock, RW_READER);
1547 
1548 		if (svp->sv_state != SV_ENABLE) {
1549 			rw_exit(&svp->sv_lock);
1550 			continue;
1551 		}
1552 
1553 		if ((*extra) != 0 || ptr == NULL) {
1554 			/* Another overflow entry */
1555 			rw_exit(&svp->sv_lock);
1556 			(*extra)++;
1557 			continue;
1558 		}
1559 
1560 		if (ilp32) {
1561 			nblocks = &svn32->svn_nblocks;
1562 			mode = &svn32->svn_mode;
1563 			path = svn32->svn_path;
1564 
1565 			svn32->svn_timestamp = (uint32_t)svp->sv_timestamp;
1566 			svn32++;
1567 		} else {
1568 			nblocks = &svn->svn_nblocks;
1569 			mode = &svn->svn_mode;
1570 			path = svn->svn_path;
1571 
1572 			svn->svn_timestamp = svp->sv_timestamp;
1573 			svn++;
1574 		}
1575 
1576 		(void) strcpy(path, nsc_pathname(svp->sv_fd));
1577 		*nblocks = svp->sv_nblocks;
1578 		*mode = svp->sv_flag;
1579 
1580 		if (*nblocks == 0) {
1581 			if (sv_debug > 3)
1582 				cmn_err(CE_CONT, "!sv_list: need to reserve\n");
1583 
1584 			if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
1585 				*nblocks = svp->sv_nblocks;
1586 				nsc_release(svp->sv_fd);
1587 			}
1588 		}
1589 
1590 		if (++index >= size) {
1591 			/* Out of space */
1592 			(*extra)++;
1593 		}
1594 
1595 		rw_exit(&svp->sv_lock);
1596 	}
1597 	mutex_exit(&sv_mutex);
1598 
1599 	if (index < size) {
1600 		/* NULL terminated list */
1601 		if (ilp32)
1602 			svn32->svn_path[0] = '\0';
1603 		else
1604 			svn->svn_path[0] = '\0';
1605 	}
1606 
1607 	return (0);
1608 }
1609 
1610 
/*
 * Adjust the sv thread set in response to configuration changes.
 *
 * "threads" is the change in demand: positive when devices are
 * enabled, negative when they are disabled.  sv_threads_needed tracks
 * total demand; the live thread count is grown or shrunk in
 * sv_threads_inc steps, capped at sv_threads_max and floored at
 * sv_threads, with sv_threads_hysteresis damping shrinks.  Demand
 * beyond sv_threads_max is remembered in sv_threads_extra so later
 * decreases are netted against it first.
 *
 * Caller must hold sv_mutex.
 */
static void
sv_thread_tune(int threads)
{
	int incr = (threads > 0) ? 1 : -1;
	int change = 0;
	int nthreads;

	ASSERT(MUTEX_HELD(&sv_mutex));

	if (sv_threads_extra) {
		/* keep track of any additional threads requested */
		if (threads > 0) {
			sv_threads_extra += threads;
			return;
		}
		/* a decrease: net it against the extra demand first */
		threads = -threads;
		if (threads >= sv_threads_extra) {
			threads -= sv_threads_extra;
			sv_threads_extra = 0;
			/* fall through to while loop */
		} else {
			sv_threads_extra -= threads;
			return;
		}
	} else if (threads > 0) {
		/*
		 * do not increase the number of threads beyond
		 * sv_threads_max when doing dynamic thread tuning
		 */
		nthreads = nst_nthread(sv_tset);
		if ((nthreads + threads) > sv_threads_max) {
			sv_threads_extra = nthreads + threads - sv_threads_max;
			threads = sv_threads_max - nthreads;
			if (threads <= 0)
				return;
		}
	}

	/* from here on, "threads" is a magnitude; incr carries the sign */
	if (threads < 0)
		threads = -threads;

	while (threads--) {
		nthreads = nst_nthread(sv_tset);
		sv_threads_needed += incr;

		if (sv_threads_needed >= nthreads)
			change += nst_add_thread(sv_tset, sv_threads_inc);
		else if ((sv_threads_needed <
		    (nthreads - (sv_threads_inc + sv_threads_hysteresis))) &&
		    ((nthreads - sv_threads_inc) >= sv_threads))
			change -= nst_del_thread(sv_tset, sv_threads_inc);
	}

#ifdef DEBUG
	if (change) {
		cmn_err(CE_NOTE,
		    "!sv_thread_tune: threads needed %d, nthreads %d, "
		    "nthreads change %d",
		    sv_threads_needed, nst_nthread(sv_tset), change);
	}
#endif
}
1673 
1674 
1675 /* ARGSUSED */
1676 static int
svopen(dev_t * devp,int flag,int otyp,cred_t * crp)1677 svopen(dev_t *devp, int flag, int otyp, cred_t *crp)
1678 {
1679 	int rc;
1680 
1681 	mutex_enter(&sv_mutex);
1682 	rc = sv_init_devs();
1683 	mutex_exit(&sv_mutex);
1684 
1685 	return (rc);
1686 }
1687 
1688 
1689 /* ARGSUSED */
1690 static int
svclose(dev_t dev,int flag,int otyp,cred_t * crp)1691 svclose(dev_t dev, int flag, int otyp, cred_t *crp)
1692 {
1693 	const int secs = HZ * 5;
1694 	const int ticks = HZ / 10;
1695 	int loops = secs / ticks;
1696 
1697 	mutex_enter(&sv_mutex);
1698 	while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) {
1699 		if (nst_nlive(sv_tset) <= 0) {
1700 			nst_destroy(sv_tset);
1701 			sv_tset = NULL;
1702 			break;
1703 		}
1704 
1705 		/* threads still active - wait for them to exit */
1706 		mutex_exit(&sv_mutex);
1707 		delay(ticks);
1708 		loops--;
1709 		mutex_enter(&sv_mutex);
1710 	}
1711 	mutex_exit(&sv_mutex);
1712 
1713 	if (loops <= 0) {
1714 		cmn_err(CE_WARN,
1715 #ifndef DEBUG
1716 		    /* do not write to console when non-DEBUG */
1717 		    "!"
1718 #endif
1719 		    "sv:svclose: threads still active "
1720 		    "after %d sec - leaking thread set", secs);
1721 	}
1722 
1723 	return (0);
1724 }
1725 
1726 
/*
 * ioctl entry point for the sv control device.
 *
 * Commands: SVIOC_ENABLE, SVIOC_DISABLE, SVIOC_LIST, SVIOC_VERSION,
 * SVIOC_UNLOAD.  All except SVIOC_LIST require drv_priv(crp) to
 * succeed.  Structures are converted for ILP32 userland callers.
 *
 * Lifetime note: most error paths return through spcs_s_ocopyoutf(),
 * which copies the kernel status (kstatus) out to the user and frees
 * it; paths that return a plain errno free kstatus explicitly with
 * spcs_s_kfree().
 */
static int
svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp)
{
	char itmp1[12], itmp2[12]; /* temp char array for editing ints */
	spcs_s_info_t kstatus;	/* Kernel version of spcs status */
	spcs_s_info_t ustatus;	/* Address of user version of spcs status */
	sv_list32_t svl32;	/* 32 bit Initial structure for SVIOC_LIST */
	sv_version_t svv;	/* Version structure */
	sv_conf_t svc;		/* User config structure */
	sv_list_t svl;		/* Initial structure for SVIOC_LIST */
	void *usvn;		/* Address of user sv_name_t */
	void *svn = NULL;	/* Array for SVIOC_LIST */
	uint64_t phash;		/* pathname hash */
	int rc = 0;		/* Return code -- errno */
	int size;		/* Number of items in array */
	int bytes;		/* Byte size of array */
	int ilp32;		/* Convert data structures for ilp32 userland */

	*rvalp = 0;

	/*
	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, then it will continue.
	 * else it means it previously was SV_PREVENT_UNLOAD, and now it's
	 * SV_ALLOW_UNLOAD, expecting the driver to eventually unload.
	 *
	 * SV_ALLOW_UNLOAD is final state, so no need to grab sv_mutex.
	 */
	if (sv_mod_status == SV_ALLOW_UNLOAD) {
		return (EBUSY);
	}

	/* all commands except SVIOC_LIST are privileged */
	if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0))
		return (rc);

	kstatus = spcs_s_kcreate();
	if (!kstatus) {
		DTRACE_PROBE1(sv_ioctl_err_kcreate, dev_t, dev);
		return (ENOMEM);
	}

	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);

	switch (cmd) {

	case SVIOC_ENABLE:
		/* copy in the config, converting from 32-bit if needed */

		if (ilp32) {
			sv_conf32_t svc32;

			if (ddi_copyin((void *)arg, &svc32,
			    sizeof (svc32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
			(void) strcpy(svc.svc_path, svc32.svc_path);
			svc.svc_flag  = svc32.svc_flag;
			svc.svc_major = svc32.svc_major;
			svc.svc_minor = svc32.svc_minor;
		} else {
			if (ddi_copyin((void *)arg, &svc,
			    sizeof (svc), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
		}

		/* force to raw access */
		svc.svc_flag = NSC_DEVICE;

		/* lazily create the thread set on first enable */
		if (sv_tset == NULL) {
			mutex_enter(&sv_mutex);

			if (sv_tset == NULL) {
				sv_tset = nst_init("sv_thr", sv_threads);
			}

			mutex_exit(&sv_mutex);

			if (sv_tset == NULL) {
				cmn_err(CE_WARN,
				    "!sv: could not allocate %d threads",
				    sv_threads);
			}
		}

		rc = sv_enable(svc.svc_path, svc.svc_flag,
		    makedevice(svc.svc_major, svc.svc_minor), kstatus);

		if (rc == 0) {
			sv_config_time = nsc_lbolt();

			mutex_enter(&sv_mutex);
			sv_thread_tune(sv_threads_dev);
			mutex_exit(&sv_mutex);
		}

		DTRACE_PROBE3(sv_ioctl_end, dev_t, dev, int, *rvalp, int, rc);

		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
		/* NOTREACHED */

	case SVIOC_DISABLE:

		if (ilp32) {
			sv_conf32_t svc32;

			if (ddi_copyin((void *)arg, &svc32,
			    sizeof (svc32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
			svc.svc_major = svc32.svc_major;
			svc.svc_minor = svc32.svc_minor;
			(void) strcpy(svc.svc_path, svc32.svc_path);
			svc.svc_flag  = svc32.svc_flag;
		} else {
			if (ddi_copyin((void *)arg, &svc,
			    sizeof (svc), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
		}

		if (svc.svc_major == (major_t)-1 &&
		    svc.svc_minor == (minor_t)-1) {
			sv_dev_t *svp;
			int i;

			/*
			 * User level could not find the minor device
			 * node, so do this the slow way by searching
			 * the entire sv config for a matching pathname.
			 */

			phash = nsc_strhash(svc.svc_path);

			mutex_enter(&sv_mutex);

			for (i = 0; i < sv_max_devices; i++) {
				svp = &sv_devs[i];

				if (svp->sv_state == SV_DISABLE ||
				    svp->sv_fd == NULL)
					continue;

				if (nsc_fdpathcmp(svp->sv_fd, phash,
				    svc.svc_path) == 0) {
					svc.svc_major = getmajor(svp->sv_dev);
					svc.svc_minor = getminor(svp->sv_dev);
					break;
				}
			}

			mutex_exit(&sv_mutex);

			/* still unresolved: no such device */
			if (svc.svc_major == (major_t)-1 &&
			    svc.svc_minor == (minor_t)-1)
				return (spcs_s_ocopyoutf(&kstatus,
				    svc.svc_error, SV_ENODEV));
		}

		rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor),
		    kstatus);

		if (rc == 0) {
			sv_config_time = nsc_lbolt();

			mutex_enter(&sv_mutex);
			sv_thread_tune(-sv_threads_dev);
			mutex_exit(&sv_mutex);
		}

		DTRACE_PROBE3(sv_ioctl_2, dev_t, dev, int, *rvalp, int, rc);

		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
		/* NOTREACHED */

	case SVIOC_LIST:

		if (ilp32) {
			if (ddi_copyin((void *)arg, &svl32,
			    sizeof (svl32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = (spcs_s_info_t)svl32.svl_error;
			size = svl32.svl_count;
			usvn = (void *)(unsigned long)svl32.svl_names;
		} else {
			if (ddi_copyin((void *)arg, &svl,
			    sizeof (svl), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = svl.svl_error;
			size = svl.svl_count;
			usvn = svl.svl_names;
		}

		/* Do some boundary checking */
		if ((size < 0) || (size > sv_max_devices)) {
			/* Array size is out of range */
			return (spcs_s_ocopyoutf(&kstatus, ustatus,
			    SV_EARRBOUNDS, "0",
			    spcs_s_inttostring(sv_max_devices, itmp1,
			    sizeof (itmp1), 0),
			    spcs_s_inttostring(size, itmp2,
			    sizeof (itmp2), 0)));
		}

		if (ilp32)
			bytes = size * sizeof (sv_name32_t);
		else
			bytes = size * sizeof (sv_name_t);

		/* Allocate memory for the array of structures */
		if (bytes != 0) {
			svn = kmem_zalloc(bytes, KM_SLEEP);
			if (!svn) {
				return (spcs_s_ocopyoutf(&kstatus,
				    ustatus, ENOMEM));
			}
		}

		rc = sv_list(svn, size, rvalp, ilp32);
		if (rc) {
			if (svn != NULL)
				kmem_free(svn, bytes);
			return (spcs_s_ocopyoutf(&kstatus, ustatus, rc));
		}

		if (ilp32) {
			svl32.svl_timestamp = (uint32_t)sv_config_time;
			svl32.svl_maxdevs = (int32_t)sv_max_devices;

			/* Return the list structure */
			if (ddi_copyout(&svl32, (void *)arg,
			    sizeof (svl32), mode) < 0) {
				spcs_s_kfree(kstatus);
				if (svn != NULL)
					kmem_free(svn, bytes);
				return (EFAULT);
			}
		} else {
			svl.svl_timestamp = sv_config_time;
			svl.svl_maxdevs = sv_max_devices;

			/* Return the list structure */
			if (ddi_copyout(&svl, (void *)arg,
			    sizeof (svl), mode) < 0) {
				spcs_s_kfree(kstatus);
				if (svn != NULL)
					kmem_free(svn, bytes);
				return (EFAULT);
			}
		}

		/* Return the array */
		if (svn != NULL) {
			if (ddi_copyout(svn, usvn, bytes, mode) < 0) {
				kmem_free(svn, bytes);
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
			kmem_free(svn, bytes);
		}

		DTRACE_PROBE3(sv_ioctl_3, dev_t, dev, int, *rvalp, int, 0);

		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
		/* NOTREACHED */

	case SVIOC_VERSION:
		/* report driver revision back to the caller */

		if (ilp32) {
			sv_version32_t svv32;

			if (ddi_copyin((void *)arg, &svv32,
			    sizeof (svv32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svv32.svv_major_rev = sv_major_rev;
			svv32.svv_minor_rev = sv_minor_rev;
			svv32.svv_micro_rev = sv_micro_rev;
			svv32.svv_baseline_rev = sv_baseline_rev;

			if (ddi_copyout(&svv32, (void *)arg,
			    sizeof (svv32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = (spcs_s_info_t)svv32.svv_error;
		} else {
			if (ddi_copyin((void *)arg, &svv,
			    sizeof (svv), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svv.svv_major_rev = sv_major_rev;
			svv.svv_minor_rev = sv_minor_rev;
			svv.svv_micro_rev = sv_micro_rev;
			svv.svv_baseline_rev = sv_baseline_rev;

			if (ddi_copyout(&svv, (void *)arg,
			    sizeof (svv), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = svv.svv_error;
		}

		DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, 0);

		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
		/* NOTREACHED */

	case SVIOC_UNLOAD:
		rc = sv_prepare_unload();

		if (ddi_copyout(&rc, (void *)arg, sizeof (rc), mode) < 0) {
			rc = EFAULT;
		}

		spcs_s_kfree(kstatus);
		return (rc);

	default:
		spcs_s_kfree(kstatus);

		DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, EINVAL);

		return (EINVAL);
		/* NOTREACHED */
	}

	/* NOTREACHED */
}
2075 
2076 
2077 /* ARGSUSED */
2078 static int
svprint(dev_t dev,char * str)2079 svprint(dev_t dev, char *str)
2080 {
2081 	int instance = ddi_get_instance(sv_dip);
2082 	cmn_err(CE_WARN, "!%s%d: %s", ddi_get_name(sv_dip), instance, str);
2083 	return (0);
2084 }
2085 
2086 
/*
 * Core strategy routine for sv-layered devices.
 *
 * If the device is not sv enabled the buf is passed straight to the
 * underlying driver's strategy routine (or failed with ENODEV).
 * Otherwise the transfer is performed through the nsc buffer
 * interface in chunks of at most svp->sv_maxfbas FBAs, copying
 * between the buf's linear buffer and each chunk's nsc_vec_t list.
 * The buf is always completed with biodone().
 */
static void
_sv_lyr_strategy(struct buf *bp)
{
	caddr_t buf_addr;		/* pointer to linear buffer in bp */
	nsc_buf_t *bufh = NULL;
	nsc_buf_t *hndl = NULL;
	sv_dev_t *svp;
	nsc_vec_t *v;
	sv_maj_t *maj;
	nsc_size_t fba_req, fba_len;	/* FBA lengths */
	nsc_off_t fba_off;		/* FBA offset */
	size_t tocopy, nbytes;		/* byte lengths */
	int rw, rc;			/* flags and return codes */
	int (*fn)();

	rc = 0;

	if (sv_debug > 5)
		cmn_err(CE_CONT, "!_sv_lyr_strategy(%p)\n", (void *)bp);

	svp = sv_find_enabled(bp->b_edev, &maj);
	if (svp == NULL) {
		/* not sv enabled - pass through or fail */
		if (maj && (fn = maj->sm_strategy) != 0) {
			if (!(maj->sm_flag & D_MP)) {
				UNSAFE_ENTER();
				rc = (*fn)(bp);
				UNSAFE_EXIT();
			} else {
				rc = (*fn)(bp);
			}
			return;
		} else {
			bioerror(bp, ENODEV);
			biodone(bp);
			return;
		}
	}

	ASSERT(RW_READ_HELD(&svp->sv_lock));

	if (svp->sv_flag == 0) {
		/*
		 * guard access mode
		 * - prevent user level access to the device
		 */
		DTRACE_PROBE1(sv_lyr_strategy_err_guard, struct buf *, bp);
		bioerror(bp, EPERM);
		goto out;
	}

	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_rsrv, struct buf *, bp);

		if (rc == EINTR)
			cmn_err(CE_WARN, "!nsc_reserve() returned EINTR");
		bioerror(bp, rc);
		goto out;
	}

	/* request starts at or beyond the end of the device? */
	if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) {
		DTRACE_PROBE1(sv_lyr_strategy_eof, struct buf *, bp);

		if (bp->b_flags & B_READ) {
			/* return EOF, not an error */
			bp->b_resid = bp->b_bcount;
			bioerror(bp, 0);
		} else
			bioerror(bp, EINVAL);

		goto done;
	}

	/*
	 * Preallocate a handle once per call to strategy.
	 * If this fails, then the nsc_alloc_buf() will allocate
	 * a temporary handle per allocation/free pair.
	 */

	DTRACE_PROBE1(sv_dbg_alloch_start, sv_dev_t *, svp);

	bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL);

	DTRACE_PROBE1(sv_dbg_alloch_end, sv_dev_t *, svp);

	if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_hactive, struct buf *, bp);

		cmn_err(CE_WARN,
		    "!sv: allocated active handle (bufh %p, flags %x)",
		    (void *)bufh, bufh->sb_flag);

		bioerror(bp, ENXIO);
		goto done;
	}

	/* truncate the request to the device extent */
	fba_req = FBA_LEN(bp->b_bcount);
	if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks)
		fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno);

	rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE;

	bp_mapin(bp);

	bp->b_resid = bp->b_bcount;
	buf_addr = bp->b_un.b_addr;
	fba_off = 0;

	/*
	 * fba_req  - requested size of transfer in FBAs after
	 *		truncation to device extent, and allowing for
	 *		possible non-FBA bounded final chunk.
	 * fba_off  - offset of start of chunk from start of bp in FBAs.
	 * fba_len  - size of this chunk in FBAs.
	 */

loop:
	fba_len = min(fba_req, svp->sv_maxfbas);
	hndl = bufh;

	DTRACE_PROBE4(sv_dbg_allocb_start,
	    sv_dev_t *, svp,
	    uint64_t, (uint64_t)(bp->b_lblkno + fba_off),
	    uint64_t, (uint64_t)fba_len,
	    int, rw);

	rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off),
	    fba_len, rw, &hndl);

	DTRACE_PROBE1(sv_dbg_allocb_end, sv_dev_t *, svp);

	if (rc > 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_alloc, struct buf *, bp);
		bioerror(bp, rc);
		/* only free hndl if it is a temporary, not our bufh */
		if (hndl != bufh)
			(void) nsc_free_buf(hndl);
		hndl = NULL;
		goto done;
	}

	tocopy = min(FBA_SIZE(fba_len), bp->b_resid);
	v = hndl->sb_vec;

	if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) {
		/*
		 * Not overwriting all of the last FBA, so read in the
		 * old contents now before we overwrite it with the new
		 * data.
		 */

		DTRACE_PROBE2(sv_dbg_read_start, sv_dev_t *, svp,
		    uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1));

		rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0);
		if (rc > 0) {
			bioerror(bp, rc);
			goto done;
		}

		DTRACE_PROBE1(sv_dbg_read_end, sv_dev_t *, svp);
	}

	DTRACE_PROBE1(sv_dbg_bcopy_start, sv_dev_t *, svp);

	/* copy between the linear buffer and the scatter/gather list */
	while (tocopy > 0) {
		nbytes = min(tocopy, (nsc_size_t)v->sv_len);

		if (bp->b_flags & B_READ)
			(void) bcopy(v->sv_addr, buf_addr, nbytes);
		else
			(void) bcopy(buf_addr, v->sv_addr, nbytes);

		bp->b_resid -= nbytes;
		buf_addr += nbytes;
		tocopy -= nbytes;
		v++;
	}

	DTRACE_PROBE1(sv_dbg_bcopy_end, sv_dev_t *, svp);

	if ((bp->b_flags & B_READ) == 0) {
		DTRACE_PROBE3(sv_dbg_write_start, sv_dev_t *, svp,
		    uint64_t, (uint64_t)hndl->sb_pos,
		    uint64_t, (uint64_t)hndl->sb_len);

		rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0);

		DTRACE_PROBE1(sv_dbg_write_end, sv_dev_t *, svp);

		if (rc > 0) {
			bioerror(bp, rc);
			goto done;
		}
	}

	/*
	 * Adjust FBA offset and requested (ie. remaining) length,
	 * loop if more data to transfer.
	 */

	fba_off += fba_len;
	fba_req -= fba_len;

	if (fba_req > 0) {
		DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);

		rc = nsc_free_buf(hndl);

		DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);

		if (rc > 0) {
			DTRACE_PROBE1(sv_lyr_strategy_err_free,
			    struct buf *, bp);
			bioerror(bp, rc);
		}

		hndl = NULL;

		if (rc <= 0)
			goto loop;
	}

done:
	if (hndl != NULL) {
		DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);

		rc = nsc_free_buf(hndl);

		DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);

		if (rc > 0) {
			DTRACE_PROBE1(sv_lyr_strategy_err_free,
			    struct buf *, bp);
			bioerror(bp, rc);
		}

		hndl = NULL;
	}

	if (bufh)
		(void) nsc_free_handle(bufh);

	DTRACE_PROBE1(sv_dbg_rlse_start, sv_dev_t *, svp);

	nsc_release(svp->sv_fd);

	DTRACE_PROBE1(sv_dbg_rlse_end, sv_dev_t *, svp);

out:
	if (sv_debug > 5) {
		cmn_err(CE_CONT,
		    "!_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n",
		    (void *)bp, (void *)bufh, bp->b_error);
	}

	DTRACE_PROBE2(sv_lyr_strategy_end, struct buf *, bp, int, bp->b_error);

	rw_exit(&svp->sv_lock);
	biodone(bp);
}
2346 
2347 
2348 static void
sv_async_strategy(blind_t arg)2349 sv_async_strategy(blind_t arg)
2350 {
2351 	struct buf *bp = (struct buf *)arg;
2352 	_sv_lyr_strategy(bp);
2353 }
2354 
2355 
2356 static int
sv_lyr_strategy(struct buf * bp)2357 sv_lyr_strategy(struct buf *bp)
2358 {
2359 	nsthread_t *tp;
2360 	int nlive;
2361 
2362 	/*
2363 	 * If B_ASYNC was part of the DDI we could use it as a hint to
2364 	 * not create a thread for synchronous i/o.
2365 	 */
2366 	if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
2367 		/* not sv enabled - just pass through */
2368 		DTRACE_PROBE1(sv_lyr_strategy_notsv, struct buf *, bp);
2369 		_sv_lyr_strategy(bp);
2370 		return (0);
2371 	}
2372 
2373 	if (sv_debug > 4) {
2374 		cmn_err(CE_CONT, "!sv_lyr_strategy: nthread %d nlive %d\n",
2375 		    nst_nthread(sv_tset), nst_nlive(sv_tset));
2376 	}
2377 
2378 	/*
2379 	 * If there are only guard devices enabled there
2380 	 * won't be a threadset, so don't try and use it.
2381 	 */
2382 	tp = NULL;
2383 	if (sv_tset != NULL) {
2384 		tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
2385 	}
2386 
2387 	if (tp == NULL) {
2388 		/*
2389 		 * out of threads, so fall back to synchronous io.
2390 		 */
2391 		if (sv_debug > 0) {
2392 			cmn_err(CE_CONT,
2393 			    "!sv_lyr_strategy: thread alloc failed\n");
2394 		}
2395 
2396 		DTRACE_PROBE1(sv_lyr_strategy_no_thread,
2397 		    struct buf *, bp);
2398 
2399 		_sv_lyr_strategy(bp);
2400 		sv_no_threads++;
2401 	} else {
2402 		nlive = nst_nlive(sv_tset);
2403 		if (nlive > sv_max_nlive) {
2404 			if (sv_debug > 0) {
2405 				cmn_err(CE_CONT,
2406 				    "!sv_lyr_strategy: "
2407 				    "new max nlive %d (nthread %d)\n",
2408 				    nlive, nst_nthread(sv_tset));
2409 			}
2410 
2411 			sv_max_nlive = nlive;
2412 		}
2413 	}
2414 
2415 	return (0);
2416 }
2417 
2418 
#ifndef offsetof
/* byte offset of member m within struct type s (fallback definition) */
#define	offsetof(s, m)	((size_t)(&((s *)0)->m))
#endif
2422 
2423 /*
2424  * re-write the size of the current partition
2425  */
/*
 * Patch the p_size field of the current partition in the vtoc that the
 * underlying driver already copied out for DKIOCGVTOC, so that the user
 * sees the (possibly different) virtual volume size from nsctl rather
 * than the physical size.  Only the single field for this partition is
 * rewritten in the user's buffer, at a computed byte offset.
 *
 * arg/mode are the original ioctl argument and flags; svp is the locked,
 * enabled sv device.  Returns 0 or an errno.
 */
static int
sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	size_t offset;		/* byte offset of p_size in user's vtoc */
	int ilp32;		/* caller is a 32-bit process? */
	int pnum;		/* partition (slice) number of svp->sv_dev */
	int rc;

	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);

	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (pnum < 0 || pnum >= V_NUMPAR) {
		cmn_err(CE_WARN,
		    "!sv_gvtoc: unable to determine partition number "
		    "for dev %lx", svp->sv_dev);
		return (EINVAL);
	}

	if (ilp32) {
		int32_t p_size;

		/* offset of v_part[pnum].p_size in the 32-bit vtoc layout */
#ifdef _SunOS_5_6
		offset = offsetof(struct vtoc, v_part);
		offset += sizeof (struct partition) * pnum;
		offset += offsetof(struct partition, p_size);
#else
		offset = offsetof(struct vtoc32, v_part);
		offset += sizeof (struct partition32) * pnum;
		offset += offsetof(struct partition32, p_size);
#endif

		p_size = (int32_t)svp->sv_nblocks;
		if (p_size == 0) {
			/*
			 * Size not cached yet: reserve the device so that
			 * nsctl fills in sv_nblocks, then release it.  A
			 * signal while waiting aborts with EINTR.
			 */
			if (sv_reserve(svp->sv_fd,
			    NSC_MULTI|NSC_PCATCH) == 0) {
				p_size = (int32_t)svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			} else {
				rc = EINTR;
			}
		}

		/* overwrite just the p_size field in the user's vtoc */
		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
		    sizeof (p_size), mode) != 0) {
			rc = EFAULT;
		}
	} else {
		long p_size;

		/* offset of v_part[pnum].p_size in the native vtoc layout */
		offset = offsetof(struct vtoc, v_part);
		offset += sizeof (struct partition) * pnum;
		offset += offsetof(struct partition, p_size);

		p_size = (long)svp->sv_nblocks;
		if (p_size == 0) {
			/* see the ilp32 branch above: populate sv_nblocks */
			if (sv_reserve(svp->sv_fd,
			    NSC_MULTI|NSC_PCATCH) == 0) {
				p_size = (long)svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			} else {
				rc = EINTR;
			}
		}

		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
		    sizeof (p_size), mode) != 0) {
			rc = EFAULT;
		}
	}

	return (rc);
}
2502 
2503 
2504 #ifdef DKIOCPARTITION
2505 /*
2506  * re-write the size of the current partition
2507  *
2508  * arg is dk_efi_t.
2509  *
2510  * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64;
2511  *
2512  * dk_efi_t->dki_data --> efi_gpt_t (label header)
2513  * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions)
2514  *
2515  * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts
2516  * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself
2517  *
2518  * This assumes that sizeof (efi_gpt_t) is the same as the size of a
2519  * logical block on the disk.
2520  *
2521  * Everything is little endian (i.e. disk format).
2522  */
2523 static int
sv_fix_dkiocgetefi(const intptr_t arg,const int mode,sv_dev_t * svp)2524 sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp)
2525 {
2526 	dk_efi_t efi;
2527 	efi_gpt_t gpt;
2528 	efi_gpe_t *gpe = NULL;
2529 	size_t sgpe;
2530 	uint64_t p_size;	/* virtual partition size from nsctl */
2531 	uint32_t crc;
2532 	int unparts;		/* number of parts in user's array */
2533 	int pnum;
2534 	int rc;
2535 
2536 	rc = nskern_partition(svp->sv_dev, &pnum);
2537 	if (rc != 0) {
2538 		return (rc);
2539 	}
2540 
2541 	if (pnum < 0) {
2542 		cmn_err(CE_WARN,
2543 		    "!sv_efi: unable to determine partition number for dev %lx",
2544 		    svp->sv_dev);
2545 		return (EINVAL);
2546 	}
2547 
2548 	if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) {
2549 		return (EFAULT);
2550 	}
2551 
2552 	efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
2553 
2554 	if (efi.dki_length < sizeof (gpt) + sizeof (gpe)) {
2555 		return (EINVAL);
2556 	}
2557 
2558 	if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) {
2559 		rc = EFAULT;
2560 		goto out;
2561 	}
2562 
2563 	if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0)
2564 		unparts = 1;
2565 	else if (pnum >= unparts) {
2566 		cmn_err(CE_WARN,
2567 		    "!sv_efi: partition# beyond end of user array (%d >= %d)",
2568 		    pnum, unparts);
2569 		return (EINVAL);
2570 	}
2571 
2572 	sgpe = sizeof (*gpe) * unparts;
2573 	gpe = kmem_alloc(sgpe, KM_SLEEP);
2574 
2575 	if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) {
2576 		rc = EFAULT;
2577 		goto out;
2578 	}
2579 
2580 	p_size = svp->sv_nblocks;
2581 	if (p_size == 0) {
2582 		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2583 			p_size = (diskaddr_t)svp->sv_nblocks;
2584 			nsc_release(svp->sv_fd);
2585 		} else {
2586 			rc = EINTR;
2587 		}
2588 	}
2589 
2590 	gpe[pnum].efi_gpe_EndingLBA = LE_64(
2591 	    LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1);
2592 
2593 	gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
2594 	CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
2595 	gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
2596 
2597 	gpt.efi_gpt_HeaderCRC32 = 0;
2598 	CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
2599 	gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
2600 
2601 	if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) {
2602 		rc = EFAULT;
2603 		goto out;
2604 	}
2605 
2606 	if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) {
2607 		rc = EFAULT;
2608 		goto out;
2609 	}
2610 
2611 out:
2612 	if (gpe) {
2613 		kmem_free(gpe, sgpe);
2614 	}
2615 
2616 	return (rc);
2617 }
2618 
2619 
2620 /*
2621  * Re-write the size of the partition specified by p_partno
2622  *
2623  * Note that if a DKIOCPARTITION is issued to an fd opened against a
2624  * non-sv'd device, but p_partno requests the size for a different
2625  * device that is sv'd, this function will *not* be called as sv is
2626  * not interposed on the original device (the fd).
2627  *
2628  * It would not be easy to change this as we cannot get the partition
2629  * number for the non-sv'd device, so cannot compute the dev_t of the
2630  * (sv'd) p_partno device, and so cannot find out if it is sv'd or get
2631  * its size from nsctl.
2632  *
2633  * See also the "Bug 4755783" comment in sv_lyr_ioctl().
2634  */
static int
sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	struct partition64 p64;
	sv_dev_t *nsvp = NULL;	/* non-NULL iff we switched devices */
	diskaddr_t p_size;
	minor_t nminor;
	int pnum, rc;
	dev_t ndev;

	/* partition number of the device the ioctl was issued against */
	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) {
		return (EFAULT);
	}

	if (p64.p_partno != pnum) {
		/* switch to requested partition, not the current one */
		/*
		 * assumes slices of one disk have consecutive minor
		 * numbers, so the target dev_t can be derived by offsetting
		 * the minor by the partition-number difference.
		 */
		nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
		ndev = makedevice(getmajor(svp->sv_dev), nminor);
		nsvp = sv_find_enabled(ndev, NULL);
		if (nsvp == NULL) {
			/* not sv device - just return */
			return (0);
		}

		/*
		 * sv_find_enabled returned nsvp with its sv_lock read-held;
		 * operate on it from here on (caller still holds the lock
		 * on the original svp).
		 */
		svp = nsvp;
	}

	p_size = svp->sv_nblocks;
	if (p_size == 0) {
		/*
		 * Size not cached yet: reserve the device so nsctl fills
		 * in sv_nblocks, then release.  A signal gives EINTR.
		 */
		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
			p_size = (diskaddr_t)svp->sv_nblocks;
			nsc_release(svp->sv_fd);
		} else {
			rc = EINTR;
		}
	}

	/* drop the extra lock taken when we switched partitions */
	if (nsvp != NULL) {
		rw_exit(&nsvp->sv_lock);
	}

	/* rewrite only the p_size field of the user's partition64 */
	if ((rc == 0) && ddi_copyout(&p_size,
	    (void *)(arg + offsetof(struct partition64, p_size)),
	    sizeof (p_size), mode) != 0) {
		return (EFAULT);
	}

	return (rc);
}
2689 #endif /* DKIOCPARTITION */
2690 
2691 
/*
 * Layered ioctl entry point.  Passes the ioctl through to the underlying
 * driver, except that for sv-enabled devices the label-reading ioctls
 * (DKIOCGVTOC, DKIOCGETEFI, DKIOCPARTITION) have their size fields fixed
 * up afterwards to show the virtual volume size, and the label-writing
 * ioctls (DKIOCSVTOC, DKIOCSETEFI) are rejected with EPERM.
 * Returns 0 or an errno.
 */
static int
sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg,
    const int mode, cred_t *crp, int *rvalp)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int rc = 0;

	maj = 0;
	fn = 0;

	/*
	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, then it will continue.
	 * else it means it previously was SV_PREVENT_UNLOAD, and now it's
	 * SV_ALLOW_UNLOAD, expecting the driver to eventually unload.
	 *
	 * SV_ALLOW_UNLOAD is final state, so no need to grab sv_mutex.
	 */
	if (sv_mod_status == SV_ALLOW_UNLOAD) {
		return (EBUSY);
	}

	/* on success svp is returned with its sv_lock read-held */
	svp = sv_find_enabled(dev, &maj);
	if (svp != NULL) {
		if (nskernd_isdaemon()) {
			/*
			 * This is nskernd which always needs to see
			 * the underlying disk device accurately.
			 *
			 * So just pass the ioctl straight through
			 * to the underlying driver as though the device
			 * was not sv enabled.
			 */
			DTRACE_PROBE2(sv_lyr_ioctl_nskernd, sv_dev_t *, svp,
			    dev_t, dev);

			rw_exit(&svp->sv_lock);
			svp = NULL;
		} else {
			ASSERT(RW_READ_HELD(&svp->sv_lock));
		}
	}

	/*
	 * We now have a locked and enabled SV device, or a non-SV device.
	 */

	switch (cmd) {
		/*
		 * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI
		 * and DKIOCSETEFI are intercepted and faked up as some
		 * i/o providers emulate volumes of a different size to
		 * the underlying volume.
		 *
		 * Setting the size by rewriting the vtoc is not permitted.
		 */

	case DKIOCSVTOC:
#ifdef DKIOCPARTITION
	case DKIOCSETEFI:
#endif
		if (svp == NULL) {
			/* not intercepted -- allow ioctl through */
			break;
		}

		rw_exit(&svp->sv_lock);

		DTRACE_PROBE2(sv_lyr_ioctl_svtoc, dev_t, dev, int, EPERM);

		return (EPERM);

	default:
		break;
	}

	/*
	 * Pass through the real ioctl command.
	 */

	if (maj && (fn = maj->sm_ioctl) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			/* non-MP-safe driver: serialise around the call */
			UNSAFE_ENTER();
			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
			UNSAFE_EXIT();
		} else {
			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
		}
	} else {
		rc = ENODEV;
	}

	/*
	 * Bug 4755783
	 * Fix up the size of the current partition to allow
	 * for the virtual volume to be a different size to the
	 * physical volume (e.g. for II compact dependent shadows).
	 *
	 * Note that this only attempts to fix up the current partition
	 * - the one that the ioctl was issued against.  There could be
	 * other sv'd partitions in the same vtoc, but we cannot tell
	 * so we don't attempt to fix them up.
	 */

	if (svp != NULL && rc == 0) {
		switch (cmd) {
		case DKIOCGVTOC:
			rc = sv_fix_dkiocgvtoc(arg, mode, svp);
			break;

#ifdef DKIOCPARTITION
		case DKIOCGETEFI:
			rc = sv_fix_dkiocgetefi(arg, mode, svp);
			break;

		case DKIOCPARTITION:
			rc = sv_fix_dkiocpartition(arg, mode, svp);
			break;
#endif /* DKIOCPARTITION */
		}
	}

	/* release the read lock taken by sv_find_enabled() above */
	if (svp != NULL) {
		rw_exit(&svp->sv_lock);
	}

	return (rc);
}
2821