/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Storage Volume Character and Block Driver (SV)
 *
 * This driver implements a simplistic /dev/{r}dsk/ interface to a
 * specified disk volume that is otherwise managed by the Prism
 * software.  The SV driver layers itself onto the underlying disk
 * device driver by changing function pointers in the cb_ops
 * structure.
 *
 * CONFIGURATION:
 *
 * 1. Configure the driver using the svadm utility.
 * 2. Access the device as before through /dev/rdsk/c?t?d?s?
 *
 * LIMITATIONS:
 *
 * This driver should NOT be used to share a device between another
 * DataServices user interface module (e.g., STE) and a user accessing
 * the device through the block device in O_WRITE mode.  This is because
 * writes through the block device are asynchronous (due to the page
 * cache) and so consistency between the block device user and the
 * STE user cannot be guaranteed.
 *
 * Data is copied between system struct buf(9s) and nsc_vec_t.  This is
 * wasteful and slow.
 */

#include <sys/debug.h>
#include <sys/types.h>

#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/varargs.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/uio.h>
#ifndef DS_DDICT
#include <sys/pathname.h>
#endif
#include <sys/aio_req.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/cmn_err.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/nsctl/nsvers.h>

#include <sys/nsc_thread.h>
#include <sys/unistat/spcs_s.h>
#include <sys/unistat/spcs_s_k.h>
#include <sys/unistat/spcs_errors.h>

#ifdef DS_DDICT
#include "../contract.h"
#endif

#include "../nsctl.h"


#include <sys/sdt.h>		/* dtrace is S10 or later */

#include "sv.h"
#include "sv_impl.h"
#include "sv_efi.h"

#define	MAX_EINTR_COUNT 1000

/*
 * sv_mod_status
 */
#define	SV_PREVENT_UNLOAD 1
#define	SV_ALLOW_UNLOAD	2

static const int sv_major_rev = ISS_VERSION_MAJ;	/* Major number */
static const int sv_minor_rev = ISS_VERSION_MIN;	/* Minor number */
static const int sv_micro_rev = ISS_VERSION_MIC;	/* Micro number */
static const int sv_baseline_rev = ISS_VERSION_NUM;	/* Baseline number */

#ifdef DKIOCPARTITION
/*
 * CRC32 polynomial table needed for computing the checksums
 * in an EFI vtoc.
 */
static const uint32_t sv_crc32_table[256] = { CRC32_TABLE };
#endif

static clock_t sv_config_time;		/* Time of successful {en,dis}able */
static int sv_debug;			/* Set non-zero for debug to syslog */
static int sv_mod_status;		/* Set to prevent modunload */

static dev_info_t *sv_dip;		/* Single DIP for driver */
static kmutex_t sv_mutex;		/* Protect global lists, etc. */

static nsc_mem_t	*sv_mem;	/* nsctl memory allocator token */


/*
 * Per device and per major state.
 */

#ifndef _SunOS_5_6
#define	UNSAFE_ENTER()
#define	UNSAFE_EXIT()
#else
#define	UNSAFE_ENTER()	mutex_enter(&unsafe_driver)
#define	UNSAFE_EXIT()	mutex_exit(&unsafe_driver)
#endif

					/* hash table of major dev structures */
static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0};
static sv_dev_t *sv_devs;		/* array of per device structures */
static int sv_max_devices;		/* SV version of nsc_max_devices() */
static int sv_ndevices;			/* number of SV enabled devices */

/*
 * Threading.
 */

int sv_threads_max = 1024;		/* maximum # to dynamically alloc */
int sv_threads = 32;			/* # to pre-allocate (see sv.conf) */
int sv_threads_extra = 0;		/* addl # we would have alloc'ed */

static nstset_t *sv_tset;		/* the threadset pointer */

static int sv_threads_hysteresis = 4;	/* hysteresis for threadset resizing */
static int sv_threads_dev = 2;		/* # of threads to alloc per device */
static int sv_threads_inc = 8;		/* increment for changing the set */
static int sv_threads_needed;		/* number of threads needed */
static int sv_no_threads;		/* number of nsc_create errors */
static int sv_max_nlive;		/* max number of threads running */


/*
 * nsctl fd callbacks.
 */

static int svattach_fd(blind_t);
static int svdetach_fd(blind_t);

static nsc_def_t sv_fd_def[] = {
	{ "Attach",	(uintptr_t)svattach_fd, },
	{ "Detach",	(uintptr_t)svdetach_fd, },
	{ 0, 0, }
};

/*
 * cb_ops functions.
 */

static int svopen(dev_t *, int, int, cred_t *);
static int svclose(dev_t, int, int, cred_t *);
static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int svprint(dev_t, char *);

/*
 * These next functions are layered into the underlying driver's devops.
 */

static int sv_lyr_open(dev_t *, int, int, cred_t *);
static int sv_lyr_close(dev_t, int, int, cred_t *);
static int sv_lyr_strategy(struct buf *);
static int sv_lyr_read(dev_t, struct uio *, cred_t *);
static int sv_lyr_write(dev_t, struct uio *, cred_t *);
static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *);
static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *);
static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

static struct cb_ops sv_cb_ops = {
	svopen,		/* open */
	svclose,	/* close */
	nulldev,	/* strategy */
	svprint,
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	svioctl,
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,		/* NOT a stream */
	D_NEW | D_MP | D_64BIT,
	CB_REV,
	nodev,		/* aread */
	nodev,		/* awrite */
};


/*
 * dev_ops functions.
 */

static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int sv_attach(dev_info_t *, ddi_attach_cmd_t);
static int sv_detach(dev_info_t *, ddi_detach_cmd_t);

static struct dev_ops sv_ops = {
	DEVO_REV,
	0,
	sv_getinfo,
	nulldev,	/* identify */
	nulldev,	/* probe */
	sv_attach,
	sv_detach,
	nodev,		/* reset */
	&sv_cb_ops,
	(struct bus_ops *)0
};

/*
 * Module linkage.
 */

extern struct mod_ops mod_driverops;

static struct modldrv modldrv = {
	&mod_driverops,
	"nws:Storage Volume:" ISS_VERSION_STR,
	&sv_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	0
};


int
_init(void)
{
	int error;

	mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL);

	if ((error = mod_install(&modlinkage)) != 0) {
		mutex_destroy(&sv_mutex);
		return (error);
	}

#ifdef DEBUG
	cmn_err(CE_CONT, "!sv %s %s (revision %d.%d.%d.%d, %s, %s)\n",
	    __DATE__, __TIME__,
	    sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev,
	    ISS_VERSION_STR, BUILD_DATE_STR);
#else
	if (sv_micro_rev) {
		cmn_err(CE_CONT, "!sv %s %s (revision %d.%d.%d, %s, %s)\n",
		    __DATE__, __TIME__,
		    sv_major_rev, sv_minor_rev, sv_micro_rev,
		    ISS_VERSION_STR, BUILD_DATE_STR);
	} else {
		cmn_err(CE_CONT, "!sv %s %s (revision %d.%d, %s, %s)\n",
		    __DATE__, __TIME__,
		    sv_major_rev, sv_minor_rev,
		    ISS_VERSION_STR, BUILD_DATE_STR);
	}
#endif

	return (error);
}


int
_fini(void)
{
	int error;

	if ((error = mod_remove(&modlinkage)) != 0)
		return (error);

	mutex_destroy(&sv_mutex);

	return (error);
}


int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}


/*
 * Locking & State.
 *
 * sv_mutex protects config information - sv_maj_t and sv_dev_t lists;
 * threadset creation and sizing; sv_ndevices.
 *
 * If we need to hold both sv_mutex and sv_lock, then the sv_mutex
 * must be acquired first.
 *
 * sv_lock protects the sv_dev_t structure for an individual device.
 *
 * sv_olock protects the otyp/open members of the sv_dev_t.  If we need
 * to hold both sv_lock and sv_olock, then the sv_lock must be acquired
 * first.
 *
 * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple
 * I/O operations to a device simultaneously, as above.
 *
 * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur
 * with sv_lock write-locked must be done with (sv_state == SV_PENDING)
 * and (sv_pending == curthread) so that any recursion through
 * sv_lyr_open/sv_lyr_close can be detected.
 */


static int
sv_init_devs(void)
{
	int i;

	ASSERT(MUTEX_HELD(&sv_mutex));

	if (sv_max_devices > 0)
		return (0);

	sv_max_devices = nsc_max_devices();

	if (sv_max_devices <= 0) {
		/* nsctl is not attached (nskernd not running) */
		if (sv_debug > 0)
			cmn_err(CE_CONT, "sv: nsc_max_devices = 0\n");
		return (EAGAIN);
	}

	sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)),
	    KM_NOSLEEP, sv_mem);

	if (sv_devs == NULL) {
		cmn_err(CE_WARN, "sv: could not allocate sv_devs array");
		return (ENOMEM);
	}

	for (i = 0; i < sv_max_devices; i++) {
		mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL);
		rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL);
	}

	if (sv_debug > 0)
		cmn_err(CE_CONT, "sv: sv_init_devs successful\n");

	return (0);
}

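/*
 * Attach the SV pseudo device: create the control minor node, register
 * the "SV" nsctl memory allocator token, size the per-device state
 * array, and pick up the sv_threads tunable from sv.conf.
 */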
static int
sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int rc;

	switch (cmd) {

	case DDI_ATTACH:
		sv_dip = dip;

		if (ddi_create_minor_node(dip, "sv", S_IFCHR,
		    0, DDI_PSEUDO, 0) != DDI_SUCCESS)
			goto failed;

		mutex_enter(&sv_mutex);

		sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0);
		if (sv_mem == NULL) {
			mutex_exit(&sv_mutex);
			goto failed;
		}

		rc = sv_init_devs();
		if (rc != 0 && rc != EAGAIN) {
			mutex_exit(&sv_mutex);
			goto failed;
		}

		mutex_exit(&sv_mutex);

		ddi_report_dev(dip);

		sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
		    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
		    "sv_threads", sv_threads);

		if (sv_debug > 0)
			cmn_err(CE_CONT, "sv: sv_threads=%d\n", sv_threads);

		if (sv_threads > sv_threads_max)
			sv_threads_max = sv_threads;

		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

failed:
	DTRACE_PROBE(sv_attach_failed);
	(void) sv_detach(dip, DDI_DETACH);
	return (DDI_FAILURE);
}

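/*
 * Detach the SV pseudo device.  Fails if unload has not been allowed
 * (see sv_prepare_unload()) or if any device is still SV enabled.
 */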
static int
sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	sv_dev_t *svp;
	int i;

	switch (cmd) {

	case DDI_DETACH:

		/*
		 * Check that everything is disabled.
		 */

		mutex_enter(&sv_mutex);

		if (sv_mod_status == SV_PREVENT_UNLOAD) {
			mutex_exit(&sv_mutex);
			DTRACE_PROBE(sv_detach_err_prevent);
			return (DDI_FAILURE);
		}

		for (i = 0; sv_devs && i < sv_max_devices; i++) {
			svp = &sv_devs[i];

			if (svp->sv_state != SV_DISABLE) {
				mutex_exit(&sv_mutex);
				DTRACE_PROBE(sv_detach_err_busy);
				return (DDI_FAILURE);
			}
		}

		for (i = 0; sv_devs && i < sv_max_devices; i++) {
			mutex_destroy(&sv_devs[i].sv_olock);
			rw_destroy(&sv_devs[i].sv_lock);
		}

		if (sv_devs) {
			nsc_kmem_free(sv_devs,
			    (sv_max_devices * sizeof (*sv_devs)));
			sv_devs = NULL;
		}
		sv_max_devices = 0;

		if (sv_mem) {
			nsc_unregister_mem(sv_mem);
			sv_mem = NULL;
		}

		mutex_exit(&sv_mutex);

		/*
		 * Remove all minor nodes.
		 */

		ddi_remove_minor_node(dip, NULL);
		sv_dip = NULL;

		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}
}

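/*
 * Find the sv_maj_t for the major number of "dev", allocating and
 * hashing a new one if this major number has not been seen before.
 */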
static sv_maj_t *
sv_getmajor(const dev_t dev)
{
	sv_maj_t **insert, *maj;
	major_t umaj = getmajor(dev);

	/*
	 * See if the hash table entry, or one of the hash chains
	 * is already allocated for this major number
	 */
	if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) {
		do {
			if (maj->sm_major == umaj)
				return (maj);
		} while ((maj = maj->sm_next) != 0);
	}

	/*
	 * If sv_mutex is already held there is a design flaw, as the
	 * only callers that do not hold the mutex are sv_enable() and
	 * sv_dev_to_sv().  Return an error instead of panicking the
	 * system.
	 */
	if (MUTEX_HELD(&sv_mutex)) {
		cmn_err(CE_WARN, "sv: sv_getmajor() called with sv_mutex held");
		return (NULL);
	}

	/*
	 * Determine where to allocate a new element in the hash table
	 */
	mutex_enter(&sv_mutex);
	insert = &(sv_majors[SV_MAJOR_HASH(umaj)]);
	for (maj = *insert; maj; maj = maj->sm_next) {

		/* Did another thread beat us to it? */
		if (maj->sm_major == umaj) {
			mutex_exit(&sv_mutex);
			return (maj);
		}

		/* Remember the tail of the chain as the insert point */
		if (maj->sm_next == NULL)
			insert = &maj->sm_next;
	}

	/*
	 * Allocate a new element at the insert point
	 */
	*insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem);
	if ((maj = *insert) != 0)
		maj->sm_major = umaj;
	else
		cmn_err(CE_WARN, "sv: could not allocate sv_maj_t");

	mutex_exit(&sv_mutex);

	return (maj);
}

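/*
 * getinfo(9E) entry point - SV is a single instance pseudo device.
 */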
/* ARGSUSED */

static int
sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int rc = DDI_FAILURE;

	switch (infocmd) {

	case DDI_INFO_DEVT2DEVINFO:
		*result = sv_dip;
		rc = DDI_SUCCESS;
		break;

	case DDI_INFO_DEVT2INSTANCE:
		/*
		 * We only have a single instance.
		 */
		*result = 0;
		rc = DDI_SUCCESS;
		break;

	default:
		break;
	}

	return (rc);
}


/*
 * Hashing of devices onto major device structures.
 *
 * Individual device structures are hashed onto one of the sm_hash[]
 * buckets in the relevant major device structure.
 *
 * Hash insertion and deletion -must- be done with sv_mutex held.  Hash
 * searching does not require the mutex because of the sm_seq member.
 * sm_seq is incremented on each insertion (-after- hash chain pointer
 * manipulation) and each deletion (-before- hash chain pointer
 * manipulation).  When searching the hash chain, the seq number is
 * checked before accessing each device structure; if the seq number has
 * changed, then we restart the search from the top of the hash chain.
 * If we restart more than SV_HASH_RETRY times, we take sv_mutex and search
 * the hash chain (we are guaranteed that this search cannot be
 * interrupted).
 */

#define	SV_HASH_RETRY	16

static sv_dev_t *
sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp)
{
	minor_t umin = getminor(dev);
	sv_dev_t **hb, *next, *svp;
	sv_maj_t *maj;
	int seq;
	int try;

	/* Get major hash table */
	maj = sv_getmajor(dev);
	if (majpp)
		*majpp = maj;
	if (maj == NULL)
		return (NULL);

	if (maj->sm_inuse == 0) {
		DTRACE_PROBE1(sv_dev_to_sv_end, dev_t, dev);
		return (NULL);
	}

	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
	try = 0;

retry:
	if (try > SV_HASH_RETRY)
		mutex_enter(&sv_mutex);

	seq = maj->sm_seq;
	for (svp = *hb; svp; svp = next) {
		next = svp->sv_hash;

		nsc_membar_stld();	/* preserve register load order */

		if (maj->sm_seq != seq) {
			DTRACE_PROBE1(sv_dev_to_sv_retry, dev_t, dev);
			try++;
			goto retry;
		}

		if (svp->sv_dev == dev)
			break;
	}

	if (try > SV_HASH_RETRY)
		mutex_exit(&sv_mutex);

	return (svp);
}

/*
 * Find a spare sv_dev_t slot for "udev" and link it into the hash
 * chain in the SV_PENDING state.  Must be called with sv_mutex held.
 */

static int
sv_get_state(const dev_t udev, sv_dev_t **svpp)
{
	sv_dev_t **hb, **insert, *svp;
	sv_maj_t *maj;
	minor_t umin;
	int i;

	/* Get major hash table */
	if ((maj = sv_getmajor(udev)) == NULL)
		return (SV_EBADDEV);

	/* Determine which minor hash table */
	umin = getminor(udev);
	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);

	/* look for clash */

	insert = hb;

	for (svp = *hb; svp; svp = svp->sv_hash) {
		if (svp->sv_dev == udev)
			break;

		if (svp->sv_hash == NULL)
			insert = &svp->sv_hash;
	}

	if (svp) {
		DTRACE_PROBE1(sv_get_state_enabled, dev_t, udev);
		return (SV_EENABLED);
	}

	/* look for spare sv_devs slot */

	for (i = 0; i < sv_max_devices; i++) {
		svp = &sv_devs[i];

		if (svp->sv_state == SV_DISABLE)
			break;
	}

	if (i >= sv_max_devices) {
		DTRACE_PROBE1(sv_get_state_noslots, dev_t, udev);
		return (SV_ENOSLOTS);
	}

	svp->sv_state = SV_PENDING;
	svp->sv_pending = curthread;

	*insert = svp;
	svp->sv_hash = NULL;
	maj->sm_seq++;		/* must be after the store to the hash chain */

	*svpp = svp;

	/*
	 * We do not know the size of the underlying device at
	 * this stage, so initialise "nblocks" property to
	 * zero, and update it whenever we succeed in
	 * nsc_reserve'ing the underlying nsc_fd_t.
	 */

	svp->sv_nblocks = 0;

	return (0);
}


/*
 * Remove a device structure from its hash chain.
 * Must be called with sv_mutex held.
 */

static void
sv_rm_hash(sv_dev_t *svp)
{
	sv_dev_t **svpp;
	sv_maj_t *maj;

	/* Get major hash table */
	if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
		return;

	/* remove svp from hash chain */

	svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]);
	while (*svpp) {
		if (*svpp == svp) {
			/*
			 * increment of sm_seq must be before the
			 * removal from the hash chain
			 */
			maj->sm_seq++;
			*svpp = svp->sv_hash;
			break;
		}

		svpp = &(*svpp)->sv_hash;
	}

	svp->sv_hash = NULL;
}

/*
 * Free (disable) a device structure.
 * Must be called with sv_lock(RW_WRITER) and sv_mutex held, and will
 * perform the exits during its processing.
 */

static int
sv_free(sv_dev_t *svp, const int error)
{
	struct cb_ops *cb_ops;
	sv_maj_t *maj;

	/* Get major hash table */
	if ((maj = sv_getmajor(svp->sv_dev)) == NULL) {
		rw_exit(&svp->sv_lock);
		mutex_exit(&sv_mutex);
		return (error);
	}

	svp->sv_state = SV_PENDING;
	svp->sv_pending = curthread;

	/*
	 * Close the fd's before removing from the hash or swapping
	 * back the cb_ops pointers so that the cache flushes before new
	 * io can come in.
	 */

	if (svp->sv_fd) {
		(void) nsc_close(svp->sv_fd);
		svp->sv_fd = 0;
	}

	sv_rm_hash(svp);

	if (error != SV_ESDOPEN &&
	    error != SV_ELYROPEN && --maj->sm_inuse == 0) {

		if (maj->sm_dev_ops)
			cb_ops = maj->sm_dev_ops->devo_cb_ops;
		else
			cb_ops = NULL;

		if (cb_ops && maj->sm_strategy != NULL) {
			cb_ops->cb_strategy = maj->sm_strategy;
			cb_ops->cb_close = maj->sm_close;
			cb_ops->cb_ioctl = maj->sm_ioctl;
			cb_ops->cb_write = maj->sm_write;
			cb_ops->cb_open = maj->sm_open;
			cb_ops->cb_read = maj->sm_read;
			cb_ops->cb_flag = maj->sm_flag;

			if (maj->sm_awrite)
				cb_ops->cb_awrite = maj->sm_awrite;

			if (maj->sm_aread)
				cb_ops->cb_aread = maj->sm_aread;

			/*
			 * corbin XXX
			 * Leave backing device ops in maj->sm_*
			 * to handle any requests that might come
			 * in during the disable.  This could be
			 * a problem however if the backing device
			 * driver is changed while we process these
			 * requests.
			 *
			 * maj->sm_strategy = 0;
			 * maj->sm_awrite = 0;
			 * maj->sm_write = 0;
			 * maj->sm_ioctl = 0;
			 * maj->sm_close = 0;
			 * maj->sm_aread = 0;
			 * maj->sm_read = 0;
			 * maj->sm_open = 0;
			 * maj->sm_flag = 0;
			 */
		}

		if (maj->sm_dev_ops) {
			maj->sm_dev_ops = 0;
		}
	}

	if (svp->sv_lh) {
		cred_t *crp = ddi_get_cred();

		/*
		 * Close the protective layered driver open using the
		 * Sun Private layered driver i/f.
		 */

		(void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp);
		svp->sv_lh = NULL;
	}

	svp->sv_timestamp = nsc_lbolt();
	svp->sv_state = SV_DISABLE;
	svp->sv_pending = NULL;
	rw_exit(&svp->sv_lock);
	mutex_exit(&sv_mutex);

	return (error);
}

/*
 * Reserve the device, taking into account the possibility that
 * the reserve might have to be retried.
 */
static int
sv_reserve(nsc_fd_t *fd, int flags)
{
	int eintr_count;
	int rc;

	eintr_count = 0;
	do {
		rc = nsc_reserve(fd, flags);
		if (rc == EINTR) {
			++eintr_count;
			delay(2);
		}
	} while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT));

	return (rc);
}

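/*
 * Enable SV on the device "udev": open the underlying device through
 * nsctl, take a protective layered driver open, and (on first use of
 * this major number) interpose the sv_lyr_*() entry points into the
 * underlying driver's cb_ops.
 */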
static int
sv_enable(const caddr_t path, const int flag,
    const dev_t udev, spcs_s_info_t kstatus)
{
	struct dev_ops *dev_ops;
	struct cb_ops *cb_ops;
	sv_dev_t *svp;
	sv_maj_t *maj;
	nsc_size_t nblocks;
	int rc;
	cred_t *crp;
	ldi_ident_t	li;

	if (udev == (dev_t)-1 || udev == 0) {
		DTRACE_PROBE1(sv_enable_err_baddev, dev_t, udev);
		return (SV_EBADDEV);
	}

	if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) {
		DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev);
		return (SV_EAMODE);
	}

	/* Get major hash table */
	if ((maj = sv_getmajor(udev)) == NULL)
		return (SV_EBADDEV);

	mutex_enter(&sv_mutex);

	rc = sv_get_state(udev, &svp);
	if (rc) {
		mutex_exit(&sv_mutex);
		DTRACE_PROBE1(sv_enable_err_state, dev_t, udev);
		return (rc);
	}

	rw_enter(&svp->sv_lock, RW_WRITER);

	/*
	 * Get real fd used for io
	 */

	svp->sv_dev = udev;
	svp->sv_flag = flag;

	/*
	 * OR in NSC_DEVICE to ensure that nskern grabs the real strategy
	 * function pointer before sv swaps them out.
	 */

	svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE),
	    sv_fd_def, (blind_t)udev, &rc);

	if (svp->sv_fd == NULL) {
		if (kstatus)
			spcs_s_add(kstatus, rc);
		DTRACE_PROBE1(sv_enable_err_fd, dev_t, udev);
		return (sv_free(svp, SV_ESDOPEN));
	}

	/*
	 * Perform a layered driver open using the Sun Private layered
	 * driver i/f to ensure that the cb_ops structure for the driver
	 * is not detached out from under us whilst sv is enabled.
	 */

	crp = ddi_get_cred();
	svp->sv_lh = NULL;

	if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) {
		rc = ldi_open_by_dev(&svp->sv_dev,
		    OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li);
	}

	if (rc != 0) {
		if (kstatus)
			spcs_s_add(kstatus, rc);
		DTRACE_PROBE1(sv_enable_err_lyr_open, dev_t, udev);
		return (sv_free(svp, SV_ELYROPEN));
	}

	/*
	 * Do layering if required - must happen after nsc_open().
	 */

	if (maj->sm_inuse++ == 0) {
		maj->sm_dev_ops = nsc_get_devops(getmajor(udev));

		if (maj->sm_dev_ops == NULL ||
		    maj->sm_dev_ops->devo_cb_ops == NULL) {
			DTRACE_PROBE1(sv_enable_err_load, dev_t, udev);
			return (sv_free(svp, SV_ELOAD));
		}

		dev_ops = maj->sm_dev_ops;
		cb_ops = dev_ops->devo_cb_ops;

		if (cb_ops->cb_strategy == NULL ||
		    cb_ops->cb_strategy == nodev ||
		    cb_ops->cb_strategy == nulldev) {
			DTRACE_PROBE1(sv_enable_err_nostrategy, dev_t, udev);
			return (sv_free(svp, SV_ELOAD));
		}

		if (cb_ops->cb_strategy == sv_lyr_strategy) {
			DTRACE_PROBE1(sv_enable_err_svstrategy, dev_t, udev);
			return (sv_free(svp, SV_ESTRATEGY));
		}

		maj->sm_strategy = cb_ops->cb_strategy;
		maj->sm_close = cb_ops->cb_close;
		maj->sm_ioctl = cb_ops->cb_ioctl;
		maj->sm_write = cb_ops->cb_write;
		maj->sm_open = cb_ops->cb_open;
		maj->sm_read = cb_ops->cb_read;
		maj->sm_flag = cb_ops->cb_flag;

		cb_ops->cb_flag = cb_ops->cb_flag | D_MP;
		cb_ops->cb_strategy = sv_lyr_strategy;
		cb_ops->cb_close = sv_lyr_close;
		cb_ops->cb_ioctl = sv_lyr_ioctl;
		cb_ops->cb_write = sv_lyr_write;
		cb_ops->cb_open = sv_lyr_open;
		cb_ops->cb_read = sv_lyr_read;

		/*
		 * Check that the driver has async I/O entry points
		 * before changing them.
		 */

		if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) {
			maj->sm_awrite = 0;
			maj->sm_aread = 0;
		} else {
			maj->sm_awrite = cb_ops->cb_awrite;
			maj->sm_aread = cb_ops->cb_aread;

			cb_ops->cb_awrite = sv_lyr_awrite;
			cb_ops->cb_aread = sv_lyr_aread;
		}

		/*
		 * Bug 4645743
		 *
		 * Prevent sv from ever unloading after it has interposed
		 * on a major device because there is a race between
		 * sv removing its layered entry points from the target
		 * dev_ops, a client coming in and accessing the driver,
		 * and the kernel modunloading the sv text.
		 *
		 * To allow unload, do svboot -u, which only happens at
		 * pkgrm time.
		 */
		ASSERT(MUTEX_HELD(&sv_mutex));
		sv_mod_status = SV_PREVENT_UNLOAD;
	}

	svp->sv_timestamp = nsc_lbolt();
	svp->sv_state = SV_ENABLE;
	svp->sv_pending = NULL;
	rw_exit(&svp->sv_lock);

	sv_ndevices++;
	mutex_exit(&sv_mutex);

	nblocks = 0;
	if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) {
		nblocks = svp->sv_nblocks;
		nsc_release(svp->sv_fd);
	}

	cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n",
	    svp->sv_dev, nblocks);

	return (0);
}

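/*
 * Allow the module to be unloaded (SVIOC_UNLOAD, issued by svboot -u),
 * provided that no devices are SV enabled and the threadset is gone.
 */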
static int
sv_prepare_unload(void)
{
	int rc = 0;

	mutex_enter(&sv_mutex);

	if (sv_mod_status == SV_PREVENT_UNLOAD) {
		if ((sv_ndevices != 0) || (sv_tset != NULL)) {
			rc = EBUSY;
		} else {
			sv_mod_status = SV_ALLOW_UNLOAD;
			delay(SV_WAIT_UNLOAD * drv_usectohz(1000000));
		}
	}

	mutex_exit(&sv_mutex);
	return (rc);
}

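/*
 * nsctl "Attach" callback: refresh the cached partition size and
 * maximum transfer size for the device.
 */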
static int
svattach_fd(blind_t arg)
{
	dev_t dev = (dev_t)arg;
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
	int rc;

	if (sv_debug > 0)
		cmn_err(CE_CONT, "svattach_fd(%p, %p)\n", arg, (void *)svp);

	if (svp == NULL) {
		cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg);
		return (0);
	}

	if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) {
		cmn_err(CE_WARN,
		    "!svattach_fd: nsc_partsize() failed, rc %d", rc);
		svp->sv_nblocks = 0;
	}

	if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) {
		cmn_err(CE_WARN,
		    "!svattach_fd: nsc_maxfbas() failed, rc %d", rc);
		svp->sv_maxfbas = 0;
	}

	if (sv_debug > 0) {
		cmn_err(CE_CONT,
		    "svattach_fd(%p): size %" NSC_SZFMT ", "
		    "maxfbas %" NSC_SZFMT "\n",
		    arg, svp->sv_nblocks, svp->sv_maxfbas);
	}

	return (0);
}

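/*
 * nsctl "Detach" callback: invalidate the cached device geometry.
 */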
static int
svdetach_fd(blind_t arg)
{
	dev_t dev = (dev_t)arg;
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);

	if (sv_debug > 0)
		cmn_err(CE_CONT, "svdetach_fd(%p, %p)\n", arg, (void *)svp);

	/* svp can be NULL during disable of an sv */
	if (svp == NULL)
		return (0);

	svp->sv_maxfbas = 0;
	svp->sv_nblocks = 0;
	return (0);
}


/*
 * Disable SV on a device.  Acquires both sv_mutex and
 * sv_lock(RW_WRITER); on the success path sv_free() drops both locks
 * during its processing.
 */

/* ARGSUSED */
static int
sv_disable(dev_t dev, spcs_s_info_t kstatus)
{
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);

	if (svp == NULL) {
		DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp);
		return (SV_ENODEV);
	}

	mutex_enter(&sv_mutex);
	rw_enter(&svp->sv_lock, RW_WRITER);

	if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) {
		rw_exit(&svp->sv_lock);
		mutex_exit(&sv_mutex);

		DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp);
		return (SV_EDISABLED);
	}

	sv_ndevices--;
	return (sv_free(svp, 0));
}

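/*
 * Layered open: interposed into the underlying driver's cb_open.
 * Passes the open through to the underlying driver, and can fake a
 * successful open if the underlying open fails but the volume is
 * still readable through nsctl (e.g. an RDC secondary).
 */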
static int
sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp)
{
	nsc_buf_t *tmph;
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	dev_t odev;
	int ret;
	int rc;

	svp = sv_dev_to_sv(*devp, &maj);

	if (svp) {
		if (svp->sv_state == SV_PENDING &&
		    svp->sv_pending == curthread) {
			/*
			 * This is a recursive open from a call to
			 * ddi_lyr_open_by_devt and so we just want
			 * to pass it straight through to the
			 * underlying driver.
			 */
			DTRACE_PROBE2(sv_lyr_open_recursive,
			    sv_dev_t *, svp,
			    dev_t, *devp);
			svp = NULL;
		} else
			rw_enter(&svp->sv_lock, RW_READER);
	}

	odev = *devp;

	if (maj && (fn = maj->sm_open) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			UNSAFE_ENTER();
			ret = (*fn)(devp, flag, otyp, crp);
			UNSAFE_EXIT();
		} else {
			ret = (*fn)(devp, flag, otyp, crp);
		}

		if (ret == 0) {
			/*
			 * Re-acquire svp if the driver changed *devp.
			 */

			if (*devp != odev) {
				if (svp != NULL)
					rw_exit(&svp->sv_lock);

				svp = sv_dev_to_sv(*devp, NULL);

				if (svp) {
					rw_enter(&svp->sv_lock, RW_READER);
				}
			}
		}
	} else {
		ret = ENODEV;
	}

	if (svp && ret != 0 && svp->sv_state == SV_ENABLE) {
		/*
		 * Underlying DDI open failed, but we have this
		 * device SV enabled.  If we can read some data
		 * from the device, fake a successful open (this
		 * probably means that this device is RDC'd and we
		 * are getting the data from the secondary node).
		 *
		 * The reserve must be done with NSC_TRY|NSC_NOWAIT to
		 * ensure that it does not deadlock if this open is
		 * coming from nskernd:get_bsize().
		 */
		rc = sv_reserve(svp->sv_fd,
		    NSC_TRY|NSC_NOWAIT|NSC_MULTI|NSC_PCATCH);
		if (rc == 0) {
			tmph = NULL;

			rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph);
			if (rc <= 0) {
				/* success */
				ret = 0;
			}

			if (tmph) {
				(void) nsc_free_buf(tmph);
				tmph = NULL;
			}

			nsc_release(svp->sv_fd);

			/*
			 * Count the number of layered opens that we
			 * fake since we have to fake a matching number
			 * of closes (OTYP_LYR open/close calls must be
			 * paired).
			 */

			if (ret == 0 && otyp == OTYP_LYR) {
				mutex_enter(&svp->sv_olock);
				svp->sv_openlcnt++;
				mutex_exit(&svp->sv_olock);
			}
		}
	}

	if (svp) {
		rw_exit(&svp->sv_lock);
	}

	return (ret);
}

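/*
 * Layered close: interposed into the underlying driver's cb_close.
 * Consumes any layered closes that match opens faked by sv_lyr_open()
 * before passing the close through to the underlying driver.
 */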
static int
sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int ret;

	svp = sv_dev_to_sv(dev, &maj);

	if (svp &&
	    svp->sv_state == SV_PENDING &&
	    svp->sv_pending == curthread) {
		/*
		 * This is a recursive close from a call to
		 * ddi_lyr_close and so we just want
		 * to pass it straight through to the
		 * underlying driver.
		 */
		DTRACE_PROBE2(sv_lyr_close_recursive,
		    sv_dev_t *, svp,
		    dev_t, dev);
		svp = NULL;
	}

	if (svp) {
		rw_enter(&svp->sv_lock, RW_READER);

		if (otyp == OTYP_LYR) {
			mutex_enter(&svp->sv_olock);

			if (svp->sv_openlcnt) {
				/*
				 * Consume sufficient layered closes to
				 * account for the opens that we faked
				 * whilst the device was failed.
				 */
				svp->sv_openlcnt--;
				mutex_exit(&svp->sv_olock);
				rw_exit(&svp->sv_lock);

				DTRACE_PROBE1(sv_lyr_close_end, dev_t, dev);

				return (0);
			}

			mutex_exit(&svp->sv_olock);
		}
	}

	if (maj && (fn = maj->sm_close) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			UNSAFE_ENTER();
			ret = (*fn)(dev, flag, otyp, crp);
			UNSAFE_EXIT();
		} else {
			ret = (*fn)(dev, flag, otyp, crp);
		}
	} else {
		ret = ENODEV;
	}

	if (svp) {
		rw_exit(&svp->sv_lock);
	}

	return (ret);
}


/*
 * Convert the specified dev_t into a locked and enabled sv_dev_t, or
 * return NULL.
 */
static sv_dev_t *
sv_find_enabled(const dev_t dev, sv_maj_t **majpp)
{
	sv_dev_t *svp;

	while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) {
		rw_enter(&svp->sv_lock, RW_READER);

		if (svp->sv_state == SV_ENABLE) {
			/* locked and enabled */
			break;
		}

		/*
		 * State was changed while waiting on the lock.
		 * Wait for a stable state.
		 */
		rw_exit(&svp->sv_lock);

		DTRACE_PROBE1(sv_find_enabled_retry, dev_t, dev);

		delay(2);
	}

	return (svp);
}

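/*
 * Common read/write path for the layered character interface.  If the
 * device is SV enabled, route the I/O through nsctl; otherwise fall
 * back to the underlying driver's read/write entry point.
 */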
static int
sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int rc;

	svp = sv_find_enabled(dev, &maj);
	if (svp == NULL) {
		if (maj) {
			if (rw == NSC_READ)
				fn = maj->sm_read;
			else
				fn = maj->sm_write;

			if (fn != 0) {
				if (!(maj->sm_flag & D_MP)) {
					UNSAFE_ENTER();
					rc = (*fn)(dev, uiop, crp);
					UNSAFE_EXIT();
				} else {
					rc = (*fn)(dev, uiop, crp);
				}
			} else {
				rc = ENODEV;
			}

			return (rc);
		} else {
			return (ENODEV);
		}
	}

	ASSERT(RW_READ_HELD(&svp->sv_lock));

	if (svp->sv_flag == 0) {
		/*
		 * guard access mode
		 * - prevent user level access to the device
		 */
		DTRACE_PROBE1(sv_lyr_uio_err_guard, uio_t *, uiop);
		rc = EPERM;
		goto out;
	}

	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
		DTRACE_PROBE1(sv_lyr_uio_err_rsrv, uio_t *, uiop);
		goto out;
	}

	if (rw == NSC_READ)
		rc = nsc_uread(svp->sv_fd, uiop, crp);
	else
		rc = nsc_uwrite(svp->sv_fd, uiop, crp);

	nsc_release(svp->sv_fd);

out:
	rw_exit(&svp->sv_lock);

	return (rc);
}


static int
sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp)
{
	return (sv_lyr_uio(dev, uiop, crp, NSC_READ));
}


static int
sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp)
{
	return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE));
}


/* ARGSUSED */

static int
sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp)
{
	return (aphysio(sv_lyr_strategy,
	    anocancel, dev, B_READ, minphys, aio));
}


/* ARGSUSED */

static int
sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp)
{
	return (aphysio(sv_lyr_strategy,
	    anocancel, dev, B_WRITE, minphys, aio));
}


/*
 * Set up an array containing the list of raw path names.
 * The array for the paths is passed in via ptr, and the size of the
 * array is in size.
 *
 * If there are more layered devices than will fit in the array, the
 * number of extra layered devices is returned via *extra.  Otherwise
 * *extra is set to zero.
 *
 * Input:
 *	ptr	: array for paths (sv_name_t, or sv_name32_t if ilp32)
 *	size	: size of the array
 *
 * Output (*extra):
 *	zero	: All paths fit in array
 *	>0	: Number of defined layered devices that don't fit in array
 */

static int
sv_list(void *ptr, const int size, int *extra, const int ilp32)
{
	sv_name32_t *svn32;
	sv_name_t *svn;
	sv_dev_t *svp;
	int *mode, *nblocks;
	int i, index;
	char *path;

	*extra = 0;
	index = 0;

	if (ilp32)
		svn32 = ptr;
	else
		svn = ptr;

	mutex_enter(&sv_mutex);
	for (i = 0; i < sv_max_devices; i++) {
		svp = &sv_devs[i];

		rw_enter(&svp->sv_lock, RW_READER);

		if (svp->sv_state != SV_ENABLE) {
			rw_exit(&svp->sv_lock);
			continue;
		}

		if ((*extra) != 0 || ptr == NULL) {
			/* Another overflow entry */
			rw_exit(&svp->sv_lock);
			(*extra)++;
			continue;
		}

		if (ilp32) {
			nblocks = &svn32->svn_nblocks;
			mode = &svn32->svn_mode;
			path = svn32->svn_path;

			svn32->svn_timestamp = (uint32_t)svp->sv_timestamp;
			svn32++;
		} else {
			nblocks = &svn->svn_nblocks;
			mode = &svn->svn_mode;
			path = svn->svn_path;

			svn->svn_timestamp = svp->sv_timestamp;
			svn++;
		}

		(void) strcpy(path, nsc_pathname(svp->sv_fd));
		*nblocks = svp->sv_nblocks;
		*mode = svp->sv_flag;

		if (*nblocks == 0) {
			if (sv_debug > 3)
				cmn_err(CE_CONT, "sv_list: need to reserve\n");

			if (sv_reserve(svp->sv_fd,
			    NSC_MULTI|NSC_PCATCH) == 0) {
				*nblocks = svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			}
		}

		if (++index >= size) {
			/* Out of space */
			(*extra)++;
		}

		rw_exit(&svp->sv_lock);
	}
	mutex_exit(&sv_mutex);

	if (index < size) {
		/* NULL terminated list */
		if (ilp32)
			svn32->svn_path[0] = '\0';
		else
			svn->svn_path[0] = '\0';
	}

	return (0);
}

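/*
 * Grow or shrink the threadset in response to devices being enabled
 * or disabled; "threads" is the signed change in the number of
 * threads needed.  Must be called with sv_mutex held.
 */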
static void
sv_thread_tune(int threads)
{
	int incr = (threads > 0) ? 1 : -1;
	int change = 0;
	int nthreads;

	ASSERT(MUTEX_HELD(&sv_mutex));

	if (sv_threads_extra) {
		/* keep track of any additional threads requested */
		if (threads > 0) {
			sv_threads_extra += threads;
			return;
		}
		threads = -threads;
		if (threads >= sv_threads_extra) {
			threads -= sv_threads_extra;
			sv_threads_extra = 0;
			/* fall through to while loop */
		} else {
			sv_threads_extra -= threads;
			return;
		}
	} else if (threads > 0) {
		/*
		 * do not increase the number of threads beyond
		 * sv_threads_max when doing dynamic thread tuning
		 */
		nthreads = nst_nthread(sv_tset);
		if ((nthreads + threads) > sv_threads_max) {
			sv_threads_extra = nthreads + threads - sv_threads_max;
			threads = sv_threads_max - nthreads;
			if (threads <= 0)
				return;
		}
	}

	if (threads < 0)
		threads = -threads;

	while (threads--) {
		nthreads = nst_nthread(sv_tset);
		sv_threads_needed += incr;

		if (sv_threads_needed >= nthreads)
			change += nst_add_thread(sv_tset, sv_threads_inc);
		else if ((sv_threads_needed <
		    (nthreads - (sv_threads_inc + sv_threads_hysteresis))) &&
		    ((nthreads - sv_threads_inc) >= sv_threads))
			change -= nst_del_thread(sv_tset, sv_threads_inc);
	}

#ifdef DEBUG
	if (change) {
		cmn_err(CE_NOTE,
		    "sv_thread_tune: threads needed %d, nthreads %d, "
		    "nthreads change %d",
		    sv_threads_needed, nst_nthread(sv_tset), change);
	}
#endif
}

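/*
 * Open of the SV control device (/dev/sv): (re)initialize the
 * per-device state array if nsctl has attached since driver load.
 */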
/* ARGSUSED */
static int
svopen(dev_t *devp, int flag, int otyp, cred_t *crp)
{
	int rc;

	mutex_enter(&sv_mutex);
	rc = sv_init_devs();
	mutex_exit(&sv_mutex);

	return (rc);
}

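/*
 * Close of the SV control device: if no devices remain enabled, give
 * the threadset up to 5 seconds to quiesce and then destroy it.
 */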
/* ARGSUSED */
static int
svclose(dev_t dev, int flag, int otyp, cred_t *crp)
{
	const int secs = HZ * 5;
	const int ticks = HZ / 10;
	int loops = secs / ticks;

	mutex_enter(&sv_mutex);
	while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) {
		if (nst_nlive(sv_tset) <= 0) {
			nst_destroy(sv_tset);
			sv_tset = NULL;
			break;
		}

		/* threads still active - wait for them to exit */
		mutex_exit(&sv_mutex);
		delay(ticks);
		loops--;
		mutex_enter(&sv_mutex);
	}
	mutex_exit(&sv_mutex);

	if (loops <= 0) {
		cmn_err(CE_WARN,
#ifndef DEBUG
		    /* do not write to console when non-DEBUG */
		    "!"
#endif
		    "sv:svclose: threads still active "
		    "after %d sec - leaking thread set", secs / HZ);
	}

	return (0);
}

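/*
 * ioctl handler for the SV control device: enable/disable devices,
 * list the current configuration, and report version information.
 */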
static int
svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp)
{
	char itmp1[12], itmp2[12]; /* temp char array for editing ints */
	spcs_s_info_t kstatus;	/* Kernel version of spcs status */
	spcs_s_info_t ustatus;	/* Address of user version of spcs status */
	sv_list32_t svl32;	/* 32 bit Initial structure for SVIOC_LIST */
	sv_version_t svv;	/* Version structure */
	sv_conf_t svc;		/* User config structure */
	sv_list_t svl;		/* Initial structure for SVIOC_LIST */
	void *usvn;		/* Address of user sv_name_t */
	void *svn = NULL;	/* Array for SVIOC_LIST */
	uint64_t phash;		/* pathname hash */
	int rc = 0;		/* Return code -- errno */
	int size;		/* Number of items in array */
	int bytes;		/* Byte size of array */
	int ilp32;		/* Convert data structures for ilp32 userland */

	*rvalp = 0;

	/*
	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, carry on.  Otherwise
	 * it must be SV_ALLOW_UNLOAD, meaning the driver is expected to
	 * unload shortly, so refuse new ioctls.
	 *
	 * SV_ALLOW_UNLOAD is a final state, so there is no need to grab
	 * sv_mutex here.
	 */
	if (sv_mod_status == SV_ALLOW_UNLOAD) {
		return (EBUSY);
	}

	if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0))
		return (rc);

	kstatus = spcs_s_kcreate();
	if (!kstatus) {
		DTRACE_PROBE1(sv_ioctl_err_kcreate, dev_t, dev);
		return (ENOMEM);
	}

	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);

	switch (cmd) {

	case SVIOC_ENABLE:

		if (ilp32) {
			sv_conf32_t svc32;

			if (ddi_copyin((void *)arg, &svc32,
			    sizeof (svc32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
			(void) strcpy(svc.svc_path, svc32.svc_path);
			svc.svc_flag  = svc32.svc_flag;
			svc.svc_major = svc32.svc_major;
			svc.svc_minor = svc32.svc_minor;
		} else {
			if (ddi_copyin((void *)arg, &svc,
			    sizeof (svc), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
		}

		/* force to raw access */
		svc.svc_flag = NSC_DEVICE;

		if (sv_tset == NULL) {
			mutex_enter(&sv_mutex);

			if (sv_tset == NULL) {
				sv_tset = nst_init("sv_thr", sv_threads);
			}

			mutex_exit(&sv_mutex);

			if (sv_tset == NULL) {
				cmn_err(CE_WARN,
				    "sv: could not allocate %d threads",
				    sv_threads);
			}
		}

		rc = sv_enable(svc.svc_path, svc.svc_flag,
		    makedevice(svc.svc_major, svc.svc_minor), kstatus);

		if (rc == 0) {
			sv_config_time = nsc_lbolt();

			mutex_enter(&sv_mutex);
			sv_thread_tune(sv_threads_dev);
			mutex_exit(&sv_mutex);
		}

		DTRACE_PROBE3(sv_ioctl_end, dev_t, dev, int, *rvalp, int, rc);

		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
		/* NOTREACHED */

	case SVIOC_DISABLE:

		if (ilp32) {
			sv_conf32_t svc32;

			if (ddi_copyin((void *)arg, &svc32,
			    sizeof (svc32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
			svc.svc_major = svc32.svc_major;
			svc.svc_minor = svc32.svc_minor;
			(void) strcpy(svc.svc_path, svc32.svc_path);
			svc.svc_flag  = svc32.svc_flag;
		} else {
			if (ddi_copyin((void *)arg, &svc,
			    sizeof (svc), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
		}

		if (svc.svc_major == (major_t)-1 &&
		    svc.svc_minor == (minor_t)-1) {
			sv_dev_t *svp;
			int i;

			/*
			 * User level could not find the minor device
			 * node, so do this the slow way by searching
			 * the entire sv config for a matching pathname.
			 */

			phash = nsc_strhash(svc.svc_path);

			mutex_enter(&sv_mutex);

			for (i = 0; i < sv_max_devices; i++) {
				svp = &sv_devs[i];

				if (svp->sv_state == SV_DISABLE ||
				    svp->sv_fd == NULL)
					continue;

				if (nsc_fdpathcmp(svp->sv_fd, phash,
				    svc.svc_path) == 0) {
					svc.svc_major = getmajor(svp->sv_dev);
					svc.svc_minor = getminor(svp->sv_dev);
					break;
				}
			}

			mutex_exit(&sv_mutex);

			if (svc.svc_major == (major_t)-1 &&
			    svc.svc_minor == (minor_t)-1)
				return (spcs_s_ocopyoutf(&kstatus,
				    svc.svc_error, SV_ENODEV));
		}

		rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor),
		    kstatus);

		if (rc == 0) {
			sv_config_time = nsc_lbolt();

			mutex_enter(&sv_mutex);
			sv_thread_tune(-sv_threads_dev);
			mutex_exit(&sv_mutex);
		}

		DTRACE_PROBE3(sv_ioctl_2, dev_t, dev, int, *rvalp, int, rc);

		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
		/* NOTREACHED */

	case SVIOC_LIST:

		if (ilp32) {
			if (ddi_copyin((void *)arg, &svl32,
			    sizeof (svl32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = (spcs_s_info_t)svl32.svl_error;
			size = svl32.svl_count;
			usvn = (void *)(unsigned long)svl32.svl_names;
		} else {
			if (ddi_copyin((void *)arg, &svl,
			    sizeof (svl), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = svl.svl_error;
			size = svl.svl_count;
			usvn = svl.svl_names;
		}

		/* Do some boundary checking */
		if ((size < 0) || (size > sv_max_devices)) {
			/* Array size is out of range */
			return (spcs_s_ocopyoutf(&kstatus, ustatus,
			    SV_EARRBOUNDS, "0",
			    spcs_s_inttostring(sv_max_devices, itmp1,
			    sizeof (itmp1), 0),
			    spcs_s_inttostring(size, itmp2,
			    sizeof (itmp2), 0)));
		}

		if (ilp32)
			bytes = size * sizeof (sv_name32_t);
		else
			bytes = size * sizeof (sv_name_t);

		/* Allocate memory for the array of structures */
		if (bytes != 0) {
			svn = kmem_zalloc(bytes, KM_SLEEP);
			if (!svn) {
				return (spcs_s_ocopyoutf(&kstatus,
				    ustatus, ENOMEM));
			}
		}

		rc = sv_list(svn, size, rvalp, ilp32);
		if (rc) {
			if (svn != NULL)
				kmem_free(svn, bytes);
			return (spcs_s_ocopyoutf(&kstatus, ustatus, rc));
		}

		if (ilp32) {
			svl32.svl_timestamp = (uint32_t)sv_config_time;
			svl32.svl_maxdevs = (int32_t)sv_max_devices;

			/* Return the list structure */
			if (ddi_copyout(&svl32, (void *)arg,
			    sizeof (svl32), mode) < 0) {
				spcs_s_kfree(kstatus);
				if (svn != NULL)
					kmem_free(svn, bytes);
				return (EFAULT);
			}
		} else {
			svl.svl_timestamp = sv_config_time;
			svl.svl_maxdevs = sv_max_devices;

			/* Return the list structure */
			if (ddi_copyout(&svl, (void *)arg,
			    sizeof (svl), mode) < 0) {
				spcs_s_kfree(kstatus);
				if (svn != NULL)
					kmem_free(svn, bytes);
				return (EFAULT);
			}
		}

		/* Return the array */
		if (svn != NULL) {
			if (ddi_copyout(svn, usvn, bytes, mode) < 0) {
				kmem_free(svn, bytes);
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
			kmem_free(svn, bytes);
		}

		DTRACE_PROBE3(sv_ioctl_3, dev_t, dev, int, *rvalp, int, 0);

		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
		/* NOTREACHED */

	case SVIOC_VERSION:

		if (ilp32) {
			sv_version32_t svv32;

			if (ddi_copyin((void *)arg, &svv32,
			    sizeof (svv32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svv32.svv_major_rev = sv_major_rev;
			svv32.svv_minor_rev = sv_minor_rev;
			svv32.svv_micro_rev = sv_micro_rev;
			svv32.svv_baseline_rev = sv_baseline_rev;

			if (ddi_copyout(&svv32, (void *)arg,
			    sizeof (svv32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = (spcs_s_info_t)svv32.svv_error;
		} else {
			if (ddi_copyin((void *)arg, &svv,
			    sizeof (svv), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svv.svv_major_rev = sv_major_rev;
			svv.svv_minor_rev = sv_minor_rev;
			svv.svv_micro_rev = sv_micro_rev;
			svv.svv_baseline_rev = sv_baseline_rev;

			if (ddi_copyout(&svv, (void *)arg,
			    sizeof (svv), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = svv.svv_error;
		}

		DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, 0);

		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
		/* NOTREACHED */

	case SVIOC_UNLOAD:
		rc = sv_prepare_unload();

		if (ddi_copyout(&rc, (void *)arg, sizeof (rc), mode) < 0) {
			rc = EFAULT;
		}

		spcs_s_kfree(kstatus);
		return (rc);

	default:
		spcs_s_kfree(kstatus);

		DTRACE_PROBE3(sv_ioctl_err_einval,
		    dev_t, dev, int, *rvalp, int, EINVAL);

		return (EINVAL);
		/* NOTREACHED */
	}

	/* NOTREACHED */
}


/* ARGSUSED */
static int
svprint(dev_t dev, char *str)
{
	int instance = ddi_get_instance(sv_dip);

	cmn_err(CE_WARN, "%s%d: %s", ddi_get_name(sv_dip), instance, str);
	return (0);
}

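/*
 * The guts of the layered strategy routine: route the buf through
 * nsctl, copying data between the struct buf and the nsc_vec_t arrays
 * of the allocated nsctl buffer, in chunks of at most sv_maxfbas FBAs.
 */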
2119 static void
2120 _sv_lyr_strategy(struct buf *bp)
2121 {
2122 	caddr_t buf_addr;		/* pointer to linear buffer in bp */
2123 	nsc_buf_t *bufh = NULL;
2124 	nsc_buf_t *hndl = NULL;
2125 	sv_dev_t *svp;
2126 	nsc_vec_t *v;
2127 	sv_maj_t *maj;
2128 	nsc_size_t fba_req, fba_len;	/* FBA lengths */
2129 	nsc_off_t fba_off;		/* FBA offset */
2130 	size_t tocopy, nbytes;		/* byte lengths */
2131 	int rw, rc;			/* flags and return codes */
2132 	int (*fn)();
2133 
2134 	rc = 0;
2135 
2136 	if (sv_debug > 5)
2137 		cmn_err(CE_CONT, "_sv_lyr_strategy(%p)\n", (void *)bp);
2138 
2139 	svp = sv_find_enabled(bp->b_edev, &maj);
2140 	if (svp == NULL) {
2141 		if (maj && (fn = maj->sm_strategy) != 0) {
2142 			if (!(maj->sm_flag & D_MP)) {
2143 				UNSAFE_ENTER();
2144 				rc = (*fn)(bp);
2145 				UNSAFE_EXIT();
2146 			} else {
2147 				rc = (*fn)(bp);
2148 			}
2149 			return;
2150 		} else {
2151 			bioerror(bp, ENODEV);
2152 			biodone(bp);
2153 			return;
2154 		}
2155 	}
2156 
2157 	ASSERT(RW_READ_HELD(&svp->sv_lock));
2158 
2159 	if (svp->sv_flag == 0) {
2160 		/*
2161 		 * guard access mode
2162 		 * - prevent user level access to the device
2163 		 */
2164 		DTRACE_PROBE1(sv_lyr_strategy_err_guard,
2165 				struct buf *, bp);
2166 		bioerror(bp, EPERM);
2167 		goto out;
2168 	}
2169 
2170 	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
2171 		DTRACE_PROBE1(sv_lyr_strategy_err_rsrv,
2172 				struct buf *, bp);
2173 
2174 		if (rc == EINTR)
2175 			cmn_err(CE_WARN, "nsc_reserve() returned EINTR");
2176 		bioerror(bp, rc);
2177 		goto out;
2178 	}
2179 
2180 	if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) {
2181 		DTRACE_PROBE1(sv_lyr_strategy_eof,
2182 				struct buf *, bp);
2183 
2184 		if (bp->b_flags & B_READ) {
2185 			/* return EOF, not an error */
2186 			bp->b_resid = bp->b_bcount;
2187 			bioerror(bp, 0);
2188 		} else
2189 			bioerror(bp, EINVAL);
2190 
2191 		goto done;
2192 	}
2193 
2194 	/*
2195 	 * Preallocate a handle once per call to strategy.
2196 	 * If this fails, then the nsc_alloc_buf() will allocate
2197 	 * a temporary handle per allocation/free pair.
2198 	 */
2199 
2200 	DTRACE_PROBE1(sv_dbg_alloch_start,
2201 			sv_dev_t *, svp);
2202 
2203 	bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL);
2204 
2205 	DTRACE_PROBE1(sv_dbg_alloch_end,
2206 			sv_dev_t *, svp);
2207 
2208 	if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) {
2209 		DTRACE_PROBE1(sv_lyr_strategy_err_hactive,
2210 				struct buf *, bp);
2211 
2212 		cmn_err(CE_WARN,
2213 			"sv: allocated active handle (bufh %p, flags %x)",
2214 			(void *)bufh, bufh->sb_flag);
2215 
2216 		bioerror(bp, ENXIO);
2217 		goto done;
2218 	}
2219 
2220 	fba_req = FBA_LEN(bp->b_bcount);
2221 	if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks)
2222 		fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno);
2223 
2224 	rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE;
2225 
2226 	bp_mapin(bp);
2227 
2228 	bp->b_resid = bp->b_bcount;
2229 	buf_addr = bp->b_un.b_addr;
2230 	fba_off = 0;
2231 
2232 	/*
2233 	 * fba_req  - requested size of transfer in FBAs after
2234 	 *		truncation to device extent, and allowing for
2235 	 *		possible non-FBA bounded final chunk.
2236 	 * fba_off  - offset of start of chunk from start of bp in FBAs.
2237 	 * fba_len  - size of this chunk in FBAs.
2238 	 */
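
	/*
	 * Worked example (illustrative, assuming 512-byte FBAs and
	 * sv_maxfbas == 256): a 1MB request gives fba_req == 2048,
	 * which the loop below carves into 8 chunks of 256 FBAs.
	 * The first pass runs with fba_off == 0 and fba_len == 256,
	 * the second with fba_off == 256, and so on until fba_req
	 * has been reduced to zero.
	 */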
2239 
2240 loop:
2241 	fba_len = min(fba_req, svp->sv_maxfbas);
2242 	hndl = bufh;
2243 
2244 	DTRACE_PROBE4(sv_dbg_allocb_start,
2245 	    sv_dev_t *, svp,
2246 	    uint64_t, (uint64_t)(bp->b_lblkno + fba_off),
2247 	    uint64_t, (uint64_t)fba_len,
2248 	    int, rw);
2249 
2250 	rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off),
2251 				fba_len, rw, &hndl);
2252 
2253 	DTRACE_PROBE1(sv_dbg_allocb_end,
2254 			sv_dev_t *, svp);
2255 
2256 	if (rc > 0) {
2257 		DTRACE_PROBE1(sv_lyr_strategy_err_alloc,
2258 				struct buf *, bp);
2259 		bioerror(bp, rc);
2260 		if (hndl != bufh)
2261 			(void) nsc_free_buf(hndl);
2262 		hndl = NULL;
2263 		goto done;
2264 	}
2265 
2266 	tocopy = min(FBA_SIZE(fba_len), bp->b_resid);
2267 	v = hndl->sb_vec;
2268 
2269 	if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) {
2270 		/*
2271 		 * Not overwriting all of the last FBA, so read in the
2272 		 * old contents now before we overwrite it with the new
2273 		 * data.
2274 		 */
2275 
2276 		DTRACE_PROBE2(sv_dbg_read_start,
2277 			sv_dev_t *, svp,
2278 			uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1));
2279 
2280 		rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0);
2281 		if (rc > 0) {
2282 			bioerror(bp, rc);
2283 			goto done;
2284 		}
2285 
2286 		DTRACE_PROBE1(sv_dbg_read_end,
2287 			sv_dev_t *, svp);
2288 	}
2289 
2290 	DTRACE_PROBE1(sv_dbg_bcopy_start,
2291 			sv_dev_t *, svp);
2292 
2293 	while (tocopy > 0) {
2294 		nbytes = min(tocopy, (nsc_size_t)v->sv_len);
2295 
2296 		if (bp->b_flags & B_READ)
2297 			bcopy(v->sv_addr, buf_addr, nbytes);
2298 		else
2299 			bcopy(buf_addr, v->sv_addr, nbytes);
2300 
2301 		bp->b_resid -= nbytes;
2302 		buf_addr += nbytes;
2303 		tocopy -= nbytes;
2304 		v++;
2305 	}
2306 
2307 	DTRACE_PROBE1(sv_dbg_bcopy_end,
2308 			sv_dev_t *, svp);
2309 
2310 	if ((bp->b_flags & B_READ) == 0) {
2311 		DTRACE_PROBE3(sv_dbg_write_start,
2312 			sv_dev_t *, svp,
2313 		    uint64_t, (uint64_t)hndl->sb_pos,
2314 		    uint64_t, (uint64_t)hndl->sb_len);
2315 
2316 		rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0);
2317 
2318 		DTRACE_PROBE1(sv_dbg_write_end,
2319 			sv_dev_t *, svp);
2320 
2321 		if (rc > 0) {
2322 			bioerror(bp, rc);
2323 			goto done;
2324 		}
2325 	}
2326 
2327 	/*
2328 	 * Adjust the FBA offset and the requested (i.e. remaining)
2329 	 * length; loop if there is more data to transfer.
2330 	 */
2331 
2332 	fba_off += fba_len;
2333 	fba_req -= fba_len;
2334 
2335 	if (fba_req > 0) {
2336 		DTRACE_PROBE1(sv_dbg_freeb_start,
2337 			sv_dev_t *, svp);
2338 
2339 		rc = nsc_free_buf(hndl);
2340 
2341 		DTRACE_PROBE1(sv_dbg_freeb_end,
2342 			sv_dev_t *, svp);
2343 
2344 		if (rc > 0) {
2345 			DTRACE_PROBE1(sv_lyr_strategy_err_free,
2346 				struct buf *, bp);
2347 			bioerror(bp, rc);
2348 		}
2349 
2350 		hndl = NULL;
2351 
2352 		if (rc <= 0)
2353 			goto loop;
2354 	}
2355 
2356 done:
2357 	if (hndl != NULL) {
2358 		DTRACE_PROBE1(sv_dbg_freeb_start,
2359 			sv_dev_t *, svp);
2360 
2361 		rc = nsc_free_buf(hndl);
2362 
2363 		DTRACE_PROBE1(sv_dbg_freeb_end,
2364 			sv_dev_t *, svp);
2365 
2366 		if (rc > 0) {
2367 			DTRACE_PROBE1(sv_lyr_strategy_err_free,
2368 				struct buf *, bp);
2369 			bioerror(bp, rc);
2370 		}
2371 
2372 		hndl = NULL;
2373 	}
2374 
2375 	if (bufh)
2376 		(void) nsc_free_handle(bufh);
2377 
2378 	DTRACE_PROBE1(sv_dbg_rlse_start,
2379 			sv_dev_t *, svp);
2380 
2381 	nsc_release(svp->sv_fd);
2382 
2383 	DTRACE_PROBE1(sv_dbg_rlse_end,
2384 			sv_dev_t *, svp);
2385 
2386 out:
2387 	if (sv_debug > 5) {
2388 		cmn_err(CE_CONT,
2389 		    "_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n",
2390 		    (void *)bp, (void *)bufh, bp->b_error);
2391 	}
2392 
2393 	DTRACE_PROBE2(sv_lyr_strategy_end,
2394 				struct buf *, bp,
2395 				int, bp->b_error);
2396 
2397 	rw_exit(&svp->sv_lock);
2398 	biodone(bp);
2399 }
2400 
2401 
2402 static void
2403 sv_async_strategy(blind_t arg)
2404 {
2405 	struct buf *bp = (struct buf *)arg;
2406 	_sv_lyr_strategy(bp);
2407 }
2408 
2409 
2410 static int
2411 sv_lyr_strategy(struct buf *bp)
2412 {
2413 	nsthread_t *tp;
2414 	int nlive;
2415 
2416 	/*
2417 	 * If B_ASYNC were part of the DDI, we could use it as a hint
2418 	 * not to create a thread for synchronous i/o.
2419 	 */
2420 	if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
2421 		/* not sv enabled - just pass through */
2422 		DTRACE_PROBE1(sv_lyr_strategy_notsv,
2423 				struct buf *, bp);
2424 		_sv_lyr_strategy(bp);
2425 		return (0);
2426 	}
2427 
2428 	if (sv_debug > 4) {
2429 		cmn_err(CE_CONT, "sv_lyr_strategy: nthread %d nlive %d\n",
2430 		    nst_nthread(sv_tset), nst_nlive(sv_tset));
2431 	}
2432 
2433 	/*
2434 	 * If only guard devices are enabled, there won't be a
2435 	 * threadset, so don't try to use it.
2436 	 */
2437 	tp = NULL;
2438 	if (sv_tset != NULL) {
2439 		tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
2440 	}
2441 
2442 	if (tp == NULL) {
2443 		/*
2444 		 * Out of threads, so fall back to synchronous i/o.
2445 		 */
2446 		if (sv_debug > 0) {
2447 			cmn_err(CE_CONT,
2448 			    "sv_lyr_strategy: thread alloc failed\n");
2449 		}
2450 
2451 		DTRACE_PROBE1(sv_lyr_strategy_no_thread,
2452 				struct buf *, bp);
2453 
2454 		_sv_lyr_strategy(bp);
2455 		sv_no_threads++;
2456 	} else {
2457 		nlive = nst_nlive(sv_tset);
2458 		if (nlive > sv_max_nlive) {
2459 			if (sv_debug > 0) {
2460 				cmn_err(CE_CONT,
2461 				    "sv_lyr_strategy: "
2462 				    "new max nlive %d (nthread %d)\n",
2463 				    nlive, nst_nthread(sv_tset));
2464 			}
2465 
2466 			sv_max_nlive = nlive;
2467 		}
2468 	}
2469 
2470 	return (0);
2471 }
2472 
2473 
2474 #ifndef offsetof
2475 #define	offsetof(s, m)	((size_t)(&((s *)0)->m))
2476 #endif
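
/*
 * Worked example (illustrative): for the ILP32 case below with
 * pnum == 2, the copyout offset of v_part[2].p_size is
 *
 *	offsetof(struct vtoc32, v_part)
 *	    + 2 * sizeof (struct partition32)
 *	    + offsetof(struct partition32, p_size)
 *
 * i.e. the byte offset of that single field within the user's vtoc
 * buffer, so only the partition size is rewritten in place.
 */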
2477 
2478 /*
2479  * Re-write the size of the current partition.
2480  */
2481 static int
2482 sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp)
2483 {
2484 	size_t offset;
2485 	int ilp32;
2486 	int pnum;
2487 	int rc;
2488 
2489 	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
2490 
2491 	rc = nskern_partition(svp->sv_dev, &pnum);
2492 	if (rc != 0) {
2493 		return (rc);
2494 	}
2495 
2496 	if (pnum < 0 || pnum >= V_NUMPAR) {
2497 		cmn_err(CE_WARN,
2498 		    "sv_gvtoc: unable to determine partition number "
2499 		    "for dev %lx", svp->sv_dev);
2500 		return (EINVAL);
2501 	}
2502 
2503 	if (ilp32) {
2504 		int32_t p_size;
2505 
2506 #ifdef _SunOS_5_6
2507 		offset = offsetof(struct vtoc, v_part);
2508 		offset += sizeof (struct partition) * pnum;
2509 		offset += offsetof(struct partition, p_size);
2510 #else
2511 		offset = offsetof(struct vtoc32, v_part);
2512 		offset += sizeof (struct partition32) * pnum;
2513 		offset += offsetof(struct partition32, p_size);
2514 #endif
2515 
2516 		p_size = (int32_t)svp->sv_nblocks;
2517 		if (p_size == 0) {
2518 			if (sv_reserve(svp->sv_fd,
2519 			    NSC_MULTI|NSC_PCATCH) == 0) {
2520 				p_size = (int32_t)svp->sv_nblocks;
2521 				nsc_release(svp->sv_fd);
2522 			} else {
2523 				rc = EINTR;
2524 			}
2525 		}
2526 
2527 		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2528 		    sizeof (p_size), mode) != 0) {
2529 			rc = EFAULT;
2530 		}
2531 	} else {
2532 		long p_size;
2533 
2534 		offset = offsetof(struct vtoc, v_part);
2535 		offset += sizeof (struct partition) * pnum;
2536 		offset += offsetof(struct partition, p_size);
2537 
2538 		p_size = (long)svp->sv_nblocks;
2539 		if (p_size == 0) {
2540 			if (sv_reserve(svp->sv_fd,
2541 			    NSC_MULTI|NSC_PCATCH) == 0) {
2542 				p_size = (long)svp->sv_nblocks;
2543 				nsc_release(svp->sv_fd);
2544 			} else {
2545 				rc = EINTR;
2546 			}
2547 		}
2548 
2549 		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2550 		    sizeof (p_size), mode) != 0) {
2551 			rc = EFAULT;
2552 		}
2553 	}
2554 
2555 	return (rc);
2556 }
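
/*
 * Illustrative user-land sketch (not compiled into the driver) of the
 * view that sv_fix_dkiocgvtoc() produces: after DKIOCGVTOC returns,
 * v_part[] reports the virtual partition size supplied by nsctl rather
 * than the physical one.  The device path and header locations are
 * assumptions.
 */
#if 0
#include <sys/types.h>
#include <sys/vtoc.h>
#include <sys/dkio.h>
#include <fcntl.h>
#include <unistd.h>
#include <stropts.h>
#include <stdio.h>

static int
print_part_size(const char *path, int pnum)
{
	struct vtoc vtoc;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return (-1);

	if (ioctl(fd, DKIOCGVTOC, &vtoc) < 0) {
		(void) close(fd);
		return (-1);
	}

	/* p_size is in disk blocks; sv rewrote this field on copyout */
	(void) printf("slice %d: %ld blocks\n", pnum,
	    (long)vtoc.v_part[pnum].p_size);

	(void) close(fd);
	return (0);
}
#endif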
2557 
2558 
2559 #ifdef DKIOCPARTITION
2560 /*
2561  * Re-write the size of the current partition.
2562  *
2563  * arg is dk_efi_t.
2564  *
2565  * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64;
2566  *
2567  * dk_efi_t->dki_data --> efi_gpt_t (label header)
2568  * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions)
2569  *
2570  * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts
2571  * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself
2572  *
2573  * This assumes that sizeof (efi_gpt_t) is the same as the size of a
2574  * logical block on the disk.
2575  *
2576  * Everything is little endian (i.e. disk format).
2577  */
2578 static int
2579 sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp)
2580 {
2581 	dk_efi_t efi;
2582 	efi_gpt_t gpt;
2583 	efi_gpe_t *gpe = NULL;
2584 	size_t sgpe;
2585 	uint64_t p_size;	/* virtual partition size from nsctl */
2586 	uint32_t crc;
2587 	int unparts;		/* number of parts in user's array */
2588 	int pnum;
2589 	int rc;
2590 
2591 	rc = nskern_partition(svp->sv_dev, &pnum);
2592 	if (rc != 0) {
2593 		return (rc);
2594 	}
2595 
2596 	if (pnum < 0) {
2597 		cmn_err(CE_WARN,
2598 		    "sv_efi: unable to determine partition number for dev %lx",
2599 		    svp->sv_dev);
2600 		return (EINVAL);
2601 	}
2602 
2603 	if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) {
2604 		return (EFAULT);
2605 	}
2606 
2607 	efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
2608 
2609 	if (efi.dki_length < sizeof (gpt) + sizeof (*gpe)) {
2610 		return (EINVAL);
2611 	}
2612 
2613 	if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) {
2614 		rc = EFAULT;
2615 		goto out;
2616 	}
2617 
2618 	if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0)
2619 		unparts = 1;
2620 	else if (pnum >= unparts) {
2621 		cmn_err(CE_WARN,
2622 		    "sv_efi: partition# beyond end of user array (%d >= %d)",
2623 		    pnum, unparts);
2624 		return (EINVAL);
2625 	}
2626 
2627 	sgpe = sizeof (*gpe) * unparts;
2628 	gpe = kmem_alloc(sgpe, KM_SLEEP);
2629 
2630 	if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) {
2631 		rc = EFAULT;
2632 		goto out;
2633 	}
2634 
2635 	p_size = svp->sv_nblocks;
2636 	if (p_size == 0) {
2637 		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2638 			p_size = (diskaddr_t)svp->sv_nblocks;
2639 			nsc_release(svp->sv_fd);
2640 		} else {
2641 			rc = EINTR;
2642 		}
2643 	}
2644 
2645 	gpe[pnum].efi_gpe_EndingLBA = LE_64(
2646 	    LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1);
2647 
2648 	gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
2649 	CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
2650 	gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
2651 
2652 	gpt.efi_gpt_HeaderCRC32 = 0;
2653 	CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
2654 	gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
2655 
2656 	if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) {
2657 		rc = EFAULT;
2658 		goto out;
2659 	}
2660 
2661 	if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) {
2662 		rc = EFAULT;
2663 		goto out;
2664 	}
2665 
2666 out:
2667 	if (gpe) {
2668 		kmem_free(gpe, sgpe);
2669 	}
2670 
2671 	return (rc);
2672 }
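
/*
 * Illustrative user-land sketch (not compiled into the driver) of the
 * dk_efi_t buffer layout edited above: one efi_gpt_t header followed
 * immediately by the efi_gpe_t array.  The entry count and header
 * locations are assumptions.
 */
#if 0
#include <sys/types.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <stropts.h>
#include <stdlib.h>

#define	N_GPES	9	/* hypothetical number of entries */

static int
get_label(int fd, efi_gpt_t **gptp, efi_gpe_t **gpep)
{
	size_t len = sizeof (efi_gpt_t) + N_GPES * sizeof (efi_gpe_t);
	efi_gpt_t *gpt = malloc(len);
	dk_efi_t efi;

	if (gpt == NULL)
		return (-1);

	efi.dki_lba = 1;			/* GPT header LBA */
	efi.dki_length = len;
	efi.dki_data_64 = (uint64_t)(uintptr_t)gpt;

	if (ioctl(fd, DKIOCGETEFI, &efi) < 0) {
		free(gpt);
		return (-1);
	}

	*gptp = gpt;
	*gpep = (efi_gpe_t *)(gpt + 1);	/* dki_data + 1, as above */
	return (0);
}
#endif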
2673 
2674 
2675 /*
2676  * Re-write the size of the partition specified by p_partno
2677  *
2678  * Note that if a DKIOCPARTITION is issued to an fd opened against a
2679  * non-sv'd device, but p_partno requests the size for a different
2680  * device that is sv'd, this function will *not* be called as sv is
2681  * not interposed on the original device (the fd).
2682  *
2683  * It would not be easy to change this: we cannot get the partition
2684  * number for the non-sv'd device, so we cannot compute the dev_t of
2685  * the (sv'd) p_partno device, and therefore cannot tell whether it is
2686  * sv'd or obtain its size from nsctl.
2687  *
2688  * See also the "Bug 4755783" comment in sv_lyr_ioctl().
2689  */
2690 static int
2691 sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp)
2692 {
2693 	struct partition64 p64;
2694 	sv_dev_t *nsvp = NULL;
2695 	diskaddr_t p_size;
2696 	minor_t nminor;
2697 	int pnum, rc;
2698 	dev_t ndev;
2699 
2700 	rc = nskern_partition(svp->sv_dev, &pnum);
2701 	if (rc != 0) {
2702 		return (rc);
2703 	}
2704 
2705 	if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) {
2706 		return (EFAULT);
2707 	}
2708 
2709 	if (p64.p_partno != pnum) {
2710 		/* switch to requested partition, not the current one */
2711 		nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
2712 		ndev = makedevice(getmajor(svp->sv_dev), nminor);
2713 		nsvp = sv_find_enabled(ndev, NULL);
2714 		if (nsvp == NULL) {
2715 			/* not sv device - just return */
2716 			return (0);
2717 		}
2718 
2719 		svp = nsvp;
2720 	}
2721 
2722 	p_size = svp->sv_nblocks;
2723 	if (p_size == 0) {
2724 		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2725 			p_size = (diskaddr_t)svp->sv_nblocks;
2726 			nsc_release(svp->sv_fd);
2727 		} else {
2728 			rc = EINTR;
2729 		}
2730 	}
2731 
2732 	if (nsvp != NULL) {
2733 		rw_exit(&nsvp->sv_lock);
2734 	}
2735 
2736 	if ((rc == 0) && ddi_copyout(&p_size,
2737 	    (void *)(arg + offsetof(struct partition64, p_size)),
2738 	    sizeof (p_size), mode) != 0) {
2739 		return (EFAULT);
2740 	}
2741 
2742 	return (rc);
2743 }
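
/*
 * Illustrative user-land sketch (not compiled into the driver) of the
 * DKIOCPARTITION call handled above: the caller selects any partition
 * via p_partno and reads back the (possibly sv-adjusted) p_size.  The
 * header locations are assumptions.
 */
#if 0
#include <sys/types.h>
#include <sys/vtoc.h>
#include <sys/dkio.h>
#include <fcntl.h>
#include <unistd.h>
#include <stropts.h>
#include <string.h>

static int
efi_part_size(const char *path, uint_t partno, diskaddr_t *sizep)
{
	struct partition64 p64;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return (-1);

	(void) memset(&p64, 0, sizeof (p64));
	p64.p_partno = partno;	/* may differ from the fd's partition */

	if (ioctl(fd, DKIOCPARTITION, &p64) < 0) {
		(void) close(fd);
		return (-1);
	}

	*sizep = p64.p_size;	/* rewritten by sv_fix_dkiocpartition() */
	(void) close(fd);
	return (0);
}
#endif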
2744 #endif /* DKIOCPARTITION */
2745 
2746 
2747 static int
2748 sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg,
2749     const int mode, cred_t *crp, int *rvalp)
2750 {
2751 	sv_dev_t *svp;
2752 	sv_maj_t *maj;
2753 	int (*fn)();
2754 	int rc = 0;
2755 
2756 	maj = 0;
2757 	fn = 0;
2758 
2759 	/*
2760 	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, continue as normal.
2761 	 * Otherwise it was previously SV_PREVENT_UNLOAD and is now
2762 	 * SV_ALLOW_UNLOAD, meaning the driver is expected to unload soon.
2763 	 *
2764 	 * SV_ALLOW_UNLOAD is a final state, so there is no need to grab sv_mutex.
2765 	 */
2766 	if (sv_mod_status == SV_ALLOW_UNLOAD) {
2767 		return (EBUSY);
2768 	}
2769 
2770 	svp = sv_find_enabled(dev, &maj);
2771 	if (svp != NULL) {
2772 		if (nskernd_isdaemon()) {
2773 			/*
2774 			 * This is nskernd which always needs to see
2775 			 * the underlying disk device accurately.
2776 			 *
2777 			 * So just pass the ioctl straight through
2778 			 * to the underlying driver as though the device
2779 			 * was not sv enabled.
2780 			 */
2781 			DTRACE_PROBE2(sv_lyr_ioctl_nskernd,
2782 					sv_dev_t *, svp,
2783 					dev_t, dev);
2784 
2785 			rw_exit(&svp->sv_lock);
2786 			svp = NULL;
2787 		} else {
2788 			ASSERT(RW_READ_HELD(&svp->sv_lock));
2789 		}
2790 	}
2791 
2792 	/*
2793 	 * We now have a locked and enabled SV device, or a non-SV device.
2794 	 */
2795 
2796 	switch (cmd) {
2797 		/*
2798 		 * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI
2799 		 * and DKIOCSETEFI are intercepted and faked up, as some
2800 		 * i/o providers emulate volumes of a different size from
2801 		 * the underlying volume.
2802 		 *
2803 		 * Setting the size by rewriting the vtoc is not permitted.
2804 		 */
2805 
2806 	case DKIOCSVTOC:
2807 #ifdef DKIOCPARTITION
2808 	case DKIOCSETEFI:
2809 #endif
2810 		if (svp == NULL) {
2811 			/* not intercepted -- allow ioctl through */
2812 			break;
2813 		}
2814 
2815 		rw_exit(&svp->sv_lock);
2816 
2817 		DTRACE_PROBE2(sv_lyr_ioctl_svtoc,
2818 				dev_t, dev,
2819 				int, EPERM);
2820 
2821 		return (EPERM);
2822 
2823 	default:
2824 		break;
2825 	}
2826 
2827 	/*
2828 	 * Pass through the real ioctl command.
2829 	 */
2830 
2831 	if (maj && (fn = maj->sm_ioctl) != 0) {
2832 		if (!(maj->sm_flag & D_MP)) {
2833 			UNSAFE_ENTER();
2834 			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2835 			UNSAFE_EXIT();
2836 		} else {
2837 			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2838 		}
2839 	} else {
2840 		rc = ENODEV;
2841 	}
2842 
2843 	/*
2844 	 * Bug 4755783
2845 	 * Fix up the size of the current partition to allow
2846 	 * for the virtual volume to be a different size to the
2847 	 * physical volume (e.g. for II compact dependent shadows).
2848 	 *
2849 	 * Note that this only attempts to fix up the current partition
2850 	 * - the one that the ioctl was issued against.  There could be
2851 	 * other sv'd partitions in the same vtoc, but we cannot tell
2852 	 * so we don't attempt to fix them up.
2853 	 */
2854 
2855 	if (svp != NULL && rc == 0) {
2856 		switch (cmd) {
2857 		case DKIOCGVTOC:
2858 			rc = sv_fix_dkiocgvtoc(arg, mode, svp);
2859 			break;
2860 
2861 #ifdef DKIOCPARTITION
2862 		case DKIOCGETEFI:
2863 			rc = sv_fix_dkiocgetefi(arg, mode, svp);
2864 			break;
2865 
2866 		case DKIOCPARTITION:
2867 			rc = sv_fix_dkiocpartition(arg, mode, svp);
2868 			break;
2869 #endif /* DKIOCPARTITION */
2870 		}
2871 	}
2872 
2873 	if (svp != NULL) {
2874 		rw_exit(&svp->sv_lock);
2875 	}
2876 
2877 	return (rc);
2878 }
2879