xref: /titanic_50/usr/src/uts/common/avs/ns/sv/sv.c (revision d2b5b2d357ee3172eacb6860be1891259902203d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Storage Volume Character and Block Driver (SV)
28  *
29  * This driver implements a simplistic /dev/{r}dsk/ interface to a
30  * specified disk volume that is otherwise managed by the Prism
31  * software.  The SV driver layers itself onto the underlying disk
32  * device driver by changing function pointers in the cb_ops
33  * structure.
34  *
35  * CONFIGURATION:
36  *
37  * 1. Configure the driver using the svadm utility.
38  * 2. Access the device as before through /dev/rdsk/c?t?d?s?
39  *
40  * LIMITATIONS:
41  *
42  * This driver should NOT be used to share a device between another
43  * DataServices user interface module (e.g., STE) and a user accessing
44  * the device through the block device in O_WRITE mode.  This is because
45  * writes through the block device are asynchronous (due to the page
46  * cache) and so consistency between the block device user and the
47  * STE user cannot be guaranteed.
48  *
49  * Data is copied between system struct buf(9s) and nsc_vec_t.  This is
50  * wasteful and slow.
51  */
52 
53 #include <sys/debug.h>
54 #include <sys/types.h>
55 
56 #include <sys/ksynch.h>
57 #include <sys/kmem.h>
58 #include <sys/errno.h>
59 #include <sys/varargs.h>
60 #include <sys/file.h>
61 #include <sys/open.h>
62 #include <sys/conf.h>
63 #include <sys/cred.h>
64 #include <sys/buf.h>
65 #include <sys/uio.h>
66 #ifndef DS_DDICT
67 #include <sys/pathname.h>
68 #endif
69 #include <sys/aio_req.h>
70 #include <sys/dkio.h>
71 #include <sys/vtoc.h>
72 #include <sys/cmn_err.h>
73 #include <sys/modctl.h>
74 #include <sys/ddi.h>
75 #include <sys/sunddi.h>
76 #include <sys/sunldi.h>
77 #include <sys/nsctl/nsvers.h>
78 
79 #include <sys/nsc_thread.h>
80 #include <sys/unistat/spcs_s.h>
81 #include <sys/unistat/spcs_s_k.h>
82 #include <sys/unistat/spcs_errors.h>
83 
84 #ifdef DS_DDICT
85 #include "../contract.h"
86 #endif
87 
88 #include "../nsctl.h"
89 
90 
91 #include <sys/sdt.h>		/* dtrace is S10 or later */
92 
93 #include "sv.h"
94 #include "sv_impl.h"
95 #include "sv_efi.h"
96 
97 #define	MAX_EINTR_COUNT 1000
98 
99 /*
100  * sv_mod_status
101  */
102 #define	SV_PREVENT_UNLOAD 1
103 #define	SV_ALLOW_UNLOAD	2
104 
105 static const int sv_major_rev = ISS_VERSION_MAJ;	/* Major number */
106 static const int sv_minor_rev = ISS_VERSION_MIN;	/* Minor number */
107 static const int sv_micro_rev = ISS_VERSION_MIC;	/* Micro number */
108 static const int sv_baseline_rev = ISS_VERSION_NUM;	/* Baseline number */
109 
110 #ifdef DKIOCPARTITION
111 /*
112  * CRC32 polynomial table needed for computing the checksums
113  * in an EFI vtoc.
114  */
115 static const uint32_t sv_crc32_table[256] = { CRC32_TABLE };
116 #endif
117 
118 static clock_t sv_config_time;		/* Time of successful {en,dis}able */
119 static int sv_debug;			/* Set non-zero for debug to syslog */
120 static int sv_mod_status;		/* Set to prevent modunload */
121 
122 static dev_info_t *sv_dip;		/* Single DIP for driver */
123 static kmutex_t sv_mutex;		/* Protect global lists, etc. */
124 
125 static nsc_mem_t	*sv_mem;	/* nsctl memory allocator token */
126 
127 
128 /*
129  * Per device and per major state.
130  */
131 
/*
 * UNSAFE_ENTER()/UNSAFE_EXIT() bracket calls into an underlying driver
 * that did not advertise D_MP (see their use in sv_lyr_open()).
 * When _SunOS_5_6 is defined they acquire and release the unsafe_driver
 * mutex; otherwise they expand to nothing.
 */
#ifndef _SunOS_5_6
#define	UNSAFE_ENTER()
#define	UNSAFE_EXIT()
#else
#define	UNSAFE_ENTER()	mutex_enter(&unsafe_driver)
#define	UNSAFE_EXIT()	mutex_exit(&unsafe_driver)
#endif
139 
140 					/* hash table of major dev structures */
141 static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0};
142 static sv_dev_t *sv_devs;		/* array of per device structures */
143 static int sv_max_devices;		/* SV version of nsc_max_devices() */
144 static int sv_ndevices;			/* number of SV enabled devices */
145 
146 /*
147  * Threading.
148  */
149 
150 int sv_threads_max = 1024;		/* maximum # to dynamically alloc */
151 int sv_threads = 32;			/* # to pre-allocate (see sv.conf) */
152 int sv_threads_extra = 0;		/* addl # we would have alloc'ed */
153 
154 static nstset_t *sv_tset;		/* the threadset pointer */
155 
156 static int sv_threads_hysteresis = 4;	/* hysteresis for threadset resizing */
157 static int sv_threads_dev = 2;		/* # of threads to alloc per device */
158 static int sv_threads_inc = 8;		/* increment for changing the set */
159 static int sv_threads_needed;		/* number of threads needed */
160 static int sv_no_threads;		/* number of nsc_create errors */
161 static int sv_max_nlive;		/* max number of threads running */
162 
163 
164 
165 /*
166  * nsctl fd callbacks.
167  */
168 
169 static int svattach_fd(blind_t);
170 static int svdetach_fd(blind_t);
171 
/*
 * Callback table handed to nsc_open() by sv_enable(): each entry pairs a
 * callback name with the address of the sv function that implements it.
 * The all-zero entry presumably terminates the table (confirm against
 * the nsctl nsc_def_t conventions).
 */
static nsc_def_t sv_fd_def[] = {
	{ "Attach",	(uintptr_t)svattach_fd, },
	{ "Detach",	(uintptr_t)svdetach_fd, },
	{ 0, 0, }
};
177 
178 /*
179  * cb_ops functions.
180  */
181 
182 static int svopen(dev_t *, int, int, cred_t *);
183 static int svclose(dev_t, int, int, cred_t *);
184 static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *);
185 static int svprint(dev_t, char *);
186 
187 /*
188  * These next functions are layered into the underlying driver's devops.
189  */
190 
191 static int sv_lyr_open(dev_t *, int, int, cred_t *);
192 static int sv_lyr_close(dev_t, int, int, cred_t *);
193 static int sv_lyr_strategy(struct buf *);
194 static int sv_lyr_read(dev_t, struct uio *, cred_t *);
195 static int sv_lyr_write(dev_t, struct uio *, cred_t *);
196 static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *);
197 static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *);
198 static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
199 
/*
 * Character/block entry points for the sv control device itself.  Only
 * open, close, print and ioctl are implemented here; the sv_lyr_*
 * routines are not part of this table, they are patched into the
 * underlying driver's cb_ops by sv_enable().
 */
static struct cb_ops sv_cb_ops = {
	svopen,		/* open */
	svclose,	/* close */
	nulldev,	/* strategy */
	svprint,	/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	svioctl,	/* ioctl */
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* poll */
	ddi_prop_op,	/* prop_op */
	NULL,		/* NOT a stream */
	D_NEW | D_MP | D_64BIT,	/* driver compatibility flags */
	CB_REV,		/* cb_ops revision */
	nodev,		/* aread */
	nodev,		/* awrite */
};
220 
221 
222 /*
223  * dev_ops functions.
224  */
225 
226 static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
227 static int sv_attach(dev_info_t *, ddi_attach_cmd_t);
228 static int sv_detach(dev_info_t *, ddi_detach_cmd_t);
229 
/*
 * Device operations for the sv pseudo driver; referenced from modldrv.
 */
static struct dev_ops sv_ops = {
	DEVO_REV,	/* dev_ops revision */
	0,		/* reference count */
	sv_getinfo,	/* getinfo */
	nulldev,	/* identify */
	nulldev,	/* probe */
	sv_attach,	/* attach */
	sv_detach,	/* detach */
	nodev,		/* reset */
	&sv_cb_ops,	/* character/block entry points */
	(struct bus_ops *)0	/* no bus operations */
};
242 
243 /*
244  * Module linkage.
245  */
246 
247 extern struct mod_ops mod_driverops;
248 
/*
 * Driver linkage: identifies this module as a device driver, names it,
 * and points at its dev_ops.
 */
static struct modldrv modldrv = {
	&mod_driverops,		/* this module is a device driver */
	"nws:Storage Volume:" ISS_VERSION_STR,	/* description string */
	&sv_ops			/* driver dev_ops */
};
254 
/*
 * Module linkage passed to mod_install(), mod_remove() and mod_info()
 * by _init(), _fini() and _info() respectively.
 */
static struct modlinkage modlinkage = {
	MODREV_1,	/* module linkage revision */
	&modldrv,	/* the single linkage structure for this module */
	0		/* list terminator */
};
260 
261 
262 int
263 _init(void)
264 {
265 	int error;
266 
267 	mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL);
268 
269 	if ((error = mod_install(&modlinkage)) != 0) {
270 		mutex_destroy(&sv_mutex);
271 		return (error);
272 	}
273 
274 #ifdef DEBUG
275 	cmn_err(CE_CONT, "!sv (revision %d.%d.%d.%d, %s, %s)\n",
276 	    sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev,
277 	    ISS_VERSION_STR, BUILD_DATE_STR);
278 #else
279 	if (sv_micro_rev) {
280 		cmn_err(CE_CONT, "!sv (revision %d.%d.%d, %s, %s)\n",
281 		    sv_major_rev, sv_minor_rev, sv_micro_rev,
282 		    ISS_VERSION_STR, BUILD_DATE_STR);
283 	} else {
284 		cmn_err(CE_CONT, "!sv (revision %d.%d, %s, %s)\n",
285 		    sv_major_rev, sv_minor_rev,
286 		    ISS_VERSION_STR, BUILD_DATE_STR);
287 	}
288 #endif
289 
290 	return (error);
291 }
292 
293 
294 int
295 _fini(void)
296 {
297 	int error;
298 
299 	if ((error = mod_remove(&modlinkage)) != 0)
300 		return (error);
301 
302 	mutex_destroy(&sv_mutex);
303 
304 	return (error);
305 }
306 
307 
308 int
309 _info(struct modinfo *modinfop)
310 {
311 	return (mod_info(&modlinkage, modinfop));
312 }
313 
314 
315 /*
316  * Locking & State.
317  *
318  * sv_mutex protects config information - sv_maj_t and sv_dev_t lists;
319  * threadset creation and sizing; sv_ndevices.
320  *
321  * If we need to hold both sv_mutex and sv_lock, then the sv_mutex
322  * must be acquired first.
323  *
324  * sv_lock protects the sv_dev_t structure for an individual device.
325  *
326  * sv_olock protects the otyp/open members of the sv_dev_t.  If we need
327  * to hold both sv_lock and sv_olock, then the sv_lock must be acquired
328  * first.
329  *
330  * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple
331  * I/O operations to a device simultaneously, as above.
332  *
333  * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur
334  * with sv_lock write-locked must be done with (sv_state == SV_PENDING)
335  * and (sv_pending == curthread) so that any recursion through
336  * sv_lyr_open/sv_lyr_close can be detected.
337  */
338 
339 
340 static int
341 sv_init_devs(void)
342 {
343 	int i;
344 
345 	ASSERT(MUTEX_HELD(&sv_mutex));
346 
347 	if (sv_max_devices > 0)
348 		return (0);
349 
350 	sv_max_devices = nsc_max_devices();
351 
352 	if (sv_max_devices <= 0) {
353 		/* nsctl is not attached (nskernd not running) */
354 		if (sv_debug > 0)
355 			cmn_err(CE_CONT, "!sv: nsc_max_devices = 0\n");
356 		return (EAGAIN);
357 	}
358 
359 	sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)),
360 	    KM_NOSLEEP, sv_mem);
361 
362 	if (sv_devs == NULL) {
363 		cmn_err(CE_WARN, "!sv: could not allocate sv_devs array");
364 		return (ENOMEM);
365 	}
366 
367 	for (i = 0; i < sv_max_devices; i++) {
368 		mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL);
369 		rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL);
370 	}
371 
372 	if (sv_debug > 0)
373 		cmn_err(CE_CONT, "!sv: sv_init_devs successful\n");
374 
375 	return (0);
376 }
377 
378 
379 static int
380 sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
381 {
382 	int rc;
383 
384 	switch (cmd) {
385 
386 	case DDI_ATTACH:
387 		sv_dip = dip;
388 
389 		if (ddi_create_minor_node(dip, "sv", S_IFCHR,
390 		    0, DDI_PSEUDO, 0) != DDI_SUCCESS)
391 			goto failed;
392 
393 		mutex_enter(&sv_mutex);
394 
395 		sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0);
396 		if (sv_mem == NULL) {
397 			mutex_exit(&sv_mutex);
398 			goto failed;
399 		}
400 
401 		rc = sv_init_devs();
402 		if (rc != 0 && rc != EAGAIN) {
403 			mutex_exit(&sv_mutex);
404 			goto failed;
405 		}
406 
407 		mutex_exit(&sv_mutex);
408 
409 
410 		ddi_report_dev(dip);
411 
412 		sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
413 		    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
414 		    "sv_threads", sv_threads);
415 
416 		if (sv_debug > 0)
417 			cmn_err(CE_CONT, "!sv: sv_threads=%d\n", sv_threads);
418 
419 		if (sv_threads > sv_threads_max)
420 			sv_threads_max = sv_threads;
421 
422 		return (DDI_SUCCESS);
423 
424 	default:
425 		return (DDI_FAILURE);
426 	}
427 
428 failed:
429 	DTRACE_PROBE(sv_attach_failed);
430 	(void) sv_detach(dip, DDI_DETACH);
431 	return (DDI_FAILURE);
432 }
433 
434 
435 static int
436 sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
437 {
438 	sv_dev_t *svp;
439 	int i;
440 
441 	switch (cmd) {
442 
443 	case DDI_DETACH:
444 
445 		/*
446 		 * Check that everything is disabled.
447 		 */
448 
449 		mutex_enter(&sv_mutex);
450 
451 		if (sv_mod_status == SV_PREVENT_UNLOAD) {
452 			mutex_exit(&sv_mutex);
453 			DTRACE_PROBE(sv_detach_err_prevent);
454 			return (DDI_FAILURE);
455 		}
456 
457 		for (i = 0; sv_devs && i < sv_max_devices; i++) {
458 			svp = &sv_devs[i];
459 
460 			if (svp->sv_state != SV_DISABLE) {
461 				mutex_exit(&sv_mutex);
462 				DTRACE_PROBE(sv_detach_err_busy);
463 				return (DDI_FAILURE);
464 			}
465 		}
466 
467 
468 		for (i = 0; sv_devs && i < sv_max_devices; i++) {
469 			mutex_destroy(&sv_devs[i].sv_olock);
470 			rw_destroy(&sv_devs[i].sv_lock);
471 		}
472 
473 		if (sv_devs) {
474 			nsc_kmem_free(sv_devs,
475 			    (sv_max_devices * sizeof (*sv_devs)));
476 			sv_devs = NULL;
477 		}
478 		sv_max_devices = 0;
479 
480 		if (sv_mem) {
481 			nsc_unregister_mem(sv_mem);
482 			sv_mem = NULL;
483 		}
484 
485 		mutex_exit(&sv_mutex);
486 
487 		/*
488 		 * Remove all minor nodes.
489 		 */
490 
491 		ddi_remove_minor_node(dip, NULL);
492 		sv_dip = NULL;
493 
494 		return (DDI_SUCCESS);
495 
496 	default:
497 		return (DDI_FAILURE);
498 	}
499 }
500 
501 static sv_maj_t *
502 sv_getmajor(const dev_t dev)
503 {
504 	sv_maj_t **insert, *maj;
505 	major_t umaj = getmajor(dev);
506 
507 	/*
508 	 * See if the hash table entry, or one of the hash chains
509 	 * is already allocated for this major number
510 	 */
511 	if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) {
512 		do {
513 			if (maj->sm_major == umaj)
514 				return (maj);
515 		} while ((maj = maj->sm_next) != 0);
516 	}
517 
518 	/*
519 	 * If the sv_mutex is held, there is design flaw, as the only non-mutex
520 	 * held callers can be sv_enable() or sv_dev_to_sv()
521 	 * Return an error, instead of panicing the system
522 	 */
523 	if (MUTEX_HELD(&sv_mutex)) {
524 		cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
525 		return (NULL);
526 	}
527 
528 	/*
529 	 * Determine where to allocate a new element in the hash table
530 	 */
531 	mutex_enter(&sv_mutex);
532 	insert = &(sv_majors[SV_MAJOR_HASH(umaj)]);
533 	for (maj = *insert; maj; maj = maj->sm_next) {
534 
535 		/* Did another thread beat us to it? */
536 		if (maj->sm_major == umaj)
537 			return (maj);
538 
539 		/* Find a NULL insert point? */
540 		if (maj->sm_next == NULL)
541 			insert = &maj->sm_next;
542 	}
543 
544 	/*
545 	 * Located the new insert point
546 	 */
547 	*insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem);
548 	if ((maj = *insert) != 0)
549 		maj->sm_major = umaj;
550 	else
551 		cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
552 
553 	mutex_exit(&sv_mutex);
554 
555 	return (maj);
556 }
557 
558 /* ARGSUSED */
559 
560 static int
561 sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
562 {
563 	int rc = DDI_FAILURE;
564 
565 	switch (infocmd) {
566 
567 	case DDI_INFO_DEVT2DEVINFO:
568 		*result = sv_dip;
569 		rc = DDI_SUCCESS;
570 		break;
571 
572 	case DDI_INFO_DEVT2INSTANCE:
573 		/*
574 		 * We only have a single instance.
575 		 */
576 		*result = 0;
577 		rc = DDI_SUCCESS;
578 		break;
579 
580 	default:
581 		break;
582 	}
583 
584 	return (rc);
585 }
586 
587 
588 /*
589  * Hashing of devices onto major device structures.
590  *
591  * Individual device structures are hashed onto one of the sm_hash[]
592  * buckets in the relevant major device structure.
593  *
594  * Hash insertion and deletion -must- be done with sv_mutex held.  Hash
595  * searching does not require the mutex because of the sm_seq member.
596  * sm_seq is incremented on each insertion (-after- hash chain pointer
597  * manipulation) and each deletion (-before- hash chain pointer
598  * manipulation).  When searching the hash chain, the seq number is
599  * checked before accessing each device structure, if the seq number has
600  * changed, then we restart the search from the top of the hash chain.
601  * If we restart more than SV_HASH_RETRY times, we take sv_mutex and search
602  * the hash chain (we are guaranteed that this search cannot be
603  * interrupted).
604  */
605 
606 #define	SV_HASH_RETRY	16
607 
608 static sv_dev_t *
609 sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp)
610 {
611 	minor_t umin = getminor(dev);
612 	sv_dev_t **hb, *next, *svp;
613 	sv_maj_t *maj;
614 	int seq;
615 	int try;
616 
617 	/* Get major hash table */
618 	maj = sv_getmajor(dev);
619 	if (majpp)
620 		*majpp = maj;
621 	if (maj == NULL)
622 		return (NULL);
623 
624 	if (maj->sm_inuse == 0) {
625 		DTRACE_PROBE1(
626 		    sv_dev_to_sv_end,
627 		    dev_t, dev);
628 		return (NULL);
629 	}
630 
631 	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
632 	try = 0;
633 
634 retry:
635 	if (try > SV_HASH_RETRY)
636 		mutex_enter(&sv_mutex);
637 
638 	seq = maj->sm_seq;
639 	for (svp = *hb; svp; svp = next) {
640 		next = svp->sv_hash;
641 
642 		nsc_membar_stld();	/* preserve register load order */
643 
644 		if (maj->sm_seq != seq) {
645 			DTRACE_PROBE1(sv_dev_to_sv_retry, dev_t, dev);
646 			try++;
647 			goto retry;
648 		}
649 
650 		if (svp->sv_dev == dev)
651 			break;
652 	}
653 
654 	if (try > SV_HASH_RETRY)
655 		mutex_exit(&sv_mutex);
656 
657 	return (svp);
658 }
659 
660 
661 /*
662  * Must be called with sv_mutex held.
663  */
664 
665 static int
666 sv_get_state(const dev_t udev, sv_dev_t **svpp)
667 {
668 	sv_dev_t **hb, **insert, *svp;
669 	sv_maj_t *maj;
670 	minor_t umin;
671 	int i;
672 
673 	/* Get major hash table */
674 	if ((maj = sv_getmajor(udev)) == NULL)
675 		return (NULL);
676 
677 	/* Determine which minor hash table */
678 	umin = getminor(udev);
679 	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
680 
681 	/* look for clash */
682 
683 	insert = hb;
684 
685 	for (svp = *hb; svp; svp = svp->sv_hash) {
686 		if (svp->sv_dev == udev)
687 			break;
688 
689 		if (svp->sv_hash == NULL)
690 			insert = &svp->sv_hash;
691 	}
692 
693 	if (svp) {
694 		DTRACE_PROBE1(
695 		    sv_get_state_enabled,
696 		    dev_t, udev);
697 		return (SV_EENABLED);
698 	}
699 
700 	/* look for spare sv_devs slot */
701 
702 	for (i = 0; i < sv_max_devices; i++) {
703 		svp = &sv_devs[i];
704 
705 		if (svp->sv_state == SV_DISABLE)
706 			break;
707 	}
708 
709 	if (i >= sv_max_devices) {
710 		DTRACE_PROBE1(
711 		    sv_get_state_noslots,
712 		    dev_t, udev);
713 		return (SV_ENOSLOTS);
714 	}
715 
716 	svp->sv_state = SV_PENDING;
717 	svp->sv_pending = curthread;
718 
719 	*insert = svp;
720 	svp->sv_hash = NULL;
721 	maj->sm_seq++;		/* must be after the store to the hash chain */
722 
723 	*svpp = svp;
724 
725 	/*
726 	 * We do not know the size of the underlying device at
727 	 * this stage, so initialise "nblocks" property to
728 	 * zero, and update it whenever we succeed in
729 	 * nsc_reserve'ing the underlying nsc_fd_t.
730 	 */
731 
732 	svp->sv_nblocks = 0;
733 
734 	return (0);
735 }
736 
737 
738 /*
739  * Remove a device structure from it's hash chain.
740  * Must be called with sv_mutex held.
741  */
742 
743 static void
744 sv_rm_hash(sv_dev_t *svp)
745 {
746 	sv_dev_t **svpp;
747 	sv_maj_t *maj;
748 
749 	/* Get major hash table */
750 	if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
751 		return;
752 
753 	/* remove svp from hash chain */
754 
755 	svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]);
756 	while (*svpp) {
757 		if (*svpp == svp) {
758 			/*
759 			 * increment of sm_seq must be before the
760 			 * removal from the hash chain
761 			 */
762 			maj->sm_seq++;
763 			*svpp = svp->sv_hash;
764 			break;
765 		}
766 
767 		svpp = &(*svpp)->sv_hash;
768 	}
769 
770 	svp->sv_hash = NULL;
771 }
772 
773 /*
774  * Free (disable) a device structure.
775  * Must be called with sv_lock(RW_WRITER) and sv_mutex held, and will
776  * perform the exits during its processing.
777  */
778 
779 static int
780 sv_free(sv_dev_t *svp, const int error)
781 {
782 	struct cb_ops *cb_ops;
783 	sv_maj_t *maj;
784 
785 	/* Get major hash table */
786 	if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
787 		return (NULL);
788 
789 	svp->sv_state = SV_PENDING;
790 	svp->sv_pending = curthread;
791 
792 	/*
793 	 * Close the fd's before removing from the hash or swapping
794 	 * back the cb_ops pointers so that the cache flushes before new
795 	 * io can come in.
796 	 */
797 
798 	if (svp->sv_fd) {
799 		(void) nsc_close(svp->sv_fd);
800 		svp->sv_fd = 0;
801 	}
802 
803 	sv_rm_hash(svp);
804 
805 	if (error != SV_ESDOPEN &&
806 	    error != SV_ELYROPEN && --maj->sm_inuse == 0) {
807 
808 		if (maj->sm_dev_ops)
809 			cb_ops = maj->sm_dev_ops->devo_cb_ops;
810 		else
811 			cb_ops = NULL;
812 
813 		if (cb_ops && maj->sm_strategy != NULL) {
814 			cb_ops->cb_strategy = maj->sm_strategy;
815 			cb_ops->cb_close = maj->sm_close;
816 			cb_ops->cb_ioctl = maj->sm_ioctl;
817 			cb_ops->cb_write = maj->sm_write;
818 			cb_ops->cb_open = maj->sm_open;
819 			cb_ops->cb_read = maj->sm_read;
820 			cb_ops->cb_flag = maj->sm_flag;
821 
822 			if (maj->sm_awrite)
823 				cb_ops->cb_awrite = maj->sm_awrite;
824 
825 			if (maj->sm_aread)
826 				cb_ops->cb_aread = maj->sm_aread;
827 
828 			/*
829 			 * corbin XXX
830 			 * Leave backing device ops in maj->sm_*
831 			 * to handle any requests that might come
832 			 * in during the disable.  This could be
833 			 * a problem however if the backing device
834 			 * driver is changed while we process these
835 			 * requests.
836 			 *
837 			 * maj->sm_strategy = 0;
838 			 * maj->sm_awrite = 0;
839 			 * maj->sm_write = 0;
840 			 * maj->sm_ioctl = 0;
841 			 * maj->sm_close = 0;
842 			 * maj->sm_aread = 0;
843 			 * maj->sm_read = 0;
844 			 * maj->sm_open = 0;
845 			 * maj->sm_flag = 0;
846 			 *
847 			 */
848 		}
849 
850 		if (maj->sm_dev_ops) {
851 			maj->sm_dev_ops = 0;
852 		}
853 	}
854 
855 	if (svp->sv_lh) {
856 		cred_t *crp = ddi_get_cred();
857 
858 		/*
859 		 * Close the protective layered driver open using the
860 		 * Sun Private layered driver i/f.
861 		 */
862 
863 		(void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp);
864 		svp->sv_lh = NULL;
865 	}
866 
867 	svp->sv_timestamp = nsc_lbolt();
868 	svp->sv_state = SV_DISABLE;
869 	svp->sv_pending = NULL;
870 	rw_exit(&svp->sv_lock);
871 	mutex_exit(&sv_mutex);
872 
873 	return (error);
874 }
875 
876 /*
877  * Reserve the device, taking into account the possibility that
878  * the reserve might have to be retried.
879  */
880 static int
881 sv_reserve(nsc_fd_t *fd, int flags)
882 {
883 	int eintr_count;
884 	int rc;
885 
886 	eintr_count = 0;
887 	do {
888 		rc = nsc_reserve(fd, flags);
889 		if (rc == EINTR) {
890 			++eintr_count;
891 			delay(2);
892 		}
893 	} while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT));
894 
895 	return (rc);
896 }
897 
898 static int
899 sv_enable(const caddr_t path, const int flag,
900     const dev_t udev, spcs_s_info_t kstatus)
901 {
902 	struct dev_ops *dev_ops;
903 	struct cb_ops *cb_ops;
904 	sv_dev_t *svp;
905 	sv_maj_t *maj;
906 	nsc_size_t nblocks;
907 	int rc;
908 	cred_t *crp;
909 	ldi_ident_t	li;
910 
911 	if (udev == (dev_t)-1 || udev == 0) {
912 		DTRACE_PROBE1(
913 		    sv_enable_err_baddev,
914 		    dev_t, udev);
915 		return (SV_EBADDEV);
916 	}
917 
918 	if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) {
919 		DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev);
920 		return (SV_EAMODE);
921 	}
922 
923 	/* Get major hash table */
924 	if ((maj = sv_getmajor(udev)) == NULL)
925 		return (SV_EBADDEV);
926 
927 	mutex_enter(&sv_mutex);
928 
929 	rc = sv_get_state(udev, &svp);
930 	if (rc) {
931 		mutex_exit(&sv_mutex);
932 		DTRACE_PROBE1(sv_enable_err_state, dev_t, udev);
933 		return (rc);
934 	}
935 
936 	rw_enter(&svp->sv_lock, RW_WRITER);
937 
938 	/*
939 	 * Get real fd used for io
940 	 */
941 
942 	svp->sv_dev = udev;
943 	svp->sv_flag = flag;
944 
945 	/*
946 	 * OR in NSC_DEVICE to ensure that nskern grabs the real strategy
947 	 * function pointer before sv swaps them out.
948 	 */
949 
950 	svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE),
951 	    sv_fd_def, (blind_t)udev, &rc);
952 
953 	if (svp->sv_fd == NULL) {
954 		if (kstatus)
955 			spcs_s_add(kstatus, rc);
956 		DTRACE_PROBE1(sv_enable_err_fd, dev_t, udev);
957 		return (sv_free(svp, SV_ESDOPEN));
958 	}
959 
960 	/*
961 	 * Perform a layered driver open using the Sun Private layered
962 	 * driver i/f to ensure that the cb_ops structure for the driver
963 	 * is not detached out from under us whilst sv is enabled.
964 	 *
965 	 */
966 
967 	crp = ddi_get_cred();
968 	svp->sv_lh = NULL;
969 
970 	if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) {
971 		rc = ldi_open_by_dev(&svp->sv_dev,
972 		    OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li);
973 	}
974 
975 	if (rc != 0) {
976 		if (kstatus)
977 			spcs_s_add(kstatus, rc);
978 		DTRACE_PROBE1(sv_enable_err_lyr_open, dev_t, udev);
979 		return (sv_free(svp, SV_ELYROPEN));
980 	}
981 
982 	/*
983 	 * Do layering if required - must happen after nsc_open().
984 	 */
985 
986 	if (maj->sm_inuse++ == 0) {
987 		maj->sm_dev_ops = nsc_get_devops(getmajor(udev));
988 
989 		if (maj->sm_dev_ops == NULL ||
990 		    maj->sm_dev_ops->devo_cb_ops == NULL) {
991 			DTRACE_PROBE1(sv_enable_err_load, dev_t, udev);
992 			return (sv_free(svp, SV_ELOAD));
993 		}
994 
995 		dev_ops = maj->sm_dev_ops;
996 		cb_ops = dev_ops->devo_cb_ops;
997 
998 		if (cb_ops->cb_strategy == NULL ||
999 		    cb_ops->cb_strategy == nodev ||
1000 		    cb_ops->cb_strategy == nulldev) {
1001 			DTRACE_PROBE1(sv_enable_err_nostrategy, dev_t, udev);
1002 			return (sv_free(svp, SV_ELOAD));
1003 		}
1004 
1005 		if (cb_ops->cb_strategy == sv_lyr_strategy) {
1006 			DTRACE_PROBE1(sv_enable_err_svstrategy, dev_t, udev);
1007 			return (sv_free(svp, SV_ESTRATEGY));
1008 		}
1009 
1010 		maj->sm_strategy = cb_ops->cb_strategy;
1011 		maj->sm_close = cb_ops->cb_close;
1012 		maj->sm_ioctl = cb_ops->cb_ioctl;
1013 		maj->sm_write = cb_ops->cb_write;
1014 		maj->sm_open = cb_ops->cb_open;
1015 		maj->sm_read = cb_ops->cb_read;
1016 		maj->sm_flag = cb_ops->cb_flag;
1017 
1018 		cb_ops->cb_flag = cb_ops->cb_flag | D_MP;
1019 		cb_ops->cb_strategy = sv_lyr_strategy;
1020 		cb_ops->cb_close = sv_lyr_close;
1021 		cb_ops->cb_ioctl = sv_lyr_ioctl;
1022 		cb_ops->cb_write = sv_lyr_write;
1023 		cb_ops->cb_open = sv_lyr_open;
1024 		cb_ops->cb_read = sv_lyr_read;
1025 
1026 		/*
1027 		 * Check that the driver has async I/O entry points
1028 		 * before changing them.
1029 		 */
1030 
1031 		if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) {
1032 			maj->sm_awrite = 0;
1033 			maj->sm_aread = 0;
1034 		} else {
1035 			maj->sm_awrite = cb_ops->cb_awrite;
1036 			maj->sm_aread = cb_ops->cb_aread;
1037 
1038 			cb_ops->cb_awrite = sv_lyr_awrite;
1039 			cb_ops->cb_aread = sv_lyr_aread;
1040 		}
1041 
1042 		/*
1043 		 * Bug 4645743
1044 		 *
1045 		 * Prevent sv from ever unloading after it has interposed
1046 		 * on a major device because there is a race between
1047 		 * sv removing its layered entry points from the target
1048 		 * dev_ops, a client coming in and accessing the driver,
1049 		 * and the kernel modunloading the sv text.
1050 		 *
1051 		 * To allow unload, do svboot -u, which only happens in
1052 		 * pkgrm time.
1053 		 */
1054 		ASSERT(MUTEX_HELD(&sv_mutex));
1055 		sv_mod_status = SV_PREVENT_UNLOAD;
1056 	}
1057 
1058 
1059 	svp->sv_timestamp = nsc_lbolt();
1060 	svp->sv_state = SV_ENABLE;
1061 	svp->sv_pending = NULL;
1062 	rw_exit(&svp->sv_lock);
1063 
1064 	sv_ndevices++;
1065 	mutex_exit(&sv_mutex);
1066 
1067 	nblocks = 0;
1068 	if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) {
1069 		nblocks = svp->sv_nblocks;
1070 		nsc_release(svp->sv_fd);
1071 	}
1072 
1073 	cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n",
1074 	    svp->sv_dev, nblocks);
1075 
1076 	return (0);
1077 }
1078 
1079 
1080 static int
1081 sv_prepare_unload()
1082 {
1083 	int rc = 0;
1084 
1085 	mutex_enter(&sv_mutex);
1086 
1087 	if (sv_mod_status == SV_PREVENT_UNLOAD) {
1088 		if ((sv_ndevices != 0) || (sv_tset != NULL)) {
1089 			rc = EBUSY;
1090 		} else {
1091 			sv_mod_status = SV_ALLOW_UNLOAD;
1092 			delay(SV_WAIT_UNLOAD * drv_usectohz(1000000));
1093 		}
1094 	}
1095 
1096 	mutex_exit(&sv_mutex);
1097 	return (rc);
1098 }
1099 
1100 static int
1101 svattach_fd(blind_t arg)
1102 {
1103 	dev_t dev = (dev_t)arg;
1104 	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1105 	int rc;
1106 
1107 	if (sv_debug > 0)
1108 		cmn_err(CE_CONT, "!svattach_fd(%p, %p)\n", arg, (void *)svp);
1109 
1110 	if (svp == NULL) {
1111 		cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg);
1112 		return (0);
1113 	}
1114 
1115 	if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) {
1116 		cmn_err(CE_WARN,
1117 		    "!svattach_fd: nsc_partsize() failed, rc %d", rc);
1118 		svp->sv_nblocks = 0;
1119 	}
1120 
1121 	if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) {
1122 		cmn_err(CE_WARN,
1123 		    "!svattach_fd: nsc_maxfbas() failed, rc %d", rc);
1124 		svp->sv_maxfbas = 0;
1125 	}
1126 
1127 	if (sv_debug > 0) {
1128 		cmn_err(CE_CONT,
1129 		    "!svattach_fd(%p): size %" NSC_SZFMT ", "
1130 		    "maxfbas %" NSC_SZFMT "\n",
1131 		    arg, svp->sv_nblocks, svp->sv_maxfbas);
1132 	}
1133 
1134 	return (0);
1135 }
1136 
1137 
1138 static int
1139 svdetach_fd(blind_t arg)
1140 {
1141 	dev_t dev = (dev_t)arg;
1142 	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1143 
1144 	if (sv_debug > 0)
1145 		cmn_err(CE_CONT, "!svdetach_fd(%p, %p)\n", arg, (void *)svp);
1146 
1147 	/* svp can be NULL during disable of an sv */
1148 	if (svp == NULL)
1149 		return (0);
1150 
1151 	svp->sv_maxfbas = 0;
1152 	svp->sv_nblocks = 0;
1153 	return (0);
1154 }
1155 
1156 
1157 /*
1158  * Side effect: if called with (guard != 0), then expects both sv_mutex
1159  * and sv_lock(RW_WRITER) to be held, and will release them before returning.
1160  */
1161 
1162 /* ARGSUSED */
1163 static int
1164 sv_disable(dev_t dev, spcs_s_info_t kstatus)
1165 {
1166 	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1167 
1168 	if (svp == NULL) {
1169 
1170 		DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp);
1171 		return (SV_ENODEV);
1172 	}
1173 
1174 	mutex_enter(&sv_mutex);
1175 	rw_enter(&svp->sv_lock, RW_WRITER);
1176 
1177 	if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) {
1178 		rw_exit(&svp->sv_lock);
1179 		mutex_exit(&sv_mutex);
1180 
1181 		DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp);
1182 		return (SV_EDISABLED);
1183 	}
1184 
1185 
1186 	sv_ndevices--;
1187 	return (sv_free(svp, 0));
1188 }
1189 
1190 
1191 
1192 static int
1193 sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp)
1194 {
1195 	nsc_buf_t *tmph;
1196 	sv_dev_t *svp;
1197 	sv_maj_t *maj;
1198 	int (*fn)();
1199 	dev_t odev;
1200 	int ret;
1201 	int rc;
1202 
1203 	svp = sv_dev_to_sv(*devp, &maj);
1204 
1205 	if (svp) {
1206 		if (svp->sv_state == SV_PENDING &&
1207 		    svp->sv_pending == curthread) {
1208 			/*
1209 			 * This is a recursive open from a call to
1210 			 * ddi_lyr_open_by_devt and so we just want
1211 			 * to pass it straight through to the
1212 			 * underlying driver.
1213 			 */
1214 			DTRACE_PROBE2(sv_lyr_open_recursive,
1215 			    sv_dev_t *, svp,
1216 			    dev_t, *devp);
1217 			svp = NULL;
1218 		} else
1219 			rw_enter(&svp->sv_lock, RW_READER);
1220 	}
1221 
1222 	odev = *devp;
1223 
1224 	if (maj && (fn = maj->sm_open) != 0) {
1225 		if (!(maj->sm_flag & D_MP)) {
1226 			UNSAFE_ENTER();
1227 			ret = (*fn)(devp, flag, otyp, crp);
1228 			UNSAFE_EXIT();
1229 		} else {
1230 			ret = (*fn)(devp, flag, otyp, crp);
1231 		}
1232 
1233 		if (ret == 0) {
1234 			/*
1235 			 * Re-acquire svp if the driver changed *devp.
1236 			 */
1237 
1238 			if (*devp != odev) {
1239 				rw_exit(&svp->sv_lock);
1240 
1241 				svp = sv_dev_to_sv(*devp, NULL);
1242 
1243 				if (svp) {
1244 					rw_enter(&svp->sv_lock, RW_READER);
1245 				}
1246 			}
1247 		}
1248 	} else {
1249 		ret = ENODEV;
1250 	}
1251 
1252 	if (svp && ret != 0 && svp->sv_state == SV_ENABLE) {
1253 		/*
1254 		 * Underlying DDI open failed, but we have this
1255 		 * device SV enabled.  If we can read some data
1256 		 * from the device, fake a successful open (this
1257 		 * probably means that this device is RDC'd and we
1258 		 * are getting the data from the secondary node).
1259 		 *
1260 		 * The reserve must be done with NSC_TRY|NSC_NOWAIT to
1261 		 * ensure that it does not deadlock if this open is
1262 		 * coming from nskernd:get_bsize().
1263 		 */
1264 		rc = sv_reserve(svp->sv_fd,
1265 		    NSC_TRY | NSC_NOWAIT | NSC_MULTI | NSC_PCATCH);
1266 		if (rc == 0) {
1267 			tmph = NULL;
1268 
1269 			rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph);
1270 			if (rc <= 0) {
1271 				/* success */
1272 				ret = 0;
1273 			}
1274 
1275 			if (tmph) {
1276 				(void) nsc_free_buf(tmph);
1277 				tmph = NULL;
1278 			}
1279 
1280 			nsc_release(svp->sv_fd);
1281 
1282 			/*
1283 			 * Count the number of layered opens that we
1284 			 * fake since we have to fake a matching number
1285 			 * of closes (OTYP_LYR open/close calls must be
1286 			 * paired).
1287 			 */
1288 
1289 			if (ret == 0 && otyp == OTYP_LYR) {
1290 				mutex_enter(&svp->sv_olock);
1291 				svp->sv_openlcnt++;
1292 				mutex_exit(&svp->sv_olock);
1293 			}
1294 		}
1295 	}
1296 
1297 	if (svp) {
1298 		rw_exit(&svp->sv_lock);
1299 	}
1300 
1301 	return (ret);
1302 }
1303 
1304 
1305 static int
1306 sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp)
1307 {
1308 	sv_dev_t *svp;
1309 	sv_maj_t *maj;
1310 	int (*fn)();
1311 	int ret;
1312 
1313 	svp = sv_dev_to_sv(dev, &maj);
1314 
1315 	if (svp &&
1316 	    svp->sv_state == SV_PENDING &&
1317 	    svp->sv_pending == curthread) {
1318 		/*
1319 		 * This is a recursive open from a call to
1320 		 * ddi_lyr_close and so we just want
1321 		 * to pass it straight through to the
1322 		 * underlying driver.
1323 		 */
1324 		DTRACE_PROBE2(sv_lyr_close_recursive, sv_dev_t *, svp,
1325 		    dev_t, dev);
1326 		svp = NULL;
1327 	}
1328 
1329 	if (svp) {
1330 		rw_enter(&svp->sv_lock, RW_READER);
1331 
1332 		if (otyp == OTYP_LYR) {
1333 			mutex_enter(&svp->sv_olock);
1334 
1335 			if (svp->sv_openlcnt) {
1336 				/*
1337 				 * Consume sufficient layered closes to
1338 				 * account for the opens that we faked
1339 				 * whilst the device was failed.
1340 				 */
1341 				svp->sv_openlcnt--;
1342 				mutex_exit(&svp->sv_olock);
1343 				rw_exit(&svp->sv_lock);
1344 
1345 				DTRACE_PROBE1(sv_lyr_close_end, dev_t, dev);
1346 
1347 				return (0);
1348 			}
1349 
1350 			mutex_exit(&svp->sv_olock);
1351 		}
1352 	}
1353 
1354 	if (maj && (fn = maj->sm_close) != 0) {
1355 		if (!(maj->sm_flag & D_MP)) {
1356 			UNSAFE_ENTER();
1357 			ret = (*fn)(dev, flag, otyp, crp);
1358 			UNSAFE_EXIT();
1359 		} else {
1360 			ret = (*fn)(dev, flag, otyp, crp);
1361 		}
1362 	} else {
1363 		ret = ENODEV;
1364 	}
1365 
1366 	if (svp) {
1367 		rw_exit(&svp->sv_lock);
1368 	}
1369 
1370 	return (ret);
1371 }
1372 
1373 
1374 /*
1375  * Convert the specified dev_t into a locked and enabled sv_dev_t, or
1376  * return NULL.
1377  */
1378 static sv_dev_t *
1379 sv_find_enabled(const dev_t dev, sv_maj_t **majpp)
1380 {
1381 	sv_dev_t *svp;
1382 
1383 	while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) {
1384 		rw_enter(&svp->sv_lock, RW_READER);
1385 
1386 		if (svp->sv_state == SV_ENABLE) {
1387 			/* locked and enabled */
1388 			break;
1389 		}
1390 
1391 		/*
1392 		 * State was changed while waiting on the lock.
1393 		 * Wait for a stable state.
1394 		 */
1395 		rw_exit(&svp->sv_lock);
1396 
1397 		DTRACE_PROBE1(sv_find_enabled_retry, dev_t, dev);
1398 
1399 		delay(2);
1400 	}
1401 
1402 	return (svp);
1403 }
1404 
1405 
1406 static int
1407 sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw)
1408 {
1409 	sv_dev_t *svp;
1410 	sv_maj_t *maj;
1411 	int (*fn)();
1412 	int rc;
1413 
1414 	svp = sv_find_enabled(dev, &maj);
1415 	if (svp == NULL) {
1416 		if (maj) {
1417 			if (rw == NSC_READ)
1418 				fn = maj->sm_read;
1419 			else
1420 				fn = maj->sm_write;
1421 
1422 			if (fn != 0) {
1423 				if (!(maj->sm_flag & D_MP)) {
1424 					UNSAFE_ENTER();
1425 					rc = (*fn)(dev, uiop, crp);
1426 					UNSAFE_EXIT();
1427 				} else {
1428 					rc = (*fn)(dev, uiop, crp);
1429 				}
1430 			}
1431 
1432 			return (rc);
1433 		} else {
1434 			return (ENODEV);
1435 		}
1436 	}
1437 
1438 	ASSERT(RW_READ_HELD(&svp->sv_lock));
1439 
1440 	if (svp->sv_flag == 0) {
1441 		/*
1442 		 * guard access mode
1443 		 * - prevent user level access to the device
1444 		 */
1445 		DTRACE_PROBE1(sv_lyr_uio_err_guard, uio_t *, uiop);
1446 		rc = EPERM;
1447 		goto out;
1448 	}
1449 
1450 	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
1451 		DTRACE_PROBE1(sv_lyr_uio_err_rsrv, uio_t *, uiop);
1452 		goto out;
1453 	}
1454 
1455 	if (rw == NSC_READ)
1456 		rc = nsc_uread(svp->sv_fd, uiop, crp);
1457 	else
1458 		rc = nsc_uwrite(svp->sv_fd, uiop, crp);
1459 
1460 	nsc_release(svp->sv_fd);
1461 
1462 out:
1463 	rw_exit(&svp->sv_lock);
1464 
1465 	return (rc);
1466 }
1467 
1468 
1469 static int
1470 sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp)
1471 {
1472 	return (sv_lyr_uio(dev, uiop, crp, NSC_READ));
1473 }
1474 
1475 
1476 static int
1477 sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp)
1478 {
1479 	return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE));
1480 }
1481 
1482 
1483 /* ARGSUSED */
1484 
1485 static int
1486 sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp)
1487 {
1488 	return (aphysio(sv_lyr_strategy,
1489 	    anocancel, dev, B_READ, minphys, aio));
1490 }
1491 
1492 
1493 /* ARGSUSED */
1494 
1495 static int
1496 sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp)
1497 {
1498 	return (aphysio(sv_lyr_strategy,
1499 	    anocancel, dev, B_WRITE, minphys, aio));
1500 }
1501 
1502 
1503 /*
1504  * Set up an array containing the list of raw path names
1505  * The array for the paths is svl and the size of the array is
1506  * in size.
1507  *
1508  * If there are more layered devices than will fit in the array,
1509  * the number of extra layered devices is returned.  Otherwise
1510  * zero is return.
1511  *
1512  * Input:
1513  *	svn	: array for paths
1514  *	size	: size of the array
1515  *
1516  * Output (extra):
1517  *	zero	: All paths fit in array
1518  *	>0	: Number of defined layered devices don't fit in array
1519  */
1520 
1521 static int
1522 sv_list(void *ptr, const int size, int *extra, const int ilp32)
1523 {
1524 	sv_name32_t *svn32;
1525 	sv_name_t *svn;
1526 	sv_dev_t *svp;
1527 	int *mode, *nblocks;
1528 	int i, index;
1529 	char *path;
1530 
1531 	*extra = 0;
1532 	index = 0;
1533 
1534 	if (ilp32)
1535 		svn32 = ptr;
1536 	else
1537 		svn = ptr;
1538 
1539 	mutex_enter(&sv_mutex);
1540 	for (i = 0; i < sv_max_devices; i++) {
1541 		svp = &sv_devs[i];
1542 
1543 		rw_enter(&svp->sv_lock, RW_READER);
1544 
1545 		if (svp->sv_state != SV_ENABLE) {
1546 			rw_exit(&svp->sv_lock);
1547 			continue;
1548 		}
1549 
1550 		if ((*extra) != 0 || ptr == NULL) {
1551 			/* Another overflow entry */
1552 			rw_exit(&svp->sv_lock);
1553 			(*extra)++;
1554 			continue;
1555 		}
1556 
1557 		if (ilp32) {
1558 			nblocks = &svn32->svn_nblocks;
1559 			mode = &svn32->svn_mode;
1560 			path = svn32->svn_path;
1561 
1562 			svn32->svn_timestamp = (uint32_t)svp->sv_timestamp;
1563 			svn32++;
1564 		} else {
1565 			nblocks = &svn->svn_nblocks;
1566 			mode = &svn->svn_mode;
1567 			path = svn->svn_path;
1568 
1569 			svn->svn_timestamp = svp->sv_timestamp;
1570 			svn++;
1571 		}
1572 
1573 		(void) strcpy(path, nsc_pathname(svp->sv_fd));
1574 		*nblocks = svp->sv_nblocks;
1575 		*mode = svp->sv_flag;
1576 
1577 		if (*nblocks == 0) {
1578 			if (sv_debug > 3)
1579 				cmn_err(CE_CONT, "!sv_list: need to reserve\n");
1580 
1581 			if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
1582 				*nblocks = svp->sv_nblocks;
1583 				nsc_release(svp->sv_fd);
1584 			}
1585 		}
1586 
1587 		if (++index >= size) {
1588 			/* Out of space */
1589 			(*extra)++;
1590 		}
1591 
1592 		rw_exit(&svp->sv_lock);
1593 	}
1594 	mutex_exit(&sv_mutex);
1595 
1596 	if (index < size) {
1597 		/* NULL terminated list */
1598 		if (ilp32)
1599 			svn32->svn_path[0] = '\0';
1600 		else
1601 			svn->svn_path[0] = '\0';
1602 	}
1603 
1604 	return (0);
1605 }
1606 
1607 
1608 static void
1609 sv_thread_tune(int threads)
1610 {
1611 	int incr = (threads > 0) ? 1 : -1;
1612 	int change = 0;
1613 	int nthreads;
1614 
1615 	ASSERT(MUTEX_HELD(&sv_mutex));
1616 
1617 	if (sv_threads_extra) {
1618 		/* keep track of any additional threads requested */
1619 		if (threads > 0) {
1620 			sv_threads_extra += threads;
1621 			return;
1622 		}
1623 		threads = -threads;
1624 		if (threads >= sv_threads_extra) {
1625 			threads -= sv_threads_extra;
1626 			sv_threads_extra = 0;
1627 			/* fall through to while loop */
1628 		} else {
1629 			sv_threads_extra -= threads;
1630 			return;
1631 		}
1632 	} else if (threads > 0) {
1633 		/*
1634 		 * do not increase the number of threads beyond
1635 		 * sv_threads_max when doing dynamic thread tuning
1636 		 */
1637 		nthreads = nst_nthread(sv_tset);
1638 		if ((nthreads + threads) > sv_threads_max) {
1639 			sv_threads_extra = nthreads + threads - sv_threads_max;
1640 			threads = sv_threads_max - nthreads;
1641 			if (threads <= 0)
1642 				return;
1643 		}
1644 	}
1645 
1646 	if (threads < 0)
1647 		threads = -threads;
1648 
1649 	while (threads--) {
1650 		nthreads = nst_nthread(sv_tset);
1651 		sv_threads_needed += incr;
1652 
1653 		if (sv_threads_needed >= nthreads)
1654 			change += nst_add_thread(sv_tset, sv_threads_inc);
1655 		else if ((sv_threads_needed <
1656 		    (nthreads - (sv_threads_inc + sv_threads_hysteresis))) &&
1657 		    ((nthreads - sv_threads_inc) >= sv_threads))
1658 			change -= nst_del_thread(sv_tset, sv_threads_inc);
1659 	}
1660 
1661 #ifdef DEBUG
1662 	if (change) {
1663 		cmn_err(CE_NOTE,
1664 		    "!sv_thread_tune: threads needed %d, nthreads %d, "
1665 		    "nthreads change %d",
1666 		    sv_threads_needed, nst_nthread(sv_tset), change);
1667 	}
1668 #endif
1669 }
1670 
1671 
1672 /* ARGSUSED */
1673 static int
1674 svopen(dev_t *devp, int flag, int otyp, cred_t *crp)
1675 {
1676 	int rc;
1677 
1678 	mutex_enter(&sv_mutex);
1679 	rc = sv_init_devs();
1680 	mutex_exit(&sv_mutex);
1681 
1682 	return (rc);
1683 }
1684 
1685 
1686 /* ARGSUSED */
1687 static int
1688 svclose(dev_t dev, int flag, int otyp, cred_t *crp)
1689 {
1690 	const int secs = HZ * 5;
1691 	const int ticks = HZ / 10;
1692 	int loops = secs / ticks;
1693 
1694 	mutex_enter(&sv_mutex);
1695 	while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) {
1696 		if (nst_nlive(sv_tset) <= 0) {
1697 			nst_destroy(sv_tset);
1698 			sv_tset = NULL;
1699 			break;
1700 		}
1701 
1702 		/* threads still active - wait for them to exit */
1703 		mutex_exit(&sv_mutex);
1704 		delay(ticks);
1705 		loops--;
1706 		mutex_enter(&sv_mutex);
1707 	}
1708 	mutex_exit(&sv_mutex);
1709 
1710 	if (loops <= 0) {
1711 		cmn_err(CE_WARN,
1712 #ifndef DEBUG
1713 		    /* do not write to console when non-DEBUG */
1714 		    "!"
1715 #endif
1716 		    "sv:svclose: threads still active "
1717 		    "after %d sec - leaking thread set", secs);
1718 	}
1719 
1720 	return (0);
1721 }
1722 
1723 
1724 static int
1725 svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp)
1726 {
1727 	char itmp1[12], itmp2[12]; /* temp char array for editing ints */
1728 	spcs_s_info_t kstatus;	/* Kernel version of spcs status */
1729 	spcs_s_info_t ustatus;	/* Address of user version of spcs status */
1730 	sv_list32_t svl32;	/* 32 bit Initial structure for SVIOC_LIST */
1731 	sv_version_t svv;	/* Version structure */
1732 	sv_conf_t svc;		/* User config structure */
1733 	sv_list_t svl;		/* Initial structure for SVIOC_LIST */
1734 	void *usvn;		/* Address of user sv_name_t */
1735 	void *svn = NULL;	/* Array for SVIOC_LIST */
1736 	uint64_t phash;		/* pathname hash */
1737 	int rc = 0;		/* Return code -- errno */
1738 	int size;		/* Number of items in array */
1739 	int bytes;		/* Byte size of array */
1740 	int ilp32;		/* Convert data structures for ilp32 userland */
1741 
1742 	*rvalp = 0;
1743 
1744 	/*
1745 	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, then it will continue.
1746 	 * else it means it previously was SV_PREVENT_UNLOAD, and now it's
1747 	 * SV_ALLOW_UNLOAD, expecting the driver to eventually unload.
1748 	 *
1749 	 * SV_ALLOW_UNLOAD is final state, so no need to grab sv_mutex.
1750 	 */
1751 	if (sv_mod_status == SV_ALLOW_UNLOAD) {
1752 		return (EBUSY);
1753 	}
1754 
1755 	if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0))
1756 		return (rc);
1757 
1758 	kstatus = spcs_s_kcreate();
1759 	if (!kstatus) {
1760 		DTRACE_PROBE1(sv_ioctl_err_kcreate, dev_t, dev);
1761 		return (ENOMEM);
1762 	}
1763 
1764 	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
1765 
1766 	switch (cmd) {
1767 
1768 	case SVIOC_ENABLE:
1769 
1770 		if (ilp32) {
1771 			sv_conf32_t svc32;
1772 
1773 			if (ddi_copyin((void *)arg, &svc32,
1774 			    sizeof (svc32), mode) < 0) {
1775 				spcs_s_kfree(kstatus);
1776 				return (EFAULT);
1777 			}
1778 
1779 			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1780 			(void) strcpy(svc.svc_path, svc32.svc_path);
1781 			svc.svc_flag  = svc32.svc_flag;
1782 			svc.svc_major = svc32.svc_major;
1783 			svc.svc_minor = svc32.svc_minor;
1784 		} else {
1785 			if (ddi_copyin((void *)arg, &svc,
1786 			    sizeof (svc), mode) < 0) {
1787 				spcs_s_kfree(kstatus);
1788 				return (EFAULT);
1789 			}
1790 		}
1791 
1792 		/* force to raw access */
1793 		svc.svc_flag = NSC_DEVICE;
1794 
1795 		if (sv_tset == NULL) {
1796 			mutex_enter(&sv_mutex);
1797 
1798 			if (sv_tset == NULL) {
1799 				sv_tset = nst_init("sv_thr", sv_threads);
1800 			}
1801 
1802 			mutex_exit(&sv_mutex);
1803 
1804 			if (sv_tset == NULL) {
1805 				cmn_err(CE_WARN,
1806 				    "!sv: could not allocate %d threads",
1807 				    sv_threads);
1808 			}
1809 		}
1810 
1811 		rc = sv_enable(svc.svc_path, svc.svc_flag,
1812 		    makedevice(svc.svc_major, svc.svc_minor), kstatus);
1813 
1814 		if (rc == 0) {
1815 			sv_config_time = nsc_lbolt();
1816 
1817 			mutex_enter(&sv_mutex);
1818 			sv_thread_tune(sv_threads_dev);
1819 			mutex_exit(&sv_mutex);
1820 		}
1821 
1822 		DTRACE_PROBE3(sv_ioctl_end, dev_t, dev, int, *rvalp, int, rc);
1823 
1824 		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1825 		/* NOTREACHED */
1826 
1827 	case SVIOC_DISABLE:
1828 
1829 		if (ilp32) {
1830 			sv_conf32_t svc32;
1831 
1832 			if (ddi_copyin((void *)arg, &svc32,
1833 			    sizeof (svc32), mode) < 0) {
1834 				spcs_s_kfree(kstatus);
1835 				return (EFAULT);
1836 			}
1837 
1838 			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1839 			svc.svc_major = svc32.svc_major;
1840 			svc.svc_minor = svc32.svc_minor;
1841 			(void) strcpy(svc.svc_path, svc32.svc_path);
1842 			svc.svc_flag  = svc32.svc_flag;
1843 		} else {
1844 			if (ddi_copyin((void *)arg, &svc,
1845 			    sizeof (svc), mode) < 0) {
1846 				spcs_s_kfree(kstatus);
1847 				return (EFAULT);
1848 			}
1849 		}
1850 
1851 		if (svc.svc_major == (major_t)-1 &&
1852 		    svc.svc_minor == (minor_t)-1) {
1853 			sv_dev_t *svp;
1854 			int i;
1855 
1856 			/*
1857 			 * User level could not find the minor device
1858 			 * node, so do this the slow way by searching
1859 			 * the entire sv config for a matching pathname.
1860 			 */
1861 
1862 			phash = nsc_strhash(svc.svc_path);
1863 
1864 			mutex_enter(&sv_mutex);
1865 
1866 			for (i = 0; i < sv_max_devices; i++) {
1867 				svp = &sv_devs[i];
1868 
1869 				if (svp->sv_state == SV_DISABLE ||
1870 				    svp->sv_fd == NULL)
1871 					continue;
1872 
1873 				if (nsc_fdpathcmp(svp->sv_fd, phash,
1874 				    svc.svc_path) == 0) {
1875 					svc.svc_major = getmajor(svp->sv_dev);
1876 					svc.svc_minor = getminor(svp->sv_dev);
1877 					break;
1878 				}
1879 			}
1880 
1881 			mutex_exit(&sv_mutex);
1882 
1883 			if (svc.svc_major == (major_t)-1 &&
1884 			    svc.svc_minor == (minor_t)-1)
1885 				return (spcs_s_ocopyoutf(&kstatus,
1886 				    svc.svc_error, SV_ENODEV));
1887 		}
1888 
1889 		rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor),
1890 		    kstatus);
1891 
1892 		if (rc == 0) {
1893 			sv_config_time = nsc_lbolt();
1894 
1895 			mutex_enter(&sv_mutex);
1896 			sv_thread_tune(-sv_threads_dev);
1897 			mutex_exit(&sv_mutex);
1898 		}
1899 
1900 		DTRACE_PROBE3(sv_ioctl_2, dev_t, dev, int, *rvalp, int, rc);
1901 
1902 		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1903 		/* NOTREACHED */
1904 
1905 	case SVIOC_LIST:
1906 
1907 		if (ilp32) {
1908 			if (ddi_copyin((void *)arg, &svl32,
1909 			    sizeof (svl32), mode) < 0) {
1910 				spcs_s_kfree(kstatus);
1911 				return (EFAULT);
1912 			}
1913 
1914 			ustatus = (spcs_s_info_t)svl32.svl_error;
1915 			size = svl32.svl_count;
1916 			usvn = (void *)(unsigned long)svl32.svl_names;
1917 		} else {
1918 			if (ddi_copyin((void *)arg, &svl,
1919 			    sizeof (svl), mode) < 0) {
1920 				spcs_s_kfree(kstatus);
1921 				return (EFAULT);
1922 			}
1923 
1924 			ustatus = svl.svl_error;
1925 			size = svl.svl_count;
1926 			usvn = svl.svl_names;
1927 		}
1928 
1929 		/* Do some boundary checking */
1930 		if ((size < 0) || (size > sv_max_devices)) {
1931 			/* Array size is out of range */
1932 			return (spcs_s_ocopyoutf(&kstatus, ustatus,
1933 			    SV_EARRBOUNDS, "0",
1934 			    spcs_s_inttostring(sv_max_devices, itmp1,
1935 			    sizeof (itmp1), 0),
1936 			    spcs_s_inttostring(size, itmp2,
1937 			    sizeof (itmp2), 0)));
1938 		}
1939 
1940 		if (ilp32)
1941 			bytes = size * sizeof (sv_name32_t);
1942 		else
1943 			bytes = size * sizeof (sv_name_t);
1944 
1945 		/* Allocate memory for the array of structures */
1946 		if (bytes != 0) {
1947 			svn = kmem_zalloc(bytes, KM_SLEEP);
1948 			if (!svn) {
1949 				return (spcs_s_ocopyoutf(&kstatus,
1950 				    ustatus, ENOMEM));
1951 			}
1952 		}
1953 
1954 		rc = sv_list(svn, size, rvalp, ilp32);
1955 		if (rc) {
1956 			if (svn != NULL)
1957 				kmem_free(svn, bytes);
1958 			return (spcs_s_ocopyoutf(&kstatus, ustatus, rc));
1959 		}
1960 
1961 		if (ilp32) {
1962 			svl32.svl_timestamp = (uint32_t)sv_config_time;
1963 			svl32.svl_maxdevs = (int32_t)sv_max_devices;
1964 
1965 			/* Return the list structure */
1966 			if (ddi_copyout(&svl32, (void *)arg,
1967 			    sizeof (svl32), mode) < 0) {
1968 				spcs_s_kfree(kstatus);
1969 				if (svn != NULL)
1970 					kmem_free(svn, bytes);
1971 				return (EFAULT);
1972 			}
1973 		} else {
1974 			svl.svl_timestamp = sv_config_time;
1975 			svl.svl_maxdevs = sv_max_devices;
1976 
1977 			/* Return the list structure */
1978 			if (ddi_copyout(&svl, (void *)arg,
1979 			    sizeof (svl), mode) < 0) {
1980 				spcs_s_kfree(kstatus);
1981 				if (svn != NULL)
1982 					kmem_free(svn, bytes);
1983 				return (EFAULT);
1984 			}
1985 		}
1986 
1987 		/* Return the array */
1988 		if (svn != NULL) {
1989 			if (ddi_copyout(svn, usvn, bytes, mode) < 0) {
1990 				kmem_free(svn, bytes);
1991 				spcs_s_kfree(kstatus);
1992 				return (EFAULT);
1993 			}
1994 			kmem_free(svn, bytes);
1995 		}
1996 
1997 		DTRACE_PROBE3(sv_ioctl_3, dev_t, dev, int, *rvalp, int, 0);
1998 
1999 		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2000 		/* NOTREACHED */
2001 
2002 	case SVIOC_VERSION:
2003 
2004 		if (ilp32) {
2005 			sv_version32_t svv32;
2006 
2007 			if (ddi_copyin((void *)arg, &svv32,
2008 			    sizeof (svv32), mode) < 0) {
2009 				spcs_s_kfree(kstatus);
2010 				return (EFAULT);
2011 			}
2012 
2013 			svv32.svv_major_rev = sv_major_rev;
2014 			svv32.svv_minor_rev = sv_minor_rev;
2015 			svv32.svv_micro_rev = sv_micro_rev;
2016 			svv32.svv_baseline_rev = sv_baseline_rev;
2017 
2018 			if (ddi_copyout(&svv32, (void *)arg,
2019 			    sizeof (svv32), mode) < 0) {
2020 				spcs_s_kfree(kstatus);
2021 				return (EFAULT);
2022 			}
2023 
2024 			ustatus = (spcs_s_info_t)svv32.svv_error;
2025 		} else {
2026 			if (ddi_copyin((void *)arg, &svv,
2027 			    sizeof (svv), mode) < 0) {
2028 				spcs_s_kfree(kstatus);
2029 				return (EFAULT);
2030 			}
2031 
2032 			svv.svv_major_rev = sv_major_rev;
2033 			svv.svv_minor_rev = sv_minor_rev;
2034 			svv.svv_micro_rev = sv_micro_rev;
2035 			svv.svv_baseline_rev = sv_baseline_rev;
2036 
2037 			if (ddi_copyout(&svv, (void *)arg,
2038 			    sizeof (svv), mode) < 0) {
2039 				spcs_s_kfree(kstatus);
2040 				return (EFAULT);
2041 			}
2042 
2043 			ustatus = svv.svv_error;
2044 		}
2045 
2046 		DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, 0);
2047 
2048 		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2049 		/* NOTREACHED */
2050 
2051 	case SVIOC_UNLOAD:
2052 		rc = sv_prepare_unload();
2053 
2054 		if (ddi_copyout(&rc, (void *)arg, sizeof (rc), mode) < 0) {
2055 			rc = EFAULT;
2056 		}
2057 
2058 		spcs_s_kfree(kstatus);
2059 		return (rc);
2060 
2061 	default:
2062 		spcs_s_kfree(kstatus);
2063 
2064 		DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, EINVAL);
2065 
2066 		return (EINVAL);
2067 		/* NOTREACHED */
2068 	}
2069 
2070 	/* NOTREACHED */
2071 }
2072 
2073 
2074 /* ARGSUSED */
2075 static int
2076 svprint(dev_t dev, char *str)
2077 {
2078 	int instance = ddi_get_instance(sv_dip);
2079 	cmn_err(CE_WARN, "!%s%d: %s", ddi_get_name(sv_dip), instance, str);
2080 	return (0);
2081 }
2082 
2083 
/*
 * Perform the i/o described by bp.
 *
 * If the device is not SV enabled, bp is handed to the underlying
 * driver's saved strategy routine (or failed with ENODEV if there is
 * none).  Otherwise the transfer is carried out through nsctl: the
 * device is reserved, the request is clipped to the device size, and the
 * data is moved in chunks of at most sv_maxfbas FBAs by allocating an
 * nsctl buffer for each chunk and copying between it and the (mapped in)
 * bp data area.
 *
 * Errors are reported through bioerror() on bp; this function returns
 * nothing.  On every path other than the pass-through to the underlying
 * strategy routine, bp is completed here with biodone().
 */
static void
_sv_lyr_strategy(struct buf *bp)
{
	caddr_t buf_addr;		/* pointer to linear buffer in bp */
	nsc_buf_t *bufh = NULL;
	nsc_buf_t *hndl = NULL;
	sv_dev_t *svp;
	nsc_vec_t *v;
	sv_maj_t *maj;
	nsc_size_t fba_req, fba_len;	/* FBA lengths */
	nsc_off_t fba_off;		/* FBA offset */
	size_t tocopy, nbytes;		/* byte lengths */
	int rw, rc;			/* flags and return codes */
	int (*fn)();

	rc = 0;

	if (sv_debug > 5)
		cmn_err(CE_CONT, "!_sv_lyr_strategy(%p)\n", (void *)bp);

	svp = sv_find_enabled(bp->b_edev, &maj);
	if (svp == NULL) {
		/*
		 * Not SV enabled: pass through.  The value returned by
		 * the underlying strategy routine is stored in rc but is
		 * not used afterwards, and biodone() is not called here;
		 * presumably the underlying driver completes bp.
		 */
		if (maj && (fn = maj->sm_strategy) != 0) {
			if (!(maj->sm_flag & D_MP)) {
				UNSAFE_ENTER();
				rc = (*fn)(bp);
				UNSAFE_EXIT();
			} else {
				rc = (*fn)(bp);
			}
			return;
		} else {
			bioerror(bp, ENODEV);
			biodone(bp);
			return;
		}
	}

	ASSERT(RW_READ_HELD(&svp->sv_lock));

	if (svp->sv_flag == 0) {
		/*
		 * guard access mode
		 * - prevent user level access to the device
		 */
		DTRACE_PROBE1(sv_lyr_strategy_err_guard, struct buf *, bp);
		bioerror(bp, EPERM);
		goto out;
	}

	/* nothing has been reserved yet, so failure skips the release */
	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_rsrv, struct buf *, bp);

		if (rc == EINTR)
			cmn_err(CE_WARN, "!nsc_reserve() returned EINTR");
		bioerror(bp, rc);
		goto out;
	}

	/* request starts at or beyond the end of the device */
	if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) {
		DTRACE_PROBE1(sv_lyr_strategy_eof, struct buf *, bp);

		if (bp->b_flags & B_READ) {
			/* return EOF, not an error */
			bp->b_resid = bp->b_bcount;
			bioerror(bp, 0);
		} else
			bioerror(bp, EINVAL);

		goto done;
	}

	/*
	 * Preallocate a handle once per call to strategy.
	 * If this fails, then the nsc_alloc_buf() will allocate
	 * a temporary handle per allocation/free pair.
	 */

	DTRACE_PROBE1(sv_dbg_alloch_start, sv_dev_t *, svp);

	bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL);

	DTRACE_PROBE1(sv_dbg_alloch_end, sv_dev_t *, svp);

	/* a freshly allocated handle should never be marked active */
	if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_hactive, struct buf *, bp);

		cmn_err(CE_WARN,
		    "!sv: allocated active handle (bufh %p, flags %x)",
		    (void *)bufh, bufh->sb_flag);

		bioerror(bp, ENXIO);
		goto done;
	}

	/* clip the request so that it does not run past the device end */
	fba_req = FBA_LEN(bp->b_bcount);
	if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks)
		fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno);

	rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE;

	bp_mapin(bp);

	bp->b_resid = bp->b_bcount;
	buf_addr = bp->b_un.b_addr;
	fba_off = 0;

	/*
	 * fba_req  - requested size of transfer in FBAs after
	 *		truncation to device extent, and allowing for
	 *		possible non-FBA bounded final chunk.
	 * fba_off  - offset of start of chunk from start of bp in FBAs.
	 * fba_len  - size of this chunk in FBAs.
	 */

loop:
	fba_len = min(fba_req, svp->sv_maxfbas);
	/* start from the preallocated handle (NULL if that failed) */
	hndl = bufh;

	DTRACE_PROBE4(sv_dbg_allocb_start,
	    sv_dev_t *, svp,
	    uint64_t, (uint64_t)(bp->b_lblkno + fba_off),
	    uint64_t, (uint64_t)fba_len,
	    int, rw);

	rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off),
	    fba_len, rw, &hndl);

	DTRACE_PROBE1(sv_dbg_allocb_end, sv_dev_t *, svp);

	if (rc > 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_alloc, struct buf *, bp);
		bioerror(bp, rc);
		/*
		 * Only free a handle that is not the preallocated one;
		 * bufh itself is released at done:.
		 */
		if (hndl != bufh)
			(void) nsc_free_buf(hndl);
		hndl = NULL;
		goto done;
	}

	tocopy = min(FBA_SIZE(fba_len), bp->b_resid);
	v = hndl->sb_vec;

	if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) {
		/*
		 * Not overwriting all of the last FBA, so read in the
		 * old contents now before we overwrite it with the new
		 * data.
		 */

		DTRACE_PROBE2(sv_dbg_read_start, sv_dev_t *, svp,
		    uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1));

		rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0);
		if (rc > 0) {
			bioerror(bp, rc);
			goto done;
		}

		DTRACE_PROBE1(sv_dbg_read_end, sv_dev_t *, svp);
	}

	DTRACE_PROBE1(sv_dbg_bcopy_start, sv_dev_t *, svp);

	/* walk the buffer's vector list, copying one segment at a time */
	while (tocopy > 0) {
		nbytes = min(tocopy, (nsc_size_t)v->sv_len);

		if (bp->b_flags & B_READ)
			(void) bcopy(v->sv_addr, buf_addr, nbytes);
		else
			(void) bcopy(buf_addr, v->sv_addr, nbytes);

		bp->b_resid -= nbytes;
		buf_addr += nbytes;
		tocopy -= nbytes;
		v++;
	}

	DTRACE_PROBE1(sv_dbg_bcopy_end, sv_dev_t *, svp);

	if ((bp->b_flags & B_READ) == 0) {
		DTRACE_PROBE3(sv_dbg_write_start, sv_dev_t *, svp,
		    uint64_t, (uint64_t)hndl->sb_pos,
		    uint64_t, (uint64_t)hndl->sb_len);

		rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0);

		DTRACE_PROBE1(sv_dbg_write_end, sv_dev_t *, svp);

		if (rc > 0) {
			bioerror(bp, rc);
			goto done;
		}
	}

	/*
	 * Adjust FBA offset and requested (ie. remaining) length,
	 * loop if more data to transfer.
	 */

	fba_off += fba_len;
	fba_req -= fba_len;

	if (fba_req > 0) {
		DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);

		rc = nsc_free_buf(hndl);

		DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);

		if (rc > 0) {
			DTRACE_PROBE1(sv_lyr_strategy_err_free,
			    struct buf *, bp);
			bioerror(bp, rc);
		}

		/* already freed; stop done: from freeing it again */
		hndl = NULL;

		if (rc <= 0)
			goto loop;
	}

done:
	/* free the buffer of the last (or failed) chunk, if still held */
	if (hndl != NULL) {
		DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);

		rc = nsc_free_buf(hndl);

		DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);

		if (rc > 0) {
			DTRACE_PROBE1(sv_lyr_strategy_err_free,
			    struct buf *, bp);
			bioerror(bp, rc);
		}

		hndl = NULL;
	}

	if (bufh)
		(void) nsc_free_handle(bufh);

	DTRACE_PROBE1(sv_dbg_rlse_start, sv_dev_t *, svp);

	nsc_release(svp->sv_fd);

	DTRACE_PROBE1(sv_dbg_rlse_end, sv_dev_t *, svp);

out:
	if (sv_debug > 5) {
		cmn_err(CE_CONT,
		    "!_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n",
		    (void *)bp, (void *)bufh, bp->b_error);
	}

	DTRACE_PROBE2(sv_lyr_strategy_end, struct buf *, bp, int, bp->b_error);

	/* drop the reader lock taken by sv_find_enabled(), then complete bp */
	rw_exit(&svp->sv_lock);
	biodone(bp);
}
2343 
2344 
2345 static void
2346 sv_async_strategy(blind_t arg)
2347 {
2348 	struct buf *bp = (struct buf *)arg;
2349 	_sv_lyr_strategy(bp);
2350 }
2351 
2352 
2353 static int
2354 sv_lyr_strategy(struct buf *bp)
2355 {
2356 	nsthread_t *tp;
2357 	int nlive;
2358 
2359 	/*
2360 	 * If B_ASYNC was part of the DDI we could use it as a hint to
2361 	 * not create a thread for synchronous i/o.
2362 	 */
2363 	if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
2364 		/* not sv enabled - just pass through */
2365 		DTRACE_PROBE1(sv_lyr_strategy_notsv, struct buf *, bp);
2366 		_sv_lyr_strategy(bp);
2367 		return (0);
2368 	}
2369 
2370 	if (sv_debug > 4) {
2371 		cmn_err(CE_CONT, "!sv_lyr_strategy: nthread %d nlive %d\n",
2372 		    nst_nthread(sv_tset), nst_nlive(sv_tset));
2373 	}
2374 
2375 	/*
2376 	 * If there are only guard devices enabled there
2377 	 * won't be a threadset, so don't try and use it.
2378 	 */
2379 	tp = NULL;
2380 	if (sv_tset != NULL) {
2381 		tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
2382 	}
2383 
2384 	if (tp == NULL) {
2385 		/*
2386 		 * out of threads, so fall back to synchronous io.
2387 		 */
2388 		if (sv_debug > 0) {
2389 			cmn_err(CE_CONT,
2390 			    "!sv_lyr_strategy: thread alloc failed\n");
2391 		}
2392 
2393 		DTRACE_PROBE1(sv_lyr_strategy_no_thread,
2394 		    struct buf *, bp);
2395 
2396 		_sv_lyr_strategy(bp);
2397 		sv_no_threads++;
2398 	} else {
2399 		nlive = nst_nlive(sv_tset);
2400 		if (nlive > sv_max_nlive) {
2401 			if (sv_debug > 0) {
2402 				cmn_err(CE_CONT,
2403 				    "!sv_lyr_strategy: "
2404 				    "new max nlive %d (nthread %d)\n",
2405 				    nlive, nst_nthread(sv_tset));
2406 			}
2407 
2408 			sv_max_nlive = nlive;
2409 		}
2410 	}
2411 
2412 	return (0);
2413 }
2414 
2415 
/*
 * Fallback definition of offsetof(), used only when no header has
 * already defined it: the byte offset of member m within struct type s.
 */
#ifndef offsetof
#define	offsetof(s, m)	((size_t)(&((s *)0)->m))
#endif
2419 
2420 /*
2421  * re-write the size of the current partition
2422  */
2423 static int
2424 sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp)
2425 {
2426 	size_t offset;
2427 	int ilp32;
2428 	int pnum;
2429 	int rc;
2430 
2431 	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
2432 
2433 	rc = nskern_partition(svp->sv_dev, &pnum);
2434 	if (rc != 0) {
2435 		return (rc);
2436 	}
2437 
2438 	if (pnum < 0 || pnum >= V_NUMPAR) {
2439 		cmn_err(CE_WARN,
2440 		    "!sv_gvtoc: unable to determine partition number "
2441 		    "for dev %lx", svp->sv_dev);
2442 		return (EINVAL);
2443 	}
2444 
2445 	if (ilp32) {
2446 		int32_t p_size;
2447 
2448 #ifdef _SunOS_5_6
2449 		offset = offsetof(struct vtoc, v_part);
2450 		offset += sizeof (struct partition) * pnum;
2451 		offset += offsetof(struct partition, p_size);
2452 #else
2453 		offset = offsetof(struct vtoc32, v_part);
2454 		offset += sizeof (struct partition32) * pnum;
2455 		offset += offsetof(struct partition32, p_size);
2456 #endif
2457 
2458 		p_size = (int32_t)svp->sv_nblocks;
2459 		if (p_size == 0) {
2460 			if (sv_reserve(svp->sv_fd,
2461 			    NSC_MULTI|NSC_PCATCH) == 0) {
2462 				p_size = (int32_t)svp->sv_nblocks;
2463 				nsc_release(svp->sv_fd);
2464 			} else {
2465 				rc = EINTR;
2466 			}
2467 		}
2468 
2469 		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2470 		    sizeof (p_size), mode) != 0) {
2471 			rc = EFAULT;
2472 		}
2473 	} else {
2474 		long p_size;
2475 
2476 		offset = offsetof(struct vtoc, v_part);
2477 		offset += sizeof (struct partition) * pnum;
2478 		offset += offsetof(struct partition, p_size);
2479 
2480 		p_size = (long)svp->sv_nblocks;
2481 		if (p_size == 0) {
2482 			if (sv_reserve(svp->sv_fd,
2483 			    NSC_MULTI|NSC_PCATCH) == 0) {
2484 				p_size = (long)svp->sv_nblocks;
2485 				nsc_release(svp->sv_fd);
2486 			} else {
2487 				rc = EINTR;
2488 			}
2489 		}
2490 
2491 		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2492 		    sizeof (p_size), mode) != 0) {
2493 			rc = EFAULT;
2494 		}
2495 	}
2496 
2497 	return (rc);
2498 }
2499 
2500 
2501 #ifdef DKIOCPARTITION
2502 /*
2503  * re-write the size of the current partition
2504  *
2505  * arg is dk_efi_t.
2506  *
2507  * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64;
2508  *
2509  * dk_efi_t->dki_data --> efi_gpt_t (label header)
2510  * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions)
2511  *
2512  * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts
2513  * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself
2514  *
2515  * This assumes that sizeof (efi_gpt_t) is the same as the size of a
2516  * logical block on the disk.
2517  *
2518  * Everything is little endian (i.e. disk format).
2519  */
2520 static int
2521 sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp)
2522 {
2523 	dk_efi_t efi;
2524 	efi_gpt_t gpt;
2525 	efi_gpe_t *gpe = NULL;
2526 	size_t sgpe;
2527 	uint64_t p_size;	/* virtual partition size from nsctl */
2528 	uint32_t crc;
2529 	int unparts;		/* number of parts in user's array */
2530 	int pnum;
2531 	int rc;
2532 
2533 	rc = nskern_partition(svp->sv_dev, &pnum);
2534 	if (rc != 0) {
2535 		return (rc);
2536 	}
2537 
2538 	if (pnum < 0) {
2539 		cmn_err(CE_WARN,
2540 		    "!sv_efi: unable to determine partition number for dev %lx",
2541 		    svp->sv_dev);
2542 		return (EINVAL);
2543 	}
2544 
2545 	if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) {
2546 		return (EFAULT);
2547 	}
2548 
2549 	efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
2550 
2551 	if (efi.dki_length < sizeof (gpt) + sizeof (gpe)) {
2552 		return (EINVAL);
2553 	}
2554 
2555 	if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) {
2556 		rc = EFAULT;
2557 		goto out;
2558 	}
2559 
2560 	if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0)
2561 		unparts = 1;
2562 	else if (pnum >= unparts) {
2563 		cmn_err(CE_WARN,
2564 		    "!sv_efi: partition# beyond end of user array (%d >= %d)",
2565 		    pnum, unparts);
2566 		return (EINVAL);
2567 	}
2568 
2569 	sgpe = sizeof (*gpe) * unparts;
2570 	gpe = kmem_alloc(sgpe, KM_SLEEP);
2571 
2572 	if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) {
2573 		rc = EFAULT;
2574 		goto out;
2575 	}
2576 
2577 	p_size = svp->sv_nblocks;
2578 	if (p_size == 0) {
2579 		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2580 			p_size = (diskaddr_t)svp->sv_nblocks;
2581 			nsc_release(svp->sv_fd);
2582 		} else {
2583 			rc = EINTR;
2584 		}
2585 	}
2586 
2587 	gpe[pnum].efi_gpe_EndingLBA = LE_64(
2588 	    LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1);
2589 
2590 	gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
2591 	CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
2592 	gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
2593 
2594 	gpt.efi_gpt_HeaderCRC32 = 0;
2595 	CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
2596 	gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
2597 
2598 	if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) {
2599 		rc = EFAULT;
2600 		goto out;
2601 	}
2602 
2603 	if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) {
2604 		rc = EFAULT;
2605 		goto out;
2606 	}
2607 
2608 out:
2609 	if (gpe) {
2610 		kmem_free(gpe, sgpe);
2611 	}
2612 
2613 	return (rc);
2614 }
2615 
2616 
2617 /*
2618  * Re-write the size of the partition specified by p_partno
2619  *
2620  * Note that if a DKIOCPARTITION is issued to an fd opened against a
2621  * non-sv'd device, but p_partno requests the size for a different
2622  * device that is sv'd, this function will *not* be called as sv is
2623  * not interposed on the original device (the fd).
2624  *
2625  * It would not be easy to change this as we cannot get the partition
2626  * number for the non-sv'd device, so cannot compute the dev_t of the
2627  * (sv'd) p_partno device, and so cannot find out if it is sv'd or get
2628  * its size from nsctl.
2629  *
2630  * See also the "Bug 4755783" comment in sv_lyr_ioctl().
2631  */
2632 static int
2633 sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp)
2634 {
2635 	struct partition64 p64;
2636 	sv_dev_t *nsvp = NULL;
2637 	diskaddr_t p_size;
2638 	minor_t nminor;
2639 	int pnum, rc;
2640 	dev_t ndev;
2641 
2642 	rc = nskern_partition(svp->sv_dev, &pnum);
2643 	if (rc != 0) {
2644 		return (rc);
2645 	}
2646 
2647 	if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) {
2648 		return (EFAULT);
2649 	}
2650 
2651 	if (p64.p_partno != pnum) {
2652 		/* switch to requested partition, not the current one */
2653 		nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
2654 		ndev = makedevice(getmajor(svp->sv_dev), nminor);
2655 		nsvp = sv_find_enabled(ndev, NULL);
2656 		if (nsvp == NULL) {
2657 			/* not sv device - just return */
2658 			return (0);
2659 		}
2660 
2661 		svp = nsvp;
2662 	}
2663 
2664 	p_size = svp->sv_nblocks;
2665 	if (p_size == 0) {
2666 		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2667 			p_size = (diskaddr_t)svp->sv_nblocks;
2668 			nsc_release(svp->sv_fd);
2669 		} else {
2670 			rc = EINTR;
2671 		}
2672 	}
2673 
2674 	if (nsvp != NULL) {
2675 		rw_exit(&nsvp->sv_lock);
2676 	}
2677 
2678 	if ((rc == 0) && ddi_copyout(&p_size,
2679 	    (void *)(arg + offsetof(struct partition64, p_size)),
2680 	    sizeof (p_size), mode) != 0) {
2681 		return (EFAULT);
2682 	}
2683 
2684 	return (rc);
2685 }
2686 #endif /* DKIOCPARTITION */
2687 
2688 
2689 static int
2690 sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg,
2691     const int mode, cred_t *crp, int *rvalp)
2692 {
2693 	sv_dev_t *svp;
2694 	sv_maj_t *maj;
2695 	int (*fn)();
2696 	int rc = 0;
2697 
2698 	maj = 0;
2699 	fn = 0;
2700 
2701 	/*
2702 	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, then it will continue.
2703 	 * else it means it previously was SV_PREVENT_UNLOAD, and now it's
2704 	 * SV_ALLOW_UNLOAD, expecting the driver to eventually unload.
2705 	 *
2706 	 * SV_ALLOW_UNLOAD is final state, so no need to grab sv_mutex.
2707 	 */
2708 	if (sv_mod_status == SV_ALLOW_UNLOAD) {
2709 		return (EBUSY);
2710 	}
2711 
2712 	svp = sv_find_enabled(dev, &maj);
2713 	if (svp != NULL) {
2714 		if (nskernd_isdaemon()) {
2715 			/*
2716 			 * This is nskernd which always needs to see
2717 			 * the underlying disk device accurately.
2718 			 *
2719 			 * So just pass the ioctl straight through
2720 			 * to the underlying driver as though the device
2721 			 * was not sv enabled.
2722 			 */
2723 			DTRACE_PROBE2(sv_lyr_ioctl_nskernd, sv_dev_t *, svp,
2724 			    dev_t, dev);
2725 
2726 			rw_exit(&svp->sv_lock);
2727 			svp = NULL;
2728 		} else {
2729 			ASSERT(RW_READ_HELD(&svp->sv_lock));
2730 		}
2731 	}
2732 
2733 	/*
2734 	 * We now have a locked and enabled SV device, or a non-SV device.
2735 	 */
2736 
2737 	switch (cmd) {
2738 		/*
2739 		 * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI
2740 		 * and DKIOCSETEFI are intercepted and faked up as some
2741 		 * i/o providers emulate volumes of a different size to
2742 		 * the underlying volume.
2743 		 *
2744 		 * Setting the size by rewriting the vtoc is not permitted.
2745 		 */
2746 
2747 	case DKIOCSVTOC:
2748 #ifdef DKIOCPARTITION
2749 	case DKIOCSETEFI:
2750 #endif
2751 		if (svp == NULL) {
2752 			/* not intercepted -- allow ioctl through */
2753 			break;
2754 		}
2755 
2756 		rw_exit(&svp->sv_lock);
2757 
2758 		DTRACE_PROBE2(sv_lyr_ioctl_svtoc, dev_t, dev, int, EPERM);
2759 
2760 		return (EPERM);
2761 
2762 	default:
2763 		break;
2764 	}
2765 
2766 	/*
2767 	 * Pass through the real ioctl command.
2768 	 */
2769 
2770 	if (maj && (fn = maj->sm_ioctl) != 0) {
2771 		if (!(maj->sm_flag & D_MP)) {
2772 			UNSAFE_ENTER();
2773 			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2774 			UNSAFE_EXIT();
2775 		} else {
2776 			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2777 		}
2778 	} else {
2779 		rc = ENODEV;
2780 	}
2781 
2782 	/*
2783 	 * Bug 4755783
2784 	 * Fix up the size of the current partition to allow
2785 	 * for the virtual volume to be a different size to the
2786 	 * physical volume (e.g. for II compact dependent shadows).
2787 	 *
2788 	 * Note that this only attempts to fix up the current partition
2789 	 * - the one that the ioctl was issued against.  There could be
2790 	 * other sv'd partitions in the same vtoc, but we cannot tell
2791 	 * so we don't attempt to fix them up.
2792 	 */
2793 
2794 	if (svp != NULL && rc == 0) {
2795 		switch (cmd) {
2796 		case DKIOCGVTOC:
2797 			rc = sv_fix_dkiocgvtoc(arg, mode, svp);
2798 			break;
2799 
2800 #ifdef DKIOCPARTITION
2801 		case DKIOCGETEFI:
2802 			rc = sv_fix_dkiocgetefi(arg, mode, svp);
2803 			break;
2804 
2805 		case DKIOCPARTITION:
2806 			rc = sv_fix_dkiocpartition(arg, mode, svp);
2807 			break;
2808 #endif /* DKIOCPARTITION */
2809 		}
2810 	}
2811 
2812 	if (svp != NULL) {
2813 		rw_exit(&svp->sv_lock);
2814 	}
2815 
2816 	return (rc);
2817 }
2818