xref: /titanic_50/usr/src/uts/common/avs/ns/sv/sv.c (revision 5c5f137104b2d56181283389fa902220f2023809)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
26  */
27 
28 /*
29  * Storage Volume Character and Block Driver (SV)
30  *
31  * This driver implements a simplistic /dev/{r}dsk/ interface to a
32  * specified disk volume that is otherwise managed by the Prism
33  * software.  The SV driver layers itself onto the underlying disk
34  * device driver by changing function pointers in the cb_ops
35  * structure.
36  *
37  * CONFIGURATION:
38  *
39  * 1. Configure the driver using the svadm utility.
40  * 2. Access the device as before through /dev/rdsk/c?t?d?s?
41  *
42  * LIMITATIONS:
43  *
44  * This driver should NOT be used to share a device between another
45  * DataServices user interface module (e.g., STE) and a user accessing
46  * the device through the block device in O_WRITE mode.  This is because
47  * writes through the block device are asynchronous (due to the page
48  * cache) and so consistency between the block device user and the
49  * STE user cannot be guaranteed.
50  *
51  * Data is copied between system struct buf(9s) and nsc_vec_t.  This is
52  * wasteful and slow.
53  */
54 
55 #include <sys/debug.h>
56 #include <sys/types.h>
57 
58 #include <sys/ksynch.h>
59 #include <sys/kmem.h>
60 #include <sys/errno.h>
61 #include <sys/varargs.h>
62 #include <sys/file.h>
63 #include <sys/open.h>
64 #include <sys/conf.h>
65 #include <sys/cred.h>
66 #include <sys/buf.h>
67 #include <sys/uio.h>
68 #ifndef DS_DDICT
69 #include <sys/pathname.h>
70 #endif
71 #include <sys/aio_req.h>
72 #include <sys/dkio.h>
73 #include <sys/vtoc.h>
74 #include <sys/cmn_err.h>
75 #include <sys/modctl.h>
76 #include <sys/ddi.h>
77 #include <sys/sysmacros.h>
78 #include <sys/sunddi.h>
79 #include <sys/sunldi.h>
80 #include <sys/nsctl/nsvers.h>
81 
82 #include <sys/nsc_thread.h>
83 #include <sys/unistat/spcs_s.h>
84 #include <sys/unistat/spcs_s_k.h>
85 #include <sys/unistat/spcs_errors.h>
86 
87 #ifdef DS_DDICT
88 #include "../contract.h"
89 #endif
90 
91 #include "../nsctl.h"
92 
93 
94 #include <sys/sdt.h>		/* dtrace is S10 or later */
95 
96 #include "sv.h"
97 #include "sv_impl.h"
98 #include "sv_efi.h"
99 
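/*
 * Upper bound on the number of EINTR retries that sv_reserve() will make
 * when nsc_reserve() is interrupted, before giving up and returning the
 * error to the caller.
 */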
100 #define	MAX_EINTR_COUNT 1000
101 
102 /*
103  * sv_mod_status
104  */
105 #define	SV_PREVENT_UNLOAD 1
106 #define	SV_ALLOW_UNLOAD	2
107 
108 static const int sv_major_rev = ISS_VERSION_MAJ;	/* Major number */
109 static const int sv_minor_rev = ISS_VERSION_MIN;	/* Minor number */
110 static const int sv_micro_rev = ISS_VERSION_MIC;	/* Micro number */
111 static const int sv_baseline_rev = ISS_VERSION_NUM;	/* Baseline number */
112 
113 #ifdef DKIOCPARTITION
114 /*
115  * CRC32 polynomial table needed for computing the checksums
116  * in an EFI vtoc.
117  */
118 static const uint32_t sv_crc32_table[256] = { CRC32_TABLE };
119 #endif
120 
121 static clock_t sv_config_time;		/* Time of successful {en,dis}able */
122 static int sv_debug;			/* Set non-zero for debug to syslog */
123 static int sv_mod_status;		/* Set to prevent modunload */
124 
125 static dev_info_t *sv_dip;		/* Single DIP for driver */
126 static kmutex_t sv_mutex;		/* Protect global lists, etc. */
127 
128 static nsc_mem_t	*sv_mem;	/* nsctl memory allocator token */
129 
130 
131 /*
132  * Per device and per major state.
133  */
134 
135 #ifndef _SunOS_5_6
136 #define	UNSAFE_ENTER()
137 #define	UNSAFE_EXIT()
138 #else
139 #define	UNSAFE_ENTER()	mutex_enter(&unsafe_driver)
140 #define	UNSAFE_EXIT()	mutex_exit(&unsafe_driver)
141 #endif
142 
143 					/* hash table of major dev structures */
144 static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0};
145 static sv_dev_t *sv_devs;		/* array of per device structures */
146 static int sv_max_devices;		/* SV version of nsc_max_devices() */
147 static int sv_ndevices;			/* number of SV enabled devices */
148 
149 /*
150  * Threading.
151  */
152 
153 int sv_threads_max = 1024;		/* maximum # to dynamically alloc */
154 int sv_threads = 32;			/* # to pre-allocate (see sv.conf) */
155 int sv_threads_extra = 0;		/* addl # we would have alloc'ed */
156 
157 static nstset_t *sv_tset;		/* the threadset pointer */
158 
159 static int sv_threads_hysteresis = 4;	/* hysteresis for threadset resizing */
160 static int sv_threads_dev = 2;		/* # of threads to alloc per device */
161 static int sv_threads_inc = 8;		/* increment for changing the set */
162 static int sv_threads_needed;		/* number of threads needed */
163 static int sv_no_threads;		/* number of nsc_create errors */
164 static int sv_max_nlive;		/* max number of threads running */
165 
166 
167 
168 /*
169  * nsctl fd callbacks.
170  */
171 
172 static int svattach_fd(blind_t);
173 static int svdetach_fd(blind_t);
174 
175 static nsc_def_t sv_fd_def[] = {
176 	{ "Attach",	(uintptr_t)svattach_fd, },
177 	{ "Detach",	(uintptr_t)svdetach_fd, },
178 	{ 0, 0, }
179 };
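/*
 * These Attach/Detach callbacks are handed to nsc_open() via sv_fd_def
 * (see sv_enable()).  nsctl invokes them when the underlying device is
 * attached or detached, so that sv can refresh its cached partition size
 * (sv_nblocks) and maximum transfer size (sv_maxfbas).
 */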
180 
181 /*
182  * cb_ops functions.
183  */
184 
185 static int svopen(dev_t *, int, int, cred_t *);
186 static int svclose(dev_t, int, int, cred_t *);
187 static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *);
188 static int svprint(dev_t, char *);
189 
190 /*
191  * These next functions are layered into the underlying driver's devops.
192  */
193 
194 static int sv_lyr_open(dev_t *, int, int, cred_t *);
195 static int sv_lyr_close(dev_t, int, int, cred_t *);
196 static int sv_lyr_strategy(struct buf *);
197 static int sv_lyr_read(dev_t, struct uio *, cred_t *);
198 static int sv_lyr_write(dev_t, struct uio *, cred_t *);
199 static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *);
200 static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *);
201 static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
202 
203 static struct cb_ops sv_cb_ops = {
204 	svopen,		/* open */
205 	svclose,	/* close */
206 	nulldev,	/* strategy */
207 	svprint,
208 	nodev,		/* dump */
209 	nodev,		/* read */
210 	nodev,		/* write */
211 	svioctl,
212 	nodev,		/* devmap */
213 	nodev,		/* mmap */
214 	nodev,		/* segmap */
215 	nochpoll,	/* poll */
216 	ddi_prop_op,
217 	NULL,		/* NOT a stream */
218 	D_NEW | D_MP | D_64BIT,
219 	CB_REV,
220 	nodev,		/* aread */
221 	nodev,		/* awrite */
222 };
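/*
 * These cb_ops serve only sv's own pseudo-device node (the "sv" minor
 * created in sv_attach), which is used for configuration ioctls.  The
 * read/write/strategy entries are stubs because all data I/O goes through
 * the sv_lyr_* routines interposed onto the underlying disk drivers.
 */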
223 
224 
225 /*
226  * dev_ops functions.
227  */
228 
229 static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
230 static int sv_attach(dev_info_t *, ddi_attach_cmd_t);
231 static int sv_detach(dev_info_t *, ddi_detach_cmd_t);
232 
233 static struct dev_ops sv_ops = {
234 	DEVO_REV,
235 	0,
236 	sv_getinfo,
237 	nulldev,	/* identify */
238 	nulldev,	/* probe */
239 	sv_attach,
240 	sv_detach,
241 	nodev,		/* reset */
242 	&sv_cb_ops,
243 	(struct bus_ops *)0
244 };
245 
246 /*
247  * Module linkage.
248  */
249 
250 extern struct mod_ops mod_driverops;
251 
252 static struct modldrv modldrv = {
253 	&mod_driverops,
254 	"nws:Storage Volume:" ISS_VERSION_STR,
255 	&sv_ops
256 };
257 
258 static struct modlinkage modlinkage = {
259 	MODREV_1,
260 	&modldrv,
261 	0
262 };
263 
264 
265 int
266 _init(void)
267 {
268 	int error;
269 
270 	mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL);
271 
272 	if ((error = mod_install(&modlinkage)) != 0) {
273 		mutex_destroy(&sv_mutex);
274 		return (error);
275 	}
276 
277 #ifdef DEBUG
278 	cmn_err(CE_CONT, "!sv (revision %d.%d.%d.%d, %s, %s)\n",
279 	    sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev,
280 	    ISS_VERSION_STR, BUILD_DATE_STR);
281 #else
282 	if (sv_micro_rev) {
283 		cmn_err(CE_CONT, "!sv (revision %d.%d.%d, %s, %s)\n",
284 		    sv_major_rev, sv_minor_rev, sv_micro_rev,
285 		    ISS_VERSION_STR, BUILD_DATE_STR);
286 	} else {
287 		cmn_err(CE_CONT, "!sv (revision %d.%d, %s, %s)\n",
288 		    sv_major_rev, sv_minor_rev,
289 		    ISS_VERSION_STR, BUILD_DATE_STR);
290 	}
291 #endif
292 
293 	return (error);
294 }
295 
296 
297 int
298 _fini(void)
299 {
300 	int error;
301 
302 	if ((error = mod_remove(&modlinkage)) != 0)
303 		return (error);
304 
305 	mutex_destroy(&sv_mutex);
306 
307 	return (error);
308 }
309 
310 
311 int
312 _info(struct modinfo *modinfop)
313 {
314 	return (mod_info(&modlinkage, modinfop));
315 }
316 
317 
318 /*
319  * Locking & State.
320  *
321  * sv_mutex protects config information - sv_maj_t and sv_dev_t lists;
322  * threadset creation and sizing; sv_ndevices.
323  *
324  * If we need to hold both sv_mutex and sv_lock, then the sv_mutex
325  * must be acquired first.
326  *
327  * sv_lock protects the sv_dev_t structure for an individual device.
328  *
329  * sv_olock protects the otyp/open members of the sv_dev_t.  If we need
330  * to hold both sv_lock and sv_olock, then the sv_lock must be acquired
331  * first.
332  *
333  * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple
334  * I/O operations to a device simultaneously, as above.
335  *
336  * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur
337  * with sv_lock write-locked must be done with (sv_state == SV_PENDING)
338  * and (sv_pending == curthread) so that any recursion through
339  * sv_lyr_open/sv_lyr_close can be detected.
340  */
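/*
 * A minimal lock-ordering sketch derived from the rules above (not a real
 * code path in this file):
 *
 *	mutex_enter(&sv_mutex);			-- (1) global config lock
 *	rw_enter(&svp->sv_lock, RW_WRITER);	-- (2) per-device state lock
 *	mutex_enter(&svp->sv_olock);		-- (3) otyp/open-count lock
 *	...
 *	mutex_exit(&svp->sv_olock);
 *	rw_exit(&svp->sv_lock);
 *	mutex_exit(&sv_mutex);
 */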
341 
342 
343 static int
344 sv_init_devs(void)
345 {
346 	int i;
347 
348 	ASSERT(MUTEX_HELD(&sv_mutex));
349 
350 	if (sv_max_devices > 0)
351 		return (0);
352 
353 	sv_max_devices = nsc_max_devices();
354 
355 	if (sv_max_devices <= 0) {
356 		/* nsctl is not attached (nskernd not running) */
357 		if (sv_debug > 0)
358 			cmn_err(CE_CONT, "!sv: nsc_max_devices = 0\n");
359 		return (EAGAIN);
360 	}
361 
362 	sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)),
363 	    KM_NOSLEEP, sv_mem);
364 
365 	if (sv_devs == NULL) {
366 		cmn_err(CE_WARN, "!sv: could not allocate sv_devs array");
367 		return (ENOMEM);
368 	}
369 
370 	for (i = 0; i < sv_max_devices; i++) {
371 		mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL);
372 		rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL);
373 	}
374 
375 	if (sv_debug > 0)
376 		cmn_err(CE_CONT, "!sv: sv_init_devs successful\n");
377 
378 	return (0);
379 }
380 
381 
382 static int
383 sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
384 {
385 	int rc;
386 
387 	switch (cmd) {
388 
389 	case DDI_ATTACH:
390 		sv_dip = dip;
391 
392 		if (ddi_create_minor_node(dip, "sv", S_IFCHR,
393 		    0, DDI_PSEUDO, 0) != DDI_SUCCESS)
394 			goto failed;
395 
396 		mutex_enter(&sv_mutex);
397 
398 		sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0);
399 		if (sv_mem == NULL) {
400 			mutex_exit(&sv_mutex);
401 			goto failed;
402 		}
403 
404 		rc = sv_init_devs();
405 		if (rc != 0 && rc != EAGAIN) {
406 			mutex_exit(&sv_mutex);
407 			goto failed;
408 		}
409 
410 		mutex_exit(&sv_mutex);
411 
412 
413 		ddi_report_dev(dip);
414 
415 		sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
416 		    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
417 		    "sv_threads", sv_threads);
418 
419 		if (sv_debug > 0)
420 			cmn_err(CE_CONT, "!sv: sv_threads=%d\n", sv_threads);
421 
422 		if (sv_threads > sv_threads_max)
423 			sv_threads_max = sv_threads;
424 
425 		return (DDI_SUCCESS);
426 
427 	default:
428 		return (DDI_FAILURE);
429 	}
430 
431 failed:
432 	DTRACE_PROBE(sv_attach_failed);
433 	(void) sv_detach(dip, DDI_DETACH);
434 	return (DDI_FAILURE);
435 }
436 
437 
438 static int
439 sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
440 {
441 	sv_dev_t *svp;
442 	int i;
443 
444 	switch (cmd) {
445 
446 	case DDI_DETACH:
447 
448 		/*
449 		 * Check that everything is disabled.
450 		 */
451 
452 		mutex_enter(&sv_mutex);
453 
454 		if (sv_mod_status == SV_PREVENT_UNLOAD) {
455 			mutex_exit(&sv_mutex);
456 			DTRACE_PROBE(sv_detach_err_prevent);
457 			return (DDI_FAILURE);
458 		}
459 
460 		for (i = 0; sv_devs && i < sv_max_devices; i++) {
461 			svp = &sv_devs[i];
462 
463 			if (svp->sv_state != SV_DISABLE) {
464 				mutex_exit(&sv_mutex);
465 				DTRACE_PROBE(sv_detach_err_busy);
466 				return (DDI_FAILURE);
467 			}
468 		}
469 
470 
471 		for (i = 0; sv_devs && i < sv_max_devices; i++) {
472 			mutex_destroy(&sv_devs[i].sv_olock);
473 			rw_destroy(&sv_devs[i].sv_lock);
474 		}
475 
476 		if (sv_devs) {
477 			nsc_kmem_free(sv_devs,
478 			    (sv_max_devices * sizeof (*sv_devs)));
479 			sv_devs = NULL;
480 		}
481 		sv_max_devices = 0;
482 
483 		if (sv_mem) {
484 			nsc_unregister_mem(sv_mem);
485 			sv_mem = NULL;
486 		}
487 
488 		mutex_exit(&sv_mutex);
489 
490 		/*
491 		 * Remove all minor nodes.
492 		 */
493 
494 		ddi_remove_minor_node(dip, NULL);
495 		sv_dip = NULL;
496 
497 		return (DDI_SUCCESS);
498 
499 	default:
500 		return (DDI_FAILURE);
501 	}
502 }
503 
504 static sv_maj_t *
505 sv_getmajor(const dev_t dev)
506 {
507 	sv_maj_t **insert, *maj;
508 	major_t umaj = getmajor(dev);
509 
510 	/*
511 	 * See if the hash table entry, or one of the hash chains
512 	 * is already allocated for this major number
513 	 */
514 	if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) {
515 		do {
516 			if (maj->sm_major == umaj)
517 				return (maj);
518 		} while ((maj = maj->sm_next) != 0);
519 	}
520 
521 	/*
522 	 * If the sv_mutex is held here, there is a design flaw, as the only
523 	 * callers that do not hold the mutex should be sv_enable() or
524 	 * sv_dev_to_sv().  Return an error instead of panicking the system.
525 	 */
526 	if (MUTEX_HELD(&sv_mutex)) {
527 		cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
528 		return (NULL);
529 	}
530 
531 	/*
532 	 * Determine where to allocate a new element in the hash table
533 	 */
534 	mutex_enter(&sv_mutex);
535 	insert = &(sv_majors[SV_MAJOR_HASH(umaj)]);
536 	for (maj = *insert; maj; maj = maj->sm_next) {
537 
538 		/* Did another thread beat us to it? */
539 		if (maj->sm_major == umaj)
540 		if (maj->sm_major == umaj) {
			mutex_exit(&sv_mutex);
			return (maj);
		}
542 		/* Find a NULL insert point? */
543 		/* Remember the tail of the chain as the insert point */
544 			insert = &maj->sm_next;
545 	}
546 
547 	/*
548 	 * Located the new insert point
549 	 */
550 	*insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem);
551 	if ((maj = *insert) != 0)
552 		maj->sm_major = umaj;
553 	else
554 		cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
555 
556 	mutex_exit(&sv_mutex);
557 
558 	return (maj);
559 }
560 
561 /* ARGSUSED */
562 
563 static int
564 sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
565 {
566 	int rc = DDI_FAILURE;
567 
568 	switch (infocmd) {
569 
570 	case DDI_INFO_DEVT2DEVINFO:
571 		*result = sv_dip;
572 		rc = DDI_SUCCESS;
573 		break;
574 
575 	case DDI_INFO_DEVT2INSTANCE:
576 		/*
577 		 * We only have a single instance.
578 		 */
579 		*result = 0;
580 		rc = DDI_SUCCESS;
581 		break;
582 
583 	default:
584 		break;
585 	}
586 
587 	return (rc);
588 }
589 
590 
591 /*
592  * Hashing of devices onto major device structures.
593  *
594  * Individual device structures are hashed onto one of the sm_hash[]
595  * buckets in the relevant major device structure.
596  *
597  * Hash insertion and deletion -must- be done with sv_mutex held.  Hash
598  * searching does not require the mutex because of the sm_seq member.
599  * sm_seq is incremented on each insertion (-after- hash chain pointer
600  * manipulation) and each deletion (-before- hash chain pointer
601  * manipulation).  When searching the hash chain, the seq number is
602  * checked before accessing each device structure; if the seq number has
603  * changed, then we restart the search from the top of the hash chain.
604  * If we restart more than SV_HASH_RETRY times, we take sv_mutex and search
605  * the hash chain (we are guaranteed that this search cannot be
606  * interrupted).
607  */
608 
609 #define	SV_HASH_RETRY	16
610 
611 static sv_dev_t *
612 sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp)
613 {
614 	minor_t umin = getminor(dev);
615 	sv_dev_t **hb, *next, *svp;
616 	sv_maj_t *maj;
617 	int seq;
618 	int try;
619 
620 	/* Get major hash table */
621 	maj = sv_getmajor(dev);
622 	if (majpp)
623 		*majpp = maj;
624 	if (maj == NULL)
625 		return (NULL);
626 
627 	if (maj->sm_inuse == 0) {
628 		DTRACE_PROBE1(
629 		    sv_dev_to_sv_end,
630 		    dev_t, dev);
631 		return (NULL);
632 	}
633 
634 	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
635 	try = 0;
636 
637 retry:
638 	if (try > SV_HASH_RETRY)
639 		mutex_enter(&sv_mutex);
640 
641 	seq = maj->sm_seq;
642 	for (svp = *hb; svp; svp = next) {
643 		next = svp->sv_hash;
644 
645 		nsc_membar_stld();	/* preserve register load order */
646 
647 		if (maj->sm_seq != seq) {
648 			DTRACE_PROBE1(sv_dev_to_sv_retry, dev_t, dev);
649 			try++;
650 			goto retry;
651 		}
652 
653 		if (svp->sv_dev == dev)
654 			break;
655 	}
656 
657 	if (try > SV_HASH_RETRY)
658 		mutex_exit(&sv_mutex);
659 
660 	return (svp);
661 }
662 
663 
664 /*
665  * Must be called with sv_mutex held.
666  */
667 
668 static int
669 sv_get_state(const dev_t udev, sv_dev_t **svpp)
670 {
671 	sv_dev_t **hb, **insert, *svp;
672 	sv_maj_t *maj;
673 	minor_t umin;
674 	int i;
675 
676 	/* Get major hash table */
677 	if ((maj = sv_getmajor(udev)) == NULL)
678 		return (SV_EBADDEV);
679 
680 	/* Determine which minor hash table */
681 	umin = getminor(udev);
682 	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
683 
684 	/* look for clash */
685 
686 	insert = hb;
687 
688 	for (svp = *hb; svp; svp = svp->sv_hash) {
689 		if (svp->sv_dev == udev)
690 			break;
691 
692 		if (svp->sv_hash == NULL)
693 			insert = &svp->sv_hash;
694 	}
695 
696 	if (svp) {
697 		DTRACE_PROBE1(
698 		    sv_get_state_enabled,
699 		    dev_t, udev);
700 		return (SV_EENABLED);
701 	}
702 
703 	/* look for spare sv_devs slot */
704 
705 	for (i = 0; i < sv_max_devices; i++) {
706 		svp = &sv_devs[i];
707 
708 		if (svp->sv_state == SV_DISABLE)
709 			break;
710 	}
711 
712 	if (i >= sv_max_devices) {
713 		DTRACE_PROBE1(
714 		    sv_get_state_noslots,
715 		    dev_t, udev);
716 		return (SV_ENOSLOTS);
717 	}
718 
719 	svp->sv_state = SV_PENDING;
720 	svp->sv_pending = curthread;
721 
722 	*insert = svp;
723 	svp->sv_hash = NULL;
724 	maj->sm_seq++;		/* must be after the store to the hash chain */
725 
726 	*svpp = svp;
727 
728 	/*
729 	 * We do not know the size of the underlying device at
730 	 * this stage, so initialise "nblocks" property to
731 	 * zero, and update it whenever we succeed in
732 	 * nsc_reserve'ing the underlying nsc_fd_t.
733 	 */
734 
735 	svp->sv_nblocks = 0;
736 
737 	return (0);
738 }
739 
740 
741 /*
742  * Remove a device structure from its hash chain.
743  * Must be called with sv_mutex held.
744  */
745 
746 static void
747 sv_rm_hash(sv_dev_t *svp)
748 {
749 	sv_dev_t **svpp;
750 	sv_maj_t *maj;
751 
752 	/* Get major hash table */
753 	if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
754 		return;
755 
756 	/* remove svp from hash chain */
757 
758 	svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]);
759 	while (*svpp) {
760 		if (*svpp == svp) {
761 			/*
762 			 * increment of sm_seq must be before the
763 			 * removal from the hash chain
764 			 */
765 			maj->sm_seq++;
766 			*svpp = svp->sv_hash;
767 			break;
768 		}
769 
770 		svpp = &(*svpp)->sv_hash;
771 	}
772 
773 	svp->sv_hash = NULL;
774 }
775 
776 /*
777  * Free (disable) a device structure.
778  * Must be called with sv_lock(RW_WRITER) and sv_mutex held, and will
779  * perform the exits during its processing.
780  */
781 
782 static int
783 sv_free(sv_dev_t *svp, const int error)
784 {
785 	struct cb_ops *cb_ops;
786 	sv_maj_t *maj;
787 
788 	/* Get major hash table */
789 	if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
790 		return (NULL);
791 
792 	svp->sv_state = SV_PENDING;
793 	svp->sv_pending = curthread;
794 
795 	/*
796 	 * Close the fd's before removing from the hash or swapping
797 	 * back the cb_ops pointers so that the cache flushes before new
798 	 * io can come in.
799 	 */
800 
801 	if (svp->sv_fd) {
802 		(void) nsc_close(svp->sv_fd);
803 		svp->sv_fd = 0;
804 	}
805 
806 	sv_rm_hash(svp);
807 
808 	if (error != SV_ESDOPEN &&
809 	    error != SV_ELYROPEN && --maj->sm_inuse == 0) {
810 
811 		if (maj->sm_dev_ops)
812 			cb_ops = maj->sm_dev_ops->devo_cb_ops;
813 		else
814 			cb_ops = NULL;
815 
816 		if (cb_ops && maj->sm_strategy != NULL) {
817 			cb_ops->cb_strategy = maj->sm_strategy;
818 			cb_ops->cb_close = maj->sm_close;
819 			cb_ops->cb_ioctl = maj->sm_ioctl;
820 			cb_ops->cb_write = maj->sm_write;
821 			cb_ops->cb_open = maj->sm_open;
822 			cb_ops->cb_read = maj->sm_read;
823 			cb_ops->cb_flag = maj->sm_flag;
824 
825 			if (maj->sm_awrite)
826 				cb_ops->cb_awrite = maj->sm_awrite;
827 
828 			if (maj->sm_aread)
829 				cb_ops->cb_aread = maj->sm_aread;
830 
831 			/*
832 			 * corbin XXX
833 			 * Leave backing device ops in maj->sm_*
834 			 * to handle any requests that might come
835 			 * in during the disable.  This could be
836 			 * a problem however if the backing device
837 			 * driver is changed while we process these
838 			 * requests.
839 			 *
840 			 * maj->sm_strategy = 0;
841 			 * maj->sm_awrite = 0;
842 			 * maj->sm_write = 0;
843 			 * maj->sm_ioctl = 0;
844 			 * maj->sm_close = 0;
845 			 * maj->sm_aread = 0;
846 			 * maj->sm_read = 0;
847 			 * maj->sm_open = 0;
848 			 * maj->sm_flag = 0;
849 			 *
850 			 */
851 		}
852 
853 		if (maj->sm_dev_ops) {
854 			maj->sm_dev_ops = 0;
855 		}
856 	}
857 
858 	if (svp->sv_lh) {
859 		cred_t *crp = ddi_get_cred();
860 
861 		/*
862 		 * Close the protective layered driver open using the
863 		 * Sun Private layered driver i/f.
864 		 */
865 
866 		(void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp);
867 		svp->sv_lh = NULL;
868 	}
869 
870 	svp->sv_timestamp = nsc_lbolt();
871 	svp->sv_state = SV_DISABLE;
872 	svp->sv_pending = NULL;
873 	rw_exit(&svp->sv_lock);
874 	mutex_exit(&sv_mutex);
875 
876 	return (error);
877 }
878 
879 /*
880  * Reserve the device, taking into account the possibility that
881  * the reserve might have to be retried.
882  */
883 static int
884 sv_reserve(nsc_fd_t *fd, int flags)
885 {
886 	int eintr_count;
887 	int rc;
888 
889 	eintr_count = 0;
890 	do {
891 		rc = nsc_reserve(fd, flags);
892 		if (rc == EINTR) {
893 			++eintr_count;
894 			delay(2);
895 		}
896 	} while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT));
897 
898 	return (rc);
899 }
900 
901 static int
902 sv_enable(const caddr_t path, const int flag,
903     const dev_t udev, spcs_s_info_t kstatus)
904 {
905 	struct dev_ops *dev_ops;
906 	struct cb_ops *cb_ops;
907 	sv_dev_t *svp;
908 	sv_maj_t *maj;
909 	nsc_size_t nblocks;
910 	int rc;
911 	cred_t *crp;
912 	ldi_ident_t	li;
913 
914 	if (udev == (dev_t)-1 || udev == 0) {
915 		DTRACE_PROBE1(
916 		    sv_enable_err_baddev,
917 		    dev_t, udev);
918 		return (SV_EBADDEV);
919 	}
920 
921 	if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) {
922 		DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev);
923 		return (SV_EAMODE);
924 	}
925 
926 	/* Get major hash table */
927 	if ((maj = sv_getmajor(udev)) == NULL)
928 		return (SV_EBADDEV);
929 
930 	mutex_enter(&sv_mutex);
931 
932 	rc = sv_get_state(udev, &svp);
933 	if (rc) {
934 		mutex_exit(&sv_mutex);
935 		DTRACE_PROBE1(sv_enable_err_state, dev_t, udev);
936 		return (rc);
937 	}
938 
939 	rw_enter(&svp->sv_lock, RW_WRITER);
940 
941 	/*
942 	 * Get real fd used for io
943 	 */
944 
945 	svp->sv_dev = udev;
946 	svp->sv_flag = flag;
947 
948 	/*
949 	 * OR in NSC_DEVICE to ensure that nskern grabs the real strategy
950 	 * function pointer before sv swaps them out.
951 	 */
952 
953 	svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE),
954 	    sv_fd_def, (blind_t)udev, &rc);
955 
956 	if (svp->sv_fd == NULL) {
957 		if (kstatus)
958 			spcs_s_add(kstatus, rc);
959 		DTRACE_PROBE1(sv_enable_err_fd, dev_t, udev);
960 		return (sv_free(svp, SV_ESDOPEN));
961 	}
962 
963 	/*
964 	 * Perform a layered driver open using the Sun Private layered
965 	 * driver i/f to ensure that the cb_ops structure for the driver
966 	 * is not detached out from under us whilst sv is enabled.
967 	 *
968 	 */
969 
970 	crp = ddi_get_cred();
971 	svp->sv_lh = NULL;
972 
973 	if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) {
974 		rc = ldi_open_by_dev(&svp->sv_dev,
975 		    OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li);
976 	}
977 
978 	if (rc != 0) {
979 		if (kstatus)
980 			spcs_s_add(kstatus, rc);
981 		DTRACE_PROBE1(sv_enable_err_lyr_open, dev_t, udev);
982 		return (sv_free(svp, SV_ELYROPEN));
983 	}
984 
985 	/*
986 	 * Do layering if required - must happen after nsc_open().
987 	 */
988 
989 	if (maj->sm_inuse++ == 0) {
990 		maj->sm_dev_ops = nsc_get_devops(getmajor(udev));
991 
992 		if (maj->sm_dev_ops == NULL ||
993 		    maj->sm_dev_ops->devo_cb_ops == NULL) {
994 			DTRACE_PROBE1(sv_enable_err_load, dev_t, udev);
995 			return (sv_free(svp, SV_ELOAD));
996 		}
997 
998 		dev_ops = maj->sm_dev_ops;
999 		cb_ops = dev_ops->devo_cb_ops;
1000 
1001 		if (cb_ops->cb_strategy == NULL ||
1002 		    cb_ops->cb_strategy == nodev ||
1003 		    cb_ops->cb_strategy == nulldev) {
1004 			DTRACE_PROBE1(sv_enable_err_nostrategy, dev_t, udev);
1005 			return (sv_free(svp, SV_ELOAD));
1006 		}
1007 
1008 		if (cb_ops->cb_strategy == sv_lyr_strategy) {
1009 			DTRACE_PROBE1(sv_enable_err_svstrategy, dev_t, udev);
1010 			return (sv_free(svp, SV_ESTRATEGY));
1011 		}
1012 
1013 		maj->sm_strategy = cb_ops->cb_strategy;
1014 		maj->sm_close = cb_ops->cb_close;
1015 		maj->sm_ioctl = cb_ops->cb_ioctl;
1016 		maj->sm_write = cb_ops->cb_write;
1017 		maj->sm_open = cb_ops->cb_open;
1018 		maj->sm_read = cb_ops->cb_read;
1019 		maj->sm_flag = cb_ops->cb_flag;
1020 
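		/*
		 * Interpose sv's layered entry points.  D_MP is OR'd into
		 * cb_flag because sv's own entry points are MP-safe; the
		 * original flag value was saved in sm_flag above, and the
		 * sv_lyr_* routines check it so that non-D_MP drivers are
		 * still wrapped with UNSAFE_ENTER()/UNSAFE_EXIT().
		 */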
1021 		cb_ops->cb_flag = cb_ops->cb_flag | D_MP;
1022 		cb_ops->cb_strategy = sv_lyr_strategy;
1023 		cb_ops->cb_close = sv_lyr_close;
1024 		cb_ops->cb_ioctl = sv_lyr_ioctl;
1025 		cb_ops->cb_write = sv_lyr_write;
1026 		cb_ops->cb_open = sv_lyr_open;
1027 		cb_ops->cb_read = sv_lyr_read;
1028 
1029 		/*
1030 		 * Check that the driver has async I/O entry points
1031 		 * before changing them.
1032 		 */
1033 
1034 		if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) {
1035 			maj->sm_awrite = 0;
1036 			maj->sm_aread = 0;
1037 		} else {
1038 			maj->sm_awrite = cb_ops->cb_awrite;
1039 			maj->sm_aread = cb_ops->cb_aread;
1040 
1041 			cb_ops->cb_awrite = sv_lyr_awrite;
1042 			cb_ops->cb_aread = sv_lyr_aread;
1043 		}
1044 
1045 		/*
1046 		 * Bug 4645743
1047 		 *
1048 		 * Prevent sv from ever unloading after it has interposed
1049 		 * on a major device because there is a race between
1050 		 * sv removing its layered entry points from the target
1051 		 * dev_ops, a client coming in and accessing the driver,
1052 		 * and the kernel modunloading the sv text.
1053 		 *
1054 		 * To allow unload, do svboot -u, which only happens in
1055 		 * To allow unload, do svboot -u, which only happens at
1056 		 */
1057 		ASSERT(MUTEX_HELD(&sv_mutex));
1058 		sv_mod_status = SV_PREVENT_UNLOAD;
1059 	}
1060 
1061 
1062 	svp->sv_timestamp = nsc_lbolt();
1063 	svp->sv_state = SV_ENABLE;
1064 	svp->sv_pending = NULL;
1065 	rw_exit(&svp->sv_lock);
1066 
1067 	sv_ndevices++;
1068 	mutex_exit(&sv_mutex);
1069 
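	/*
	 * Take a transient reserve so that nsctl attaches the device and
	 * runs the svattach_fd callback, which should fill in sv_nblocks;
	 * the size is only needed for the informational message below.
	 */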
1070 	nblocks = 0;
1071 	if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) {
1072 		nblocks = svp->sv_nblocks;
1073 		nsc_release(svp->sv_fd);
1074 	}
1075 
1076 	cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n",
1077 	    svp->sv_dev, nblocks);
1078 
1079 	return (0);
1080 }
1081 
1082 
1083 static int
1084 sv_prepare_unload()
1085 {
1086 	int rc = 0;
1087 
1088 	mutex_enter(&sv_mutex);
1089 
1090 	if (sv_mod_status == SV_PREVENT_UNLOAD) {
1091 		if ((sv_ndevices != 0) || (sv_tset != NULL)) {
1092 			rc = EBUSY;
1093 		} else {
1094 			sv_mod_status = SV_ALLOW_UNLOAD;
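			/*
			 * Pause for SV_WAIT_UNLOAD seconds to give any
			 * remaining callers time to drain before the module
			 * text can actually be unloaded.
			 */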
1095 			delay(SV_WAIT_UNLOAD * drv_usectohz(1000000));
1096 		}
1097 	}
1098 
1099 	mutex_exit(&sv_mutex);
1100 	return (rc);
1101 }
1102 
1103 static int
1104 svattach_fd(blind_t arg)
1105 {
1106 	dev_t dev = (dev_t)arg;
1107 	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1108 	int rc;
1109 
1110 	if (sv_debug > 0)
1111 		cmn_err(CE_CONT, "!svattach_fd(%p, %p)\n", arg, (void *)svp);
1112 
1113 	if (svp == NULL) {
1114 		cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg);
1115 		return (0);
1116 	}
1117 
1118 	if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) {
1119 		cmn_err(CE_WARN,
1120 		    "!svattach_fd: nsc_partsize() failed, rc %d", rc);
1121 		svp->sv_nblocks = 0;
1122 	}
1123 
1124 	if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) {
1125 		cmn_err(CE_WARN,
1126 		    "!svattach_fd: nsc_maxfbas() failed, rc %d", rc);
1127 		svp->sv_maxfbas = 0;
1128 	}
1129 
1130 	if (sv_debug > 0) {
1131 		cmn_err(CE_CONT,
1132 		    "!svattach_fd(%p): size %" NSC_SZFMT ", "
1133 		    "maxfbas %" NSC_SZFMT "\n",
1134 		    arg, svp->sv_nblocks, svp->sv_maxfbas);
1135 	}
1136 
1137 	return (0);
1138 }
1139 
1140 
1141 static int
1142 svdetach_fd(blind_t arg)
1143 {
1144 	dev_t dev = (dev_t)arg;
1145 	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1146 
1147 	if (sv_debug > 0)
1148 		cmn_err(CE_CONT, "!svdetach_fd(%p, %p)\n", arg, (void *)svp);
1149 
1150 	/* svp can be NULL during disable of an sv */
1151 	if (svp == NULL)
1152 		return (0);
1153 
1154 	svp->sv_maxfbas = 0;
1155 	svp->sv_nblocks = 0;
1156 	return (0);
1157 }
1158 
1159 
1160 /*
1161  * Side effect: if called with (guard != 0), then expects both sv_mutex
1162  * and sv_lock(RW_WRITER) to be held, and will release them before returning.
1163  */
1164 
1165 /* ARGSUSED */
1166 static int
1167 sv_disable(dev_t dev, spcs_s_info_t kstatus)
1168 {
1169 	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1170 
1171 	if (svp == NULL) {
1172 
1173 		DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp);
1174 		return (SV_ENODEV);
1175 	}
1176 
1177 	mutex_enter(&sv_mutex);
1178 	rw_enter(&svp->sv_lock, RW_WRITER);
1179 
1180 	if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) {
1181 		rw_exit(&svp->sv_lock);
1182 		mutex_exit(&sv_mutex);
1183 
1184 		DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp);
1185 		return (SV_EDISABLED);
1186 	}
1187 
1188 
1189 	sv_ndevices--;
1190 	return (sv_free(svp, 0));
1191 }
1192 
1193 
1194 
1195 static int
1196 sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp)
1197 {
1198 	nsc_buf_t *tmph;
1199 	sv_dev_t *svp;
1200 	sv_maj_t *maj;
1201 	int (*fn)();
1202 	dev_t odev;
1203 	int ret;
1204 	int rc;
1205 
1206 	svp = sv_dev_to_sv(*devp, &maj);
1207 
1208 	if (svp) {
1209 		if (svp->sv_state == SV_PENDING &&
1210 		    svp->sv_pending == curthread) {
1211 			/*
1212 			 * This is a recursive open from a call to
1213 			 * ddi_lyr_open_by_devt and so we just want
1214 			 * to pass it straight through to the
1215 			 * underlying driver.
1216 			 */
1217 			DTRACE_PROBE2(sv_lyr_open_recursive,
1218 			    sv_dev_t *, svp,
1219 			    dev_t, *devp);
1220 			svp = NULL;
1221 		} else
1222 			rw_enter(&svp->sv_lock, RW_READER);
1223 	}
1224 
1225 	odev = *devp;
1226 
1227 	if (maj && (fn = maj->sm_open) != 0) {
1228 		if (!(maj->sm_flag & D_MP)) {
1229 			UNSAFE_ENTER();
1230 			ret = (*fn)(devp, flag, otyp, crp);
1231 			UNSAFE_EXIT();
1232 		} else {
1233 			ret = (*fn)(devp, flag, otyp, crp);
1234 		}
1235 
1236 		if (ret == 0) {
1237 			/*
1238 			 * Re-acquire svp if the driver changed *devp.
1239 			 */
1240 
1241 			if (*devp != odev) {
1242 				if (svp != NULL)
1243 					rw_exit(&svp->sv_lock);
1244 
1245 				svp = sv_dev_to_sv(*devp, NULL);
1246 
1247 				if (svp) {
1248 					rw_enter(&svp->sv_lock, RW_READER);
1249 				}
1250 			}
1251 		}
1252 	} else {
1253 		ret = ENODEV;
1254 	}
1255 
1256 	if (svp && ret != 0 && svp->sv_state == SV_ENABLE) {
1257 		/*
1258 		 * Underlying DDI open failed, but we have this
1259 		 * device SV enabled.  If we can read some data
1260 		 * from the device, fake a successful open (this
1261 		 * probably means that this device is RDC'd and we
1262 		 * are getting the data from the secondary node).
1263 		 *
1264 		 * The reserve must be done with NSC_TRY|NSC_NOWAIT to
1265 		 * ensure that it does not deadlock if this open is
1266 		 * coming from nskernd:get_bsize().
1267 		 */
1268 		rc = sv_reserve(svp->sv_fd,
1269 		    NSC_TRY | NSC_NOWAIT | NSC_MULTI | NSC_PCATCH);
1270 		if (rc == 0) {
1271 			tmph = NULL;
1272 
1273 			rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph);
1274 			if (rc <= 0) {
1275 				/* success */
1276 				ret = 0;
1277 			}
1278 
1279 			if (tmph) {
1280 				(void) nsc_free_buf(tmph);
1281 				tmph = NULL;
1282 			}
1283 
1284 			nsc_release(svp->sv_fd);
1285 
1286 			/*
1287 			 * Count the number of layered opens that we
1288 			 * fake since we have to fake a matching number
1289 			 * of closes (OTYP_LYR open/close calls must be
1290 			 * paired).
1291 			 */
1292 
1293 			if (ret == 0 && otyp == OTYP_LYR) {
1294 				mutex_enter(&svp->sv_olock);
1295 				svp->sv_openlcnt++;
1296 				mutex_exit(&svp->sv_olock);
1297 			}
1298 		}
1299 	}
1300 
1301 	if (svp) {
1302 		rw_exit(&svp->sv_lock);
1303 	}
1304 
1305 	return (ret);
1306 }
1307 
1308 
1309 static int
1310 sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp)
1311 {
1312 	sv_dev_t *svp;
1313 	sv_maj_t *maj;
1314 	int (*fn)();
1315 	int ret;
1316 
1317 	svp = sv_dev_to_sv(dev, &maj);
1318 
1319 	if (svp &&
1320 	    svp->sv_state == SV_PENDING &&
1321 	    svp->sv_pending == curthread) {
1322 		/*
1323 		 * This is a recursive close from a call to
1324 		 * ddi_lyr_close and so we just want
1325 		 * to pass it straight through to the
1326 		 * underlying driver.
1327 		 */
1328 		DTRACE_PROBE2(sv_lyr_close_recursive, sv_dev_t *, svp,
1329 		    dev_t, dev);
1330 		svp = NULL;
1331 	}
1332 
1333 	if (svp) {
1334 		rw_enter(&svp->sv_lock, RW_READER);
1335 
1336 		if (otyp == OTYP_LYR) {
1337 			mutex_enter(&svp->sv_olock);
1338 
1339 			if (svp->sv_openlcnt) {
1340 				/*
1341 				 * Consume sufficient layered closes to
1342 				 * account for the opens that we faked
1343 				 * whilst the device was failed.
1344 				 */
1345 				svp->sv_openlcnt--;
1346 				mutex_exit(&svp->sv_olock);
1347 				rw_exit(&svp->sv_lock);
1348 
1349 				DTRACE_PROBE1(sv_lyr_close_end, dev_t, dev);
1350 
1351 				return (0);
1352 			}
1353 
1354 			mutex_exit(&svp->sv_olock);
1355 		}
1356 	}
1357 
1358 	if (maj && (fn = maj->sm_close) != 0) {
1359 		if (!(maj->sm_flag & D_MP)) {
1360 			UNSAFE_ENTER();
1361 			ret = (*fn)(dev, flag, otyp, crp);
1362 			UNSAFE_EXIT();
1363 		} else {
1364 			ret = (*fn)(dev, flag, otyp, crp);
1365 		}
1366 	} else {
1367 		ret = ENODEV;
1368 	}
1369 
1370 	if (svp) {
1371 		rw_exit(&svp->sv_lock);
1372 	}
1373 
1374 	return (ret);
1375 }
1376 
1377 
1378 /*
1379  * Convert the specified dev_t into a locked and enabled sv_dev_t, or
1380  * return NULL.
1381  */
1382 static sv_dev_t *
1383 sv_find_enabled(const dev_t dev, sv_maj_t **majpp)
1384 {
1385 	sv_dev_t *svp;
1386 
1387 	while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) {
1388 		rw_enter(&svp->sv_lock, RW_READER);
1389 
1390 		if (svp->sv_state == SV_ENABLE) {
1391 			/* locked and enabled */
1392 			break;
1393 		}
1394 
1395 		/*
1396 		 * State was changed while waiting on the lock.
1397 		 * Wait for a stable state.
1398 		 */
1399 		rw_exit(&svp->sv_lock);
1400 
1401 		DTRACE_PROBE1(sv_find_enabled_retry, dev_t, dev);
1402 
1403 		delay(2);
1404 	}
1405 
1406 	return (svp);
1407 }
1408 
1409 
1410 static int
1411 sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw)
1412 {
1413 	sv_dev_t *svp;
1414 	sv_maj_t *maj;
1415 	int (*fn)();
1416 	int rc = ENODEV;	/* default, if the driver has no read/write entry point */
1417 
1418 	svp = sv_find_enabled(dev, &maj);
1419 	if (svp == NULL) {
1420 		if (maj) {
1421 			if (rw == NSC_READ)
1422 				fn = maj->sm_read;
1423 			else
1424 				fn = maj->sm_write;
1425 
1426 			if (fn != 0) {
1427 				if (!(maj->sm_flag & D_MP)) {
1428 					UNSAFE_ENTER();
1429 					rc = (*fn)(dev, uiop, crp);
1430 					UNSAFE_EXIT();
1431 				} else {
1432 					rc = (*fn)(dev, uiop, crp);
1433 				}
1434 			}
1435 
1436 			return (rc);
1437 		} else {
1438 			return (ENODEV);
1439 		}
1440 	}
1441 
1442 	ASSERT(RW_READ_HELD(&svp->sv_lock));
1443 
1444 	if (svp->sv_flag == 0) {
1445 		/*
1446 		 * guard access mode
1447 		 * - prevent user level access to the device
1448 		 */
1449 		DTRACE_PROBE1(sv_lyr_uio_err_guard, uio_t *, uiop);
1450 		rc = EPERM;
1451 		goto out;
1452 	}
1453 
1454 	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
1455 		DTRACE_PROBE1(sv_lyr_uio_err_rsrv, uio_t *, uiop);
1456 		goto out;
1457 	}
1458 
1459 	if (rw == NSC_READ)
1460 		rc = nsc_uread(svp->sv_fd, uiop, crp);
1461 	else
1462 		rc = nsc_uwrite(svp->sv_fd, uiop, crp);
1463 
1464 	nsc_release(svp->sv_fd);
1465 
1466 out:
1467 	rw_exit(&svp->sv_lock);
1468 
1469 	return (rc);
1470 }
1471 
1472 
1473 static int
1474 sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp)
1475 {
1476 	return (sv_lyr_uio(dev, uiop, crp, NSC_READ));
1477 }
1478 
1479 
1480 static int
1481 sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp)
1482 {
1483 	return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE));
1484 }
1485 
1486 
1487 /* ARGSUSED */
1488 
1489 static int
1490 sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp)
1491 {
1492 	return (aphysio(sv_lyr_strategy,
1493 	    anocancel, dev, B_READ, minphys, aio));
1494 }
1495 
1496 
1497 /* ARGSUSED */
1498 
1499 static int
1500 sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp)
1501 {
1502 	return (aphysio(sv_lyr_strategy,
1503 	    anocancel, dev, B_WRITE, minphys, aio));
1504 }
1505 
1506 
1507 /*
1508  * Set up an array containing the list of raw path names
1509  * The array for the paths is svn and the size of the array is
1510  * in size.
1511  *
1512  * If there are more layered devices than will fit in the array,
1513  * the number of extra layered devices is returned via "extra".
1514  * Otherwise zero is returned via "extra".
1515  *
1516  * Input:
1517  *	svn	: array for paths
1518  *	size	: size of the array
1519  *
1520  * Output (extra):
1521  *	zero	: All paths fit in array
1522  *	>0	: Number of defined layered devices that don't fit in the array
1523  */
1524 
1525 static int
1526 sv_list(void *ptr, const int size, int *extra, const int ilp32)
1527 {
1528 	sv_name32_t *svn32;
1529 	sv_name_t *svn;
1530 	sv_dev_t *svp;
1531 	int *mode, *nblocks;
1532 	int i, index;
1533 	char *path;
1534 
1535 	*extra = 0;
1536 	index = 0;
1537 
1538 	if (ilp32)
1539 		svn32 = ptr;
1540 	else
1541 		svn = ptr;
1542 
1543 	mutex_enter(&sv_mutex);
1544 	for (i = 0; i < sv_max_devices; i++) {
1545 		svp = &sv_devs[i];
1546 
1547 		rw_enter(&svp->sv_lock, RW_READER);
1548 
1549 		if (svp->sv_state != SV_ENABLE) {
1550 			rw_exit(&svp->sv_lock);
1551 			continue;
1552 		}
1553 
1554 		if ((*extra) != 0 || ptr == NULL) {
1555 			/* Another overflow entry */
1556 			rw_exit(&svp->sv_lock);
1557 			(*extra)++;
1558 			continue;
1559 		}
1560 
1561 		if (ilp32) {
1562 			nblocks = &svn32->svn_nblocks;
1563 			mode = &svn32->svn_mode;
1564 			path = svn32->svn_path;
1565 
1566 			svn32->svn_timestamp = (uint32_t)svp->sv_timestamp;
1567 			svn32++;
1568 		} else {
1569 			nblocks = &svn->svn_nblocks;
1570 			mode = &svn->svn_mode;
1571 			path = svn->svn_path;
1572 
1573 			svn->svn_timestamp = svp->sv_timestamp;
1574 			svn++;
1575 		}
1576 
1577 		(void) strcpy(path, nsc_pathname(svp->sv_fd));
1578 		*nblocks = svp->sv_nblocks;
1579 		*mode = svp->sv_flag;
1580 
1581 		if (*nblocks == 0) {
1582 			if (sv_debug > 3)
1583 				cmn_err(CE_CONT, "!sv_list: need to reserve\n");
1584 
1585 			if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
1586 				*nblocks = svp->sv_nblocks;
1587 				nsc_release(svp->sv_fd);
1588 			}
1589 		}
1590 
1591 		if (++index >= size) {
1592 			/* Out of space */
1593 			(*extra)++;
1594 		}
1595 
1596 		rw_exit(&svp->sv_lock);
1597 	}
1598 	mutex_exit(&sv_mutex);
1599 
1600 	if (index < size) {
1601 		/* NULL terminated list */
1602 		if (ilp32)
1603 			svn32->svn_path[0] = '\0';
1604 		else
1605 			svn->svn_path[0] = '\0';
1606 	}
1607 
1608 	return (0);
1609 }
1610 
1611 
1612 static void
1613 sv_thread_tune(int threads)
1614 {
1615 	int incr = (threads > 0) ? 1 : -1;
1616 	int change = 0;
1617 	int nthreads;
1618 
1619 	ASSERT(MUTEX_HELD(&sv_mutex));
1620 
1621 	if (sv_threads_extra) {
1622 		/* keep track of any additional threads requested */
1623 		if (threads > 0) {
1624 			sv_threads_extra += threads;
1625 			return;
1626 		}
1627 		threads = -threads;
1628 		if (threads >= sv_threads_extra) {
1629 			threads -= sv_threads_extra;
1630 			sv_threads_extra = 0;
1631 			/* fall through to while loop */
1632 		} else {
1633 			sv_threads_extra -= threads;
1634 			return;
1635 		}
1636 	} else if (threads > 0) {
1637 		/*
1638 		 * do not increase the number of threads beyond
1639 		 * sv_threads_max when doing dynamic thread tuning
1640 		 */
1641 		nthreads = nst_nthread(sv_tset);
1642 		if ((nthreads + threads) > sv_threads_max) {
1643 			sv_threads_extra = nthreads + threads - sv_threads_max;
1644 			threads = sv_threads_max - nthreads;
1645 			if (threads <= 0)
1646 				return;
1647 		}
1648 	}
1649 
1650 	if (threads < 0)
1651 		threads = -threads;
1652 
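	/*
	 * Adjust sv_threads_needed one step at a time.  The threadset is
	 * grown in sv_threads_inc chunks as soon as demand reaches its
	 * current size, but is only shrunk once demand has dropped
	 * sv_threads_inc + sv_threads_hysteresis below it, and never below
	 * the configured sv_threads minimum.
	 */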
1653 	while (threads--) {
1654 		nthreads = nst_nthread(sv_tset);
1655 		sv_threads_needed += incr;
1656 
1657 		if (sv_threads_needed >= nthreads)
1658 			change += nst_add_thread(sv_tset, sv_threads_inc);
1659 		else if ((sv_threads_needed <
1660 		    (nthreads - (sv_threads_inc + sv_threads_hysteresis))) &&
1661 		    ((nthreads - sv_threads_inc) >= sv_threads))
1662 			change -= nst_del_thread(sv_tset, sv_threads_inc);
1663 	}
1664 
1665 #ifdef DEBUG
1666 	if (change) {
1667 		cmn_err(CE_NOTE,
1668 		    "!sv_thread_tune: threads needed %d, nthreads %d, "
1669 		    "nthreads change %d",
1670 		    sv_threads_needed, nst_nthread(sv_tset), change);
1671 	}
1672 #endif
1673 }
1674 
1675 
1676 /* ARGSUSED */
1677 static int
1678 svopen(dev_t *devp, int flag, int otyp, cred_t *crp)
1679 {
1680 	int rc;
1681 
1682 	mutex_enter(&sv_mutex);
1683 	rc = sv_init_devs();
1684 	mutex_exit(&sv_mutex);
1685 
1686 	return (rc);
1687 }
1688 
1689 
1690 /* ARGSUSED */
1691 static int
1692 svclose(dev_t dev, int flag, int otyp, cred_t *crp)
1693 {
1694 	const int secs = HZ * 5;
1695 	const int ticks = HZ / 10;
1696 	int loops = secs / ticks;
1697 
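	/*
	 * Wait up to five seconds, polling every tenth of a second, for the
	 * worker threads to go idle so that the threadset can be destroyed.
	 */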
1698 	mutex_enter(&sv_mutex);
1699 	while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) {
1700 		if (nst_nlive(sv_tset) <= 0) {
1701 			nst_destroy(sv_tset);
1702 			sv_tset = NULL;
1703 			break;
1704 		}
1705 
1706 		/* threads still active - wait for them to exit */
1707 		mutex_exit(&sv_mutex);
1708 		delay(ticks);
1709 		loops--;
1710 		mutex_enter(&sv_mutex);
1711 	}
1712 	mutex_exit(&sv_mutex);
1713 
1714 	if (loops <= 0) {
1715 		cmn_err(CE_WARN,
1716 #ifndef DEBUG
1717 		    /* do not write to console when non-DEBUG */
1718 		    "!"
1719 #endif
1720 		    "sv:svclose: threads still active "
1721 		    "after %d sec - leaking thread set", secs / HZ);
1722 	}
1723 
1724 	return (0);
1725 }
1726 
1727 
1728 static int
1729 svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp)
1730 {
1731 	char itmp1[12], itmp2[12]; /* temp char array for editing ints */
1732 	spcs_s_info_t kstatus;	/* Kernel version of spcs status */
1733 	spcs_s_info_t ustatus;	/* Address of user version of spcs status */
1734 	sv_list32_t svl32;	/* 32 bit Initial structure for SVIOC_LIST */
1735 	sv_version_t svv;	/* Version structure */
1736 	sv_conf_t svc;		/* User config structure */
1737 	sv_list_t svl;		/* Initial structure for SVIOC_LIST */
1738 	void *usvn;		/* Address of user sv_name_t */
1739 	void *svn = NULL;	/* Array for SVIOC_LIST */
1740 	uint64_t phash;		/* pathname hash */
1741 	int rc = 0;		/* Return code -- errno */
1742 	int size;		/* Number of items in array */
1743 	int bytes;		/* Byte size of array */
1744 	int ilp32;		/* Convert data structures for ilp32 userland */
1745 
1746 	*rvalp = 0;
1747 
1748 	/*
1749 	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, then processing continues.
1750 	 * Otherwise it was previously SV_PREVENT_UNLOAD and is now
1751 	 * SV_ALLOW_UNLOAD, and the driver is expected to unload eventually.
1752 	 *
1753 	 * SV_ALLOW_UNLOAD is final state, so no need to grab sv_mutex.
1754 	 */
1755 	if (sv_mod_status == SV_ALLOW_UNLOAD) {
1756 		return (EBUSY);
1757 	}
1758 
1759 	if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0))
1760 		return (rc);
1761 
1762 	kstatus = spcs_s_kcreate();
1763 	if (!kstatus) {
1764 		DTRACE_PROBE1(sv_ioctl_err_kcreate, dev_t, dev);
1765 		return (ENOMEM);
1766 	}
1767 
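	/*
	 * Determine whether the caller is a 32-bit process so that the ILP32
	 * versions of the ioctl structures are used for copyin/copyout.
	 */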
1768 	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
1769 
1770 	switch (cmd) {
1771 
1772 	case SVIOC_ENABLE:
1773 
1774 		if (ilp32) {
1775 			sv_conf32_t svc32;
1776 
1777 			if (ddi_copyin((void *)arg, &svc32,
1778 			    sizeof (svc32), mode) < 0) {
1779 				spcs_s_kfree(kstatus);
1780 				return (EFAULT);
1781 			}
1782 
1783 			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1784 			(void) strcpy(svc.svc_path, svc32.svc_path);
1785 			svc.svc_flag  = svc32.svc_flag;
1786 			svc.svc_major = svc32.svc_major;
1787 			svc.svc_minor = svc32.svc_minor;
1788 		} else {
1789 			if (ddi_copyin((void *)arg, &svc,
1790 			    sizeof (svc), mode) < 0) {
1791 				spcs_s_kfree(kstatus);
1792 				return (EFAULT);
1793 			}
1794 		}
1795 
1796 		/* force to raw access */
1797 		svc.svc_flag = NSC_DEVICE;
1798 
1799 		if (sv_tset == NULL) {
1800 			mutex_enter(&sv_mutex);
1801 
1802 			if (sv_tset == NULL) {
1803 				sv_tset = nst_init("sv_thr", sv_threads);
1804 			}
1805 
1806 			mutex_exit(&sv_mutex);
1807 
1808 			if (sv_tset == NULL) {
1809 				cmn_err(CE_WARN,
1810 				    "!sv: could not allocate %d threads",
1811 				    sv_threads);
1812 			}
1813 		}
1814 
1815 		rc = sv_enable(svc.svc_path, svc.svc_flag,
1816 		    makedevice(svc.svc_major, svc.svc_minor), kstatus);
1817 
1818 		if (rc == 0) {
1819 			sv_config_time = nsc_lbolt();
1820 
1821 			mutex_enter(&sv_mutex);
1822 			sv_thread_tune(sv_threads_dev);
1823 			mutex_exit(&sv_mutex);
1824 		}
1825 
1826 		DTRACE_PROBE3(sv_ioctl_end, dev_t, dev, int, *rvalp, int, rc);
1827 
1828 		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1829 		/* NOTREACHED */
1830 
1831 	case SVIOC_DISABLE:
1832 
1833 		if (ilp32) {
1834 			sv_conf32_t svc32;
1835 
1836 			if (ddi_copyin((void *)arg, &svc32,
1837 			    sizeof (svc32), mode) < 0) {
1838 				spcs_s_kfree(kstatus);
1839 				return (EFAULT);
1840 			}
1841 
1842 			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1843 			svc.svc_major = svc32.svc_major;
1844 			svc.svc_minor = svc32.svc_minor;
1845 			(void) strcpy(svc.svc_path, svc32.svc_path);
1846 			svc.svc_flag  = svc32.svc_flag;
1847 		} else {
1848 			if (ddi_copyin((void *)arg, &svc,
1849 			    sizeof (svc), mode) < 0) {
1850 				spcs_s_kfree(kstatus);
1851 				return (EFAULT);
1852 			}
1853 		}
1854 
1855 		if (svc.svc_major == (major_t)-1 &&
1856 		    svc.svc_minor == (minor_t)-1) {
1857 			sv_dev_t *svp;
1858 			int i;
1859 
1860 			/*
1861 			 * User level could not find the minor device
1862 			 * node, so do this the slow way by searching
1863 			 * the entire sv config for a matching pathname.
1864 			 */
1865 
1866 			phash = nsc_strhash(svc.svc_path);
1867 
1868 			mutex_enter(&sv_mutex);
1869 
1870 			for (i = 0; i < sv_max_devices; i++) {
1871 				svp = &sv_devs[i];
1872 
1873 				if (svp->sv_state == SV_DISABLE ||
1874 				    svp->sv_fd == NULL)
1875 					continue;
1876 
1877 				if (nsc_fdpathcmp(svp->sv_fd, phash,
1878 				    svc.svc_path) == 0) {
1879 					svc.svc_major = getmajor(svp->sv_dev);
1880 					svc.svc_minor = getminor(svp->sv_dev);
1881 					break;
1882 				}
1883 			}
1884 
1885 			mutex_exit(&sv_mutex);
1886 
1887 			if (svc.svc_major == (major_t)-1 &&
1888 			    svc.svc_minor == (minor_t)-1)
1889 				return (spcs_s_ocopyoutf(&kstatus,
1890 				    svc.svc_error, SV_ENODEV));
1891 		}
1892 
1893 		rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor),
1894 		    kstatus);
1895 
1896 		if (rc == 0) {
1897 			sv_config_time = nsc_lbolt();
1898 
1899 			mutex_enter(&sv_mutex);
1900 			sv_thread_tune(-sv_threads_dev);
1901 			mutex_exit(&sv_mutex);
1902 		}
1903 
1904 		DTRACE_PROBE3(sv_ioctl_2, dev_t, dev, int, *rvalp, int, rc);
1905 
1906 		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1907 		/* NOTREACHED */
1908 
1909 	case SVIOC_LIST:
1910 
1911 		if (ilp32) {
1912 			if (ddi_copyin((void *)arg, &svl32,
1913 			    sizeof (svl32), mode) < 0) {
1914 				spcs_s_kfree(kstatus);
1915 				return (EFAULT);
1916 			}
1917 
1918 			ustatus = (spcs_s_info_t)svl32.svl_error;
1919 			size = svl32.svl_count;
1920 			usvn = (void *)(unsigned long)svl32.svl_names;
1921 		} else {
1922 			if (ddi_copyin((void *)arg, &svl,
1923 			    sizeof (svl), mode) < 0) {
1924 				spcs_s_kfree(kstatus);
1925 				return (EFAULT);
1926 			}
1927 
1928 			ustatus = svl.svl_error;
1929 			size = svl.svl_count;
1930 			usvn = svl.svl_names;
1931 		}
1932 
1933 		/* Do some boundary checking */
1934 		if ((size < 0) || (size > sv_max_devices)) {
1935 			/* Array size is out of range */
1936 			return (spcs_s_ocopyoutf(&kstatus, ustatus,
1937 			    SV_EARRBOUNDS, "0",
1938 			    spcs_s_inttostring(sv_max_devices, itmp1,
1939 			    sizeof (itmp1), 0),
1940 			    spcs_s_inttostring(size, itmp2,
1941 			    sizeof (itmp2), 0)));
1942 		}
1943 
1944 		if (ilp32)
1945 			bytes = size * sizeof (sv_name32_t);
1946 		else
1947 			bytes = size * sizeof (sv_name_t);
1948 
1949 		/* Allocate memory for the array of structures */
1950 		if (bytes != 0) {
1951 			svn = kmem_zalloc(bytes, KM_SLEEP);
1952 			if (!svn) {
1953 				return (spcs_s_ocopyoutf(&kstatus,
1954 				    ustatus, ENOMEM));
1955 			}
1956 		}
1957 
1958 		rc = sv_list(svn, size, rvalp, ilp32);
1959 		if (rc) {
1960 			if (svn != NULL)
1961 				kmem_free(svn, bytes);
1962 			return (spcs_s_ocopyoutf(&kstatus, ustatus, rc));
1963 		}
1964 
1965 		if (ilp32) {
1966 			svl32.svl_timestamp = (uint32_t)sv_config_time;
1967 			svl32.svl_maxdevs = (int32_t)sv_max_devices;
1968 
1969 			/* Return the list structure */
1970 			if (ddi_copyout(&svl32, (void *)arg,
1971 			    sizeof (svl32), mode) < 0) {
1972 				spcs_s_kfree(kstatus);
1973 				if (svn != NULL)
1974 					kmem_free(svn, bytes);
1975 				return (EFAULT);
1976 			}
1977 		} else {
1978 			svl.svl_timestamp = sv_config_time;
1979 			svl.svl_maxdevs = sv_max_devices;
1980 
1981 			/* Return the list structure */
1982 			if (ddi_copyout(&svl, (void *)arg,
1983 			    sizeof (svl), mode) < 0) {
1984 				spcs_s_kfree(kstatus);
1985 				if (svn != NULL)
1986 					kmem_free(svn, bytes);
1987 				return (EFAULT);
1988 			}
1989 		}
1990 
1991 		/* Return the array */
1992 		if (svn != NULL) {
1993 			if (ddi_copyout(svn, usvn, bytes, mode) < 0) {
1994 				kmem_free(svn, bytes);
1995 				spcs_s_kfree(kstatus);
1996 				return (EFAULT);
1997 			}
1998 			kmem_free(svn, bytes);
1999 		}
2000 
2001 		DTRACE_PROBE3(sv_ioctl_3, dev_t, dev, int, *rvalp, int, 0);
2002 
2003 		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2004 		/* NOTREACHED */
2005 
2006 	case SVIOC_VERSION:
2007 
2008 		if (ilp32) {
2009 			sv_version32_t svv32;
2010 
2011 			if (ddi_copyin((void *)arg, &svv32,
2012 			    sizeof (svv32), mode) < 0) {
2013 				spcs_s_kfree(kstatus);
2014 				return (EFAULT);
2015 			}
2016 
2017 			svv32.svv_major_rev = sv_major_rev;
2018 			svv32.svv_minor_rev = sv_minor_rev;
2019 			svv32.svv_micro_rev = sv_micro_rev;
2020 			svv32.svv_baseline_rev = sv_baseline_rev;
2021 
2022 			if (ddi_copyout(&svv32, (void *)arg,
2023 			    sizeof (svv32), mode) < 0) {
2024 				spcs_s_kfree(kstatus);
2025 				return (EFAULT);
2026 			}
2027 
2028 			ustatus = (spcs_s_info_t)svv32.svv_error;
2029 		} else {
2030 			if (ddi_copyin((void *)arg, &svv,
2031 			    sizeof (svv), mode) < 0) {
2032 				spcs_s_kfree(kstatus);
2033 				return (EFAULT);
2034 			}
2035 
2036 			svv.svv_major_rev = sv_major_rev;
2037 			svv.svv_minor_rev = sv_minor_rev;
2038 			svv.svv_micro_rev = sv_micro_rev;
2039 			svv.svv_baseline_rev = sv_baseline_rev;
2040 
2041 			if (ddi_copyout(&svv, (void *)arg,
2042 			    sizeof (svv), mode) < 0) {
2043 				spcs_s_kfree(kstatus);
2044 				return (EFAULT);
2045 			}
2046 
2047 			ustatus = svv.svv_error;
2048 		}
2049 
2050 		DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, 0);
2051 
2052 		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2053 		/* NOTREACHED */
2054 
2055 	case SVIOC_UNLOAD:
2056 		rc = sv_prepare_unload();
2057 
2058 		if (ddi_copyout(&rc, (void *)arg, sizeof (rc), mode) < 0) {
2059 			rc = EFAULT;
2060 		}
2061 
2062 		spcs_s_kfree(kstatus);
2063 		return (rc);
2064 
2065 	default:
2066 		spcs_s_kfree(kstatus);
2067 
2068 		DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, EINVAL);
2069 
2070 		return (EINVAL);
2071 		/* NOTREACHED */
2072 	}
2073 
2074 	/* NOTREACHED */
2075 }
2076 
2077 
2078 /* ARGSUSED */
2079 static int
2080 svprint(dev_t dev, char *str)
2081 {
2082 	int instance = ddi_get_instance(sv_dip);
2083 	cmn_err(CE_WARN, "!%s%d: %s", ddi_get_name(sv_dip), instance, str);
2084 	return (0);
2085 }
2086 
2087 
2088 static void
2089 _sv_lyr_strategy(struct buf *bp)
2090 {
2091 	caddr_t buf_addr;		/* pointer to linear buffer in bp */
2092 	nsc_buf_t *bufh = NULL;
2093 	nsc_buf_t *hndl = NULL;
2094 	sv_dev_t *svp;
2095 	nsc_vec_t *v;
2096 	sv_maj_t *maj;
2097 	nsc_size_t fba_req, fba_len;	/* FBA lengths */
2098 	nsc_off_t fba_off;		/* FBA offset */
2099 	size_t tocopy, nbytes;		/* byte lengths */
2100 	int rw, rc;			/* flags and return codes */
2101 	int (*fn)();
2102 
2103 	rc = 0;
2104 
2105 	if (sv_debug > 5)
2106 		cmn_err(CE_CONT, "!_sv_lyr_strategy(%p)\n", (void *)bp);
2107 
2108 	svp = sv_find_enabled(bp->b_edev, &maj);
2109 	if (svp == NULL) {
2110 		if (maj && (fn = maj->sm_strategy) != 0) {
2111 			if (!(maj->sm_flag & D_MP)) {
2112 				UNSAFE_ENTER();
2113 				rc = (*fn)(bp);
2114 				UNSAFE_EXIT();
2115 			} else {
2116 				rc = (*fn)(bp);
2117 			}
2118 			return;
2119 		} else {
2120 			bioerror(bp, ENODEV);
2121 			biodone(bp);
2122 			return;
2123 		}
2124 	}
2125 
2126 	ASSERT(RW_READ_HELD(&svp->sv_lock));
2127 
2128 	if (svp->sv_flag == 0) {
2129 		/*
2130 		 * guard access mode
2131 		 * - prevent user level access to the device
2132 		 */
2133 		DTRACE_PROBE1(sv_lyr_strategy_err_guard, struct buf *, bp);
2134 		bioerror(bp, EPERM);
2135 		goto out;
2136 	}
2137 
2138 	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
2139 		DTRACE_PROBE1(sv_lyr_strategy_err_rsrv, struct buf *, bp);
2140 
2141 		if (rc == EINTR)
2142 			cmn_err(CE_WARN, "!nsc_reserve() returned EINTR");
2143 		bioerror(bp, rc);
2144 		goto out;
2145 	}
2146 
2147 	if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) {
2148 		DTRACE_PROBE1(sv_lyr_strategy_eof, struct buf *, bp);
2149 
2150 		if (bp->b_flags & B_READ) {
2151 			/* return EOF, not an error */
2152 			bp->b_resid = bp->b_bcount;
2153 			bioerror(bp, 0);
2154 		} else
2155 			bioerror(bp, EINVAL);
2156 
2157 		goto done;
2158 	}
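	/*
	 * Editor's note (worked example, assuming 512-byte FBAs): for a
	 * volume with sv_nblocks == 2048, a read at b_lblkno == 2048 starts
	 * at or beyond the end of the device, so the code above reports EOF
	 * by leaving b_resid == b_bcount with b_error == 0, whereas a write
	 * at the same offset fails with EINVAL.
	 */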
2159 
2160 	/*
2161 	 * Preallocate a handle once per call to strategy.
2162 	 * If this fails, then nsc_alloc_buf() will allocate
2163 	 * a temporary handle per allocation/free pair.
2164 	 */
2165 
2166 	DTRACE_PROBE1(sv_dbg_alloch_start, sv_dev_t *, svp);
2167 
2168 	bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL);
2169 
2170 	DTRACE_PROBE1(sv_dbg_alloch_end, sv_dev_t *, svp);
2171 
2172 	if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) {
2173 		DTRACE_PROBE1(sv_lyr_strategy_err_hactive, struct buf *, bp);
2174 
2175 		cmn_err(CE_WARN,
2176 		    "!sv: allocated active handle (bufh %p, flags %x)",
2177 		    (void *)bufh, bufh->sb_flag);
2178 
2179 		bioerror(bp, ENXIO);
2180 		goto done;
2181 	}
2182 
2183 	fba_req = FBA_LEN(bp->b_bcount);
2184 	if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks)
2185 		fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno);
2186 
2187 	rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE;
2188 
2189 	bp_mapin(bp);
2190 
2191 	bp->b_resid = bp->b_bcount;
2192 	buf_addr = bp->b_un.b_addr;
2193 	fba_off = 0;
2194 
2195 	/*
2196 	 * fba_req  - requested size of transfer in FBAs after
2197 	 *		truncation to device extent, and allowing for
2198 	 *		possible non-FBA bounded final chunk.
2199 	 * fba_off  - offset of start of chunk from start of bp in FBAs.
2200 	 * fba_len  - size of this chunk in FBAs.
2201 	 */
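	/*
	 * Editor's note (worked example, assuming 512-byte FBAs and
	 * sv_maxfbas == 256): a 1MB request (b_bcount == 1048576, so
	 * fba_req == 2048) is carved into 8 chunks of 256 FBAs.  Each pass
	 * of the loop below allocates a buffer for one chunk starting at
	 * b_lblkno + fba_off, copies min(FBA_SIZE(fba_len), b_resid) bytes,
	 * then advances fba_off and reduces fba_req until it reaches zero.
	 */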
2202 
2203 loop:
2204 	fba_len = min(fba_req, svp->sv_maxfbas);
2205 	hndl = bufh;
2206 
2207 	DTRACE_PROBE4(sv_dbg_allocb_start,
2208 	    sv_dev_t *, svp,
2209 	    uint64_t, (uint64_t)(bp->b_lblkno + fba_off),
2210 	    uint64_t, (uint64_t)fba_len,
2211 	    int, rw);
2212 
2213 	rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off),
2214 	    fba_len, rw, &hndl);
2215 
2216 	DTRACE_PROBE1(sv_dbg_allocb_end, sv_dev_t *, svp);
2217 
2218 	if (rc > 0) {
2219 		DTRACE_PROBE1(sv_lyr_strategy_err_alloc, struct buf *, bp);
2220 		bioerror(bp, rc);
2221 		if (hndl != bufh)
2222 			(void) nsc_free_buf(hndl);
2223 		hndl = NULL;
2224 		goto done;
2225 	}
2226 
2227 	tocopy = min(FBA_SIZE(fba_len), bp->b_resid);
2228 	v = hndl->sb_vec;
2229 
2230 	if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) {
2231 		/*
2232 		 * Not overwriting all of the last FBA, so read in the
2233 		 * old contents now before we overwrite it with the new
2234 		 * data.
2235 		 */
2236 
2237 		DTRACE_PROBE2(sv_dbg_read_start, sv_dev_t *, svp,
2238 		    uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1));
2239 
2240 		rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0);
2241 		if (rc > 0) {
2242 			bioerror(bp, rc);
2243 			goto done;
2244 		}
2245 
2246 		DTRACE_PROBE1(sv_dbg_read_end, sv_dev_t *, svp);
2247 	}
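	/*
	 * Editor's note (worked example, assuming 512-byte FBAs): a write of
	 * b_bcount == 1300 bytes covers FBA_LEN(1300) == 3 FBAs, but only
	 * 276 bytes of the third FBA.  FBA_OFF(tocopy) is then non-zero, so
	 * the pre-read above pulls in the last FBA of the chunk before the
	 * bcopy() loop below partially overwrites it.  A 1536-byte write
	 * (an exact multiple of the FBA size) skips the pre-read.
	 */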
2248 
2249 	DTRACE_PROBE1(sv_dbg_bcopy_start, sv_dev_t *, svp);
2250 
2251 	while (tocopy > 0) {
2252 		nbytes = min(tocopy, (nsc_size_t)v->sv_len);
2253 
2254 		if (bp->b_flags & B_READ)
2255 			(void) bcopy(v->sv_addr, buf_addr, nbytes);
2256 		else
2257 			(void) bcopy(buf_addr, v->sv_addr, nbytes);
2258 
2259 		bp->b_resid -= nbytes;
2260 		buf_addr += nbytes;
2261 		tocopy -= nbytes;
2262 		v++;
2263 	}
2264 
2265 	DTRACE_PROBE1(sv_dbg_bcopy_end, sv_dev_t *, svp);
2266 
2267 	if ((bp->b_flags & B_READ) == 0) {
2268 		DTRACE_PROBE3(sv_dbg_write_start, sv_dev_t *, svp,
2269 		    uint64_t, (uint64_t)hndl->sb_pos,
2270 		    uint64_t, (uint64_t)hndl->sb_len);
2271 
2272 		rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0);
2273 
2274 		DTRACE_PROBE1(sv_dbg_write_end, sv_dev_t *, svp);
2275 
2276 		if (rc > 0) {
2277 			bioerror(bp, rc);
2278 			goto done;
2279 		}
2280 	}
2281 
2282 	/*
2283 	 * Adjust FBA offset and requested (i.e., remaining) length,
2284 	 * loop if more data to transfer.
2285 	 */
2286 
2287 	fba_off += fba_len;
2288 	fba_req -= fba_len;
2289 
2290 	if (fba_req > 0) {
2291 		DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);
2292 
2293 		rc = nsc_free_buf(hndl);
2294 
2295 		DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);
2296 
2297 		if (rc > 0) {
2298 			DTRACE_PROBE1(sv_lyr_strategy_err_free,
2299 			    struct buf *, bp);
2300 			bioerror(bp, rc);
2301 		}
2302 
2303 		hndl = NULL;
2304 
2305 		if (rc <= 0)
2306 			goto loop;
2307 	}
2308 
2309 done:
2310 	if (hndl != NULL) {
2311 		DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);
2312 
2313 		rc = nsc_free_buf(hndl);
2314 
2315 		DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);
2316 
2317 		if (rc > 0) {
2318 			DTRACE_PROBE1(sv_lyr_strategy_err_free,
2319 			    struct buf *, bp);
2320 			bioerror(bp, rc);
2321 		}
2322 
2323 		hndl = NULL;
2324 	}
2325 
2326 	if (bufh)
2327 		(void) nsc_free_handle(bufh);
2328 
2329 	DTRACE_PROBE1(sv_dbg_rlse_start, sv_dev_t *, svp);
2330 
2331 	nsc_release(svp->sv_fd);
2332 
2333 	DTRACE_PROBE1(sv_dbg_rlse_end, sv_dev_t *, svp);
2334 
2335 out:
2336 	if (sv_debug > 5) {
2337 		cmn_err(CE_CONT,
2338 		    "!_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n",
2339 		    (void *)bp, (void *)bufh, bp->b_error);
2340 	}
2341 
2342 	DTRACE_PROBE2(sv_lyr_strategy_end, struct buf *, bp, int, bp->b_error);
2343 
2344 	rw_exit(&svp->sv_lock);
2345 	biodone(bp);
2346 }
2347 
2348 
2349 static void
2350 sv_async_strategy(blind_t arg)
2351 {
2352 	struct buf *bp = (struct buf *)arg;
2353 	_sv_lyr_strategy(bp);
2354 }
2355 
2356 
2357 static int
2358 sv_lyr_strategy(struct buf *bp)
2359 {
2360 	nsthread_t *tp;
2361 	int nlive;
2362 
2363 	/*
2364 	 * If B_ASYNC was part of the DDI we could use it as a hint to
2365 	 * not create a thread for synchronous i/o.
2366 	 */
2367 	if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
2368 		/* not sv enabled - just pass through */
2369 		DTRACE_PROBE1(sv_lyr_strategy_notsv, struct buf *, bp);
2370 		_sv_lyr_strategy(bp);
2371 		return (0);
2372 	}
2373 
2374 	if (sv_debug > 4) {
2375 		cmn_err(CE_CONT, "!sv_lyr_strategy: nthread %d nlive %d\n",
2376 		    nst_nthread(sv_tset), nst_nlive(sv_tset));
2377 	}
2378 
2379 	/*
2380 	 * If only guard devices are enabled, there won't be
2381 	 * a threadset, so don't try to use it.
2382 	 */
2383 	tp = NULL;
2384 	if (sv_tset != NULL) {
2385 		tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
2386 	}
2387 
2388 	if (tp == NULL) {
2389 		/*
2390 		 * out of threads, so fall back to synchronous io.
2391 		 */
2392 		if (sv_debug > 0) {
2393 			cmn_err(CE_CONT,
2394 			    "!sv_lyr_strategy: thread alloc failed\n");
2395 		}
2396 
2397 		DTRACE_PROBE1(sv_lyr_strategy_no_thread,
2398 		    struct buf *, bp);
2399 
2400 		_sv_lyr_strategy(bp);
2401 		sv_no_threads++;
2402 	} else {
2403 		nlive = nst_nlive(sv_tset);
2404 		if (nlive > sv_max_nlive) {
2405 			if (sv_debug > 0) {
2406 				cmn_err(CE_CONT,
2407 				    "!sv_lyr_strategy: "
2408 				    "new max nlive %d (nthread %d)\n",
2409 				    nlive, nst_nthread(sv_tset));
2410 			}
2411 
2412 			sv_max_nlive = nlive;
2413 		}
2414 	}
2415 
2416 	return (0);
2417 }
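/*
 * Editor's sketch (illustrative only, not part of the driver): the thread
 * dispatch above is the usual "hand the work to a pool thread, or fall back
 * to doing it inline when no thread is available" idiom.  In outline, with
 * hypothetical pool, worker_create() and do_work() names:
 *
 *	if (pool == NULL || worker_create(pool, do_work, req) == NULL)
 *		do_work(req);
 */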
2418 
2419 /*
2420  * re-write the size of the current partition
2421  */
2422 static int
2423 sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp)
2424 {
2425 	size_t offset;
2426 	int ilp32;
2427 	int pnum;
2428 	int rc;
2429 
2430 	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
2431 
2432 	rc = nskern_partition(svp->sv_dev, &pnum);
2433 	if (rc != 0) {
2434 		return (rc);
2435 	}
2436 
2437 	if (pnum < 0 || pnum >= V_NUMPAR) {
2438 		cmn_err(CE_WARN,
2439 		    "!sv_gvtoc: unable to determine partition number "
2440 		    "for dev %lx", svp->sv_dev);
2441 		return (EINVAL);
2442 	}
2443 
2444 	if (ilp32) {
2445 		int32_t p_size;
2446 
2447 #ifdef _SunOS_5_6
2448 		offset = offsetof(struct vtoc, v_part);
2449 		offset += sizeof (struct partition) * pnum;
2450 		offset += offsetof(struct partition, p_size);
2451 #else
2452 		offset = offsetof(struct vtoc32, v_part);
2453 		offset += sizeof (struct partition32) * pnum;
2454 		offset += offsetof(struct partition32, p_size);
2455 #endif
2456 
2457 		p_size = (int32_t)svp->sv_nblocks;
2458 		if (p_size == 0) {
2459 			if (sv_reserve(svp->sv_fd,
2460 			    NSC_MULTI|NSC_PCATCH) == 0) {
2461 				p_size = (int32_t)svp->sv_nblocks;
2462 				nsc_release(svp->sv_fd);
2463 			} else {
2464 				rc = EINTR;
2465 			}
2466 		}
2467 
2468 		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2469 		    sizeof (p_size), mode) != 0) {
2470 			rc = EFAULT;
2471 		}
2472 	} else {
2473 		long p_size;
2474 
2475 		offset = offsetof(struct vtoc, v_part);
2476 		offset += sizeof (struct partition) * pnum;
2477 		offset += offsetof(struct partition, p_size);
2478 
2479 		p_size = (long)svp->sv_nblocks;
2480 		if (p_size == 0) {
2481 			if (sv_reserve(svp->sv_fd,
2482 			    NSC_MULTI|NSC_PCATCH) == 0) {
2483 				p_size = (long)svp->sv_nblocks;
2484 				nsc_release(svp->sv_fd);
2485 			} else {
2486 				rc = EINTR;
2487 			}
2488 		}
2489 
2490 		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2491 		    sizeof (p_size), mode) != 0) {
2492 			rc = EFAULT;
2493 		}
2494 	}
2495 
2496 	return (rc);
2497 }
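/*
 * Editor's note: the fix-up above never rewrites the whole vtoc; it computes
 * the byte offset of v_part[pnum].p_size inside the caller's buffer and
 * copies out only that one field.  For example (ILP32 caller, pnum == 2):
 *
 *	offset = offsetof(struct vtoc32, v_part)
 *	    + 2 * sizeof (struct partition32)
 *	    + offsetof(struct partition32, p_size);
 *
 * so the ddi_copyout() lands exactly on the third partition's p_size, and
 * every other field keeps the value the underlying driver returned.
 */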
2498 
2499 
2500 #ifdef DKIOCPARTITION
2501 /*
2502  * re-write the size of the current partition
2503  *
2504  * arg is dk_efi_t.
2505  *
2506  * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64;
2507  *
2508  * dk_efi_t->dki_data --> efi_gpt_t (label header)
2509  * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions)
2510  *
2511  * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts
2512  * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself
2513  *
2514  * This assumes that sizeof (efi_gpt_t) is the same as the size of a
2515  * logical block on the disk.
2516  *
2517  * Everything is little endian (i.e. disk format).
2518  */
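/*
 * Editor's note (worked example for the function below): if nsctl reports
 * p_size == 4096 blocks for the sv'd partition and its on-label StartingLBA
 * is 34, the rewritten EndingLBA becomes 34 + 4096 - 1 = 4129.  Both CRCs
 * must then be recomputed, in this order: first the partition-entry-array
 * CRC (over the patched gpe[] array), then the header CRC, because the
 * header CRC covers the field that holds the array CRC.  Each CRC field is
 * zeroed before its CRC32() pass, as the code below does.
 */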
2519 static int
2520 sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp)
2521 {
2522 	dk_efi_t efi;
2523 	efi_gpt_t gpt;
2524 	efi_gpe_t *gpe = NULL;
2525 	size_t sgpe;
2526 	uint64_t p_size;	/* virtual partition size from nsctl */
2527 	uint32_t crc;
2528 	int unparts;		/* number of parts in user's array */
2529 	int pnum;
2530 	int rc;
2531 
2532 	rc = nskern_partition(svp->sv_dev, &pnum);
2533 	if (rc != 0) {
2534 		return (rc);
2535 	}
2536 
2537 	if (pnum < 0) {
2538 		cmn_err(CE_WARN,
2539 		    "!sv_efi: unable to determine partition number for dev %lx",
2540 		    svp->sv_dev);
2541 		return (EINVAL);
2542 	}
2543 
2544 	if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) {
2545 		return (EFAULT);
2546 	}
2547 
2548 	efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
2549 
2550 	if (efi.dki_length < sizeof (gpt) + sizeof (*gpe)) {
2551 		return (EINVAL);
2552 	}
2553 
2554 	if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) {
2555 		rc = EFAULT;
2556 		goto out;
2557 	}
2558 
2559 	if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0)
2560 		unparts = 1;
2561 	else if (pnum >= unparts) {
2562 		cmn_err(CE_WARN,
2563 		    "!sv_efi: partition# beyond end of user array (%d >= %d)",
2564 		    pnum, unparts);
2565 		return (EINVAL);
2566 	}
2567 
2568 	sgpe = sizeof (*gpe) * unparts;
2569 	gpe = kmem_alloc(sgpe, KM_SLEEP);
2570 
2571 	if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) {
2572 		rc = EFAULT;
2573 		goto out;
2574 	}
2575 
2576 	p_size = svp->sv_nblocks;
2577 	if (p_size == 0) {
2578 		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2579 			p_size = (diskaddr_t)svp->sv_nblocks;
2580 			nsc_release(svp->sv_fd);
2581 		} else {
2582 			rc = EINTR;
2583 		}
2584 	}
2585 
2586 	gpe[pnum].efi_gpe_EndingLBA = LE_64(
2587 	    LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1);
2588 
2589 	gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
2590 	CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
2591 	gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
2592 
2593 	gpt.efi_gpt_HeaderCRC32 = 0;
2594 	CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
2595 	gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
2596 
2597 	if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) {
2598 		rc = EFAULT;
2599 		goto out;
2600 	}
2601 
2602 	if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) {
2603 		rc = EFAULT;
2604 		goto out;
2605 	}
2606 
2607 out:
2608 	if (gpe) {
2609 		kmem_free(gpe, sgpe);
2610 	}
2611 
2612 	return (rc);
2613 }
2614 
2615 
2616 /*
2617  * Re-write the size of the partition specified by p_partno
2618  *
2619  * Note that if a DKIOCPARTITION is issued to an fd opened against a
2620  * non-sv'd device, but p_partno requests the size for a different
2621  * device that is sv'd, this function will *not* be called as sv is
2622  * not interposed on the original device (the fd).
2623  *
2624  * It would not be easy to change this as we cannot get the partition
2625  * number for the non-sv'd device, so cannot compute the dev_t of the
2626  * (sv'd) p_partno device, and so cannot find out if it is sv'd or get
2627  * its size from nsctl.
2628  *
2629  * See also the "Bug 4755783" comment in sv_lyr_ioctl().
2630  */
2631 static int
2632 sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp)
2633 {
2634 	struct partition64 p64;
2635 	sv_dev_t *nsvp = NULL;
2636 	diskaddr_t p_size;
2637 	minor_t nminor;
2638 	int pnum, rc;
2639 	dev_t ndev;
2640 
2641 	rc = nskern_partition(svp->sv_dev, &pnum);
2642 	if (rc != 0) {
2643 		return (rc);
2644 	}
2645 
2646 	if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) {
2647 		return (EFAULT);
2648 	}
2649 
2650 	if (p64.p_partno != pnum) {
2651 		/* switch to requested partition, not the current one */
2652 		nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
2653 		ndev = makedevice(getmajor(svp->sv_dev), nminor);
2654 		nsvp = sv_find_enabled(ndev, NULL);
2655 		if (nsvp == NULL) {
2656 			/* not sv device - just return */
2657 			return (0);
2658 		}
2659 
2660 		svp = nsvp;
2661 	}
2662 
2663 	p_size = svp->sv_nblocks;
2664 	if (p_size == 0) {
2665 		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2666 			p_size = (diskaddr_t)svp->sv_nblocks;
2667 			nsc_release(svp->sv_fd);
2668 		} else {
2669 			rc = EINTR;
2670 		}
2671 	}
2672 
2673 	if (nsvp != NULL) {
2674 		rw_exit(&nsvp->sv_lock);
2675 	}
2676 
2677 	if ((rc == 0) && ddi_copyout(&p_size,
2678 	    (void *)(arg + offsetof(struct partition64, p_size)),
2679 	    sizeof (p_size), mode) != 0) {
2680 		return (EFAULT);
2681 	}
2682 
2683 	return (rc);
2684 }
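/*
 * Editor's note: the sibling partition's dev_t above is derived purely by
 * minor-number arithmetic, which assumes the usual layout in which the
 * slices of one disk have consecutive minor numbers.  For example, if the
 * ioctl was issued against slice 0 (pnum == 0) and the caller asks about
 * p_partno == 5, then nminor = getminor(svp->sv_dev) + 5, and the size is
 * taken from that device's sv_dev_t, provided it is sv enabled.
 */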
2685 #endif /* DKIOCPARTITION */
2686 
2687 
2688 static int
2689 sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg,
2690     const int mode, cred_t *crp, int *rvalp)
2691 {
2692 	sv_dev_t *svp;
2693 	sv_maj_t *maj;
2694 	int (*fn)();
2695 	int rc = 0;
2696 
2697 	maj = 0;
2698 	fn = 0;
2699 
2700 	/*
2701 	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, carry on as normal.
2702 	 * Otherwise it has moved from SV_PREVENT_UNLOAD to SV_ALLOW_UNLOAD
2703 	 * and the driver is expected to unload shortly, so fail the ioctl.
2704 	 *
2705 	 * SV_ALLOW_UNLOAD is the final state, so no need to grab sv_mutex.
2706 	 */
2707 	if (sv_mod_status == SV_ALLOW_UNLOAD) {
2708 		return (EBUSY);
2709 	}
2710 
2711 	svp = sv_find_enabled(dev, &maj);
2712 	if (svp != NULL) {
2713 		if (nskernd_isdaemon()) {
2714 			/*
2715 			 * This is nskernd which always needs to see
2716 			 * the underlying disk device accurately.
2717 			 *
2718 			 * So just pass the ioctl straight through
2719 			 * to the underlying driver as though the device
2720 			 * was not sv enabled.
2721 			 */
2722 			DTRACE_PROBE2(sv_lyr_ioctl_nskernd, sv_dev_t *, svp,
2723 			    dev_t, dev);
2724 
2725 			rw_exit(&svp->sv_lock);
2726 			svp = NULL;
2727 		} else {
2728 			ASSERT(RW_READ_HELD(&svp->sv_lock));
2729 		}
2730 	}
2731 
2732 	/*
2733 	 * We now have a locked and enabled SV device, or a non-SV device.
2734 	 */
2735 
2736 	switch (cmd) {
2737 		/*
2738 		 * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI
2739 		 * and DKIOCSETEFI are intercepted and faked up as some
2740 		 * i/o providers emulate volumes of a different size to
2741 		 * the underlying volume.
2742 		 *
2743 		 * Setting the size by rewriting the vtoc is not permitted.
2744 		 */
2745 
2746 	case DKIOCSVTOC:
2747 #ifdef DKIOCPARTITION
2748 	case DKIOCSETEFI:
2749 #endif
2750 		if (svp == NULL) {
2751 			/* not intercepted -- allow ioctl through */
2752 			break;
2753 		}
2754 
2755 		rw_exit(&svp->sv_lock);
2756 
2757 		DTRACE_PROBE2(sv_lyr_ioctl_svtoc, dev_t, dev, int, EPERM);
2758 
2759 		return (EPERM);
2760 
2761 	default:
2762 		break;
2763 	}
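	/*
	 * Editor's note: the net effect of the switch above and the fix-up
	 * switch further below is that, for an sv enabled device, the "set"
	 * ioctls (DKIOCSVTOC, DKIOCSETEFI) fail with EPERM before reaching
	 * the underlying driver, while the "get" ioctls (DKIOCGVTOC,
	 * DKIOCGETEFI, DKIOCPARTITION) are passed through and then have the
	 * current partition's size patched to the nsctl view of the volume.
	 */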
2764 
2765 	/*
2766 	 * Pass through the real ioctl command.
2767 	 */
2768 
2769 	if (maj && (fn = maj->sm_ioctl) != 0) {
2770 		if (!(maj->sm_flag & D_MP)) {
2771 			UNSAFE_ENTER();
2772 			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2773 			UNSAFE_EXIT();
2774 		} else {
2775 			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2776 		}
2777 	} else {
2778 		rc = ENODEV;
2779 	}
2780 
2781 	/*
2782 	 * Bug 4755783
2783 	 * Fix up the size of the current partition to allow
2784 	 * for the virtual volume to be a different size to the
2785 	 * physical volume (e.g. for II compact dependent shadows).
2786 	 *
2787 	 * Note that this only attempts to fix up the current partition
2788 	 * - the one that the ioctl was issued against.  There could be
2789 	 * other sv'd partitions in the same vtoc, but we cannot tell
2790 	 * so we don't attempt to fix them up.
2791 	 */
2792 
2793 	if (svp != NULL && rc == 0) {
2794 		switch (cmd) {
2795 		case DKIOCGVTOC:
2796 			rc = sv_fix_dkiocgvtoc(arg, mode, svp);
2797 			break;
2798 
2799 #ifdef DKIOCPARTITION
2800 		case DKIOCGETEFI:
2801 			rc = sv_fix_dkiocgetefi(arg, mode, svp);
2802 			break;
2803 
2804 		case DKIOCPARTITION:
2805 			rc = sv_fix_dkiocpartition(arg, mode, svp);
2806 			break;
2807 #endif /* DKIOCPARTITION */
2808 		}
2809 	}
2810 
2811 	if (svp != NULL) {
2812 		rw_exit(&svp->sv_lock);
2813 	}
2814 
2815 	return (rc);
2816 }
2817