1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 *
25 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
26 */
27
28 /*
29 * Storage Volume Character and Block Driver (SV)
30 *
31 * This driver implements a simplistic /dev/{r}dsk/ interface to a
32 * specified disk volume that is otherwise managed by the Prism
33 * software. The SV driver layers itself onto the underlying disk
34 * device driver by changing function pointers in the cb_ops
35 * structure.
36 *
37 * CONFIGURATION:
38 *
39 * 1. Configure the driver using the svadm utility.
40 * 2. Access the device as before through /dev/rdsk/c?t?d?s?
41 *
42 * LIMITATIONS:
43 *
44 * This driver should NOT be used to share a device between another
45 * DataServices user interface module (e.g., STE) and a user accessing
46 * the device through the block device in O_WRITE mode. This is because
47 * writes through the block device are asynchronous (due to the page
48 * cache) and so consistency between the block device user and the
49 * STE user cannot be guaranteed.
50 *
51 * Data is copied between system struct buf(9s) and nsc_vec_t. This is
52 * wasteful and slow.
53 */
54
55 #include <sys/debug.h>
56 #include <sys/types.h>
57
58 #include <sys/ksynch.h>
59 #include <sys/kmem.h>
60 #include <sys/errno.h>
61 #include <sys/varargs.h>
62 #include <sys/file.h>
63 #include <sys/open.h>
64 #include <sys/conf.h>
65 #include <sys/cred.h>
66 #include <sys/buf.h>
67 #include <sys/uio.h>
68 #ifndef DS_DDICT
69 #include <sys/pathname.h>
70 #endif
71 #include <sys/aio_req.h>
72 #include <sys/dkio.h>
73 #include <sys/vtoc.h>
74 #include <sys/cmn_err.h>
75 #include <sys/modctl.h>
76 #include <sys/ddi.h>
77 #include <sys/sunddi.h>
78 #include <sys/sunldi.h>
79 #include <sys/nsctl/nsvers.h>
80
81 #include <sys/nsc_thread.h>
82 #include <sys/unistat/spcs_s.h>
83 #include <sys/unistat/spcs_s_k.h>
84 #include <sys/unistat/spcs_errors.h>
85
86 #ifdef DS_DDICT
87 #include "../contract.h"
88 #endif
89
90 #include "../nsctl.h"
91
92
93 #include <sys/sdt.h> /* dtrace is S10 or later */
94
95 #include "sv.h"
96 #include "sv_impl.h"
97 #include "sv_efi.h"
98
#define	MAX_EINTR_COUNT	1000	/* limit on EINTR retries in sv_reserve() */
100
101 /*
102 * sv_mod_status
103 */
104 #define SV_PREVENT_UNLOAD 1
105 #define SV_ALLOW_UNLOAD 2
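
/*
 * sv_mod_status starts at zero, is set to SV_PREVENT_UNLOAD whenever sv
 * interposes on a new major device (see sv_enable()), and is moved to
 * SV_ALLOW_UNLOAD only by the SVIOC_UNLOAD ioctl ("svboot -u") handled
 * by sv_prepare_unload().
 */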
106
107 static const int sv_major_rev = ISS_VERSION_MAJ; /* Major number */
108 static const int sv_minor_rev = ISS_VERSION_MIN; /* Minor number */
109 static const int sv_micro_rev = ISS_VERSION_MIC; /* Micro number */
110 static const int sv_baseline_rev = ISS_VERSION_NUM; /* Baseline number */
111
112 #ifdef DKIOCPARTITION
113 /*
114 * CRC32 polynomial table needed for computing the checksums
115 * in an EFI vtoc.
116 */
117 static const uint32_t sv_crc32_table[256] = { CRC32_TABLE };
118 #endif
119
120 static clock_t sv_config_time; /* Time of successful {en,dis}able */
121 static int sv_debug; /* Set non-zero for debug to syslog */
122 static int sv_mod_status; /* Set to prevent modunload */
123
124 static dev_info_t *sv_dip; /* Single DIP for driver */
125 static kmutex_t sv_mutex; /* Protect global lists, etc. */
126
127 static nsc_mem_t *sv_mem; /* nsctl memory allocator token */
128
129
130 /*
131 * Per device and per major state.
132 */
133
134 #ifndef _SunOS_5_6
135 #define UNSAFE_ENTER()
136 #define UNSAFE_EXIT()
137 #else
138 #define UNSAFE_ENTER() mutex_enter(&unsafe_driver)
139 #define UNSAFE_EXIT() mutex_exit(&unsafe_driver)
140 #endif
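
/*
 * UNSAFE_ENTER()/UNSAFE_EXIT() bracket calls into an underlying driver's
 * entry points when that driver is not D_MP (see the maj->sm_flag tests
 * below); on SunOS 5.6 they take the global unsafe_driver mutex,
 * otherwise they expand to nothing.
 */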
141
142 /* hash table of major dev structures */
143 static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0};
144 static sv_dev_t *sv_devs; /* array of per device structures */
145 static int sv_max_devices; /* SV version of nsc_max_devices() */
146 static int sv_ndevices; /* number of SV enabled devices */
147
148 /*
149 * Threading.
150 */
151
152 int sv_threads_max = 1024; /* maximum # to dynamically alloc */
153 int sv_threads = 32; /* # to pre-allocate (see sv.conf) */
154 int sv_threads_extra = 0; /* addl # we would have alloc'ed */
155
156 static nstset_t *sv_tset; /* the threadset pointer */
157
158 static int sv_threads_hysteresis = 4; /* hysteresis for threadset resizing */
159 static int sv_threads_dev = 2; /* # of threads to alloc per device */
160 static int sv_threads_inc = 8; /* increment for changing the set */
161 static int sv_threads_needed; /* number of threads needed */
162 static int sv_no_threads; /* number of nsc_create errors */
163 static int sv_max_nlive; /* max number of threads running */
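
/*
 * sv_thread_tune() resizes sv_tset in steps of sv_threads_inc, never
 * shrinking below sv_threads and never growing beyond sv_threads_max
 * (excess demand is remembered in sv_threads_extra), with
 * sv_threads_hysteresis damping the shrink decision; each device
 * enable/disable adjusts the demand by sv_threads_dev threads.
 */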
164
165
166
167 /*
168 * nsctl fd callbacks.
169 */
170
171 static int svattach_fd(blind_t);
172 static int svdetach_fd(blind_t);
173
174 static nsc_def_t sv_fd_def[] = {
175 { "Attach", (uintptr_t)svattach_fd, },
176 { "Detach", (uintptr_t)svdetach_fd, },
177 { 0, 0, }
178 };
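
/*
 * nsctl uses these callbacks on the fd opened in sv_enable():
 * svattach_fd() caches the partition size (sv_nblocks) and maximum
 * transfer size (sv_maxfbas) in the sv_dev_t, and svdetach_fd() clears
 * them again.
 */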
179
180 /*
181 * cb_ops functions.
182 */
183
184 static int svopen(dev_t *, int, int, cred_t *);
185 static int svclose(dev_t, int, int, cred_t *);
186 static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *);
187 static int svprint(dev_t, char *);
188
189 /*
190 * These next functions are layered into the underlying driver's devops.
191 */
192
193 static int sv_lyr_open(dev_t *, int, int, cred_t *);
194 static int sv_lyr_close(dev_t, int, int, cred_t *);
195 static int sv_lyr_strategy(struct buf *);
196 static int sv_lyr_read(dev_t, struct uio *, cred_t *);
197 static int sv_lyr_write(dev_t, struct uio *, cred_t *);
198 static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *);
199 static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *);
200 static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
201
202 static struct cb_ops sv_cb_ops = {
203 svopen, /* open */
204 svclose, /* close */
205 nulldev, /* strategy */
206 svprint,
207 nodev, /* dump */
208 nodev, /* read */
209 nodev, /* write */
210 svioctl,
211 nodev, /* devmap */
212 nodev, /* mmap */
213 nodev, /* segmap */
214 nochpoll, /* poll */
215 ddi_prop_op,
216 NULL, /* NOT a stream */
217 D_NEW | D_MP | D_64BIT,
218 CB_REV,
219 nodev, /* aread */
220 nodev, /* awrite */
221 };
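
/*
 * These cb_ops are for the sv control device itself (the "sv" minor node
 * created in sv_attach()); the sv_lyr_* routines declared above are the
 * ones patched into a target driver's cb_ops by sv_enable().
 */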
222
223
224 /*
225 * dev_ops functions.
226 */
227
228 static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
229 static int sv_attach(dev_info_t *, ddi_attach_cmd_t);
230 static int sv_detach(dev_info_t *, ddi_detach_cmd_t);
231
232 static struct dev_ops sv_ops = {
233 DEVO_REV,
234 0,
235 sv_getinfo,
236 nulldev, /* identify */
237 nulldev, /* probe */
238 sv_attach,
239 sv_detach,
240 nodev, /* reset */
241 &sv_cb_ops,
242 (struct bus_ops *)0
243 };
244
245 /*
246 * Module linkage.
247 */
248
249 extern struct mod_ops mod_driverops;
250
251 static struct modldrv modldrv = {
252 &mod_driverops,
253 "nws:Storage Volume:" ISS_VERSION_STR,
254 &sv_ops
255 };
256
257 static struct modlinkage modlinkage = {
258 MODREV_1,
259 &modldrv,
260 0
261 };
262
263
264 int
_init(void)
266 {
267 int error;
268
269 mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL);
270
271 if ((error = mod_install(&modlinkage)) != 0) {
272 mutex_destroy(&sv_mutex);
273 return (error);
274 }
275
276 #ifdef DEBUG
277 cmn_err(CE_CONT, "!sv (revision %d.%d.%d.%d, %s, %s)\n",
278 sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev,
279 ISS_VERSION_STR, BUILD_DATE_STR);
280 #else
281 if (sv_micro_rev) {
282 cmn_err(CE_CONT, "!sv (revision %d.%d.%d, %s, %s)\n",
283 sv_major_rev, sv_minor_rev, sv_micro_rev,
284 ISS_VERSION_STR, BUILD_DATE_STR);
285 } else {
286 cmn_err(CE_CONT, "!sv (revision %d.%d, %s, %s)\n",
287 sv_major_rev, sv_minor_rev,
288 ISS_VERSION_STR, BUILD_DATE_STR);
289 }
290 #endif
291
292 return (error);
293 }
294
295
296 int
_fini(void)
298 {
299 int error;
300
301 if ((error = mod_remove(&modlinkage)) != 0)
302 return (error);
303
304 mutex_destroy(&sv_mutex);
305
306 return (error);
307 }
308
309
310 int
_info(struct modinfo *modinfop)
312 {
313 return (mod_info(&modlinkage, modinfop));
314 }
315
316
317 /*
318 * Locking & State.
319 *
320 * sv_mutex protects config information - sv_maj_t and sv_dev_t lists;
321 * threadset creation and sizing; sv_ndevices.
322 *
323 * If we need to hold both sv_mutex and sv_lock, then the sv_mutex
324 * must be acquired first.
325 *
326 * sv_lock protects the sv_dev_t structure for an individual device.
327 *
328 * sv_olock protects the otyp/open members of the sv_dev_t. If we need
329 * to hold both sv_lock and sv_olock, then the sv_lock must be acquired
330 * first.
331 *
332 * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple
333 * I/O operations to a device simultaneously, as above.
334 *
335 * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur
336 * with sv_lock write-locked must be done with (sv_state == SV_PENDING)
337 * and (sv_pending == curthread) so that any recursion through
338 * sv_lyr_open/sv_lyr_close can be detected.
339 */
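
/*
 * Illustration of the lock ordering described above (no single code path
 * needs all three locks at once):
 *
 *	mutex_enter(&sv_mutex);
 *	rw_enter(&svp->sv_lock, RW_WRITER);
 *	mutex_enter(&svp->sv_olock);
 *	...
 *	mutex_exit(&svp->sv_olock);
 *	rw_exit(&svp->sv_lock);
 *	mutex_exit(&sv_mutex);
 */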
340
341
342 static int
sv_init_devs(void)
344 {
345 int i;
346
347 ASSERT(MUTEX_HELD(&sv_mutex));
348
349 if (sv_max_devices > 0)
350 return (0);
351
352 sv_max_devices = nsc_max_devices();
353
354 if (sv_max_devices <= 0) {
355 /* nsctl is not attached (nskernd not running) */
356 if (sv_debug > 0)
357 cmn_err(CE_CONT, "!sv: nsc_max_devices = 0\n");
358 return (EAGAIN);
359 }
360
361 sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)),
362 KM_NOSLEEP, sv_mem);
363
364 if (sv_devs == NULL) {
365 cmn_err(CE_WARN, "!sv: could not allocate sv_devs array");
366 return (ENOMEM);
367 }
368
369 for (i = 0; i < sv_max_devices; i++) {
370 mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL);
371 rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL);
372 }
373
374 if (sv_debug > 0)
375 cmn_err(CE_CONT, "!sv: sv_init_devs successful\n");
376
377 return (0);
378 }
379
380
381 static int
sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
383 {
384 int rc;
385
386 switch (cmd) {
387
388 case DDI_ATTACH:
389 sv_dip = dip;
390
391 if (ddi_create_minor_node(dip, "sv", S_IFCHR,
392 0, DDI_PSEUDO, 0) != DDI_SUCCESS)
393 goto failed;
394
395 mutex_enter(&sv_mutex);
396
397 sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0);
398 if (sv_mem == NULL) {
399 mutex_exit(&sv_mutex);
400 goto failed;
401 }
402
403 rc = sv_init_devs();
404 if (rc != 0 && rc != EAGAIN) {
405 mutex_exit(&sv_mutex);
406 goto failed;
407 }
408
409 mutex_exit(&sv_mutex);
410
411
412 ddi_report_dev(dip);
413
414 sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
415 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
416 "sv_threads", sv_threads);
417
418 if (sv_debug > 0)
419 cmn_err(CE_CONT, "!sv: sv_threads=%d\n", sv_threads);
420
421 if (sv_threads > sv_threads_max)
422 sv_threads_max = sv_threads;
423
424 return (DDI_SUCCESS);
425
426 default:
427 return (DDI_FAILURE);
428 }
429
430 failed:
431 DTRACE_PROBE(sv_attach_failed);
432 (void) sv_detach(dip, DDI_DETACH);
433 return (DDI_FAILURE);
434 }
435
436
437 static int
sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
439 {
440 sv_dev_t *svp;
441 int i;
442
443 switch (cmd) {
444
445 case DDI_DETACH:
446
447 /*
448 * Check that everything is disabled.
449 */
450
451 mutex_enter(&sv_mutex);
452
453 if (sv_mod_status == SV_PREVENT_UNLOAD) {
454 mutex_exit(&sv_mutex);
455 DTRACE_PROBE(sv_detach_err_prevent);
456 return (DDI_FAILURE);
457 }
458
459 for (i = 0; sv_devs && i < sv_max_devices; i++) {
460 svp = &sv_devs[i];
461
462 if (svp->sv_state != SV_DISABLE) {
463 mutex_exit(&sv_mutex);
464 DTRACE_PROBE(sv_detach_err_busy);
465 return (DDI_FAILURE);
466 }
467 }
468
469
470 for (i = 0; sv_devs && i < sv_max_devices; i++) {
471 mutex_destroy(&sv_devs[i].sv_olock);
472 rw_destroy(&sv_devs[i].sv_lock);
473 }
474
475 if (sv_devs) {
476 nsc_kmem_free(sv_devs,
477 (sv_max_devices * sizeof (*sv_devs)));
478 sv_devs = NULL;
479 }
480 sv_max_devices = 0;
481
482 if (sv_mem) {
483 nsc_unregister_mem(sv_mem);
484 sv_mem = NULL;
485 }
486
487 mutex_exit(&sv_mutex);
488
489 /*
490 * Remove all minor nodes.
491 */
492
493 ddi_remove_minor_node(dip, NULL);
494 sv_dip = NULL;
495
496 return (DDI_SUCCESS);
497
498 default:
499 return (DDI_FAILURE);
500 }
501 }
502
503 static sv_maj_t *
sv_getmajor(const dev_t dev)
505 {
506 sv_maj_t **insert, *maj;
507 major_t umaj = getmajor(dev);
508
509 /*
510 * See if the hash table entry, or one of the hash chains
511 * is already allocated for this major number
512 */
513 if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) {
514 do {
515 if (maj->sm_major == umaj)
516 return (maj);
517 } while ((maj = maj->sm_next) != 0);
518 }
519
	/*
	 * If sv_mutex is already held at this point there is a design
	 * flaw, as the only callers that can need a new entry allocated,
	 * sv_enable() and sv_dev_to_sv(), do not hold the mutex.
	 * Return an error instead of panicking the system.
	 */
525 if (MUTEX_HELD(&sv_mutex)) {
526 cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
527 return (NULL);
528 }
529
530 /*
531 * Determine where to allocate a new element in the hash table
532 */
533 mutex_enter(&sv_mutex);
534 insert = &(sv_majors[SV_MAJOR_HASH(umaj)]);
535 for (maj = *insert; maj; maj = maj->sm_next) {
536
537 /* Did another thread beat us to it? */
		if (maj->sm_major == umaj) {
			mutex_exit(&sv_mutex);
			return (maj);
		}
540
541 /* Find a NULL insert point? */
542 if (maj->sm_next == NULL)
543 insert = &maj->sm_next;
544 }
545
546 /*
547 * Located the new insert point
548 */
549 *insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem);
550 if ((maj = *insert) != 0)
551 maj->sm_major = umaj;
552 else
553 cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
554
555 mutex_exit(&sv_mutex);
556
557 return (maj);
558 }
559
560 /* ARGSUSED */
561
562 static int
sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
564 {
565 int rc = DDI_FAILURE;
566
567 switch (infocmd) {
568
569 case DDI_INFO_DEVT2DEVINFO:
570 *result = sv_dip;
571 rc = DDI_SUCCESS;
572 break;
573
574 case DDI_INFO_DEVT2INSTANCE:
575 /*
576 * We only have a single instance.
577 */
578 *result = 0;
579 rc = DDI_SUCCESS;
580 break;
581
582 default:
583 break;
584 }
585
586 return (rc);
587 }
588
589
590 /*
591 * Hashing of devices onto major device structures.
592 *
593 * Individual device structures are hashed onto one of the sm_hash[]
594 * buckets in the relevant major device structure.
595 *
596 * Hash insertion and deletion -must- be done with sv_mutex held. Hash
597 * searching does not require the mutex because of the sm_seq member.
598 * sm_seq is incremented on each insertion (-after- hash chain pointer
599 * manipulation) and each deletion (-before- hash chain pointer
600 * manipulation). When searching the hash chain, the seq number is
601 * checked before accessing each device structure, if the seq number has
602 * changed, then we restart the search from the top of the hash chain.
603 * If we restart more than SV_HASH_RETRY times, we take sv_mutex and search
604 * the hash chain (we are guaranteed that this search cannot be
605 * interrupted).
606 */
607
608 #define SV_HASH_RETRY 16
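
/*
 * Sketch of the update protocol described above (see sv_get_state() and
 * sv_rm_hash() for the real code); both updates run under sv_mutex:
 *
 *	insert:	*insert = svp; svp->sv_hash = NULL; maj->sm_seq++;
 *	delete:	maj->sm_seq++; *svpp = svp->sv_hash;
 *
 * Lock-free readers reload sm_seq after following each sv_hash pointer
 * and restart the walk if it has changed.
 */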
609
610 static sv_dev_t *
sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp)
612 {
613 minor_t umin = getminor(dev);
614 sv_dev_t **hb, *next, *svp;
615 sv_maj_t *maj;
616 int seq;
617 int try;
618
619 /* Get major hash table */
620 maj = sv_getmajor(dev);
621 if (majpp)
622 *majpp = maj;
623 if (maj == NULL)
624 return (NULL);
625
626 if (maj->sm_inuse == 0) {
627 DTRACE_PROBE1(
628 sv_dev_to_sv_end,
629 dev_t, dev);
630 return (NULL);
631 }
632
633 hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
634 try = 0;
635
636 retry:
637 if (try > SV_HASH_RETRY)
638 mutex_enter(&sv_mutex);
639
640 seq = maj->sm_seq;
641 for (svp = *hb; svp; svp = next) {
642 next = svp->sv_hash;
643
644 nsc_membar_stld(); /* preserve register load order */
645
646 if (maj->sm_seq != seq) {
647 DTRACE_PROBE1(sv_dev_to_sv_retry, dev_t, dev);
648 try++;
649 goto retry;
650 }
651
652 if (svp->sv_dev == dev)
653 break;
654 }
655
656 if (try > SV_HASH_RETRY)
657 mutex_exit(&sv_mutex);
658
659 return (svp);
660 }
661
662
663 /*
664 * Must be called with sv_mutex held.
665 */
666
667 static int
sv_get_state(const dev_t udev, sv_dev_t **svpp)
669 {
670 sv_dev_t **hb, **insert, *svp;
671 sv_maj_t *maj;
672 minor_t umin;
673 int i;
674
675 /* Get major hash table */
676 if ((maj = sv_getmajor(udev)) == NULL)
		return (SV_EBADDEV);
678
679 /* Determine which minor hash table */
680 umin = getminor(udev);
681 hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
682
683 /* look for clash */
684
685 insert = hb;
686
687 for (svp = *hb; svp; svp = svp->sv_hash) {
688 if (svp->sv_dev == udev)
689 break;
690
691 if (svp->sv_hash == NULL)
692 insert = &svp->sv_hash;
693 }
694
695 if (svp) {
696 DTRACE_PROBE1(
697 sv_get_state_enabled,
698 dev_t, udev);
699 return (SV_EENABLED);
700 }
701
702 /* look for spare sv_devs slot */
703
704 for (i = 0; i < sv_max_devices; i++) {
705 svp = &sv_devs[i];
706
707 if (svp->sv_state == SV_DISABLE)
708 break;
709 }
710
711 if (i >= sv_max_devices) {
712 DTRACE_PROBE1(
713 sv_get_state_noslots,
714 dev_t, udev);
715 return (SV_ENOSLOTS);
716 }
717
718 svp->sv_state = SV_PENDING;
719 svp->sv_pending = curthread;
720
721 *insert = svp;
722 svp->sv_hash = NULL;
723 maj->sm_seq++; /* must be after the store to the hash chain */
724
725 *svpp = svp;
726
727 /*
728 * We do not know the size of the underlying device at
729 * this stage, so initialise "nblocks" property to
730 * zero, and update it whenever we succeed in
731 * nsc_reserve'ing the underlying nsc_fd_t.
732 */
733
734 svp->sv_nblocks = 0;
735
736 return (0);
737 }
738
739
/*
 * Remove a device structure from its hash chain.
 * Must be called with sv_mutex held.
 */
744
745 static void
sv_rm_hash(sv_dev_t *svp)
747 {
748 sv_dev_t **svpp;
749 sv_maj_t *maj;
750
751 /* Get major hash table */
752 if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
753 return;
754
755 /* remove svp from hash chain */
756
757 svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]);
758 while (*svpp) {
759 if (*svpp == svp) {
760 /*
761 * increment of sm_seq must be before the
762 * removal from the hash chain
763 */
764 maj->sm_seq++;
765 *svpp = svp->sv_hash;
766 break;
767 }
768
769 svpp = &(*svpp)->sv_hash;
770 }
771
772 svp->sv_hash = NULL;
773 }
774
/*
 * Free (disable) a device structure.
 * Must be called with sv_lock(RW_WRITER) and sv_mutex held; both are
 * released (exited) during its processing, before it returns.
 */
780
781 static int
sv_free(sv_dev_t *svp, const int error)
783 {
784 struct cb_ops *cb_ops;
785 sv_maj_t *maj;
786
	/* Get major hash table */
	if ((maj = sv_getmajor(svp->sv_dev)) == NULL) {
		rw_exit(&svp->sv_lock);
		mutex_exit(&sv_mutex);
		return (error);
	}
790
791 svp->sv_state = SV_PENDING;
792 svp->sv_pending = curthread;
793
794 /*
795 * Close the fd's before removing from the hash or swapping
796 * back the cb_ops pointers so that the cache flushes before new
797 * io can come in.
798 */
799
800 if (svp->sv_fd) {
801 (void) nsc_close(svp->sv_fd);
802 svp->sv_fd = 0;
803 }
804
805 sv_rm_hash(svp);
806
807 if (error != SV_ESDOPEN &&
808 error != SV_ELYROPEN && --maj->sm_inuse == 0) {
809
810 if (maj->sm_dev_ops)
811 cb_ops = maj->sm_dev_ops->devo_cb_ops;
812 else
813 cb_ops = NULL;
814
815 if (cb_ops && maj->sm_strategy != NULL) {
816 cb_ops->cb_strategy = maj->sm_strategy;
817 cb_ops->cb_close = maj->sm_close;
818 cb_ops->cb_ioctl = maj->sm_ioctl;
819 cb_ops->cb_write = maj->sm_write;
820 cb_ops->cb_open = maj->sm_open;
821 cb_ops->cb_read = maj->sm_read;
822 cb_ops->cb_flag = maj->sm_flag;
823
824 if (maj->sm_awrite)
825 cb_ops->cb_awrite = maj->sm_awrite;
826
827 if (maj->sm_aread)
828 cb_ops->cb_aread = maj->sm_aread;
829
830 /*
831 * corbin XXX
832 * Leave backing device ops in maj->sm_*
833 * to handle any requests that might come
834 * in during the disable. This could be
835 * a problem however if the backing device
836 * driver is changed while we process these
837 * requests.
838 *
839 * maj->sm_strategy = 0;
840 * maj->sm_awrite = 0;
841 * maj->sm_write = 0;
842 * maj->sm_ioctl = 0;
843 * maj->sm_close = 0;
844 * maj->sm_aread = 0;
845 * maj->sm_read = 0;
846 * maj->sm_open = 0;
847 * maj->sm_flag = 0;
848 *
849 */
850 }
851
852 if (maj->sm_dev_ops) {
853 maj->sm_dev_ops = 0;
854 }
855 }
856
857 if (svp->sv_lh) {
858 cred_t *crp = ddi_get_cred();
859
860 /*
861 * Close the protective layered driver open using the
862 * Sun Private layered driver i/f.
863 */
864
865 (void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp);
866 svp->sv_lh = NULL;
867 }
868
869 svp->sv_timestamp = nsc_lbolt();
870 svp->sv_state = SV_DISABLE;
871 svp->sv_pending = NULL;
872 rw_exit(&svp->sv_lock);
873 mutex_exit(&sv_mutex);
874
875 return (error);
876 }
877
878 /*
879 * Reserve the device, taking into account the possibility that
880 * the reserve might have to be retried.
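 * EINTR returns from nsc_reserve() are retried up to MAX_EINTR_COUNT
 * times, with a short delay() between attempts.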
881 */
882 static int
sv_reserve(nsc_fd_t *fd, int flags)
884 {
885 int eintr_count;
886 int rc;
887
888 eintr_count = 0;
889 do {
890 rc = nsc_reserve(fd, flags);
891 if (rc == EINTR) {
892 ++eintr_count;
893 delay(2);
894 }
895 } while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT));
896
897 return (rc);
898 }
899
900 static int
sv_enable(const caddr_t path, const int flag,
902 const dev_t udev, spcs_s_info_t kstatus)
903 {
904 struct dev_ops *dev_ops;
905 struct cb_ops *cb_ops;
906 sv_dev_t *svp;
907 sv_maj_t *maj;
908 nsc_size_t nblocks;
909 int rc;
910 cred_t *crp;
911 ldi_ident_t li;
912
913 if (udev == (dev_t)-1 || udev == 0) {
914 DTRACE_PROBE1(
915 sv_enable_err_baddev,
916 dev_t, udev);
917 return (SV_EBADDEV);
918 }
919
920 if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) {
921 DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev);
922 return (SV_EAMODE);
923 }
924
925 /* Get major hash table */
926 if ((maj = sv_getmajor(udev)) == NULL)
927 return (SV_EBADDEV);
928
929 mutex_enter(&sv_mutex);
930
931 rc = sv_get_state(udev, &svp);
932 if (rc) {
933 mutex_exit(&sv_mutex);
934 DTRACE_PROBE1(sv_enable_err_state, dev_t, udev);
935 return (rc);
936 }
937
938 rw_enter(&svp->sv_lock, RW_WRITER);
939
940 /*
941 * Get real fd used for io
942 */
943
944 svp->sv_dev = udev;
945 svp->sv_flag = flag;
946
947 /*
948 * OR in NSC_DEVICE to ensure that nskern grabs the real strategy
949 * function pointer before sv swaps them out.
950 */
951
952 svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE),
953 sv_fd_def, (blind_t)udev, &rc);
954
955 if (svp->sv_fd == NULL) {
956 if (kstatus)
957 spcs_s_add(kstatus, rc);
958 DTRACE_PROBE1(sv_enable_err_fd, dev_t, udev);
959 return (sv_free(svp, SV_ESDOPEN));
960 }
961
962 /*
963 * Perform a layered driver open using the Sun Private layered
964 * driver i/f to ensure that the cb_ops structure for the driver
965 * is not detached out from under us whilst sv is enabled.
966 *
967 */
968
969 crp = ddi_get_cred();
970 svp->sv_lh = NULL;
971
972 if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) {
973 rc = ldi_open_by_dev(&svp->sv_dev,
974 OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li);
975 }
976
977 if (rc != 0) {
978 if (kstatus)
979 spcs_s_add(kstatus, rc);
980 DTRACE_PROBE1(sv_enable_err_lyr_open, dev_t, udev);
981 return (sv_free(svp, SV_ELYROPEN));
982 }
983
984 /*
985 * Do layering if required - must happen after nsc_open().
986 */
987
988 if (maj->sm_inuse++ == 0) {
989 maj->sm_dev_ops = nsc_get_devops(getmajor(udev));
990
991 if (maj->sm_dev_ops == NULL ||
992 maj->sm_dev_ops->devo_cb_ops == NULL) {
993 DTRACE_PROBE1(sv_enable_err_load, dev_t, udev);
994 return (sv_free(svp, SV_ELOAD));
995 }
996
997 dev_ops = maj->sm_dev_ops;
998 cb_ops = dev_ops->devo_cb_ops;
999
1000 if (cb_ops->cb_strategy == NULL ||
1001 cb_ops->cb_strategy == nodev ||
1002 cb_ops->cb_strategy == nulldev) {
1003 DTRACE_PROBE1(sv_enable_err_nostrategy, dev_t, udev);
1004 return (sv_free(svp, SV_ELOAD));
1005 }
1006
1007 if (cb_ops->cb_strategy == sv_lyr_strategy) {
1008 DTRACE_PROBE1(sv_enable_err_svstrategy, dev_t, udev);
1009 return (sv_free(svp, SV_ESTRATEGY));
1010 }
1011
1012 maj->sm_strategy = cb_ops->cb_strategy;
1013 maj->sm_close = cb_ops->cb_close;
1014 maj->sm_ioctl = cb_ops->cb_ioctl;
1015 maj->sm_write = cb_ops->cb_write;
1016 maj->sm_open = cb_ops->cb_open;
1017 maj->sm_read = cb_ops->cb_read;
1018 maj->sm_flag = cb_ops->cb_flag;
1019
1020 cb_ops->cb_flag = cb_ops->cb_flag | D_MP;
1021 cb_ops->cb_strategy = sv_lyr_strategy;
1022 cb_ops->cb_close = sv_lyr_close;
1023 cb_ops->cb_ioctl = sv_lyr_ioctl;
1024 cb_ops->cb_write = sv_lyr_write;
1025 cb_ops->cb_open = sv_lyr_open;
1026 cb_ops->cb_read = sv_lyr_read;
1027
1028 /*
1029 * Check that the driver has async I/O entry points
1030 * before changing them.
1031 */
1032
1033 if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) {
1034 maj->sm_awrite = 0;
1035 maj->sm_aread = 0;
1036 } else {
1037 maj->sm_awrite = cb_ops->cb_awrite;
1038 maj->sm_aread = cb_ops->cb_aread;
1039
1040 cb_ops->cb_awrite = sv_lyr_awrite;
1041 cb_ops->cb_aread = sv_lyr_aread;
1042 }
1043
1044 /*
1045 * Bug 4645743
1046 *
1047 * Prevent sv from ever unloading after it has interposed
1048 * on a major device because there is a race between
1049 * sv removing its layered entry points from the target
1050 * dev_ops, a client coming in and accessing the driver,
1051 * and the kernel modunloading the sv text.
1052 *
	 * To allow unload, do svboot -u, which only happens at
	 * pkgrm time.
1055 */
1056 ASSERT(MUTEX_HELD(&sv_mutex));
1057 sv_mod_status = SV_PREVENT_UNLOAD;
1058 }
1059
1060
1061 svp->sv_timestamp = nsc_lbolt();
1062 svp->sv_state = SV_ENABLE;
1063 svp->sv_pending = NULL;
1064 rw_exit(&svp->sv_lock);
1065
1066 sv_ndevices++;
1067 mutex_exit(&sv_mutex);
1068
1069 nblocks = 0;
1070 if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) {
1071 nblocks = svp->sv_nblocks;
1072 nsc_release(svp->sv_fd);
1073 }
1074
1075 cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n",
1076 svp->sv_dev, nblocks);
1077
1078 return (0);
1079 }
1080
1081
1082 static int
sv_prepare_unload()
1084 {
1085 int rc = 0;
1086
1087 mutex_enter(&sv_mutex);
1088
1089 if (sv_mod_status == SV_PREVENT_UNLOAD) {
1090 if ((sv_ndevices != 0) || (sv_tset != NULL)) {
1091 rc = EBUSY;
1092 } else {
1093 sv_mod_status = SV_ALLOW_UNLOAD;
1094 delay(SV_WAIT_UNLOAD * drv_usectohz(1000000));
1095 }
1096 }
1097
1098 mutex_exit(&sv_mutex);
1099 return (rc);
1100 }
1101
1102 static int
svattach_fd(blind_t arg)
1104 {
1105 dev_t dev = (dev_t)arg;
1106 sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1107 int rc;
1108
1109 if (sv_debug > 0)
1110 cmn_err(CE_CONT, "!svattach_fd(%p, %p)\n", arg, (void *)svp);
1111
1112 if (svp == NULL) {
1113 cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg);
1114 return (0);
1115 }
1116
1117 if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) {
1118 cmn_err(CE_WARN,
1119 "!svattach_fd: nsc_partsize() failed, rc %d", rc);
1120 svp->sv_nblocks = 0;
1121 }
1122
1123 if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) {
1124 cmn_err(CE_WARN,
1125 "!svattach_fd: nsc_maxfbas() failed, rc %d", rc);
1126 svp->sv_maxfbas = 0;
1127 }
1128
1129 if (sv_debug > 0) {
1130 cmn_err(CE_CONT,
1131 "!svattach_fd(%p): size %" NSC_SZFMT ", "
1132 "maxfbas %" NSC_SZFMT "\n",
1133 arg, svp->sv_nblocks, svp->sv_maxfbas);
1134 }
1135
1136 return (0);
1137 }
1138
1139
1140 static int
svdetach_fd(blind_t arg)
1142 {
1143 dev_t dev = (dev_t)arg;
1144 sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1145
1146 if (sv_debug > 0)
1147 cmn_err(CE_CONT, "!svdetach_fd(%p, %p)\n", arg, (void *)svp);
1148
1149 /* svp can be NULL during disable of an sv */
1150 if (svp == NULL)
1151 return (0);
1152
1153 svp->sv_maxfbas = 0;
1154 svp->sv_nblocks = 0;
1155 return (0);
1156 }
1157
1158
/*
 * sv_disable() acquires both sv_mutex and sv_lock(RW_WRITER) itself and
 * releases them before returning, either directly on the error paths or
 * via sv_free() on the normal path.
 */
1163
1164 /* ARGSUSED */
1165 static int
sv_disable(dev_t dev, spcs_s_info_t kstatus)
1167 {
1168 sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1169
1170 if (svp == NULL) {
1171
1172 DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp);
1173 return (SV_ENODEV);
1174 }
1175
1176 mutex_enter(&sv_mutex);
1177 rw_enter(&svp->sv_lock, RW_WRITER);
1178
1179 if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) {
1180 rw_exit(&svp->sv_lock);
1181 mutex_exit(&sv_mutex);
1182
1183 DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp);
1184 return (SV_EDISABLED);
1185 }
1186
1187
1188 sv_ndevices--;
1189 return (sv_free(svp, 0));
1190 }
1191
1192
1193
1194 static int
sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp)
1196 {
1197 nsc_buf_t *tmph;
1198 sv_dev_t *svp;
1199 sv_maj_t *maj;
1200 int (*fn)();
1201 dev_t odev;
1202 int ret;
1203 int rc;
1204
1205 svp = sv_dev_to_sv(*devp, &maj);
1206
1207 if (svp) {
1208 if (svp->sv_state == SV_PENDING &&
1209 svp->sv_pending == curthread) {
1210 /*
1211 * This is a recursive open from a call to
1212 * ddi_lyr_open_by_devt and so we just want
1213 * to pass it straight through to the
1214 * underlying driver.
1215 */
1216 DTRACE_PROBE2(sv_lyr_open_recursive,
1217 sv_dev_t *, svp,
1218 dev_t, *devp);
1219 svp = NULL;
1220 } else
1221 rw_enter(&svp->sv_lock, RW_READER);
1222 }
1223
1224 odev = *devp;
1225
1226 if (maj && (fn = maj->sm_open) != 0) {
1227 if (!(maj->sm_flag & D_MP)) {
1228 UNSAFE_ENTER();
1229 ret = (*fn)(devp, flag, otyp, crp);
1230 UNSAFE_EXIT();
1231 } else {
1232 ret = (*fn)(devp, flag, otyp, crp);
1233 }
1234
1235 if (ret == 0) {
1236 /*
1237 * Re-acquire svp if the driver changed *devp.
1238 */
1239
1240 if (*devp != odev) {
1241 if (svp != NULL)
1242 rw_exit(&svp->sv_lock);
1243
1244 svp = sv_dev_to_sv(*devp, NULL);
1245
1246 if (svp) {
1247 rw_enter(&svp->sv_lock, RW_READER);
1248 }
1249 }
1250 }
1251 } else {
1252 ret = ENODEV;
1253 }
1254
1255 if (svp && ret != 0 && svp->sv_state == SV_ENABLE) {
1256 /*
1257 * Underlying DDI open failed, but we have this
1258 * device SV enabled. If we can read some data
1259 * from the device, fake a successful open (this
1260 * probably means that this device is RDC'd and we
1261 * are getting the data from the secondary node).
1262 *
1263 * The reserve must be done with NSC_TRY|NSC_NOWAIT to
1264 * ensure that it does not deadlock if this open is
1265 * coming from nskernd:get_bsize().
1266 */
1267 rc = sv_reserve(svp->sv_fd,
1268 NSC_TRY | NSC_NOWAIT | NSC_MULTI | NSC_PCATCH);
1269 if (rc == 0) {
1270 tmph = NULL;
1271
1272 rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph);
1273 if (rc <= 0) {
1274 /* success */
1275 ret = 0;
1276 }
1277
1278 if (tmph) {
1279 (void) nsc_free_buf(tmph);
1280 tmph = NULL;
1281 }
1282
1283 nsc_release(svp->sv_fd);
1284
1285 /*
1286 * Count the number of layered opens that we
1287 * fake since we have to fake a matching number
1288 * of closes (OTYP_LYR open/close calls must be
1289 * paired).
1290 */
1291
1292 if (ret == 0 && otyp == OTYP_LYR) {
1293 mutex_enter(&svp->sv_olock);
1294 svp->sv_openlcnt++;
1295 mutex_exit(&svp->sv_olock);
1296 }
1297 }
1298 }
1299
1300 if (svp) {
1301 rw_exit(&svp->sv_lock);
1302 }
1303
1304 return (ret);
1305 }
1306
1307
1308 static int
sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp)
1310 {
1311 sv_dev_t *svp;
1312 sv_maj_t *maj;
1313 int (*fn)();
1314 int ret;
1315
1316 svp = sv_dev_to_sv(dev, &maj);
1317
1318 if (svp &&
1319 svp->sv_state == SV_PENDING &&
1320 svp->sv_pending == curthread) {
		/*
		 * This is a recursive close resulting from a call to
		 * ddi_lyr_close, so just pass it straight through to
		 * the underlying driver.
		 */
1327 DTRACE_PROBE2(sv_lyr_close_recursive, sv_dev_t *, svp,
1328 dev_t, dev);
1329 svp = NULL;
1330 }
1331
1332 if (svp) {
1333 rw_enter(&svp->sv_lock, RW_READER);
1334
1335 if (otyp == OTYP_LYR) {
1336 mutex_enter(&svp->sv_olock);
1337
1338 if (svp->sv_openlcnt) {
1339 /*
1340 * Consume sufficient layered closes to
1341 * account for the opens that we faked
1342 * whilst the device was failed.
1343 */
1344 svp->sv_openlcnt--;
1345 mutex_exit(&svp->sv_olock);
1346 rw_exit(&svp->sv_lock);
1347
1348 DTRACE_PROBE1(sv_lyr_close_end, dev_t, dev);
1349
1350 return (0);
1351 }
1352
1353 mutex_exit(&svp->sv_olock);
1354 }
1355 }
1356
1357 if (maj && (fn = maj->sm_close) != 0) {
1358 if (!(maj->sm_flag & D_MP)) {
1359 UNSAFE_ENTER();
1360 ret = (*fn)(dev, flag, otyp, crp);
1361 UNSAFE_EXIT();
1362 } else {
1363 ret = (*fn)(dev, flag, otyp, crp);
1364 }
1365 } else {
1366 ret = ENODEV;
1367 }
1368
1369 if (svp) {
1370 rw_exit(&svp->sv_lock);
1371 }
1372
1373 return (ret);
1374 }
1375
1376
1377 /*
1378 * Convert the specified dev_t into a locked and enabled sv_dev_t, or
1379 * return NULL.
1380 */
1381 static sv_dev_t *
sv_find_enabled(const dev_t dev, sv_maj_t **majpp)
1383 {
1384 sv_dev_t *svp;
1385
1386 while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) {
1387 rw_enter(&svp->sv_lock, RW_READER);
1388
1389 if (svp->sv_state == SV_ENABLE) {
1390 /* locked and enabled */
1391 break;
1392 }
1393
1394 /*
1395 * State was changed while waiting on the lock.
1396 * Wait for a stable state.
1397 */
1398 rw_exit(&svp->sv_lock);
1399
1400 DTRACE_PROBE1(sv_find_enabled_retry, dev_t, dev);
1401
1402 delay(2);
1403 }
1404
1405 return (svp);
1406 }
1407
1408
1409 static int
sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw)
1411 {
1412 sv_dev_t *svp;
1413 sv_maj_t *maj;
1414 int (*fn)();
	int rc = ENODEV;
1416
1417 svp = sv_find_enabled(dev, &maj);
1418 if (svp == NULL) {
1419 if (maj) {
1420 if (rw == NSC_READ)
1421 fn = maj->sm_read;
1422 else
1423 fn = maj->sm_write;
1424
1425 if (fn != 0) {
1426 if (!(maj->sm_flag & D_MP)) {
1427 UNSAFE_ENTER();
1428 rc = (*fn)(dev, uiop, crp);
1429 UNSAFE_EXIT();
1430 } else {
1431 rc = (*fn)(dev, uiop, crp);
1432 }
1433 }
1434
1435 return (rc);
1436 } else {
1437 return (ENODEV);
1438 }
1439 }
1440
1441 ASSERT(RW_READ_HELD(&svp->sv_lock));
1442
1443 if (svp->sv_flag == 0) {
1444 /*
1445 * guard access mode
1446 * - prevent user level access to the device
1447 */
1448 DTRACE_PROBE1(sv_lyr_uio_err_guard, uio_t *, uiop);
1449 rc = EPERM;
1450 goto out;
1451 }
1452
1453 if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
1454 DTRACE_PROBE1(sv_lyr_uio_err_rsrv, uio_t *, uiop);
1455 goto out;
1456 }
1457
1458 if (rw == NSC_READ)
1459 rc = nsc_uread(svp->sv_fd, uiop, crp);
1460 else
1461 rc = nsc_uwrite(svp->sv_fd, uiop, crp);
1462
1463 nsc_release(svp->sv_fd);
1464
1465 out:
1466 rw_exit(&svp->sv_lock);
1467
1468 return (rc);
1469 }
1470
1471
1472 static int
sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp)
1474 {
1475 return (sv_lyr_uio(dev, uiop, crp, NSC_READ));
1476 }
1477
1478
1479 static int
sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp)
1481 {
1482 return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE));
1483 }
1484
1485
1486 /* ARGSUSED */
1487
1488 static int
sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp)
1490 {
1491 return (aphysio(sv_lyr_strategy,
1492 anocancel, dev, B_READ, minphys, aio));
1493 }
1494
1495
1496 /* ARGSUSED */
1497
1498 static int
sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp)
1500 {
1501 return (aphysio(sv_lyr_strategy,
1502 anocancel, dev, B_WRITE, minphys, aio));
1503 }
1504
1505
/*
 * Set up an array containing the list of raw path names.
 * The caller's array is passed in as "ptr" and its size (in entries)
 * as "size".
 *
 * If there are more enabled devices than will fit in the array, the
 * number that did not fit is returned via "extra"; otherwise "extra"
 * is set to zero.
 *
 * Input:
 *	ptr	: array of sv_name_t (sv_name32_t for ILP32 callers)
 *	size	: number of entries in the array
 *
 * Output:
 *	extra	: zero if all paths fit in the array, otherwise the
 *		  number of enabled devices that did not fit
 */
1523
1524 static int
sv_list(void *ptr, const int size, int *extra, const int ilp32)
1526 {
1527 sv_name32_t *svn32;
1528 sv_name_t *svn;
1529 sv_dev_t *svp;
1530 int *mode, *nblocks;
1531 int i, index;
1532 char *path;
1533
1534 *extra = 0;
1535 index = 0;
1536
1537 if (ilp32)
1538 svn32 = ptr;
1539 else
1540 svn = ptr;
1541
1542 mutex_enter(&sv_mutex);
1543 for (i = 0; i < sv_max_devices; i++) {
1544 svp = &sv_devs[i];
1545
1546 rw_enter(&svp->sv_lock, RW_READER);
1547
1548 if (svp->sv_state != SV_ENABLE) {
1549 rw_exit(&svp->sv_lock);
1550 continue;
1551 }
1552
1553 if ((*extra) != 0 || ptr == NULL) {
1554 /* Another overflow entry */
1555 rw_exit(&svp->sv_lock);
1556 (*extra)++;
1557 continue;
1558 }
1559
1560 if (ilp32) {
1561 nblocks = &svn32->svn_nblocks;
1562 mode = &svn32->svn_mode;
1563 path = svn32->svn_path;
1564
1565 svn32->svn_timestamp = (uint32_t)svp->sv_timestamp;
1566 svn32++;
1567 } else {
1568 nblocks = &svn->svn_nblocks;
1569 mode = &svn->svn_mode;
1570 path = svn->svn_path;
1571
1572 svn->svn_timestamp = svp->sv_timestamp;
1573 svn++;
1574 }
1575
1576 (void) strcpy(path, nsc_pathname(svp->sv_fd));
1577 *nblocks = svp->sv_nblocks;
1578 *mode = svp->sv_flag;
1579
1580 if (*nblocks == 0) {
1581 if (sv_debug > 3)
1582 cmn_err(CE_CONT, "!sv_list: need to reserve\n");
1583
1584 if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
1585 *nblocks = svp->sv_nblocks;
1586 nsc_release(svp->sv_fd);
1587 }
1588 }
1589
1590 if (++index >= size) {
1591 /* Out of space */
1592 (*extra)++;
1593 }
1594
1595 rw_exit(&svp->sv_lock);
1596 }
1597 mutex_exit(&sv_mutex);
1598
1599 if (index < size) {
1600 /* NULL terminated list */
1601 if (ilp32)
1602 svn32->svn_path[0] = '\0';
1603 else
1604 svn->svn_path[0] = '\0';
1605 }
1606
1607 return (0);
1608 }
1609
1610
1611 static void
sv_thread_tune(int threads)
1613 {
1614 int incr = (threads > 0) ? 1 : -1;
1615 int change = 0;
1616 int nthreads;
1617
1618 ASSERT(MUTEX_HELD(&sv_mutex));
1619
1620 if (sv_threads_extra) {
1621 /* keep track of any additional threads requested */
1622 if (threads > 0) {
1623 sv_threads_extra += threads;
1624 return;
1625 }
1626 threads = -threads;
1627 if (threads >= sv_threads_extra) {
1628 threads -= sv_threads_extra;
1629 sv_threads_extra = 0;
1630 /* fall through to while loop */
1631 } else {
1632 sv_threads_extra -= threads;
1633 return;
1634 }
1635 } else if (threads > 0) {
1636 /*
1637 * do not increase the number of threads beyond
1638 * sv_threads_max when doing dynamic thread tuning
1639 */
1640 nthreads = nst_nthread(sv_tset);
1641 if ((nthreads + threads) > sv_threads_max) {
1642 sv_threads_extra = nthreads + threads - sv_threads_max;
1643 threads = sv_threads_max - nthreads;
1644 if (threads <= 0)
1645 return;
1646 }
1647 }
1648
1649 if (threads < 0)
1650 threads = -threads;
1651
1652 while (threads--) {
1653 nthreads = nst_nthread(sv_tset);
1654 sv_threads_needed += incr;
1655
1656 if (sv_threads_needed >= nthreads)
1657 change += nst_add_thread(sv_tset, sv_threads_inc);
1658 else if ((sv_threads_needed <
1659 (nthreads - (sv_threads_inc + sv_threads_hysteresis))) &&
1660 ((nthreads - sv_threads_inc) >= sv_threads))
1661 change -= nst_del_thread(sv_tset, sv_threads_inc);
1662 }
1663
1664 #ifdef DEBUG
1665 if (change) {
1666 cmn_err(CE_NOTE,
1667 "!sv_thread_tune: threads needed %d, nthreads %d, "
1668 "nthreads change %d",
1669 sv_threads_needed, nst_nthread(sv_tset), change);
1670 }
1671 #endif
1672 }
1673
1674
1675 /* ARGSUSED */
1676 static int
svopen(dev_t *devp, int flag, int otyp, cred_t *crp)
1678 {
1679 int rc;
1680
1681 mutex_enter(&sv_mutex);
1682 rc = sv_init_devs();
1683 mutex_exit(&sv_mutex);
1684
1685 return (rc);
1686 }
1687
1688
1689 /* ARGSUSED */
1690 static int
svclose(dev_t dev, int flag, int otyp, cred_t *crp)
1692 {
1693 const int secs = HZ * 5;
1694 const int ticks = HZ / 10;
1695 int loops = secs / ticks;
1696
1697 mutex_enter(&sv_mutex);
1698 while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) {
1699 if (nst_nlive(sv_tset) <= 0) {
1700 nst_destroy(sv_tset);
1701 sv_tset = NULL;
1702 break;
1703 }
1704
1705 /* threads still active - wait for them to exit */
1706 mutex_exit(&sv_mutex);
1707 delay(ticks);
1708 loops--;
1709 mutex_enter(&sv_mutex);
1710 }
1711 mutex_exit(&sv_mutex);
1712
1713 if (loops <= 0) {
1714 cmn_err(CE_WARN,
1715 #ifndef DEBUG
1716 /* do not write to console when non-DEBUG */
1717 "!"
1718 #endif
1719 "sv:svclose: threads still active "
		    "after %d sec - leaking thread set", secs / HZ);
1721 }
1722
1723 return (0);
1724 }
1725
1726
1727 static int
svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp)
1729 {
1730 char itmp1[12], itmp2[12]; /* temp char array for editing ints */
1731 spcs_s_info_t kstatus; /* Kernel version of spcs status */
1732 spcs_s_info_t ustatus; /* Address of user version of spcs status */
1733 sv_list32_t svl32; /* 32 bit Initial structure for SVIOC_LIST */
1734 sv_version_t svv; /* Version structure */
1735 sv_conf_t svc; /* User config structure */
1736 sv_list_t svl; /* Initial structure for SVIOC_LIST */
1737 void *usvn; /* Address of user sv_name_t */
1738 void *svn = NULL; /* Array for SVIOC_LIST */
1739 uint64_t phash; /* pathname hash */
1740 int rc = 0; /* Return code -- errno */
1741 int size; /* Number of items in array */
1742 int bytes; /* Byte size of array */
1743 int ilp32; /* Convert data structures for ilp32 userland */
1744
1745 *rvalp = 0;
1746
	/*
	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, carry on.  Otherwise
	 * it was previously SV_PREVENT_UNLOAD and is now SV_ALLOW_UNLOAD,
	 * meaning the driver is expected to unload shortly, so refuse the
	 * ioctl.
	 *
	 * SV_ALLOW_UNLOAD is the final state, so there is no need to grab
	 * sv_mutex.
	 */
1754 if (sv_mod_status == SV_ALLOW_UNLOAD) {
1755 return (EBUSY);
1756 }
1757
1758 if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0))
1759 return (rc);
1760
1761 kstatus = spcs_s_kcreate();
1762 if (!kstatus) {
1763 DTRACE_PROBE1(sv_ioctl_err_kcreate, dev_t, dev);
1764 return (ENOMEM);
1765 }
1766
1767 ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
1768
1769 switch (cmd) {
1770
1771 case SVIOC_ENABLE:
1772
1773 if (ilp32) {
1774 sv_conf32_t svc32;
1775
1776 if (ddi_copyin((void *)arg, &svc32,
1777 sizeof (svc32), mode) < 0) {
1778 spcs_s_kfree(kstatus);
1779 return (EFAULT);
1780 }
1781
1782 svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1783 (void) strcpy(svc.svc_path, svc32.svc_path);
1784 svc.svc_flag = svc32.svc_flag;
1785 svc.svc_major = svc32.svc_major;
1786 svc.svc_minor = svc32.svc_minor;
1787 } else {
1788 if (ddi_copyin((void *)arg, &svc,
1789 sizeof (svc), mode) < 0) {
1790 spcs_s_kfree(kstatus);
1791 return (EFAULT);
1792 }
1793 }
1794
1795 /* force to raw access */
1796 svc.svc_flag = NSC_DEVICE;
1797
1798 if (sv_tset == NULL) {
1799 mutex_enter(&sv_mutex);
1800
1801 if (sv_tset == NULL) {
1802 sv_tset = nst_init("sv_thr", sv_threads);
1803 }
1804
1805 mutex_exit(&sv_mutex);
1806
1807 if (sv_tset == NULL) {
1808 cmn_err(CE_WARN,
1809 "!sv: could not allocate %d threads",
1810 sv_threads);
1811 }
1812 }
1813
1814 rc = sv_enable(svc.svc_path, svc.svc_flag,
1815 makedevice(svc.svc_major, svc.svc_minor), kstatus);
1816
1817 if (rc == 0) {
1818 sv_config_time = nsc_lbolt();
1819
1820 mutex_enter(&sv_mutex);
1821 sv_thread_tune(sv_threads_dev);
1822 mutex_exit(&sv_mutex);
1823 }
1824
1825 DTRACE_PROBE3(sv_ioctl_end, dev_t, dev, int, *rvalp, int, rc);
1826
1827 return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1828 /* NOTREACHED */
1829
1830 case SVIOC_DISABLE:
1831
1832 if (ilp32) {
1833 sv_conf32_t svc32;
1834
1835 if (ddi_copyin((void *)arg, &svc32,
1836 sizeof (svc32), mode) < 0) {
1837 spcs_s_kfree(kstatus);
1838 return (EFAULT);
1839 }
1840
1841 svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1842 svc.svc_major = svc32.svc_major;
1843 svc.svc_minor = svc32.svc_minor;
1844 (void) strcpy(svc.svc_path, svc32.svc_path);
1845 svc.svc_flag = svc32.svc_flag;
1846 } else {
1847 if (ddi_copyin((void *)arg, &svc,
1848 sizeof (svc), mode) < 0) {
1849 spcs_s_kfree(kstatus);
1850 return (EFAULT);
1851 }
1852 }
1853
1854 if (svc.svc_major == (major_t)-1 &&
1855 svc.svc_minor == (minor_t)-1) {
1856 sv_dev_t *svp;
1857 int i;
1858
1859 /*
1860 * User level could not find the minor device
1861 * node, so do this the slow way by searching
1862 * the entire sv config for a matching pathname.
1863 */
1864
1865 phash = nsc_strhash(svc.svc_path);
1866
1867 mutex_enter(&sv_mutex);
1868
1869 for (i = 0; i < sv_max_devices; i++) {
1870 svp = &sv_devs[i];
1871
1872 if (svp->sv_state == SV_DISABLE ||
1873 svp->sv_fd == NULL)
1874 continue;
1875
1876 if (nsc_fdpathcmp(svp->sv_fd, phash,
1877 svc.svc_path) == 0) {
1878 svc.svc_major = getmajor(svp->sv_dev);
1879 svc.svc_minor = getminor(svp->sv_dev);
1880 break;
1881 }
1882 }
1883
1884 mutex_exit(&sv_mutex);
1885
1886 if (svc.svc_major == (major_t)-1 &&
1887 svc.svc_minor == (minor_t)-1)
1888 return (spcs_s_ocopyoutf(&kstatus,
1889 svc.svc_error, SV_ENODEV));
1890 }
1891
1892 rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor),
1893 kstatus);
1894
1895 if (rc == 0) {
1896 sv_config_time = nsc_lbolt();
1897
1898 mutex_enter(&sv_mutex);
1899 sv_thread_tune(-sv_threads_dev);
1900 mutex_exit(&sv_mutex);
1901 }
1902
1903 DTRACE_PROBE3(sv_ioctl_2, dev_t, dev, int, *rvalp, int, rc);
1904
1905 return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1906 /* NOTREACHED */
1907
1908 case SVIOC_LIST:
1909
1910 if (ilp32) {
1911 if (ddi_copyin((void *)arg, &svl32,
1912 sizeof (svl32), mode) < 0) {
1913 spcs_s_kfree(kstatus);
1914 return (EFAULT);
1915 }
1916
1917 ustatus = (spcs_s_info_t)svl32.svl_error;
1918 size = svl32.svl_count;
1919 usvn = (void *)(unsigned long)svl32.svl_names;
1920 } else {
1921 if (ddi_copyin((void *)arg, &svl,
1922 sizeof (svl), mode) < 0) {
1923 spcs_s_kfree(kstatus);
1924 return (EFAULT);
1925 }
1926
1927 ustatus = svl.svl_error;
1928 size = svl.svl_count;
1929 usvn = svl.svl_names;
1930 }
1931
1932 /* Do some boundary checking */
1933 if ((size < 0) || (size > sv_max_devices)) {
1934 /* Array size is out of range */
1935 return (spcs_s_ocopyoutf(&kstatus, ustatus,
1936 SV_EARRBOUNDS, "0",
1937 spcs_s_inttostring(sv_max_devices, itmp1,
1938 sizeof (itmp1), 0),
1939 spcs_s_inttostring(size, itmp2,
1940 sizeof (itmp2), 0)));
1941 }
1942
1943 if (ilp32)
1944 bytes = size * sizeof (sv_name32_t);
1945 else
1946 bytes = size * sizeof (sv_name_t);
1947
1948 /* Allocate memory for the array of structures */
1949 if (bytes != 0) {
1950 svn = kmem_zalloc(bytes, KM_SLEEP);
1951 if (!svn) {
1952 return (spcs_s_ocopyoutf(&kstatus,
1953 ustatus, ENOMEM));
1954 }
1955 }
1956
1957 rc = sv_list(svn, size, rvalp, ilp32);
1958 if (rc) {
1959 if (svn != NULL)
1960 kmem_free(svn, bytes);
1961 return (spcs_s_ocopyoutf(&kstatus, ustatus, rc));
1962 }
1963
1964 if (ilp32) {
1965 svl32.svl_timestamp = (uint32_t)sv_config_time;
1966 svl32.svl_maxdevs = (int32_t)sv_max_devices;
1967
1968 /* Return the list structure */
1969 if (ddi_copyout(&svl32, (void *)arg,
1970 sizeof (svl32), mode) < 0) {
1971 spcs_s_kfree(kstatus);
1972 if (svn != NULL)
1973 kmem_free(svn, bytes);
1974 return (EFAULT);
1975 }
1976 } else {
1977 svl.svl_timestamp = sv_config_time;
1978 svl.svl_maxdevs = sv_max_devices;
1979
1980 /* Return the list structure */
1981 if (ddi_copyout(&svl, (void *)arg,
1982 sizeof (svl), mode) < 0) {
1983 spcs_s_kfree(kstatus);
1984 if (svn != NULL)
1985 kmem_free(svn, bytes);
1986 return (EFAULT);
1987 }
1988 }
1989
1990 /* Return the array */
1991 if (svn != NULL) {
1992 if (ddi_copyout(svn, usvn, bytes, mode) < 0) {
1993 kmem_free(svn, bytes);
1994 spcs_s_kfree(kstatus);
1995 return (EFAULT);
1996 }
1997 kmem_free(svn, bytes);
1998 }
1999
2000 DTRACE_PROBE3(sv_ioctl_3, dev_t, dev, int, *rvalp, int, 0);
2001
2002 return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2003 /* NOTREACHED */
2004
2005 case SVIOC_VERSION:
2006
2007 if (ilp32) {
2008 sv_version32_t svv32;
2009
2010 if (ddi_copyin((void *)arg, &svv32,
2011 sizeof (svv32), mode) < 0) {
2012 spcs_s_kfree(kstatus);
2013 return (EFAULT);
2014 }
2015
2016 svv32.svv_major_rev = sv_major_rev;
2017 svv32.svv_minor_rev = sv_minor_rev;
2018 svv32.svv_micro_rev = sv_micro_rev;
2019 svv32.svv_baseline_rev = sv_baseline_rev;
2020
2021 if (ddi_copyout(&svv32, (void *)arg,
2022 sizeof (svv32), mode) < 0) {
2023 spcs_s_kfree(kstatus);
2024 return (EFAULT);
2025 }
2026
2027 ustatus = (spcs_s_info_t)svv32.svv_error;
2028 } else {
2029 if (ddi_copyin((void *)arg, &svv,
2030 sizeof (svv), mode) < 0) {
2031 spcs_s_kfree(kstatus);
2032 return (EFAULT);
2033 }
2034
2035 svv.svv_major_rev = sv_major_rev;
2036 svv.svv_minor_rev = sv_minor_rev;
2037 svv.svv_micro_rev = sv_micro_rev;
2038 svv.svv_baseline_rev = sv_baseline_rev;
2039
2040 if (ddi_copyout(&svv, (void *)arg,
2041 sizeof (svv), mode) < 0) {
2042 spcs_s_kfree(kstatus);
2043 return (EFAULT);
2044 }
2045
2046 ustatus = svv.svv_error;
2047 }
2048
2049 DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, 0);
2050
2051 return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2052 /* NOTREACHED */
2053
2054 case SVIOC_UNLOAD:
2055 rc = sv_prepare_unload();
2056
2057 if (ddi_copyout(&rc, (void *)arg, sizeof (rc), mode) < 0) {
2058 rc = EFAULT;
2059 }
2060
2061 spcs_s_kfree(kstatus);
2062 return (rc);
2063
2064 default:
2065 spcs_s_kfree(kstatus);
2066
2067 DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, EINVAL);
2068
2069 return (EINVAL);
2070 /* NOTREACHED */
2071 }
2072
2073 /* NOTREACHED */
2074 }
2075
2076
2077 /* ARGSUSED */
2078 static int
svprint(dev_t dev, char *str)
2080 {
2081 int instance = ddi_get_instance(sv_dip);
2082 cmn_err(CE_WARN, "!%s%d: %s", ddi_get_name(sv_dip), instance, str);
2083 return (0);
2084 }
2085
2086
2087 static void
_sv_lyr_strategy(struct buf *bp)
2089 {
2090 caddr_t buf_addr; /* pointer to linear buffer in bp */
2091 nsc_buf_t *bufh = NULL;
2092 nsc_buf_t *hndl = NULL;
2093 sv_dev_t *svp;
2094 nsc_vec_t *v;
2095 sv_maj_t *maj;
2096 nsc_size_t fba_req, fba_len; /* FBA lengths */
2097 nsc_off_t fba_off; /* FBA offset */
2098 size_t tocopy, nbytes; /* byte lengths */
2099 int rw, rc; /* flags and return codes */
2100 int (*fn)();
2101
2102 rc = 0;
2103
2104 if (sv_debug > 5)
2105 cmn_err(CE_CONT, "!_sv_lyr_strategy(%p)\n", (void *)bp);
2106
2107 svp = sv_find_enabled(bp->b_edev, &maj);
2108 if (svp == NULL) {
2109 if (maj && (fn = maj->sm_strategy) != 0) {
2110 if (!(maj->sm_flag & D_MP)) {
2111 UNSAFE_ENTER();
2112 rc = (*fn)(bp);
2113 UNSAFE_EXIT();
2114 } else {
2115 rc = (*fn)(bp);
2116 }
2117 return;
2118 } else {
2119 bioerror(bp, ENODEV);
2120 biodone(bp);
2121 return;
2122 }
2123 }
2124
2125 ASSERT(RW_READ_HELD(&svp->sv_lock));
2126
2127 if (svp->sv_flag == 0) {
2128 /*
2129 * guard access mode
2130 * - prevent user level access to the device
2131 */
2132 DTRACE_PROBE1(sv_lyr_strategy_err_guard, struct buf *, bp);
2133 bioerror(bp, EPERM);
2134 goto out;
2135 }
2136
2137 if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
2138 DTRACE_PROBE1(sv_lyr_strategy_err_rsrv, struct buf *, bp);
2139
2140 if (rc == EINTR)
2141 cmn_err(CE_WARN, "!nsc_reserve() returned EINTR");
2142 bioerror(bp, rc);
2143 goto out;
2144 }
2145
2146 if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) {
2147 DTRACE_PROBE1(sv_lyr_strategy_eof, struct buf *, bp);
2148
2149 if (bp->b_flags & B_READ) {
2150 /* return EOF, not an error */
2151 bp->b_resid = bp->b_bcount;
2152 bioerror(bp, 0);
2153 } else
2154 bioerror(bp, EINVAL);
2155
2156 goto done;
2157 }
2158
2159 /*
2160 * Preallocate a handle once per call to strategy.
2161 * If this fails, then the nsc_alloc_buf() will allocate
2162 * a temporary handle per allocation/free pair.
2163 */
2164
2165 DTRACE_PROBE1(sv_dbg_alloch_start, sv_dev_t *, svp);
2166
2167 bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL);
2168
2169 DTRACE_PROBE1(sv_dbg_alloch_end, sv_dev_t *, svp);
2170
2171 if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) {
2172 DTRACE_PROBE1(sv_lyr_strategy_err_hactive, struct buf *, bp);
2173
2174 cmn_err(CE_WARN,
2175 "!sv: allocated active handle (bufh %p, flags %x)",
2176 (void *)bufh, bufh->sb_flag);
2177
2178 bioerror(bp, ENXIO);
2179 goto done;
2180 }
2181
2182 fba_req = FBA_LEN(bp->b_bcount);
2183 if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks)
2184 fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno);
2185
2186 rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE;
2187
2188 bp_mapin(bp);
2189
2190 bp->b_resid = bp->b_bcount;
2191 buf_addr = bp->b_un.b_addr;
2192 fba_off = 0;
2193
2194 /*
2195 * fba_req - requested size of transfer in FBAs after
2196 * truncation to device extent, and allowing for
2197 * possible non-FBA bounded final chunk.
2198 * fba_off - offset of start of chunk from start of bp in FBAs.
2199 * fba_len - size of this chunk in FBAs.
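	 *
	 * Each pass around the "loop" label below transfers at most
	 * svp->sv_maxfbas FBAs of the remaining request.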
2200 */
2201
2202 loop:
2203 fba_len = min(fba_req, svp->sv_maxfbas);
2204 hndl = bufh;
2205
2206 DTRACE_PROBE4(sv_dbg_allocb_start,
2207 sv_dev_t *, svp,
2208 uint64_t, (uint64_t)(bp->b_lblkno + fba_off),
2209 uint64_t, (uint64_t)fba_len,
2210 int, rw);
2211
2212 rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off),
2213 fba_len, rw, &hndl);
2214
2215 DTRACE_PROBE1(sv_dbg_allocb_end, sv_dev_t *, svp);
2216
2217 if (rc > 0) {
2218 DTRACE_PROBE1(sv_lyr_strategy_err_alloc, struct buf *, bp);
2219 bioerror(bp, rc);
2220 if (hndl != bufh)
2221 (void) nsc_free_buf(hndl);
2222 hndl = NULL;
2223 goto done;
2224 }
2225
2226 tocopy = min(FBA_SIZE(fba_len), bp->b_resid);
2227 v = hndl->sb_vec;
2228
2229 if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) {
2230 /*
2231 * Not overwriting all of the last FBA, so read in the
2232 * old contents now before we overwrite it with the new
2233 * data.
2234 */
2235
2236 DTRACE_PROBE2(sv_dbg_read_start, sv_dev_t *, svp,
2237 uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1));
2238
2239 rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0);
2240 if (rc > 0) {
2241 bioerror(bp, rc);
2242 goto done;
2243 }
2244
2245 DTRACE_PROBE1(sv_dbg_read_end, sv_dev_t *, svp);
2246 }
2247
2248 DTRACE_PROBE1(sv_dbg_bcopy_start, sv_dev_t *, svp);
2249
2250 while (tocopy > 0) {
2251 nbytes = min(tocopy, (nsc_size_t)v->sv_len);
2252
2253 if (bp->b_flags & B_READ)
2254 (void) bcopy(v->sv_addr, buf_addr, nbytes);
2255 else
2256 (void) bcopy(buf_addr, v->sv_addr, nbytes);
2257
2258 bp->b_resid -= nbytes;
2259 buf_addr += nbytes;
2260 tocopy -= nbytes;
2261 v++;
2262 }
2263
2264 DTRACE_PROBE1(sv_dbg_bcopy_end, sv_dev_t *, svp);
2265
2266 if ((bp->b_flags & B_READ) == 0) {
2267 DTRACE_PROBE3(sv_dbg_write_start, sv_dev_t *, svp,
2268 uint64_t, (uint64_t)hndl->sb_pos,
2269 uint64_t, (uint64_t)hndl->sb_len);
2270
2271 rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0);
2272
2273 DTRACE_PROBE1(sv_dbg_write_end, sv_dev_t *, svp);
2274
2275 if (rc > 0) {
2276 bioerror(bp, rc);
2277 goto done;
2278 }
2279 }
2280
2281 /*
2282 * Adjust FBA offset and requested (ie. remaining) length,
2283 * loop if more data to transfer.
2284 */
2285
2286 fba_off += fba_len;
2287 fba_req -= fba_len;
2288
2289 if (fba_req > 0) {
2290 DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);
2291
2292 rc = nsc_free_buf(hndl);
2293
2294 DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);
2295
2296 if (rc > 0) {
2297 DTRACE_PROBE1(sv_lyr_strategy_err_free,
2298 struct buf *, bp);
2299 bioerror(bp, rc);
2300 }
2301
2302 hndl = NULL;
2303
2304 if (rc <= 0)
2305 goto loop;
2306 }
2307
2308 done:
2309 if (hndl != NULL) {
2310 DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);
2311
2312 rc = nsc_free_buf(hndl);
2313
2314 DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);
2315
2316 if (rc > 0) {
2317 DTRACE_PROBE1(sv_lyr_strategy_err_free,
2318 struct buf *, bp);
2319 bioerror(bp, rc);
2320 }
2321
2322 hndl = NULL;
2323 }
2324
2325 if (bufh)
2326 (void) nsc_free_handle(bufh);
2327
2328 DTRACE_PROBE1(sv_dbg_rlse_start, sv_dev_t *, svp);
2329
2330 nsc_release(svp->sv_fd);
2331
2332 DTRACE_PROBE1(sv_dbg_rlse_end, sv_dev_t *, svp);
2333
2334 out:
2335 if (sv_debug > 5) {
2336 cmn_err(CE_CONT,
2337 "!_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n",
2338 (void *)bp, (void *)bufh, bp->b_error);
2339 }
2340
2341 DTRACE_PROBE2(sv_lyr_strategy_end, struct buf *, bp, int, bp->b_error);
2342
2343 rw_exit(&svp->sv_lock);
2344 biodone(bp);
2345 }
2346
2347
2348 static void
2349 sv_async_strategy(blind_t arg)
2350 {
2351 struct buf *bp = (struct buf *)arg;
2352 _sv_lyr_strategy(bp);
2353 }
2354
2355
2356 static int
2357 sv_lyr_strategy(struct buf *bp)
2358 {
2359 nsthread_t *tp;
2360 int nlive;
2361
2362 /*
2363 * If B_ASYNC was part of the DDI we could use it as a hint to
2364 * not create a thread for synchronous i/o.
2365 */
2366 if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
2367 /* not sv enabled - just pass through */
2368 DTRACE_PROBE1(sv_lyr_strategy_notsv, struct buf *, bp);
2369 _sv_lyr_strategy(bp);
2370 return (0);
2371 }
2372
2373 if (sv_debug > 4) {
2374 cmn_err(CE_CONT, "!sv_lyr_strategy: nthread %d nlive %d\n",
2375 nst_nthread(sv_tset), nst_nlive(sv_tset));
2376 }
2377
2378 /*
2379 	 * If only guard devices are enabled, there
2380 	 * won't be a threadset, so don't try to use it.
2381 */
2382 tp = NULL;
2383 if (sv_tset != NULL) {
2384 tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
2385 }
2386
2387 if (tp == NULL) {
2388 /*
2389 		 * out of threads, so fall back to synchronous i/o.
2390 */
2391 if (sv_debug > 0) {
2392 cmn_err(CE_CONT,
2393 "!sv_lyr_strategy: thread alloc failed\n");
2394 }
2395
2396 DTRACE_PROBE1(sv_lyr_strategy_no_thread,
2397 struct buf *, bp);
2398
2399 _sv_lyr_strategy(bp);
2400 sv_no_threads++;
2401 } else {
2402 nlive = nst_nlive(sv_tset);
2403 if (nlive > sv_max_nlive) {
2404 if (sv_debug > 0) {
2405 cmn_err(CE_CONT,
2406 "!sv_lyr_strategy: "
2407 "new max nlive %d (nthread %d)\n",
2408 nlive, nst_nthread(sv_tset));
2409 }
2410
2411 sv_max_nlive = nlive;
2412 }
2413 }
2414
2415 return (0);
2416 }
2417
2418
2419 #ifndef offsetof
2420 #define offsetof(s, m) ((size_t)(&((s *)0)->m))
2421 #endif
2422
2423 /*
2424 * re-write the size of the current partition
2425 */
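/*
 * The new size is patched directly into the user's vtoc buffer at a byte
 * offset computed from the vtoc layout; sketch for the 32-bit data model
 * (this mirrors the code below, it is not additional behaviour):
 *
 *	offset = offsetof(struct vtoc32, v_part)
 *	    + pnum * sizeof (struct partition32)
 *	    + offsetof(struct partition32, p_size);
 *	(void) ddi_copyout(&p_size, (void *)(arg + offset),
 *	    sizeof (p_size), mode);
 *
 * Only the p_size field of the current partition is rewritten; the rest
 * of the vtoc returned by the underlying driver is left untouched.
 */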
2426 static int
2427 sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp)
2428 {
2429 size_t offset;
2430 int ilp32;
2431 int pnum;
2432 int rc;
2433
2434 ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
2435
2436 rc = nskern_partition(svp->sv_dev, &pnum);
2437 if (rc != 0) {
2438 return (rc);
2439 }
2440
2441 if (pnum < 0 || pnum >= V_NUMPAR) {
2442 cmn_err(CE_WARN,
2443 "!sv_gvtoc: unable to determine partition number "
2444 "for dev %lx", svp->sv_dev);
2445 return (EINVAL);
2446 }
2447
2448 if (ilp32) {
2449 int32_t p_size;
2450
2451 #ifdef _SunOS_5_6
2452 offset = offsetof(struct vtoc, v_part);
2453 offset += sizeof (struct partition) * pnum;
2454 offset += offsetof(struct partition, p_size);
2455 #else
2456 offset = offsetof(struct vtoc32, v_part);
2457 offset += sizeof (struct partition32) * pnum;
2458 offset += offsetof(struct partition32, p_size);
2459 #endif
2460
2461 p_size = (int32_t)svp->sv_nblocks;
2462 if (p_size == 0) {
2463 if (sv_reserve(svp->sv_fd,
2464 NSC_MULTI|NSC_PCATCH) == 0) {
2465 p_size = (int32_t)svp->sv_nblocks;
2466 nsc_release(svp->sv_fd);
2467 } else {
2468 rc = EINTR;
2469 }
2470 }
2471
2472 if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2473 sizeof (p_size), mode) != 0) {
2474 rc = EFAULT;
2475 }
2476 } else {
2477 long p_size;
2478
2479 offset = offsetof(struct vtoc, v_part);
2480 offset += sizeof (struct partition) * pnum;
2481 offset += offsetof(struct partition, p_size);
2482
2483 p_size = (long)svp->sv_nblocks;
2484 if (p_size == 0) {
2485 if (sv_reserve(svp->sv_fd,
2486 NSC_MULTI|NSC_PCATCH) == 0) {
2487 p_size = (long)svp->sv_nblocks;
2488 nsc_release(svp->sv_fd);
2489 } else {
2490 rc = EINTR;
2491 }
2492 }
2493
2494 if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2495 sizeof (p_size), mode) != 0) {
2496 rc = EFAULT;
2497 }
2498 }
2499
2500 return (rc);
2501 }
2502
2503
2504 #ifdef DKIOCPARTITION
2505 /*
2506 * re-write the size of the current partition
2507 *
2508 * arg is dk_efi_t.
2509 *
2510 * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64;
2511 *
2512 * dk_efi_t->dki_data --> efi_gpt_t (label header)
2513 * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions)
2514 *
2515 * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts
2516 * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself
2517 *
2518 * This assumes that sizeof (efi_gpt_t) is the same as the size of a
2519 * logical block on the disk.
2520 *
2521 * Everything is little endian (i.e. disk format).
2522 */
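/*
 * Sketch of the CRC fix-up order used below (it mirrors the code, it is
 * not a quotation from the EFI specification): the partition entry array
 * CRC is recomputed first, because the header CRC covers the
 * efi_gpt_PartitionEntryArrayCRC32 field.  Each CRC field is zeroed
 * before the CRC32() invocation that covers it:
 *
 *	gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
 *	CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
 *	gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
 *
 *	gpt.efi_gpt_HeaderCRC32 = 0;
 *	CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
 *	gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
 */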
2523 static int
2524 sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp)
2525 {
2526 dk_efi_t efi;
2527 efi_gpt_t gpt;
2528 efi_gpe_t *gpe = NULL;
2529 size_t sgpe;
2530 uint64_t p_size; /* virtual partition size from nsctl */
2531 uint32_t crc;
2532 int unparts; /* number of parts in user's array */
2533 int pnum;
2534 int rc;
2535
2536 rc = nskern_partition(svp->sv_dev, &pnum);
2537 if (rc != 0) {
2538 return (rc);
2539 }
2540
2541 if (pnum < 0) {
2542 cmn_err(CE_WARN,
2543 "!sv_efi: unable to determine partition number for dev %lx",
2544 svp->sv_dev);
2545 return (EINVAL);
2546 }
2547
2548 if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) {
2549 return (EFAULT);
2550 }
2551
2552 efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
2553
2554 	if (efi.dki_length < sizeof (gpt) + sizeof (*gpe)) {
2555 return (EINVAL);
2556 }
2557
2558 if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) {
2559 rc = EFAULT;
2560 goto out;
2561 }
2562
2563 if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0)
2564 unparts = 1;
2565 else if (pnum >= unparts) {
2566 cmn_err(CE_WARN,
2567 "!sv_efi: partition# beyond end of user array (%d >= %d)",
2568 pnum, unparts);
2569 return (EINVAL);
2570 }
2571
2572 sgpe = sizeof (*gpe) * unparts;
2573 gpe = kmem_alloc(sgpe, KM_SLEEP);
2574
2575 if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) {
2576 rc = EFAULT;
2577 goto out;
2578 }
2579
2580 p_size = svp->sv_nblocks;
2581 if (p_size == 0) {
2582 if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2583 p_size = (diskaddr_t)svp->sv_nblocks;
2584 nsc_release(svp->sv_fd);
2585 } else {
2586 rc = EINTR;
2587 }
2588 }
2589
2590 gpe[pnum].efi_gpe_EndingLBA = LE_64(
2591 LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1);
2592
2593 gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
2594 CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
2595 gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
2596
2597 gpt.efi_gpt_HeaderCRC32 = 0;
2598 CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
2599 gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
2600
2601 if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) {
2602 rc = EFAULT;
2603 goto out;
2604 }
2605
2606 if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) {
2607 rc = EFAULT;
2608 goto out;
2609 }
2610
2611 out:
2612 if (gpe) {
2613 kmem_free(gpe, sgpe);
2614 }
2615
2616 return (rc);
2617 }
2618
2619
2620 /*
2621 * Re-write the size of the partition specified by p_partno
2622 *
2623 * Note that if a DKIOCPARTITION is issued to an fd opened against a
2624 * non-sv'd device, but p_partno requests the size for a different
2625 * device that is sv'd, this function will *not* be called as sv is
2626 * not interposed on the original device (the fd).
2627 *
2628 * It would not be easy to change this as we cannot get the partition
2629 * number for the non-sv'd device, so cannot compute the dev_t of the
2630 * (sv'd) p_partno device, and so cannot find out if it is sv'd or get
2631 * its size from nsctl.
2632 *
2633 * See also the "Bug 4755783" comment in sv_lyr_ioctl().
2634 */
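/*
 * When p_partno differs from the partition that the ioctl was issued
 * against, the target dev_t is derived by minor number arithmetic
 * (sketch, mirroring the code below; it assumes the usual
 * one-minor-per-partition numbering of the underlying disk driver):
 *
 *	nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
 *	ndev = makedevice(getmajor(svp->sv_dev), nminor);
 *	nsvp = sv_find_enabled(ndev, NULL);
 *
 * If ndev is not sv enabled, the underlying driver's answer is returned
 * unmodified.
 */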
2635 static int
2636 sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp)
2637 {
2638 struct partition64 p64;
2639 sv_dev_t *nsvp = NULL;
2640 diskaddr_t p_size;
2641 minor_t nminor;
2642 int pnum, rc;
2643 dev_t ndev;
2644
2645 rc = nskern_partition(svp->sv_dev, &pnum);
2646 if (rc != 0) {
2647 return (rc);
2648 }
2649
2650 if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) {
2651 return (EFAULT);
2652 }
2653
2654 if (p64.p_partno != pnum) {
2655 /* switch to requested partition, not the current one */
2656 nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
2657 ndev = makedevice(getmajor(svp->sv_dev), nminor);
2658 nsvp = sv_find_enabled(ndev, NULL);
2659 if (nsvp == NULL) {
2660 /* not sv device - just return */
2661 return (0);
2662 }
2663
2664 svp = nsvp;
2665 }
2666
2667 p_size = svp->sv_nblocks;
2668 if (p_size == 0) {
2669 if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2670 p_size = (diskaddr_t)svp->sv_nblocks;
2671 nsc_release(svp->sv_fd);
2672 } else {
2673 rc = EINTR;
2674 }
2675 }
2676
2677 if (nsvp != NULL) {
2678 rw_exit(&nsvp->sv_lock);
2679 }
2680
2681 if ((rc == 0) && ddi_copyout(&p_size,
2682 (void *)(arg + offsetof(struct partition64, p_size)),
2683 sizeof (p_size), mode) != 0) {
2684 return (EFAULT);
2685 }
2686
2687 return (rc);
2688 }
2689 #endif /* DKIOCPARTITION */
2690
2691
2692 static int
2693 sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg,
2694 const int mode, cred_t *crp, int *rvalp)
2695 {
2696 sv_dev_t *svp;
2697 sv_maj_t *maj;
2698 int (*fn)();
2699 int rc = 0;
2700
2701 maj = 0;
2702 fn = 0;
2703
2704 /*
2705 	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, then we continue.
2706 	 * Otherwise it was previously SV_PREVENT_UNLOAD and is now
2707 	 * SV_ALLOW_UNLOAD, so the driver is expected to unload shortly.
2708 	 *
2709 	 * SV_ALLOW_UNLOAD is a final state, so there is no need to grab sv_mutex.
2710 */
2711 if (sv_mod_status == SV_ALLOW_UNLOAD) {
2712 return (EBUSY);
2713 }
2714
2715 svp = sv_find_enabled(dev, &maj);
2716 if (svp != NULL) {
2717 if (nskernd_isdaemon()) {
2718 /*
2719 * This is nskernd which always needs to see
2720 * the underlying disk device accurately.
2721 *
2722 * So just pass the ioctl straight through
2723 * to the underlying driver as though the device
2724 * was not sv enabled.
2725 */
2726 DTRACE_PROBE2(sv_lyr_ioctl_nskernd, sv_dev_t *, svp,
2727 dev_t, dev);
2728
2729 rw_exit(&svp->sv_lock);
2730 svp = NULL;
2731 } else {
2732 ASSERT(RW_READ_HELD(&svp->sv_lock));
2733 }
2734 }
2735
2736 /*
2737 * We now have a locked and enabled SV device, or a non-SV device.
2738 */
2739
2740 switch (cmd) {
2741 /*
2742 * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI
2743 * and DKIOCSETEFI are intercepted and faked up as some
2744 * i/o providers emulate volumes of a different size to
2745 * the underlying volume.
2746 *
2747 * Setting the size by rewriting the vtoc is not permitted.
2748 */
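	/*
	 * Summary of the interception (see also the Bug 4755783 comment
	 * further down; the EFI cases apply only when DKIOCPARTITION is
	 * defined):
	 *
	 *	DKIOCSVTOC, DKIOCSETEFI	- rejected here with EPERM
	 *	DKIOCGVTOC		- result fixed up by sv_fix_dkiocgvtoc()
	 *	DKIOCGETEFI		- result fixed up by sv_fix_dkiocgetefi()
	 *	DKIOCPARTITION		- result fixed up by sv_fix_dkiocpartition()
	 */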
2749
2750 case DKIOCSVTOC:
2751 #ifdef DKIOCPARTITION
2752 case DKIOCSETEFI:
2753 #endif
2754 if (svp == NULL) {
2755 /* not intercepted -- allow ioctl through */
2756 break;
2757 }
2758
2759 rw_exit(&svp->sv_lock);
2760
2761 DTRACE_PROBE2(sv_lyr_ioctl_svtoc, dev_t, dev, int, EPERM);
2762
2763 return (EPERM);
2764
2765 default:
2766 break;
2767 }
2768
2769 /*
2770 * Pass through the real ioctl command.
2771 */
2772
2773 if (maj && (fn = maj->sm_ioctl) != 0) {
2774 if (!(maj->sm_flag & D_MP)) {
2775 UNSAFE_ENTER();
2776 rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2777 UNSAFE_EXIT();
2778 } else {
2779 rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2780 }
2781 } else {
2782 rc = ENODEV;
2783 }
2784
2785 /*
2786 * Bug 4755783
2787 * Fix up the size of the current partition to allow
2788 * for the virtual volume to be a different size to the
2789 * physical volume (e.g. for II compact dependent shadows).
2790 *
2791 * Note that this only attempts to fix up the current partition
2792 * - the one that the ioctl was issued against. There could be
2793 * other sv'd partitions in the same vtoc, but we cannot tell
2794 * so we don't attempt to fix them up.
2795 */
2796
2797 if (svp != NULL && rc == 0) {
2798 switch (cmd) {
2799 case DKIOCGVTOC:
2800 rc = sv_fix_dkiocgvtoc(arg, mode, svp);
2801 break;
2802
2803 #ifdef DKIOCPARTITION
2804 case DKIOCGETEFI:
2805 rc = sv_fix_dkiocgetefi(arg, mode, svp);
2806 break;
2807
2808 case DKIOCPARTITION:
2809 rc = sv_fix_dkiocpartition(arg, mode, svp);
2810 break;
2811 #endif /* DKIOCPARTITION */
2812 }
2813 }
2814
2815 if (svp != NULL) {
2816 rw_exit(&svp->sv_lock);
2817 }
2818
2819 return (rc);
2820 }
2821