1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 *
25 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
26 */
27
28 /*
29 * Storage Volume Character and Block Driver (SV)
30 *
31 * This driver implements a simplistic /dev/{r}dsk/ interface to a
32 * specified disk volume that is otherwise managed by the Prism
33 * software. The SV driver layers itself onto the underlying disk
34 * device driver by changing function pointers in the cb_ops
35 * structure.
36 *
37 * CONFIGURATION:
38 *
39 * 1. Configure the driver using the svadm utility.
40 * 2. Access the device as before through /dev/rdsk/c?t?d?s?
41 *
42 * LIMITATIONS:
43 *
44 * This driver should NOT be used to share a device between another
45 * DataServices user interface module (e.g., STE) and a user accessing
46 * the device through the block device in O_WRITE mode. This is because
47 * writes through the block device are asynchronous (due to the page
48 * cache) and so consistency between the block device user and the
49 * STE user cannot be guaranteed.
50 *
51 * Data is copied between system struct buf(9s) and nsc_vec_t. This is
52 * wasteful and slow.
53 */
54
55 #include <sys/debug.h>
56 #include <sys/types.h>
57
58 #include <sys/ksynch.h>
59 #include <sys/kmem.h>
60 #include <sys/errno.h>
61 #include <sys/varargs.h>
62 #include <sys/file.h>
63 #include <sys/open.h>
64 #include <sys/conf.h>
65 #include <sys/cred.h>
66 #include <sys/buf.h>
67 #include <sys/uio.h>
68 #ifndef DS_DDICT
69 #include <sys/pathname.h>
70 #endif
71 #include <sys/aio_req.h>
72 #include <sys/dkio.h>
73 #include <sys/vtoc.h>
74 #include <sys/cmn_err.h>
75 #include <sys/modctl.h>
76 #include <sys/ddi.h>
77 #include <sys/sysmacros.h>
78 #include <sys/sunddi.h>
79 #include <sys/sunldi.h>
80 #include <sys/nsctl/nsvers.h>
81
82 #include <sys/nsc_thread.h>
83 #include <sys/unistat/spcs_s.h>
84 #include <sys/unistat/spcs_s_k.h>
85 #include <sys/unistat/spcs_errors.h>
86
87 #ifdef DS_DDICT
88 #include "../contract.h"
89 #endif
90
91 #include "../nsctl.h"
92
93
94 #include <sys/sdt.h> /* dtrace is S10 or later */
95
96 #include "sv.h"
97 #include "sv_impl.h"
98 #include "sv_efi.h"
99
100 #define MAX_EINTR_COUNT 1000
101
102 /*
103 * sv_mod_status
104 */
105 #define SV_PREVENT_UNLOAD 1
106 #define SV_ALLOW_UNLOAD 2
107
108 static const int sv_major_rev = ISS_VERSION_MAJ; /* Major number */
109 static const int sv_minor_rev = ISS_VERSION_MIN; /* Minor number */
110 static const int sv_micro_rev = ISS_VERSION_MIC; /* Micro number */
111 static const int sv_baseline_rev = ISS_VERSION_NUM; /* Baseline number */
112
113 #ifdef DKIOCPARTITION
114 /*
115 * CRC32 polynomial table needed for computing the checksums
116 * in an EFI vtoc.
117 */
118 static const uint32_t sv_crc32_table[256] = { CRC32_TABLE };
119 #endif
120
121 static clock_t sv_config_time; /* Time of successful {en,dis}able */
122 static int sv_debug; /* Set non-zero for debug to syslog */
123 static int sv_mod_status; /* Set to prevent modunload */
124
125 static dev_info_t *sv_dip; /* Single DIP for driver */
126 static kmutex_t sv_mutex; /* Protect global lists, etc. */
127
128 static nsc_mem_t *sv_mem; /* nsctl memory allocator token */
129
130
131 /*
132 * Per device and per major state.
133 */
134
135 #ifndef _SunOS_5_6
136 #define UNSAFE_ENTER()
137 #define UNSAFE_EXIT()
138 #else
139 #define UNSAFE_ENTER() mutex_enter(&unsafe_driver)
140 #define UNSAFE_EXIT() mutex_exit(&unsafe_driver)
141 #endif
142
143 /* hash table of major dev structures */
144 static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0};
145 static sv_dev_t *sv_devs; /* array of per device structures */
146 static int sv_max_devices; /* SV version of nsc_max_devices() */
147 static int sv_ndevices; /* number of SV enabled devices */
148
149 /*
150 * Threading.
151 */
152
153 int sv_threads_max = 1024; /* maximum # to dynamically alloc */
154 int sv_threads = 32; /* # to pre-allocate (see sv.conf) */
155 int sv_threads_extra = 0; /* addl # we would have alloc'ed */
156
157 static nstset_t *sv_tset; /* the threadset pointer */
158
159 static int sv_threads_hysteresis = 4; /* hysteresis for threadset resizing */
160 static int sv_threads_dev = 2; /* # of threads to alloc per device */
161 static int sv_threads_inc = 8; /* increment for changing the set */
162 static int sv_threads_needed; /* number of threads needed */
163 static int sv_no_threads; /* number of nsc_create errors */
164 static int sv_max_nlive; /* max number of threads running */
165
166
167
168 /*
169 * nsctl fd callbacks.
170 */
171
172 static int svattach_fd(blind_t);
173 static int svdetach_fd(blind_t);
174
175 static nsc_def_t sv_fd_def[] = {
176 { "Attach", (uintptr_t)svattach_fd, },
177 { "Detach", (uintptr_t)svdetach_fd, },
178 { 0, 0, }
179 };
180
181 /*
182 * cb_ops functions.
183 */
184
185 static int svopen(dev_t *, int, int, cred_t *);
186 static int svclose(dev_t, int, int, cred_t *);
187 static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *);
188 static int svprint(dev_t, char *);
189
190 /*
191 * These next functions are layered into the underlying driver's devops.
192 */
193
194 static int sv_lyr_open(dev_t *, int, int, cred_t *);
195 static int sv_lyr_close(dev_t, int, int, cred_t *);
196 static int sv_lyr_strategy(struct buf *);
197 static int sv_lyr_read(dev_t, struct uio *, cred_t *);
198 static int sv_lyr_write(dev_t, struct uio *, cred_t *);
199 static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *);
200 static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *);
201 static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
202
203 static struct cb_ops sv_cb_ops = {
204 svopen, /* open */
205 svclose, /* close */
206 nulldev, /* strategy */
207 svprint,
208 nodev, /* dump */
209 nodev, /* read */
210 nodev, /* write */
211 svioctl,
212 nodev, /* devmap */
213 nodev, /* mmap */
214 nodev, /* segmap */
215 nochpoll, /* poll */
216 ddi_prop_op,
217 NULL, /* NOT a stream */
218 D_NEW | D_MP | D_64BIT,
219 CB_REV,
220 nodev, /* aread */
221 nodev, /* awrite */
222 };
223
224
225 /*
226 * dev_ops functions.
227 */
228
229 static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
230 static int sv_attach(dev_info_t *, ddi_attach_cmd_t);
231 static int sv_detach(dev_info_t *, ddi_detach_cmd_t);
232
233 static struct dev_ops sv_ops = {
234 DEVO_REV,
235 0,
236 sv_getinfo,
237 nulldev, /* identify */
238 nulldev, /* probe */
239 sv_attach,
240 sv_detach,
241 nodev, /* reset */
242 &sv_cb_ops,
243 (struct bus_ops *)0
244 };
245
246 /*
247 * Module linkage.
248 */
249
250 extern struct mod_ops mod_driverops;
251
252 static struct modldrv modldrv = {
253 &mod_driverops,
254 "nws:Storage Volume:" ISS_VERSION_STR,
255 &sv_ops
256 };
257
258 static struct modlinkage modlinkage = {
259 MODREV_1,
260 &modldrv,
261 0
262 };
263
264
265 int
_init(void)
267 {
268 int error;
269
270 mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL);
271
272 if ((error = mod_install(&modlinkage)) != 0) {
273 mutex_destroy(&sv_mutex);
274 return (error);
275 }
276
277 #ifdef DEBUG
278 cmn_err(CE_CONT, "!sv (revision %d.%d.%d.%d, %s, %s)\n",
279 sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev,
280 ISS_VERSION_STR, BUILD_DATE_STR);
281 #else
282 if (sv_micro_rev) {
283 cmn_err(CE_CONT, "!sv (revision %d.%d.%d, %s, %s)\n",
284 sv_major_rev, sv_minor_rev, sv_micro_rev,
285 ISS_VERSION_STR, BUILD_DATE_STR);
286 } else {
287 cmn_err(CE_CONT, "!sv (revision %d.%d, %s, %s)\n",
288 sv_major_rev, sv_minor_rev,
289 ISS_VERSION_STR, BUILD_DATE_STR);
290 }
291 #endif
292
293 return (error);
294 }
295
296
297 int
_fini(void)
299 {
300 int error;
301
302 if ((error = mod_remove(&modlinkage)) != 0)
303 return (error);
304
305 mutex_destroy(&sv_mutex);
306
307 return (error);
308 }
309
310
311 int
_info(struct modinfo *modinfop)
313 {
314 return (mod_info(&modlinkage, modinfop));
315 }
316
317
318 /*
319 * Locking & State.
320 *
321 * sv_mutex protects config information - sv_maj_t and sv_dev_t lists;
322 * threadset creation and sizing; sv_ndevices.
323 *
324 * If we need to hold both sv_mutex and sv_lock, then the sv_mutex
325 * must be acquired first.
326 *
327 * sv_lock protects the sv_dev_t structure for an individual device.
328 *
329 * sv_olock protects the otyp/open members of the sv_dev_t. If we need
330 * to hold both sv_lock and sv_olock, then the sv_lock must be acquired
331 * first.
332 *
333 * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple
334 * I/O operations to a device simultaneously, as above.
335 *
336 * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur
337 * with sv_lock write-locked must be done with (sv_state == SV_PENDING)
338 * and (sv_pending == curthread) so that any recursion through
339 * sv_lyr_open/sv_lyr_close can be detected.
340 */
341
342
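/*
 * Allocate and initialise the global sv_devs[] array, sized from
 * nsc_max_devices(). Returns EAGAIN if nsctl is not yet attached,
 * ENOMEM if the allocation fails, and 0 on success (or if the array
 * has already been set up). Must be called with sv_mutex held.
 */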
343 static int
sv_init_devs(void)
345 {
346 int i;
347
348 ASSERT(MUTEX_HELD(&sv_mutex));
349
350 if (sv_max_devices > 0)
351 return (0);
352
353 sv_max_devices = nsc_max_devices();
354
355 if (sv_max_devices <= 0) {
356 /* nsctl is not attached (nskernd not running) */
357 if (sv_debug > 0)
358 cmn_err(CE_CONT, "!sv: nsc_max_devices = 0\n");
359 return (EAGAIN);
360 }
361
362 sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)),
363 KM_NOSLEEP, sv_mem);
364
365 if (sv_devs == NULL) {
366 cmn_err(CE_WARN, "!sv: could not allocate sv_devs array");
367 return (ENOMEM);
368 }
369
370 for (i = 0; i < sv_max_devices; i++) {
371 mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL);
372 rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL);
373 }
374
375 if (sv_debug > 0)
376 cmn_err(CE_CONT, "!sv: sv_init_devs successful\n");
377
378 return (0);
379 }
380
381
382 static int
sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
384 {
385 int rc;
386
387 switch (cmd) {
388
389 case DDI_ATTACH:
390 sv_dip = dip;
391
392 if (ddi_create_minor_node(dip, "sv", S_IFCHR,
393 0, DDI_PSEUDO, 0) != DDI_SUCCESS)
394 goto failed;
395
396 mutex_enter(&sv_mutex);
397
398 sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0);
399 if (sv_mem == NULL) {
400 mutex_exit(&sv_mutex);
401 goto failed;
402 }
403
404 rc = sv_init_devs();
405 if (rc != 0 && rc != EAGAIN) {
406 mutex_exit(&sv_mutex);
407 goto failed;
408 }
409
410 mutex_exit(&sv_mutex);
411
412
413 ddi_report_dev(dip);
414
415 sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
416 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
417 "sv_threads", sv_threads);
418
419 if (sv_debug > 0)
420 cmn_err(CE_CONT, "!sv: sv_threads=%d\n", sv_threads);
421
422 if (sv_threads > sv_threads_max)
423 sv_threads_max = sv_threads;
424
425 return (DDI_SUCCESS);
426
427 default:
428 return (DDI_FAILURE);
429 }
430
431 failed:
432 DTRACE_PROBE(sv_attach_failed);
433 (void) sv_detach(dip, DDI_DETACH);
434 return (DDI_FAILURE);
435 }
436
437
438 static int
sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
440 {
441 sv_dev_t *svp;
442 int i;
443
444 switch (cmd) {
445
446 case DDI_DETACH:
447
448 /*
449 * Check that everything is disabled.
450 */
451
452 mutex_enter(&sv_mutex);
453
454 if (sv_mod_status == SV_PREVENT_UNLOAD) {
455 mutex_exit(&sv_mutex);
456 DTRACE_PROBE(sv_detach_err_prevent);
457 return (DDI_FAILURE);
458 }
459
460 for (i = 0; sv_devs && i < sv_max_devices; i++) {
461 svp = &sv_devs[i];
462
463 if (svp->sv_state != SV_DISABLE) {
464 mutex_exit(&sv_mutex);
465 DTRACE_PROBE(sv_detach_err_busy);
466 return (DDI_FAILURE);
467 }
468 }
469
470
471 for (i = 0; sv_devs && i < sv_max_devices; i++) {
472 mutex_destroy(&sv_devs[i].sv_olock);
473 rw_destroy(&sv_devs[i].sv_lock);
474 }
475
476 if (sv_devs) {
477 nsc_kmem_free(sv_devs,
478 (sv_max_devices * sizeof (*sv_devs)));
479 sv_devs = NULL;
480 }
481 sv_max_devices = 0;
482
483 if (sv_mem) {
484 nsc_unregister_mem(sv_mem);
485 sv_mem = NULL;
486 }
487
488 mutex_exit(&sv_mutex);
489
490 /*
491 * Remove all minor nodes.
492 */
493
494 ddi_remove_minor_node(dip, NULL);
495 sv_dip = NULL;
496
497 return (DDI_SUCCESS);
498
499 default:
500 return (DDI_FAILURE);
501 }
502 }
503
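/*
 * Return the sv_maj_t for the major number of "dev", allocating a new
 * entry in the sv_majors[] hash table if one does not already exist.
 * Returns NULL if the allocation fails, or if the caller already holds
 * sv_mutex (the allocation path needs to take it).
 */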
504 static sv_maj_t *
sv_getmajor(const dev_t dev)
506 {
507 sv_maj_t **insert, *maj;
508 major_t umaj = getmajor(dev);
509
510 /*
511 * See if the hash table entry, or one of the hash chains
512 * is already allocated for this major number
513 */
514 if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) {
515 do {
516 if (maj->sm_major == umaj)
517 return (maj);
518 } while ((maj = maj->sm_next) != 0);
519 }
520
	/*
	 * If sv_mutex is already held there is a design flaw, as the only
	 * callers expected to reach this point without holding the mutex
	 * are sv_enable() and sv_dev_to_sv().
	 * Return an error instead of panicking the system.
	 */
526 if (MUTEX_HELD(&sv_mutex)) {
527 cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
528 return (NULL);
529 }
530
531 /*
532 * Determine where to allocate a new element in the hash table
533 */
534 mutex_enter(&sv_mutex);
535 insert = &(sv_majors[SV_MAJOR_HASH(umaj)]);
536 for (maj = *insert; maj; maj = maj->sm_next) {
537
		/* Did another thread beat us to it? */
		if (maj->sm_major == umaj) {
			mutex_exit(&sv_mutex);
			return (maj);
		}
541
542 /* Find a NULL insert point? */
543 if (maj->sm_next == NULL)
544 insert = &maj->sm_next;
545 }
546
547 /*
548 * Located the new insert point
549 */
550 *insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem);
551 if ((maj = *insert) != 0)
552 maj->sm_major = umaj;
553 else
554 cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
555
556 mutex_exit(&sv_mutex);
557
558 return (maj);
559 }
560
561 /* ARGSUSED */
562
563 static int
sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
565 {
566 int rc = DDI_FAILURE;
567
568 switch (infocmd) {
569
570 case DDI_INFO_DEVT2DEVINFO:
571 *result = sv_dip;
572 rc = DDI_SUCCESS;
573 break;
574
575 case DDI_INFO_DEVT2INSTANCE:
576 /*
577 * We only have a single instance.
578 */
579 *result = 0;
580 rc = DDI_SUCCESS;
581 break;
582
583 default:
584 break;
585 }
586
587 return (rc);
588 }
589
590
/*
 * Hashing of devices onto major device structures.
 *
 * Individual device structures are hashed onto one of the sm_hash[]
 * buckets in the relevant major device structure.
 *
 * Hash insertion and deletion -must- be done with sv_mutex held. Hash
 * searching does not require the mutex because of the sm_seq member.
 * sm_seq is incremented on each insertion (-after- hash chain pointer
 * manipulation) and each deletion (-before- hash chain pointer
 * manipulation). When searching the hash chain, the seq number is
 * checked before accessing each device structure; if the seq number has
 * changed, then we restart the search from the top of the hash chain.
 * If we restart more than SV_HASH_RETRY times, we take sv_mutex and
 * search the hash chain (we are guaranteed that this search cannot be
 * interrupted).
 */
608
609 #define SV_HASH_RETRY 16
610
611 static sv_dev_t *
sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp)
613 {
614 minor_t umin = getminor(dev);
615 sv_dev_t **hb, *next, *svp;
616 sv_maj_t *maj;
617 int seq;
618 int try;
619
620 /* Get major hash table */
621 maj = sv_getmajor(dev);
622 if (majpp)
623 *majpp = maj;
624 if (maj == NULL)
625 return (NULL);
626
627 if (maj->sm_inuse == 0) {
628 DTRACE_PROBE1(
629 sv_dev_to_sv_end,
630 dev_t, dev);
631 return (NULL);
632 }
633
634 hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
635 try = 0;
636
637 retry:
638 if (try > SV_HASH_RETRY)
639 mutex_enter(&sv_mutex);
640
641 seq = maj->sm_seq;
642 for (svp = *hb; svp; svp = next) {
643 next = svp->sv_hash;
644
645 nsc_membar_stld(); /* preserve register load order */
646
647 if (maj->sm_seq != seq) {
648 DTRACE_PROBE1(sv_dev_to_sv_retry, dev_t, dev);
649 try++;
650 goto retry;
651 }
652
653 if (svp->sv_dev == dev)
654 break;
655 }
656
657 if (try > SV_HASH_RETRY)
658 mutex_exit(&sv_mutex);
659
660 return (svp);
661 }
662
663
664 /*
665 * Must be called with sv_mutex held.
666 */
667
668 static int
sv_get_state(const dev_t udev, sv_dev_t **svpp)
670 {
671 sv_dev_t **hb, **insert, *svp;
672 sv_maj_t *maj;
673 minor_t umin;
674 int i;
675
676 /* Get major hash table */
677 if ((maj = sv_getmajor(udev)) == NULL)
678 return (NULL);
679
680 /* Determine which minor hash table */
681 umin = getminor(udev);
682 hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
683
684 /* look for clash */
685
686 insert = hb;
687
688 for (svp = *hb; svp; svp = svp->sv_hash) {
689 if (svp->sv_dev == udev)
690 break;
691
692 if (svp->sv_hash == NULL)
693 insert = &svp->sv_hash;
694 }
695
696 if (svp) {
697 DTRACE_PROBE1(
698 sv_get_state_enabled,
699 dev_t, udev);
700 return (SV_EENABLED);
701 }
702
703 /* look for spare sv_devs slot */
704
705 for (i = 0; i < sv_max_devices; i++) {
706 svp = &sv_devs[i];
707
708 if (svp->sv_state == SV_DISABLE)
709 break;
710 }
711
712 if (i >= sv_max_devices) {
713 DTRACE_PROBE1(
714 sv_get_state_noslots,
715 dev_t, udev);
716 return (SV_ENOSLOTS);
717 }
718
719 svp->sv_state = SV_PENDING;
720 svp->sv_pending = curthread;
721
722 *insert = svp;
723 svp->sv_hash = NULL;
724 maj->sm_seq++; /* must be after the store to the hash chain */
725
726 *svpp = svp;
727
728 /*
729 * We do not know the size of the underlying device at
730 * this stage, so initialise "nblocks" property to
731 * zero, and update it whenever we succeed in
732 * nsc_reserve'ing the underlying nsc_fd_t.
733 */
734
735 svp->sv_nblocks = 0;
736
737 return (0);
738 }
739
740
/*
 * Remove a device structure from its hash chain.
 * Must be called with sv_mutex held.
 */
745
746 static void
sv_rm_hash(sv_dev_t *svp)
748 {
749 sv_dev_t **svpp;
750 sv_maj_t *maj;
751
752 /* Get major hash table */
753 if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
754 return;
755
756 /* remove svp from hash chain */
757
758 svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]);
759 while (*svpp) {
760 if (*svpp == svp) {
761 /*
762 * increment of sm_seq must be before the
763 * removal from the hash chain
764 */
765 maj->sm_seq++;
766 *svpp = svp->sv_hash;
767 break;
768 }
769
770 svpp = &(*svpp)->sv_hash;
771 }
772
773 svp->sv_hash = NULL;
774 }
775
776 /*
777 * Free (disable) a device structure.
778 * Must be called with sv_lock(RW_WRITER) and sv_mutex held, and will
779 * perform the exits during its processing.
780 */
781
782 static int
sv_free(sv_dev_t *svp, const int error)
784 {
785 struct cb_ops *cb_ops;
786 sv_maj_t *maj;
787
788 /* Get major hash table */
789 if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
790 return (NULL);
791
792 svp->sv_state = SV_PENDING;
793 svp->sv_pending = curthread;
794
795 /*
796 * Close the fd's before removing from the hash or swapping
797 * back the cb_ops pointers so that the cache flushes before new
798 * io can come in.
799 */
800
801 if (svp->sv_fd) {
802 (void) nsc_close(svp->sv_fd);
803 svp->sv_fd = 0;
804 }
805
806 sv_rm_hash(svp);
807
808 if (error != SV_ESDOPEN &&
809 error != SV_ELYROPEN && --maj->sm_inuse == 0) {
810
811 if (maj->sm_dev_ops)
812 cb_ops = maj->sm_dev_ops->devo_cb_ops;
813 else
814 cb_ops = NULL;
815
816 if (cb_ops && maj->sm_strategy != NULL) {
817 cb_ops->cb_strategy = maj->sm_strategy;
818 cb_ops->cb_close = maj->sm_close;
819 cb_ops->cb_ioctl = maj->sm_ioctl;
820 cb_ops->cb_write = maj->sm_write;
821 cb_ops->cb_open = maj->sm_open;
822 cb_ops->cb_read = maj->sm_read;
823 cb_ops->cb_flag = maj->sm_flag;
824
825 if (maj->sm_awrite)
826 cb_ops->cb_awrite = maj->sm_awrite;
827
828 if (maj->sm_aread)
829 cb_ops->cb_aread = maj->sm_aread;
830
831 /*
832 * corbin XXX
833 * Leave backing device ops in maj->sm_*
834 * to handle any requests that might come
835 * in during the disable. This could be
836 * a problem however if the backing device
837 * driver is changed while we process these
838 * requests.
839 *
840 * maj->sm_strategy = 0;
841 * maj->sm_awrite = 0;
842 * maj->sm_write = 0;
843 * maj->sm_ioctl = 0;
844 * maj->sm_close = 0;
845 * maj->sm_aread = 0;
846 * maj->sm_read = 0;
847 * maj->sm_open = 0;
848 * maj->sm_flag = 0;
849 *
850 */
851 }
852
853 if (maj->sm_dev_ops) {
854 maj->sm_dev_ops = 0;
855 }
856 }
857
858 if (svp->sv_lh) {
859 cred_t *crp = ddi_get_cred();
860
861 /*
862 * Close the protective layered driver open using the
863 * Sun Private layered driver i/f.
864 */
865
866 (void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp);
867 svp->sv_lh = NULL;
868 }
869
870 svp->sv_timestamp = nsc_lbolt();
871 svp->sv_state = SV_DISABLE;
872 svp->sv_pending = NULL;
873 rw_exit(&svp->sv_lock);
874 mutex_exit(&sv_mutex);
875
876 return (error);
877 }
878
879 /*
880 * Reserve the device, taking into account the possibility that
881 * the reserve might have to be retried.
882 */
883 static int
sv_reserve(nsc_fd_t *fd, int flags)
885 {
886 int eintr_count;
887 int rc;
888
889 eintr_count = 0;
890 do {
891 rc = nsc_reserve(fd, flags);
892 if (rc == EINTR) {
893 ++eintr_count;
894 delay(2);
895 }
896 } while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT));
897
898 return (rc);
899 }
900
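/*
 * Enable SV on the device identified by "udev": claim a slot in
 * sv_devs[], open the underlying volume through nsctl, take a layered
 * (ldi) hold on the device, and interpose the sv_lyr_* entry points
 * into the underlying driver's cb_ops when this is the first enabled
 * device for that major number.
 */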
901 static int
sv_enable(const caddr_t path, const int flag,
    const dev_t udev, spcs_s_info_t kstatus)
904 {
905 struct dev_ops *dev_ops;
906 struct cb_ops *cb_ops;
907 sv_dev_t *svp;
908 sv_maj_t *maj;
909 nsc_size_t nblocks;
910 int rc;
911 cred_t *crp;
912 ldi_ident_t li;
913
914 if (udev == (dev_t)-1 || udev == 0) {
915 DTRACE_PROBE1(
916 sv_enable_err_baddev,
917 dev_t, udev);
918 return (SV_EBADDEV);
919 }
920
921 if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) {
922 DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev);
923 return (SV_EAMODE);
924 }
925
926 /* Get major hash table */
927 if ((maj = sv_getmajor(udev)) == NULL)
928 return (SV_EBADDEV);
929
930 mutex_enter(&sv_mutex);
931
932 rc = sv_get_state(udev, &svp);
933 if (rc) {
934 mutex_exit(&sv_mutex);
935 DTRACE_PROBE1(sv_enable_err_state, dev_t, udev);
936 return (rc);
937 }
938
939 rw_enter(&svp->sv_lock, RW_WRITER);
940
941 /*
942 * Get real fd used for io
943 */
944
945 svp->sv_dev = udev;
946 svp->sv_flag = flag;
947
948 /*
949 * OR in NSC_DEVICE to ensure that nskern grabs the real strategy
950 * function pointer before sv swaps them out.
951 */
952
953 svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE),
954 sv_fd_def, (blind_t)udev, &rc);
955
956 if (svp->sv_fd == NULL) {
957 if (kstatus)
958 spcs_s_add(kstatus, rc);
959 DTRACE_PROBE1(sv_enable_err_fd, dev_t, udev);
960 return (sv_free(svp, SV_ESDOPEN));
961 }
962
963 /*
964 * Perform a layered driver open using the Sun Private layered
965 * driver i/f to ensure that the cb_ops structure for the driver
966 * is not detached out from under us whilst sv is enabled.
967 *
968 */
969
970 crp = ddi_get_cred();
971 svp->sv_lh = NULL;
972
973 if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) {
974 rc = ldi_open_by_dev(&svp->sv_dev,
975 OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li);
976 }
977
978 if (rc != 0) {
979 if (kstatus)
980 spcs_s_add(kstatus, rc);
981 DTRACE_PROBE1(sv_enable_err_lyr_open, dev_t, udev);
982 return (sv_free(svp, SV_ELYROPEN));
983 }
984
985 /*
986 * Do layering if required - must happen after nsc_open().
987 */
988
989 if (maj->sm_inuse++ == 0) {
990 maj->sm_dev_ops = nsc_get_devops(getmajor(udev));
991
992 if (maj->sm_dev_ops == NULL ||
993 maj->sm_dev_ops->devo_cb_ops == NULL) {
994 DTRACE_PROBE1(sv_enable_err_load, dev_t, udev);
995 return (sv_free(svp, SV_ELOAD));
996 }
997
998 dev_ops = maj->sm_dev_ops;
999 cb_ops = dev_ops->devo_cb_ops;
1000
1001 if (cb_ops->cb_strategy == NULL ||
1002 cb_ops->cb_strategy == nodev ||
1003 cb_ops->cb_strategy == nulldev) {
1004 DTRACE_PROBE1(sv_enable_err_nostrategy, dev_t, udev);
1005 return (sv_free(svp, SV_ELOAD));
1006 }
1007
1008 if (cb_ops->cb_strategy == sv_lyr_strategy) {
1009 DTRACE_PROBE1(sv_enable_err_svstrategy, dev_t, udev);
1010 return (sv_free(svp, SV_ESTRATEGY));
1011 }
1012
1013 maj->sm_strategy = cb_ops->cb_strategy;
1014 maj->sm_close = cb_ops->cb_close;
1015 maj->sm_ioctl = cb_ops->cb_ioctl;
1016 maj->sm_write = cb_ops->cb_write;
1017 maj->sm_open = cb_ops->cb_open;
1018 maj->sm_read = cb_ops->cb_read;
1019 maj->sm_flag = cb_ops->cb_flag;
1020
1021 cb_ops->cb_flag = cb_ops->cb_flag | D_MP;
1022 cb_ops->cb_strategy = sv_lyr_strategy;
1023 cb_ops->cb_close = sv_lyr_close;
1024 cb_ops->cb_ioctl = sv_lyr_ioctl;
1025 cb_ops->cb_write = sv_lyr_write;
1026 cb_ops->cb_open = sv_lyr_open;
1027 cb_ops->cb_read = sv_lyr_read;
1028
1029 /*
1030 * Check that the driver has async I/O entry points
1031 * before changing them.
1032 */
1033
1034 if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) {
1035 maj->sm_awrite = 0;
1036 maj->sm_aread = 0;
1037 } else {
1038 maj->sm_awrite = cb_ops->cb_awrite;
1039 maj->sm_aread = cb_ops->cb_aread;
1040
1041 cb_ops->cb_awrite = sv_lyr_awrite;
1042 cb_ops->cb_aread = sv_lyr_aread;
1043 }
1044
1045 /*
1046 * Bug 4645743
1047 *
1048 * Prevent sv from ever unloading after it has interposed
1049 * on a major device because there is a race between
1050 * sv removing its layered entry points from the target
1051 * dev_ops, a client coming in and accessing the driver,
1052 * and the kernel modunloading the sv text.
1053 *
1054 * To allow unload, do svboot -u, which only happens in
1055 * pkgrm time.
1056 */
1057 ASSERT(MUTEX_HELD(&sv_mutex));
1058 sv_mod_status = SV_PREVENT_UNLOAD;
1059 }
1060
1061
1062 svp->sv_timestamp = nsc_lbolt();
1063 svp->sv_state = SV_ENABLE;
1064 svp->sv_pending = NULL;
1065 rw_exit(&svp->sv_lock);
1066
1067 sv_ndevices++;
1068 mutex_exit(&sv_mutex);
1069
1070 nblocks = 0;
1071 if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) {
1072 nblocks = svp->sv_nblocks;
1073 nsc_release(svp->sv_fd);
1074 }
1075
1076 cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n",
1077 svp->sv_dev, nblocks);
1078
1079 return (0);
1080 }
1081
1082
1083 static int
sv_prepare_unload()
1085 {
1086 int rc = 0;
1087
1088 mutex_enter(&sv_mutex);
1089
1090 if (sv_mod_status == SV_PREVENT_UNLOAD) {
1091 if ((sv_ndevices != 0) || (sv_tset != NULL)) {
1092 rc = EBUSY;
1093 } else {
1094 sv_mod_status = SV_ALLOW_UNLOAD;
1095 delay(SV_WAIT_UNLOAD * drv_usectohz(1000000));
1096 }
1097 }
1098
1099 mutex_exit(&sv_mutex);
1100 return (rc);
1101 }
1102
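/*
 * nsctl "Attach" callback (see sv_fd_def): refresh the cached
 * partition size (sv_nblocks) and maximum transfer size (sv_maxfbas)
 * each time the underlying nsc_fd_t is attached.
 */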
1103 static int
svattach_fd(blind_t arg)
1105 {
1106 dev_t dev = (dev_t)arg;
1107 sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1108 int rc;
1109
1110 if (sv_debug > 0)
1111 cmn_err(CE_CONT, "!svattach_fd(%p, %p)\n", arg, (void *)svp);
1112
1113 if (svp == NULL) {
1114 cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg);
1115 return (0);
1116 }
1117
1118 if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) {
1119 cmn_err(CE_WARN,
1120 "!svattach_fd: nsc_partsize() failed, rc %d", rc);
1121 svp->sv_nblocks = 0;
1122 }
1123
1124 if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) {
1125 cmn_err(CE_WARN,
1126 "!svattach_fd: nsc_maxfbas() failed, rc %d", rc);
1127 svp->sv_maxfbas = 0;
1128 }
1129
1130 if (sv_debug > 0) {
1131 cmn_err(CE_CONT,
1132 "!svattach_fd(%p): size %" NSC_SZFMT ", "
1133 "maxfbas %" NSC_SZFMT "\n",
1134 arg, svp->sv_nblocks, svp->sv_maxfbas);
1135 }
1136
1137 return (0);
1138 }
1139
1140
1141 static int
svdetach_fd(blind_t arg)
1143 {
1144 dev_t dev = (dev_t)arg;
1145 sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1146
1147 if (sv_debug > 0)
1148 cmn_err(CE_CONT, "!svdetach_fd(%p, %p)\n", arg, (void *)svp);
1149
1150 /* svp can be NULL during disable of an sv */
1151 if (svp == NULL)
1152 return (0);
1153
1154 svp->sv_maxfbas = 0;
1155 svp->sv_nblocks = 0;
1156 return (0);
1157 }
1158
1159
/*
 * Disable SV on a device. Takes sv_mutex and the device's
 * sv_lock(RW_WRITER); both are released (by sv_free() on success, or
 * directly on the error paths) before returning.
 */
1164
1165 /* ARGSUSED */
1166 static int
sv_disable(dev_t dev, spcs_s_info_t kstatus)
1168 {
1169 sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1170
1171 if (svp == NULL) {
1172
1173 DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp);
1174 return (SV_ENODEV);
1175 }
1176
1177 mutex_enter(&sv_mutex);
1178 rw_enter(&svp->sv_lock, RW_WRITER);
1179
1180 if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) {
1181 rw_exit(&svp->sv_lock);
1182 mutex_exit(&sv_mutex);
1183
1184 DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp);
1185 return (SV_EDISABLED);
1186 }
1187
1188
1189 sv_ndevices--;
1190 return (sv_free(svp, 0));
1191 }
1192
1193
1194
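/*
 * Layered open. The open is passed through to the underlying driver;
 * if that fails but the device is SV enabled and its data can still be
 * read through nsctl, a successful open is faked (typically because
 * the volume is RDC'd and the data comes from the secondary node).
 */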
1195 static int
sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp)
1197 {
1198 nsc_buf_t *tmph;
1199 sv_dev_t *svp;
1200 sv_maj_t *maj;
1201 int (*fn)();
1202 dev_t odev;
1203 int ret;
1204 int rc;
1205
1206 svp = sv_dev_to_sv(*devp, &maj);
1207
1208 if (svp) {
1209 if (svp->sv_state == SV_PENDING &&
1210 svp->sv_pending == curthread) {
1211 /*
1212 * This is a recursive open from a call to
1213 * ddi_lyr_open_by_devt and so we just want
1214 * to pass it straight through to the
1215 * underlying driver.
1216 */
1217 DTRACE_PROBE2(sv_lyr_open_recursive,
1218 sv_dev_t *, svp,
1219 dev_t, *devp);
1220 svp = NULL;
1221 } else
1222 rw_enter(&svp->sv_lock, RW_READER);
1223 }
1224
1225 odev = *devp;
1226
1227 if (maj && (fn = maj->sm_open) != 0) {
1228 if (!(maj->sm_flag & D_MP)) {
1229 UNSAFE_ENTER();
1230 ret = (*fn)(devp, flag, otyp, crp);
1231 UNSAFE_EXIT();
1232 } else {
1233 ret = (*fn)(devp, flag, otyp, crp);
1234 }
1235
1236 if (ret == 0) {
1237 /*
1238 * Re-acquire svp if the driver changed *devp.
1239 */
1240
1241 if (*devp != odev) {
1242 if (svp != NULL)
1243 rw_exit(&svp->sv_lock);
1244
1245 svp = sv_dev_to_sv(*devp, NULL);
1246
1247 if (svp) {
1248 rw_enter(&svp->sv_lock, RW_READER);
1249 }
1250 }
1251 }
1252 } else {
1253 ret = ENODEV;
1254 }
1255
1256 if (svp && ret != 0 && svp->sv_state == SV_ENABLE) {
1257 /*
1258 * Underlying DDI open failed, but we have this
1259 * device SV enabled. If we can read some data
1260 * from the device, fake a successful open (this
1261 * probably means that this device is RDC'd and we
1262 * are getting the data from the secondary node).
1263 *
1264 * The reserve must be done with NSC_TRY|NSC_NOWAIT to
1265 * ensure that it does not deadlock if this open is
1266 * coming from nskernd:get_bsize().
1267 */
1268 rc = sv_reserve(svp->sv_fd,
1269 NSC_TRY | NSC_NOWAIT | NSC_MULTI | NSC_PCATCH);
1270 if (rc == 0) {
1271 tmph = NULL;
1272
1273 rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph);
1274 if (rc <= 0) {
1275 /* success */
1276 ret = 0;
1277 }
1278
1279 if (tmph) {
1280 (void) nsc_free_buf(tmph);
1281 tmph = NULL;
1282 }
1283
1284 nsc_release(svp->sv_fd);
1285
1286 /*
1287 * Count the number of layered opens that we
1288 * fake since we have to fake a matching number
1289 * of closes (OTYP_LYR open/close calls must be
1290 * paired).
1291 */
1292
1293 if (ret == 0 && otyp == OTYP_LYR) {
1294 mutex_enter(&svp->sv_olock);
1295 svp->sv_openlcnt++;
1296 mutex_exit(&svp->sv_olock);
1297 }
1298 }
1299 }
1300
1301 if (svp) {
1302 rw_exit(&svp->sv_lock);
1303 }
1304
1305 return (ret);
1306 }
1307
1308
1309 static int
sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp)
1311 {
1312 sv_dev_t *svp;
1313 sv_maj_t *maj;
1314 int (*fn)();
1315 int ret;
1316
1317 svp = sv_dev_to_sv(dev, &maj);
1318
1319 if (svp &&
1320 svp->sv_state == SV_PENDING &&
1321 svp->sv_pending == curthread) {
1322 /*
1323 * This is a recursive open from a call to
1324 * ddi_lyr_close and so we just want
1325 * to pass it straight through to the
1326 * underlying driver.
1327 */
1328 DTRACE_PROBE2(sv_lyr_close_recursive, sv_dev_t *, svp,
1329 dev_t, dev);
1330 svp = NULL;
1331 }
1332
1333 if (svp) {
1334 rw_enter(&svp->sv_lock, RW_READER);
1335
1336 if (otyp == OTYP_LYR) {
1337 mutex_enter(&svp->sv_olock);
1338
1339 if (svp->sv_openlcnt) {
1340 /*
1341 * Consume sufficient layered closes to
1342 * account for the opens that we faked
1343 * whilst the device was failed.
1344 */
1345 svp->sv_openlcnt--;
1346 mutex_exit(&svp->sv_olock);
1347 rw_exit(&svp->sv_lock);
1348
1349 DTRACE_PROBE1(sv_lyr_close_end, dev_t, dev);
1350
1351 return (0);
1352 }
1353
1354 mutex_exit(&svp->sv_olock);
1355 }
1356 }
1357
1358 if (maj && (fn = maj->sm_close) != 0) {
1359 if (!(maj->sm_flag & D_MP)) {
1360 UNSAFE_ENTER();
1361 ret = (*fn)(dev, flag, otyp, crp);
1362 UNSAFE_EXIT();
1363 } else {
1364 ret = (*fn)(dev, flag, otyp, crp);
1365 }
1366 } else {
1367 ret = ENODEV;
1368 }
1369
1370 if (svp) {
1371 rw_exit(&svp->sv_lock);
1372 }
1373
1374 return (ret);
1375 }
1376
1377
1378 /*
1379 * Convert the specified dev_t into a locked and enabled sv_dev_t, or
1380 * return NULL.
1381 */
1382 static sv_dev_t *
sv_find_enabled(const dev_t dev, sv_maj_t **majpp)
1384 {
1385 sv_dev_t *svp;
1386
1387 while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) {
1388 rw_enter(&svp->sv_lock, RW_READER);
1389
1390 if (svp->sv_state == SV_ENABLE) {
1391 /* locked and enabled */
1392 break;
1393 }
1394
1395 /*
1396 * State was changed while waiting on the lock.
1397 * Wait for a stable state.
1398 */
1399 rw_exit(&svp->sv_lock);
1400
1401 DTRACE_PROBE1(sv_find_enabled_retry, dev_t, dev);
1402
1403 delay(2);
1404 }
1405
1406 return (svp);
1407 }
1408
1409
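/*
 * Common layered read/write path. If the device is not SV enabled the
 * saved entry point of the underlying driver is called; otherwise the
 * nsctl file descriptor is reserved and the transfer is performed with
 * nsc_uread()/nsc_uwrite().
 */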
1410 static int
sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw)
1412 {
1413 sv_dev_t *svp;
1414 sv_maj_t *maj;
1415 int (*fn)();
1416 int rc;
1417
1418 svp = sv_find_enabled(dev, &maj);
1419 if (svp == NULL) {
1420 if (maj) {
1421 if (rw == NSC_READ)
1422 fn = maj->sm_read;
1423 else
1424 fn = maj->sm_write;
1425
			if (fn != 0) {
				if (!(maj->sm_flag & D_MP)) {
					UNSAFE_ENTER();
					rc = (*fn)(dev, uiop, crp);
					UNSAFE_EXIT();
				} else {
					rc = (*fn)(dev, uiop, crp);
				}
			} else {
				rc = ENODEV;
			}

			return (rc);
1437 } else {
1438 return (ENODEV);
1439 }
1440 }
1441
1442 ASSERT(RW_READ_HELD(&svp->sv_lock));
1443
1444 if (svp->sv_flag == 0) {
1445 /*
1446 * guard access mode
1447 * - prevent user level access to the device
1448 */
1449 DTRACE_PROBE1(sv_lyr_uio_err_guard, uio_t *, uiop);
1450 rc = EPERM;
1451 goto out;
1452 }
1453
1454 if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
1455 DTRACE_PROBE1(sv_lyr_uio_err_rsrv, uio_t *, uiop);
1456 goto out;
1457 }
1458
1459 if (rw == NSC_READ)
1460 rc = nsc_uread(svp->sv_fd, uiop, crp);
1461 else
1462 rc = nsc_uwrite(svp->sv_fd, uiop, crp);
1463
1464 nsc_release(svp->sv_fd);
1465
1466 out:
1467 rw_exit(&svp->sv_lock);
1468
1469 return (rc);
1470 }
1471
1472
1473 static int
sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp)
1475 {
1476 return (sv_lyr_uio(dev, uiop, crp, NSC_READ));
1477 }
1478
1479
1480 static int
sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp)
1482 {
1483 return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE));
1484 }
1485
1486
1487 /* ARGSUSED */
1488
1489 static int
sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp)
1491 {
1492 return (aphysio(sv_lyr_strategy,
1493 anocancel, dev, B_READ, minphys, aio));
1494 }
1495
1496
1497 /* ARGSUSED */
1498
1499 static int
sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp)
1501 {
1502 return (aphysio(sv_lyr_strategy,
1503 anocancel, dev, B_WRITE, minphys, aio));
1504 }
1505
1506
/*
 * Set up an array containing the list of raw path names.
 * The array for the paths is svn and the size of the array is
 * in size.
 *
 * If there are more layered devices than will fit in the array,
 * the number of extra layered devices is returned via *extra.
 * Otherwise zero is returned in *extra.
 *
 * Input:
 *	svn	: array for paths
 *	size	: size of the array
 *
 * Output (extra):
 *	zero	: all paths fit in the array
 *	>0	: number of layered devices that did not fit in the array
 */
1524
1525 static int
sv_list(void *ptr, const int size, int *extra, const int ilp32)
1527 {
1528 sv_name32_t *svn32;
1529 sv_name_t *svn;
1530 sv_dev_t *svp;
1531 int *mode, *nblocks;
1532 int i, index;
1533 char *path;
1534
1535 *extra = 0;
1536 index = 0;
1537
1538 if (ilp32)
1539 svn32 = ptr;
1540 else
1541 svn = ptr;
1542
1543 mutex_enter(&sv_mutex);
1544 for (i = 0; i < sv_max_devices; i++) {
1545 svp = &sv_devs[i];
1546
1547 rw_enter(&svp->sv_lock, RW_READER);
1548
1549 if (svp->sv_state != SV_ENABLE) {
1550 rw_exit(&svp->sv_lock);
1551 continue;
1552 }
1553
1554 if ((*extra) != 0 || ptr == NULL) {
1555 /* Another overflow entry */
1556 rw_exit(&svp->sv_lock);
1557 (*extra)++;
1558 continue;
1559 }
1560
1561 if (ilp32) {
1562 nblocks = &svn32->svn_nblocks;
1563 mode = &svn32->svn_mode;
1564 path = svn32->svn_path;
1565
1566 svn32->svn_timestamp = (uint32_t)svp->sv_timestamp;
1567 svn32++;
1568 } else {
1569 nblocks = &svn->svn_nblocks;
1570 mode = &svn->svn_mode;
1571 path = svn->svn_path;
1572
1573 svn->svn_timestamp = svp->sv_timestamp;
1574 svn++;
1575 }
1576
1577 (void) strcpy(path, nsc_pathname(svp->sv_fd));
1578 *nblocks = svp->sv_nblocks;
1579 *mode = svp->sv_flag;
1580
1581 if (*nblocks == 0) {
1582 if (sv_debug > 3)
1583 cmn_err(CE_CONT, "!sv_list: need to reserve\n");
1584
1585 if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
1586 *nblocks = svp->sv_nblocks;
1587 nsc_release(svp->sv_fd);
1588 }
1589 }
1590
1591 if (++index >= size) {
1592 /* Out of space */
1593 (*extra)++;
1594 }
1595
1596 rw_exit(&svp->sv_lock);
1597 }
1598 mutex_exit(&sv_mutex);
1599
1600 if (index < size) {
1601 /* NULL terminated list */
1602 if (ilp32)
1603 svn32->svn_path[0] = '\0';
1604 else
1605 svn->svn_path[0] = '\0';
1606 }
1607
1608 return (0);
1609 }
1610
1611
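/*
 * Adjust the size of the shared threadset. "threads" is the change in
 * the number of threads needed (positive on enable, negative on
 * disable). The set grows and shrinks in sv_threads_inc steps, never
 * grows beyond sv_threads_max, and keeps sv_threads_hysteresis spare
 * threads to avoid resizing thrash. Must be called with sv_mutex held.
 */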
1612 static void
sv_thread_tune(int threads)
1614 {
1615 int incr = (threads > 0) ? 1 : -1;
1616 int change = 0;
1617 int nthreads;
1618
1619 ASSERT(MUTEX_HELD(&sv_mutex));
1620
1621 if (sv_threads_extra) {
1622 /* keep track of any additional threads requested */
1623 if (threads > 0) {
1624 sv_threads_extra += threads;
1625 return;
1626 }
1627 threads = -threads;
1628 if (threads >= sv_threads_extra) {
1629 threads -= sv_threads_extra;
1630 sv_threads_extra = 0;
1631 /* fall through to while loop */
1632 } else {
1633 sv_threads_extra -= threads;
1634 return;
1635 }
1636 } else if (threads > 0) {
1637 /*
1638 * do not increase the number of threads beyond
1639 * sv_threads_max when doing dynamic thread tuning
1640 */
1641 nthreads = nst_nthread(sv_tset);
1642 if ((nthreads + threads) > sv_threads_max) {
1643 sv_threads_extra = nthreads + threads - sv_threads_max;
1644 threads = sv_threads_max - nthreads;
1645 if (threads <= 0)
1646 return;
1647 }
1648 }
1649
1650 if (threads < 0)
1651 threads = -threads;
1652
1653 while (threads--) {
1654 nthreads = nst_nthread(sv_tset);
1655 sv_threads_needed += incr;
1656
1657 if (sv_threads_needed >= nthreads)
1658 change += nst_add_thread(sv_tset, sv_threads_inc);
1659 else if ((sv_threads_needed <
1660 (nthreads - (sv_threads_inc + sv_threads_hysteresis))) &&
1661 ((nthreads - sv_threads_inc) >= sv_threads))
1662 change -= nst_del_thread(sv_tset, sv_threads_inc);
1663 }
1664
1665 #ifdef DEBUG
1666 if (change) {
1667 cmn_err(CE_NOTE,
1668 "!sv_thread_tune: threads needed %d, nthreads %d, "
1669 "nthreads change %d",
1670 sv_threads_needed, nst_nthread(sv_tset), change);
1671 }
1672 #endif
1673 }
1674
1675
1676 /* ARGSUSED */
1677 static int
svopen(dev_t *devp, int flag, int otyp, cred_t *crp)
1679 {
1680 int rc;
1681
1682 mutex_enter(&sv_mutex);
1683 rc = sv_init_devs();
1684 mutex_exit(&sv_mutex);
1685
1686 return (rc);
1687 }
1688
1689
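/*
 * Close of the sv control device. If no devices are SV enabled, give
 * the worker threads up to five seconds to go idle and then destroy
 * the threadset; if they are still active after that, leak the set and
 * log a warning.
 */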
1690 /* ARGSUSED */
1691 static int
svclose(dev_t dev, int flag, int otyp, cred_t *crp)
1693 {
1694 const int secs = HZ * 5;
1695 const int ticks = HZ / 10;
1696 int loops = secs / ticks;
1697
1698 mutex_enter(&sv_mutex);
1699 while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) {
1700 if (nst_nlive(sv_tset) <= 0) {
1701 nst_destroy(sv_tset);
1702 sv_tset = NULL;
1703 break;
1704 }
1705
1706 /* threads still active - wait for them to exit */
1707 mutex_exit(&sv_mutex);
1708 delay(ticks);
1709 loops--;
1710 mutex_enter(&sv_mutex);
1711 }
1712 mutex_exit(&sv_mutex);
1713
1714 if (loops <= 0) {
1715 cmn_err(CE_WARN,
1716 #ifndef DEBUG
1717 /* do not write to console when non-DEBUG */
1718 "!"
1719 #endif
1720 "sv:svclose: threads still active "
1721 "after %d sec - leaking thread set", secs);
1722 }
1723
1724 return (0);
1725 }
1726
1727
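/*
 * ioctl handler for the sv control device: enable/disable of
 * individual volumes (SVIOC_ENABLE/SVIOC_DISABLE), listing the current
 * configuration (SVIOC_LIST), version reporting (SVIOC_VERSION) and
 * preparing the module for unload (SVIOC_UNLOAD). User structures are
 * converted for ILP32 callers where necessary.
 */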
1728 static int
svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp)
1730 {
1731 char itmp1[12], itmp2[12]; /* temp char array for editing ints */
1732 spcs_s_info_t kstatus; /* Kernel version of spcs status */
1733 spcs_s_info_t ustatus; /* Address of user version of spcs status */
1734 sv_list32_t svl32; /* 32 bit Initial structure for SVIOC_LIST */
1735 sv_version_t svv; /* Version structure */
1736 sv_conf_t svc; /* User config structure */
1737 sv_list_t svl; /* Initial structure for SVIOC_LIST */
1738 void *usvn; /* Address of user sv_name_t */
1739 void *svn = NULL; /* Array for SVIOC_LIST */
1740 uint64_t phash; /* pathname hash */
1741 int rc = 0; /* Return code -- errno */
1742 int size; /* Number of items in array */
1743 int bytes; /* Byte size of array */
1744 int ilp32; /* Convert data structures for ilp32 userland */
1745
1746 *rvalp = 0;
1747
	/*
	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, carry on as normal.
	 * Otherwise it was previously SV_PREVENT_UNLOAD and is now
	 * SV_ALLOW_UNLOAD, meaning the driver is expected to unload soon,
	 * so reject the ioctl.
	 *
	 * SV_ALLOW_UNLOAD is a final state, so there is no need to grab
	 * sv_mutex.
	 */
1755 if (sv_mod_status == SV_ALLOW_UNLOAD) {
1756 return (EBUSY);
1757 }
1758
1759 if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0))
1760 return (rc);
1761
1762 kstatus = spcs_s_kcreate();
1763 if (!kstatus) {
1764 DTRACE_PROBE1(sv_ioctl_err_kcreate, dev_t, dev);
1765 return (ENOMEM);
1766 }
1767
1768 ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
1769
1770 switch (cmd) {
1771
1772 case SVIOC_ENABLE:
1773
1774 if (ilp32) {
1775 sv_conf32_t svc32;
1776
1777 if (ddi_copyin((void *)arg, &svc32,
1778 sizeof (svc32), mode) < 0) {
1779 spcs_s_kfree(kstatus);
1780 return (EFAULT);
1781 }
1782
1783 svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1784 (void) strcpy(svc.svc_path, svc32.svc_path);
1785 svc.svc_flag = svc32.svc_flag;
1786 svc.svc_major = svc32.svc_major;
1787 svc.svc_minor = svc32.svc_minor;
1788 } else {
1789 if (ddi_copyin((void *)arg, &svc,
1790 sizeof (svc), mode) < 0) {
1791 spcs_s_kfree(kstatus);
1792 return (EFAULT);
1793 }
1794 }
1795
1796 /* force to raw access */
1797 svc.svc_flag = NSC_DEVICE;
1798
1799 if (sv_tset == NULL) {
1800 mutex_enter(&sv_mutex);
1801
1802 if (sv_tset == NULL) {
1803 sv_tset = nst_init("sv_thr", sv_threads);
1804 }
1805
1806 mutex_exit(&sv_mutex);
1807
1808 if (sv_tset == NULL) {
1809 cmn_err(CE_WARN,
1810 "!sv: could not allocate %d threads",
1811 sv_threads);
1812 }
1813 }
1814
1815 rc = sv_enable(svc.svc_path, svc.svc_flag,
1816 makedevice(svc.svc_major, svc.svc_minor), kstatus);
1817
1818 if (rc == 0) {
1819 sv_config_time = nsc_lbolt();
1820
1821 mutex_enter(&sv_mutex);
1822 sv_thread_tune(sv_threads_dev);
1823 mutex_exit(&sv_mutex);
1824 }
1825
1826 DTRACE_PROBE3(sv_ioctl_end, dev_t, dev, int, *rvalp, int, rc);
1827
1828 return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1829 /* NOTREACHED */
1830
1831 case SVIOC_DISABLE:
1832
1833 if (ilp32) {
1834 sv_conf32_t svc32;
1835
1836 if (ddi_copyin((void *)arg, &svc32,
1837 sizeof (svc32), mode) < 0) {
1838 spcs_s_kfree(kstatus);
1839 return (EFAULT);
1840 }
1841
1842 svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1843 svc.svc_major = svc32.svc_major;
1844 svc.svc_minor = svc32.svc_minor;
1845 (void) strcpy(svc.svc_path, svc32.svc_path);
1846 svc.svc_flag = svc32.svc_flag;
1847 } else {
1848 if (ddi_copyin((void *)arg, &svc,
1849 sizeof (svc), mode) < 0) {
1850 spcs_s_kfree(kstatus);
1851 return (EFAULT);
1852 }
1853 }
1854
1855 if (svc.svc_major == (major_t)-1 &&
1856 svc.svc_minor == (minor_t)-1) {
1857 sv_dev_t *svp;
1858 int i;
1859
1860 /*
1861 * User level could not find the minor device
1862 * node, so do this the slow way by searching
1863 * the entire sv config for a matching pathname.
1864 */
1865
1866 phash = nsc_strhash(svc.svc_path);
1867
1868 mutex_enter(&sv_mutex);
1869
1870 for (i = 0; i < sv_max_devices; i++) {
1871 svp = &sv_devs[i];
1872
1873 if (svp->sv_state == SV_DISABLE ||
1874 svp->sv_fd == NULL)
1875 continue;
1876
1877 if (nsc_fdpathcmp(svp->sv_fd, phash,
1878 svc.svc_path) == 0) {
1879 svc.svc_major = getmajor(svp->sv_dev);
1880 svc.svc_minor = getminor(svp->sv_dev);
1881 break;
1882 }
1883 }
1884
1885 mutex_exit(&sv_mutex);
1886
1887 if (svc.svc_major == (major_t)-1 &&
1888 svc.svc_minor == (minor_t)-1)
1889 return (spcs_s_ocopyoutf(&kstatus,
1890 svc.svc_error, SV_ENODEV));
1891 }
1892
1893 rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor),
1894 kstatus);
1895
1896 if (rc == 0) {
1897 sv_config_time = nsc_lbolt();
1898
1899 mutex_enter(&sv_mutex);
1900 sv_thread_tune(-sv_threads_dev);
1901 mutex_exit(&sv_mutex);
1902 }
1903
1904 DTRACE_PROBE3(sv_ioctl_2, dev_t, dev, int, *rvalp, int, rc);
1905
1906 return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1907 /* NOTREACHED */
1908
1909 case SVIOC_LIST:
1910
1911 if (ilp32) {
1912 if (ddi_copyin((void *)arg, &svl32,
1913 sizeof (svl32), mode) < 0) {
1914 spcs_s_kfree(kstatus);
1915 return (EFAULT);
1916 }
1917
1918 ustatus = (spcs_s_info_t)svl32.svl_error;
1919 size = svl32.svl_count;
1920 usvn = (void *)(unsigned long)svl32.svl_names;
1921 } else {
1922 if (ddi_copyin((void *)arg, &svl,
1923 sizeof (svl), mode) < 0) {
1924 spcs_s_kfree(kstatus);
1925 return (EFAULT);
1926 }
1927
1928 ustatus = svl.svl_error;
1929 size = svl.svl_count;
1930 usvn = svl.svl_names;
1931 }
1932
1933 /* Do some boundary checking */
1934 if ((size < 0) || (size > sv_max_devices)) {
1935 /* Array size is out of range */
1936 return (spcs_s_ocopyoutf(&kstatus, ustatus,
1937 SV_EARRBOUNDS, "0",
1938 spcs_s_inttostring(sv_max_devices, itmp1,
1939 sizeof (itmp1), 0),
1940 spcs_s_inttostring(size, itmp2,
1941 sizeof (itmp2), 0)));
1942 }
1943
1944 if (ilp32)
1945 bytes = size * sizeof (sv_name32_t);
1946 else
1947 bytes = size * sizeof (sv_name_t);
1948
1949 /* Allocate memory for the array of structures */
1950 if (bytes != 0) {
1951 svn = kmem_zalloc(bytes, KM_SLEEP);
1952 if (!svn) {
1953 return (spcs_s_ocopyoutf(&kstatus,
1954 ustatus, ENOMEM));
1955 }
1956 }
1957
1958 rc = sv_list(svn, size, rvalp, ilp32);
1959 if (rc) {
1960 if (svn != NULL)
1961 kmem_free(svn, bytes);
1962 return (spcs_s_ocopyoutf(&kstatus, ustatus, rc));
1963 }
1964
1965 if (ilp32) {
1966 svl32.svl_timestamp = (uint32_t)sv_config_time;
1967 svl32.svl_maxdevs = (int32_t)sv_max_devices;
1968
1969 /* Return the list structure */
1970 if (ddi_copyout(&svl32, (void *)arg,
1971 sizeof (svl32), mode) < 0) {
1972 spcs_s_kfree(kstatus);
1973 if (svn != NULL)
1974 kmem_free(svn, bytes);
1975 return (EFAULT);
1976 }
1977 } else {
1978 svl.svl_timestamp = sv_config_time;
1979 svl.svl_maxdevs = sv_max_devices;
1980
1981 /* Return the list structure */
1982 if (ddi_copyout(&svl, (void *)arg,
1983 sizeof (svl), mode) < 0) {
1984 spcs_s_kfree(kstatus);
1985 if (svn != NULL)
1986 kmem_free(svn, bytes);
1987 return (EFAULT);
1988 }
1989 }
1990
1991 /* Return the array */
1992 if (svn != NULL) {
1993 if (ddi_copyout(svn, usvn, bytes, mode) < 0) {
1994 kmem_free(svn, bytes);
1995 spcs_s_kfree(kstatus);
1996 return (EFAULT);
1997 }
1998 kmem_free(svn, bytes);
1999 }
2000
2001 DTRACE_PROBE3(sv_ioctl_3, dev_t, dev, int, *rvalp, int, 0);
2002
2003 return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2004 /* NOTREACHED */
2005
2006 case SVIOC_VERSION:
2007
2008 if (ilp32) {
2009 sv_version32_t svv32;
2010
2011 if (ddi_copyin((void *)arg, &svv32,
2012 sizeof (svv32), mode) < 0) {
2013 spcs_s_kfree(kstatus);
2014 return (EFAULT);
2015 }
2016
2017 svv32.svv_major_rev = sv_major_rev;
2018 svv32.svv_minor_rev = sv_minor_rev;
2019 svv32.svv_micro_rev = sv_micro_rev;
2020 svv32.svv_baseline_rev = sv_baseline_rev;
2021
2022 if (ddi_copyout(&svv32, (void *)arg,
2023 sizeof (svv32), mode) < 0) {
2024 spcs_s_kfree(kstatus);
2025 return (EFAULT);
2026 }
2027
2028 ustatus = (spcs_s_info_t)svv32.svv_error;
2029 } else {
2030 if (ddi_copyin((void *)arg, &svv,
2031 sizeof (svv), mode) < 0) {
2032 spcs_s_kfree(kstatus);
2033 return (EFAULT);
2034 }
2035
2036 svv.svv_major_rev = sv_major_rev;
2037 svv.svv_minor_rev = sv_minor_rev;
2038 svv.svv_micro_rev = sv_micro_rev;
2039 svv.svv_baseline_rev = sv_baseline_rev;
2040
2041 if (ddi_copyout(&svv, (void *)arg,
2042 sizeof (svv), mode) < 0) {
2043 spcs_s_kfree(kstatus);
2044 return (EFAULT);
2045 }
2046
2047 ustatus = svv.svv_error;
2048 }
2049
2050 DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, 0);
2051
2052 return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2053 /* NOTREACHED */
2054
2055 case SVIOC_UNLOAD:
2056 rc = sv_prepare_unload();
2057
2058 if (ddi_copyout(&rc, (void *)arg, sizeof (rc), mode) < 0) {
2059 rc = EFAULT;
2060 }
2061
2062 spcs_s_kfree(kstatus);
2063 return (rc);
2064
2065 default:
2066 spcs_s_kfree(kstatus);
2067
2068 DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, EINVAL);
2069
2070 return (EINVAL);
2071 /* NOTREACHED */
2072 }
2073
2074 /* NOTREACHED */
2075 }
2076
2077
2078 /* ARGSUSED */
2079 static int
svprint(dev_t dev, char *str)
2081 {
2082 int instance = ddi_get_instance(sv_dip);
2083 cmn_err(CE_WARN, "!%s%d: %s", ddi_get_name(sv_dip), instance, str);
2084 return (0);
2085 }
2086
2087
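/*
 * The real strategy routine. The buf is broken into chunks of at most
 * sv_maxfbas FBAs; each chunk is allocated through nsc_alloc_buf() and
 * the data is copied between the buf's linear buffer and the handle's
 * nsc_vec_t list. For a write that does not end on an FBA boundary,
 * the final FBA is read in first so that it is not partially
 * destroyed.
 */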
2088 static void
_sv_lyr_strategy(struct buf *bp)
2090 {
2091 caddr_t buf_addr; /* pointer to linear buffer in bp */
2092 nsc_buf_t *bufh = NULL;
2093 nsc_buf_t *hndl = NULL;
2094 sv_dev_t *svp;
2095 nsc_vec_t *v;
2096 sv_maj_t *maj;
2097 nsc_size_t fba_req, fba_len; /* FBA lengths */
2098 nsc_off_t fba_off; /* FBA offset */
2099 size_t tocopy, nbytes; /* byte lengths */
2100 int rw, rc; /* flags and return codes */
2101 int (*fn)();
2102
2103 rc = 0;
2104
2105 if (sv_debug > 5)
2106 cmn_err(CE_CONT, "!_sv_lyr_strategy(%p)\n", (void *)bp);
2107
2108 svp = sv_find_enabled(bp->b_edev, &maj);
2109 if (svp == NULL) {
2110 if (maj && (fn = maj->sm_strategy) != 0) {
2111 if (!(maj->sm_flag & D_MP)) {
2112 UNSAFE_ENTER();
2113 rc = (*fn)(bp);
2114 UNSAFE_EXIT();
2115 } else {
2116 rc = (*fn)(bp);
2117 }
2118 return;
2119 } else {
2120 bioerror(bp, ENODEV);
2121 biodone(bp);
2122 return;
2123 }
2124 }
2125
2126 ASSERT(RW_READ_HELD(&svp->sv_lock));
2127
2128 if (svp->sv_flag == 0) {
2129 /*
2130 * guard access mode
2131 * - prevent user level access to the device
2132 */
2133 DTRACE_PROBE1(sv_lyr_strategy_err_guard, struct buf *, bp);
2134 bioerror(bp, EPERM);
2135 goto out;
2136 }
2137
2138 if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
2139 DTRACE_PROBE1(sv_lyr_strategy_err_rsrv, struct buf *, bp);
2140
2141 if (rc == EINTR)
2142 cmn_err(CE_WARN, "!nsc_reserve() returned EINTR");
2143 bioerror(bp, rc);
2144 goto out;
2145 }
2146
2147 if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) {
2148 DTRACE_PROBE1(sv_lyr_strategy_eof, struct buf *, bp);
2149
2150 if (bp->b_flags & B_READ) {
2151 /* return EOF, not an error */
2152 bp->b_resid = bp->b_bcount;
2153 bioerror(bp, 0);
2154 } else
2155 bioerror(bp, EINVAL);
2156
2157 goto done;
2158 }
2159
2160 /*
2161 * Preallocate a handle once per call to strategy.
2162 * If this fails, then the nsc_alloc_buf() will allocate
2163 * a temporary handle per allocation/free pair.
2164 */
2165
2166 DTRACE_PROBE1(sv_dbg_alloch_start, sv_dev_t *, svp);
2167
2168 bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL);
2169
2170 DTRACE_PROBE1(sv_dbg_alloch_end, sv_dev_t *, svp);
2171
2172 if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) {
2173 DTRACE_PROBE1(sv_lyr_strategy_err_hactive, struct buf *, bp);
2174
2175 cmn_err(CE_WARN,
2176 "!sv: allocated active handle (bufh %p, flags %x)",
2177 (void *)bufh, bufh->sb_flag);
2178
2179 bioerror(bp, ENXIO);
2180 goto done;
2181 }
2182
2183 fba_req = FBA_LEN(bp->b_bcount);
2184 if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks)
2185 fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno);
2186
2187 rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE;
2188
2189 bp_mapin(bp);
2190
2191 bp->b_resid = bp->b_bcount;
2192 buf_addr = bp->b_un.b_addr;
2193 fba_off = 0;
2194
2195 /*
2196 * fba_req - requested size of transfer in FBAs after
2197 * truncation to device extent, and allowing for
2198 * possible non-FBA bounded final chunk.
2199 * fba_off - offset of start of chunk from start of bp in FBAs.
2200 * fba_len - size of this chunk in FBAs.
2201 */
2202
2203 loop:
2204 fba_len = min(fba_req, svp->sv_maxfbas);
2205 hndl = bufh;
2206
2207 DTRACE_PROBE4(sv_dbg_allocb_start,
2208 sv_dev_t *, svp,
2209 uint64_t, (uint64_t)(bp->b_lblkno + fba_off),
2210 uint64_t, (uint64_t)fba_len,
2211 int, rw);
2212
2213 rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off),
2214 fba_len, rw, &hndl);
2215
2216 DTRACE_PROBE1(sv_dbg_allocb_end, sv_dev_t *, svp);
2217
2218 if (rc > 0) {
2219 DTRACE_PROBE1(sv_lyr_strategy_err_alloc, struct buf *, bp);
2220 bioerror(bp, rc);
2221 if (hndl != bufh)
2222 (void) nsc_free_buf(hndl);
2223 hndl = NULL;
2224 goto done;
2225 }
2226
2227 tocopy = min(FBA_SIZE(fba_len), bp->b_resid);
2228 v = hndl->sb_vec;
2229
2230 if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) {
2231 /*
2232 * Not overwriting all of the last FBA, so read in the
2233 * old contents now before we overwrite it with the new
2234 * data.
2235 */
2236
2237 DTRACE_PROBE2(sv_dbg_read_start, sv_dev_t *, svp,
2238 uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1));
2239
2240 rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0);
2241 if (rc > 0) {
2242 bioerror(bp, rc);
2243 goto done;
2244 }
2245
2246 DTRACE_PROBE1(sv_dbg_read_end, sv_dev_t *, svp);
2247 }
2248
2249 DTRACE_PROBE1(sv_dbg_bcopy_start, sv_dev_t *, svp);
2250
2251 while (tocopy > 0) {
2252 nbytes = min(tocopy, (nsc_size_t)v->sv_len);
2253
2254 if (bp->b_flags & B_READ)
2255 (void) bcopy(v->sv_addr, buf_addr, nbytes);
2256 else
2257 (void) bcopy(buf_addr, v->sv_addr, nbytes);
2258
2259 bp->b_resid -= nbytes;
2260 buf_addr += nbytes;
2261 tocopy -= nbytes;
2262 v++;
2263 }
2264
2265 DTRACE_PROBE1(sv_dbg_bcopy_end, sv_dev_t *, svp);
2266
2267 if ((bp->b_flags & B_READ) == 0) {
2268 DTRACE_PROBE3(sv_dbg_write_start, sv_dev_t *, svp,
2269 uint64_t, (uint64_t)hndl->sb_pos,
2270 uint64_t, (uint64_t)hndl->sb_len);
2271
2272 rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0);
2273
2274 DTRACE_PROBE1(sv_dbg_write_end, sv_dev_t *, svp);
2275
2276 if (rc > 0) {
2277 bioerror(bp, rc);
2278 goto done;
2279 }
2280 }
2281
2282 /*
2283 * Adjust the FBA offset and the requested (i.e. remaining)
2284 * length, and loop if there is more data to transfer.
2285 */
2286
2287 fba_off += fba_len;
2288 fba_req -= fba_len;
2289
2290 if (fba_req > 0) {
2291 DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);
2292
2293 rc = nsc_free_buf(hndl);
2294
2295 DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);
2296
2297 if (rc > 0) {
2298 DTRACE_PROBE1(sv_lyr_strategy_err_free,
2299 struct buf *, bp);
2300 bioerror(bp, rc);
2301 }
2302
2303 hndl = NULL;
2304
2305 if (rc <= 0)
2306 goto loop;
2307 }
2308
2309 done:
2310 if (hndl != NULL) {
2311 DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);
2312
2313 rc = nsc_free_buf(hndl);
2314
2315 DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);
2316
2317 if (rc > 0) {
2318 DTRACE_PROBE1(sv_lyr_strategy_err_free,
2319 struct buf *, bp);
2320 bioerror(bp, rc);
2321 }
2322
2323 hndl = NULL;
2324 }
2325
2326 if (bufh)
2327 (void) nsc_free_handle(bufh);
2328
2329 DTRACE_PROBE1(sv_dbg_rlse_start, sv_dev_t *, svp);
2330
2331 nsc_release(svp->sv_fd);
2332
2333 DTRACE_PROBE1(sv_dbg_rlse_end, sv_dev_t *, svp);
2334
2335 out:
2336 if (sv_debug > 5) {
2337 cmn_err(CE_CONT,
2338 "!_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n",
2339 (void *)bp, (void *)bufh, bp->b_error);
2340 }
2341
2342 DTRACE_PROBE2(sv_lyr_strategy_end, struct buf *, bp, int, bp->b_error);
2343
2344 rw_exit(&svp->sv_lock);
2345 biodone(bp);
2346 }
2347
2348
2349 static void
2350 sv_async_strategy(blind_t arg)
2351 {
2352 struct buf *bp = (struct buf *)arg;
2353 _sv_lyr_strategy(bp);
2354 }
2355
2356
2357 static int
2358 sv_lyr_strategy(struct buf *bp)
2359 {
2360 nsthread_t *tp;
2361 int nlive;
2362
2363 /*
2364 * If B_ASYNC was part of the DDI we could use it as a hint to
2365 * not create a thread for synchronous i/o.
2366 */
2367 if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
2368 /* not sv enabled - just pass through */
2369 DTRACE_PROBE1(sv_lyr_strategy_notsv, struct buf *, bp);
2370 _sv_lyr_strategy(bp);
2371 return (0);
2372 }
2373
2374 if (sv_debug > 4) {
2375 cmn_err(CE_CONT, "!sv_lyr_strategy: nthread %d nlive %d\n",
2376 nst_nthread(sv_tset), nst_nlive(sv_tset));
2377 }
2378
2379 /*
2380 * If only guard devices are enabled there won't be a
2381 * threadset, so don't try to use it.
2382 */
2383 tp = NULL;
2384 if (sv_tset != NULL) {
2385 tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
2386 }
2387
2388 if (tp == NULL) {
2389 /*
2390 * out of threads, so fall back to synchronous i/o.
2391 */
2392 if (sv_debug > 0) {
2393 cmn_err(CE_CONT,
2394 "!sv_lyr_strategy: thread alloc failed\n");
2395 }
2396
2397 DTRACE_PROBE1(sv_lyr_strategy_no_thread,
2398 struct buf *, bp);
2399
2400 _sv_lyr_strategy(bp);
2401 sv_no_threads++;
2402 } else {
2403 nlive = nst_nlive(sv_tset);
2404 if (nlive > sv_max_nlive) {
2405 if (sv_debug > 0) {
2406 cmn_err(CE_CONT,
2407 "!sv_lyr_strategy: "
2408 "new max nlive %d (nthread %d)\n",
2409 nlive, nst_nthread(sv_tset));
2410 }
2411
2412 sv_max_nlive = nlive;
2413 }
2414 }
2415
2416 return (0);
2417 }
2418
2419 /*
2420 * Re-write the size of the current partition in the vtoc returned by
 * DKIOCGVTOC, so that the caller sees the virtual volume size from nsctl.
2421 */
2422 static int
2423 sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp)
2424 {
2425 size_t offset;
2426 int ilp32;
2427 int pnum;
2428 int rc;
2429
2430 ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
2431
2432 rc = nskern_partition(svp->sv_dev, &pnum);
2433 if (rc != 0) {
2434 return (rc);
2435 }
2436
2437 if (pnum < 0 || pnum >= V_NUMPAR) {
2438 cmn_err(CE_WARN,
2439 "!sv_gvtoc: unable to determine partition number "
2440 "for dev %lx", svp->sv_dev);
2441 return (EINVAL);
2442 }
2443
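/*
 * Sketch of the offset arithmetic below (pnum == 2 is illustrative):
 * for an ILP32 caller the copyout target is
 *
 *	arg + offsetof(struct vtoc32, v_part)
 *	    + 2 * sizeof (struct partition32)
 *	    + offsetof(struct partition32, p_size)
 *
 * i.e. only the p_size field of v_part[pnum] in the vtoc that the
 * underlying driver has already copied out is patched; the rest of
 * the caller's vtoc is left untouched.
 */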
2444 if (ilp32) {
2445 int32_t p_size;
2446
2447 #ifdef _SunOS_5_6
2448 offset = offsetof(struct vtoc, v_part);
2449 offset += sizeof (struct partition) * pnum;
2450 offset += offsetof(struct partition, p_size);
2451 #else
2452 offset = offsetof(struct vtoc32, v_part);
2453 offset += sizeof (struct partition32) * pnum;
2454 offset += offsetof(struct partition32, p_size);
2455 #endif
2456
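/*
 * A p_size of zero means the volume size is not currently known to
 * sv; here and in the similar blocks below the device is reserved,
 * sv_nblocks is re-read, and the reservation is dropped again.  The
 * assumption (inferred from this code, not from nsctl documentation)
 * is that reserving the device forces the size to be discovered.
 */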
2457 p_size = (int32_t)svp->sv_nblocks;
2458 if (p_size == 0) {
2459 if (sv_reserve(svp->sv_fd,
2460 NSC_MULTI|NSC_PCATCH) == 0) {
2461 p_size = (int32_t)svp->sv_nblocks;
2462 nsc_release(svp->sv_fd);
2463 } else {
2464 rc = EINTR;
2465 }
2466 }
2467
2468 if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2469 sizeof (p_size), mode) != 0) {
2470 rc = EFAULT;
2471 }
2472 } else {
2473 long p_size;
2474
2475 offset = offsetof(struct vtoc, v_part);
2476 offset += sizeof (struct partition) * pnum;
2477 offset += offsetof(struct partition, p_size);
2478
2479 p_size = (long)svp->sv_nblocks;
2480 if (p_size == 0) {
2481 if (sv_reserve(svp->sv_fd,
2482 NSC_MULTI|NSC_PCATCH) == 0) {
2483 p_size = (long)svp->sv_nblocks;
2484 nsc_release(svp->sv_fd);
2485 } else {
2486 rc = EINTR;
2487 }
2488 }
2489
2490 if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2491 sizeof (p_size), mode) != 0) {
2492 rc = EFAULT;
2493 }
2494 }
2495
2496 return (rc);
2497 }
2498
2499
2500 #ifdef DKIOCPARTITION
2501 /*
2502 * Re-write the size of the current partition in the EFI label
 * returned by DKIOCGETEFI.
2503 *
2504 * arg is dk_efi_t.
2505 *
2506 * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64;
2507 *
2508 * dk_efi_t->dki_data --> efi_gpt_t (label header)
2509 * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions)
2510 *
2511 * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts
2512 * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself
2513 *
2514 * This assumes that sizeof (efi_gpt_t) is the same as the size of a
2515 * logical block on the disk.
2516 *
2517 * Everything is little endian (i.e. disk format).
2518 */
2519 static int
2520 sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp)
2521 {
2522 dk_efi_t efi;
2523 efi_gpt_t gpt;
2524 efi_gpe_t *gpe = NULL;
2525 size_t sgpe;
2526 uint64_t p_size; /* virtual partition size from nsctl */
2527 uint32_t crc;
2528 int unparts; /* number of parts in user's array */
2529 int pnum;
2530 int rc;
2531
2532 rc = nskern_partition(svp->sv_dev, &pnum);
2533 if (rc != 0) {
2534 return (rc);
2535 }
2536
2537 if (pnum < 0) {
2538 cmn_err(CE_WARN,
2539 "!sv_efi: unable to determine partition number for dev %lx",
2540 svp->sv_dev);
2541 return (EINVAL);
2542 }
2543
2544 if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) {
2545 return (EFAULT);
2546 }
2547
2548 efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
2549
2550 if (efi.dki_length < sizeof (gpt) + sizeof (*gpe)) {
2551 return (EINVAL);
2552 }
2553
2554 if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) {
2555 rc = EFAULT;
2556 goto out;
2557 }
2558
2559 if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0)
2560 unparts = 1;
2561 else if (pnum >= unparts) {
2562 cmn_err(CE_WARN,
2563 "!sv_efi: partition# beyond end of user array (%d >= %d)",
2564 pnum, unparts);
2565 return (EINVAL);
2566 }
2567
2568 sgpe = sizeof (*gpe) * unparts;
2569 gpe = kmem_alloc(sgpe, KM_SLEEP);
2570
2571 if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) {
2572 rc = EFAULT;
2573 goto out;
2574 }
2575
2576 p_size = svp->sv_nblocks;
2577 if (p_size == 0) {
2578 if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2579 p_size = (diskaddr_t)svp->sv_nblocks;
2580 nsc_release(svp->sv_fd);
2581 } else {
2582 rc = EINTR;
2583 }
2584 }
2585
2586 gpe[pnum].efi_gpe_EndingLBA = LE_64(
2587 LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1);
2588
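/*
 * The partition-entry-array CRC is recomputed first because that
 * field is itself covered by the header CRC; the header's own CRC
 * field must be zero while the header CRC is computed, which is why
 * it is cleared immediately before the second CRC32() below.
 */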
2589 gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
2590 CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
2591 gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
2592
2593 gpt.efi_gpt_HeaderCRC32 = 0;
2594 CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
2595 gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
2596
2597 if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) {
2598 rc = EFAULT;
2599 goto out;
2600 }
2601
2602 if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) {
2603 rc = EFAULT;
2604 goto out;
2605 }
2606
2607 out:
2608 if (gpe) {
2609 kmem_free(gpe, sgpe);
2610 }
2611
2612 return (rc);
2613 }
2614
2615
2616 /*
2617 * Re-write the size of the partition specified by p_partno
2618 *
2619 * Note that if a DKIOCPARTITION is issued to an fd opened against a
2620 * non-sv'd device, but p_partno requests the size for a different
2621 * device that is sv'd, this function will *not* be called as sv is
2622 * not interposed on the original device (the fd).
2623 *
2624 * It would not be easy to change this as we cannot get the partition
2625 * number for the non-sv'd device, so cannot compute the dev_t of the
2626 * (sv'd) p_partno device, and so cannot find out if it is sv'd or get
2627 * its size from nsctl.
2628 *
2629 * See also the "Bug 4755783" comment in sv_lyr_ioctl().
2630 */
2631 static int
2632 sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp)
2633 {
2634 struct partition64 p64;
2635 sv_dev_t *nsvp = NULL;
2636 diskaddr_t p_size;
2637 minor_t nminor;
2638 int pnum, rc;
2639 dev_t ndev;
2640
2641 rc = nskern_partition(svp->sv_dev, &pnum);
2642 if (rc != 0) {
2643 return (rc);
2644 }
2645
2646 if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) {
2647 return (EFAULT);
2648 }
2649
2650 if (p64.p_partno != pnum) {
2651 /* switch to requested partition, not the current one */
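/*
 * The minor number of the requested slice is derived arithmetically
 * from the current one, which relies on slices of the same target
 * having consecutive minor numbers.
 */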
2652 nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
2653 ndev = makedevice(getmajor(svp->sv_dev), nminor);
2654 nsvp = sv_find_enabled(ndev, NULL);
2655 if (nsvp == NULL) {
2656 /* not sv device - just return */
2657 return (0);
2658 }
2659
2660 svp = nsvp;
2661 }
2662
2663 p_size = svp->sv_nblocks;
2664 if (p_size == 0) {
2665 if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2666 p_size = (diskaddr_t)svp->sv_nblocks;
2667 nsc_release(svp->sv_fd);
2668 } else {
2669 rc = EINTR;
2670 }
2671 }
2672
2673 if (nsvp != NULL) {
2674 rw_exit(&nsvp->sv_lock);
2675 }
2676
2677 if ((rc == 0) && ddi_copyout(&p_size,
2678 (void *)(arg + offsetof(struct partition64, p_size)),
2679 sizeof (p_size), mode) != 0) {
2680 return (EFAULT);
2681 }
2682
2683 return (rc);
2684 }
2685 #endif /* DKIOCPARTITION */
2686
2687
2688 static int
2689 sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg,
2690 const int mode, cred_t *crp, int *rvalp)
2691 {
2692 sv_dev_t *svp;
2693 sv_maj_t *maj;
2694 int (*fn)();
2695 int rc = 0;
2696
2697 maj = 0;
2698 fn = 0;
2699
2700 /*
2701 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, the ioctl can proceed.
2702 * Otherwise it was previously SV_PREVENT_UNLOAD and is now
2703 * SV_ALLOW_UNLOAD, meaning the driver is expected to unload, so fail.
2704 *
2705 * SV_ALLOW_UNLOAD is a final state, so there is no need to grab sv_mutex.
2706 */
2707 if (sv_mod_status == SV_ALLOW_UNLOAD) {
2708 return (EBUSY);
2709 }
2710
2711 svp = sv_find_enabled(dev, &maj);
2712 if (svp != NULL) {
2713 if (nskernd_isdaemon()) {
2714 /*
2715 * This is nskernd which always needs to see
2716 * the underlying disk device accurately.
2717 *
2718 * So just pass the ioctl straight through
2719 * to the underlying driver as though the device
2720 * was not sv enabled.
2721 */
2722 DTRACE_PROBE2(sv_lyr_ioctl_nskernd, sv_dev_t *, svp,
2723 dev_t, dev);
2724
2725 rw_exit(&svp->sv_lock);
2726 svp = NULL;
2727 } else {
2728 ASSERT(RW_READ_HELD(&svp->sv_lock));
2729 }
2730 }
2731
2732 /*
2733 * We now have a locked and enabled SV device, or a non-SV device.
2734 */
2735
2736 switch (cmd) {
2737 /*
2738 * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI
2739 * and DKIOCSETEFI are intercepted and faked up because some
2740 * i/o providers emulate volumes of a different size from that
2741 * of the underlying volume.
2742 *
2743 * Setting the size by rewriting the vtoc is not permitted.
2744 */
2745
2746 case DKIOCSVTOC:
2747 #ifdef DKIOCPARTITION
2748 case DKIOCSETEFI:
2749 #endif
2750 if (svp == NULL) {
2751 /* not intercepted -- allow ioctl through */
2752 break;
2753 }
2754
2755 rw_exit(&svp->sv_lock);
2756
2757 DTRACE_PROBE2(sv_lyr_ioctl_svtoc, dev_t, dev, int, EPERM);
2758
2759 return (EPERM);
2760
2761 default:
2762 break;
2763 }
2764
2765 /*
2766 * Pass through the real ioctl command.
2767 */
2768
2769 if (maj && (fn = maj->sm_ioctl) != 0) {
2770 if (!(maj->sm_flag & D_MP)) {
2771 UNSAFE_ENTER();
2772 rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2773 UNSAFE_EXIT();
2774 } else {
2775 rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2776 }
2777 } else {
2778 rc = ENODEV;
2779 }
2780
2781 /*
2782 * Bug 4755783
2783 * Fix up the size of the current partition to allow the
2784 * virtual volume to be a different size from the physical
2785 * volume (e.g. for II compact dependent shadows).
2786 *
2787 * Note that this only attempts to fix up the current partition
2788 * - the one that the ioctl was issued against. There could be
2789 * other sv'd partitions in the same vtoc, but we cannot tell,
2790 * so we don't attempt to fix them up.
2791 */
2792
2793 if (svp != NULL && rc == 0) {
2794 switch (cmd) {
2795 case DKIOCGVTOC:
2796 rc = sv_fix_dkiocgvtoc(arg, mode, svp);
2797 break;
2798
2799 #ifdef DKIOCPARTITION
2800 case DKIOCGETEFI:
2801 rc = sv_fix_dkiocgetefi(arg, mode, svp);
2802 break;
2803
2804 case DKIOCPARTITION:
2805 rc = sv_fix_dkiocpartition(arg, mode, svp);
2806 break;
2807 #endif /* DKIOCPARTITION */
2808 }
2809 }
2810
2811 if (svp != NULL) {
2812 rw_exit(&svp->sv_lock);
2813 }
2814
2815 return (rc);
2816 }
2817