1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Storage Volume Character and Block Driver (SV) 28 * 29 * This driver implements a simplistic /dev/{r}dsk/ interface to a 30 * specified disk volume that is otherwise managed by the Prism 31 * software. The SV driver layers itself onto the underlying disk 32 * device driver by changing function pointers in the cb_ops 33 * structure. 34 * 35 * CONFIGURATION: 36 * 37 * 1. Configure the driver using the svadm utility. 38 * 2. Access the device as before through /dev/rdsk/c?t?d?s? 39 * 40 * LIMITATIONS: 41 * 42 * This driver should NOT be used to share a device between another 43 * DataServices user interface module (e.g., STE) and a user accessing 44 * the device through the block device in O_WRITE mode. This is because 45 * writes through the block device are asynchronous (due to the page 46 * cache) and so consistency between the block device user and the 47 * STE user cannot be guaranteed. 48 * 49 * Data is copied between system struct buf(9s) and nsc_vec_t. This is 50 * wasteful and slow. 51 */ 52 53 #include <sys/debug.h> 54 #include <sys/types.h> 55 56 #include <sys/ksynch.h> 57 #include <sys/kmem.h> 58 #include <sys/errno.h> 59 #include <sys/varargs.h> 60 #include <sys/file.h> 61 #include <sys/open.h> 62 #include <sys/conf.h> 63 #include <sys/cred.h> 64 #include <sys/buf.h> 65 #include <sys/uio.h> 66 #ifndef DS_DDICT 67 #include <sys/pathname.h> 68 #endif 69 #include <sys/aio_req.h> 70 #include <sys/dkio.h> 71 #include <sys/vtoc.h> 72 #include <sys/cmn_err.h> 73 #include <sys/modctl.h> 74 #include <sys/ddi.h> 75 #include <sys/sunddi.h> 76 #include <sys/sunldi.h> 77 #include <sys/nsctl/nsvers.h> 78 79 #include <sys/nsc_thread.h> 80 #include <sys/unistat/spcs_s.h> 81 #include <sys/unistat/spcs_s_k.h> 82 #include <sys/unistat/spcs_errors.h> 83 84 #ifdef DS_DDICT 85 #include "../contract.h" 86 #endif 87 88 #include "../nsctl.h" 89 90 91 #include <sys/sdt.h> /* dtrace is S10 or later */ 92 93 #include "sv.h" 94 #include "sv_impl.h" 95 #include "sv_efi.h" 96 97 #define MAX_EINTR_COUNT 1000 98 99 /* 100 * sv_mod_status 101 */ 102 #define SV_PREVENT_UNLOAD 1 103 #define SV_ALLOW_UNLOAD 2 104 105 static const int sv_major_rev = ISS_VERSION_MAJ; /* Major number */ 106 static const int sv_minor_rev = ISS_VERSION_MIN; /* Minor number */ 107 static const int sv_micro_rev = ISS_VERSION_MIC; /* Micro number */ 108 static const int sv_baseline_rev = ISS_VERSION_NUM; /* Baseline number */ 109 110 #ifdef DKIOCPARTITION 111 /* 112 * CRC32 polynomial table needed for computing the checksums 113 * in an EFI vtoc. 114 */ 115 static const uint32_t sv_crc32_table[256] = { CRC32_TABLE }; 116 #endif 117 118 static clock_t sv_config_time; /* Time of successful {en,dis}able */ 119 static int sv_debug; /* Set non-zero for debug to syslog */ 120 static int sv_mod_status; /* Set to prevent modunload */ 121 122 static dev_info_t *sv_dip; /* Single DIP for driver */ 123 static kmutex_t sv_mutex; /* Protect global lists, etc. */ 124 125 static nsc_mem_t *sv_mem; /* nsctl memory allocator token */ 126 127 128 /* 129 * Per device and per major state. 130 */ 131 132 #ifndef _SunOS_5_6 133 #define UNSAFE_ENTER() 134 #define UNSAFE_EXIT() 135 #else 136 #define UNSAFE_ENTER() mutex_enter(&unsafe_driver) 137 #define UNSAFE_EXIT() mutex_exit(&unsafe_driver) 138 #endif 139 140 /* hash table of major dev structures */ 141 static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0}; 142 static sv_dev_t *sv_devs; /* array of per device structures */ 143 static int sv_max_devices; /* SV version of nsc_max_devices() */ 144 static int sv_ndevices; /* number of SV enabled devices */ 145 146 /* 147 * Threading. 148 */ 149 150 int sv_threads_max = 1024; /* maximum # to dynamically alloc */ 151 int sv_threads = 32; /* # to pre-allocate (see sv.conf) */ 152 int sv_threads_extra = 0; /* addl # we would have alloc'ed */ 153 154 static nstset_t *sv_tset; /* the threadset pointer */ 155 156 static int sv_threads_hysteresis = 4; /* hysteresis for threadset resizing */ 157 static int sv_threads_dev = 2; /* # of threads to alloc per device */ 158 static int sv_threads_inc = 8; /* increment for changing the set */ 159 static int sv_threads_needed; /* number of threads needed */ 160 static int sv_no_threads; /* number of nsc_create errors */ 161 static int sv_max_nlive; /* max number of threads running */ 162 163 164 165 /* 166 * nsctl fd callbacks. 167 */ 168 169 static int svattach_fd(blind_t); 170 static int svdetach_fd(blind_t); 171 172 static nsc_def_t sv_fd_def[] = { 173 { "Attach", (uintptr_t)svattach_fd, }, 174 { "Detach", (uintptr_t)svdetach_fd, }, 175 { 0, 0, } 176 }; 177 178 /* 179 * cb_ops functions. 180 */ 181 182 static int svopen(dev_t *, int, int, cred_t *); 183 static int svclose(dev_t, int, int, cred_t *); 184 static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *); 185 static int svprint(dev_t, char *); 186 187 /* 188 * These next functions are layered into the underlying driver's devops. 189 */ 190 191 static int sv_lyr_open(dev_t *, int, int, cred_t *); 192 static int sv_lyr_close(dev_t, int, int, cred_t *); 193 static int sv_lyr_strategy(struct buf *); 194 static int sv_lyr_read(dev_t, struct uio *, cred_t *); 195 static int sv_lyr_write(dev_t, struct uio *, cred_t *); 196 static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *); 197 static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *); 198 static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); 199 200 static struct cb_ops sv_cb_ops = { 201 svopen, /* open */ 202 svclose, /* close */ 203 nulldev, /* strategy */ 204 svprint, 205 nodev, /* dump */ 206 nodev, /* read */ 207 nodev, /* write */ 208 svioctl, 209 nodev, /* devmap */ 210 nodev, /* mmap */ 211 nodev, /* segmap */ 212 nochpoll, /* poll */ 213 ddi_prop_op, 214 NULL, /* NOT a stream */ 215 D_NEW | D_MP | D_64BIT, 216 CB_REV, 217 nodev, /* aread */ 218 nodev, /* awrite */ 219 }; 220 221 222 /* 223 * dev_ops functions. 224 */ 225 226 static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); 227 static int sv_attach(dev_info_t *, ddi_attach_cmd_t); 228 static int sv_detach(dev_info_t *, ddi_detach_cmd_t); 229 230 static struct dev_ops sv_ops = { 231 DEVO_REV, 232 0, 233 sv_getinfo, 234 nulldev, /* identify */ 235 nulldev, /* probe */ 236 sv_attach, 237 sv_detach, 238 nodev, /* reset */ 239 &sv_cb_ops, 240 (struct bus_ops *)0 241 }; 242 243 /* 244 * Module linkage. 245 */ 246 247 extern struct mod_ops mod_driverops; 248 249 static struct modldrv modldrv = { 250 &mod_driverops, 251 "nws:Storage Volume:" ISS_VERSION_STR, 252 &sv_ops 253 }; 254 255 static struct modlinkage modlinkage = { 256 MODREV_1, 257 &modldrv, 258 0 259 }; 260 261 262 int 263 _init(void) 264 { 265 int error; 266 267 mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL); 268 269 if ((error = mod_install(&modlinkage)) != 0) { 270 mutex_destroy(&sv_mutex); 271 return (error); 272 } 273 274 #ifdef DEBUG 275 cmn_err(CE_CONT, "!sv (revision %d.%d.%d.%d, %s, %s)\n", 276 sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev, 277 ISS_VERSION_STR, BUILD_DATE_STR); 278 #else 279 if (sv_micro_rev) { 280 cmn_err(CE_CONT, "!sv (revision %d.%d.%d, %s, %s)\n", 281 sv_major_rev, sv_minor_rev, sv_micro_rev, 282 ISS_VERSION_STR, BUILD_DATE_STR); 283 } else { 284 cmn_err(CE_CONT, "!sv (revision %d.%d, %s, %s)\n", 285 sv_major_rev, sv_minor_rev, 286 ISS_VERSION_STR, BUILD_DATE_STR); 287 } 288 #endif 289 290 return (error); 291 } 292 293 294 int 295 _fini(void) 296 { 297 int error; 298 299 if ((error = mod_remove(&modlinkage)) != 0) 300 return (error); 301 302 mutex_destroy(&sv_mutex); 303 304 return (error); 305 } 306 307 308 int 309 _info(struct modinfo *modinfop) 310 { 311 return (mod_info(&modlinkage, modinfop)); 312 } 313 314 315 /* 316 * Locking & State. 317 * 318 * sv_mutex protects config information - sv_maj_t and sv_dev_t lists; 319 * threadset creation and sizing; sv_ndevices. 320 * 321 * If we need to hold both sv_mutex and sv_lock, then the sv_mutex 322 * must be acquired first. 323 * 324 * sv_lock protects the sv_dev_t structure for an individual device. 325 * 326 * sv_olock protects the otyp/open members of the sv_dev_t. If we need 327 * to hold both sv_lock and sv_olock, then the sv_lock must be acquired 328 * first. 329 * 330 * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple 331 * I/O operations to a device simultaneously, as above. 332 * 333 * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur 334 * with sv_lock write-locked must be done with (sv_state == SV_PENDING) 335 * and (sv_pending == curthread) so that any recursion through 336 * sv_lyr_open/sv_lyr_close can be detected. 337 */ 338 339 340 static int 341 sv_init_devs(void) 342 { 343 int i; 344 345 ASSERT(MUTEX_HELD(&sv_mutex)); 346 347 if (sv_max_devices > 0) 348 return (0); 349 350 sv_max_devices = nsc_max_devices(); 351 352 if (sv_max_devices <= 0) { 353 /* nsctl is not attached (nskernd not running) */ 354 if (sv_debug > 0) 355 cmn_err(CE_CONT, "!sv: nsc_max_devices = 0\n"); 356 return (EAGAIN); 357 } 358 359 sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)), 360 KM_NOSLEEP, sv_mem); 361 362 if (sv_devs == NULL) { 363 cmn_err(CE_WARN, "!sv: could not allocate sv_devs array"); 364 return (ENOMEM); 365 } 366 367 for (i = 0; i < sv_max_devices; i++) { 368 mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL); 369 rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL); 370 } 371 372 if (sv_debug > 0) 373 cmn_err(CE_CONT, "!sv: sv_init_devs successful\n"); 374 375 return (0); 376 } 377 378 379 static int 380 sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 381 { 382 int rc; 383 384 switch (cmd) { 385 386 case DDI_ATTACH: 387 sv_dip = dip; 388 389 if (ddi_create_minor_node(dip, "sv", S_IFCHR, 390 0, DDI_PSEUDO, 0) != DDI_SUCCESS) 391 goto failed; 392 393 mutex_enter(&sv_mutex); 394 395 sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0); 396 if (sv_mem == NULL) { 397 mutex_exit(&sv_mutex); 398 goto failed; 399 } 400 401 rc = sv_init_devs(); 402 if (rc != 0 && rc != EAGAIN) { 403 mutex_exit(&sv_mutex); 404 goto failed; 405 } 406 407 mutex_exit(&sv_mutex); 408 409 410 ddi_report_dev(dip); 411 412 sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 413 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, 414 "sv_threads", sv_threads); 415 416 if (sv_debug > 0) 417 cmn_err(CE_CONT, "!sv: sv_threads=%d\n", sv_threads); 418 419 if (sv_threads > sv_threads_max) 420 sv_threads_max = sv_threads; 421 422 return (DDI_SUCCESS); 423 424 default: 425 return (DDI_FAILURE); 426 } 427 428 failed: 429 DTRACE_PROBE(sv_attach_failed); 430 (void) sv_detach(dip, DDI_DETACH); 431 return (DDI_FAILURE); 432 } 433 434 435 static int 436 sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 437 { 438 sv_dev_t *svp; 439 int i; 440 441 switch (cmd) { 442 443 case DDI_DETACH: 444 445 /* 446 * Check that everything is disabled. 447 */ 448 449 mutex_enter(&sv_mutex); 450 451 if (sv_mod_status == SV_PREVENT_UNLOAD) { 452 mutex_exit(&sv_mutex); 453 DTRACE_PROBE(sv_detach_err_prevent); 454 return (DDI_FAILURE); 455 } 456 457 for (i = 0; sv_devs && i < sv_max_devices; i++) { 458 svp = &sv_devs[i]; 459 460 if (svp->sv_state != SV_DISABLE) { 461 mutex_exit(&sv_mutex); 462 DTRACE_PROBE(sv_detach_err_busy); 463 return (DDI_FAILURE); 464 } 465 } 466 467 468 for (i = 0; sv_devs && i < sv_max_devices; i++) { 469 mutex_destroy(&sv_devs[i].sv_olock); 470 rw_destroy(&sv_devs[i].sv_lock); 471 } 472 473 if (sv_devs) { 474 nsc_kmem_free(sv_devs, 475 (sv_max_devices * sizeof (*sv_devs))); 476 sv_devs = NULL; 477 } 478 sv_max_devices = 0; 479 480 if (sv_mem) { 481 nsc_unregister_mem(sv_mem); 482 sv_mem = NULL; 483 } 484 485 mutex_exit(&sv_mutex); 486 487 /* 488 * Remove all minor nodes. 489 */ 490 491 ddi_remove_minor_node(dip, NULL); 492 sv_dip = NULL; 493 494 return (DDI_SUCCESS); 495 496 default: 497 return (DDI_FAILURE); 498 } 499 } 500 501 static sv_maj_t * 502 sv_getmajor(const dev_t dev) 503 { 504 sv_maj_t **insert, *maj; 505 major_t umaj = getmajor(dev); 506 507 /* 508 * See if the hash table entry, or one of the hash chains 509 * is already allocated for this major number 510 */ 511 if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) { 512 do { 513 if (maj->sm_major == umaj) 514 return (maj); 515 } while ((maj = maj->sm_next) != 0); 516 } 517 518 /* 519 * If the sv_mutex is held, there is design flaw, as the only non-mutex 520 * held callers can be sv_enable() or sv_dev_to_sv() 521 * Return an error, instead of panicing the system 522 */ 523 if (MUTEX_HELD(&sv_mutex)) { 524 cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t"); 525 return (NULL); 526 } 527 528 /* 529 * Determine where to allocate a new element in the hash table 530 */ 531 mutex_enter(&sv_mutex); 532 insert = &(sv_majors[SV_MAJOR_HASH(umaj)]); 533 for (maj = *insert; maj; maj = maj->sm_next) { 534 535 /* Did another thread beat us to it? */ 536 if (maj->sm_major == umaj) 537 return (maj); 538 539 /* Find a NULL insert point? */ 540 if (maj->sm_next == NULL) 541 insert = &maj->sm_next; 542 } 543 544 /* 545 * Located the new insert point 546 */ 547 *insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem); 548 if ((maj = *insert) != 0) 549 maj->sm_major = umaj; 550 else 551 cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t"); 552 553 mutex_exit(&sv_mutex); 554 555 return (maj); 556 } 557 558 /* ARGSUSED */ 559 560 static int 561 sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 562 { 563 int rc = DDI_FAILURE; 564 565 switch (infocmd) { 566 567 case DDI_INFO_DEVT2DEVINFO: 568 *result = sv_dip; 569 rc = DDI_SUCCESS; 570 break; 571 572 case DDI_INFO_DEVT2INSTANCE: 573 /* 574 * We only have a single instance. 575 */ 576 *result = 0; 577 rc = DDI_SUCCESS; 578 break; 579 580 default: 581 break; 582 } 583 584 return (rc); 585 } 586 587 588 /* 589 * Hashing of devices onto major device structures. 590 * 591 * Individual device structures are hashed onto one of the sm_hash[] 592 * buckets in the relevant major device structure. 593 * 594 * Hash insertion and deletion -must- be done with sv_mutex held. Hash 595 * searching does not require the mutex because of the sm_seq member. 596 * sm_seq is incremented on each insertion (-after- hash chain pointer 597 * manipulation) and each deletion (-before- hash chain pointer 598 * manipulation). When searching the hash chain, the seq number is 599 * checked before accessing each device structure, if the seq number has 600 * changed, then we restart the search from the top of the hash chain. 601 * If we restart more than SV_HASH_RETRY times, we take sv_mutex and search 602 * the hash chain (we are guaranteed that this search cannot be 603 * interrupted). 604 */ 605 606 #define SV_HASH_RETRY 16 607 608 static sv_dev_t * 609 sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp) 610 { 611 minor_t umin = getminor(dev); 612 sv_dev_t **hb, *next, *svp; 613 sv_maj_t *maj; 614 int seq; 615 int try; 616 617 /* Get major hash table */ 618 maj = sv_getmajor(dev); 619 if (majpp) 620 *majpp = maj; 621 if (maj == NULL) 622 return (NULL); 623 624 if (maj->sm_inuse == 0) { 625 DTRACE_PROBE1( 626 sv_dev_to_sv_end, 627 dev_t, dev); 628 return (NULL); 629 } 630 631 hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]); 632 try = 0; 633 634 retry: 635 if (try > SV_HASH_RETRY) 636 mutex_enter(&sv_mutex); 637 638 seq = maj->sm_seq; 639 for (svp = *hb; svp; svp = next) { 640 next = svp->sv_hash; 641 642 nsc_membar_stld(); /* preserve register load order */ 643 644 if (maj->sm_seq != seq) { 645 DTRACE_PROBE1(sv_dev_to_sv_retry, dev_t, dev); 646 try++; 647 goto retry; 648 } 649 650 if (svp->sv_dev == dev) 651 break; 652 } 653 654 if (try > SV_HASH_RETRY) 655 mutex_exit(&sv_mutex); 656 657 return (svp); 658 } 659 660 661 /* 662 * Must be called with sv_mutex held. 663 */ 664 665 static int 666 sv_get_state(const dev_t udev, sv_dev_t **svpp) 667 { 668 sv_dev_t **hb, **insert, *svp; 669 sv_maj_t *maj; 670 minor_t umin; 671 int i; 672 673 /* Get major hash table */ 674 if ((maj = sv_getmajor(udev)) == NULL) 675 return (NULL); 676 677 /* Determine which minor hash table */ 678 umin = getminor(udev); 679 hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]); 680 681 /* look for clash */ 682 683 insert = hb; 684 685 for (svp = *hb; svp; svp = svp->sv_hash) { 686 if (svp->sv_dev == udev) 687 break; 688 689 if (svp->sv_hash == NULL) 690 insert = &svp->sv_hash; 691 } 692 693 if (svp) { 694 DTRACE_PROBE1( 695 sv_get_state_enabled, 696 dev_t, udev); 697 return (SV_EENABLED); 698 } 699 700 /* look for spare sv_devs slot */ 701 702 for (i = 0; i < sv_max_devices; i++) { 703 svp = &sv_devs[i]; 704 705 if (svp->sv_state == SV_DISABLE) 706 break; 707 } 708 709 if (i >= sv_max_devices) { 710 DTRACE_PROBE1( 711 sv_get_state_noslots, 712 dev_t, udev); 713 return (SV_ENOSLOTS); 714 } 715 716 svp->sv_state = SV_PENDING; 717 svp->sv_pending = curthread; 718 719 *insert = svp; 720 svp->sv_hash = NULL; 721 maj->sm_seq++; /* must be after the store to the hash chain */ 722 723 *svpp = svp; 724 725 /* 726 * We do not know the size of the underlying device at 727 * this stage, so initialise "nblocks" property to 728 * zero, and update it whenever we succeed in 729 * nsc_reserve'ing the underlying nsc_fd_t. 730 */ 731 732 svp->sv_nblocks = 0; 733 734 return (0); 735 } 736 737 738 /* 739 * Remove a device structure from it's hash chain. 740 * Must be called with sv_mutex held. 741 */ 742 743 static void 744 sv_rm_hash(sv_dev_t *svp) 745 { 746 sv_dev_t **svpp; 747 sv_maj_t *maj; 748 749 /* Get major hash table */ 750 if ((maj = sv_getmajor(svp->sv_dev)) == NULL) 751 return; 752 753 /* remove svp from hash chain */ 754 755 svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]); 756 while (*svpp) { 757 if (*svpp == svp) { 758 /* 759 * increment of sm_seq must be before the 760 * removal from the hash chain 761 */ 762 maj->sm_seq++; 763 *svpp = svp->sv_hash; 764 break; 765 } 766 767 svpp = &(*svpp)->sv_hash; 768 } 769 770 svp->sv_hash = NULL; 771 } 772 773 /* 774 * Free (disable) a device structure. 775 * Must be called with sv_lock(RW_WRITER) and sv_mutex held, and will 776 * perform the exits during its processing. 777 */ 778 779 static int 780 sv_free(sv_dev_t *svp, const int error) 781 { 782 struct cb_ops *cb_ops; 783 sv_maj_t *maj; 784 785 /* Get major hash table */ 786 if ((maj = sv_getmajor(svp->sv_dev)) == NULL) 787 return (NULL); 788 789 svp->sv_state = SV_PENDING; 790 svp->sv_pending = curthread; 791 792 /* 793 * Close the fd's before removing from the hash or swapping 794 * back the cb_ops pointers so that the cache flushes before new 795 * io can come in. 796 */ 797 798 if (svp->sv_fd) { 799 (void) nsc_close(svp->sv_fd); 800 svp->sv_fd = 0; 801 } 802 803 sv_rm_hash(svp); 804 805 if (error != SV_ESDOPEN && 806 error != SV_ELYROPEN && --maj->sm_inuse == 0) { 807 808 if (maj->sm_dev_ops) 809 cb_ops = maj->sm_dev_ops->devo_cb_ops; 810 else 811 cb_ops = NULL; 812 813 if (cb_ops && maj->sm_strategy != NULL) { 814 cb_ops->cb_strategy = maj->sm_strategy; 815 cb_ops->cb_close = maj->sm_close; 816 cb_ops->cb_ioctl = maj->sm_ioctl; 817 cb_ops->cb_write = maj->sm_write; 818 cb_ops->cb_open = maj->sm_open; 819 cb_ops->cb_read = maj->sm_read; 820 cb_ops->cb_flag = maj->sm_flag; 821 822 if (maj->sm_awrite) 823 cb_ops->cb_awrite = maj->sm_awrite; 824 825 if (maj->sm_aread) 826 cb_ops->cb_aread = maj->sm_aread; 827 828 /* 829 * corbin XXX 830 * Leave backing device ops in maj->sm_* 831 * to handle any requests that might come 832 * in during the disable. This could be 833 * a problem however if the backing device 834 * driver is changed while we process these 835 * requests. 836 * 837 * maj->sm_strategy = 0; 838 * maj->sm_awrite = 0; 839 * maj->sm_write = 0; 840 * maj->sm_ioctl = 0; 841 * maj->sm_close = 0; 842 * maj->sm_aread = 0; 843 * maj->sm_read = 0; 844 * maj->sm_open = 0; 845 * maj->sm_flag = 0; 846 * 847 */ 848 } 849 850 if (maj->sm_dev_ops) { 851 maj->sm_dev_ops = 0; 852 } 853 } 854 855 if (svp->sv_lh) { 856 cred_t *crp = ddi_get_cred(); 857 858 /* 859 * Close the protective layered driver open using the 860 * Sun Private layered driver i/f. 861 */ 862 863 (void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp); 864 svp->sv_lh = NULL; 865 } 866 867 svp->sv_timestamp = nsc_lbolt(); 868 svp->sv_state = SV_DISABLE; 869 svp->sv_pending = NULL; 870 rw_exit(&svp->sv_lock); 871 mutex_exit(&sv_mutex); 872 873 return (error); 874 } 875 876 /* 877 * Reserve the device, taking into account the possibility that 878 * the reserve might have to be retried. 879 */ 880 static int 881 sv_reserve(nsc_fd_t *fd, int flags) 882 { 883 int eintr_count; 884 int rc; 885 886 eintr_count = 0; 887 do { 888 rc = nsc_reserve(fd, flags); 889 if (rc == EINTR) { 890 ++eintr_count; 891 delay(2); 892 } 893 } while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT)); 894 895 return (rc); 896 } 897 898 static int 899 sv_enable(const caddr_t path, const int flag, 900 const dev_t udev, spcs_s_info_t kstatus) 901 { 902 struct dev_ops *dev_ops; 903 struct cb_ops *cb_ops; 904 sv_dev_t *svp; 905 sv_maj_t *maj; 906 nsc_size_t nblocks; 907 int rc; 908 cred_t *crp; 909 ldi_ident_t li; 910 911 if (udev == (dev_t)-1 || udev == 0) { 912 DTRACE_PROBE1( 913 sv_enable_err_baddev, 914 dev_t, udev); 915 return (SV_EBADDEV); 916 } 917 918 if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) { 919 DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev); 920 return (SV_EAMODE); 921 } 922 923 /* Get major hash table */ 924 if ((maj = sv_getmajor(udev)) == NULL) 925 return (SV_EBADDEV); 926 927 mutex_enter(&sv_mutex); 928 929 rc = sv_get_state(udev, &svp); 930 if (rc) { 931 mutex_exit(&sv_mutex); 932 DTRACE_PROBE1(sv_enable_err_state, dev_t, udev); 933 return (rc); 934 } 935 936 rw_enter(&svp->sv_lock, RW_WRITER); 937 938 /* 939 * Get real fd used for io 940 */ 941 942 svp->sv_dev = udev; 943 svp->sv_flag = flag; 944 945 /* 946 * OR in NSC_DEVICE to ensure that nskern grabs the real strategy 947 * function pointer before sv swaps them out. 948 */ 949 950 svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE), 951 sv_fd_def, (blind_t)udev, &rc); 952 953 if (svp->sv_fd == NULL) { 954 if (kstatus) 955 spcs_s_add(kstatus, rc); 956 DTRACE_PROBE1(sv_enable_err_fd, dev_t, udev); 957 return (sv_free(svp, SV_ESDOPEN)); 958 } 959 960 /* 961 * Perform a layered driver open using the Sun Private layered 962 * driver i/f to ensure that the cb_ops structure for the driver 963 * is not detached out from under us whilst sv is enabled. 964 * 965 */ 966 967 crp = ddi_get_cred(); 968 svp->sv_lh = NULL; 969 970 if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) { 971 rc = ldi_open_by_dev(&svp->sv_dev, 972 OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li); 973 } 974 975 if (rc != 0) { 976 if (kstatus) 977 spcs_s_add(kstatus, rc); 978 DTRACE_PROBE1(sv_enable_err_lyr_open, dev_t, udev); 979 return (sv_free(svp, SV_ELYROPEN)); 980 } 981 982 /* 983 * Do layering if required - must happen after nsc_open(). 984 */ 985 986 if (maj->sm_inuse++ == 0) { 987 maj->sm_dev_ops = nsc_get_devops(getmajor(udev)); 988 989 if (maj->sm_dev_ops == NULL || 990 maj->sm_dev_ops->devo_cb_ops == NULL) { 991 DTRACE_PROBE1(sv_enable_err_load, dev_t, udev); 992 return (sv_free(svp, SV_ELOAD)); 993 } 994 995 dev_ops = maj->sm_dev_ops; 996 cb_ops = dev_ops->devo_cb_ops; 997 998 if (cb_ops->cb_strategy == NULL || 999 cb_ops->cb_strategy == nodev || 1000 cb_ops->cb_strategy == nulldev) { 1001 DTRACE_PROBE1(sv_enable_err_nostrategy, dev_t, udev); 1002 return (sv_free(svp, SV_ELOAD)); 1003 } 1004 1005 if (cb_ops->cb_strategy == sv_lyr_strategy) { 1006 DTRACE_PROBE1(sv_enable_err_svstrategy, dev_t, udev); 1007 return (sv_free(svp, SV_ESTRATEGY)); 1008 } 1009 1010 maj->sm_strategy = cb_ops->cb_strategy; 1011 maj->sm_close = cb_ops->cb_close; 1012 maj->sm_ioctl = cb_ops->cb_ioctl; 1013 maj->sm_write = cb_ops->cb_write; 1014 maj->sm_open = cb_ops->cb_open; 1015 maj->sm_read = cb_ops->cb_read; 1016 maj->sm_flag = cb_ops->cb_flag; 1017 1018 cb_ops->cb_flag = cb_ops->cb_flag | D_MP; 1019 cb_ops->cb_strategy = sv_lyr_strategy; 1020 cb_ops->cb_close = sv_lyr_close; 1021 cb_ops->cb_ioctl = sv_lyr_ioctl; 1022 cb_ops->cb_write = sv_lyr_write; 1023 cb_ops->cb_open = sv_lyr_open; 1024 cb_ops->cb_read = sv_lyr_read; 1025 1026 /* 1027 * Check that the driver has async I/O entry points 1028 * before changing them. 1029 */ 1030 1031 if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) { 1032 maj->sm_awrite = 0; 1033 maj->sm_aread = 0; 1034 } else { 1035 maj->sm_awrite = cb_ops->cb_awrite; 1036 maj->sm_aread = cb_ops->cb_aread; 1037 1038 cb_ops->cb_awrite = sv_lyr_awrite; 1039 cb_ops->cb_aread = sv_lyr_aread; 1040 } 1041 1042 /* 1043 * Bug 4645743 1044 * 1045 * Prevent sv from ever unloading after it has interposed 1046 * on a major device because there is a race between 1047 * sv removing its layered entry points from the target 1048 * dev_ops, a client coming in and accessing the driver, 1049 * and the kernel modunloading the sv text. 1050 * 1051 * To allow unload, do svboot -u, which only happens in 1052 * pkgrm time. 1053 */ 1054 ASSERT(MUTEX_HELD(&sv_mutex)); 1055 sv_mod_status = SV_PREVENT_UNLOAD; 1056 } 1057 1058 1059 svp->sv_timestamp = nsc_lbolt(); 1060 svp->sv_state = SV_ENABLE; 1061 svp->sv_pending = NULL; 1062 rw_exit(&svp->sv_lock); 1063 1064 sv_ndevices++; 1065 mutex_exit(&sv_mutex); 1066 1067 nblocks = 0; 1068 if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) { 1069 nblocks = svp->sv_nblocks; 1070 nsc_release(svp->sv_fd); 1071 } 1072 1073 cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n", 1074 svp->sv_dev, nblocks); 1075 1076 return (0); 1077 } 1078 1079 1080 static int 1081 sv_prepare_unload() 1082 { 1083 int rc = 0; 1084 1085 mutex_enter(&sv_mutex); 1086 1087 if (sv_mod_status == SV_PREVENT_UNLOAD) { 1088 if ((sv_ndevices != 0) || (sv_tset != NULL)) { 1089 rc = EBUSY; 1090 } else { 1091 sv_mod_status = SV_ALLOW_UNLOAD; 1092 delay(SV_WAIT_UNLOAD * drv_usectohz(1000000)); 1093 } 1094 } 1095 1096 mutex_exit(&sv_mutex); 1097 return (rc); 1098 } 1099 1100 static int 1101 svattach_fd(blind_t arg) 1102 { 1103 dev_t dev = (dev_t)arg; 1104 sv_dev_t *svp = sv_dev_to_sv(dev, NULL); 1105 int rc; 1106 1107 if (sv_debug > 0) 1108 cmn_err(CE_CONT, "!svattach_fd(%p, %p)\n", arg, (void *)svp); 1109 1110 if (svp == NULL) { 1111 cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg); 1112 return (0); 1113 } 1114 1115 if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) { 1116 cmn_err(CE_WARN, 1117 "!svattach_fd: nsc_partsize() failed, rc %d", rc); 1118 svp->sv_nblocks = 0; 1119 } 1120 1121 if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) { 1122 cmn_err(CE_WARN, 1123 "!svattach_fd: nsc_maxfbas() failed, rc %d", rc); 1124 svp->sv_maxfbas = 0; 1125 } 1126 1127 if (sv_debug > 0) { 1128 cmn_err(CE_CONT, 1129 "!svattach_fd(%p): size %" NSC_SZFMT ", " 1130 "maxfbas %" NSC_SZFMT "\n", 1131 arg, svp->sv_nblocks, svp->sv_maxfbas); 1132 } 1133 1134 return (0); 1135 } 1136 1137 1138 static int 1139 svdetach_fd(blind_t arg) 1140 { 1141 dev_t dev = (dev_t)arg; 1142 sv_dev_t *svp = sv_dev_to_sv(dev, NULL); 1143 1144 if (sv_debug > 0) 1145 cmn_err(CE_CONT, "!svdetach_fd(%p, %p)\n", arg, (void *)svp); 1146 1147 /* svp can be NULL during disable of an sv */ 1148 if (svp == NULL) 1149 return (0); 1150 1151 svp->sv_maxfbas = 0; 1152 svp->sv_nblocks = 0; 1153 return (0); 1154 } 1155 1156 1157 /* 1158 * Side effect: if called with (guard != 0), then expects both sv_mutex 1159 * and sv_lock(RW_WRITER) to be held, and will release them before returning. 1160 */ 1161 1162 /* ARGSUSED */ 1163 static int 1164 sv_disable(dev_t dev, spcs_s_info_t kstatus) 1165 { 1166 sv_dev_t *svp = sv_dev_to_sv(dev, NULL); 1167 1168 if (svp == NULL) { 1169 1170 DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp); 1171 return (SV_ENODEV); 1172 } 1173 1174 mutex_enter(&sv_mutex); 1175 rw_enter(&svp->sv_lock, RW_WRITER); 1176 1177 if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) { 1178 rw_exit(&svp->sv_lock); 1179 mutex_exit(&sv_mutex); 1180 1181 DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp); 1182 return (SV_EDISABLED); 1183 } 1184 1185 1186 sv_ndevices--; 1187 return (sv_free(svp, 0)); 1188 } 1189 1190 1191 1192 static int 1193 sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp) 1194 { 1195 nsc_buf_t *tmph; 1196 sv_dev_t *svp; 1197 sv_maj_t *maj; 1198 int (*fn)(); 1199 dev_t odev; 1200 int ret; 1201 int rc; 1202 1203 svp = sv_dev_to_sv(*devp, &maj); 1204 1205 if (svp) { 1206 if (svp->sv_state == SV_PENDING && 1207 svp->sv_pending == curthread) { 1208 /* 1209 * This is a recursive open from a call to 1210 * ddi_lyr_open_by_devt and so we just want 1211 * to pass it straight through to the 1212 * underlying driver. 1213 */ 1214 DTRACE_PROBE2(sv_lyr_open_recursive, 1215 sv_dev_t *, svp, 1216 dev_t, *devp); 1217 svp = NULL; 1218 } else 1219 rw_enter(&svp->sv_lock, RW_READER); 1220 } 1221 1222 odev = *devp; 1223 1224 if (maj && (fn = maj->sm_open) != 0) { 1225 if (!(maj->sm_flag & D_MP)) { 1226 UNSAFE_ENTER(); 1227 ret = (*fn)(devp, flag, otyp, crp); 1228 UNSAFE_EXIT(); 1229 } else { 1230 ret = (*fn)(devp, flag, otyp, crp); 1231 } 1232 1233 if (ret == 0) { 1234 /* 1235 * Re-acquire svp if the driver changed *devp. 1236 */ 1237 1238 if (*devp != odev) { 1239 rw_exit(&svp->sv_lock); 1240 1241 svp = sv_dev_to_sv(*devp, NULL); 1242 1243 if (svp) { 1244 rw_enter(&svp->sv_lock, RW_READER); 1245 } 1246 } 1247 } 1248 } else { 1249 ret = ENODEV; 1250 } 1251 1252 if (svp && ret != 0 && svp->sv_state == SV_ENABLE) { 1253 /* 1254 * Underlying DDI open failed, but we have this 1255 * device SV enabled. If we can read some data 1256 * from the device, fake a successful open (this 1257 * probably means that this device is RDC'd and we 1258 * are getting the data from the secondary node). 1259 * 1260 * The reserve must be done with NSC_TRY|NSC_NOWAIT to 1261 * ensure that it does not deadlock if this open is 1262 * coming from nskernd:get_bsize(). 1263 */ 1264 rc = sv_reserve(svp->sv_fd, 1265 NSC_TRY | NSC_NOWAIT | NSC_MULTI | NSC_PCATCH); 1266 if (rc == 0) { 1267 tmph = NULL; 1268 1269 rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph); 1270 if (rc <= 0) { 1271 /* success */ 1272 ret = 0; 1273 } 1274 1275 if (tmph) { 1276 (void) nsc_free_buf(tmph); 1277 tmph = NULL; 1278 } 1279 1280 nsc_release(svp->sv_fd); 1281 1282 /* 1283 * Count the number of layered opens that we 1284 * fake since we have to fake a matching number 1285 * of closes (OTYP_LYR open/close calls must be 1286 * paired). 1287 */ 1288 1289 if (ret == 0 && otyp == OTYP_LYR) { 1290 mutex_enter(&svp->sv_olock); 1291 svp->sv_openlcnt++; 1292 mutex_exit(&svp->sv_olock); 1293 } 1294 } 1295 } 1296 1297 if (svp) { 1298 rw_exit(&svp->sv_lock); 1299 } 1300 1301 return (ret); 1302 } 1303 1304 1305 static int 1306 sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp) 1307 { 1308 sv_dev_t *svp; 1309 sv_maj_t *maj; 1310 int (*fn)(); 1311 int ret; 1312 1313 svp = sv_dev_to_sv(dev, &maj); 1314 1315 if (svp && 1316 svp->sv_state == SV_PENDING && 1317 svp->sv_pending == curthread) { 1318 /* 1319 * This is a recursive open from a call to 1320 * ddi_lyr_close and so we just want 1321 * to pass it straight through to the 1322 * underlying driver. 1323 */ 1324 DTRACE_PROBE2(sv_lyr_close_recursive, sv_dev_t *, svp, 1325 dev_t, dev); 1326 svp = NULL; 1327 } 1328 1329 if (svp) { 1330 rw_enter(&svp->sv_lock, RW_READER); 1331 1332 if (otyp == OTYP_LYR) { 1333 mutex_enter(&svp->sv_olock); 1334 1335 if (svp->sv_openlcnt) { 1336 /* 1337 * Consume sufficient layered closes to 1338 * account for the opens that we faked 1339 * whilst the device was failed. 1340 */ 1341 svp->sv_openlcnt--; 1342 mutex_exit(&svp->sv_olock); 1343 rw_exit(&svp->sv_lock); 1344 1345 DTRACE_PROBE1(sv_lyr_close_end, dev_t, dev); 1346 1347 return (0); 1348 } 1349 1350 mutex_exit(&svp->sv_olock); 1351 } 1352 } 1353 1354 if (maj && (fn = maj->sm_close) != 0) { 1355 if (!(maj->sm_flag & D_MP)) { 1356 UNSAFE_ENTER(); 1357 ret = (*fn)(dev, flag, otyp, crp); 1358 UNSAFE_EXIT(); 1359 } else { 1360 ret = (*fn)(dev, flag, otyp, crp); 1361 } 1362 } else { 1363 ret = ENODEV; 1364 } 1365 1366 if (svp) { 1367 rw_exit(&svp->sv_lock); 1368 } 1369 1370 return (ret); 1371 } 1372 1373 1374 /* 1375 * Convert the specified dev_t into a locked and enabled sv_dev_t, or 1376 * return NULL. 1377 */ 1378 static sv_dev_t * 1379 sv_find_enabled(const dev_t dev, sv_maj_t **majpp) 1380 { 1381 sv_dev_t *svp; 1382 1383 while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) { 1384 rw_enter(&svp->sv_lock, RW_READER); 1385 1386 if (svp->sv_state == SV_ENABLE) { 1387 /* locked and enabled */ 1388 break; 1389 } 1390 1391 /* 1392 * State was changed while waiting on the lock. 1393 * Wait for a stable state. 1394 */ 1395 rw_exit(&svp->sv_lock); 1396 1397 DTRACE_PROBE1(sv_find_enabled_retry, dev_t, dev); 1398 1399 delay(2); 1400 } 1401 1402 return (svp); 1403 } 1404 1405 1406 static int 1407 sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw) 1408 { 1409 sv_dev_t *svp; 1410 sv_maj_t *maj; 1411 int (*fn)(); 1412 int rc; 1413 1414 svp = sv_find_enabled(dev, &maj); 1415 if (svp == NULL) { 1416 if (maj) { 1417 if (rw == NSC_READ) 1418 fn = maj->sm_read; 1419 else 1420 fn = maj->sm_write; 1421 1422 if (fn != 0) { 1423 if (!(maj->sm_flag & D_MP)) { 1424 UNSAFE_ENTER(); 1425 rc = (*fn)(dev, uiop, crp); 1426 UNSAFE_EXIT(); 1427 } else { 1428 rc = (*fn)(dev, uiop, crp); 1429 } 1430 } 1431 1432 return (rc); 1433 } else { 1434 return (ENODEV); 1435 } 1436 } 1437 1438 ASSERT(RW_READ_HELD(&svp->sv_lock)); 1439 1440 if (svp->sv_flag == 0) { 1441 /* 1442 * guard access mode 1443 * - prevent user level access to the device 1444 */ 1445 DTRACE_PROBE1(sv_lyr_uio_err_guard, uio_t *, uiop); 1446 rc = EPERM; 1447 goto out; 1448 } 1449 1450 if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) { 1451 DTRACE_PROBE1(sv_lyr_uio_err_rsrv, uio_t *, uiop); 1452 goto out; 1453 } 1454 1455 if (rw == NSC_READ) 1456 rc = nsc_uread(svp->sv_fd, uiop, crp); 1457 else 1458 rc = nsc_uwrite(svp->sv_fd, uiop, crp); 1459 1460 nsc_release(svp->sv_fd); 1461 1462 out: 1463 rw_exit(&svp->sv_lock); 1464 1465 return (rc); 1466 } 1467 1468 1469 static int 1470 sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp) 1471 { 1472 return (sv_lyr_uio(dev, uiop, crp, NSC_READ)); 1473 } 1474 1475 1476 static int 1477 sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp) 1478 { 1479 return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE)); 1480 } 1481 1482 1483 /* ARGSUSED */ 1484 1485 static int 1486 sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp) 1487 { 1488 return (aphysio(sv_lyr_strategy, 1489 anocancel, dev, B_READ, minphys, aio)); 1490 } 1491 1492 1493 /* ARGSUSED */ 1494 1495 static int 1496 sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp) 1497 { 1498 return (aphysio(sv_lyr_strategy, 1499 anocancel, dev, B_WRITE, minphys, aio)); 1500 } 1501 1502 1503 /* 1504 * Set up an array containing the list of raw path names 1505 * The array for the paths is svl and the size of the array is 1506 * in size. 1507 * 1508 * If there are more layered devices than will fit in the array, 1509 * the number of extra layered devices is returned. Otherwise 1510 * zero is return. 1511 * 1512 * Input: 1513 * svn : array for paths 1514 * size : size of the array 1515 * 1516 * Output (extra): 1517 * zero : All paths fit in array 1518 * >0 : Number of defined layered devices don't fit in array 1519 */ 1520 1521 static int 1522 sv_list(void *ptr, const int size, int *extra, const int ilp32) 1523 { 1524 sv_name32_t *svn32; 1525 sv_name_t *svn; 1526 sv_dev_t *svp; 1527 int *mode, *nblocks; 1528 int i, index; 1529 char *path; 1530 1531 *extra = 0; 1532 index = 0; 1533 1534 if (ilp32) 1535 svn32 = ptr; 1536 else 1537 svn = ptr; 1538 1539 mutex_enter(&sv_mutex); 1540 for (i = 0; i < sv_max_devices; i++) { 1541 svp = &sv_devs[i]; 1542 1543 rw_enter(&svp->sv_lock, RW_READER); 1544 1545 if (svp->sv_state != SV_ENABLE) { 1546 rw_exit(&svp->sv_lock); 1547 continue; 1548 } 1549 1550 if ((*extra) != 0 || ptr == NULL) { 1551 /* Another overflow entry */ 1552 rw_exit(&svp->sv_lock); 1553 (*extra)++; 1554 continue; 1555 } 1556 1557 if (ilp32) { 1558 nblocks = &svn32->svn_nblocks; 1559 mode = &svn32->svn_mode; 1560 path = svn32->svn_path; 1561 1562 svn32->svn_timestamp = (uint32_t)svp->sv_timestamp; 1563 svn32++; 1564 } else { 1565 nblocks = &svn->svn_nblocks; 1566 mode = &svn->svn_mode; 1567 path = svn->svn_path; 1568 1569 svn->svn_timestamp = svp->sv_timestamp; 1570 svn++; 1571 } 1572 1573 (void) strcpy(path, nsc_pathname(svp->sv_fd)); 1574 *nblocks = svp->sv_nblocks; 1575 *mode = svp->sv_flag; 1576 1577 if (*nblocks == 0) { 1578 if (sv_debug > 3) 1579 cmn_err(CE_CONT, "!sv_list: need to reserve\n"); 1580 1581 if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) { 1582 *nblocks = svp->sv_nblocks; 1583 nsc_release(svp->sv_fd); 1584 } 1585 } 1586 1587 if (++index >= size) { 1588 /* Out of space */ 1589 (*extra)++; 1590 } 1591 1592 rw_exit(&svp->sv_lock); 1593 } 1594 mutex_exit(&sv_mutex); 1595 1596 if (index < size) { 1597 /* NULL terminated list */ 1598 if (ilp32) 1599 svn32->svn_path[0] = '\0'; 1600 else 1601 svn->svn_path[0] = '\0'; 1602 } 1603 1604 return (0); 1605 } 1606 1607 1608 static void 1609 sv_thread_tune(int threads) 1610 { 1611 int incr = (threads > 0) ? 1 : -1; 1612 int change = 0; 1613 int nthreads; 1614 1615 ASSERT(MUTEX_HELD(&sv_mutex)); 1616 1617 if (sv_threads_extra) { 1618 /* keep track of any additional threads requested */ 1619 if (threads > 0) { 1620 sv_threads_extra += threads; 1621 return; 1622 } 1623 threads = -threads; 1624 if (threads >= sv_threads_extra) { 1625 threads -= sv_threads_extra; 1626 sv_threads_extra = 0; 1627 /* fall through to while loop */ 1628 } else { 1629 sv_threads_extra -= threads; 1630 return; 1631 } 1632 } else if (threads > 0) { 1633 /* 1634 * do not increase the number of threads beyond 1635 * sv_threads_max when doing dynamic thread tuning 1636 */ 1637 nthreads = nst_nthread(sv_tset); 1638 if ((nthreads + threads) > sv_threads_max) { 1639 sv_threads_extra = nthreads + threads - sv_threads_max; 1640 threads = sv_threads_max - nthreads; 1641 if (threads <= 0) 1642 return; 1643 } 1644 } 1645 1646 if (threads < 0) 1647 threads = -threads; 1648 1649 while (threads--) { 1650 nthreads = nst_nthread(sv_tset); 1651 sv_threads_needed += incr; 1652 1653 if (sv_threads_needed >= nthreads) 1654 change += nst_add_thread(sv_tset, sv_threads_inc); 1655 else if ((sv_threads_needed < 1656 (nthreads - (sv_threads_inc + sv_threads_hysteresis))) && 1657 ((nthreads - sv_threads_inc) >= sv_threads)) 1658 change -= nst_del_thread(sv_tset, sv_threads_inc); 1659 } 1660 1661 #ifdef DEBUG 1662 if (change) { 1663 cmn_err(CE_NOTE, 1664 "!sv_thread_tune: threads needed %d, nthreads %d, " 1665 "nthreads change %d", 1666 sv_threads_needed, nst_nthread(sv_tset), change); 1667 } 1668 #endif 1669 } 1670 1671 1672 /* ARGSUSED */ 1673 static int 1674 svopen(dev_t *devp, int flag, int otyp, cred_t *crp) 1675 { 1676 int rc; 1677 1678 mutex_enter(&sv_mutex); 1679 rc = sv_init_devs(); 1680 mutex_exit(&sv_mutex); 1681 1682 return (rc); 1683 } 1684 1685 1686 /* ARGSUSED */ 1687 static int 1688 svclose(dev_t dev, int flag, int otyp, cred_t *crp) 1689 { 1690 const int secs = HZ * 5; 1691 const int ticks = HZ / 10; 1692 int loops = secs / ticks; 1693 1694 mutex_enter(&sv_mutex); 1695 while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) { 1696 if (nst_nlive(sv_tset) <= 0) { 1697 nst_destroy(sv_tset); 1698 sv_tset = NULL; 1699 break; 1700 } 1701 1702 /* threads still active - wait for them to exit */ 1703 mutex_exit(&sv_mutex); 1704 delay(ticks); 1705 loops--; 1706 mutex_enter(&sv_mutex); 1707 } 1708 mutex_exit(&sv_mutex); 1709 1710 if (loops <= 0) { 1711 cmn_err(CE_WARN, 1712 #ifndef DEBUG 1713 /* do not write to console when non-DEBUG */ 1714 "!" 1715 #endif 1716 "sv:svclose: threads still active " 1717 "after %d sec - leaking thread set", secs); 1718 } 1719 1720 return (0); 1721 } 1722 1723 1724 static int 1725 svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp) 1726 { 1727 char itmp1[12], itmp2[12]; /* temp char array for editing ints */ 1728 spcs_s_info_t kstatus; /* Kernel version of spcs status */ 1729 spcs_s_info_t ustatus; /* Address of user version of spcs status */ 1730 sv_list32_t svl32; /* 32 bit Initial structure for SVIOC_LIST */ 1731 sv_version_t svv; /* Version structure */ 1732 sv_conf_t svc; /* User config structure */ 1733 sv_list_t svl; /* Initial structure for SVIOC_LIST */ 1734 void *usvn; /* Address of user sv_name_t */ 1735 void *svn = NULL; /* Array for SVIOC_LIST */ 1736 uint64_t phash; /* pathname hash */ 1737 int rc = 0; /* Return code -- errno */ 1738 int size; /* Number of items in array */ 1739 int bytes; /* Byte size of array */ 1740 int ilp32; /* Convert data structures for ilp32 userland */ 1741 1742 *rvalp = 0; 1743 1744 /* 1745 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, then it will continue. 1746 * else it means it previously was SV_PREVENT_UNLOAD, and now it's 1747 * SV_ALLOW_UNLOAD, expecting the driver to eventually unload. 1748 * 1749 * SV_ALLOW_UNLOAD is final state, so no need to grab sv_mutex. 1750 */ 1751 if (sv_mod_status == SV_ALLOW_UNLOAD) { 1752 return (EBUSY); 1753 } 1754 1755 if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0)) 1756 return (rc); 1757 1758 kstatus = spcs_s_kcreate(); 1759 if (!kstatus) { 1760 DTRACE_PROBE1(sv_ioctl_err_kcreate, dev_t, dev); 1761 return (ENOMEM); 1762 } 1763 1764 ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32); 1765 1766 switch (cmd) { 1767 1768 case SVIOC_ENABLE: 1769 1770 if (ilp32) { 1771 sv_conf32_t svc32; 1772 1773 if (ddi_copyin((void *)arg, &svc32, 1774 sizeof (svc32), mode) < 0) { 1775 spcs_s_kfree(kstatus); 1776 return (EFAULT); 1777 } 1778 1779 svc.svc_error = (spcs_s_info_t)svc32.svc_error; 1780 (void) strcpy(svc.svc_path, svc32.svc_path); 1781 svc.svc_flag = svc32.svc_flag; 1782 svc.svc_major = svc32.svc_major; 1783 svc.svc_minor = svc32.svc_minor; 1784 } else { 1785 if (ddi_copyin((void *)arg, &svc, 1786 sizeof (svc), mode) < 0) { 1787 spcs_s_kfree(kstatus); 1788 return (EFAULT); 1789 } 1790 } 1791 1792 /* force to raw access */ 1793 svc.svc_flag = NSC_DEVICE; 1794 1795 if (sv_tset == NULL) { 1796 mutex_enter(&sv_mutex); 1797 1798 if (sv_tset == NULL) { 1799 sv_tset = nst_init("sv_thr", sv_threads); 1800 } 1801 1802 mutex_exit(&sv_mutex); 1803 1804 if (sv_tset == NULL) { 1805 cmn_err(CE_WARN, 1806 "!sv: could not allocate %d threads", 1807 sv_threads); 1808 } 1809 } 1810 1811 rc = sv_enable(svc.svc_path, svc.svc_flag, 1812 makedevice(svc.svc_major, svc.svc_minor), kstatus); 1813 1814 if (rc == 0) { 1815 sv_config_time = nsc_lbolt(); 1816 1817 mutex_enter(&sv_mutex); 1818 sv_thread_tune(sv_threads_dev); 1819 mutex_exit(&sv_mutex); 1820 } 1821 1822 DTRACE_PROBE3(sv_ioctl_end, dev_t, dev, int, *rvalp, int, rc); 1823 1824 return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc)); 1825 /* NOTREACHED */ 1826 1827 case SVIOC_DISABLE: 1828 1829 if (ilp32) { 1830 sv_conf32_t svc32; 1831 1832 if (ddi_copyin((void *)arg, &svc32, 1833 sizeof (svc32), mode) < 0) { 1834 spcs_s_kfree(kstatus); 1835 return (EFAULT); 1836 } 1837 1838 svc.svc_error = (spcs_s_info_t)svc32.svc_error; 1839 svc.svc_major = svc32.svc_major; 1840 svc.svc_minor = svc32.svc_minor; 1841 (void) strcpy(svc.svc_path, svc32.svc_path); 1842 svc.svc_flag = svc32.svc_flag; 1843 } else { 1844 if (ddi_copyin((void *)arg, &svc, 1845 sizeof (svc), mode) < 0) { 1846 spcs_s_kfree(kstatus); 1847 return (EFAULT); 1848 } 1849 } 1850 1851 if (svc.svc_major == (major_t)-1 && 1852 svc.svc_minor == (minor_t)-1) { 1853 sv_dev_t *svp; 1854 int i; 1855 1856 /* 1857 * User level could not find the minor device 1858 * node, so do this the slow way by searching 1859 * the entire sv config for a matching pathname. 1860 */ 1861 1862 phash = nsc_strhash(svc.svc_path); 1863 1864 mutex_enter(&sv_mutex); 1865 1866 for (i = 0; i < sv_max_devices; i++) { 1867 svp = &sv_devs[i]; 1868 1869 if (svp->sv_state == SV_DISABLE || 1870 svp->sv_fd == NULL) 1871 continue; 1872 1873 if (nsc_fdpathcmp(svp->sv_fd, phash, 1874 svc.svc_path) == 0) { 1875 svc.svc_major = getmajor(svp->sv_dev); 1876 svc.svc_minor = getminor(svp->sv_dev); 1877 break; 1878 } 1879 } 1880 1881 mutex_exit(&sv_mutex); 1882 1883 if (svc.svc_major == (major_t)-1 && 1884 svc.svc_minor == (minor_t)-1) 1885 return (spcs_s_ocopyoutf(&kstatus, 1886 svc.svc_error, SV_ENODEV)); 1887 } 1888 1889 rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor), 1890 kstatus); 1891 1892 if (rc == 0) { 1893 sv_config_time = nsc_lbolt(); 1894 1895 mutex_enter(&sv_mutex); 1896 sv_thread_tune(-sv_threads_dev); 1897 mutex_exit(&sv_mutex); 1898 } 1899 1900 DTRACE_PROBE3(sv_ioctl_2, dev_t, dev, int, *rvalp, int, rc); 1901 1902 return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc)); 1903 /* NOTREACHED */ 1904 1905 case SVIOC_LIST: 1906 1907 if (ilp32) { 1908 if (ddi_copyin((void *)arg, &svl32, 1909 sizeof (svl32), mode) < 0) { 1910 spcs_s_kfree(kstatus); 1911 return (EFAULT); 1912 } 1913 1914 ustatus = (spcs_s_info_t)svl32.svl_error; 1915 size = svl32.svl_count; 1916 usvn = (void *)(unsigned long)svl32.svl_names; 1917 } else { 1918 if (ddi_copyin((void *)arg, &svl, 1919 sizeof (svl), mode) < 0) { 1920 spcs_s_kfree(kstatus); 1921 return (EFAULT); 1922 } 1923 1924 ustatus = svl.svl_error; 1925 size = svl.svl_count; 1926 usvn = svl.svl_names; 1927 } 1928 1929 /* Do some boundary checking */ 1930 if ((size < 0) || (size > sv_max_devices)) { 1931 /* Array size is out of range */ 1932 return (spcs_s_ocopyoutf(&kstatus, ustatus, 1933 SV_EARRBOUNDS, "0", 1934 spcs_s_inttostring(sv_max_devices, itmp1, 1935 sizeof (itmp1), 0), 1936 spcs_s_inttostring(size, itmp2, 1937 sizeof (itmp2), 0))); 1938 } 1939 1940 if (ilp32) 1941 bytes = size * sizeof (sv_name32_t); 1942 else 1943 bytes = size * sizeof (sv_name_t); 1944 1945 /* Allocate memory for the array of structures */ 1946 if (bytes != 0) { 1947 svn = kmem_zalloc(bytes, KM_SLEEP); 1948 if (!svn) { 1949 return (spcs_s_ocopyoutf(&kstatus, 1950 ustatus, ENOMEM)); 1951 } 1952 } 1953 1954 rc = sv_list(svn, size, rvalp, ilp32); 1955 if (rc) { 1956 if (svn != NULL) 1957 kmem_free(svn, bytes); 1958 return (spcs_s_ocopyoutf(&kstatus, ustatus, rc)); 1959 } 1960 1961 if (ilp32) { 1962 svl32.svl_timestamp = (uint32_t)sv_config_time; 1963 svl32.svl_maxdevs = (int32_t)sv_max_devices; 1964 1965 /* Return the list structure */ 1966 if (ddi_copyout(&svl32, (void *)arg, 1967 sizeof (svl32), mode) < 0) { 1968 spcs_s_kfree(kstatus); 1969 if (svn != NULL) 1970 kmem_free(svn, bytes); 1971 return (EFAULT); 1972 } 1973 } else { 1974 svl.svl_timestamp = sv_config_time; 1975 svl.svl_maxdevs = sv_max_devices; 1976 1977 /* Return the list structure */ 1978 if (ddi_copyout(&svl, (void *)arg, 1979 sizeof (svl), mode) < 0) { 1980 spcs_s_kfree(kstatus); 1981 if (svn != NULL) 1982 kmem_free(svn, bytes); 1983 return (EFAULT); 1984 } 1985 } 1986 1987 /* Return the array */ 1988 if (svn != NULL) { 1989 if (ddi_copyout(svn, usvn, bytes, mode) < 0) { 1990 kmem_free(svn, bytes); 1991 spcs_s_kfree(kstatus); 1992 return (EFAULT); 1993 } 1994 kmem_free(svn, bytes); 1995 } 1996 1997 DTRACE_PROBE3(sv_ioctl_3, dev_t, dev, int, *rvalp, int, 0); 1998 1999 return (spcs_s_ocopyoutf(&kstatus, ustatus, 0)); 2000 /* NOTREACHED */ 2001 2002 case SVIOC_VERSION: 2003 2004 if (ilp32) { 2005 sv_version32_t svv32; 2006 2007 if (ddi_copyin((void *)arg, &svv32, 2008 sizeof (svv32), mode) < 0) { 2009 spcs_s_kfree(kstatus); 2010 return (EFAULT); 2011 } 2012 2013 svv32.svv_major_rev = sv_major_rev; 2014 svv32.svv_minor_rev = sv_minor_rev; 2015 svv32.svv_micro_rev = sv_micro_rev; 2016 svv32.svv_baseline_rev = sv_baseline_rev; 2017 2018 if (ddi_copyout(&svv32, (void *)arg, 2019 sizeof (svv32), mode) < 0) { 2020 spcs_s_kfree(kstatus); 2021 return (EFAULT); 2022 } 2023 2024 ustatus = (spcs_s_info_t)svv32.svv_error; 2025 } else { 2026 if (ddi_copyin((void *)arg, &svv, 2027 sizeof (svv), mode) < 0) { 2028 spcs_s_kfree(kstatus); 2029 return (EFAULT); 2030 } 2031 2032 svv.svv_major_rev = sv_major_rev; 2033 svv.svv_minor_rev = sv_minor_rev; 2034 svv.svv_micro_rev = sv_micro_rev; 2035 svv.svv_baseline_rev = sv_baseline_rev; 2036 2037 if (ddi_copyout(&svv, (void *)arg, 2038 sizeof (svv), mode) < 0) { 2039 spcs_s_kfree(kstatus); 2040 return (EFAULT); 2041 } 2042 2043 ustatus = svv.svv_error; 2044 } 2045 2046 DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, 0); 2047 2048 return (spcs_s_ocopyoutf(&kstatus, ustatus, 0)); 2049 /* NOTREACHED */ 2050 2051 case SVIOC_UNLOAD: 2052 rc = sv_prepare_unload(); 2053 2054 if (ddi_copyout(&rc, (void *)arg, sizeof (rc), mode) < 0) { 2055 rc = EFAULT; 2056 } 2057 2058 spcs_s_kfree(kstatus); 2059 return (rc); 2060 2061 default: 2062 spcs_s_kfree(kstatus); 2063 2064 DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, EINVAL); 2065 2066 return (EINVAL); 2067 /* NOTREACHED */ 2068 } 2069 2070 /* NOTREACHED */ 2071 } 2072 2073 2074 /* ARGSUSED */ 2075 static int 2076 svprint(dev_t dev, char *str) 2077 { 2078 int instance = ddi_get_instance(sv_dip); 2079 cmn_err(CE_WARN, "!%s%d: %s", ddi_get_name(sv_dip), instance, str); 2080 return (0); 2081 } 2082 2083 2084 static void 2085 _sv_lyr_strategy(struct buf *bp) 2086 { 2087 caddr_t buf_addr; /* pointer to linear buffer in bp */ 2088 nsc_buf_t *bufh = NULL; 2089 nsc_buf_t *hndl = NULL; 2090 sv_dev_t *svp; 2091 nsc_vec_t *v; 2092 sv_maj_t *maj; 2093 nsc_size_t fba_req, fba_len; /* FBA lengths */ 2094 nsc_off_t fba_off; /* FBA offset */ 2095 size_t tocopy, nbytes; /* byte lengths */ 2096 int rw, rc; /* flags and return codes */ 2097 int (*fn)(); 2098 2099 rc = 0; 2100 2101 if (sv_debug > 5) 2102 cmn_err(CE_CONT, "!_sv_lyr_strategy(%p)\n", (void *)bp); 2103 2104 svp = sv_find_enabled(bp->b_edev, &maj); 2105 if (svp == NULL) { 2106 if (maj && (fn = maj->sm_strategy) != 0) { 2107 if (!(maj->sm_flag & D_MP)) { 2108 UNSAFE_ENTER(); 2109 rc = (*fn)(bp); 2110 UNSAFE_EXIT(); 2111 } else { 2112 rc = (*fn)(bp); 2113 } 2114 return; 2115 } else { 2116 bioerror(bp, ENODEV); 2117 biodone(bp); 2118 return; 2119 } 2120 } 2121 2122 ASSERT(RW_READ_HELD(&svp->sv_lock)); 2123 2124 if (svp->sv_flag == 0) { 2125 /* 2126 * guard access mode 2127 * - prevent user level access to the device 2128 */ 2129 DTRACE_PROBE1(sv_lyr_strategy_err_guard, struct buf *, bp); 2130 bioerror(bp, EPERM); 2131 goto out; 2132 } 2133 2134 if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) { 2135 DTRACE_PROBE1(sv_lyr_strategy_err_rsrv, struct buf *, bp); 2136 2137 if (rc == EINTR) 2138 cmn_err(CE_WARN, "!nsc_reserve() returned EINTR"); 2139 bioerror(bp, rc); 2140 goto out; 2141 } 2142 2143 if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) { 2144 DTRACE_PROBE1(sv_lyr_strategy_eof, struct buf *, bp); 2145 2146 if (bp->b_flags & B_READ) { 2147 /* return EOF, not an error */ 2148 bp->b_resid = bp->b_bcount; 2149 bioerror(bp, 0); 2150 } else 2151 bioerror(bp, EINVAL); 2152 2153 goto done; 2154 } 2155 2156 /* 2157 * Preallocate a handle once per call to strategy. 2158 * If this fails, then the nsc_alloc_buf() will allocate 2159 * a temporary handle per allocation/free pair. 2160 */ 2161 2162 DTRACE_PROBE1(sv_dbg_alloch_start, sv_dev_t *, svp); 2163 2164 bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL); 2165 2166 DTRACE_PROBE1(sv_dbg_alloch_end, sv_dev_t *, svp); 2167 2168 if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) { 2169 DTRACE_PROBE1(sv_lyr_strategy_err_hactive, struct buf *, bp); 2170 2171 cmn_err(CE_WARN, 2172 "!sv: allocated active handle (bufh %p, flags %x)", 2173 (void *)bufh, bufh->sb_flag); 2174 2175 bioerror(bp, ENXIO); 2176 goto done; 2177 } 2178 2179 fba_req = FBA_LEN(bp->b_bcount); 2180 if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks) 2181 fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno); 2182 2183 rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE; 2184 2185 bp_mapin(bp); 2186 2187 bp->b_resid = bp->b_bcount; 2188 buf_addr = bp->b_un.b_addr; 2189 fba_off = 0; 2190 2191 /* 2192 * fba_req - requested size of transfer in FBAs after 2193 * truncation to device extent, and allowing for 2194 * possible non-FBA bounded final chunk. 2195 * fba_off - offset of start of chunk from start of bp in FBAs. 2196 * fba_len - size of this chunk in FBAs. 2197 */ 2198 2199 loop: 2200 fba_len = min(fba_req, svp->sv_maxfbas); 2201 hndl = bufh; 2202 2203 DTRACE_PROBE4(sv_dbg_allocb_start, 2204 sv_dev_t *, svp, 2205 uint64_t, (uint64_t)(bp->b_lblkno + fba_off), 2206 uint64_t, (uint64_t)fba_len, 2207 int, rw); 2208 2209 rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off), 2210 fba_len, rw, &hndl); 2211 2212 DTRACE_PROBE1(sv_dbg_allocb_end, sv_dev_t *, svp); 2213 2214 if (rc > 0) { 2215 DTRACE_PROBE1(sv_lyr_strategy_err_alloc, struct buf *, bp); 2216 bioerror(bp, rc); 2217 if (hndl != bufh) 2218 (void) nsc_free_buf(hndl); 2219 hndl = NULL; 2220 goto done; 2221 } 2222 2223 tocopy = min(FBA_SIZE(fba_len), bp->b_resid); 2224 v = hndl->sb_vec; 2225 2226 if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) { 2227 /* 2228 * Not overwriting all of the last FBA, so read in the 2229 * old contents now before we overwrite it with the new 2230 * data. 2231 */ 2232 2233 DTRACE_PROBE2(sv_dbg_read_start, sv_dev_t *, svp, 2234 uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1)); 2235 2236 rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0); 2237 if (rc > 0) { 2238 bioerror(bp, rc); 2239 goto done; 2240 } 2241 2242 DTRACE_PROBE1(sv_dbg_read_end, sv_dev_t *, svp); 2243 } 2244 2245 DTRACE_PROBE1(sv_dbg_bcopy_start, sv_dev_t *, svp); 2246 2247 while (tocopy > 0) { 2248 nbytes = min(tocopy, (nsc_size_t)v->sv_len); 2249 2250 if (bp->b_flags & B_READ) 2251 (void) bcopy(v->sv_addr, buf_addr, nbytes); 2252 else 2253 (void) bcopy(buf_addr, v->sv_addr, nbytes); 2254 2255 bp->b_resid -= nbytes; 2256 buf_addr += nbytes; 2257 tocopy -= nbytes; 2258 v++; 2259 } 2260 2261 DTRACE_PROBE1(sv_dbg_bcopy_end, sv_dev_t *, svp); 2262 2263 if ((bp->b_flags & B_READ) == 0) { 2264 DTRACE_PROBE3(sv_dbg_write_start, sv_dev_t *, svp, 2265 uint64_t, (uint64_t)hndl->sb_pos, 2266 uint64_t, (uint64_t)hndl->sb_len); 2267 2268 rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0); 2269 2270 DTRACE_PROBE1(sv_dbg_write_end, sv_dev_t *, svp); 2271 2272 if (rc > 0) { 2273 bioerror(bp, rc); 2274 goto done; 2275 } 2276 } 2277 2278 /* 2279 * Adjust FBA offset and requested (ie. remaining) length, 2280 * loop if more data to transfer. 2281 */ 2282 2283 fba_off += fba_len; 2284 fba_req -= fba_len; 2285 2286 if (fba_req > 0) { 2287 DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp); 2288 2289 rc = nsc_free_buf(hndl); 2290 2291 DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp); 2292 2293 if (rc > 0) { 2294 DTRACE_PROBE1(sv_lyr_strategy_err_free, 2295 struct buf *, bp); 2296 bioerror(bp, rc); 2297 } 2298 2299 hndl = NULL; 2300 2301 if (rc <= 0) 2302 goto loop; 2303 } 2304 2305 done: 2306 if (hndl != NULL) { 2307 DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp); 2308 2309 rc = nsc_free_buf(hndl); 2310 2311 DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp); 2312 2313 if (rc > 0) { 2314 DTRACE_PROBE1(sv_lyr_strategy_err_free, 2315 struct buf *, bp); 2316 bioerror(bp, rc); 2317 } 2318 2319 hndl = NULL; 2320 } 2321 2322 if (bufh) 2323 (void) nsc_free_handle(bufh); 2324 2325 DTRACE_PROBE1(sv_dbg_rlse_start, sv_dev_t *, svp); 2326 2327 nsc_release(svp->sv_fd); 2328 2329 DTRACE_PROBE1(sv_dbg_rlse_end, sv_dev_t *, svp); 2330 2331 out: 2332 if (sv_debug > 5) { 2333 cmn_err(CE_CONT, 2334 "!_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n", 2335 (void *)bp, (void *)bufh, bp->b_error); 2336 } 2337 2338 DTRACE_PROBE2(sv_lyr_strategy_end, struct buf *, bp, int, bp->b_error); 2339 2340 rw_exit(&svp->sv_lock); 2341 biodone(bp); 2342 } 2343 2344 2345 static void 2346 sv_async_strategy(blind_t arg) 2347 { 2348 struct buf *bp = (struct buf *)arg; 2349 _sv_lyr_strategy(bp); 2350 } 2351 2352 2353 static int 2354 sv_lyr_strategy(struct buf *bp) 2355 { 2356 nsthread_t *tp; 2357 int nlive; 2358 2359 /* 2360 * If B_ASYNC was part of the DDI we could use it as a hint to 2361 * not create a thread for synchronous i/o. 2362 */ 2363 if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) { 2364 /* not sv enabled - just pass through */ 2365 DTRACE_PROBE1(sv_lyr_strategy_notsv, struct buf *, bp); 2366 _sv_lyr_strategy(bp); 2367 return (0); 2368 } 2369 2370 if (sv_debug > 4) { 2371 cmn_err(CE_CONT, "!sv_lyr_strategy: nthread %d nlive %d\n", 2372 nst_nthread(sv_tset), nst_nlive(sv_tset)); 2373 } 2374 2375 /* 2376 * If there are only guard devices enabled there 2377 * won't be a threadset, so don't try and use it. 2378 */ 2379 tp = NULL; 2380 if (sv_tset != NULL) { 2381 tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0); 2382 } 2383 2384 if (tp == NULL) { 2385 /* 2386 * out of threads, so fall back to synchronous io. 2387 */ 2388 if (sv_debug > 0) { 2389 cmn_err(CE_CONT, 2390 "!sv_lyr_strategy: thread alloc failed\n"); 2391 } 2392 2393 DTRACE_PROBE1(sv_lyr_strategy_no_thread, 2394 struct buf *, bp); 2395 2396 _sv_lyr_strategy(bp); 2397 sv_no_threads++; 2398 } else { 2399 nlive = nst_nlive(sv_tset); 2400 if (nlive > sv_max_nlive) { 2401 if (sv_debug > 0) { 2402 cmn_err(CE_CONT, 2403 "!sv_lyr_strategy: " 2404 "new max nlive %d (nthread %d)\n", 2405 nlive, nst_nthread(sv_tset)); 2406 } 2407 2408 sv_max_nlive = nlive; 2409 } 2410 } 2411 2412 return (0); 2413 } 2414 2415 2416 #ifndef offsetof 2417 #define offsetof(s, m) ((size_t)(&((s *)0)->m)) 2418 #endif 2419 2420 /* 2421 * re-write the size of the current partition 2422 */ 2423 static int 2424 sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp) 2425 { 2426 size_t offset; 2427 int ilp32; 2428 int pnum; 2429 int rc; 2430 2431 ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32); 2432 2433 rc = nskern_partition(svp->sv_dev, &pnum); 2434 if (rc != 0) { 2435 return (rc); 2436 } 2437 2438 if (pnum < 0 || pnum >= V_NUMPAR) { 2439 cmn_err(CE_WARN, 2440 "!sv_gvtoc: unable to determine partition number " 2441 "for dev %lx", svp->sv_dev); 2442 return (EINVAL); 2443 } 2444 2445 if (ilp32) { 2446 int32_t p_size; 2447 2448 #ifdef _SunOS_5_6 2449 offset = offsetof(struct vtoc, v_part); 2450 offset += sizeof (struct partition) * pnum; 2451 offset += offsetof(struct partition, p_size); 2452 #else 2453 offset = offsetof(struct vtoc32, v_part); 2454 offset += sizeof (struct partition32) * pnum; 2455 offset += offsetof(struct partition32, p_size); 2456 #endif 2457 2458 p_size = (int32_t)svp->sv_nblocks; 2459 if (p_size == 0) { 2460 if (sv_reserve(svp->sv_fd, 2461 NSC_MULTI|NSC_PCATCH) == 0) { 2462 p_size = (int32_t)svp->sv_nblocks; 2463 nsc_release(svp->sv_fd); 2464 } else { 2465 rc = EINTR; 2466 } 2467 } 2468 2469 if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset), 2470 sizeof (p_size), mode) != 0) { 2471 rc = EFAULT; 2472 } 2473 } else { 2474 long p_size; 2475 2476 offset = offsetof(struct vtoc, v_part); 2477 offset += sizeof (struct partition) * pnum; 2478 offset += offsetof(struct partition, p_size); 2479 2480 p_size = (long)svp->sv_nblocks; 2481 if (p_size == 0) { 2482 if (sv_reserve(svp->sv_fd, 2483 NSC_MULTI|NSC_PCATCH) == 0) { 2484 p_size = (long)svp->sv_nblocks; 2485 nsc_release(svp->sv_fd); 2486 } else { 2487 rc = EINTR; 2488 } 2489 } 2490 2491 if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset), 2492 sizeof (p_size), mode) != 0) { 2493 rc = EFAULT; 2494 } 2495 } 2496 2497 return (rc); 2498 } 2499 2500 2501 #ifdef DKIOCPARTITION 2502 /* 2503 * re-write the size of the current partition 2504 * 2505 * arg is dk_efi_t. 2506 * 2507 * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64; 2508 * 2509 * dk_efi_t->dki_data --> efi_gpt_t (label header) 2510 * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions) 2511 * 2512 * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts 2513 * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself 2514 * 2515 * This assumes that sizeof (efi_gpt_t) is the same as the size of a 2516 * logical block on the disk. 2517 * 2518 * Everything is little endian (i.e. disk format). 2519 */ 2520 static int 2521 sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp) 2522 { 2523 dk_efi_t efi; 2524 efi_gpt_t gpt; 2525 efi_gpe_t *gpe = NULL; 2526 size_t sgpe; 2527 uint64_t p_size; /* virtual partition size from nsctl */ 2528 uint32_t crc; 2529 int unparts; /* number of parts in user's array */ 2530 int pnum; 2531 int rc; 2532 2533 rc = nskern_partition(svp->sv_dev, &pnum); 2534 if (rc != 0) { 2535 return (rc); 2536 } 2537 2538 if (pnum < 0) { 2539 cmn_err(CE_WARN, 2540 "!sv_efi: unable to determine partition number for dev %lx", 2541 svp->sv_dev); 2542 return (EINVAL); 2543 } 2544 2545 if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) { 2546 return (EFAULT); 2547 } 2548 2549 efi.dki_data = (void *)(uintptr_t)efi.dki_data_64; 2550 2551 if (efi.dki_length < sizeof (gpt) + sizeof (gpe)) { 2552 return (EINVAL); 2553 } 2554 2555 if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) { 2556 rc = EFAULT; 2557 goto out; 2558 } 2559 2560 if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0) 2561 unparts = 1; 2562 else if (pnum >= unparts) { 2563 cmn_err(CE_WARN, 2564 "!sv_efi: partition# beyond end of user array (%d >= %d)", 2565 pnum, unparts); 2566 return (EINVAL); 2567 } 2568 2569 sgpe = sizeof (*gpe) * unparts; 2570 gpe = kmem_alloc(sgpe, KM_SLEEP); 2571 2572 if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) { 2573 rc = EFAULT; 2574 goto out; 2575 } 2576 2577 p_size = svp->sv_nblocks; 2578 if (p_size == 0) { 2579 if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) { 2580 p_size = (diskaddr_t)svp->sv_nblocks; 2581 nsc_release(svp->sv_fd); 2582 } else { 2583 rc = EINTR; 2584 } 2585 } 2586 2587 gpe[pnum].efi_gpe_EndingLBA = LE_64( 2588 LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1); 2589 2590 gpt.efi_gpt_PartitionEntryArrayCRC32 = 0; 2591 CRC32(crc, gpe, sgpe, -1U, sv_crc32_table); 2592 gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); 2593 2594 gpt.efi_gpt_HeaderCRC32 = 0; 2595 CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table); 2596 gpt.efi_gpt_HeaderCRC32 = LE_32(~crc); 2597 2598 if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) { 2599 rc = EFAULT; 2600 goto out; 2601 } 2602 2603 if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) { 2604 rc = EFAULT; 2605 goto out; 2606 } 2607 2608 out: 2609 if (gpe) { 2610 kmem_free(gpe, sgpe); 2611 } 2612 2613 return (rc); 2614 } 2615 2616 2617 /* 2618 * Re-write the size of the partition specified by p_partno 2619 * 2620 * Note that if a DKIOCPARTITION is issued to an fd opened against a 2621 * non-sv'd device, but p_partno requests the size for a different 2622 * device that is sv'd, this function will *not* be called as sv is 2623 * not interposed on the original device (the fd). 2624 * 2625 * It would not be easy to change this as we cannot get the partition 2626 * number for the non-sv'd device, so cannot compute the dev_t of the 2627 * (sv'd) p_partno device, and so cannot find out if it is sv'd or get 2628 * its size from nsctl. 2629 * 2630 * See also the "Bug 4755783" comment in sv_lyr_ioctl(). 2631 */ 2632 static int 2633 sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp) 2634 { 2635 struct partition64 p64; 2636 sv_dev_t *nsvp = NULL; 2637 diskaddr_t p_size; 2638 minor_t nminor; 2639 int pnum, rc; 2640 dev_t ndev; 2641 2642 rc = nskern_partition(svp->sv_dev, &pnum); 2643 if (rc != 0) { 2644 return (rc); 2645 } 2646 2647 if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) { 2648 return (EFAULT); 2649 } 2650 2651 if (p64.p_partno != pnum) { 2652 /* switch to requested partition, not the current one */ 2653 nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum); 2654 ndev = makedevice(getmajor(svp->sv_dev), nminor); 2655 nsvp = sv_find_enabled(ndev, NULL); 2656 if (nsvp == NULL) { 2657 /* not sv device - just return */ 2658 return (0); 2659 } 2660 2661 svp = nsvp; 2662 } 2663 2664 p_size = svp->sv_nblocks; 2665 if (p_size == 0) { 2666 if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) { 2667 p_size = (diskaddr_t)svp->sv_nblocks; 2668 nsc_release(svp->sv_fd); 2669 } else { 2670 rc = EINTR; 2671 } 2672 } 2673 2674 if (nsvp != NULL) { 2675 rw_exit(&nsvp->sv_lock); 2676 } 2677 2678 if ((rc == 0) && ddi_copyout(&p_size, 2679 (void *)(arg + offsetof(struct partition64, p_size)), 2680 sizeof (p_size), mode) != 0) { 2681 return (EFAULT); 2682 } 2683 2684 return (rc); 2685 } 2686 #endif /* DKIOCPARTITION */ 2687 2688 2689 static int 2690 sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg, 2691 const int mode, cred_t *crp, int *rvalp) 2692 { 2693 sv_dev_t *svp; 2694 sv_maj_t *maj; 2695 int (*fn)(); 2696 int rc = 0; 2697 2698 maj = 0; 2699 fn = 0; 2700 2701 /* 2702 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, then it will continue. 2703 * else it means it previously was SV_PREVENT_UNLOAD, and now it's 2704 * SV_ALLOW_UNLOAD, expecting the driver to eventually unload. 2705 * 2706 * SV_ALLOW_UNLOAD is final state, so no need to grab sv_mutex. 2707 */ 2708 if (sv_mod_status == SV_ALLOW_UNLOAD) { 2709 return (EBUSY); 2710 } 2711 2712 svp = sv_find_enabled(dev, &maj); 2713 if (svp != NULL) { 2714 if (nskernd_isdaemon()) { 2715 /* 2716 * This is nskernd which always needs to see 2717 * the underlying disk device accurately. 2718 * 2719 * So just pass the ioctl straight through 2720 * to the underlying driver as though the device 2721 * was not sv enabled. 2722 */ 2723 DTRACE_PROBE2(sv_lyr_ioctl_nskernd, sv_dev_t *, svp, 2724 dev_t, dev); 2725 2726 rw_exit(&svp->sv_lock); 2727 svp = NULL; 2728 } else { 2729 ASSERT(RW_READ_HELD(&svp->sv_lock)); 2730 } 2731 } 2732 2733 /* 2734 * We now have a locked and enabled SV device, or a non-SV device. 2735 */ 2736 2737 switch (cmd) { 2738 /* 2739 * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI 2740 * and DKIOCSETEFI are intercepted and faked up as some 2741 * i/o providers emulate volumes of a different size to 2742 * the underlying volume. 2743 * 2744 * Setting the size by rewriting the vtoc is not permitted. 2745 */ 2746 2747 case DKIOCSVTOC: 2748 #ifdef DKIOCPARTITION 2749 case DKIOCSETEFI: 2750 #endif 2751 if (svp == NULL) { 2752 /* not intercepted -- allow ioctl through */ 2753 break; 2754 } 2755 2756 rw_exit(&svp->sv_lock); 2757 2758 DTRACE_PROBE2(sv_lyr_ioctl_svtoc, dev_t, dev, int, EPERM); 2759 2760 return (EPERM); 2761 2762 default: 2763 break; 2764 } 2765 2766 /* 2767 * Pass through the real ioctl command. 2768 */ 2769 2770 if (maj && (fn = maj->sm_ioctl) != 0) { 2771 if (!(maj->sm_flag & D_MP)) { 2772 UNSAFE_ENTER(); 2773 rc = (*fn)(dev, cmd, arg, mode, crp, rvalp); 2774 UNSAFE_EXIT(); 2775 } else { 2776 rc = (*fn)(dev, cmd, arg, mode, crp, rvalp); 2777 } 2778 } else { 2779 rc = ENODEV; 2780 } 2781 2782 /* 2783 * Bug 4755783 2784 * Fix up the size of the current partition to allow 2785 * for the virtual volume to be a different size to the 2786 * physical volume (e.g. for II compact dependent shadows). 2787 * 2788 * Note that this only attempts to fix up the current partition 2789 * - the one that the ioctl was issued against. There could be 2790 * other sv'd partitions in the same vtoc, but we cannot tell 2791 * so we don't attempt to fix them up. 2792 */ 2793 2794 if (svp != NULL && rc == 0) { 2795 switch (cmd) { 2796 case DKIOCGVTOC: 2797 rc = sv_fix_dkiocgvtoc(arg, mode, svp); 2798 break; 2799 2800 #ifdef DKIOCPARTITION 2801 case DKIOCGETEFI: 2802 rc = sv_fix_dkiocgetefi(arg, mode, svp); 2803 break; 2804 2805 case DKIOCPARTITION: 2806 rc = sv_fix_dkiocpartition(arg, mode, svp); 2807 break; 2808 #endif /* DKIOCPARTITION */ 2809 } 2810 } 2811 2812 if (svp != NULL) { 2813 rw_exit(&svp->sv_lock); 2814 } 2815 2816 return (rc); 2817 } 2818