1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Storage Volume Character and Block Driver (SV) 28 * 29 * This driver implements a simplistic /dev/{r}dsk/ interface to a 30 * specified disk volume that is otherwise managed by the Prism 31 * software. The SV driver layers itself onto the underlying disk 32 * device driver by changing function pointers in the cb_ops 33 * structure. 34 * 35 * CONFIGURATION: 36 * 37 * 1. Configure the driver using the svadm utility. 38 * 2. Access the device as before through /dev/rdsk/c?t?d?s? 39 * 40 * LIMITATIONS: 41 * 42 * This driver should NOT be used to share a device between another 43 * DataServices user interface module (e.g., STE) and a user accessing 44 * the device through the block device in O_WRITE mode. This is because 45 * writes through the block device are asynchronous (due to the page 46 * cache) and so consistency between the block device user and the 47 * STE user cannot be guaranteed. 48 * 49 * Data is copied between system struct buf(9s) and nsc_vec_t. This is 50 * wasteful and slow. 51 */ 52 53 #include <sys/debug.h> 54 #include <sys/types.h> 55 56 #include <sys/ksynch.h> 57 #include <sys/kmem.h> 58 #include <sys/errno.h> 59 #include <sys/varargs.h> 60 #include <sys/file.h> 61 #include <sys/open.h> 62 #include <sys/conf.h> 63 #include <sys/cred.h> 64 #include <sys/buf.h> 65 #include <sys/uio.h> 66 #ifndef DS_DDICT 67 #include <sys/pathname.h> 68 #endif 69 #include <sys/aio_req.h> 70 #include <sys/dkio.h> 71 #include <sys/vtoc.h> 72 #include <sys/cmn_err.h> 73 #include <sys/modctl.h> 74 #include <sys/ddi.h> 75 #include <sys/sunddi.h> 76 #include <sys/sunldi.h> 77 #include <sys/nsctl/nsvers.h> 78 79 #include <sys/nsc_thread.h> 80 #include <sys/unistat/spcs_s.h> 81 #include <sys/unistat/spcs_s_k.h> 82 #include <sys/unistat/spcs_errors.h> 83 84 #ifdef DS_DDICT 85 #include "../contract.h" 86 #endif 87 88 #include "../nsctl.h" 89 90 91 #include <sys/sdt.h> /* dtrace is S10 or later */ 92 93 #include "sv.h" 94 #include "sv_impl.h" 95 #include "sv_efi.h" 96 97 #define MAX_EINTR_COUNT 1000 98 99 /* 100 * sv_mod_status 101 */ 102 #define SV_PREVENT_UNLOAD 1 103 #define SV_ALLOW_UNLOAD 2 104 105 static const int sv_major_rev = ISS_VERSION_MAJ; /* Major number */ 106 static const int sv_minor_rev = ISS_VERSION_MIN; /* Minor number */ 107 static const int sv_micro_rev = ISS_VERSION_MIC; /* Micro number */ 108 static const int sv_baseline_rev = ISS_VERSION_NUM; /* Baseline number */ 109 110 #ifdef DKIOCPARTITION 111 /* 112 * CRC32 polynomial table needed for computing the checksums 113 * in an EFI vtoc. 
114 */ 115 static const uint32_t sv_crc32_table[256] = { CRC32_TABLE }; 116 #endif 117 118 static clock_t sv_config_time; /* Time of successful {en,dis}able */ 119 static int sv_debug; /* Set non-zero for debug to syslog */ 120 static int sv_mod_status; /* Set to prevent modunload */ 121 122 static dev_info_t *sv_dip; /* Single DIP for driver */ 123 static kmutex_t sv_mutex; /* Protect global lists, etc. */ 124 125 static nsc_mem_t *sv_mem; /* nsctl memory allocator token */ 126 127 128 /* 129 * Per device and per major state. 130 */ 131 132 #ifndef _SunOS_5_6 133 #define UNSAFE_ENTER() 134 #define UNSAFE_EXIT() 135 #else 136 #define UNSAFE_ENTER() mutex_enter(&unsafe_driver) 137 #define UNSAFE_EXIT() mutex_exit(&unsafe_driver) 138 #endif 139 140 /* hash table of major dev structures */ 141 static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0}; 142 static sv_dev_t *sv_devs; /* array of per device structures */ 143 static int sv_max_devices; /* SV version of nsc_max_devices() */ 144 static int sv_ndevices; /* number of SV enabled devices */ 145 146 /* 147 * Threading. 148 */ 149 150 int sv_threads_max = 1024; /* maximum # to dynamically alloc */ 151 int sv_threads = 32; /* # to pre-allocate (see sv.conf) */ 152 int sv_threads_extra = 0; /* addl # we would have alloc'ed */ 153 154 static nstset_t *sv_tset; /* the threadset pointer */ 155 156 static int sv_threads_hysteresis = 4; /* hysteresis for threadset resizing */ 157 static int sv_threads_dev = 2; /* # of threads to alloc per device */ 158 static int sv_threads_inc = 8; /* increment for changing the set */ 159 static int sv_threads_needed; /* number of threads needed */ 160 static int sv_no_threads; /* number of nsc_create errors */ 161 static int sv_max_nlive; /* max number of threads running */ 162 163 164 165 /* 166 * nsctl fd callbacks. 167 */ 168 169 static int svattach_fd(blind_t); 170 static int svdetach_fd(blind_t); 171 172 static nsc_def_t sv_fd_def[] = { 173 { "Attach", (uintptr_t)svattach_fd, }, 174 { "Detach", (uintptr_t)svdetach_fd, }, 175 { 0, 0, } 176 }; 177 178 /* 179 * cb_ops functions. 180 */ 181 182 static int svopen(dev_t *, int, int, cred_t *); 183 static int svclose(dev_t, int, int, cred_t *); 184 static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *); 185 static int svprint(dev_t, char *); 186 187 /* 188 * These next functions are layered into the underlying driver's devops. 189 */ 190 191 static int sv_lyr_open(dev_t *, int, int, cred_t *); 192 static int sv_lyr_close(dev_t, int, int, cred_t *); 193 static int sv_lyr_strategy(struct buf *); 194 static int sv_lyr_read(dev_t, struct uio *, cred_t *); 195 static int sv_lyr_write(dev_t, struct uio *, cred_t *); 196 static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *); 197 static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *); 198 static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); 199 200 static struct cb_ops sv_cb_ops = { 201 svopen, /* open */ 202 svclose, /* close */ 203 nulldev, /* strategy */ 204 svprint, 205 nodev, /* dump */ 206 nodev, /* read */ 207 nodev, /* write */ 208 svioctl, 209 nodev, /* devmap */ 210 nodev, /* mmap */ 211 nodev, /* segmap */ 212 nochpoll, /* poll */ 213 ddi_prop_op, 214 NULL, /* NOT a stream */ 215 D_NEW | D_MP | D_64BIT, 216 CB_REV, 217 nodev, /* aread */ 218 nodev, /* awrite */ 219 }; 220 221 222 /* 223 * dev_ops functions. 
224 */ 225 226 static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); 227 static int sv_attach(dev_info_t *, ddi_attach_cmd_t); 228 static int sv_detach(dev_info_t *, ddi_detach_cmd_t); 229 230 static struct dev_ops sv_ops = { 231 DEVO_REV, 232 0, 233 sv_getinfo, 234 nulldev, /* identify */ 235 nulldev, /* probe */ 236 sv_attach, 237 sv_detach, 238 nodev, /* reset */ 239 &sv_cb_ops, 240 (struct bus_ops *)0 241 }; 242 243 /* 244 * Module linkage. 245 */ 246 247 extern struct mod_ops mod_driverops; 248 249 static struct modldrv modldrv = { 250 &mod_driverops, 251 "nws:Storage Volume:" ISS_VERSION_STR, 252 &sv_ops 253 }; 254 255 static struct modlinkage modlinkage = { 256 MODREV_1, 257 &modldrv, 258 0 259 }; 260 261 262 int 263 _init(void) 264 { 265 int error; 266 267 mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL); 268 269 if ((error = mod_install(&modlinkage)) != 0) { 270 mutex_destroy(&sv_mutex); 271 return (error); 272 } 273 274 #ifdef DEBUG 275 cmn_err(CE_CONT, "!sv %s %s (revision %d.%d.%d.%d, %s, %s)\n", 276 __DATE__, __TIME__, 277 sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev, 278 ISS_VERSION_STR, BUILD_DATE_STR); 279 #else 280 if (sv_micro_rev) { 281 cmn_err(CE_CONT, "!sv %s %s (revision %d.%d.%d, %s, %s)\n", 282 __DATE__, __TIME__, 283 sv_major_rev, sv_minor_rev, sv_micro_rev, 284 ISS_VERSION_STR, BUILD_DATE_STR); 285 } else { 286 cmn_err(CE_CONT, "!sv %s %s (revision %d.%d, %s, %s)\n", 287 __DATE__, __TIME__, 288 sv_major_rev, sv_minor_rev, 289 ISS_VERSION_STR, BUILD_DATE_STR); 290 } 291 #endif 292 293 return (error); 294 } 295 296 297 int 298 _fini(void) 299 { 300 int error; 301 302 if ((error = mod_remove(&modlinkage)) != 0) 303 return (error); 304 305 mutex_destroy(&sv_mutex); 306 307 return (error); 308 } 309 310 311 int 312 _info(struct modinfo *modinfop) 313 { 314 return (mod_info(&modlinkage, modinfop)); 315 } 316 317 318 /* 319 * Locking & State. 320 * 321 * sv_mutex protects config information - sv_maj_t and sv_dev_t lists; 322 * threadset creation and sizing; sv_ndevices. 323 * 324 * If we need to hold both sv_mutex and sv_lock, then the sv_mutex 325 * must be acquired first. 326 * 327 * sv_lock protects the sv_dev_t structure for an individual device. 328 * 329 * sv_olock protects the otyp/open members of the sv_dev_t. If we need 330 * to hold both sv_lock and sv_olock, then the sv_lock must be acquired 331 * first. 332 * 333 * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple 334 * I/O operations to a device simultaneously, as above. 335 * 336 * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur 337 * with sv_lock write-locked must be done with (sv_state == SV_PENDING) 338 * and (sv_pending == curthread) so that any recursion through 339 * sv_lyr_open/sv_lyr_close can be detected. 
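 *
 * For illustration only, the acquisition order implied by the rules
 * above, when all three locks are needed, is:
 *
 *	mutex_enter(&sv_mutex);			(global config)
 *	rw_enter(&svp->sv_lock, RW_WRITER);	(per-device state)
 *	mutex_enter(&svp->sv_olock);		(otyp/open members)
 *	...
 *	mutex_exit(&svp->sv_olock);
 *	rw_exit(&svp->sv_lock);
 *	mutex_exit(&sv_mutex);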
340 */ 341 342 343 static int 344 sv_init_devs(void) 345 { 346 int i; 347 348 ASSERT(MUTEX_HELD(&sv_mutex)); 349 350 if (sv_max_devices > 0) 351 return (0); 352 353 sv_max_devices = nsc_max_devices(); 354 355 if (sv_max_devices <= 0) { 356 /* nsctl is not attached (nskernd not running) */ 357 if (sv_debug > 0) 358 cmn_err(CE_CONT, "sv: nsc_max_devices = 0\n"); 359 return (EAGAIN); 360 } 361 362 sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)), 363 KM_NOSLEEP, sv_mem); 364 365 if (sv_devs == NULL) { 366 cmn_err(CE_WARN, "sv: could not allocate sv_devs array"); 367 return (ENOMEM); 368 } 369 370 for (i = 0; i < sv_max_devices; i++) { 371 mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL); 372 rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL); 373 } 374 375 if (sv_debug > 0) 376 cmn_err(CE_CONT, "sv: sv_init_devs successful\n"); 377 378 return (0); 379 } 380 381 382 static int 383 sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 384 { 385 int rc; 386 387 switch (cmd) { 388 389 case DDI_ATTACH: 390 sv_dip = dip; 391 392 if (ddi_create_minor_node(dip, "sv", S_IFCHR, 393 0, DDI_PSEUDO, 0) != DDI_SUCCESS) 394 goto failed; 395 396 mutex_enter(&sv_mutex); 397 398 sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0); 399 if (sv_mem == NULL) { 400 mutex_exit(&sv_mutex); 401 goto failed; 402 } 403 404 rc = sv_init_devs(); 405 if (rc != 0 && rc != EAGAIN) { 406 mutex_exit(&sv_mutex); 407 goto failed; 408 } 409 410 mutex_exit(&sv_mutex); 411 412 413 ddi_report_dev(dip); 414 415 sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 416 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, 417 "sv_threads", sv_threads); 418 419 if (sv_debug > 0) 420 cmn_err(CE_CONT, "sv: sv_threads=%d\n", sv_threads); 421 422 if (sv_threads > sv_threads_max) 423 sv_threads_max = sv_threads; 424 425 return (DDI_SUCCESS); 426 427 default: 428 return (DDI_FAILURE); 429 } 430 431 failed: 432 DTRACE_PROBE(sv_attach_failed); 433 (void) sv_detach(dip, DDI_DETACH); 434 return (DDI_FAILURE); 435 } 436 437 438 static int 439 sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 440 { 441 sv_dev_t *svp; 442 int i; 443 444 switch (cmd) { 445 446 case DDI_DETACH: 447 448 /* 449 * Check that everything is disabled. 450 */ 451 452 mutex_enter(&sv_mutex); 453 454 if (sv_mod_status == SV_PREVENT_UNLOAD) { 455 mutex_exit(&sv_mutex); 456 DTRACE_PROBE(sv_detach_err_prevent); 457 return (DDI_FAILURE); 458 } 459 460 for (i = 0; sv_devs && i < sv_max_devices; i++) { 461 svp = &sv_devs[i]; 462 463 if (svp->sv_state != SV_DISABLE) { 464 mutex_exit(&sv_mutex); 465 DTRACE_PROBE(sv_detach_err_busy); 466 return (DDI_FAILURE); 467 } 468 } 469 470 471 for (i = 0; sv_devs && i < sv_max_devices; i++) { 472 mutex_destroy(&sv_devs[i].sv_olock); 473 rw_destroy(&sv_devs[i].sv_lock); 474 } 475 476 if (sv_devs) { 477 nsc_kmem_free(sv_devs, 478 (sv_max_devices * sizeof (*sv_devs))); 479 sv_devs = NULL; 480 } 481 sv_max_devices = 0; 482 483 if (sv_mem) { 484 nsc_unregister_mem(sv_mem); 485 sv_mem = NULL; 486 } 487 488 mutex_exit(&sv_mutex); 489 490 /* 491 * Remove all minor nodes. 
		 */

		ddi_remove_minor_node(dip, NULL);
		sv_dip = NULL;

		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}
}

static sv_maj_t *
sv_getmajor(const dev_t dev)
{
	sv_maj_t **insert, *maj;
	major_t umaj = getmajor(dev);

	/*
	 * See if the hash table entry, or one of the hash chains,
	 * is already allocated for this major number.
	 */
	if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) {
		do {
			if (maj->sm_major == umaj)
				return (maj);
		} while ((maj = maj->sm_next) != 0);
	}

	/*
	 * If sv_mutex is already held at this point there is a design
	 * flaw, as the only callers that may enter here without holding
	 * the mutex are sv_enable() and sv_dev_to_sv().
	 * Return an error instead of panicking the system.
	 */
	if (MUTEX_HELD(&sv_mutex)) {
		cmn_err(CE_WARN, "sv: could not allocate sv_maj_t");
		return (NULL);
	}

	/*
	 * Determine where to allocate a new element in the hash table.
	 */
	mutex_enter(&sv_mutex);
	insert = &(sv_majors[SV_MAJOR_HASH(umaj)]);
	for (maj = *insert; maj; maj = maj->sm_next) {

		/* Did another thread beat us to it? */
		if (maj->sm_major == umaj) {
			mutex_exit(&sv_mutex);
			return (maj);
		}

		/* Remember the end of the chain as the insert point */
		if (maj->sm_next == NULL)
			insert = &maj->sm_next;
	}

	/*
	 * Located the new insert point.
	 */
	*insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem);
	if ((maj = *insert) != 0)
		maj->sm_major = umaj;
	else
		cmn_err(CE_WARN, "sv: could not allocate sv_maj_t");

	mutex_exit(&sv_mutex);

	return (maj);
}

/* ARGSUSED */

static int
sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int rc = DDI_FAILURE;

	switch (infocmd) {

	case DDI_INFO_DEVT2DEVINFO:
		*result = sv_dip;
		rc = DDI_SUCCESS;
		break;

	case DDI_INFO_DEVT2INSTANCE:
		/*
		 * We only have a single instance.
		 */
		*result = 0;
		rc = DDI_SUCCESS;
		break;

	default:
		break;
	}

	return (rc);
}


/*
 * Hashing of devices onto major device structures.
 *
 * Individual device structures are hashed onto one of the sm_hash[]
 * buckets in the relevant major device structure.
 *
 * Hash insertion and deletion -must- be done with sv_mutex held.  Hash
 * searching does not require the mutex because of the sm_seq member.
 * sm_seq is incremented on each insertion (-after- hash chain pointer
 * manipulation) and each deletion (-before- hash chain pointer
 * manipulation).  When searching the hash chain, the seq number is
 * checked before accessing each device structure; if the seq number has
 * changed, then we restart the search from the top of the hash chain.
 * If we restart more than SV_HASH_RETRY times, we take sv_mutex and search
 * the hash chain (we are guaranteed that this search cannot be
 * interrupted).
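 *
 * As an illustrative sketch of the update side (the real code is in
 * sv_get_state() and sv_rm_hash() below), an insertion is ordered:
 *
 *	*insert = svp;			(link into the chain first)
 *	svp->sv_hash = NULL;
 *	maj->sm_seq++;			(bump the sequence number last)
 *
 * and a deletion bumps sm_seq -before- unlinking, so that a lock-free
 * searcher which observes an unchanged sm_seq has seen a consistent
 * hash chain.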
607 */ 608 609 #define SV_HASH_RETRY 16 610 611 static sv_dev_t * 612 sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp) 613 { 614 minor_t umin = getminor(dev); 615 sv_dev_t **hb, *next, *svp; 616 sv_maj_t *maj; 617 int seq; 618 int try; 619 620 /* Get major hash table */ 621 maj = sv_getmajor(dev); 622 if (majpp) 623 *majpp = maj; 624 if (maj == NULL) 625 return (NULL); 626 627 if (maj->sm_inuse == 0) { 628 DTRACE_PROBE1( 629 sv_dev_to_sv_end, 630 dev_t, dev); 631 return (NULL); 632 } 633 634 hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]); 635 try = 0; 636 637 retry: 638 if (try > SV_HASH_RETRY) 639 mutex_enter(&sv_mutex); 640 641 seq = maj->sm_seq; 642 for (svp = *hb; svp; svp = next) { 643 next = svp->sv_hash; 644 645 nsc_membar_stld(); /* preserve register load order */ 646 647 if (maj->sm_seq != seq) { 648 DTRACE_PROBE1(sv_dev_to_sv_retry, 649 dev_t, dev); 650 try++; 651 goto retry; 652 } 653 654 if (svp->sv_dev == dev) 655 break; 656 } 657 658 if (try > SV_HASH_RETRY) 659 mutex_exit(&sv_mutex); 660 661 return (svp); 662 } 663 664 665 /* 666 * Must be called with sv_mutex held. 667 */ 668 669 static int 670 sv_get_state(const dev_t udev, sv_dev_t **svpp) 671 { 672 sv_dev_t **hb, **insert, *svp; 673 sv_maj_t *maj; 674 minor_t umin; 675 int i; 676 677 /* Get major hash table */ 678 if ((maj = sv_getmajor(udev)) == NULL) 679 return (NULL); 680 681 /* Determine which minor hash table */ 682 umin = getminor(udev); 683 hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]); 684 685 /* look for clash */ 686 687 insert = hb; 688 689 for (svp = *hb; svp; svp = svp->sv_hash) { 690 if (svp->sv_dev == udev) 691 break; 692 693 if (svp->sv_hash == NULL) 694 insert = &svp->sv_hash; 695 } 696 697 if (svp) { 698 DTRACE_PROBE1( 699 sv_get_state_enabled, 700 dev_t, udev); 701 return (SV_EENABLED); 702 } 703 704 /* look for spare sv_devs slot */ 705 706 for (i = 0; i < sv_max_devices; i++) { 707 svp = &sv_devs[i]; 708 709 if (svp->sv_state == SV_DISABLE) 710 break; 711 } 712 713 if (i >= sv_max_devices) { 714 DTRACE_PROBE1( 715 sv_get_state_noslots, 716 dev_t, udev); 717 return (SV_ENOSLOTS); 718 } 719 720 svp->sv_state = SV_PENDING; 721 svp->sv_pending = curthread; 722 723 *insert = svp; 724 svp->sv_hash = NULL; 725 maj->sm_seq++; /* must be after the store to the hash chain */ 726 727 *svpp = svp; 728 729 /* 730 * We do not know the size of the underlying device at 731 * this stage, so initialise "nblocks" property to 732 * zero, and update it whenever we succeed in 733 * nsc_reserve'ing the underlying nsc_fd_t. 734 */ 735 736 svp->sv_nblocks = 0; 737 738 return (0); 739 } 740 741 742 /* 743 * Remove a device structure from it's hash chain. 744 * Must be called with sv_mutex held. 745 */ 746 747 static void 748 sv_rm_hash(sv_dev_t *svp) 749 { 750 sv_dev_t **svpp; 751 sv_maj_t *maj; 752 753 /* Get major hash table */ 754 if ((maj = sv_getmajor(svp->sv_dev)) == NULL) 755 return; 756 757 /* remove svp from hash chain */ 758 759 svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]); 760 while (*svpp) { 761 if (*svpp == svp) { 762 /* 763 * increment of sm_seq must be before the 764 * removal from the hash chain 765 */ 766 maj->sm_seq++; 767 *svpp = svp->sv_hash; 768 break; 769 } 770 771 svpp = &(*svpp)->sv_hash; 772 } 773 774 svp->sv_hash = NULL; 775 } 776 777 /* 778 * Free (disable) a device structure. 779 * Must be called with sv_lock(RW_WRITER) and sv_mutex held, and will 780 * perform the exits during its processing. 
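 *
 * For illustration, a caller is therefore expected to look like the
 * (abridged) sv_disable() path below:
 *
 *	mutex_enter(&sv_mutex);
 *	rw_enter(&svp->sv_lock, RW_WRITER);
 *	...
 *	return (sv_free(svp, 0));	(sv_free() drops both locks)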
781 */ 782 783 static int 784 sv_free(sv_dev_t *svp, const int error) 785 { 786 struct cb_ops *cb_ops; 787 sv_maj_t *maj; 788 789 /* Get major hash table */ 790 if ((maj = sv_getmajor(svp->sv_dev)) == NULL) 791 return (NULL); 792 793 svp->sv_state = SV_PENDING; 794 svp->sv_pending = curthread; 795 796 /* 797 * Close the fd's before removing from the hash or swapping 798 * back the cb_ops pointers so that the cache flushes before new 799 * io can come in. 800 */ 801 802 if (svp->sv_fd) { 803 (void) nsc_close(svp->sv_fd); 804 svp->sv_fd = 0; 805 } 806 807 sv_rm_hash(svp); 808 809 if (error != SV_ESDOPEN && 810 error != SV_ELYROPEN && --maj->sm_inuse == 0) { 811 812 if (maj->sm_dev_ops) 813 cb_ops = maj->sm_dev_ops->devo_cb_ops; 814 else 815 cb_ops = NULL; 816 817 if (cb_ops && maj->sm_strategy != NULL) { 818 cb_ops->cb_strategy = maj->sm_strategy; 819 cb_ops->cb_close = maj->sm_close; 820 cb_ops->cb_ioctl = maj->sm_ioctl; 821 cb_ops->cb_write = maj->sm_write; 822 cb_ops->cb_open = maj->sm_open; 823 cb_ops->cb_read = maj->sm_read; 824 cb_ops->cb_flag = maj->sm_flag; 825 826 if (maj->sm_awrite) 827 cb_ops->cb_awrite = maj->sm_awrite; 828 829 if (maj->sm_aread) 830 cb_ops->cb_aread = maj->sm_aread; 831 832 /* 833 * corbin XXX 834 * Leave backing device ops in maj->sm_* 835 * to handle any requests that might come 836 * in during the disable. This could be 837 * a problem however if the backing device 838 * driver is changed while we process these 839 * requests. 840 * 841 * maj->sm_strategy = 0; 842 * maj->sm_awrite = 0; 843 * maj->sm_write = 0; 844 * maj->sm_ioctl = 0; 845 * maj->sm_close = 0; 846 * maj->sm_aread = 0; 847 * maj->sm_read = 0; 848 * maj->sm_open = 0; 849 * maj->sm_flag = 0; 850 * 851 */ 852 } 853 854 if (maj->sm_dev_ops) { 855 maj->sm_dev_ops = 0; 856 } 857 } 858 859 if (svp->sv_lh) { 860 cred_t *crp = ddi_get_cred(); 861 862 /* 863 * Close the protective layered driver open using the 864 * Sun Private layered driver i/f. 865 */ 866 867 (void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp); 868 svp->sv_lh = NULL; 869 } 870 871 svp->sv_timestamp = nsc_lbolt(); 872 svp->sv_state = SV_DISABLE; 873 svp->sv_pending = NULL; 874 rw_exit(&svp->sv_lock); 875 mutex_exit(&sv_mutex); 876 877 return (error); 878 } 879 880 /* 881 * Reserve the device, taking into account the possibility that 882 * the reserve might have to be retried. 
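 *
 * A successful reserve must always be paired with an nsc_release() of
 * the same fd, for example (illustrative only):
 *
 *	if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
 *		... perform nsctl i/o against svp->sv_fd ...
 *		nsc_release(svp->sv_fd);
 *	}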
883 */ 884 static int 885 sv_reserve(nsc_fd_t *fd, int flags) 886 { 887 int eintr_count; 888 int rc; 889 890 eintr_count = 0; 891 do { 892 rc = nsc_reserve(fd, flags); 893 if (rc == EINTR) { 894 ++eintr_count; 895 delay(2); 896 } 897 } while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT)); 898 899 return (rc); 900 } 901 902 static int 903 sv_enable(const caddr_t path, const int flag, 904 const dev_t udev, spcs_s_info_t kstatus) 905 { 906 struct dev_ops *dev_ops; 907 struct cb_ops *cb_ops; 908 sv_dev_t *svp; 909 sv_maj_t *maj; 910 nsc_size_t nblocks; 911 int rc; 912 cred_t *crp; 913 ldi_ident_t li; 914 915 if (udev == (dev_t)-1 || udev == 0) { 916 DTRACE_PROBE1( 917 sv_enable_err_baddev, 918 dev_t, udev); 919 return (SV_EBADDEV); 920 } 921 922 if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) { 923 DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev); 924 return (SV_EAMODE); 925 } 926 927 /* Get major hash table */ 928 if ((maj = sv_getmajor(udev)) == NULL) 929 return (SV_EBADDEV); 930 931 mutex_enter(&sv_mutex); 932 933 rc = sv_get_state(udev, &svp); 934 if (rc) { 935 mutex_exit(&sv_mutex); 936 DTRACE_PROBE1(sv_enable_err_state, 937 dev_t, udev); 938 return (rc); 939 } 940 941 rw_enter(&svp->sv_lock, RW_WRITER); 942 943 /* 944 * Get real fd used for io 945 */ 946 947 svp->sv_dev = udev; 948 svp->sv_flag = flag; 949 950 /* 951 * OR in NSC_DEVICE to ensure that nskern grabs the real strategy 952 * function pointer before sv swaps them out. 953 */ 954 955 svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE), 956 sv_fd_def, (blind_t)udev, &rc); 957 958 if (svp->sv_fd == NULL) { 959 if (kstatus) 960 spcs_s_add(kstatus, rc); 961 DTRACE_PROBE1(sv_enable_err_fd, 962 dev_t, udev); 963 return (sv_free(svp, SV_ESDOPEN)); 964 } 965 966 /* 967 * Perform a layered driver open using the Sun Private layered 968 * driver i/f to ensure that the cb_ops structure for the driver 969 * is not detached out from under us whilst sv is enabled. 970 * 971 */ 972 973 crp = ddi_get_cred(); 974 svp->sv_lh = NULL; 975 976 if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) { 977 rc = ldi_open_by_dev(&svp->sv_dev, 978 OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li); 979 } 980 981 if (rc != 0) { 982 if (kstatus) 983 spcs_s_add(kstatus, rc); 984 DTRACE_PROBE1(sv_enable_err_lyr_open, 985 dev_t, udev); 986 return (sv_free(svp, SV_ELYROPEN)); 987 } 988 989 /* 990 * Do layering if required - must happen after nsc_open(). 
991 */ 992 993 if (maj->sm_inuse++ == 0) { 994 maj->sm_dev_ops = nsc_get_devops(getmajor(udev)); 995 996 if (maj->sm_dev_ops == NULL || 997 maj->sm_dev_ops->devo_cb_ops == NULL) { 998 DTRACE_PROBE1( 999 sv_enable_err_load, 1000 dev_t, udev); 1001 return (sv_free(svp, SV_ELOAD)); 1002 } 1003 1004 dev_ops = maj->sm_dev_ops; 1005 cb_ops = dev_ops->devo_cb_ops; 1006 1007 if (cb_ops->cb_strategy == NULL || 1008 cb_ops->cb_strategy == nodev || 1009 cb_ops->cb_strategy == nulldev) { 1010 DTRACE_PROBE1(sv_enable_err_nostrategy, 1011 dev_t, udev); 1012 return (sv_free(svp, SV_ELOAD)); 1013 } 1014 1015 if (cb_ops->cb_strategy == sv_lyr_strategy) { 1016 DTRACE_PROBE1(sv_enable_err_svstrategy, 1017 dev_t, udev); 1018 return (sv_free(svp, SV_ESTRATEGY)); 1019 } 1020 1021 maj->sm_strategy = cb_ops->cb_strategy; 1022 maj->sm_close = cb_ops->cb_close; 1023 maj->sm_ioctl = cb_ops->cb_ioctl; 1024 maj->sm_write = cb_ops->cb_write; 1025 maj->sm_open = cb_ops->cb_open; 1026 maj->sm_read = cb_ops->cb_read; 1027 maj->sm_flag = cb_ops->cb_flag; 1028 1029 cb_ops->cb_flag = cb_ops->cb_flag | D_MP; 1030 cb_ops->cb_strategy = sv_lyr_strategy; 1031 cb_ops->cb_close = sv_lyr_close; 1032 cb_ops->cb_ioctl = sv_lyr_ioctl; 1033 cb_ops->cb_write = sv_lyr_write; 1034 cb_ops->cb_open = sv_lyr_open; 1035 cb_ops->cb_read = sv_lyr_read; 1036 1037 /* 1038 * Check that the driver has async I/O entry points 1039 * before changing them. 1040 */ 1041 1042 if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) { 1043 maj->sm_awrite = 0; 1044 maj->sm_aread = 0; 1045 } else { 1046 maj->sm_awrite = cb_ops->cb_awrite; 1047 maj->sm_aread = cb_ops->cb_aread; 1048 1049 cb_ops->cb_awrite = sv_lyr_awrite; 1050 cb_ops->cb_aread = sv_lyr_aread; 1051 } 1052 1053 /* 1054 * Bug 4645743 1055 * 1056 * Prevent sv from ever unloading after it has interposed 1057 * on a major device because there is a race between 1058 * sv removing its layered entry points from the target 1059 * dev_ops, a client coming in and accessing the driver, 1060 * and the kernel modunloading the sv text. 1061 * 1062 * To allow unload, do svboot -u, which only happens in 1063 * pkgrm time. 
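		 *
		 * For reference, the unload sequence sketched by the
		 * code elsewhere in this file is:
		 *
		 *	svboot -u
		 *	  -> SVIOC_UNLOAD ioctl
		 *	    -> sv_prepare_unload() sets SV_ALLOW_UNLOAD
		 *	       (only once sv_ndevices == 0 and
		 *	       sv_tset == NULL)
		 *	  -> module unload of sv can then succeed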
1064 */ 1065 ASSERT(MUTEX_HELD(&sv_mutex)); 1066 sv_mod_status = SV_PREVENT_UNLOAD; 1067 } 1068 1069 1070 svp->sv_timestamp = nsc_lbolt(); 1071 svp->sv_state = SV_ENABLE; 1072 svp->sv_pending = NULL; 1073 rw_exit(&svp->sv_lock); 1074 1075 sv_ndevices++; 1076 mutex_exit(&sv_mutex); 1077 1078 nblocks = 0; 1079 if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) { 1080 nblocks = svp->sv_nblocks; 1081 nsc_release(svp->sv_fd); 1082 } 1083 1084 cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n", 1085 svp->sv_dev, nblocks); 1086 1087 return (0); 1088 } 1089 1090 1091 static int 1092 sv_prepare_unload() 1093 { 1094 int rc = 0; 1095 1096 mutex_enter(&sv_mutex); 1097 1098 if (sv_mod_status == SV_PREVENT_UNLOAD) { 1099 if ((sv_ndevices != 0) || (sv_tset != NULL)) { 1100 rc = EBUSY; 1101 } else { 1102 sv_mod_status = SV_ALLOW_UNLOAD; 1103 delay(SV_WAIT_UNLOAD * drv_usectohz(1000000)); 1104 } 1105 } 1106 1107 mutex_exit(&sv_mutex); 1108 return (rc); 1109 } 1110 1111 static int 1112 svattach_fd(blind_t arg) 1113 { 1114 dev_t dev = (dev_t)arg; 1115 sv_dev_t *svp = sv_dev_to_sv(dev, NULL); 1116 int rc; 1117 1118 if (sv_debug > 0) 1119 cmn_err(CE_CONT, "svattach_fd(%p, %p)\n", arg, (void *)svp); 1120 1121 if (svp == NULL) { 1122 cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg); 1123 return (0); 1124 } 1125 1126 if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) { 1127 cmn_err(CE_WARN, 1128 "!svattach_fd: nsc_partsize() failed, rc %d", rc); 1129 svp->sv_nblocks = 0; 1130 } 1131 1132 if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) { 1133 cmn_err(CE_WARN, 1134 "!svattach_fd: nsc_maxfbas() failed, rc %d", rc); 1135 svp->sv_maxfbas = 0; 1136 } 1137 1138 if (sv_debug > 0) { 1139 cmn_err(CE_CONT, 1140 "svattach_fd(%p): size %" NSC_SZFMT ", " 1141 "maxfbas %" NSC_SZFMT "\n", 1142 arg, svp->sv_nblocks, svp->sv_maxfbas); 1143 } 1144 1145 return (0); 1146 } 1147 1148 1149 static int 1150 svdetach_fd(blind_t arg) 1151 { 1152 dev_t dev = (dev_t)arg; 1153 sv_dev_t *svp = sv_dev_to_sv(dev, NULL); 1154 1155 if (sv_debug > 0) 1156 cmn_err(CE_CONT, "svdetach_fd(%p, %p)\n", arg, (void *)svp); 1157 1158 /* svp can be NULL during disable of an sv */ 1159 if (svp == NULL) 1160 return (0); 1161 1162 svp->sv_maxfbas = 0; 1163 svp->sv_nblocks = 0; 1164 return (0); 1165 } 1166 1167 1168 /* 1169 * Side effect: if called with (guard != 0), then expects both sv_mutex 1170 * and sv_lock(RW_WRITER) to be held, and will release them before returning. 
1171 */ 1172 1173 /* ARGSUSED */ 1174 static int 1175 sv_disable(dev_t dev, spcs_s_info_t kstatus) 1176 { 1177 sv_dev_t *svp = sv_dev_to_sv(dev, NULL); 1178 1179 if (svp == NULL) { 1180 1181 DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp); 1182 return (SV_ENODEV); 1183 } 1184 1185 mutex_enter(&sv_mutex); 1186 rw_enter(&svp->sv_lock, RW_WRITER); 1187 1188 if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) { 1189 rw_exit(&svp->sv_lock); 1190 mutex_exit(&sv_mutex); 1191 1192 DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp); 1193 return (SV_EDISABLED); 1194 } 1195 1196 1197 sv_ndevices--; 1198 return (sv_free(svp, 0)); 1199 } 1200 1201 1202 1203 static int 1204 sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp) 1205 { 1206 nsc_buf_t *tmph; 1207 sv_dev_t *svp; 1208 sv_maj_t *maj; 1209 int (*fn)(); 1210 dev_t odev; 1211 int ret; 1212 int rc; 1213 1214 svp = sv_dev_to_sv(*devp, &maj); 1215 1216 if (svp) { 1217 if (svp->sv_state == SV_PENDING && 1218 svp->sv_pending == curthread) { 1219 /* 1220 * This is a recursive open from a call to 1221 * ddi_lyr_open_by_devt and so we just want 1222 * to pass it straight through to the 1223 * underlying driver. 1224 */ 1225 DTRACE_PROBE2(sv_lyr_open_recursive, 1226 sv_dev_t *, svp, 1227 dev_t, *devp); 1228 svp = NULL; 1229 } else 1230 rw_enter(&svp->sv_lock, RW_READER); 1231 } 1232 1233 odev = *devp; 1234 1235 if (maj && (fn = maj->sm_open) != 0) { 1236 if (!(maj->sm_flag & D_MP)) { 1237 UNSAFE_ENTER(); 1238 ret = (*fn)(devp, flag, otyp, crp); 1239 UNSAFE_EXIT(); 1240 } else { 1241 ret = (*fn)(devp, flag, otyp, crp); 1242 } 1243 1244 if (ret == 0) { 1245 /* 1246 * Re-acquire svp if the driver changed *devp. 1247 */ 1248 1249 if (*devp != odev) { 1250 rw_exit(&svp->sv_lock); 1251 1252 svp = sv_dev_to_sv(*devp, NULL); 1253 1254 if (svp) { 1255 rw_enter(&svp->sv_lock, RW_READER); 1256 } 1257 } 1258 } 1259 } else { 1260 ret = ENODEV; 1261 } 1262 1263 if (svp && ret != 0 && svp->sv_state == SV_ENABLE) { 1264 /* 1265 * Underlying DDI open failed, but we have this 1266 * device SV enabled. If we can read some data 1267 * from the device, fake a successful open (this 1268 * probably means that this device is RDC'd and we 1269 * are getting the data from the secondary node). 1270 * 1271 * The reserve must be done with NSC_TRY|NSC_NOWAIT to 1272 * ensure that it does not deadlock if this open is 1273 * coming from nskernd:get_bsize(). 1274 */ 1275 rc = sv_reserve(svp->sv_fd, 1276 NSC_TRY|NSC_NOWAIT|NSC_MULTI|NSC_PCATCH); 1277 if (rc == 0) { 1278 tmph = NULL; 1279 1280 rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph); 1281 if (rc <= 0) { 1282 /* success */ 1283 ret = 0; 1284 } 1285 1286 if (tmph) { 1287 (void) nsc_free_buf(tmph); 1288 tmph = NULL; 1289 } 1290 1291 nsc_release(svp->sv_fd); 1292 1293 /* 1294 * Count the number of layered opens that we 1295 * fake since we have to fake a matching number 1296 * of closes (OTYP_LYR open/close calls must be 1297 * paired). 
			 */

			if (ret == 0 && otyp == OTYP_LYR) {
				mutex_enter(&svp->sv_olock);
				svp->sv_openlcnt++;
				mutex_exit(&svp->sv_olock);
			}
		}
	}

	if (svp) {
		rw_exit(&svp->sv_lock);
	}

	return (ret);
}


static int
sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int ret;

	svp = sv_dev_to_sv(dev, &maj);

	if (svp &&
	    svp->sv_state == SV_PENDING &&
	    svp->sv_pending == curthread) {
		/*
		 * This is a recursive close resulting from a call to
		 * ddi_lyr_close and so we just want to pass it straight
		 * through to the underlying driver.
		 */
		DTRACE_PROBE2(sv_lyr_close_recursive,
		    sv_dev_t *, svp,
		    dev_t, dev);
		svp = NULL;
	}

	if (svp) {
		rw_enter(&svp->sv_lock, RW_READER);

		if (otyp == OTYP_LYR) {
			mutex_enter(&svp->sv_olock);

			if (svp->sv_openlcnt) {
				/*
				 * Consume sufficient layered closes to
				 * account for the opens that we faked
				 * whilst the device was failed.
				 */
				svp->sv_openlcnt--;
				mutex_exit(&svp->sv_olock);
				rw_exit(&svp->sv_lock);

				DTRACE_PROBE1(sv_lyr_close_end,
				    dev_t, dev);

				return (0);
			}

			mutex_exit(&svp->sv_olock);
		}
	}

	if (maj && (fn = maj->sm_close) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			UNSAFE_ENTER();
			ret = (*fn)(dev, flag, otyp, crp);
			UNSAFE_EXIT();
		} else {
			ret = (*fn)(dev, flag, otyp, crp);
		}
	} else {
		ret = ENODEV;
	}

	if (svp) {
		rw_exit(&svp->sv_lock);
	}

	return (ret);
}


/*
 * Convert the specified dev_t into a locked and enabled sv_dev_t, or
 * return NULL.
 */
static sv_dev_t *
sv_find_enabled(const dev_t dev, sv_maj_t **majpp)
{
	sv_dev_t *svp;

	while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) {
		rw_enter(&svp->sv_lock, RW_READER);

		if (svp->sv_state == SV_ENABLE) {
			/* locked and enabled */
			break;
		}

		/*
		 * State was changed while waiting on the lock.
		 * Wait for a stable state.
		 */
		rw_exit(&svp->sv_lock);

		DTRACE_PROBE1(sv_find_enabled_retry,
		    dev_t, dev);

		delay(2);
	}

	return (svp);
}


static int
sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int rc;

	svp = sv_find_enabled(dev, &maj);
	if (svp == NULL) {
		if (maj) {
			if (rw == NSC_READ)
				fn = maj->sm_read;
			else
				fn = maj->sm_write;

			if (fn != 0) {
				if (!(maj->sm_flag & D_MP)) {
					UNSAFE_ENTER();
					rc = (*fn)(dev, uiop, crp);
					UNSAFE_EXIT();
				} else {
					rc = (*fn)(dev, uiop, crp);
				}
			} else {
				/* no underlying read/write entry point */
				rc = ENODEV;
			}

			return (rc);
		} else {
			return (ENODEV);
		}
	}

	ASSERT(RW_READ_HELD(&svp->sv_lock));

	if (svp->sv_flag == 0) {
		/*
		 * guard access mode
		 * - prevent user level access to the device
		 */
		DTRACE_PROBE1(sv_lyr_uio_err_guard,
		    uio_t *, uiop);
		rc = EPERM;
		goto out;
	}

	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
		DTRACE_PROBE1(sv_lyr_uio_err_rsrv,
		    uio_t *, uiop);
		goto out;
	}

	if (rw == NSC_READ)
		rc = nsc_uread(svp->sv_fd, uiop, crp);
	else
		rc = nsc_uwrite(svp->sv_fd, uiop, crp);

	nsc_release(svp->sv_fd);

out:
	rw_exit(&svp->sv_lock);

	return (rc);
}


static int
sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp)
{
	return (sv_lyr_uio(dev, uiop, crp, NSC_READ));
}


static int
sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp)
{
	return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE));
}


/* ARGSUSED */

static int
sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp)
{
	return (aphysio(sv_lyr_strategy,
	    anocancel, dev, B_READ, minphys, aio));
}


/* ARGSUSED */

static int
sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp)
{
	return (aphysio(sv_lyr_strategy,
	    anocancel, dev, B_WRITE, minphys, aio));
}


/*
 * Set up an array containing the list of raw path names.
 * The array for the paths is svn and the size of the array is
 * given in size.
 *
 * If there are more layered devices than will fit in the array,
 * the number of extra layered devices is returned.  Otherwise
 * zero is returned.
1527 * 1528 * Input: 1529 * svn : array for paths 1530 * size : size of the array 1531 * 1532 * Output (extra): 1533 * zero : All paths fit in array 1534 * >0 : Number of defined layered devices don't fit in array 1535 */ 1536 1537 static int 1538 sv_list(void *ptr, const int size, int *extra, const int ilp32) 1539 { 1540 sv_name32_t *svn32; 1541 sv_name_t *svn; 1542 sv_dev_t *svp; 1543 int *mode, *nblocks; 1544 int i, index; 1545 char *path; 1546 1547 *extra = 0; 1548 index = 0; 1549 1550 if (ilp32) 1551 svn32 = ptr; 1552 else 1553 svn = ptr; 1554 1555 mutex_enter(&sv_mutex); 1556 for (i = 0; i < sv_max_devices; i++) { 1557 svp = &sv_devs[i]; 1558 1559 rw_enter(&svp->sv_lock, RW_READER); 1560 1561 if (svp->sv_state != SV_ENABLE) { 1562 rw_exit(&svp->sv_lock); 1563 continue; 1564 } 1565 1566 if ((*extra) != 0 || ptr == NULL) { 1567 /* Another overflow entry */ 1568 rw_exit(&svp->sv_lock); 1569 (*extra)++; 1570 continue; 1571 } 1572 1573 if (ilp32) { 1574 nblocks = &svn32->svn_nblocks; 1575 mode = &svn32->svn_mode; 1576 path = svn32->svn_path; 1577 1578 svn32->svn_timestamp = (uint32_t)svp->sv_timestamp; 1579 svn32++; 1580 } else { 1581 nblocks = &svn->svn_nblocks; 1582 mode = &svn->svn_mode; 1583 path = svn->svn_path; 1584 1585 svn->svn_timestamp = svp->sv_timestamp; 1586 svn++; 1587 } 1588 1589 (void) strcpy(path, nsc_pathname(svp->sv_fd)); 1590 *nblocks = svp->sv_nblocks; 1591 *mode = svp->sv_flag; 1592 1593 if (*nblocks == 0) { 1594 if (sv_debug > 3) 1595 cmn_err(CE_CONT, "sv_list: need to reserve\n"); 1596 1597 if (sv_reserve(svp->sv_fd, 1598 NSC_MULTI|NSC_PCATCH) == 0) { 1599 *nblocks = svp->sv_nblocks; 1600 nsc_release(svp->sv_fd); 1601 } 1602 } 1603 1604 if (++index >= size) { 1605 /* Out of space */ 1606 (*extra)++; 1607 } 1608 1609 rw_exit(&svp->sv_lock); 1610 } 1611 mutex_exit(&sv_mutex); 1612 1613 if (index < size) { 1614 /* NULL terminated list */ 1615 if (ilp32) 1616 svn32->svn_path[0] = '\0'; 1617 else 1618 svn->svn_path[0] = '\0'; 1619 } 1620 1621 return (0); 1622 } 1623 1624 1625 static void 1626 sv_thread_tune(int threads) 1627 { 1628 int incr = (threads > 0) ? 
1 : -1; 1629 int change = 0; 1630 int nthreads; 1631 1632 ASSERT(MUTEX_HELD(&sv_mutex)); 1633 1634 if (sv_threads_extra) { 1635 /* keep track of any additional threads requested */ 1636 if (threads > 0) { 1637 sv_threads_extra += threads; 1638 return; 1639 } 1640 threads = -threads; 1641 if (threads >= sv_threads_extra) { 1642 threads -= sv_threads_extra; 1643 sv_threads_extra = 0; 1644 /* fall through to while loop */ 1645 } else { 1646 sv_threads_extra -= threads; 1647 return; 1648 } 1649 } else if (threads > 0) { 1650 /* 1651 * do not increase the number of threads beyond 1652 * sv_threads_max when doing dynamic thread tuning 1653 */ 1654 nthreads = nst_nthread(sv_tset); 1655 if ((nthreads + threads) > sv_threads_max) { 1656 sv_threads_extra = nthreads + threads - sv_threads_max; 1657 threads = sv_threads_max - nthreads; 1658 if (threads <= 0) 1659 return; 1660 } 1661 } 1662 1663 if (threads < 0) 1664 threads = -threads; 1665 1666 while (threads--) { 1667 nthreads = nst_nthread(sv_tset); 1668 sv_threads_needed += incr; 1669 1670 if (sv_threads_needed >= nthreads) 1671 change += nst_add_thread(sv_tset, sv_threads_inc); 1672 else if ((sv_threads_needed < 1673 (nthreads - (sv_threads_inc + sv_threads_hysteresis))) && 1674 ((nthreads - sv_threads_inc) >= sv_threads)) 1675 change -= nst_del_thread(sv_tset, sv_threads_inc); 1676 } 1677 1678 #ifdef DEBUG 1679 if (change) { 1680 cmn_err(CE_NOTE, 1681 "sv_thread_tune: threads needed %d, nthreads %d, " 1682 "nthreads change %d", 1683 sv_threads_needed, nst_nthread(sv_tset), change); 1684 } 1685 #endif 1686 } 1687 1688 1689 /* ARGSUSED */ 1690 static int 1691 svopen(dev_t *devp, int flag, int otyp, cred_t *crp) 1692 { 1693 int rc; 1694 1695 mutex_enter(&sv_mutex); 1696 rc = sv_init_devs(); 1697 mutex_exit(&sv_mutex); 1698 1699 return (rc); 1700 } 1701 1702 1703 /* ARGSUSED */ 1704 static int 1705 svclose(dev_t dev, int flag, int otyp, cred_t *crp) 1706 { 1707 const int secs = HZ * 5; 1708 const int ticks = HZ / 10; 1709 int loops = secs / ticks; 1710 1711 mutex_enter(&sv_mutex); 1712 while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) { 1713 if (nst_nlive(sv_tset) <= 0) { 1714 nst_destroy(sv_tset); 1715 sv_tset = NULL; 1716 break; 1717 } 1718 1719 /* threads still active - wait for them to exit */ 1720 mutex_exit(&sv_mutex); 1721 delay(ticks); 1722 loops--; 1723 mutex_enter(&sv_mutex); 1724 } 1725 mutex_exit(&sv_mutex); 1726 1727 if (loops <= 0) { 1728 cmn_err(CE_WARN, 1729 #ifndef DEBUG 1730 /* do not write to console when non-DEBUG */ 1731 "!" 
1732 #endif 1733 "sv:svclose: threads still active " 1734 "after %d sec - leaking thread set", secs); 1735 } 1736 1737 return (0); 1738 } 1739 1740 1741 static int 1742 svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp) 1743 { 1744 char itmp1[12], itmp2[12]; /* temp char array for editing ints */ 1745 spcs_s_info_t kstatus; /* Kernel version of spcs status */ 1746 spcs_s_info_t ustatus; /* Address of user version of spcs status */ 1747 sv_list32_t svl32; /* 32 bit Initial structure for SVIOC_LIST */ 1748 sv_version_t svv; /* Version structure */ 1749 sv_conf_t svc; /* User config structure */ 1750 sv_list_t svl; /* Initial structure for SVIOC_LIST */ 1751 void *usvn; /* Address of user sv_name_t */ 1752 void *svn = NULL; /* Array for SVIOC_LIST */ 1753 uint64_t phash; /* pathname hash */ 1754 int rc = 0; /* Return code -- errno */ 1755 int size; /* Number of items in array */ 1756 int bytes; /* Byte size of array */ 1757 int ilp32; /* Convert data structures for ilp32 userland */ 1758 1759 *rvalp = 0; 1760 1761 /* 1762 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, then it will continue. 1763 * else it means it previously was SV_PREVENT_UNLOAD, and now it's 1764 * SV_ALLOW_UNLOAD, expecting the driver to eventually unload. 1765 * 1766 * SV_ALLOW_UNLOAD is final state, so no need to grab sv_mutex. 1767 */ 1768 if (sv_mod_status == SV_ALLOW_UNLOAD) { 1769 return (EBUSY); 1770 } 1771 1772 if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0)) 1773 return (rc); 1774 1775 kstatus = spcs_s_kcreate(); 1776 if (!kstatus) { 1777 DTRACE_PROBE1(sv_ioctl_err_kcreate, 1778 dev_t, dev); 1779 return (ENOMEM); 1780 } 1781 1782 ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32); 1783 1784 switch (cmd) { 1785 1786 case SVIOC_ENABLE: 1787 1788 if (ilp32) { 1789 sv_conf32_t svc32; 1790 1791 if (ddi_copyin((void *)arg, &svc32, 1792 sizeof (svc32), mode) < 0) { 1793 spcs_s_kfree(kstatus); 1794 return (EFAULT); 1795 } 1796 1797 svc.svc_error = (spcs_s_info_t)svc32.svc_error; 1798 (void) strcpy(svc.svc_path, svc32.svc_path); 1799 svc.svc_flag = svc32.svc_flag; 1800 svc.svc_major = svc32.svc_major; 1801 svc.svc_minor = svc32.svc_minor; 1802 } else { 1803 if (ddi_copyin((void *)arg, &svc, 1804 sizeof (svc), mode) < 0) { 1805 spcs_s_kfree(kstatus); 1806 return (EFAULT); 1807 } 1808 } 1809 1810 /* force to raw access */ 1811 svc.svc_flag = NSC_DEVICE; 1812 1813 if (sv_tset == NULL) { 1814 mutex_enter(&sv_mutex); 1815 1816 if (sv_tset == NULL) { 1817 sv_tset = nst_init("sv_thr", sv_threads); 1818 } 1819 1820 mutex_exit(&sv_mutex); 1821 1822 if (sv_tset == NULL) { 1823 cmn_err(CE_WARN, 1824 "sv: could not allocate %d threads", 1825 sv_threads); 1826 } 1827 } 1828 1829 rc = sv_enable(svc.svc_path, svc.svc_flag, 1830 makedevice(svc.svc_major, svc.svc_minor), 1831 kstatus); 1832 1833 if (rc == 0) { 1834 sv_config_time = nsc_lbolt(); 1835 1836 mutex_enter(&sv_mutex); 1837 sv_thread_tune(sv_threads_dev); 1838 mutex_exit(&sv_mutex); 1839 } 1840 1841 DTRACE_PROBE3(sv_ioctl_end, 1842 dev_t, dev, 1843 int, *rvalp, 1844 int, rc); 1845 1846 return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc)); 1847 /* NOTREACHED */ 1848 1849 case SVIOC_DISABLE: 1850 1851 if (ilp32) { 1852 sv_conf32_t svc32; 1853 1854 if (ddi_copyin((void *)arg, &svc32, 1855 sizeof (svc32), mode) < 0) { 1856 spcs_s_kfree(kstatus); 1857 return (EFAULT); 1858 } 1859 1860 svc.svc_error = (spcs_s_info_t)svc32.svc_error; 1861 svc.svc_major = svc32.svc_major; 1862 svc.svc_minor = svc32.svc_minor; 1863 (void) 
strcpy(svc.svc_path, svc32.svc_path); 1864 svc.svc_flag = svc32.svc_flag; 1865 } else { 1866 if (ddi_copyin((void *)arg, &svc, 1867 sizeof (svc), mode) < 0) { 1868 spcs_s_kfree(kstatus); 1869 return (EFAULT); 1870 } 1871 } 1872 1873 if (svc.svc_major == (major_t)-1 && 1874 svc.svc_minor == (minor_t)-1) { 1875 sv_dev_t *svp; 1876 int i; 1877 1878 /* 1879 * User level could not find the minor device 1880 * node, so do this the slow way by searching 1881 * the entire sv config for a matching pathname. 1882 */ 1883 1884 phash = nsc_strhash(svc.svc_path); 1885 1886 mutex_enter(&sv_mutex); 1887 1888 for (i = 0; i < sv_max_devices; i++) { 1889 svp = &sv_devs[i]; 1890 1891 if (svp->sv_state == SV_DISABLE || 1892 svp->sv_fd == NULL) 1893 continue; 1894 1895 if (nsc_fdpathcmp(svp->sv_fd, phash, 1896 svc.svc_path) == 0) { 1897 svc.svc_major = getmajor(svp->sv_dev); 1898 svc.svc_minor = getminor(svp->sv_dev); 1899 break; 1900 } 1901 } 1902 1903 mutex_exit(&sv_mutex); 1904 1905 if (svc.svc_major == (major_t)-1 && 1906 svc.svc_minor == (minor_t)-1) 1907 return (spcs_s_ocopyoutf(&kstatus, 1908 svc.svc_error, SV_ENODEV)); 1909 } 1910 1911 rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor), 1912 kstatus); 1913 1914 if (rc == 0) { 1915 sv_config_time = nsc_lbolt(); 1916 1917 mutex_enter(&sv_mutex); 1918 sv_thread_tune(-sv_threads_dev); 1919 mutex_exit(&sv_mutex); 1920 } 1921 1922 DTRACE_PROBE3(sv_ioctl_2, 1923 dev_t, dev, 1924 int, *rvalp, 1925 int, rc); 1926 1927 return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc)); 1928 /* NOTREACHED */ 1929 1930 case SVIOC_LIST: 1931 1932 if (ilp32) { 1933 if (ddi_copyin((void *)arg, &svl32, 1934 sizeof (svl32), mode) < 0) { 1935 spcs_s_kfree(kstatus); 1936 return (EFAULT); 1937 } 1938 1939 ustatus = (spcs_s_info_t)svl32.svl_error; 1940 size = svl32.svl_count; 1941 usvn = (void *)(unsigned long)svl32.svl_names; 1942 } else { 1943 if (ddi_copyin((void *)arg, &svl, 1944 sizeof (svl), mode) < 0) { 1945 spcs_s_kfree(kstatus); 1946 return (EFAULT); 1947 } 1948 1949 ustatus = svl.svl_error; 1950 size = svl.svl_count; 1951 usvn = svl.svl_names; 1952 } 1953 1954 /* Do some boundary checking */ 1955 if ((size < 0) || (size > sv_max_devices)) { 1956 /* Array size is out of range */ 1957 return (spcs_s_ocopyoutf(&kstatus, ustatus, 1958 SV_EARRBOUNDS, "0", 1959 spcs_s_inttostring(sv_max_devices, itmp1, 1960 sizeof (itmp1), 0), 1961 spcs_s_inttostring(size, itmp2, 1962 sizeof (itmp2), 0))); 1963 } 1964 1965 if (ilp32) 1966 bytes = size * sizeof (sv_name32_t); 1967 else 1968 bytes = size * sizeof (sv_name_t); 1969 1970 /* Allocate memory for the array of structures */ 1971 if (bytes != 0) { 1972 svn = kmem_zalloc(bytes, KM_SLEEP); 1973 if (!svn) { 1974 return (spcs_s_ocopyoutf(&kstatus, 1975 ustatus, ENOMEM)); 1976 } 1977 } 1978 1979 rc = sv_list(svn, size, rvalp, ilp32); 1980 if (rc) { 1981 if (svn != NULL) 1982 kmem_free(svn, bytes); 1983 return (spcs_s_ocopyoutf(&kstatus, ustatus, rc)); 1984 } 1985 1986 if (ilp32) { 1987 svl32.svl_timestamp = (uint32_t)sv_config_time; 1988 svl32.svl_maxdevs = (int32_t)sv_max_devices; 1989 1990 /* Return the list structure */ 1991 if (ddi_copyout(&svl32, (void *)arg, 1992 sizeof (svl32), mode) < 0) { 1993 spcs_s_kfree(kstatus); 1994 if (svn != NULL) 1995 kmem_free(svn, bytes); 1996 return (EFAULT); 1997 } 1998 } else { 1999 svl.svl_timestamp = sv_config_time; 2000 svl.svl_maxdevs = sv_max_devices; 2001 2002 /* Return the list structure */ 2003 if (ddi_copyout(&svl, (void *)arg, 2004 sizeof (svl), mode) < 0) { 2005 spcs_s_kfree(kstatus); 2006 
if (svn != NULL) 2007 kmem_free(svn, bytes); 2008 return (EFAULT); 2009 } 2010 } 2011 2012 /* Return the array */ 2013 if (svn != NULL) { 2014 if (ddi_copyout(svn, usvn, bytes, mode) < 0) { 2015 kmem_free(svn, bytes); 2016 spcs_s_kfree(kstatus); 2017 return (EFAULT); 2018 } 2019 kmem_free(svn, bytes); 2020 } 2021 2022 DTRACE_PROBE3(sv_ioctl_3, 2023 dev_t, dev, 2024 int, *rvalp, 2025 int, 0); 2026 2027 return (spcs_s_ocopyoutf(&kstatus, ustatus, 0)); 2028 /* NOTREACHED */ 2029 2030 case SVIOC_VERSION: 2031 2032 if (ilp32) { 2033 sv_version32_t svv32; 2034 2035 if (ddi_copyin((void *)arg, &svv32, 2036 sizeof (svv32), mode) < 0) { 2037 spcs_s_kfree(kstatus); 2038 return (EFAULT); 2039 } 2040 2041 svv32.svv_major_rev = sv_major_rev; 2042 svv32.svv_minor_rev = sv_minor_rev; 2043 svv32.svv_micro_rev = sv_micro_rev; 2044 svv32.svv_baseline_rev = sv_baseline_rev; 2045 2046 if (ddi_copyout(&svv32, (void *)arg, 2047 sizeof (svv32), mode) < 0) { 2048 spcs_s_kfree(kstatus); 2049 return (EFAULT); 2050 } 2051 2052 ustatus = (spcs_s_info_t)svv32.svv_error; 2053 } else { 2054 if (ddi_copyin((void *)arg, &svv, 2055 sizeof (svv), mode) < 0) { 2056 spcs_s_kfree(kstatus); 2057 return (EFAULT); 2058 } 2059 2060 svv.svv_major_rev = sv_major_rev; 2061 svv.svv_minor_rev = sv_minor_rev; 2062 svv.svv_micro_rev = sv_micro_rev; 2063 svv.svv_baseline_rev = sv_baseline_rev; 2064 2065 if (ddi_copyout(&svv, (void *)arg, 2066 sizeof (svv), mode) < 0) { 2067 spcs_s_kfree(kstatus); 2068 return (EFAULT); 2069 } 2070 2071 ustatus = svv.svv_error; 2072 } 2073 2074 DTRACE_PROBE3(sv_ioctl_4, 2075 dev_t, dev, 2076 int, *rvalp, 2077 int, 0); 2078 2079 return (spcs_s_ocopyoutf(&kstatus, ustatus, 0)); 2080 /* NOTREACHED */ 2081 2082 case SVIOC_UNLOAD: 2083 rc = sv_prepare_unload(); 2084 2085 if (ddi_copyout(&rc, (void *)arg, 2086 sizeof (rc), mode) < 0) { 2087 rc = EFAULT; 2088 } 2089 2090 spcs_s_kfree(kstatus); 2091 return (rc); 2092 2093 default: 2094 spcs_s_kfree(kstatus); 2095 2096 DTRACE_PROBE3(sv_ioctl_4, 2097 dev_t, dev, 2098 int, *rvalp, 2099 int, EINVAL); 2100 2101 return (EINVAL); 2102 /* NOTREACHED */ 2103 } 2104 2105 /* NOTREACHED */ 2106 } 2107 2108 2109 /* ARGSUSED */ 2110 static int 2111 svprint(dev_t dev, char *str) 2112 { 2113 int instance = ddi_get_instance(sv_dip); 2114 cmn_err(CE_WARN, "%s%d: %s", ddi_get_name(sv_dip), instance, str); 2115 return (0); 2116 } 2117 2118 2119 static void 2120 _sv_lyr_strategy(struct buf *bp) 2121 { 2122 caddr_t buf_addr; /* pointer to linear buffer in bp */ 2123 nsc_buf_t *bufh = NULL; 2124 nsc_buf_t *hndl = NULL; 2125 sv_dev_t *svp; 2126 nsc_vec_t *v; 2127 sv_maj_t *maj; 2128 nsc_size_t fba_req, fba_len; /* FBA lengths */ 2129 nsc_off_t fba_off; /* FBA offset */ 2130 size_t tocopy, nbytes; /* byte lengths */ 2131 int rw, rc; /* flags and return codes */ 2132 int (*fn)(); 2133 2134 rc = 0; 2135 2136 if (sv_debug > 5) 2137 cmn_err(CE_CONT, "_sv_lyr_strategy(%p)\n", (void *)bp); 2138 2139 svp = sv_find_enabled(bp->b_edev, &maj); 2140 if (svp == NULL) { 2141 if (maj && (fn = maj->sm_strategy) != 0) { 2142 if (!(maj->sm_flag & D_MP)) { 2143 UNSAFE_ENTER(); 2144 rc = (*fn)(bp); 2145 UNSAFE_EXIT(); 2146 } else { 2147 rc = (*fn)(bp); 2148 } 2149 return; 2150 } else { 2151 bioerror(bp, ENODEV); 2152 biodone(bp); 2153 return; 2154 } 2155 } 2156 2157 ASSERT(RW_READ_HELD(&svp->sv_lock)); 2158 2159 if (svp->sv_flag == 0) { 2160 /* 2161 * guard access mode 2162 * - prevent user level access to the device 2163 */ 2164 DTRACE_PROBE1(sv_lyr_strategy_err_guard, 2165 struct buf *, bp); 2166 
bioerror(bp, EPERM); 2167 goto out; 2168 } 2169 2170 if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) { 2171 DTRACE_PROBE1(sv_lyr_strategy_err_rsrv, 2172 struct buf *, bp); 2173 2174 if (rc == EINTR) 2175 cmn_err(CE_WARN, "nsc_reserve() returned EINTR"); 2176 bioerror(bp, rc); 2177 goto out; 2178 } 2179 2180 if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) { 2181 DTRACE_PROBE1(sv_lyr_strategy_eof, 2182 struct buf *, bp); 2183 2184 if (bp->b_flags & B_READ) { 2185 /* return EOF, not an error */ 2186 bp->b_resid = bp->b_bcount; 2187 bioerror(bp, 0); 2188 } else 2189 bioerror(bp, EINVAL); 2190 2191 goto done; 2192 } 2193 2194 /* 2195 * Preallocate a handle once per call to strategy. 2196 * If this fails, then the nsc_alloc_buf() will allocate 2197 * a temporary handle per allocation/free pair. 2198 */ 2199 2200 DTRACE_PROBE1(sv_dbg_alloch_start, 2201 sv_dev_t *, svp); 2202 2203 bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL); 2204 2205 DTRACE_PROBE1(sv_dbg_alloch_end, 2206 sv_dev_t *, svp); 2207 2208 if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) { 2209 DTRACE_PROBE1(sv_lyr_strategy_err_hactive, 2210 struct buf *, bp); 2211 2212 cmn_err(CE_WARN, 2213 "sv: allocated active handle (bufh %p, flags %x)", 2214 (void *)bufh, bufh->sb_flag); 2215 2216 bioerror(bp, ENXIO); 2217 goto done; 2218 } 2219 2220 fba_req = FBA_LEN(bp->b_bcount); 2221 if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks) 2222 fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno); 2223 2224 rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE; 2225 2226 bp_mapin(bp); 2227 2228 bp->b_resid = bp->b_bcount; 2229 buf_addr = bp->b_un.b_addr; 2230 fba_off = 0; 2231 2232 /* 2233 * fba_req - requested size of transfer in FBAs after 2234 * truncation to device extent, and allowing for 2235 * possible non-FBA bounded final chunk. 2236 * fba_off - offset of start of chunk from start of bp in FBAs. 2237 * fba_len - size of this chunk in FBAs. 2238 */ 2239 2240 loop: 2241 fba_len = min(fba_req, svp->sv_maxfbas); 2242 hndl = bufh; 2243 2244 DTRACE_PROBE4(sv_dbg_allocb_start, 2245 sv_dev_t *, svp, 2246 uint64_t, (uint64_t)(bp->b_lblkno + fba_off), 2247 uint64_t, (uint64_t)fba_len, 2248 int, rw); 2249 2250 rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off), 2251 fba_len, rw, &hndl); 2252 2253 DTRACE_PROBE1(sv_dbg_allocb_end, 2254 sv_dev_t *, svp); 2255 2256 if (rc > 0) { 2257 DTRACE_PROBE1(sv_lyr_strategy_err_alloc, 2258 struct buf *, bp); 2259 bioerror(bp, rc); 2260 if (hndl != bufh) 2261 (void) nsc_free_buf(hndl); 2262 hndl = NULL; 2263 goto done; 2264 } 2265 2266 tocopy = min(FBA_SIZE(fba_len), bp->b_resid); 2267 v = hndl->sb_vec; 2268 2269 if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) { 2270 /* 2271 * Not overwriting all of the last FBA, so read in the 2272 * old contents now before we overwrite it with the new 2273 * data. 
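		 *
		 * Worked example, assuming the usual 512-byte FBA: a
		 * write with b_bcount == 1300 gives fba_req ==
		 * FBA_LEN(1300) == 3 and tocopy == 1300, so
		 * FBA_OFF(tocopy) == 276 != 0.  The copy loop will only
		 * overwrite bytes 0..275 of the final FBA, so that FBA
		 * is read here first to preserve its remaining bytes
		 * 276..511 when the whole of it is written back.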
2274 */ 2275 2276 DTRACE_PROBE2(sv_dbg_read_start, 2277 sv_dev_t *, svp, 2278 uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1)); 2279 2280 rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0); 2281 if (rc > 0) { 2282 bioerror(bp, rc); 2283 goto done; 2284 } 2285 2286 DTRACE_PROBE1(sv_dbg_read_end, 2287 sv_dev_t *, svp); 2288 } 2289 2290 DTRACE_PROBE1(sv_dbg_bcopy_start, 2291 sv_dev_t *, svp); 2292 2293 while (tocopy > 0) { 2294 nbytes = min(tocopy, (nsc_size_t)v->sv_len); 2295 2296 if (bp->b_flags & B_READ) 2297 (void) bcopy(v->sv_addr, buf_addr, nbytes); 2298 else 2299 (void) bcopy(buf_addr, v->sv_addr, nbytes); 2300 2301 bp->b_resid -= nbytes; 2302 buf_addr += nbytes; 2303 tocopy -= nbytes; 2304 v++; 2305 } 2306 2307 DTRACE_PROBE1(sv_dbg_bcopy_end, 2308 sv_dev_t *, svp); 2309 2310 if ((bp->b_flags & B_READ) == 0) { 2311 DTRACE_PROBE3(sv_dbg_write_start, 2312 sv_dev_t *, svp, 2313 uint64_t, (uint64_t)hndl->sb_pos, 2314 uint64_t, (uint64_t)hndl->sb_len); 2315 2316 rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0); 2317 2318 DTRACE_PROBE1(sv_dbg_write_end, 2319 sv_dev_t *, svp); 2320 2321 if (rc > 0) { 2322 bioerror(bp, rc); 2323 goto done; 2324 } 2325 } 2326 2327 /* 2328 * Adjust FBA offset and requested (ie. remaining) length, 2329 * loop if more data to transfer. 2330 */ 2331 2332 fba_off += fba_len; 2333 fba_req -= fba_len; 2334 2335 if (fba_req > 0) { 2336 DTRACE_PROBE1(sv_dbg_freeb_start, 2337 sv_dev_t *, svp); 2338 2339 rc = nsc_free_buf(hndl); 2340 2341 DTRACE_PROBE1(sv_dbg_freeb_end, 2342 sv_dev_t *, svp); 2343 2344 if (rc > 0) { 2345 DTRACE_PROBE1(sv_lyr_strategy_err_free, 2346 struct buf *, bp); 2347 bioerror(bp, rc); 2348 } 2349 2350 hndl = NULL; 2351 2352 if (rc <= 0) 2353 goto loop; 2354 } 2355 2356 done: 2357 if (hndl != NULL) { 2358 DTRACE_PROBE1(sv_dbg_freeb_start, 2359 sv_dev_t *, svp); 2360 2361 rc = nsc_free_buf(hndl); 2362 2363 DTRACE_PROBE1(sv_dbg_freeb_end, 2364 sv_dev_t *, svp); 2365 2366 if (rc > 0) { 2367 DTRACE_PROBE1(sv_lyr_strategy_err_free, 2368 struct buf *, bp); 2369 bioerror(bp, rc); 2370 } 2371 2372 hndl = NULL; 2373 } 2374 2375 if (bufh) 2376 (void) nsc_free_handle(bufh); 2377 2378 DTRACE_PROBE1(sv_dbg_rlse_start, 2379 sv_dev_t *, svp); 2380 2381 nsc_release(svp->sv_fd); 2382 2383 DTRACE_PROBE1(sv_dbg_rlse_end, 2384 sv_dev_t *, svp); 2385 2386 out: 2387 if (sv_debug > 5) { 2388 cmn_err(CE_CONT, 2389 "_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n", 2390 (void *)bp, (void *)bufh, bp->b_error); 2391 } 2392 2393 DTRACE_PROBE2(sv_lyr_strategy_end, 2394 struct buf *, bp, 2395 int, bp->b_error); 2396 2397 rw_exit(&svp->sv_lock); 2398 biodone(bp); 2399 } 2400 2401 2402 static void 2403 sv_async_strategy(blind_t arg) 2404 { 2405 struct buf *bp = (struct buf *)arg; 2406 _sv_lyr_strategy(bp); 2407 } 2408 2409 2410 static int 2411 sv_lyr_strategy(struct buf *bp) 2412 { 2413 nsthread_t *tp; 2414 int nlive; 2415 2416 /* 2417 * If B_ASYNC was part of the DDI we could use it as a hint to 2418 * not create a thread for synchronous i/o. 2419 */ 2420 if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) { 2421 /* not sv enabled - just pass through */ 2422 DTRACE_PROBE1(sv_lyr_strategy_notsv, 2423 struct buf *, bp); 2424 _sv_lyr_strategy(bp); 2425 return (0); 2426 } 2427 2428 if (sv_debug > 4) { 2429 cmn_err(CE_CONT, "sv_lyr_strategy: nthread %d nlive %d\n", 2430 nst_nthread(sv_tset), nst_nlive(sv_tset)); 2431 } 2432 2433 /* 2434 * If there are only guard devices enabled there 2435 * won't be a threadset, so don't try and use it. 
	if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) {
		/*
		 * Not overwriting all of the last FBA, so read in the
		 * old contents now before we overwrite it with the new
		 * data.
		 */

		DTRACE_PROBE2(sv_dbg_read_start,
		    sv_dev_t *, svp,
		    uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1));

		rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0);
		if (rc > 0) {
			bioerror(bp, rc);
			goto done;
		}

		DTRACE_PROBE1(sv_dbg_read_end,
		    sv_dev_t *, svp);
	}

	DTRACE_PROBE1(sv_dbg_bcopy_start,
	    sv_dev_t *, svp);

	while (tocopy > 0) {
		nbytes = min(tocopy, (nsc_size_t)v->sv_len);

		if (bp->b_flags & B_READ)
			(void) bcopy(v->sv_addr, buf_addr, nbytes);
		else
			(void) bcopy(buf_addr, v->sv_addr, nbytes);

		bp->b_resid -= nbytes;
		buf_addr += nbytes;
		tocopy -= nbytes;
		v++;
	}

	DTRACE_PROBE1(sv_dbg_bcopy_end,
	    sv_dev_t *, svp);

	if ((bp->b_flags & B_READ) == 0) {
		DTRACE_PROBE3(sv_dbg_write_start,
		    sv_dev_t *, svp,
		    uint64_t, (uint64_t)hndl->sb_pos,
		    uint64_t, (uint64_t)hndl->sb_len);

		rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0);

		DTRACE_PROBE1(sv_dbg_write_end,
		    sv_dev_t *, svp);

		if (rc > 0) {
			bioerror(bp, rc);
			goto done;
		}
	}

	/*
	 * Adjust FBA offset and requested (i.e. remaining) length,
	 * loop if more data to transfer.
	 */

	fba_off += fba_len;
	fba_req -= fba_len;

	if (fba_req > 0) {
		DTRACE_PROBE1(sv_dbg_freeb_start,
		    sv_dev_t *, svp);

		rc = nsc_free_buf(hndl);

		DTRACE_PROBE1(sv_dbg_freeb_end,
		    sv_dev_t *, svp);

		if (rc > 0) {
			DTRACE_PROBE1(sv_lyr_strategy_err_free,
			    struct buf *, bp);
			bioerror(bp, rc);
		}

		hndl = NULL;

		if (rc <= 0)
			goto loop;
	}

done:
	if (hndl != NULL) {
		DTRACE_PROBE1(sv_dbg_freeb_start,
		    sv_dev_t *, svp);

		rc = nsc_free_buf(hndl);

		DTRACE_PROBE1(sv_dbg_freeb_end,
		    sv_dev_t *, svp);

		if (rc > 0) {
			DTRACE_PROBE1(sv_lyr_strategy_err_free,
			    struct buf *, bp);
			bioerror(bp, rc);
		}

		hndl = NULL;
	}

	if (bufh)
		(void) nsc_free_handle(bufh);

	DTRACE_PROBE1(sv_dbg_rlse_start,
	    sv_dev_t *, svp);

	nsc_release(svp->sv_fd);

	DTRACE_PROBE1(sv_dbg_rlse_end,
	    sv_dev_t *, svp);

out:
	if (sv_debug > 5) {
		cmn_err(CE_CONT,
		    "_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n",
		    (void *)bp, (void *)bufh, bp->b_error);
	}

	DTRACE_PROBE2(sv_lyr_strategy_end,
	    struct buf *, bp,
	    int, bp->b_error);

	rw_exit(&svp->sv_lock);
	biodone(bp);
}


static void
sv_async_strategy(blind_t arg)
{
	struct buf *bp = (struct buf *)arg;
	_sv_lyr_strategy(bp);
}


static int
sv_lyr_strategy(struct buf *bp)
{
	nsthread_t *tp;
	int nlive;

	/*
	 * If B_ASYNC was part of the DDI we could use it as a hint to
	 * not create a thread for synchronous i/o.
	 */
	if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
		/* not sv enabled - just pass through */
		DTRACE_PROBE1(sv_lyr_strategy_notsv,
		    struct buf *, bp);
		_sv_lyr_strategy(bp);
		return (0);
	}

	if (sv_debug > 4) {
		cmn_err(CE_CONT, "sv_lyr_strategy: nthread %d nlive %d\n",
		    nst_nthread(sv_tset), nst_nlive(sv_tset));
	}

	/*
	 * If there are only guard devices enabled there
	 * won't be a threadset, so don't try to use it.
	 */
	tp = NULL;
	if (sv_tset != NULL) {
		tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
	}

	if (tp == NULL) {
		/*
		 * Out of threads, so fall back to synchronous i/o.
		 */
		if (sv_debug > 0) {
			cmn_err(CE_CONT,
			    "sv_lyr_strategy: thread alloc failed\n");
		}

		DTRACE_PROBE1(sv_lyr_strategy_no_thread,
		    struct buf *, bp);

		_sv_lyr_strategy(bp);
		sv_no_threads++;
	} else {
		nlive = nst_nlive(sv_tset);
		if (nlive > sv_max_nlive) {
			if (sv_debug > 0) {
				cmn_err(CE_CONT,
				    "sv_lyr_strategy: "
				    "new max nlive %d (nthread %d)\n",
				    nlive, nst_nthread(sv_tset));
			}

			sv_max_nlive = nlive;
		}
	}

	return (0);
}


#ifndef offsetof
#define	offsetof(s, m)	((size_t)(&((s *)0)->m))
#endif

/*
 * re-write the size of the current partition
 */
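/*
 * Added commentary: only the p_size field of the caller's vtoc is patched.
 * The offsetof() arithmetic below locates v_part[pnum].p_size within the
 * user's struct vtoc (struct vtoc32 for ILP32 callers) and ddi_copyout()
 * overwrites just those bytes, leaving the rest of the vtoc exactly as the
 * underlying driver returned it.
 */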
static int
sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	size_t offset;
	int ilp32;
	int pnum;
	int rc;

	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);

	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (pnum < 0 || pnum >= V_NUMPAR) {
		cmn_err(CE_WARN,
		    "sv_gvtoc: unable to determine partition number "
		    "for dev %lx", svp->sv_dev);
		return (EINVAL);
	}

	if (ilp32) {
		int32_t p_size;

#ifdef _SunOS_5_6
		offset = offsetof(struct vtoc, v_part);
		offset += sizeof (struct partition) * pnum;
		offset += offsetof(struct partition, p_size);
#else
		offset = offsetof(struct vtoc32, v_part);
		offset += sizeof (struct partition32) * pnum;
		offset += offsetof(struct partition32, p_size);
#endif

		p_size = (int32_t)svp->sv_nblocks;
		if (p_size == 0) {
			if (sv_reserve(svp->sv_fd,
			    NSC_MULTI|NSC_PCATCH) == 0) {
				p_size = (int32_t)svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			} else {
				rc = EINTR;
			}
		}

		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
		    sizeof (p_size), mode) != 0) {
			rc = EFAULT;
		}
	} else {
		long p_size;

		offset = offsetof(struct vtoc, v_part);
		offset += sizeof (struct partition) * pnum;
		offset += offsetof(struct partition, p_size);

		p_size = (long)svp->sv_nblocks;
		if (p_size == 0) {
			if (sv_reserve(svp->sv_fd,
			    NSC_MULTI|NSC_PCATCH) == 0) {
				p_size = (long)svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			} else {
				rc = EINTR;
			}
		}

		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
		    sizeof (p_size), mode) != 0) {
			rc = EFAULT;
		}
	}

	return (rc);
}


#ifdef DKIOCPARTITION
/*
 * re-write the size of the current partition
 *
 * arg is dk_efi_t.
 *
 * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64;
 *
 * dk_efi_t->dki_data     --> efi_gpt_t (label header)
 * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions)
 *
 * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts
 * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself
 *
 * This assumes that sizeof (efi_gpt_t) is the same as the size of a
 * logical block on the disk.
 *
 * Everything is little endian (i.e. disk format).
 */
static int
sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	dk_efi_t efi;
	efi_gpt_t gpt;
	efi_gpe_t *gpe = NULL;
	size_t sgpe;
	uint64_t p_size;	/* virtual partition size from nsctl */
	uint32_t crc;
	int unparts;		/* number of parts in user's array */
	int pnum;
	int rc;

	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (pnum < 0) {
		cmn_err(CE_WARN,
		    "sv_efi: unable to determine partition number for dev %lx",
		    svp->sv_dev);
		return (EINVAL);
	}

	if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) {
		return (EFAULT);
	}

	efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;

	if (efi.dki_length < sizeof (gpt) + sizeof (gpe)) {
		return (EINVAL);
	}

	if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) {
		rc = EFAULT;
		goto out;
	}

	if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0)
		unparts = 1;
	else if (pnum >= unparts) {
		cmn_err(CE_WARN,
		    "sv_efi: partition# beyond end of user array (%d >= %d)",
		    pnum, unparts);
		return (EINVAL);
	}

	sgpe = sizeof (*gpe) * unparts;
	gpe = kmem_alloc(sgpe, KM_SLEEP);

	if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) {
		rc = EFAULT;
		goto out;
	}

	p_size = svp->sv_nblocks;
	if (p_size == 0) {
		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
			p_size = (diskaddr_t)svp->sv_nblocks;
			nsc_release(svp->sv_fd);
		} else {
			rc = EINTR;
		}
	}

	gpe[pnum].efi_gpe_EndingLBA = LE_64(
	    LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1);

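	/*
	 * Added commentary: the partition entry array CRC covers the
	 * efi_gpe_t array just modified, so it is recomputed (and stored
	 * back into the header) before the header CRC, which covers the
	 * header including that field.  The header CRC is computed with its
	 * own CRC field zeroed, and CRC32() leaves the bit-wise complement
	 * of the final CRC-32 value, hence the ~crc stores.
	 */
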
	gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
	CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
	gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);

	gpt.efi_gpt_HeaderCRC32 = 0;
	CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
	gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);

	if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) {
		rc = EFAULT;
		goto out;
	}

	if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) {
		rc = EFAULT;
		goto out;
	}

out:
	if (gpe) {
		kmem_free(gpe, sgpe);
	}

	return (rc);
}


/*
 * Re-write the size of the partition specified by p_partno
 *
 * Note that if a DKIOCPARTITION is issued to an fd opened against a
 * non-sv'd device, but p_partno requests the size for a different
 * device that is sv'd, this function will *not* be called as sv is
 * not interposed on the original device (the fd).
 *
 * It would not be easy to change this as we cannot get the partition
 * number for the non-sv'd device, so cannot compute the dev_t of the
 * (sv'd) p_partno device, and so cannot find out if it is sv'd or get
 * its size from nsctl.
 *
 * See also the "Bug 4755783" comment in sv_lyr_ioctl().
 */
static int
sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	struct partition64 p64;
	sv_dev_t *nsvp = NULL;
	diskaddr_t p_size;
	minor_t nminor;
	int pnum, rc;
	dev_t ndev;

	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) {
		return (EFAULT);
	}

	if (p64.p_partno != pnum) {
		/* switch to requested partition, not the current one */
		nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
		ndev = makedevice(getmajor(svp->sv_dev), nminor);
		nsvp = sv_find_enabled(ndev, NULL);
		if (nsvp == NULL) {
			/* not sv device - just return */
			return (0);
		}

		svp = nsvp;
	}

	p_size = svp->sv_nblocks;
	if (p_size == 0) {
		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
			p_size = (diskaddr_t)svp->sv_nblocks;
			nsc_release(svp->sv_fd);
		} else {
			rc = EINTR;
		}
	}

	if (nsvp != NULL) {
		rw_exit(&nsvp->sv_lock);
	}

	if ((rc == 0) && ddi_copyout(&p_size,
	    (void *)(arg + offsetof(struct partition64, p_size)),
	    sizeof (p_size), mode) != 0) {
		return (EFAULT);
	}

	return (rc);
}
#endif /* DKIOCPARTITION */


static int
sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg,
    const int mode, cred_t *crp, int *rvalp)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int rc = 0;

	maj = 0;
	fn = 0;

	/*
	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, then carry on.
	 * Otherwise it was previously SV_PREVENT_UNLOAD and is now
	 * SV_ALLOW_UNLOAD, meaning the driver is expected to unload soon.
	 *
	 * SV_ALLOW_UNLOAD is a final state, so no need to grab sv_mutex.
	 */
	if (sv_mod_status == SV_ALLOW_UNLOAD) {
		return (EBUSY);
	}

	svp = sv_find_enabled(dev, &maj);
	if (svp != NULL) {
		if (nskernd_isdaemon()) {
			/*
			 * This is nskernd which always needs to see
			 * the underlying disk device accurately.
			 *
			 * So just pass the ioctl straight through
			 * to the underlying driver as though the device
			 * was not sv enabled.
			 */
			DTRACE_PROBE2(sv_lyr_ioctl_nskernd,
			    sv_dev_t *, svp,
			    dev_t, dev);

			rw_exit(&svp->sv_lock);
			svp = NULL;
		} else {
			ASSERT(RW_READ_HELD(&svp->sv_lock));
		}
	}

	/*
	 * We now have a locked and enabled SV device, or a non-SV device.
	 */

	switch (cmd) {
	/*
	 * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI
	 * and DKIOCSETEFI are intercepted and faked up as some
	 * i/o providers emulate volumes of a different size to
	 * the underlying volume.
	 *
	 * Setting the size by rewriting the vtoc is not permitted.
	 */

	case DKIOCSVTOC:
#ifdef DKIOCPARTITION
	case DKIOCSETEFI:
#endif
		if (svp == NULL) {
			/* not intercepted -- allow ioctl through */
			break;
		}

		rw_exit(&svp->sv_lock);

		DTRACE_PROBE2(sv_lyr_ioctl_svtoc,
		    dev_t, dev,
		    int, EPERM);

		return (EPERM);

	default:
		break;
	}

	/*
	 * Pass through the real ioctl command.
	 */

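	/*
	 * Added commentary: if the underlying driver did not register as
	 * MP-safe (D_MP), the call into its ioctl entry point is bracketed
	 * by UNSAFE_ENTER()/UNSAFE_EXIT() so that it is serialized like any
	 * other non-MT-safe driver entry; on releases where no such
	 * serialization exists these macros expand to nothing.
	 */
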
	if (maj && (fn = maj->sm_ioctl) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			UNSAFE_ENTER();
			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
			UNSAFE_EXIT();
		} else {
			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
		}
	} else {
		rc = ENODEV;
	}

	/*
	 * Bug 4755783
	 * Fix up the size of the current partition to allow
	 * for the virtual volume to be a different size to the
	 * physical volume (e.g. for II compact dependent shadows).
	 *
	 * Note that this only attempts to fix up the current partition
	 * - the one that the ioctl was issued against. There could be
	 * other sv'd partitions in the same vtoc, but we cannot tell
	 * so we don't attempt to fix them up.
	 */

	if (svp != NULL && rc == 0) {
		switch (cmd) {
		case DKIOCGVTOC:
			rc = sv_fix_dkiocgvtoc(arg, mode, svp);
			break;

#ifdef DKIOCPARTITION
		case DKIOCGETEFI:
			rc = sv_fix_dkiocgetefi(arg, mode, svp);
			break;

		case DKIOCPARTITION:
			rc = sv_fix_dkiocpartition(arg, mode, svp);
			break;
#endif /* DKIOCPARTITION */
		}
	}

	if (svp != NULL) {
		rw_exit(&svp->sv_lock);
	}

	return (rc);
}
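
/*
 * Illustrative user-level sketch (not part of the driver): this is the
 * view a caller gets of the size fix-ups above when querying an
 * sv-enabled slice.  The device path and slice number are hypothetical
 * examples only.
 *
 *	#include <sys/types.h>
 *	#include <sys/dkio.h>
 *	#include <sys/vtoc.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct vtoc vtoc;
 *		int fd = open("/dev/rdsk/c0t0d0s0", O_RDONLY);
 *
 *		if (fd < 0 || ioctl(fd, DKIOCGVTOC, &vtoc) < 0) {
 *			perror("DKIOCGVTOC");
 *			return (1);
 *		}
 *
 *		// p_size reflects the sv (virtual) volume size
 *		(void) printf("slice 0: %ld blocks\n",
 *		    (long)vtoc.v_part[0].p_size);
 *		(void) close(fd);
 *		return (0);
 *	}
 */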