/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * Storage Volume Character and Block Driver (SV)
 *
 * This driver implements a simplistic /dev/{r}dsk/ interface to a
 * specified disk volume that is otherwise managed by the Prism
 * software.  The SV driver layers itself onto the underlying disk
 * device driver by changing function pointers in the cb_ops
 * structure.
 *
 * CONFIGURATION:
 *
 * 1. Configure the driver using the svadm utility.
 * 2. Access the device as before through /dev/rdsk/c?t?d?s?
 *
 * LIMITATIONS:
 *
 * This driver should NOT be used to share a device between another
 * DataServices user interface module (e.g., STE) and a user accessing
 * the device through the block device in O_WRITE mode.  This is because
 * writes through the block device are asynchronous (due to the page
 * cache) and so consistency between the block device user and the
 * STE user cannot be guaranteed.
 *
 * Data is copied between system struct buf(9s) and nsc_vec_t.  This is
 * wasteful and slow.
 */
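/*
 * Illustrative sketch (not itself driver logic): the layering described
 * above amounts to saving and replacing entries in the target driver's
 * cb_ops structure, roughly:
 *
 *	maj->sm_strategy = cb_ops->cb_strategy;		(save original)
 *	cb_ops->cb_strategy = sv_lyr_strategy;		(interpose sv)
 *
 * sv_enable() performs the full swap of the entry points, and
 * sv_free() restores them on disable.
 */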
#include <sys/debug.h>
#include <sys/types.h>

#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/varargs.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/uio.h>
#ifndef DS_DDICT
#include <sys/pathname.h>
#endif
#include <sys/aio_req.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/cmn_err.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/nsctl/nsvers.h>

#include <sys/nsc_thread.h>
#include <sys/unistat/spcs_s.h>
#include <sys/unistat/spcs_s_k.h>
#include <sys/unistat/spcs_errors.h>

#ifdef DS_DDICT
#include "../contract.h"
#endif

#include "../nsctl.h"


#include <sys/sdt.h>		/* dtrace is S10 or later */

#include "sv.h"
#include "sv_impl.h"
#include "sv_efi.h"

#define	MAX_EINTR_COUNT	1000

/*
 * sv_mod_status
 */
#define	SV_PREVENT_UNLOAD	1
#define	SV_ALLOW_UNLOAD		2

static const int sv_major_rev = ISS_VERSION_MAJ;	/* Major number */
static const int sv_minor_rev = ISS_VERSION_MIN;	/* Minor number */
static const int sv_micro_rev = ISS_VERSION_MIC;	/* Micro number */
static const int sv_baseline_rev = ISS_VERSION_NUM;	/* Baseline number */

#ifdef DKIOCPARTITION
/*
 * CRC32 polynomial table needed for computing the checksums
 * in an EFI vtoc.
 */
static const uint32_t sv_crc32_table[256] = { CRC32_TABLE };
#endif

static clock_t sv_config_time;		/* Time of successful {en,dis}able */
static int sv_debug;			/* Set non-zero for debug to syslog */
static int sv_mod_status;		/* Set to prevent modunload */

static dev_info_t *sv_dip;		/* Single DIP for driver */
static kmutex_t sv_mutex;		/* Protect global lists, etc. */

static nsc_mem_t	*sv_mem;	/* nsctl memory allocator token */


/*
 * Per device and per major state.
 */

#ifndef _SunOS_5_6
#define	UNSAFE_ENTER()
#define	UNSAFE_EXIT()
#else
#define	UNSAFE_ENTER()	mutex_enter(&unsafe_driver)
#define	UNSAFE_EXIT()	mutex_exit(&unsafe_driver)
#endif

/* hash table of major dev structures */
static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0};
static sv_dev_t *sv_devs;		/* array of per device structures */
static int sv_max_devices;		/* SV version of nsc_max_devices() */
static int sv_ndevices;			/* number of SV enabled devices */

/*
 * Threading.
 */

int sv_threads_max = 1024;		/* maximum # to dynamically alloc */
int sv_threads = 32;			/* # to pre-allocate (see sv.conf) */
int sv_threads_extra = 0;		/* addl # we would have alloc'ed */

static nstset_t *sv_tset;		/* the threadset pointer */

static int sv_threads_hysteresis = 4;	/* hysteresis for threadset resizing */
static int sv_threads_dev = 2;		/* # of threads to alloc per device */
static int sv_threads_inc = 8;		/* increment for changing the set */
static int sv_threads_needed;		/* number of threads needed */
static int sv_no_threads;		/* number of nsc_create errors */
static int sv_max_nlive;		/* max number of threads running */
/*
 * nsctl fd callbacks.
 */

static int svattach_fd(blind_t);
static int svdetach_fd(blind_t);

static nsc_def_t sv_fd_def[] = {
	{ "Attach",	(uintptr_t)svattach_fd, },
	{ "Detach",	(uintptr_t)svdetach_fd, },
	{ 0, 0, }
};

/*
 * cb_ops functions.
 */

static int svopen(dev_t *, int, int, cred_t *);
static int svclose(dev_t, int, int, cred_t *);
static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int svprint(dev_t, char *);

/*
 * These next functions are layered into the underlying driver's devops.
 */

static int sv_lyr_open(dev_t *, int, int, cred_t *);
static int sv_lyr_close(dev_t, int, int, cred_t *);
static int sv_lyr_strategy(struct buf *);
static int sv_lyr_read(dev_t, struct uio *, cred_t *);
static int sv_lyr_write(dev_t, struct uio *, cred_t *);
static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *);
static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *);
static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

static struct cb_ops sv_cb_ops = {
	svopen,		/* open */
	svclose,	/* close */
	nulldev,	/* strategy */
	svprint,
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	svioctl,
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,		/* NOT a stream */
	D_NEW | D_MP | D_64BIT,
	CB_REV,
	nodev,		/* aread */
	nodev,		/* awrite */
};


/*
 * dev_ops functions.
 */

static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int sv_attach(dev_info_t *, ddi_attach_cmd_t);
static int sv_detach(dev_info_t *, ddi_detach_cmd_t);

static struct dev_ops sv_ops = {
	DEVO_REV,
	0,
	sv_getinfo,
	nulldev,	/* identify */
	nulldev,	/* probe */
	sv_attach,
	sv_detach,
	nodev,		/* reset */
	&sv_cb_ops,
	(struct bus_ops *)0
};

/*
 * Module linkage.
 */

extern struct mod_ops mod_driverops;

static struct modldrv modldrv = {
	&mod_driverops,
	"nws:Storage Volume:" ISS_VERSION_STR,
	&sv_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	0
};


int
_init(void)
{
	int error;

	mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL);

	if ((error = mod_install(&modlinkage)) != 0) {
		mutex_destroy(&sv_mutex);
		return (error);
	}

#ifdef DEBUG
	cmn_err(CE_CONT, "!sv (revision %d.%d.%d.%d, %s, %s)\n",
	    sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev,
	    ISS_VERSION_STR, BUILD_DATE_STR);
#else
	if (sv_micro_rev) {
		cmn_err(CE_CONT, "!sv (revision %d.%d.%d, %s, %s)\n",
		    sv_major_rev, sv_minor_rev, sv_micro_rev,
		    ISS_VERSION_STR, BUILD_DATE_STR);
	} else {
		cmn_err(CE_CONT, "!sv (revision %d.%d, %s, %s)\n",
		    sv_major_rev, sv_minor_rev,
		    ISS_VERSION_STR, BUILD_DATE_STR);
	}
#endif

	return (error);
}


int
_fini(void)
{
	int error;

	if ((error = mod_remove(&modlinkage)) != 0)
		return (error);

	mutex_destroy(&sv_mutex);

	return (error);
}


int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}


/*
 * Locking & State.
 *
 * sv_mutex protects config information - sv_maj_t and sv_dev_t lists;
 * threadset creation and sizing; sv_ndevices.
 *
 * If we need to hold both sv_mutex and sv_lock, then the sv_mutex
 * must be acquired first.
 *
 * sv_lock protects the sv_dev_t structure for an individual device.
 *
 * sv_olock protects the otyp/open members of the sv_dev_t.  If we need
 * to hold both sv_lock and sv_olock, then the sv_lock must be acquired
 * first.
 *
 * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple
 * I/O operations to a device simultaneously, as above.
 *
 * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur
 * with sv_lock write-locked must be done with (sv_state == SV_PENDING)
 * and (sv_pending == curthread) so that any recursion through
 * sv_lyr_open/sv_lyr_close can be detected.
 */
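/*
 * Lock ordering sketch (illustrative): a path that needs both sv_mutex
 * and a device's sv_lock must follow the ordering rules above, e.g.
 *
 *	mutex_enter(&sv_mutex);
 *	rw_enter(&svp->sv_lock, RW_WRITER);
 *	...
 *	rw_exit(&svp->sv_lock);
 *	mutex_exit(&sv_mutex);
 *
 * sv_disable() and sv_free() below follow exactly this pattern.
 */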
static int
sv_init_devs(void)
{
	int i;

	ASSERT(MUTEX_HELD(&sv_mutex));

	if (sv_max_devices > 0)
		return (0);

	sv_max_devices = nsc_max_devices();

	if (sv_max_devices <= 0) {
		/* nsctl is not attached (nskernd not running) */
		if (sv_debug > 0)
			cmn_err(CE_CONT, "!sv: nsc_max_devices = 0\n");
		return (EAGAIN);
	}

	sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)),
	    KM_NOSLEEP, sv_mem);

	if (sv_devs == NULL) {
		cmn_err(CE_WARN, "!sv: could not allocate sv_devs array");
		return (ENOMEM);
	}

	for (i = 0; i < sv_max_devices; i++) {
		mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL);
		rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL);
	}

	if (sv_debug > 0)
		cmn_err(CE_CONT, "!sv: sv_init_devs successful\n");

	return (0);
}


static int
sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int rc;

	switch (cmd) {

	case DDI_ATTACH:
		sv_dip = dip;

		if (ddi_create_minor_node(dip, "sv", S_IFCHR,
		    0, DDI_PSEUDO, 0) != DDI_SUCCESS)
			goto failed;

		mutex_enter(&sv_mutex);

		sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0);
		if (sv_mem == NULL) {
			mutex_exit(&sv_mutex);
			goto failed;
		}

		rc = sv_init_devs();
		if (rc != 0 && rc != EAGAIN) {
			mutex_exit(&sv_mutex);
			goto failed;
		}

		mutex_exit(&sv_mutex);


		ddi_report_dev(dip);

		sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
		    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
		    "sv_threads", sv_threads);

		if (sv_debug > 0)
			cmn_err(CE_CONT, "!sv: sv_threads=%d\n", sv_threads);

		if (sv_threads > sv_threads_max)
			sv_threads_max = sv_threads;

		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

failed:
	DTRACE_PROBE(sv_attach_failed);
	(void) sv_detach(dip, DDI_DETACH);
	return (DDI_FAILURE);
}


static int
sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	sv_dev_t *svp;
	int i;

	switch (cmd) {

	case DDI_DETACH:

		/*
		 * Check that everything is disabled.
		 */

		mutex_enter(&sv_mutex);

		if (sv_mod_status == SV_PREVENT_UNLOAD) {
			mutex_exit(&sv_mutex);
			DTRACE_PROBE(sv_detach_err_prevent);
			return (DDI_FAILURE);
		}

		for (i = 0; sv_devs && i < sv_max_devices; i++) {
			svp = &sv_devs[i];

			if (svp->sv_state != SV_DISABLE) {
				mutex_exit(&sv_mutex);
				DTRACE_PROBE(sv_detach_err_busy);
				return (DDI_FAILURE);
			}
		}


		for (i = 0; sv_devs && i < sv_max_devices; i++) {
			mutex_destroy(&sv_devs[i].sv_olock);
			rw_destroy(&sv_devs[i].sv_lock);
		}

		if (sv_devs) {
			nsc_kmem_free(sv_devs,
			    (sv_max_devices * sizeof (*sv_devs)));
			sv_devs = NULL;
		}
		sv_max_devices = 0;

		if (sv_mem) {
			nsc_unregister_mem(sv_mem);
			sv_mem = NULL;
		}

		mutex_exit(&sv_mutex);

		/*
		 * Remove all minor nodes.
		 */

		ddi_remove_minor_node(dip, NULL);
		sv_dip = NULL;

		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}
}
static sv_maj_t *
sv_getmajor(const dev_t dev)
{
	sv_maj_t **insert, *maj;
	major_t umaj = getmajor(dev);

	/*
	 * See if the hash table entry, or one of the hash chains
	 * is already allocated for this major number
	 */
	if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) {
		do {
			if (maj->sm_major == umaj)
				return (maj);
		} while ((maj = maj->sm_next) != 0);
	}

	/*
	 * If sv_mutex is already held there is a design flaw, as the
	 * only callers that may arrive here without holding it are
	 * sv_enable() and sv_dev_to_sv().
	 * Return an error instead of panicking the system.
	 */
	if (MUTEX_HELD(&sv_mutex)) {
		cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
		return (NULL);
	}

	/*
	 * Determine where to allocate a new element in the hash table
	 */
	mutex_enter(&sv_mutex);
	insert = &(sv_majors[SV_MAJOR_HASH(umaj)]);
	for (maj = *insert; maj; maj = maj->sm_next) {

		/* Did another thread beat us to it? */
		if (maj->sm_major == umaj) {
			mutex_exit(&sv_mutex);
			return (maj);
		}

		/* Remember the tail of the chain as the insert point */
		if (maj->sm_next == NULL)
			insert = &maj->sm_next;
	}

	/*
	 * Located the new insert point
	 */
	*insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem);
	if ((maj = *insert) != 0)
		maj->sm_major = umaj;
	else
		cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");

	mutex_exit(&sv_mutex);

	return (maj);
}

/* ARGSUSED */

static int
sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int rc = DDI_FAILURE;

	switch (infocmd) {

	case DDI_INFO_DEVT2DEVINFO:
		*result = sv_dip;
		rc = DDI_SUCCESS;
		break;

	case DDI_INFO_DEVT2INSTANCE:
		/*
		 * We only have a single instance.
		 */
		*result = 0;
		rc = DDI_SUCCESS;
		break;

	default:
		break;
	}

	return (rc);
}


/*
 * Hashing of devices onto major device structures.
 *
 * Individual device structures are hashed onto one of the sm_hash[]
 * buckets in the relevant major device structure.
 *
 * Hash insertion and deletion -must- be done with sv_mutex held.  Hash
 * searching does not require the mutex because of the sm_seq member.
 * sm_seq is incremented on each insertion (-after- hash chain pointer
 * manipulation) and each deletion (-before- hash chain pointer
 * manipulation).  When searching the hash chain, the seq number is
 * checked before accessing each device structure; if the seq number has
 * changed, then we restart the search from the top of the hash chain.
 * If we restart more than SV_HASH_RETRY times, we take sv_mutex and search
 * the hash chain (we are guaranteed that this search cannot be
 * interrupted).
 */

#define	SV_HASH_RETRY	16
static sv_dev_t *
sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp)
{
	minor_t umin = getminor(dev);
	sv_dev_t **hb, *next, *svp;
	sv_maj_t *maj;
	int seq;
	int try;

	/* Get major hash table */
	maj = sv_getmajor(dev);
	if (majpp)
		*majpp = maj;
	if (maj == NULL)
		return (NULL);

	if (maj->sm_inuse == 0) {
		DTRACE_PROBE1(
		    sv_dev_to_sv_end,
		    dev_t, dev);
		return (NULL);
	}

	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
	try = 0;

retry:
	if (try > SV_HASH_RETRY)
		mutex_enter(&sv_mutex);

	seq = maj->sm_seq;
	for (svp = *hb; svp; svp = next) {
		next = svp->sv_hash;

		nsc_membar_stld();	/* preserve register load order */

		if (maj->sm_seq != seq) {
			DTRACE_PROBE1(sv_dev_to_sv_retry, dev_t, dev);
			try++;
			goto retry;
		}

		if (svp->sv_dev == dev)
			break;
	}

	if (try > SV_HASH_RETRY)
		mutex_exit(&sv_mutex);

	return (svp);
}
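/*
 * The sm_seq protocol, in brief (illustrative): writers bump sm_seq
 * after linking a new entry and before unlinking one, so a reader that
 * observes an unchanged sm_seq across a pointer dereference knows the
 * chain did not change underneath it:
 *
 *	seq = maj->sm_seq;
 *	next = svp->sv_hash;
 *	if (maj->sm_seq != seq)
 *		restart the search (and after SV_HASH_RETRY restarts,
 *		take sv_mutex and walk the chain undisturbed);
 */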
/*
 * Must be called with sv_mutex held.
 */

static int
sv_get_state(const dev_t udev, sv_dev_t **svpp)
{
	sv_dev_t **hb, **insert, *svp;
	sv_maj_t *maj;
	minor_t umin;
	int i;

	/* Get major hash table */
	if ((maj = sv_getmajor(udev)) == NULL)
		return (SV_EBADDEV);

	/* Determine which minor hash table */
	umin = getminor(udev);
	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);

	/* look for clash */

	insert = hb;

	for (svp = *hb; svp; svp = svp->sv_hash) {
		if (svp->sv_dev == udev)
			break;

		if (svp->sv_hash == NULL)
			insert = &svp->sv_hash;
	}

	if (svp) {
		DTRACE_PROBE1(
		    sv_get_state_enabled,
		    dev_t, udev);
		return (SV_EENABLED);
	}

	/* look for spare sv_devs slot */

	for (i = 0; i < sv_max_devices; i++) {
		svp = &sv_devs[i];

		if (svp->sv_state == SV_DISABLE)
			break;
	}

	if (i >= sv_max_devices) {
		DTRACE_PROBE1(
		    sv_get_state_noslots,
		    dev_t, udev);
		return (SV_ENOSLOTS);
	}

	svp->sv_state = SV_PENDING;
	svp->sv_pending = curthread;

	*insert = svp;
	svp->sv_hash = NULL;
	maj->sm_seq++;		/* must be after the store to the hash chain */

	*svpp = svp;

	/*
	 * We do not know the size of the underlying device at
	 * this stage, so initialise "nblocks" property to
	 * zero, and update it whenever we succeed in
	 * nsc_reserve'ing the underlying nsc_fd_t.
	 */

	svp->sv_nblocks = 0;

	return (0);
}


/*
 * Remove a device structure from its hash chain.
 * Must be called with sv_mutex held.
 */

static void
sv_rm_hash(sv_dev_t *svp)
{
	sv_dev_t **svpp;
	sv_maj_t *maj;

	/* Get major hash table */
	if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
		return;

	/* remove svp from hash chain */

	svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]);
	while (*svpp) {
		if (*svpp == svp) {
			/*
			 * increment of sm_seq must be before the
			 * removal from the hash chain
			 */
			maj->sm_seq++;
			*svpp = svp->sv_hash;
			break;
		}

		svpp = &(*svpp)->sv_hash;
	}

	svp->sv_hash = NULL;
}

/*
 * Free (disable) a device structure.
 * Must be called with sv_lock(RW_WRITER) and sv_mutex held, and will
 * perform the exits during its processing.
 */

static int
sv_free(sv_dev_t *svp, const int error)
{
	struct cb_ops *cb_ops;
	sv_maj_t *maj;

	/* Get major hash table */
	if ((maj = sv_getmajor(svp->sv_dev)) == NULL) {
		/* still perform the exits, as documented above */
		rw_exit(&svp->sv_lock);
		mutex_exit(&sv_mutex);
		return (SV_EBADDEV);
	}

	svp->sv_state = SV_PENDING;
	svp->sv_pending = curthread;

	/*
	 * Close the fd's before removing from the hash or swapping
	 * back the cb_ops pointers so that the cache flushes before new
	 * io can come in.
	 */

	if (svp->sv_fd) {
		(void) nsc_close(svp->sv_fd);
		svp->sv_fd = 0;
	}

	sv_rm_hash(svp);

	if (error != SV_ESDOPEN &&
	    error != SV_ELYROPEN && --maj->sm_inuse == 0) {

		if (maj->sm_dev_ops)
			cb_ops = maj->sm_dev_ops->devo_cb_ops;
		else
			cb_ops = NULL;

		if (cb_ops && maj->sm_strategy != NULL) {
			cb_ops->cb_strategy = maj->sm_strategy;
			cb_ops->cb_close = maj->sm_close;
			cb_ops->cb_ioctl = maj->sm_ioctl;
			cb_ops->cb_write = maj->sm_write;
			cb_ops->cb_open = maj->sm_open;
			cb_ops->cb_read = maj->sm_read;
			cb_ops->cb_flag = maj->sm_flag;

			if (maj->sm_awrite)
				cb_ops->cb_awrite = maj->sm_awrite;

			if (maj->sm_aread)
				cb_ops->cb_aread = maj->sm_aread;

			/*
			 * corbin XXX
			 * Leave backing device ops in maj->sm_*
			 * to handle any requests that might come
			 * in during the disable.  This could be
			 * a problem however if the backing device
			 * driver is changed while we process these
			 * requests.
			 *
			 * maj->sm_strategy = 0;
			 * maj->sm_awrite = 0;
			 * maj->sm_write = 0;
			 * maj->sm_ioctl = 0;
			 * maj->sm_close = 0;
			 * maj->sm_aread = 0;
			 * maj->sm_read = 0;
			 * maj->sm_open = 0;
			 * maj->sm_flag = 0;
			 *
			 */
		}

		if (maj->sm_dev_ops) {
			maj->sm_dev_ops = 0;
		}
	}

	if (svp->sv_lh) {
		cred_t *crp = ddi_get_cred();

		/*
		 * Close the protective layered driver open using the
		 * Sun Private layered driver i/f.
		 */

		(void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp);
		svp->sv_lh = NULL;
	}

	svp->sv_timestamp = nsc_lbolt();
	svp->sv_state = SV_DISABLE;
	svp->sv_pending = NULL;
	rw_exit(&svp->sv_lock);
	mutex_exit(&sv_mutex);

	return (error);
}

/*
 * Reserve the device, taking into account the possibility that
 * the reserve might have to be retried.
 */
static int
sv_reserve(nsc_fd_t *fd, int flags)
{
	int eintr_count;
	int rc;

	eintr_count = 0;
	do {
		rc = nsc_reserve(fd, flags);
		if (rc == EINTR) {
			++eintr_count;
			delay(2);
		}
	} while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT));

	return (rc);
}
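/*
 * Typical caller pattern (illustrative): a successful sv_reserve()
 * must always be balanced by nsc_release() on the same fd, e.g.
 *
 *	if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
 *		nblocks = svp->sv_nblocks;
 *		nsc_release(svp->sv_fd);
 *	}
 *
 * as in sv_enable() and sv_list() below.
 */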
static int
sv_enable(const caddr_t path, const int flag,
    const dev_t udev, spcs_s_info_t kstatus)
{
	struct dev_ops *dev_ops;
	struct cb_ops *cb_ops;
	sv_dev_t *svp;
	sv_maj_t *maj;
	nsc_size_t nblocks;
	int rc;
	cred_t *crp;
	ldi_ident_t li;

	if (udev == (dev_t)-1 || udev == 0) {
		DTRACE_PROBE1(
		    sv_enable_err_baddev,
		    dev_t, udev);
		return (SV_EBADDEV);
	}

	if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) {
		DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev);
		return (SV_EAMODE);
	}

	/* Get major hash table */
	if ((maj = sv_getmajor(udev)) == NULL)
		return (SV_EBADDEV);

	mutex_enter(&sv_mutex);

	rc = sv_get_state(udev, &svp);
	if (rc) {
		mutex_exit(&sv_mutex);
		DTRACE_PROBE1(sv_enable_err_state, dev_t, udev);
		return (rc);
	}

	rw_enter(&svp->sv_lock, RW_WRITER);

	/*
	 * Get real fd used for io
	 */

	svp->sv_dev = udev;
	svp->sv_flag = flag;

	/*
	 * OR in NSC_DEVICE to ensure that nskern grabs the real strategy
	 * function pointer before sv swaps them out.
	 */

	svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE),
	    sv_fd_def, (blind_t)udev, &rc);

	if (svp->sv_fd == NULL) {
		if (kstatus)
			spcs_s_add(kstatus, rc);
		DTRACE_PROBE1(sv_enable_err_fd, dev_t, udev);
		return (sv_free(svp, SV_ESDOPEN));
	}

	/*
	 * Perform a layered driver open using the Sun Private layered
	 * driver i/f to ensure that the cb_ops structure for the driver
	 * is not detached out from under us whilst sv is enabled.
	 */

	crp = ddi_get_cred();
	svp->sv_lh = NULL;

	if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) {
		rc = ldi_open_by_dev(&svp->sv_dev,
		    OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li);
	}

	if (rc != 0) {
		if (kstatus)
			spcs_s_add(kstatus, rc);
		DTRACE_PROBE1(sv_enable_err_lyr_open, dev_t, udev);
		return (sv_free(svp, SV_ELYROPEN));
	}

	/*
	 * Do layering if required - must happen after nsc_open().
	 */

	if (maj->sm_inuse++ == 0) {
		maj->sm_dev_ops = nsc_get_devops(getmajor(udev));

		if (maj->sm_dev_ops == NULL ||
		    maj->sm_dev_ops->devo_cb_ops == NULL) {
			DTRACE_PROBE1(sv_enable_err_load, dev_t, udev);
			return (sv_free(svp, SV_ELOAD));
		}

		dev_ops = maj->sm_dev_ops;
		cb_ops = dev_ops->devo_cb_ops;

		if (cb_ops->cb_strategy == NULL ||
		    cb_ops->cb_strategy == nodev ||
		    cb_ops->cb_strategy == nulldev) {
			DTRACE_PROBE1(sv_enable_err_nostrategy, dev_t, udev);
			return (sv_free(svp, SV_ELOAD));
		}

		if (cb_ops->cb_strategy == sv_lyr_strategy) {
			DTRACE_PROBE1(sv_enable_err_svstrategy, dev_t, udev);
			return (sv_free(svp, SV_ESTRATEGY));
		}

		maj->sm_strategy = cb_ops->cb_strategy;
		maj->sm_close = cb_ops->cb_close;
		maj->sm_ioctl = cb_ops->cb_ioctl;
		maj->sm_write = cb_ops->cb_write;
		maj->sm_open = cb_ops->cb_open;
		maj->sm_read = cb_ops->cb_read;
		maj->sm_flag = cb_ops->cb_flag;

		cb_ops->cb_flag = cb_ops->cb_flag | D_MP;
		cb_ops->cb_strategy = sv_lyr_strategy;
		cb_ops->cb_close = sv_lyr_close;
		cb_ops->cb_ioctl = sv_lyr_ioctl;
		cb_ops->cb_write = sv_lyr_write;
		cb_ops->cb_open = sv_lyr_open;
		cb_ops->cb_read = sv_lyr_read;

		/*
		 * Check that the driver has async I/O entry points
		 * before changing them.
		 */

		if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) {
			maj->sm_awrite = 0;
			maj->sm_aread = 0;
		} else {
			maj->sm_awrite = cb_ops->cb_awrite;
			maj->sm_aread = cb_ops->cb_aread;

			cb_ops->cb_awrite = sv_lyr_awrite;
			cb_ops->cb_aread = sv_lyr_aread;
		}

		/*
		 * Bug 4645743
		 *
		 * Prevent sv from ever unloading after it has interposed
		 * on a major device because there is a race between
		 * sv removing its layered entry points from the target
		 * dev_ops, a client coming in and accessing the driver,
		 * and the kernel modunloading the sv text.
		 *
		 * To allow unload, do svboot -u, which only happens at
		 * pkgrm time.
		 */
		ASSERT(MUTEX_HELD(&sv_mutex));
		sv_mod_status = SV_PREVENT_UNLOAD;
	}


	svp->sv_timestamp = nsc_lbolt();
	svp->sv_state = SV_ENABLE;
	svp->sv_pending = NULL;
	rw_exit(&svp->sv_lock);

	sv_ndevices++;
	mutex_exit(&sv_mutex);

	nblocks = 0;
	if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) {
		nblocks = svp->sv_nblocks;
		nsc_release(svp->sv_fd);
	}

	cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n",
	    svp->sv_dev, nblocks);

	return (0);
}
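/*
 * In summary, sv_enable() above proceeds in four steps:
 *
 *	1. nsc_open() the volume to obtain the nsc_fd_t used for I/O;
 *	2. ldi_open_by_dev() to pin the underlying driver in memory;
 *	3. on the first enable for a major number, save the driver's
 *	   cb_ops entry points in the sv_maj_t and interpose the
 *	   sv_lyr_*() routines;
 *	4. set SV_PREVENT_UNLOAD so the module text cannot be unloaded
 *	   while interposed (bug 4645743).
 */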
static int
sv_prepare_unload(void)
{
	int rc = 0;

	mutex_enter(&sv_mutex);

	if (sv_mod_status == SV_PREVENT_UNLOAD) {
		if ((sv_ndevices != 0) || (sv_tset != NULL)) {
			rc = EBUSY;
		} else {
			sv_mod_status = SV_ALLOW_UNLOAD;
			delay(SV_WAIT_UNLOAD * drv_usectohz(1000000));
		}
	}

	mutex_exit(&sv_mutex);
	return (rc);
}

static int
svattach_fd(blind_t arg)
{
	dev_t dev = (dev_t)arg;
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
	int rc;

	if (sv_debug > 0)
		cmn_err(CE_CONT, "!svattach_fd(%p, %p)\n", arg, (void *)svp);

	if (svp == NULL) {
		cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg);
		return (0);
	}

	if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) {
		cmn_err(CE_WARN,
		    "!svattach_fd: nsc_partsize() failed, rc %d", rc);
		svp->sv_nblocks = 0;
	}

	if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) {
		cmn_err(CE_WARN,
		    "!svattach_fd: nsc_maxfbas() failed, rc %d", rc);
		svp->sv_maxfbas = 0;
	}

	if (sv_debug > 0) {
		cmn_err(CE_CONT,
		    "!svattach_fd(%p): size %" NSC_SZFMT ", "
		    "maxfbas %" NSC_SZFMT "\n",
		    arg, svp->sv_nblocks, svp->sv_maxfbas);
	}

	return (0);
}


static int
svdetach_fd(blind_t arg)
{
	dev_t dev = (dev_t)arg;
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);

	if (sv_debug > 0)
		cmn_err(CE_CONT, "!svdetach_fd(%p, %p)\n", arg, (void *)svp);

	/* svp can be NULL during disable of an sv */
	if (svp == NULL)
		return (0);

	svp->sv_maxfbas = 0;
	svp->sv_nblocks = 0;
	return (0);
}
/*
 * Disable an sv device.  Acquires sv_mutex and the device's
 * sv_lock(RW_WRITER); on the success path both are released by
 * sv_free() before returning.
 */

/* ARGSUSED */
static int
sv_disable(dev_t dev, spcs_s_info_t kstatus)
{
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);

	if (svp == NULL) {

		DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp);
		return (SV_ENODEV);
	}

	mutex_enter(&sv_mutex);
	rw_enter(&svp->sv_lock, RW_WRITER);

	if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) {
		rw_exit(&svp->sv_lock);
		mutex_exit(&sv_mutex);

		DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp);
		return (SV_EDISABLED);
	}


	sv_ndevices--;
	return (sv_free(svp, 0));
}
static int
sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp)
{
	nsc_buf_t *tmph;
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	dev_t odev;
	int ret;
	int rc;

	svp = sv_dev_to_sv(*devp, &maj);

	if (svp) {
		if (svp->sv_state == SV_PENDING &&
		    svp->sv_pending == curthread) {
			/*
			 * This is a recursive open from a call to
			 * ddi_lyr_open_by_devt and so we just want
			 * to pass it straight through to the
			 * underlying driver.
			 */
			DTRACE_PROBE2(sv_lyr_open_recursive,
			    sv_dev_t *, svp,
			    dev_t, *devp);
			svp = NULL;
		} else
			rw_enter(&svp->sv_lock, RW_READER);
	}

	odev = *devp;

	if (maj && (fn = maj->sm_open) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			UNSAFE_ENTER();
			ret = (*fn)(devp, flag, otyp, crp);
			UNSAFE_EXIT();
		} else {
			ret = (*fn)(devp, flag, otyp, crp);
		}

		if (ret == 0) {
			/*
			 * Re-acquire svp if the driver changed *devp.
			 */

			if (*devp != odev) {
				if (svp != NULL)
					rw_exit(&svp->sv_lock);

				svp = sv_dev_to_sv(*devp, NULL);

				if (svp) {
					rw_enter(&svp->sv_lock, RW_READER);
				}
			}
		}
	} else {
		ret = ENODEV;
	}

	if (svp && ret != 0 && svp->sv_state == SV_ENABLE) {
		/*
		 * Underlying DDI open failed, but we have this
		 * device SV enabled.  If we can read some data
		 * from the device, fake a successful open (this
		 * probably means that this device is RDC'd and we
		 * are getting the data from the secondary node).
		 *
		 * The reserve must be done with NSC_TRY|NSC_NOWAIT to
		 * ensure that it does not deadlock if this open is
		 * coming from nskernd:get_bsize().
		 */
		rc = sv_reserve(svp->sv_fd,
		    NSC_TRY | NSC_NOWAIT | NSC_MULTI | NSC_PCATCH);
		if (rc == 0) {
			tmph = NULL;

			rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph);
			if (rc <= 0) {
				/* success */
				ret = 0;
			}

			if (tmph) {
				(void) nsc_free_buf(tmph);
				tmph = NULL;
			}

			nsc_release(svp->sv_fd);

			/*
			 * Count the number of layered opens that we
			 * fake since we have to fake a matching number
			 * of closes (OTYP_LYR open/close calls must be
			 * paired).
			 */

			if (ret == 0 && otyp == OTYP_LYR) {
				mutex_enter(&svp->sv_olock);
				svp->sv_openlcnt++;
				mutex_exit(&svp->sv_olock);
			}
		}
	}

	if (svp) {
		rw_exit(&svp->sv_lock);
	}

	return (ret);
}
static int
sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int ret;

	svp = sv_dev_to_sv(dev, &maj);

	if (svp &&
	    svp->sv_state == SV_PENDING &&
	    svp->sv_pending == curthread) {
		/*
		 * This is a recursive close from a call to
		 * ddi_lyr_close and so we just want
		 * to pass it straight through to the
		 * underlying driver.
		 */
		DTRACE_PROBE2(sv_lyr_close_recursive, sv_dev_t *, svp,
		    dev_t, dev);
		svp = NULL;
	}

	if (svp) {
		rw_enter(&svp->sv_lock, RW_READER);

		if (otyp == OTYP_LYR) {
			mutex_enter(&svp->sv_olock);

			if (svp->sv_openlcnt) {
				/*
				 * Consume sufficient layered closes to
				 * account for the opens that we faked
				 * whilst the device was failed.
				 */
				svp->sv_openlcnt--;
				mutex_exit(&svp->sv_olock);
				rw_exit(&svp->sv_lock);

				DTRACE_PROBE1(sv_lyr_close_end, dev_t, dev);

				return (0);
			}

			mutex_exit(&svp->sv_olock);
		}
	}

	if (maj && (fn = maj->sm_close) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			UNSAFE_ENTER();
			ret = (*fn)(dev, flag, otyp, crp);
			UNSAFE_EXIT();
		} else {
			ret = (*fn)(dev, flag, otyp, crp);
		}
	} else {
		ret = ENODEV;
	}

	if (svp) {
		rw_exit(&svp->sv_lock);
	}

	return (ret);
}


/*
 * Convert the specified dev_t into a locked and enabled sv_dev_t, or
 * return NULL.
 */
static sv_dev_t *
sv_find_enabled(const dev_t dev, sv_maj_t **majpp)
{
	sv_dev_t *svp;

	while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) {
		rw_enter(&svp->sv_lock, RW_READER);

		if (svp->sv_state == SV_ENABLE) {
			/* locked and enabled */
			break;
		}

		/*
		 * State was changed while waiting on the lock.
		 * Wait for a stable state.
		 */
		rw_exit(&svp->sv_lock);

		DTRACE_PROBE1(sv_find_enabled_retry, dev_t, dev);

		delay(2);
	}

	return (svp);
}
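/*
 * Usage note (illustrative): on success the caller of sv_find_enabled()
 * holds sv_lock as reader and must drop it when finished:
 *
 *	if ((svp = sv_find_enabled(dev, &maj)) != NULL) {
 *		... I/O against svp->sv_fd ...
 *		rw_exit(&svp->sv_lock);
 *	}
 *
 * sv_lyr_uio() and _sv_lyr_strategy() below both follow this pattern.
 */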
static int
sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int rc;

	svp = sv_find_enabled(dev, &maj);
	if (svp == NULL) {
		if (maj) {
			if (rw == NSC_READ)
				fn = maj->sm_read;
			else
				fn = maj->sm_write;

			if (fn != 0) {
				if (!(maj->sm_flag & D_MP)) {
					UNSAFE_ENTER();
					rc = (*fn)(dev, uiop, crp);
					UNSAFE_EXIT();
				} else {
					rc = (*fn)(dev, uiop, crp);
				}
			} else {
				/* do not return an uninitialised rc */
				rc = ENODEV;
			}

			return (rc);
		} else {
			return (ENODEV);
		}
	}

	ASSERT(RW_READ_HELD(&svp->sv_lock));

	if (svp->sv_flag == 0) {
		/*
		 * guard access mode
		 * - prevent user level access to the device
		 */
		DTRACE_PROBE1(sv_lyr_uio_err_guard, uio_t *, uiop);
		rc = EPERM;
		goto out;
	}

	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
		DTRACE_PROBE1(sv_lyr_uio_err_rsrv, uio_t *, uiop);
		goto out;
	}

	if (rw == NSC_READ)
		rc = nsc_uread(svp->sv_fd, uiop, crp);
	else
		rc = nsc_uwrite(svp->sv_fd, uiop, crp);

	nsc_release(svp->sv_fd);

out:
	rw_exit(&svp->sv_lock);

	return (rc);
}


static int
sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp)
{
	return (sv_lyr_uio(dev, uiop, crp, NSC_READ));
}


static int
sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp)
{
	return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE));
}


/* ARGSUSED */

static int
sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp)
{
	return (aphysio(sv_lyr_strategy,
	    anocancel, dev, B_READ, minphys, aio));
}


/* ARGSUSED */

static int
sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp)
{
	return (aphysio(sv_lyr_strategy,
	    anocancel, dev, B_WRITE, minphys, aio));
}


/*
 * Set up an array containing the list of raw path names.
 * The array for the paths is svn and the size of the array is
 * in size.
 *
 * If there are more layered devices than will fit in the array,
 * the number of extra layered devices is returned.  Otherwise
 * zero is returned.
 *
 * Input:
 *	svn	: array for paths
 *	size	: size of the array
 *
 * Output (extra):
 *	zero	: all paths fit in the array
 *	>0	: number of layered devices that do not fit in the array
 */

static int
sv_list(void *ptr, const int size, int *extra, const int ilp32)
{
	sv_name32_t *svn32;
	sv_name_t *svn;
	sv_dev_t *svp;
	int *mode, *nblocks;
	int i, index;
	char *path;

	*extra = 0;
	index = 0;

	if (ilp32)
		svn32 = ptr;
	else
		svn = ptr;

	mutex_enter(&sv_mutex);
	for (i = 0; i < sv_max_devices; i++) {
		svp = &sv_devs[i];

		rw_enter(&svp->sv_lock, RW_READER);

		if (svp->sv_state != SV_ENABLE) {
			rw_exit(&svp->sv_lock);
			continue;
		}

		if ((*extra) != 0 || ptr == NULL) {
			/* Another overflow entry */
			rw_exit(&svp->sv_lock);
			(*extra)++;
			continue;
		}

		if (ilp32) {
			nblocks = &svn32->svn_nblocks;
			mode = &svn32->svn_mode;
			path = svn32->svn_path;

			svn32->svn_timestamp = (uint32_t)svp->sv_timestamp;
			svn32++;
		} else {
			nblocks = &svn->svn_nblocks;
			mode = &svn->svn_mode;
			path = svn->svn_path;

			svn->svn_timestamp = svp->sv_timestamp;
			svn++;
		}

		(void) strcpy(path, nsc_pathname(svp->sv_fd));
		*nblocks = svp->sv_nblocks;
		*mode = svp->sv_flag;

		if (*nblocks == 0) {
			if (sv_debug > 3)
				cmn_err(CE_CONT, "!sv_list: need to reserve\n");

			if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
				*nblocks = svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			}
		}

		if (++index >= size) {
			/* Out of space */
			(*extra)++;
		}

		rw_exit(&svp->sv_lock);
	}
	mutex_exit(&sv_mutex);

	if (index < size) {
		/* NULL terminated list */
		if (ilp32)
			svn32->svn_path[0] = '\0';
		else
			svn->svn_path[0] = '\0';
	}

	return (0);
}
static void
sv_thread_tune(int threads)
{
	int incr = (threads > 0) ? 1 : -1;
	int change = 0;
	int nthreads;

	ASSERT(MUTEX_HELD(&sv_mutex));

	if (sv_threads_extra) {
		/* keep track of any additional threads requested */
		if (threads > 0) {
			sv_threads_extra += threads;
			return;
		}
		threads = -threads;
		if (threads >= sv_threads_extra) {
			threads -= sv_threads_extra;
			sv_threads_extra = 0;
			/* fall through to while loop */
		} else {
			sv_threads_extra -= threads;
			return;
		}
	} else if (threads > 0) {
		/*
		 * do not increase the number of threads beyond
		 * sv_threads_max when doing dynamic thread tuning
		 */
		nthreads = nst_nthread(sv_tset);
		if ((nthreads + threads) > sv_threads_max) {
			sv_threads_extra = nthreads + threads - sv_threads_max;
			threads = sv_threads_max - nthreads;
			if (threads <= 0)
				return;
		}
	}

	if (threads < 0)
		threads = -threads;

	while (threads--) {
		nthreads = nst_nthread(sv_tset);
		sv_threads_needed += incr;

		if (sv_threads_needed >= nthreads)
			change += nst_add_thread(sv_tset, sv_threads_inc);
		else if ((sv_threads_needed <
		    (nthreads - (sv_threads_inc + sv_threads_hysteresis))) &&
		    ((nthreads - sv_threads_inc) >= sv_threads))
			change -= nst_del_thread(sv_tset, sv_threads_inc);
	}

#ifdef DEBUG
	if (change) {
		cmn_err(CE_NOTE,
		    "!sv_thread_tune: threads needed %d, nthreads %d, "
		    "nthreads change %d",
		    sv_threads_needed, nst_nthread(sv_tset), change);
	}
#endif
}
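/*
 * Worked example (illustrative): with the defaults sv_threads_inc == 8
 * and sv_threads_hysteresis == 4, a threadset currently holding 32
 * threads is not shrunk until sv_threads_needed drops below
 * 32 - (8 + 4) == 20.  The hysteresis band prevents thrashing when
 * demand hovers around a grow/shrink boundary.
 */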
/* ARGSUSED */
static int
svopen(dev_t *devp, int flag, int otyp, cred_t *crp)
{
	int rc;

	mutex_enter(&sv_mutex);
	rc = sv_init_devs();
	mutex_exit(&sv_mutex);

	return (rc);
}


/* ARGSUSED */
static int
svclose(dev_t dev, int flag, int otyp, cred_t *crp)
{
	const int secs = HZ * 5;	/* 5 seconds, expressed in ticks */
	const int ticks = HZ / 10;
	int loops = secs / ticks;

	mutex_enter(&sv_mutex);
	while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) {
		if (nst_nlive(sv_tset) <= 0) {
			nst_destroy(sv_tset);
			sv_tset = NULL;
			break;
		}

		/* threads still active - wait for them to exit */
		mutex_exit(&sv_mutex);
		delay(ticks);
		loops--;
		mutex_enter(&sv_mutex);
	}
	mutex_exit(&sv_mutex);

	if (loops <= 0) {
		cmn_err(CE_WARN,
#ifndef DEBUG
		    /* do not write to console when non-DEBUG */
		    "!"
#endif
		    "sv:svclose: threads still active "
		    "after %d sec - leaking thread set", secs / HZ);
	}

	return (0);
}
static int
svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp)
{
	char itmp1[12], itmp2[12];	/* temp char array for editing ints */
	spcs_s_info_t kstatus;		/* Kernel version of spcs status */
	spcs_s_info_t ustatus;		/* Address of user spcs status */
	sv_list32_t svl32;		/* 32 bit Initial struct for SVIOC_LIST */
	sv_version_t svv;		/* Version structure */
	sv_conf_t svc;			/* User config structure */
	sv_list_t svl;			/* Initial structure for SVIOC_LIST */
	void *usvn;			/* Address of user sv_name_t */
	void *svn = NULL;		/* Array for SVIOC_LIST */
	uint64_t phash;			/* pathname hash */
	int rc = 0;			/* Return code -- errno */
	int size;			/* Number of items in array */
	int bytes;			/* Byte size of array */
	int ilp32;			/* Convert structures for ilp32 userland */

	*rvalp = 0;

	/*
	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, then it will continue.
	 * else it means it previously was SV_PREVENT_UNLOAD, and now it's
	 * SV_ALLOW_UNLOAD, expecting the driver to eventually unload.
	 *
	 * SV_ALLOW_UNLOAD is final state, so no need to grab sv_mutex.
	 */
	if (sv_mod_status == SV_ALLOW_UNLOAD) {
		return (EBUSY);
	}

	if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0))
		return (rc);

	kstatus = spcs_s_kcreate();
	if (!kstatus) {
		DTRACE_PROBE1(sv_ioctl_err_kcreate, dev_t, dev);
		return (ENOMEM);
	}

	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);

	switch (cmd) {

	case SVIOC_ENABLE:

		if (ilp32) {
			sv_conf32_t svc32;

			if (ddi_copyin((void *)arg, &svc32,
			    sizeof (svc32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
			(void) strcpy(svc.svc_path, svc32.svc_path);
			svc.svc_flag = svc32.svc_flag;
			svc.svc_major = svc32.svc_major;
			svc.svc_minor = svc32.svc_minor;
		} else {
			if (ddi_copyin((void *)arg, &svc,
			    sizeof (svc), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
		}

		/* force to raw access */
		svc.svc_flag = NSC_DEVICE;

		if (sv_tset == NULL) {
			mutex_enter(&sv_mutex);

			if (sv_tset == NULL) {
				sv_tset = nst_init("sv_thr", sv_threads);
			}

			mutex_exit(&sv_mutex);

			if (sv_tset == NULL) {
				cmn_err(CE_WARN,
				    "!sv: could not allocate %d threads",
				    sv_threads);
			}
		}

		rc = sv_enable(svc.svc_path, svc.svc_flag,
		    makedevice(svc.svc_major, svc.svc_minor), kstatus);

		if (rc == 0) {
			sv_config_time = nsc_lbolt();

			mutex_enter(&sv_mutex);
			sv_thread_tune(sv_threads_dev);
			mutex_exit(&sv_mutex);
		}

		DTRACE_PROBE3(sv_ioctl_end, dev_t, dev, int, *rvalp, int, rc);

		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
		/* NOTREACHED */

	case SVIOC_DISABLE:

		if (ilp32) {
			sv_conf32_t svc32;

			if (ddi_copyin((void *)arg, &svc32,
			    sizeof (svc32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
			svc.svc_major = svc32.svc_major;
			svc.svc_minor = svc32.svc_minor;
			(void) strcpy(svc.svc_path, svc32.svc_path);
			svc.svc_flag = svc32.svc_flag;
		} else {
			if (ddi_copyin((void *)arg, &svc,
			    sizeof (svc), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
		}

		if (svc.svc_major == (major_t)-1 &&
		    svc.svc_minor == (minor_t)-1) {
			sv_dev_t *svp;
			int i;

			/*
			 * User level could not find the minor device
			 * node, so do this the slow way by searching
			 * the entire sv config for a matching pathname.
			 */

			phash = nsc_strhash(svc.svc_path);

			mutex_enter(&sv_mutex);

			for (i = 0; i < sv_max_devices; i++) {
				svp = &sv_devs[i];

				if (svp->sv_state == SV_DISABLE ||
				    svp->sv_fd == NULL)
					continue;

				if (nsc_fdpathcmp(svp->sv_fd, phash,
				    svc.svc_path) == 0) {
					svc.svc_major = getmajor(svp->sv_dev);
					svc.svc_minor = getminor(svp->sv_dev);
					break;
				}
			}

			mutex_exit(&sv_mutex);

			if (svc.svc_major == (major_t)-1 &&
			    svc.svc_minor == (minor_t)-1)
				return (spcs_s_ocopyoutf(&kstatus,
				    svc.svc_error, SV_ENODEV));
		}

		rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor),
		    kstatus);

		if (rc == 0) {
			sv_config_time = nsc_lbolt();

			mutex_enter(&sv_mutex);
			sv_thread_tune(-sv_threads_dev);
			mutex_exit(&sv_mutex);
		}

		DTRACE_PROBE3(sv_ioctl_2, dev_t, dev, int, *rvalp, int, rc);

		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
		/* NOTREACHED */
	case SVIOC_LIST:

		if (ilp32) {
			if (ddi_copyin((void *)arg, &svl32,
			    sizeof (svl32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = (spcs_s_info_t)svl32.svl_error;
			size = svl32.svl_count;
			usvn = (void *)(unsigned long)svl32.svl_names;
		} else {
			if (ddi_copyin((void *)arg, &svl,
			    sizeof (svl), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = svl.svl_error;
			size = svl.svl_count;
			usvn = svl.svl_names;
		}

		/* Do some boundary checking */
		if ((size < 0) || (size > sv_max_devices)) {
			/* Array size is out of range */
			return (spcs_s_ocopyoutf(&kstatus, ustatus,
			    SV_EARRBOUNDS, "0",
			    spcs_s_inttostring(sv_max_devices, itmp1,
			    sizeof (itmp1), 0),
			    spcs_s_inttostring(size, itmp2,
			    sizeof (itmp2), 0)));
		}

		if (ilp32)
			bytes = size * sizeof (sv_name32_t);
		else
			bytes = size * sizeof (sv_name_t);

		/* Allocate memory for the array of structures */
		if (bytes != 0) {
			svn = kmem_zalloc(bytes, KM_SLEEP);
			if (!svn) {
				return (spcs_s_ocopyoutf(&kstatus,
				    ustatus, ENOMEM));
			}
		}

		rc = sv_list(svn, size, rvalp, ilp32);
		if (rc) {
			if (svn != NULL)
				kmem_free(svn, bytes);
			return (spcs_s_ocopyoutf(&kstatus, ustatus, rc));
		}

		if (ilp32) {
			svl32.svl_timestamp = (uint32_t)sv_config_time;
			svl32.svl_maxdevs = (int32_t)sv_max_devices;

			/* Return the list structure */
			if (ddi_copyout(&svl32, (void *)arg,
			    sizeof (svl32), mode) < 0) {
				spcs_s_kfree(kstatus);
				if (svn != NULL)
					kmem_free(svn, bytes);
				return (EFAULT);
			}
		} else {
			svl.svl_timestamp = sv_config_time;
			svl.svl_maxdevs = sv_max_devices;

			/* Return the list structure */
			if (ddi_copyout(&svl, (void *)arg,
			    sizeof (svl), mode) < 0) {
				spcs_s_kfree(kstatus);
				if (svn != NULL)
					kmem_free(svn, bytes);
				return (EFAULT);
			}
		}

		/* Return the array */
		if (svn != NULL) {
			if (ddi_copyout(svn, usvn, bytes, mode) < 0) {
				kmem_free(svn, bytes);
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
			kmem_free(svn, bytes);
		}

		DTRACE_PROBE3(sv_ioctl_3, dev_t, dev, int, *rvalp, int, 0);

		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
		/* NOTREACHED */

	case SVIOC_VERSION:

		if (ilp32) {
			sv_version32_t svv32;

			if (ddi_copyin((void *)arg, &svv32,
			    sizeof (svv32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svv32.svv_major_rev = sv_major_rev;
			svv32.svv_minor_rev = sv_minor_rev;
			svv32.svv_micro_rev = sv_micro_rev;
			svv32.svv_baseline_rev = sv_baseline_rev;

			if (ddi_copyout(&svv32, (void *)arg,
			    sizeof (svv32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = (spcs_s_info_t)svv32.svv_error;
		} else {
			if (ddi_copyin((void *)arg, &svv,
			    sizeof (svv), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svv.svv_major_rev = sv_major_rev;
			svv.svv_minor_rev = sv_minor_rev;
			svv.svv_micro_rev = sv_micro_rev;
			svv.svv_baseline_rev = sv_baseline_rev;

			if (ddi_copyout(&svv, (void *)arg,
			    sizeof (svv), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = svv.svv_error;
		}

		DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, 0);

		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
		/* NOTREACHED */

	case SVIOC_UNLOAD:
		rc = sv_prepare_unload();

		if (ddi_copyout(&rc, (void *)arg, sizeof (rc), mode) < 0) {
			rc = EFAULT;
		}

		spcs_s_kfree(kstatus);
		return (rc);

	default:
		spcs_s_kfree(kstatus);

		DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, EINVAL);

		return (EINVAL);
		/* NOTREACHED */
	}

	/* NOTREACHED */
}
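/*
 * Illustrative userland use of SVIOC_VERSION (the device path is an
 * assumption based on the "sv" minor node created in sv_attach(); it
 * is not defined in this file):
 *
 *	sv_version_t svv;
 *	int fd = open("/dev/sv", O_RDONLY);
 *
 *	if (fd >= 0 && ioctl(fd, SVIOC_VERSION, &svv) == 0)
 *		(void) printf("sv %d.%d.%d\n", svv.svv_major_rev,
 *		    svv.svv_minor_rev, svv.svv_micro_rev);
 */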
/* ARGSUSED */
static int
svprint(dev_t dev, char *str)
{
	int instance = ddi_get_instance(sv_dip);
	cmn_err(CE_WARN, "!%s%d: %s", ddi_get_name(sv_dip), instance, str);
	return (0);
}


static void
_sv_lyr_strategy(struct buf *bp)
{
	caddr_t buf_addr;		/* pointer to linear buffer in bp */
	nsc_buf_t *bufh = NULL;
	nsc_buf_t *hndl = NULL;
	sv_dev_t *svp;
	nsc_vec_t *v;
	sv_maj_t *maj;
	nsc_size_t fba_req, fba_len;	/* FBA lengths */
	nsc_off_t fba_off;		/* FBA offset */
	size_t tocopy, nbytes;		/* byte lengths */
	int rw, rc;			/* flags and return codes */
	int (*fn)();

	rc = 0;

	if (sv_debug > 5)
		cmn_err(CE_CONT, "!_sv_lyr_strategy(%p)\n", (void *)bp);

	svp = sv_find_enabled(bp->b_edev, &maj);
	if (svp == NULL) {
		if (maj && (fn = maj->sm_strategy) != 0) {
			if (!(maj->sm_flag & D_MP)) {
				UNSAFE_ENTER();
				rc = (*fn)(bp);
				UNSAFE_EXIT();
			} else {
				rc = (*fn)(bp);
			}
			return;
		} else {
			bioerror(bp, ENODEV);
			biodone(bp);
			return;
		}
	}

	ASSERT(RW_READ_HELD(&svp->sv_lock));

	if (svp->sv_flag == 0) {
		/*
		 * guard access mode
		 * - prevent user level access to the device
		 */
		DTRACE_PROBE1(sv_lyr_strategy_err_guard, struct buf *, bp);
		bioerror(bp, EPERM);
		goto out;
	}

	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_rsrv, struct buf *, bp);

		if (rc == EINTR)
			cmn_err(CE_WARN, "!nsc_reserve() returned EINTR");
		bioerror(bp, rc);
		goto out;
	}

	if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) {
		DTRACE_PROBE1(sv_lyr_strategy_eof, struct buf *, bp);

		if (bp->b_flags & B_READ) {
			/* return EOF, not an error */
			bp->b_resid = bp->b_bcount;
			bioerror(bp, 0);
		} else
			bioerror(bp, EINVAL);

		goto done;
	}

	/*
	 * Preallocate a handle once per call to strategy.
	 * If this fails, then the nsc_alloc_buf() will allocate
	 * a temporary handle per allocation/free pair.
	 */

	DTRACE_PROBE1(sv_dbg_alloch_start, sv_dev_t *, svp);

	bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL);

	DTRACE_PROBE1(sv_dbg_alloch_end, sv_dev_t *, svp);

	if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_hactive, struct buf *, bp);

		cmn_err(CE_WARN,
		    "!sv: allocated active handle (bufh %p, flags %x)",
		    (void *)bufh, bufh->sb_flag);

		bioerror(bp, ENXIO);
		goto done;
	}

	fba_req = FBA_LEN(bp->b_bcount);
	if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks)
		fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno);

	rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE;

	bp_mapin(bp);

	bp->b_resid = bp->b_bcount;
	buf_addr = bp->b_un.b_addr;
	fba_off = 0;

	/*
	 * fba_req - requested size of transfer in FBAs after
	 *		truncation to device extent, and allowing for
	 *		possible non-FBA bounded final chunk.
	 * fba_off - offset of start of chunk from start of bp in FBAs.
	 * fba_len - size of this chunk in FBAs.
	 */
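	/*
	 * Worked example (illustrative): with 512-byte FBAs, a 1MB
	 * request is fba_req = FBA_LEN(1048576) = 2048.  If the
	 * underlying device reports sv_maxfbas == 256, the loop below
	 * runs 8 times, advancing fba_off by 256 on each pass until
	 * fba_req reaches zero.
	 */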
loop:
	fba_len = min(fba_req, svp->sv_maxfbas);
	hndl = bufh;

	DTRACE_PROBE4(sv_dbg_allocb_start,
	    sv_dev_t *, svp,
	    uint64_t, (uint64_t)(bp->b_lblkno + fba_off),
	    uint64_t, (uint64_t)fba_len,
	    int, rw);

	rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off),
	    fba_len, rw, &hndl);

	DTRACE_PROBE1(sv_dbg_allocb_end, sv_dev_t *, svp);

	if (rc > 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_alloc, struct buf *, bp);
		bioerror(bp, rc);
		if (hndl != bufh)
			(void) nsc_free_buf(hndl);
		hndl = NULL;
		goto done;
	}

	tocopy = min(FBA_SIZE(fba_len), bp->b_resid);
	v = hndl->sb_vec;

	if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) {
		/*
		 * Not overwriting all of the last FBA, so read in the
		 * old contents now before we overwrite it with the new
		 * data.
		 */

		DTRACE_PROBE2(sv_dbg_read_start, sv_dev_t *, svp,
		    uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1));

		rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0);
		if (rc > 0) {
			bioerror(bp, rc);
			goto done;
		}

		DTRACE_PROBE1(sv_dbg_read_end, sv_dev_t *, svp);
	}

	DTRACE_PROBE1(sv_dbg_bcopy_start, sv_dev_t *, svp);

	while (tocopy > 0) {
		nbytes = min(tocopy, (nsc_size_t)v->sv_len);

		if (bp->b_flags & B_READ)
			(void) bcopy(v->sv_addr, buf_addr, nbytes);
		else
			(void) bcopy(buf_addr, v->sv_addr, nbytes);

		bp->b_resid -= nbytes;
		buf_addr += nbytes;
		tocopy -= nbytes;
		v++;
	}

	DTRACE_PROBE1(sv_dbg_bcopy_end, sv_dev_t *, svp);

	if ((bp->b_flags & B_READ) == 0) {
		DTRACE_PROBE3(sv_dbg_write_start, sv_dev_t *, svp,
		    uint64_t, (uint64_t)hndl->sb_pos,
		    uint64_t, (uint64_t)hndl->sb_len);

		rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0);

		DTRACE_PROBE1(sv_dbg_write_end, sv_dev_t *, svp);

		if (rc > 0) {
			bioerror(bp, rc);
			goto done;
		}
	}

	/*
	 * Adjust FBA offset and requested (ie. remaining) length,
	 * loop if more data to transfer.
	 */

	fba_off += fba_len;
	fba_req -= fba_len;

	if (fba_req > 0) {
		DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);

		rc = nsc_free_buf(hndl);

		DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);

		if (rc > 0) {
			DTRACE_PROBE1(sv_lyr_strategy_err_free,
			    struct buf *, bp);
			bioerror(bp, rc);
		}

		hndl = NULL;

		if (rc <= 0)
			goto loop;
	}

done:
	if (hndl != NULL) {
		DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);

		rc = nsc_free_buf(hndl);

		DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);

		if (rc > 0) {
			DTRACE_PROBE1(sv_lyr_strategy_err_free,
			    struct buf *, bp);
			bioerror(bp, rc);
		}

		hndl = NULL;
	}

	if (bufh)
		(void) nsc_free_handle(bufh);

	DTRACE_PROBE1(sv_dbg_rlse_start, sv_dev_t *, svp);

	nsc_release(svp->sv_fd);

	DTRACE_PROBE1(sv_dbg_rlse_end, sv_dev_t *, svp);

out:
	if (sv_debug > 5) {
		cmn_err(CE_CONT,
		    "!_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n",
		    (void *)bp, (void *)bufh, bp->b_error);
	}

	DTRACE_PROBE2(sv_lyr_strategy_end, struct buf *, bp, int, bp->b_error);

	rw_exit(&svp->sv_lock);
	biodone(bp);
}
static void
sv_async_strategy(blind_t arg)
{
	struct buf *bp = (struct buf *)arg;
	_sv_lyr_strategy(bp);
}


static int
sv_lyr_strategy(struct buf *bp)
{
	nsthread_t *tp;
	int nlive;

	/*
	 * If B_ASYNC was part of the DDI we could use it as a hint to
	 * not create a thread for synchronous i/o.
	 */
	if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
		/* not sv enabled - just pass through */
		DTRACE_PROBE1(sv_lyr_strategy_notsv, struct buf *, bp);
		_sv_lyr_strategy(bp);
		return (0);
	}

	if (sv_debug > 4) {
		cmn_err(CE_CONT, "!sv_lyr_strategy: nthread %d nlive %d\n",
		    nst_nthread(sv_tset), nst_nlive(sv_tset));
	}

	/*
	 * If there are only guard devices enabled there
	 * won't be a threadset, so don't try and use it.
	 */
	tp = NULL;
	if (sv_tset != NULL) {
		tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
	}

	if (tp == NULL) {
		/*
		 * out of threads, so fall back to synchronous io.
		 */
		if (sv_debug > 0) {
			cmn_err(CE_CONT,
			    "!sv_lyr_strategy: thread alloc failed\n");
		}

		DTRACE_PROBE1(sv_lyr_strategy_no_thread,
		    struct buf *, bp);

		_sv_lyr_strategy(bp);
		sv_no_threads++;
	} else {
		nlive = nst_nlive(sv_tset);
		if (nlive > sv_max_nlive) {
			if (sv_debug > 0) {
				cmn_err(CE_CONT,
				    "!sv_lyr_strategy: "
				    "new max nlive %d (nthread %d)\n",
				    nlive, nst_nthread(sv_tset));
			}

			sv_max_nlive = nlive;
		}
	}

	return (0);
}
static void
sv_async_strategy(blind_t arg)
{
	struct buf *bp = (struct buf *)arg;
	_sv_lyr_strategy(bp);
}


static int
sv_lyr_strategy(struct buf *bp)
{
	nsthread_t *tp;
	int nlive;

	/*
	 * If B_ASYNC was part of the DDI we could use it as a hint to
	 * not create a thread for synchronous i/o.
	 */
	if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
		/* not sv enabled - just pass through */
		DTRACE_PROBE1(sv_lyr_strategy_notsv, struct buf *, bp);
		_sv_lyr_strategy(bp);
		return (0);
	}

	if (sv_debug > 4) {
		cmn_err(CE_CONT, "!sv_lyr_strategy: nthread %d nlive %d\n",
		    nst_nthread(sv_tset), nst_nlive(sv_tset));
	}

	/*
	 * If there are only guard devices enabled there
	 * won't be a threadset, so don't try to use it.
	 */
	tp = NULL;
	if (sv_tset != NULL) {
		tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
	}

	if (tp == NULL) {
		/*
		 * Out of threads, so fall back to synchronous i/o.
		 */
		if (sv_debug > 0) {
			cmn_err(CE_CONT,
			    "!sv_lyr_strategy: thread alloc failed\n");
		}

		DTRACE_PROBE1(sv_lyr_strategy_no_thread,
		    struct buf *, bp);

		_sv_lyr_strategy(bp);
		sv_no_threads++;
	} else {
		nlive = nst_nlive(sv_tset);
		if (nlive > sv_max_nlive) {
			if (sv_debug > 0) {
				cmn_err(CE_CONT,
				    "!sv_lyr_strategy: "
				    "new max nlive %d (nthread %d)\n",
				    nlive, nst_nthread(sv_tset));
			}

			sv_max_nlive = nlive;
		}
	}

	return (0);
}

/*
 * Re-write the size of the current partition.
 */
static int
sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	size_t offset;
	int ilp32;
	int pnum;
	int rc;

	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);

	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (pnum < 0 || pnum >= V_NUMPAR) {
		cmn_err(CE_WARN,
		    "!sv_gvtoc: unable to determine partition number "
		    "for dev %lx", svp->sv_dev);
		return (EINVAL);
	}

	if (ilp32) {
		int32_t p_size;

#ifdef _SunOS_5_6
		offset = offsetof(struct vtoc, v_part);
		offset += sizeof (struct partition) * pnum;
		offset += offsetof(struct partition, p_size);
#else
		offset = offsetof(struct vtoc32, v_part);
		offset += sizeof (struct partition32) * pnum;
		offset += offsetof(struct partition32, p_size);
#endif

		p_size = (int32_t)svp->sv_nblocks;
		if (p_size == 0) {
			if (sv_reserve(svp->sv_fd,
			    NSC_MULTI|NSC_PCATCH) == 0) {
				p_size = (int32_t)svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			} else {
				rc = EINTR;
			}
		}

		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
		    sizeof (p_size), mode) != 0) {
			rc = EFAULT;
		}
	} else {
		long p_size;

		offset = offsetof(struct vtoc, v_part);
		offset += sizeof (struct partition) * pnum;
		offset += offsetof(struct partition, p_size);

		p_size = (long)svp->sv_nblocks;
		if (p_size == 0) {
			if (sv_reserve(svp->sv_fd,
			    NSC_MULTI|NSC_PCATCH) == 0) {
				p_size = (long)svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			} else {
				rc = EINTR;
			}
		}

		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
		    sizeof (p_size), mode) != 0) {
			rc = EFAULT;
		}
	}

	return (rc);
}

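/*
 * Sketch of the ddi_copyout() arithmetic above (illustrative only):
 * for partition 2 in the ILP32 case the patched field lives at
 *
 *	arg + offsetof(struct vtoc32, v_part)
 *	    + 2 * sizeof (struct partition32)
 *	    + offsetof(struct partition32, p_size)
 *
 * so only the p_size slot of the vtoc already copied out by the
 * underlying driver is rewritten; the rest of the caller's buffer is
 * left untouched.
 */
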
#ifdef DKIOCPARTITION
/*
 * Re-write the size of the current partition.
 *
 * arg is dk_efi_t.
 *
 * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64;
 *
 * dk_efi_t->dki_data --> efi_gpt_t (label header)
 * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions)
 *
 * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts
 * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself
 *
 * This assumes that sizeof (efi_gpt_t) is the same as the size of a
 * logical block on the disk.
 *
 * Everything is little endian (i.e. disk format).
 */
static int
sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	dk_efi_t efi;
	efi_gpt_t gpt;
	efi_gpe_t *gpe = NULL;
	size_t sgpe;
	uint64_t p_size;	/* virtual partition size from nsctl */
	uint32_t crc;
	int unparts;		/* number of parts in user's array */
	int pnum;
	int rc;

	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (pnum < 0) {
		cmn_err(CE_WARN,
		    "!sv_efi: unable to determine partition number for dev %lx",
		    svp->sv_dev);
		return (EINVAL);
	}

	if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) {
		return (EFAULT);
	}

	efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;

	if (efi.dki_length < sizeof (gpt) + sizeof (*gpe)) {
		return (EINVAL);
	}

	if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) {
		rc = EFAULT;
		goto out;
	}

	if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0)
		unparts = 1;
	else if (pnum >= unparts) {
		cmn_err(CE_WARN,
		    "!sv_efi: partition# beyond end of user array (%d >= %d)",
		    pnum, unparts);
		return (EINVAL);
	}

	sgpe = sizeof (*gpe) * unparts;
	gpe = kmem_alloc(sgpe, KM_SLEEP);

	if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) {
		rc = EFAULT;
		goto out;
	}

	p_size = svp->sv_nblocks;
	if (p_size == 0) {
		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
			p_size = (diskaddr_t)svp->sv_nblocks;
			nsc_release(svp->sv_fd);
		} else {
			rc = EINTR;
		}
	}

	gpe[pnum].efi_gpe_EndingLBA = LE_64(
	    LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1);

	gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
	CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
	gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);

	gpt.efi_gpt_HeaderCRC32 = 0;
	CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
	gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);

	if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) {
		rc = EFAULT;
		goto out;
	}

	if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) {
		rc = EFAULT;
		goto out;
	}

out:
	if (gpe) {
		kmem_free(gpe, sgpe);
	}

	return (rc);
}

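/*
 * Sketch of the EFI checksum convention applied above (standard EFI
 * behaviour, not sv specific): each CRC field is zeroed, the CRC32 is
 * computed over the containing object, and the bitwise complement is
 * stored little endian:
 *
 *	gpt.efi_gpt_HeaderCRC32 = 0;
 *	CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
 *	gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
 *
 * The partition entry array CRC must be recomputed first, since that
 * field is itself covered by the header CRC.
 */
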
/*
 * Re-write the size of the partition specified by p_partno.
 *
 * Note that if a DKIOCPARTITION is issued to an fd opened against a
 * non-sv'd device, but p_partno requests the size for a different
 * device that is sv'd, this function will *not* be called as sv is
 * not interposed on the original device (the fd).
 *
 * It would not be easy to change this as we cannot get the partition
 * number for the non-sv'd device, so cannot compute the dev_t of the
 * (sv'd) p_partno device, and so cannot find out if it is sv'd or get
 * its size from nsctl.
 *
 * See also the "Bug 4755783" comment in sv_lyr_ioctl().
 */
static int
sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	struct partition64 p64;
	sv_dev_t *nsvp = NULL;
	diskaddr_t p_size;
	minor_t nminor;
	int pnum, rc;
	dev_t ndev;

	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) {
		return (EFAULT);
	}

	if (p64.p_partno != pnum) {
		/* switch to requested partition, not the current one */
		nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
		ndev = makedevice(getmajor(svp->sv_dev), nminor);
		nsvp = sv_find_enabled(ndev, NULL);
		if (nsvp == NULL) {
			/* not sv device - just return */
			return (0);
		}

		svp = nsvp;
	}

	p_size = svp->sv_nblocks;
	if (p_size == 0) {
		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
			p_size = (diskaddr_t)svp->sv_nblocks;
			nsc_release(svp->sv_fd);
		} else {
			rc = EINTR;
		}
	}

	if (nsvp != NULL) {
		rw_exit(&nsvp->sv_lock);
	}

	if ((rc == 0) && ddi_copyout(&p_size,
	    (void *)(arg + offsetof(struct partition64, p_size)),
	    sizeof (p_size), mode) != 0) {
		return (EFAULT);
	}

	return (rc);
}
#endif /* DKIOCPARTITION */

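/*
 * Illustration of the minor number arithmetic in sv_fix_dkiocpartition()
 * above (a sketch; it assumes the usual convention that the slices of
 * one target occupy consecutive minor numbers): with the ioctl issued
 * against slice 0 (pnum == 0) and p64.p_partno == 4, the rebuilt dev_t
 * is that of slice 4 of the same target:
 *
 *	nminor = getminor(svp->sv_dev) + (4 - 0);
 *	ndev = makedevice(getmajor(svp->sv_dev), nminor);
 */
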
static int
sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg,
    const int mode, cred_t *crp, int *rvalp)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int rc = 0;

	maj = 0;
	fn = 0;

	/*
	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, the ioctl can
	 * proceed.  Otherwise it was previously SV_PREVENT_UNLOAD and
	 * is now SV_ALLOW_UNLOAD, and the driver is expected to unload
	 * eventually.
	 *
	 * SV_ALLOW_UNLOAD is the final state, so there is no need to
	 * grab sv_mutex.
	 */
	if (sv_mod_status == SV_ALLOW_UNLOAD) {
		return (EBUSY);
	}

	svp = sv_find_enabled(dev, &maj);
	if (svp != NULL) {
		if (nskernd_isdaemon()) {
			/*
			 * This is nskernd which always needs to see
			 * the underlying disk device accurately.
			 *
			 * So just pass the ioctl straight through
			 * to the underlying driver as though the device
			 * was not sv enabled.
			 */
			DTRACE_PROBE2(sv_lyr_ioctl_nskernd, sv_dev_t *, svp,
			    dev_t, dev);

			rw_exit(&svp->sv_lock);
			svp = NULL;
		} else {
			ASSERT(RW_READ_HELD(&svp->sv_lock));
		}
	}

	/*
	 * We now have a locked and enabled SV device, or a non-SV device.
	 */

	switch (cmd) {
	/*
	 * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI
	 * and DKIOCSETEFI are intercepted and faked up as some
	 * i/o providers emulate volumes of a different size to
	 * the underlying volume.
	 *
	 * Setting the size by rewriting the vtoc is not permitted.
	 */

	case DKIOCSVTOC:
#ifdef DKIOCPARTITION
	case DKIOCSETEFI:
#endif
		if (svp == NULL) {
			/* not intercepted -- allow ioctl through */
			break;
		}

		rw_exit(&svp->sv_lock);

		DTRACE_PROBE2(sv_lyr_ioctl_svtoc, dev_t, dev, int, EPERM);

		return (EPERM);

	default:
		break;
	}

	/*
	 * Pass through the real ioctl command.
	 */

	if (maj && (fn = maj->sm_ioctl) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			UNSAFE_ENTER();
			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
			UNSAFE_EXIT();
		} else {
			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
		}
	} else {
		rc = ENODEV;
	}

	/*
	 * Bug 4755783
	 * Fix up the size of the current partition to allow
	 * for the virtual volume to be a different size to the
	 * physical volume (e.g. for II compact dependent shadows).
	 *
	 * Note that this only attempts to fix up the current partition
	 * - the one that the ioctl was issued against.  There could be
	 * other sv'd partitions in the same vtoc, but we cannot tell,
	 * so we don't attempt to fix them up.
	 */

	if (svp != NULL && rc == 0) {
		switch (cmd) {
		case DKIOCGVTOC:
			rc = sv_fix_dkiocgvtoc(arg, mode, svp);
			break;

#ifdef DKIOCPARTITION
		case DKIOCGETEFI:
			rc = sv_fix_dkiocgetefi(arg, mode, svp);
			break;

		case DKIOCPARTITION:
			rc = sv_fix_dkiocpartition(arg, mode, svp);
			break;
#endif /* DKIOCPARTITION */
		}
	}

	if (svp != NULL) {
		rw_exit(&svp->sv_lock);
	}

	return (rc);
}