/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * Storage Volume Character and Block Driver (SV)
 *
 * This driver implements a simplistic /dev/{r}dsk/ interface to a
 * specified disk volume that is otherwise managed by the Prism
 * software.  The SV driver layers itself onto the underlying disk
 * device driver by changing function pointers in the cb_ops
 * structure.
 *
 * CONFIGURATION:
 *
 * 1. Configure the driver using the svadm utility.
 * 2. Access the device as before through /dev/rdsk/c?t?d?s?
 *
 * LIMITATIONS:
 *
 * This driver should NOT be used to share a device between another
 * DataServices user interface module (e.g., STE) and a user accessing
 * the device through the block device in O_WRITE mode.  This is because
 * writes through the block device are asynchronous (due to the page
 * cache) and so consistency between the block device user and the
 * STE user cannot be guaranteed.
 *
 * Data is copied between system struct buf(9s) and nsc_vec_t.  This is
 * wasteful and slow.
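 *
 * (Each chunk of a transfer is staged through an nsc_buf_t handle and
 * copied with bcopy() in _sv_lyr_strategy() below.)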
 */

#include <sys/debug.h>
#include <sys/types.h>

#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/varargs.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/uio.h>
#ifndef DS_DDICT
#include <sys/pathname.h>
#endif
#include <sys/aio_req.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/cmn_err.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/nsctl/nsvers.h>

#include <sys/nsc_thread.h>
#include <sys/unistat/spcs_s.h>
#include <sys/unistat/spcs_s_k.h>
#include <sys/unistat/spcs_errors.h>

#ifdef DS_DDICT
#include "../contract.h"
#endif

#include "../nsctl.h"


#include <sys/sdt.h>		/* dtrace is S10 or later */

#include "sv.h"
#include "sv_impl.h"
#include "sv_efi.h"

#define	MAX_EINTR_COUNT	1000

/*
 * sv_mod_status
 */
#define	SV_PREVENT_UNLOAD	1
#define	SV_ALLOW_UNLOAD		2

static const int sv_major_rev = ISS_VERSION_MAJ;	/* Major number */
static const int sv_minor_rev = ISS_VERSION_MIN;	/* Minor number */
static const int sv_micro_rev = ISS_VERSION_MIC;	/* Micro number */
static const int sv_baseline_rev = ISS_VERSION_NUM;	/* Baseline number */

#ifdef DKIOCPARTITION
/*
 * CRC32 polynomial table needed for computing the checksums
 * in an EFI vtoc.
 */
static const uint32_t sv_crc32_table[256] = { CRC32_TABLE };
#endif

static clock_t sv_config_time;		/* Time of successful {en,dis}able */
static int sv_debug;			/* Set non-zero for debug to syslog */
static int sv_mod_status;		/* Set to prevent modunload */

static dev_info_t *sv_dip;		/* Single DIP for driver */
static kmutex_t sv_mutex;		/* Protect global lists, etc. */

static nsc_mem_t	*sv_mem;	/* nsctl memory allocator token */


/*
 * Per device and per major state.
 */

#ifndef _SunOS_5_6
#define	UNSAFE_ENTER()
#define	UNSAFE_EXIT()
#else
#define	UNSAFE_ENTER()	mutex_enter(&unsafe_driver)
#define	UNSAFE_EXIT()	mutex_exit(&unsafe_driver)
#endif

/* hash table of major dev structures */
static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0};
static sv_dev_t *sv_devs;		/* array of per device structures */
static int sv_max_devices;		/* SV version of nsc_max_devices() */
static int sv_ndevices;			/* number of SV enabled devices */

/*
 * Threading.
 */

int sv_threads_max = 1024;		/* maximum # to dynamically alloc */
int sv_threads = 32;			/* # to pre-allocate (see sv.conf) */
int sv_threads_extra = 0;		/* addl # we would have alloc'ed */

static nstset_t *sv_tset;		/* the threadset pointer */

static int sv_threads_hysteresis = 4;	/* hysteresis for threadset resizing */
static int sv_threads_dev = 2;		/* # of threads to alloc per device */
static int sv_threads_inc = 8;		/* increment for changing the set */
static int sv_threads_needed;		/* number of threads needed */
static int sv_no_threads;		/* number of nsc_create errors */
static int sv_max_nlive;		/* max number of threads running */



/*
 * nsctl fd callbacks.
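 *
 * (svattach_fd() and svdetach_fd() are registered with nsctl through
 * sv_fd_def below; nsctl calls them when the underlying nsc_fd_t is
 * attached or detached, and they cache or clear the cached partition
 * size and maximum transfer size used by the strategy code.)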
 */

static int svattach_fd(blind_t);
static int svdetach_fd(blind_t);

static nsc_def_t sv_fd_def[] = {
	{ "Attach",	(uintptr_t)svattach_fd, },
	{ "Detach",	(uintptr_t)svdetach_fd, },
	{ 0, 0, }
};

/*
 * cb_ops functions.
 */

static int svopen(dev_t *, int, int, cred_t *);
static int svclose(dev_t, int, int, cred_t *);
static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int svprint(dev_t, char *);

/*
 * These next functions are layered into the underlying driver's devops.
 */

static int sv_lyr_open(dev_t *, int, int, cred_t *);
static int sv_lyr_close(dev_t, int, int, cred_t *);
static int sv_lyr_strategy(struct buf *);
static int sv_lyr_read(dev_t, struct uio *, cred_t *);
static int sv_lyr_write(dev_t, struct uio *, cred_t *);
static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *);
static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *);
static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

static struct cb_ops sv_cb_ops = {
	svopen,		/* open */
	svclose,	/* close */
	nulldev,	/* strategy */
	svprint,
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	svioctl,
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,		/* NOT a stream */
	D_NEW | D_MP | D_64BIT,
	CB_REV,
	nodev,		/* aread */
	nodev,		/* awrite */
};


/*
 * dev_ops functions.
 */

static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int sv_attach(dev_info_t *, ddi_attach_cmd_t);
static int sv_detach(dev_info_t *, ddi_detach_cmd_t);

static struct dev_ops sv_ops = {
	DEVO_REV,
	0,
	sv_getinfo,
	nulldev,	/* identify */
	nulldev,	/* probe */
	sv_attach,
	sv_detach,
	nodev,		/* reset */
	&sv_cb_ops,
	(struct bus_ops *)0
};

/*
 * Module linkage.
 */

extern struct mod_ops mod_driverops;

static struct modldrv modldrv = {
	&mod_driverops,
	"nws:Storage Volume:" ISS_VERSION_STR,
	&sv_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	0
};


int
_init(void)
{
	int error;

	mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL);

	if ((error = mod_install(&modlinkage)) != 0) {
		mutex_destroy(&sv_mutex);
		return (error);
	}

#ifdef DEBUG
	cmn_err(CE_CONT, "!sv (revision %d.%d.%d.%d, %s, %s)\n",
	    sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev,
	    ISS_VERSION_STR, BUILD_DATE_STR);
#else
	if (sv_micro_rev) {
		cmn_err(CE_CONT, "!sv (revision %d.%d.%d, %s, %s)\n",
		    sv_major_rev, sv_minor_rev, sv_micro_rev,
		    ISS_VERSION_STR, BUILD_DATE_STR);
	} else {
		cmn_err(CE_CONT, "!sv (revision %d.%d, %s, %s)\n",
		    sv_major_rev, sv_minor_rev,
		    ISS_VERSION_STR, BUILD_DATE_STR);
	}
#endif

	return (error);
}


int
_fini(void)
{
	int error;

	if ((error = mod_remove(&modlinkage)) != 0)
		return (error);

	mutex_destroy(&sv_mutex);

	return (error);
}


int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}


/*
 * Locking & State.
 *
 * sv_mutex protects config information - sv_maj_t and sv_dev_t lists;
 * threadset creation and sizing; sv_ndevices.
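 *
 * (For example, sv_enable() takes sv_mutex and then the per-device
 * sv_lock (RW_WRITER); sv_free() drops them in the reverse order.)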
 *
 * If we need to hold both sv_mutex and sv_lock, then the sv_mutex
 * must be acquired first.
 *
 * sv_lock protects the sv_dev_t structure for an individual device.
 *
 * sv_olock protects the otyp/open members of the sv_dev_t.  If we need
 * to hold both sv_lock and sv_olock, then the sv_lock must be acquired
 * first.
 *
 * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple
 * I/O operations to a device simultaneously, as above.
 *
 * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur
 * with sv_lock write-locked must be done with (sv_state == SV_PENDING)
 * and (sv_pending == curthread) so that any recursion through
 * sv_lyr_open/sv_lyr_close can be detected.
 */


static int
sv_init_devs(void)
{
	int i;

	ASSERT(MUTEX_HELD(&sv_mutex));

	if (sv_max_devices > 0)
		return (0);

	sv_max_devices = nsc_max_devices();

	if (sv_max_devices <= 0) {
		/* nsctl is not attached (nskernd not running) */
		if (sv_debug > 0)
			cmn_err(CE_CONT, "!sv: nsc_max_devices = 0\n");
		return (EAGAIN);
	}

	sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)),
	    KM_NOSLEEP, sv_mem);

	if (sv_devs == NULL) {
		cmn_err(CE_WARN, "!sv: could not allocate sv_devs array");
		return (ENOMEM);
	}

	for (i = 0; i < sv_max_devices; i++) {
		mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL);
		rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL);
	}

	if (sv_debug > 0)
		cmn_err(CE_CONT, "!sv: sv_init_devs successful\n");

	return (0);
}


static int
sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int rc;

	switch (cmd) {

	case DDI_ATTACH:
		sv_dip = dip;

		if (ddi_create_minor_node(dip, "sv", S_IFCHR,
		    0, DDI_PSEUDO, 0) != DDI_SUCCESS)
			goto failed;

		mutex_enter(&sv_mutex);

		sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0);
		if (sv_mem == NULL) {
			mutex_exit(&sv_mutex);
			goto failed;
		}

		rc = sv_init_devs();
		if (rc != 0 && rc != EAGAIN) {
			mutex_exit(&sv_mutex);
			goto failed;
		}

		mutex_exit(&sv_mutex);


		ddi_report_dev(dip);

		sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
		    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
		    "sv_threads", sv_threads);

		if (sv_debug > 0)
			cmn_err(CE_CONT, "!sv: sv_threads=%d\n", sv_threads);

		if (sv_threads > sv_threads_max)
			sv_threads_max = sv_threads;

		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

failed:
	DTRACE_PROBE(sv_attach_failed);
	(void) sv_detach(dip, DDI_DETACH);
	return (DDI_FAILURE);
}


static int
sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	sv_dev_t *svp;
	int i;

	switch (cmd) {

	case DDI_DETACH:

		/*
		 * Check that everything is disabled.
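		 * Every sv_dev_t must be in the SV_DISABLE state and the
		 * module must not be pinned by SV_PREVENT_UNLOAD.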
		 */

		mutex_enter(&sv_mutex);

		if (sv_mod_status == SV_PREVENT_UNLOAD) {
			mutex_exit(&sv_mutex);
			DTRACE_PROBE(sv_detach_err_prevent);
			return (DDI_FAILURE);
		}

		for (i = 0; sv_devs && i < sv_max_devices; i++) {
			svp = &sv_devs[i];

			if (svp->sv_state != SV_DISABLE) {
				mutex_exit(&sv_mutex);
				DTRACE_PROBE(sv_detach_err_busy);
				return (DDI_FAILURE);
			}
		}


		for (i = 0; sv_devs && i < sv_max_devices; i++) {
			mutex_destroy(&sv_devs[i].sv_olock);
			rw_destroy(&sv_devs[i].sv_lock);
		}

		if (sv_devs) {
			nsc_kmem_free(sv_devs,
			    (sv_max_devices * sizeof (*sv_devs)));
			sv_devs = NULL;
		}
		sv_max_devices = 0;

		if (sv_mem) {
			nsc_unregister_mem(sv_mem);
			sv_mem = NULL;
		}

		mutex_exit(&sv_mutex);

		/*
		 * Remove all minor nodes.
		 */

		ddi_remove_minor_node(dip, NULL);
		sv_dip = NULL;

		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}
}

static sv_maj_t *
sv_getmajor(const dev_t dev)
{
	sv_maj_t **insert, *maj;
	major_t umaj = getmajor(dev);

	/*
	 * See if the hash table entry, or one of the hash chains
	 * is already allocated for this major number
	 */
	if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) {
		do {
			if (maj->sm_major == umaj)
				return (maj);
		} while ((maj = maj->sm_next) != 0);
	}

	/*
	 * If the sv_mutex is held, there is a design flaw, as the only
	 * non-mutex held callers can be sv_enable() or sv_dev_to_sv().
	 * Return an error, instead of panicking the system.
	 */
	if (MUTEX_HELD(&sv_mutex)) {
		cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
		return (NULL);
	}

	/*
	 * Determine where to allocate a new element in the hash table
	 */
	mutex_enter(&sv_mutex);
	insert = &(sv_majors[SV_MAJOR_HASH(umaj)]);
	for (maj = *insert; maj; maj = maj->sm_next) {

		/* Did another thread beat us to it? */
		if (maj->sm_major == umaj)
			return (maj);

		/* Find a NULL insert point? */
		if (maj->sm_next == NULL)
			insert = &maj->sm_next;
	}

	/*
	 * Located the new insert point
	 */
	*insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem);
	if ((maj = *insert) != 0)
		maj->sm_major = umaj;
	else
		cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");

	mutex_exit(&sv_mutex);

	return (maj);
}

/* ARGSUSED */

static int
sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int rc = DDI_FAILURE;

	switch (infocmd) {

	case DDI_INFO_DEVT2DEVINFO:
		*result = sv_dip;
		rc = DDI_SUCCESS;
		break;

	case DDI_INFO_DEVT2INSTANCE:
		/*
		 * We only have a single instance.
		 */
		*result = 0;
		rc = DDI_SUCCESS;
		break;

	default:
		break;
	}

	return (rc);
}


/*
 * Hashing of devices onto major device structures.
 *
 * Individual device structures are hashed onto one of the sm_hash[]
 * buckets in the relevant major device structure.
 *
 * Hash insertion and deletion -must- be done with sv_mutex held.  Hash
 * searching does not require the mutex because of the sm_seq member.
 * sm_seq is incremented on each insertion (-after- hash chain pointer
 * manipulation) and each deletion (-before- hash chain pointer
 * manipulation).
 * When searching the hash chain, the seq number is checked before
 * accessing each device structure, if the seq number has changed,
 * then we restart the search from the top of the hash chain.  If we
 * restart more than SV_HASH_RETRY times, we take sv_mutex and search
 * the hash chain (we are guaranteed that this search cannot be
 * interrupted).
 */

#define	SV_HASH_RETRY	16

static sv_dev_t *
sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp)
{
	minor_t umin = getminor(dev);
	sv_dev_t **hb, *next, *svp;
	sv_maj_t *maj;
	int seq;
	int try;

	/* Get major hash table */
	maj = sv_getmajor(dev);
	if (majpp)
		*majpp = maj;
	if (maj == NULL)
		return (NULL);

	if (maj->sm_inuse == 0) {
		DTRACE_PROBE1(
		    sv_dev_to_sv_end,
		    dev_t, dev);
		return (NULL);
	}

	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
	try = 0;

retry:
	if (try > SV_HASH_RETRY)
		mutex_enter(&sv_mutex);

	seq = maj->sm_seq;
	for (svp = *hb; svp; svp = next) {
		next = svp->sv_hash;

		nsc_membar_stld();	/* preserve register load order */

		if (maj->sm_seq != seq) {
			DTRACE_PROBE1(sv_dev_to_sv_retry, dev_t, dev);
			try++;
			goto retry;
		}

		if (svp->sv_dev == dev)
			break;
	}

	if (try > SV_HASH_RETRY)
		mutex_exit(&sv_mutex);

	return (svp);
}


/*
 * Must be called with sv_mutex held.
 */

static int
sv_get_state(const dev_t udev, sv_dev_t **svpp)
{
	sv_dev_t **hb, **insert, *svp;
	sv_maj_t *maj;
	minor_t umin;
	int i;

	/* Get major hash table */
	if ((maj = sv_getmajor(udev)) == NULL)
		return (SV_EBADDEV);

	/* Determine which minor hash table */
	umin = getminor(udev);
	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);

	/* look for clash */

	insert = hb;

	for (svp = *hb; svp; svp = svp->sv_hash) {
		if (svp->sv_dev == udev)
			break;

		if (svp->sv_hash == NULL)
			insert = &svp->sv_hash;
	}

	if (svp) {
		DTRACE_PROBE1(
		    sv_get_state_enabled,
		    dev_t, udev);
		return (SV_EENABLED);
	}

	/* look for spare sv_devs slot */

	for (i = 0; i < sv_max_devices; i++) {
		svp = &sv_devs[i];

		if (svp->sv_state == SV_DISABLE)
			break;
	}

	if (i >= sv_max_devices) {
		DTRACE_PROBE1(
		    sv_get_state_noslots,
		    dev_t, udev);
		return (SV_ENOSLOTS);
	}

	svp->sv_state = SV_PENDING;
	svp->sv_pending = curthread;

	*insert = svp;
	svp->sv_hash = NULL;
	maj->sm_seq++;		/* must be after the store to the hash chain */

	*svpp = svp;

	/*
	 * We do not know the size of the underlying device at
	 * this stage, so initialise "nblocks" property to
	 * zero, and update it whenever we succeed in
	 * nsc_reserve'ing the underlying nsc_fd_t.
	 */

	svp->sv_nblocks = 0;

	return (0);
}


/*
 * Remove a device structure from its hash chain.
 * Must be called with sv_mutex held.
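 * (sm_seq is incremented before the entry is unlinked so that the
 * lock-free readers in sv_dev_to_sv() notice the change and restart
 * their search.)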
 */

static void
sv_rm_hash(sv_dev_t *svp)
{
	sv_dev_t **svpp;
	sv_maj_t *maj;

	/* Get major hash table */
	if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
		return;

	/* remove svp from hash chain */

	svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]);
	while (*svpp) {
		if (*svpp == svp) {
			/*
			 * increment of sm_seq must be before the
			 * removal from the hash chain
			 */
			maj->sm_seq++;
			*svpp = svp->sv_hash;
			break;
		}

		svpp = &(*svpp)->sv_hash;
	}

	svp->sv_hash = NULL;
}

/*
 * Free (disable) a device structure.
 * Must be called with sv_lock(RW_WRITER) and sv_mutex held, and will
 * perform the exits during its processing.
 */

static int
sv_free(sv_dev_t *svp, const int error)
{
	struct cb_ops *cb_ops;
	sv_maj_t *maj;

	/* Get major hash table */
	if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
		return (NULL);

	svp->sv_state = SV_PENDING;
	svp->sv_pending = curthread;

	/*
	 * Close the fd's before removing from the hash or swapping
	 * back the cb_ops pointers so that the cache flushes before new
	 * io can come in.
	 */

	if (svp->sv_fd) {
		(void) nsc_close(svp->sv_fd);
		svp->sv_fd = 0;
	}

	sv_rm_hash(svp);

	if (error != SV_ESDOPEN &&
	    error != SV_ELYROPEN && --maj->sm_inuse == 0) {

		if (maj->sm_dev_ops)
			cb_ops = maj->sm_dev_ops->devo_cb_ops;
		else
			cb_ops = NULL;

		if (cb_ops && maj->sm_strategy != NULL) {
			cb_ops->cb_strategy = maj->sm_strategy;
			cb_ops->cb_close = maj->sm_close;
			cb_ops->cb_ioctl = maj->sm_ioctl;
			cb_ops->cb_write = maj->sm_write;
			cb_ops->cb_open = maj->sm_open;
			cb_ops->cb_read = maj->sm_read;
			cb_ops->cb_flag = maj->sm_flag;

			if (maj->sm_awrite)
				cb_ops->cb_awrite = maj->sm_awrite;

			if (maj->sm_aread)
				cb_ops->cb_aread = maj->sm_aread;

			/*
			 * corbin XXX
			 * Leave backing device ops in maj->sm_*
			 * to handle any requests that might come
			 * in during the disable.  This could be
			 * a problem however if the backing device
			 * driver is changed while we process these
			 * requests.
			 *
			 * maj->sm_strategy = 0;
			 * maj->sm_awrite = 0;
			 * maj->sm_write = 0;
			 * maj->sm_ioctl = 0;
			 * maj->sm_close = 0;
			 * maj->sm_aread = 0;
			 * maj->sm_read = 0;
			 * maj->sm_open = 0;
			 * maj->sm_flag = 0;
			 *
			 */
		}

		if (maj->sm_dev_ops) {
			maj->sm_dev_ops = 0;
		}
	}

	if (svp->sv_lh) {
		cred_t *crp = ddi_get_cred();

		/*
		 * Close the protective layered driver open using the
		 * Sun Private layered driver i/f.
		 */

		(void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp);
		svp->sv_lh = NULL;
	}

	svp->sv_timestamp = nsc_lbolt();
	svp->sv_state = SV_DISABLE;
	svp->sv_pending = NULL;
	rw_exit(&svp->sv_lock);
	mutex_exit(&sv_mutex);

	return (error);
}

/*
 * Reserve the device, taking into account the possibility that
 * the reserve might have to be retried.
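 * (nsc_reserve() may return EINTR; the loop below retries, pausing two
 * ticks between attempts, up to MAX_EINTR_COUNT times.)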
 */
static int
sv_reserve(nsc_fd_t *fd, int flags)
{
	int eintr_count;
	int rc;

	eintr_count = 0;
	do {
		rc = nsc_reserve(fd, flags);
		if (rc == EINTR) {
			++eintr_count;
			delay(2);
		}
	} while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT));

	return (rc);
}

static int
sv_enable(const caddr_t path, const int flag,
    const dev_t udev, spcs_s_info_t kstatus)
{
	struct dev_ops *dev_ops;
	struct cb_ops *cb_ops;
	sv_dev_t *svp;
	sv_maj_t *maj;
	nsc_size_t nblocks;
	int rc;
	cred_t *crp;
	ldi_ident_t li;

	if (udev == (dev_t)-1 || udev == 0) {
		DTRACE_PROBE1(
		    sv_enable_err_baddev,
		    dev_t, udev);
		return (SV_EBADDEV);
	}

	if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) {
		DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev);
		return (SV_EAMODE);
	}

	/* Get major hash table */
	if ((maj = sv_getmajor(udev)) == NULL)
		return (SV_EBADDEV);

	mutex_enter(&sv_mutex);

	rc = sv_get_state(udev, &svp);
	if (rc) {
		mutex_exit(&sv_mutex);
		DTRACE_PROBE1(sv_enable_err_state, dev_t, udev);
		return (rc);
	}

	rw_enter(&svp->sv_lock, RW_WRITER);

	/*
	 * Get real fd used for io
	 */

	svp->sv_dev = udev;
	svp->sv_flag = flag;

	/*
	 * OR in NSC_DEVICE to ensure that nskern grabs the real strategy
	 * function pointer before sv swaps them out.
	 */

	svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE),
	    sv_fd_def, (blind_t)udev, &rc);

	if (svp->sv_fd == NULL) {
		if (kstatus)
			spcs_s_add(kstatus, rc);
		DTRACE_PROBE1(sv_enable_err_fd, dev_t, udev);
		return (sv_free(svp, SV_ESDOPEN));
	}

	/*
	 * Perform a layered driver open using the Sun Private layered
	 * driver i/f to ensure that the cb_ops structure for the driver
	 * is not detached out from under us whilst sv is enabled.
	 *
	 */

	crp = ddi_get_cred();
	svp->sv_lh = NULL;

	if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) {
		rc = ldi_open_by_dev(&svp->sv_dev,
		    OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li);
	}

	if (rc != 0) {
		if (kstatus)
			spcs_s_add(kstatus, rc);
		DTRACE_PROBE1(sv_enable_err_lyr_open, dev_t, udev);
		return (sv_free(svp, SV_ELYROPEN));
	}

	/*
	 * Do layering if required - must happen after nsc_open().
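	 *
	 * The first enable for a given major number saves the original
	 * cb_ops entry points in maj->sm_* and installs the sv_lyr_*
	 * wrappers; subsequent enables on the same major just bump
	 * maj->sm_inuse.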
	 */

	if (maj->sm_inuse++ == 0) {
		maj->sm_dev_ops = nsc_get_devops(getmajor(udev));

		if (maj->sm_dev_ops == NULL ||
		    maj->sm_dev_ops->devo_cb_ops == NULL) {
			DTRACE_PROBE1(sv_enable_err_load, dev_t, udev);
			return (sv_free(svp, SV_ELOAD));
		}

		dev_ops = maj->sm_dev_ops;
		cb_ops = dev_ops->devo_cb_ops;

		if (cb_ops->cb_strategy == NULL ||
		    cb_ops->cb_strategy == nodev ||
		    cb_ops->cb_strategy == nulldev) {
			DTRACE_PROBE1(sv_enable_err_nostrategy, dev_t, udev);
			return (sv_free(svp, SV_ELOAD));
		}

		if (cb_ops->cb_strategy == sv_lyr_strategy) {
			DTRACE_PROBE1(sv_enable_err_svstrategy, dev_t, udev);
			return (sv_free(svp, SV_ESTRATEGY));
		}

		maj->sm_strategy = cb_ops->cb_strategy;
		maj->sm_close = cb_ops->cb_close;
		maj->sm_ioctl = cb_ops->cb_ioctl;
		maj->sm_write = cb_ops->cb_write;
		maj->sm_open = cb_ops->cb_open;
		maj->sm_read = cb_ops->cb_read;
		maj->sm_flag = cb_ops->cb_flag;

		cb_ops->cb_flag = cb_ops->cb_flag | D_MP;
		cb_ops->cb_strategy = sv_lyr_strategy;
		cb_ops->cb_close = sv_lyr_close;
		cb_ops->cb_ioctl = sv_lyr_ioctl;
		cb_ops->cb_write = sv_lyr_write;
		cb_ops->cb_open = sv_lyr_open;
		cb_ops->cb_read = sv_lyr_read;

		/*
		 * Check that the driver has async I/O entry points
		 * before changing them.
		 */

		if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) {
			maj->sm_awrite = 0;
			maj->sm_aread = 0;
		} else {
			maj->sm_awrite = cb_ops->cb_awrite;
			maj->sm_aread = cb_ops->cb_aread;

			cb_ops->cb_awrite = sv_lyr_awrite;
			cb_ops->cb_aread = sv_lyr_aread;
		}

		/*
		 * Bug 4645743
		 *
		 * Prevent sv from ever unloading after it has interposed
		 * on a major device because there is a race between
		 * sv removing its layered entry points from the target
		 * dev_ops, a client coming in and accessing the driver,
		 * and the kernel modunloading the sv text.
		 *
		 * To allow unload, do svboot -u, which only happens in
		 * pkgrm time.
		 */
		ASSERT(MUTEX_HELD(&sv_mutex));
		sv_mod_status = SV_PREVENT_UNLOAD;
	}


	svp->sv_timestamp = nsc_lbolt();
	svp->sv_state = SV_ENABLE;
	svp->sv_pending = NULL;
	rw_exit(&svp->sv_lock);

	sv_ndevices++;
	mutex_exit(&sv_mutex);

	nblocks = 0;
	if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) {
		nblocks = svp->sv_nblocks;
		nsc_release(svp->sv_fd);
	}

	cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n",
	    svp->sv_dev, nblocks);

	return (0);
}


static int
sv_prepare_unload()
{
	int rc = 0;

	mutex_enter(&sv_mutex);

	if (sv_mod_status == SV_PREVENT_UNLOAD) {
		if ((sv_ndevices != 0) || (sv_tset != NULL)) {
			rc = EBUSY;
		} else {
			sv_mod_status = SV_ALLOW_UNLOAD;
			delay(SV_WAIT_UNLOAD * drv_usectohz(1000000));
		}
	}

	mutex_exit(&sv_mutex);
	return (rc);
}

static int
svattach_fd(blind_t arg)
{
	dev_t dev = (dev_t)arg;
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
	int rc;

	if (sv_debug > 0)
		cmn_err(CE_CONT, "!svattach_fd(%p, %p)\n", arg, (void *)svp);

	if (svp == NULL) {
		cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg);
		return (0);
	}

	if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) {
		cmn_err(CE_WARN,
		    "!svattach_fd: nsc_partsize() failed, rc %d", rc);
		svp->sv_nblocks = 0;
	}

	if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) {
		cmn_err(CE_WARN,
		    "!svattach_fd: nsc_maxfbas() failed, rc %d", rc);
		svp->sv_maxfbas = 0;
	}

	if (sv_debug > 0) {
		cmn_err(CE_CONT,
		    "!svattach_fd(%p): size %" NSC_SZFMT ", "
		    "maxfbas %" NSC_SZFMT "\n",
		    arg, svp->sv_nblocks, svp->sv_maxfbas);
	}

	return (0);
}


static int
svdetach_fd(blind_t arg)
{
	dev_t dev = (dev_t)arg;
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);

	if (sv_debug > 0)
		cmn_err(CE_CONT, "!svdetach_fd(%p, %p)\n", arg, (void *)svp);

	/* svp can be NULL during disable of an sv */
	if (svp == NULL)
		return (0);

	svp->sv_maxfbas = 0;
	svp->sv_nblocks = 0;
	return (0);
}


/*
 * Side effect: if called with (guard != 0), then expects both sv_mutex
 * and sv_lock(RW_WRITER) to be held, and will release them before returning.
 */

/* ARGSUSED */
static int
sv_disable(dev_t dev, spcs_s_info_t kstatus)
{
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);

	if (svp == NULL) {

		DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp);
		return (SV_ENODEV);
	}

	mutex_enter(&sv_mutex);
	rw_enter(&svp->sv_lock, RW_WRITER);

	if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) {
		rw_exit(&svp->sv_lock);
		mutex_exit(&sv_mutex);

		DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp);
		return (SV_EDISABLED);
	}


	sv_ndevices--;
	return (sv_free(svp, 0));
}



static int
sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp)
{
	nsc_buf_t *tmph;
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	dev_t odev;
	int ret;
	int rc;

	svp = sv_dev_to_sv(*devp, &maj);

	if (svp) {
		if (svp->sv_state == SV_PENDING &&
		    svp->sv_pending == curthread) {
			/*
			 * This is a recursive open from a call to
			 * ddi_lyr_open_by_devt and so we just want
			 * to pass it straight through to the
			 * underlying driver.
			 */
			DTRACE_PROBE2(sv_lyr_open_recursive,
			    sv_dev_t *, svp,
			    dev_t, *devp);
			svp = NULL;
		} else
			rw_enter(&svp->sv_lock, RW_READER);
	}

	odev = *devp;

	if (maj && (fn = maj->sm_open) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			UNSAFE_ENTER();
			ret = (*fn)(devp, flag, otyp, crp);
			UNSAFE_EXIT();
		} else {
			ret = (*fn)(devp, flag, otyp, crp);
		}

		if (ret == 0) {
			/*
			 * Re-acquire svp if the driver changed *devp.
			 */

			if (*devp != odev) {
				if (svp != NULL)
					rw_exit(&svp->sv_lock);

				svp = sv_dev_to_sv(*devp, NULL);

				if (svp) {
					rw_enter(&svp->sv_lock, RW_READER);
				}
			}
		}
	} else {
		ret = ENODEV;
	}

	if (svp && ret != 0 && svp->sv_state == SV_ENABLE) {
		/*
		 * Underlying DDI open failed, but we have this
		 * device SV enabled.  If we can read some data
		 * from the device, fake a successful open (this
		 * probably means that this device is RDC'd and we
		 * are getting the data from the secondary node).
		 *
		 * The reserve must be done with NSC_TRY|NSC_NOWAIT to
		 * ensure that it does not deadlock if this open is
		 * coming from nskernd:get_bsize().
		 */
		rc = sv_reserve(svp->sv_fd,
		    NSC_TRY | NSC_NOWAIT | NSC_MULTI | NSC_PCATCH);
		if (rc == 0) {
			tmph = NULL;

			rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph);
			if (rc <= 0) {
				/* success */
				ret = 0;
			}

			if (tmph) {
				(void) nsc_free_buf(tmph);
				tmph = NULL;
			}

			nsc_release(svp->sv_fd);

			/*
			 * Count the number of layered opens that we
			 * fake since we have to fake a matching number
			 * of closes (OTYP_LYR open/close calls must be
			 * paired).
			 */

			if (ret == 0 && otyp == OTYP_LYR) {
				mutex_enter(&svp->sv_olock);
				svp->sv_openlcnt++;
				mutex_exit(&svp->sv_olock);
			}
		}
	}

	if (svp) {
		rw_exit(&svp->sv_lock);
	}

	return (ret);
}


static int
sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int ret;

	svp = sv_dev_to_sv(dev, &maj);

	if (svp &&
	    svp->sv_state == SV_PENDING &&
	    svp->sv_pending == curthread) {
		/*
		 * This is a recursive close from a call to
		 * ddi_lyr_close and so we just want
		 * to pass it straight through to the
		 * underlying driver.
		 */
		DTRACE_PROBE2(sv_lyr_close_recursive, sv_dev_t *, svp,
		    dev_t, dev);
		svp = NULL;
	}

	if (svp) {
		rw_enter(&svp->sv_lock, RW_READER);

		if (otyp == OTYP_LYR) {
			mutex_enter(&svp->sv_olock);

			if (svp->sv_openlcnt) {
				/*
				 * Consume sufficient layered closes to
				 * account for the opens that we faked
				 * whilst the device was failed.
				 */
				svp->sv_openlcnt--;
				mutex_exit(&svp->sv_olock);
				rw_exit(&svp->sv_lock);

				DTRACE_PROBE1(sv_lyr_close_end, dev_t, dev);

				return (0);
			}

			mutex_exit(&svp->sv_olock);
		}
	}

	if (maj && (fn = maj->sm_close) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			UNSAFE_ENTER();
			ret = (*fn)(dev, flag, otyp, crp);
			UNSAFE_EXIT();
		} else {
			ret = (*fn)(dev, flag, otyp, crp);
		}
	} else {
		ret = ENODEV;
	}

	if (svp) {
		rw_exit(&svp->sv_lock);
	}

	return (ret);
}


/*
 * Convert the specified dev_t into a locked and enabled sv_dev_t, or
 * return NULL.
 */
static sv_dev_t *
sv_find_enabled(const dev_t dev, sv_maj_t **majpp)
{
	sv_dev_t *svp;

	while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) {
		rw_enter(&svp->sv_lock, RW_READER);

		if (svp->sv_state == SV_ENABLE) {
			/* locked and enabled */
			break;
		}

		/*
		 * State was changed while waiting on the lock.
		 * Wait for a stable state.
		 */
		rw_exit(&svp->sv_lock);

		DTRACE_PROBE1(sv_find_enabled_retry, dev_t, dev);

		delay(2);
	}

	return (svp);
}


static int
sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int rc;

	svp = sv_find_enabled(dev, &maj);
	if (svp == NULL) {
		if (maj) {
			if (rw == NSC_READ)
				fn = maj->sm_read;
			else
				fn = maj->sm_write;

			if (fn != 0) {
				if (!(maj->sm_flag & D_MP)) {
					UNSAFE_ENTER();
					rc = (*fn)(dev, uiop, crp);
					UNSAFE_EXIT();
				} else {
					rc = (*fn)(dev, uiop, crp);
				}
			}

			return (rc);
		} else {
			return (ENODEV);
		}
	}

	ASSERT(RW_READ_HELD(&svp->sv_lock));

	if (svp->sv_flag == 0) {
		/*
		 * guard access mode
		 * - prevent user level access to the device
		 */
		DTRACE_PROBE1(sv_lyr_uio_err_guard, uio_t *, uiop);
		rc = EPERM;
		goto out;
	}

	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
		DTRACE_PROBE1(sv_lyr_uio_err_rsrv, uio_t *, uiop);
		goto out;
	}

	if (rw == NSC_READ)
		rc = nsc_uread(svp->sv_fd, uiop, crp);
	else
		rc = nsc_uwrite(svp->sv_fd, uiop, crp);

	nsc_release(svp->sv_fd);

out:
	rw_exit(&svp->sv_lock);

	return (rc);
}


static int
sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp)
{
	return (sv_lyr_uio(dev, uiop, crp, NSC_READ));
}


static int
sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp)
{
	return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE));
}


/* ARGSUSED */

static int
sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp)
{
	return (aphysio(sv_lyr_strategy,
	    anocancel, dev, B_READ, minphys, aio));
}


/* ARGSUSED */

static int
sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp)
{
	return (aphysio(sv_lyr_strategy,
	    anocancel, dev, B_WRITE, minphys, aio));
}


/*
 * Set up an array containing the list of raw path names.
 * The array for the paths is svn and the size of the array is
 * in size.
 *
 * If there are more layered devices than will fit in the array,
 * the number of extra layered devices is returned.  Otherwise
 * zero is returned.
 *
 * Input:
 *	svn	: array for paths
 *	size	: size of the array
 *
 * Output (extra):
 *	zero	: All paths fit in array
 *	>0	: Number of defined layered devices that do not fit in the array
 */

static int
sv_list(void *ptr, const int size, int *extra, const int ilp32)
{
	sv_name32_t *svn32;
	sv_name_t *svn;
	sv_dev_t *svp;
	int *mode, *nblocks;
	int i, index;
	char *path;

	*extra = 0;
	index = 0;

	if (ilp32)
		svn32 = ptr;
	else
		svn = ptr;

	mutex_enter(&sv_mutex);
	for (i = 0; i < sv_max_devices; i++) {
		svp = &sv_devs[i];

		rw_enter(&svp->sv_lock, RW_READER);

		if (svp->sv_state != SV_ENABLE) {
			rw_exit(&svp->sv_lock);
			continue;
		}

		if ((*extra) != 0 || ptr == NULL) {
			/* Another overflow entry */
			rw_exit(&svp->sv_lock);
			(*extra)++;
			continue;
		}

		if (ilp32) {
			nblocks = &svn32->svn_nblocks;
			mode = &svn32->svn_mode;
			path = svn32->svn_path;

			svn32->svn_timestamp = (uint32_t)svp->sv_timestamp;
			svn32++;
		} else {
			nblocks = &svn->svn_nblocks;
			mode = &svn->svn_mode;
			path = svn->svn_path;

			svn->svn_timestamp = svp->sv_timestamp;
			svn++;
		}

		(void) strcpy(path, nsc_pathname(svp->sv_fd));
		*nblocks = svp->sv_nblocks;
		*mode = svp->sv_flag;

		if (*nblocks == 0) {
			if (sv_debug > 3)
				cmn_err(CE_CONT, "!sv_list: need to reserve\n");

			if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
				*nblocks = svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			}
		}

		if (++index >= size) {
			/* Out of space */
			(*extra)++;
		}

		rw_exit(&svp->sv_lock);
	}
	mutex_exit(&sv_mutex);

	if (index < size) {
		/* NULL terminated list */
		if (ilp32)
			svn32->svn_path[0] = '\0';
		else
			svn->svn_path[0] = '\0';
	}

	return (0);
}


static void
sv_thread_tune(int threads)
{
	int incr = (threads > 0) ? 1 : -1;
	int change = 0;
	int nthreads;

	ASSERT(MUTEX_HELD(&sv_mutex));

	if (sv_threads_extra) {
		/* keep track of any additional threads requested */
		if (threads > 0) {
			sv_threads_extra += threads;
			return;
		}
		threads = -threads;
		if (threads >= sv_threads_extra) {
			threads -= sv_threads_extra;
			sv_threads_extra = 0;
			/* fall through to while loop */
		} else {
			sv_threads_extra -= threads;
			return;
		}
	} else if (threads > 0) {
		/*
		 * do not increase the number of threads beyond
		 * sv_threads_max when doing dynamic thread tuning
		 */
		nthreads = nst_nthread(sv_tset);
		if ((nthreads + threads) > sv_threads_max) {
			sv_threads_extra = nthreads + threads - sv_threads_max;
			threads = sv_threads_max - nthreads;
			if (threads <= 0)
				return;
		}
	}

	if (threads < 0)
		threads = -threads;

	while (threads--) {
		nthreads = nst_nthread(sv_tset);
		sv_threads_needed += incr;

		if (sv_threads_needed >= nthreads)
			change += nst_add_thread(sv_tset, sv_threads_inc);
		else if ((sv_threads_needed <
		    (nthreads - (sv_threads_inc + sv_threads_hysteresis))) &&
		    ((nthreads - sv_threads_inc) >= sv_threads))
			change -= nst_del_thread(sv_tset, sv_threads_inc);
	}

#ifdef DEBUG
	if (change) {
		cmn_err(CE_NOTE,
		    "!sv_thread_tune: threads needed %d, nthreads %d, "
		    "nthreads change %d",
		    sv_threads_needed, nst_nthread(sv_tset), change);
	}
#endif
}


/* ARGSUSED */
static int
svopen(dev_t *devp, int flag, int otyp, cred_t *crp)
{
	int rc;

	mutex_enter(&sv_mutex);
	rc = sv_init_devs();
	mutex_exit(&sv_mutex);

	return (rc);
}


/* ARGSUSED */
static int
svclose(dev_t dev, int flag, int otyp, cred_t *crp)
{
	const int secs = HZ * 5;
	const int ticks = HZ / 10;
	int loops = secs / ticks;

	mutex_enter(&sv_mutex);
	while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) {
		if (nst_nlive(sv_tset) <= 0) {
			nst_destroy(sv_tset);
			sv_tset = NULL;
			break;
		}

		/* threads still active - wait for them to exit */
		mutex_exit(&sv_mutex);
		delay(ticks);
		loops--;
		mutex_enter(&sv_mutex);
	}
	mutex_exit(&sv_mutex);

	if (loops <= 0) {
		cmn_err(CE_WARN,
#ifndef DEBUG
		    /* do not write to console when non-DEBUG */
		    "!"
#endif
		    "sv:svclose: threads still active "
		    "after %d sec - leaking thread set", secs);
	}

	return (0);
}


static int
svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp)
{
	char itmp1[12], itmp2[12];	/* temp char array for editing ints */
	spcs_s_info_t kstatus;		/* Kernel version of spcs status */
	spcs_s_info_t ustatus;		/* Address of user version of spcs status */
	sv_list32_t svl32;		/* 32 bit Initial structure for SVIOC_LIST */
	sv_version_t svv;		/* Version structure */
	sv_conf_t svc;			/* User config structure */
	sv_list_t svl;			/* Initial structure for SVIOC_LIST */
	void *usvn;			/* Address of user sv_name_t */
	void *svn = NULL;		/* Array for SVIOC_LIST */
	uint64_t phash;			/* pathname hash */
	int rc = 0;			/* Return code -- errno */
	int size;			/* Number of items in array */
	int bytes;			/* Byte size of array */
	int ilp32;			/* Convert data structures for ilp32 userland */

	*rvalp = 0;

	/*
	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, then it will continue.
	 * else it means it previously was SV_PREVENT_UNLOAD, and now it's
	 * SV_ALLOW_UNLOAD, expecting the driver to eventually unload.
	 *
	 * SV_ALLOW_UNLOAD is final state, so no need to grab sv_mutex.
	 */
	if (sv_mod_status == SV_ALLOW_UNLOAD) {
		return (EBUSY);
	}

	if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0))
		return (rc);

	kstatus = spcs_s_kcreate();
	if (!kstatus) {
		DTRACE_PROBE1(sv_ioctl_err_kcreate, dev_t, dev);
		return (ENOMEM);
	}

	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);

	switch (cmd) {

	case SVIOC_ENABLE:

		if (ilp32) {
			sv_conf32_t svc32;

			if (ddi_copyin((void *)arg, &svc32,
			    sizeof (svc32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
			(void) strcpy(svc.svc_path, svc32.svc_path);
			svc.svc_flag = svc32.svc_flag;
			svc.svc_major = svc32.svc_major;
			svc.svc_minor = svc32.svc_minor;
		} else {
			if (ddi_copyin((void *)arg, &svc,
			    sizeof (svc), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
		}

		/* force to raw access */
		svc.svc_flag = NSC_DEVICE;

		if (sv_tset == NULL) {
			mutex_enter(&sv_mutex);

			if (sv_tset == NULL) {
				sv_tset = nst_init("sv_thr", sv_threads);
			}

			mutex_exit(&sv_mutex);

			if (sv_tset == NULL) {
				cmn_err(CE_WARN,
				    "!sv: could not allocate %d threads",
				    sv_threads);
			}
		}

		rc = sv_enable(svc.svc_path, svc.svc_flag,
		    makedevice(svc.svc_major, svc.svc_minor), kstatus);

		if (rc == 0) {
			sv_config_time = nsc_lbolt();

			mutex_enter(&sv_mutex);
			sv_thread_tune(sv_threads_dev);
			mutex_exit(&sv_mutex);
		}

		DTRACE_PROBE3(sv_ioctl_end, dev_t, dev, int, *rvalp, int, rc);

		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
		/* NOTREACHED */

	case SVIOC_DISABLE:

		if (ilp32) {
			sv_conf32_t svc32;

			if (ddi_copyin((void *)arg, &svc32,
			    sizeof (svc32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
			svc.svc_major = svc32.svc_major;
			svc.svc_minor = svc32.svc_minor;
			(void) strcpy(svc.svc_path, svc32.svc_path);
			svc.svc_flag = svc32.svc_flag;
		} else {
			if (ddi_copyin((void *)arg, &svc,
			    sizeof (svc), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
		}

		if (svc.svc_major == (major_t)-1 &&
		    svc.svc_minor == (minor_t)-1) {
			sv_dev_t *svp;
			int i;

			/*
			 * User level could not find the minor device
			 * node, so do this the slow way by searching
			 * the entire sv config for a matching pathname.
			 */

			phash = nsc_strhash(svc.svc_path);

			mutex_enter(&sv_mutex);

			for (i = 0; i < sv_max_devices; i++) {
				svp = &sv_devs[i];

				if (svp->sv_state == SV_DISABLE ||
				    svp->sv_fd == NULL)
					continue;

				if (nsc_fdpathcmp(svp->sv_fd, phash,
				    svc.svc_path) == 0) {
					svc.svc_major = getmajor(svp->sv_dev);
					svc.svc_minor = getminor(svp->sv_dev);
					break;
				}
			}

			mutex_exit(&sv_mutex);

			if (svc.svc_major == (major_t)-1 &&
			    svc.svc_minor == (minor_t)-1)
				return (spcs_s_ocopyoutf(&kstatus,
				    svc.svc_error, SV_ENODEV));
		}

		rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor),
		    kstatus);

		if (rc == 0) {
			sv_config_time = nsc_lbolt();

			mutex_enter(&sv_mutex);
			sv_thread_tune(-sv_threads_dev);
			mutex_exit(&sv_mutex);
		}

		DTRACE_PROBE3(sv_ioctl_2, dev_t, dev, int, *rvalp, int, rc);

		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
		/* NOTREACHED */

	case SVIOC_LIST:

		if (ilp32) {
			if (ddi_copyin((void *)arg, &svl32,
			    sizeof (svl32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = (spcs_s_info_t)svl32.svl_error;
			size = svl32.svl_count;
			usvn = (void *)(unsigned long)svl32.svl_names;
		} else {
			if (ddi_copyin((void *)arg, &svl,
			    sizeof (svl), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = svl.svl_error;
			size = svl.svl_count;
			usvn = svl.svl_names;
		}

		/* Do some boundary checking */
		if ((size < 0) || (size > sv_max_devices)) {
			/* Array size is out of range */
			return (spcs_s_ocopyoutf(&kstatus, ustatus,
			    SV_EARRBOUNDS, "0",
			    spcs_s_inttostring(sv_max_devices, itmp1,
			    sizeof (itmp1), 0),
			    spcs_s_inttostring(size, itmp2,
			    sizeof (itmp2), 0)));
		}

		if (ilp32)
			bytes = size * sizeof (sv_name32_t);
		else
			bytes = size * sizeof (sv_name_t);

		/* Allocate memory for the array of structures */
		if (bytes != 0) {
			svn = kmem_zalloc(bytes, KM_SLEEP);
			if (!svn) {
				return (spcs_s_ocopyoutf(&kstatus,
				    ustatus, ENOMEM));
			}
		}

		rc = sv_list(svn, size, rvalp, ilp32);
		if (rc) {
			if (svn != NULL)
				kmem_free(svn, bytes);
			return (spcs_s_ocopyoutf(&kstatus, ustatus, rc));
		}

		if (ilp32) {
			svl32.svl_timestamp = (uint32_t)sv_config_time;
			svl32.svl_maxdevs = (int32_t)sv_max_devices;

			/* Return the list structure */
			if (ddi_copyout(&svl32, (void *)arg,
			    sizeof (svl32), mode) < 0) {
				spcs_s_kfree(kstatus);
				if (svn != NULL)
					kmem_free(svn, bytes);
				return (EFAULT);
			}
		} else {
			svl.svl_timestamp = sv_config_time;
			svl.svl_maxdevs = sv_max_devices;

			/* Return the list structure */
			if (ddi_copyout(&svl, (void *)arg,
			    sizeof (svl), mode) < 0) {
				spcs_s_kfree(kstatus);
				if (svn != NULL)
					kmem_free(svn, bytes);
				return (EFAULT);
			}
		}

		/* Return the array */
		if (svn != NULL) {
			if (ddi_copyout(svn, usvn, bytes, mode) < 0) {
				kmem_free(svn, bytes);
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
			kmem_free(svn, bytes);
		}

		DTRACE_PROBE3(sv_ioctl_3, dev_t, dev, int, *rvalp, int, 0);

		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
		/* NOTREACHED */

	case SVIOC_VERSION:

		if (ilp32) {
			sv_version32_t svv32;

			if (ddi_copyin((void *)arg, &svv32,
			    sizeof (svv32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svv32.svv_major_rev = sv_major_rev;
			svv32.svv_minor_rev = sv_minor_rev;
			svv32.svv_micro_rev = sv_micro_rev;
			svv32.svv_baseline_rev = sv_baseline_rev;

			if (ddi_copyout(&svv32, (void *)arg,
			    sizeof (svv32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = (spcs_s_info_t)svv32.svv_error;
		} else {
			if (ddi_copyin((void *)arg, &svv,
			    sizeof (svv), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svv.svv_major_rev = sv_major_rev;
			svv.svv_minor_rev = sv_minor_rev;
			svv.svv_micro_rev = sv_micro_rev;
			svv.svv_baseline_rev = sv_baseline_rev;

			if (ddi_copyout(&svv, (void *)arg,
			    sizeof (svv), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = svv.svv_error;
		}

		DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, 0);

		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
		/* NOTREACHED */

	case SVIOC_UNLOAD:
		rc = sv_prepare_unload();

		if (ddi_copyout(&rc, (void *)arg, sizeof (rc), mode) < 0) {
			rc = EFAULT;
		}

		spcs_s_kfree(kstatus);
		return (rc);

	default:
		spcs_s_kfree(kstatus);

		DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, EINVAL);

		return (EINVAL);
		/* NOTREACHED */
	}

	/* NOTREACHED */
}


/* ARGSUSED */
static int
svprint(dev_t dev, char *str)
{
	int instance = ddi_get_instance(sv_dip);
	cmn_err(CE_WARN, "!%s%d: %s", ddi_get_name(sv_dip), instance, str);
	return (0);
}


static void
_sv_lyr_strategy(struct buf *bp)
{
	caddr_t buf_addr;		/* pointer to linear buffer in bp */
	nsc_buf_t *bufh = NULL;
	nsc_buf_t *hndl = NULL;
	sv_dev_t *svp;
	nsc_vec_t *v;
	sv_maj_t *maj;
	nsc_size_t fba_req, fba_len;	/* FBA lengths */
	nsc_off_t fba_off;		/* FBA offset */
	size_t tocopy, nbytes;		/* byte lengths */
	int rw, rc;			/* flags and return codes */
	int (*fn)();

	rc = 0;

	if (sv_debug > 5)
		cmn_err(CE_CONT, "!_sv_lyr_strategy(%p)\n", (void *)bp);

	svp = sv_find_enabled(bp->b_edev, &maj);
	if (svp == NULL) {
		if (maj && (fn = maj->sm_strategy) != 0) {
			if (!(maj->sm_flag & D_MP)) {
				UNSAFE_ENTER();
				rc = (*fn)(bp);
				UNSAFE_EXIT();
			} else {
				rc = (*fn)(bp);
			}
			return;
		} else {
			bioerror(bp, ENODEV);
			biodone(bp);
			return;
		}
	}

	ASSERT(RW_READ_HELD(&svp->sv_lock));

	if (svp->sv_flag == 0) {
		/*
		 * guard access mode
		 * - prevent user level access to the device
		 */
		DTRACE_PROBE1(sv_lyr_strategy_err_guard, struct buf *, bp);
		bioerror(bp, EPERM);
		goto out;
	}

	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_rsrv, struct buf *, bp);

		if (rc == EINTR)
			cmn_err(CE_WARN, "!nsc_reserve() returned EINTR");
		bioerror(bp, rc);
		goto out;
	}

	if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) {
		DTRACE_PROBE1(sv_lyr_strategy_eof, struct buf *, bp);

		if (bp->b_flags & B_READ) {
			/* return EOF, not an error */
			bp->b_resid = bp->b_bcount;
			bioerror(bp, 0);
		} else
			bioerror(bp, EINVAL);

		goto done;
	}

	/*
	 * Preallocate a handle once per call to strategy.
	 * If this fails, then the nsc_alloc_buf() will allocate
	 * a temporary handle per allocation/free pair.
	 */

	DTRACE_PROBE1(sv_dbg_alloch_start, sv_dev_t *, svp);

	bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL);

	DTRACE_PROBE1(sv_dbg_alloch_end, sv_dev_t *, svp);

	if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_hactive, struct buf *, bp);

		cmn_err(CE_WARN,
		    "!sv: allocated active handle (bufh %p, flags %x)",
		    (void *)bufh, bufh->sb_flag);

		bioerror(bp, ENXIO);
		goto done;
	}

	fba_req = FBA_LEN(bp->b_bcount);
	if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks)
		fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno);

	rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE;

	bp_mapin(bp);

	bp->b_resid = bp->b_bcount;
	buf_addr = bp->b_un.b_addr;
	fba_off = 0;

	/*
	 * fba_req - requested size of transfer in FBAs after
	 *		truncation to device extent, and allowing for
	 *		possible non-FBA bounded final chunk.
	 * fba_off - offset of start of chunk from start of bp in FBAs.
	 * fba_len - size of this chunk in FBAs.
	 */

loop:
	fba_len = min(fba_req, svp->sv_maxfbas);
	hndl = bufh;

	DTRACE_PROBE4(sv_dbg_allocb_start,
	    sv_dev_t *, svp,
	    uint64_t, (uint64_t)(bp->b_lblkno + fba_off),
	    uint64_t, (uint64_t)fba_len,
	    int, rw);

	rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off),
	    fba_len, rw, &hndl);

	DTRACE_PROBE1(sv_dbg_allocb_end, sv_dev_t *, svp);

	if (rc > 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_alloc, struct buf *, bp);
		bioerror(bp, rc);
		if (hndl != bufh)
			(void) nsc_free_buf(hndl);
		hndl = NULL;
		goto done;
	}

	tocopy = min(FBA_SIZE(fba_len), bp->b_resid);
	v = hndl->sb_vec;

	if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) {
		/*
		 * Not overwriting all of the last FBA, so read in the
		 * old contents now before we overwrite it with the new
		 * data.
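		 * (For example, assuming 512-byte FBAs, a 1300-byte write
		 * covers three FBAs but only partially overwrites the last
		 * one, so that FBA is read back before the copy below.)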
		 */

		DTRACE_PROBE2(sv_dbg_read_start, sv_dev_t *, svp,
		    uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1));

		rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0);
		if (rc > 0) {
			bioerror(bp, rc);
			goto done;
		}

		DTRACE_PROBE1(sv_dbg_read_end, sv_dev_t *, svp);
	}

	DTRACE_PROBE1(sv_dbg_bcopy_start, sv_dev_t *, svp);

	while (tocopy > 0) {
		nbytes = min(tocopy, (nsc_size_t)v->sv_len);

		if (bp->b_flags & B_READ)
			(void) bcopy(v->sv_addr, buf_addr, nbytes);
		else
			(void) bcopy(buf_addr, v->sv_addr, nbytes);

		bp->b_resid -= nbytes;
		buf_addr += nbytes;
		tocopy -= nbytes;
		v++;
	}

	DTRACE_PROBE1(sv_dbg_bcopy_end, sv_dev_t *, svp);

	if ((bp->b_flags & B_READ) == 0) {
		DTRACE_PROBE3(sv_dbg_write_start, sv_dev_t *, svp,
		    uint64_t, (uint64_t)hndl->sb_pos,
		    uint64_t, (uint64_t)hndl->sb_len);

		rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0);

		DTRACE_PROBE1(sv_dbg_write_end, sv_dev_t *, svp);

		if (rc > 0) {
			bioerror(bp, rc);
			goto done;
		}
	}

	/*
	 * Adjust FBA offset and requested (ie. remaining) length,
	 * loop if more data to transfer.
	 */

	fba_off += fba_len;
	fba_req -= fba_len;

	if (fba_req > 0) {
		DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);

		rc = nsc_free_buf(hndl);

		DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);

		if (rc > 0) {
			DTRACE_PROBE1(sv_lyr_strategy_err_free,
			    struct buf *, bp);
			bioerror(bp, rc);
		}

		hndl = NULL;

		if (rc <= 0)
			goto loop;
	}

done:
	if (hndl != NULL) {
		DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);

		rc = nsc_free_buf(hndl);

		DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);

		if (rc > 0) {
			DTRACE_PROBE1(sv_lyr_strategy_err_free,
			    struct buf *, bp);
			bioerror(bp, rc);
		}

		hndl = NULL;
	}

	if (bufh)
		(void) nsc_free_handle(bufh);

	DTRACE_PROBE1(sv_dbg_rlse_start, sv_dev_t *, svp);

	nsc_release(svp->sv_fd);

	DTRACE_PROBE1(sv_dbg_rlse_end, sv_dev_t *, svp);

out:
	if (sv_debug > 5) {
		cmn_err(CE_CONT,
		    "!_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n",
		    (void *)bp, (void *)bufh, bp->b_error);
	}

	DTRACE_PROBE2(sv_lyr_strategy_end, struct buf *, bp, int, bp->b_error);

	rw_exit(&svp->sv_lock);
	biodone(bp);
}


static void
sv_async_strategy(blind_t arg)
{
	struct buf *bp = (struct buf *)arg;
	_sv_lyr_strategy(bp);
}


static int
sv_lyr_strategy(struct buf *bp)
{
	nsthread_t *tp;
	int nlive;

	/*
	 * If B_ASYNC was part of the DDI we could use it as a hint to
	 * not create a thread for synchronous i/o.
	 */
	if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
		/* not sv enabled - just pass through */
		DTRACE_PROBE1(sv_lyr_strategy_notsv, struct buf *, bp);
		_sv_lyr_strategy(bp);
		return (0);
	}

	if (sv_debug > 4) {
		cmn_err(CE_CONT, "!sv_lyr_strategy: nthread %d nlive %d\n",
		    nst_nthread(sv_tset), nst_nlive(sv_tset));
	}

	/*
	 * If there are only guard devices enabled there
	 * won't be a threadset, so don't try and use it.
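	 * (A guard device is one enabled with sv_flag == 0; user I/O to
	 * it is rejected with EPERM in _sv_lyr_strategy() and
	 * sv_lyr_uio().)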
	 */
	tp = NULL;
	if (sv_tset != NULL) {
		tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
	}

	if (tp == NULL) {
		/*
		 * out of threads, so fall back to synchronous io.
		 */
		if (sv_debug > 0) {
			cmn_err(CE_CONT,
			    "!sv_lyr_strategy: thread alloc failed\n");
		}

		DTRACE_PROBE1(sv_lyr_strategy_no_thread,
		    struct buf *, bp);

		_sv_lyr_strategy(bp);
		sv_no_threads++;
	} else {
		nlive = nst_nlive(sv_tset);
		if (nlive > sv_max_nlive) {
			if (sv_debug > 0) {
				cmn_err(CE_CONT,
				    "!sv_lyr_strategy: "
				    "new max nlive %d (nthread %d)\n",
				    nlive, nst_nthread(sv_tset));
			}

			sv_max_nlive = nlive;
		}
	}

	return (0);
}


#ifndef offsetof
#define offsetof(s, m) ((size_t)(&((s *)0)->m))
#endif

/*
 * re-write the size of the current partition
 */
static int
sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	size_t offset;
	int ilp32;
	int pnum;
	int rc;

	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);

	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (pnum < 0 || pnum >= V_NUMPAR) {
		cmn_err(CE_WARN,
		    "!sv_gvtoc: unable to determine partition number "
		    "for dev %lx", svp->sv_dev);
		return (EINVAL);
	}

	if (ilp32) {
		int32_t p_size;

#ifdef _SunOS_5_6
		offset = offsetof(struct vtoc, v_part);
		offset += sizeof (struct partition) * pnum;
		offset += offsetof(struct partition, p_size);
#else
		offset = offsetof(struct vtoc32, v_part);
		offset += sizeof (struct partition32) * pnum;
		offset += offsetof(struct partition32, p_size);
#endif

		p_size = (int32_t)svp->sv_nblocks;
		if (p_size == 0) {
			if (sv_reserve(svp->sv_fd,
			    NSC_MULTI|NSC_PCATCH) == 0) {
				p_size = (int32_t)svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			} else {
				rc = EINTR;
			}
		}

		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
		    sizeof (p_size), mode) != 0) {
			rc = EFAULT;
		}
	} else {
		long p_size;

		offset = offsetof(struct vtoc, v_part);
		offset += sizeof (struct partition) * pnum;
		offset += offsetof(struct partition, p_size);

		p_size = (long)svp->sv_nblocks;
		if (p_size == 0) {
			if (sv_reserve(svp->sv_fd,
			    NSC_MULTI|NSC_PCATCH) == 0) {
				p_size = (long)svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			} else {
				rc = EINTR;
			}
		}

		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
		    sizeof (p_size), mode) != 0) {
			rc = EFAULT;
		}
	}

	return (rc);
}


#ifdef DKIOCPARTITION
/*
 * re-write the size of the current partition
 *
 * arg is dk_efi_t.
 *
 * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64;
 *
 * dk_efi_t->dki_data --> efi_gpt_t (label header)
 * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions)
 *
 * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts
 * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself
 *
 * This assumes that sizeof (efi_gpt_t) is the same as the size of a
 * logical block on the disk.
 *
 * Everything is little endian (i.e. disk format).
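 *
 * In outline, the fix-up below:
 *
 *  1. copies in the dk_efi_t, the GPT header and the user's array of
 *     GPEs (NumberOfPartitionEntries of them, or 1 if that field is 0);
 *  2. rewrites efi_gpe_EndingLBA of this partition's GPE using the
 *     volume size reported by nsctl;
 *  3. recomputes efi_gpt_PartitionEntryArrayCRC32 over the GPE array
 *     and then efi_gpt_HeaderCRC32 over the header, zeroing each CRC
 *     field before its CRC is calculated;
 *  4. copies the updated header and GPE array back out to the user.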
 */
static int
sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	dk_efi_t efi;
	efi_gpt_t gpt;
	efi_gpe_t *gpe = NULL;
	size_t sgpe;
	uint64_t p_size;	/* virtual partition size from nsctl */
	uint32_t crc;
	int unparts;		/* number of parts in user's array */
	int pnum;
	int rc;

	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (pnum < 0) {
		cmn_err(CE_WARN,
		    "!sv_efi: unable to determine partition number for dev %lx",
		    svp->sv_dev);
		return (EINVAL);
	}

	if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) {
		return (EFAULT);
	}

	efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;

	if (efi.dki_length < sizeof (gpt) + sizeof (*gpe)) {
		return (EINVAL);
	}

	if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) {
		rc = EFAULT;
		goto out;
	}

	if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0)
		unparts = 1;
	else if (pnum >= unparts) {
		cmn_err(CE_WARN,
		    "!sv_efi: partition# beyond end of user array (%d >= %d)",
		    pnum, unparts);
		return (EINVAL);
	}

	sgpe = sizeof (*gpe) * unparts;
	gpe = kmem_alloc(sgpe, KM_SLEEP);

	if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) {
		rc = EFAULT;
		goto out;
	}

	p_size = svp->sv_nblocks;
	if (p_size == 0) {
		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
			p_size = (diskaddr_t)svp->sv_nblocks;
			nsc_release(svp->sv_fd);
		} else {
			rc = EINTR;
		}
	}

	gpe[pnum].efi_gpe_EndingLBA = LE_64(
	    LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1);

	gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
	CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
	gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);

	gpt.efi_gpt_HeaderCRC32 = 0;
	CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
	gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);

	if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) {
		rc = EFAULT;
		goto out;
	}

	if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) {
		rc = EFAULT;
		goto out;
	}

out:
	if (gpe) {
		kmem_free(gpe, sgpe);
	}

	return (rc);
}


/*
 * Re-write the size of the partition specified by p_partno
 *
 * Note that if a DKIOCPARTITION is issued to an fd opened against a
 * non-sv'd device, but p_partno requests the size for a different
 * device that is sv'd, this function will *not* be called as sv is
 * not interposed on the original device (the fd).
 *
 * It would not be easy to change this as we cannot get the partition
 * number for the non-sv'd device, so cannot compute the dev_t of the
 * (sv'd) p_partno device, and so cannot find out if it is sv'd or get
 * its size from nsctl.
 *
 * See also the "Bug 4755783" comment in sv_lyr_ioctl().
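 *
 * When p_partno differs from the partition that the ioctl arrived on,
 * the target dev_t is derived arithmetically from the current minor.
 * For example (a sketch only, assuming the usual one-minor-per-slice
 * numbering): an ioctl against slice 0 (pnum 0, minor 8) that asks for
 * p_partno 2 looks up minor 8 + (2 - 0) = 10 on the same major.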
 */
static int
sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	struct partition64 p64;
	sv_dev_t *nsvp = NULL;
	diskaddr_t p_size;
	minor_t nminor;
	int pnum, rc;
	dev_t ndev;

	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) {
		return (EFAULT);
	}

	if (p64.p_partno != pnum) {
		/* switch to requested partition, not the current one */
		nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
		ndev = makedevice(getmajor(svp->sv_dev), nminor);
		nsvp = sv_find_enabled(ndev, NULL);
		if (nsvp == NULL) {
			/* not sv device - just return */
			return (0);
		}

		svp = nsvp;
	}

	p_size = svp->sv_nblocks;
	if (p_size == 0) {
		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
			p_size = (diskaddr_t)svp->sv_nblocks;
			nsc_release(svp->sv_fd);
		} else {
			rc = EINTR;
		}
	}

	if (nsvp != NULL) {
		rw_exit(&nsvp->sv_lock);
	}

	if ((rc == 0) && ddi_copyout(&p_size,
	    (void *)(arg + offsetof(struct partition64, p_size)),
	    sizeof (p_size), mode) != 0) {
		return (EFAULT);
	}

	return (rc);
}
#endif /* DKIOCPARTITION */


static int
sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg,
    const int mode, cred_t *crp, int *rvalp)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int rc = 0;

	maj = 0;
	fn = 0;

	/*
	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, carry on as usual.
	 * Otherwise it was previously SV_PREVENT_UNLOAD and is now
	 * SV_ALLOW_UNLOAD, meaning the driver is expected to unload
	 * shortly, so refuse the ioctl.
	 *
	 * SV_ALLOW_UNLOAD is the final state, so there is no need to
	 * grab sv_mutex.
	 */
	if (sv_mod_status == SV_ALLOW_UNLOAD) {
		return (EBUSY);
	}

	svp = sv_find_enabled(dev, &maj);
	if (svp != NULL) {
		if (nskernd_isdaemon()) {
			/*
			 * This is nskernd, which always needs to see
			 * the underlying disk device accurately.
			 *
			 * So just pass the ioctl straight through
			 * to the underlying driver as though the device
			 * was not sv enabled.
			 */
			DTRACE_PROBE2(sv_lyr_ioctl_nskernd, sv_dev_t *, svp,
			    dev_t, dev);

			rw_exit(&svp->sv_lock);
			svp = NULL;
		} else {
			ASSERT(RW_READ_HELD(&svp->sv_lock));
		}
	}

	/*
	 * We now have a locked and enabled SV device, or a non-SV device.
	 */

	switch (cmd) {
		/*
		 * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI
		 * and DKIOCSETEFI are intercepted and faked up, as some
		 * i/o providers emulate volumes of a different size from
		 * the underlying volume.
		 *
		 * Setting the size by rewriting the vtoc is not permitted.
		 */

	case DKIOCSVTOC:
#ifdef DKIOCPARTITION
	case DKIOCSETEFI:
#endif
		if (svp == NULL) {
			/* not intercepted -- allow ioctl through */
			break;
		}

		rw_exit(&svp->sv_lock);

		DTRACE_PROBE2(sv_lyr_ioctl_svtoc, dev_t, dev, int, EPERM);

		return (EPERM);

	default:
		break;
	}

	/*
	 * Pass through the real ioctl command.
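	 *
	 * Drivers that are not D_MP are bracketed by UNSAFE_ENTER() /
	 * UNSAFE_EXIT(); if the underlying driver has no ioctl entry
	 * point registered, ENODEV is returned.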
	 */

	if (maj && (fn = maj->sm_ioctl) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			UNSAFE_ENTER();
			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
			UNSAFE_EXIT();
		} else {
			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
		}
	} else {
		rc = ENODEV;
	}

	/*
	 * Bug 4755783
	 * Fix up the size of the current partition to allow
	 * for the virtual volume to be a different size from the
	 * physical volume (e.g. for II compact dependent shadows).
	 *
	 * Note that this only attempts to fix up the current partition,
	 * i.e. the one that the ioctl was issued against. There could be
	 * other sv'd partitions in the same vtoc, but we cannot tell,
	 * so we don't attempt to fix them up.
	 */

	if (svp != NULL && rc == 0) {
		switch (cmd) {
		case DKIOCGVTOC:
			rc = sv_fix_dkiocgvtoc(arg, mode, svp);
			break;

#ifdef DKIOCPARTITION
		case DKIOCGETEFI:
			rc = sv_fix_dkiocgetefi(arg, mode, svp);
			break;

		case DKIOCPARTITION:
			rc = sv_fix_dkiocpartition(arg, mode, svp);
			break;
#endif /* DKIOCPARTITION */
		}
	}

	if (svp != NULL) {
		rw_exit(&svp->sv_lock);
	}

	return (rc);
}