/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Virtual disk server
 */


#include <sys/types.h>
#include <sys/conf.h>
#include <sys/crc32.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/file.h>
#include <sys/mdeg.h>
#include <sys/modhash.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/sysmacros.h>
#include <sys/vio_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdsk_common.h>
#include <sys/vtoc.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/scsi/impl/uscsi.h>
#include <vm/seg_map.h>

/* Virtual disk server initialization flags */
#define	VDS_LDI			0x01
#define	VDS_MDEG		0x02

/* Virtual disk server tunable parameters */
#define	VDS_RETRIES		5
#define	VDS_LDC_DELAY		1000	/* 1 msec */
#define	VDS_DEV_DELAY		10000000 /* 10 secs */
#define	VDS_NCHAINS		32

/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
#define	VDS_NAME		"virtual-disk-server"

#define	VD_NAME			"vd"
#define	VD_VOLUME_NAME		"vdisk"
#define	VD_ASCIILABEL		"Virtual Disk"

#define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
#define	VD_ID_PROP		"id"
#define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
#define	VD_REG_PROP		"reg"

/* Virtual disk initialization flags */
#define	VD_DISK_READY		0x01
#define	VD_LOCKING		0x02
#define	VD_LDC			0x04
#define	VD_DRING		0x08
#define	VD_SID			0x10
#define	VD_SEQ_NUM		0x20

/* Flags for opening/closing backing devices via LDI */
#define	VD_OPEN_FLAGS		(FEXCL | FREAD | FWRITE)

/* Flags for writing to a vdisk which is a file */
#define	VD_FILE_WRITE_FLAGS	SM_ASYNC

/* Number of backup labels */
#define	VD_FILE_NUM_BACKUP	5

/* Timeout for SCSI I/O */
#define	VD_SCSI_RDWR_TIMEOUT	30	/* 30 secs */

/*
 * By Solaris convention, slice/partition 2 represents the entire disk;
 * unfortunately, this convention does not appear to be codified.
 */
#define	VD_ENTIRE_DISK_SLICE	2

/* Return a cpp token as a string */
#define	STRINGIZE(token)	#token

/*
 * Print a message prefixed with the current function name to the message log
 * (and optionally to the console for verbose boots); these macros use cpp's
 * concatenation of string literals and C99 variable-length-argument-list
 * macros
 */
#define	PRN(...)	_PRN("?%s(): "__VA_ARGS__, "")
#define	_PRN(format, ...)					\
	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)
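
/*
 * Illustration (added comment, not in the original): a call such as
 * PRN("status %d", s) expands to
 * cmn_err(CE_CONT, "?%s(): status %d%s", __func__, s, "");
 * the leading '?' sends the message to the log only (or also to the
 * console on verbose boots), and the trailing "" satisfies the variadic
 * macro even when PRN() is given no arguments after the format string.
 */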

/* Return a pointer to the "i"th vdisk dring element */
#define	VD_DRING_ELEM(i)	((vd_dring_entry_t *)(void *)	\
	    (vd->dring + (i)*vd->descriptor_size))

/* Return the virtual disk client's type as a string (for use in messages) */
#define	VD_CLIENT(vd)							\
	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
	    (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" :	\
		(((vd)->xfer_mode == 0) ? "null client" :		\
		    "unsupported client")))

/* Read disk label from a disk on file */
#define	VD_FILE_LABEL_READ(vd, labelp) \
	vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/* Write disk label to a disk on file */
#define	VD_FILE_LABEL_WRITE(vd, labelp)	\
	vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vds instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t	vds_prop_template[] = {
	{ MDET_PROP_STR,	"name",		VDS_NAME },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};

#define	VDS_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t	vd_prop_match[] = {
	{ MDET_PROP_VAL,	VD_ID_PROP },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vd_match = {"virtual-device-port",
				    vd_prop_match};

/* Debugging macros */
#ifdef DEBUG

static int	vd_msglevel = 0;

#define	PR0 if (vd_msglevel > 0)	PRN
#define	PR1 if (vd_msglevel > 1)	PRN
#define	PR2 if (vd_msglevel > 2)	PRN

#define	VD_DUMP_DRING_ELEM(elem)					\
	PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
	    elem->hdr.dstate,						\
	    elem->payload.operation,					\
	    elem->payload.status,					\
	    elem->payload.nbytes,					\
	    elem->payload.addr,						\
	    elem->payload.ncookies);

char *
vd_decode_state(int state)
{
	char *str;

#define	CASE_STATE(_s)	case _s: str = #_s; break;

	switch (state) {
	CASE_STATE(VD_STATE_INIT)
	CASE_STATE(VD_STATE_VER)
	CASE_STATE(VD_STATE_ATTR)
	CASE_STATE(VD_STATE_DRING)
	CASE_STATE(VD_STATE_RDX)
	CASE_STATE(VD_STATE_DATA)
	default: str = "unknown"; break;
	}

#undef CASE_STATE

	return (str);
}

void
vd_decode_tag(vio_msg_t *msg)
{
	char *tstr, *sstr, *estr;

#define	CASE_TYPE(_s)	case _s: tstr = #_s; break;

	switch (msg->tag.vio_msgtype) {
	CASE_TYPE(VIO_TYPE_CTRL)
	CASE_TYPE(VIO_TYPE_DATA)
	CASE_TYPE(VIO_TYPE_ERR)
	default: tstr = "unknown"; break;
	}

#undef CASE_TYPE

#define	CASE_SUBTYPE(_s) case _s: sstr = #_s; break;

	switch (msg->tag.vio_subtype) {
	CASE_SUBTYPE(VIO_SUBTYPE_INFO)
	CASE_SUBTYPE(VIO_SUBTYPE_ACK)
	CASE_SUBTYPE(VIO_SUBTYPE_NACK)
	default: sstr = "unknown"; break;
	}

#undef CASE_SUBTYPE

#define	CASE_ENV(_s)	case _s: estr = #_s; break;

	switch (msg->tag.vio_subtype_env) {
	CASE_ENV(VIO_VER_INFO)
	CASE_ENV(VIO_ATTR_INFO)
	CASE_ENV(VIO_DRING_REG)
	CASE_ENV(VIO_DRING_UNREG)
	CASE_ENV(VIO_RDX)
	CASE_ENV(VIO_PKT_DATA)
	CASE_ENV(VIO_DESC_DATA)
	CASE_ENV(VIO_DRING_DATA)
	default: estr = "unknown"; break;
	}

#undef CASE_ENV

	PR1("(%x/%x/%x) message : (%s/%s/%s)",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, tstr, sstr, estr);
}

#else	/* !DEBUG */

#define	PR0(...)
#define	PR1(...)
#define	PR2(...)

#define	VD_DUMP_DRING_ELEM(elem)

#define	vd_decode_state(_s)	(NULL)
#define	vd_decode_tag(_s)	(NULL)

#endif	/* DEBUG */


/*
 * Soft state structure for a vds instance
 */
typedef struct vds {
	uint_t		initialized;	/* driver inst initialization flags */
	dev_info_t	*dip;		/* driver inst devinfo pointer */
	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
	mod_hash_t	*vd_table;	/* table of virtual disks served */
	mdeg_node_spec_t *ispecp;	/* mdeg node specification */
	mdeg_handle_t	mdeg;		/* handle for MDEG operations */
} vds_t;

/*
 * Types of descriptor-processing tasks
 */
typedef enum vd_task_type {
	VD_NONFINAL_RANGE_TASK,	/* task for intermediate descriptor in range */
	VD_FINAL_RANGE_TASK,	/* task for last in a range of descriptors */
} vd_task_type_t;

/*
 * Structure describing the task for processing a descriptor
 */
typedef struct vd_task {
	struct vd		*vd;		/* vd instance task is for */
	vd_task_type_t		type;		/* type of descriptor task */
	int			index;		/* dring elem index for task */
	vio_msg_t		*msg;		/* VIO message task is for */
	size_t			msglen;		/* length of message content */
	vd_dring_payload_t	*request;	/* request task will perform */
	struct buf		buf;		/* buf(9s) for I/O request */
	ldc_mem_handle_t	mhdl;		/* task memory handle */
	int			status;		/* status of processing task */
	int	(*completef)(struct vd_task *task); /* completion func ptr */
} vd_task_t;
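
/*
 * Note (added for clarity): a task's "status" field tracks the
 * processing of the task itself and holds transport-level errors,
 * while the outcome of the disk operation is reported separately in
 * request->status (see the vd_process_task() description below).
 */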

/*
 * Soft state structure for a virtual disk instance
 */
typedef struct vd {
	uint_t			initialized;	/* vdisk initialization flags */
	vds_t			*vds;		/* server for this vdisk */
	ddi_taskq_t		*startq;	/* queue for I/O start tasks */
	ddi_taskq_t		*completionq;	/* queue for completion tasks */
	ldi_handle_t		ldi_handle[V_NUMPAR];	/* LDI slice handles */
	char			device_path[MAXPATHLEN + 1]; /* vdisk device */
	dev_t			dev[V_NUMPAR];	/* dev numbers for slices */
	uint_t			nslices;	/* number of slices */
	size_t			vdisk_size;	/* number of blocks in vdisk */
	vd_disk_type_t		vdisk_type;	/* slice or entire disk */
	vd_disk_label_t		vdisk_label;	/* EFI or VTOC label */
	ushort_t		max_xfer_sz;	/* max xfer size in DEV_BSIZE */
	boolean_t		pseudo;		/* underlying pseudo dev */
	boolean_t		file;		/* underlying file */
	vnode_t			*file_vnode;	/* file vnode */
	size_t			file_size;	/* file size */
	ddi_devid_t		file_devid;	/* devid for disk image */
	struct dk_efi		dk_efi;		/* synthetic for slice type */
	struct dk_geom		dk_geom;	/* synthetic for slice type */
	struct vtoc		vtoc;		/* synthetic for slice type */
	ldc_status_t		ldc_state;	/* LDC connection state */
	ldc_handle_t		ldc_handle;	/* handle for LDC comm */
	size_t			max_msglen;	/* largest LDC message len */
	vd_state_t		state;		/* client handshake state */
	uint8_t			xfer_mode;	/* transfer mode with client */
	uint32_t		sid;		/* client's session ID */
	uint64_t		seq_num;	/* message sequence number */
	uint64_t		dring_ident;	/* identifier of dring */
	ldc_dring_handle_t	dring_handle;	/* handle for dring ops */
	uint32_t		descriptor_size;	/* num bytes in desc */
	uint32_t		dring_len;	/* number of dring elements */
	caddr_t			dring;		/* address of dring */
	caddr_t			vio_msgp;	/* vio msg staging buffer */
	vd_task_t		inband_task;	/* task for inband descriptor */
	vd_task_t		*dring_task;	/* tasks for dring elements */

	kmutex_t		lock;		/* protects variables below */
	boolean_t		enabled;	/* is vdisk enabled? */
	boolean_t		reset_state;	/* reset connection state? */
	boolean_t		reset_ldc;	/* reset LDC channel? */
} vd_t;

typedef struct vds_operation {
	char	*namep;
	uint8_t	operation;
	int	(*start)(vd_task_t *task);
	int	(*complete)(vd_task_t *task);
} vds_operation_t;

typedef struct vd_ioctl {
	uint8_t		operation;		/* vdisk operation */
	const char	*operation_name;	/* vdisk operation name */
	size_t		nbytes;			/* size of operation buffer */
	int		cmd;			/* corresponding ioctl cmd */
	const char	*cmd_name;		/* ioctl cmd name */
	void		*arg;			/* ioctl cmd argument */
	/* convert input vd_buf to output ioctl_arg */
	void		(*copyin)(void *vd_buf, void *ioctl_arg);
	/* convert input ioctl_arg to output vd_buf */
	void		(*copyout)(void *ioctl_arg, void *vd_buf);
} vd_ioctl_t;

/* Define trivial copyin/copyout conversion function flag */
#define	VD_IDENTITY	((void (*)(void *, void *))-1)


static int	vds_ldc_retries = VDS_RETRIES;
static int	vds_ldc_delay = VDS_LDC_DELAY;
static int	vds_dev_retries = VDS_RETRIES;
static int	vds_dev_delay = VDS_DEV_DELAY;
static void	*vds_state;
static uint64_t	vds_operations;	/* see vds_operation[] definition below */

static int	vd_open_flags = VD_OPEN_FLAGS;

static uint_t	vd_file_write_flags = VD_FILE_WRITE_FLAGS;

static short	vd_scsi_rdwr_timeout = VD_SCSI_RDWR_TIMEOUT;

/*
 * Supported protocol version pairs, from highest (newest) to lowest (oldest)
 *
 * Each supported major version should appear only once, paired with (and only
 * with) its highest supported minor version number (as the protocol requires
 * supporting all lower minor version numbers as well)
 */
static const vio_ver_t	vds_version[] = {{1, 0}};
static const size_t	vds_num_versions =
    sizeof (vds_version)/sizeof (vds_version[0]);

static void vd_free_dring_task(vd_t *vdp);
static int vd_setup_vd(vd_t *vd);
static boolean_t vd_enabled(vd_t *vd);

/*
 * Function:
 *	vd_file_rw
 *
 * Description:
 *	Read or write to a disk on file.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	slice		- slice on which the operation is performed,
 *			  VD_SLICE_NONE indicates that the operation
 *			  is done using an absolute disk offset.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer where data are read to or written from.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	n >= 0		- success, n indicates the number of bytes read
 *			  or written.
 *	-1		- error.
 */
static ssize_t
vd_file_rw(vd_t *vd, int slice, int operation, caddr_t data, size_t blk,
    size_t len)
{
	caddr_t	maddr;
	size_t offset, maxlen, moffset, mlen, n;
	uint_t smflags;
	enum seg_rw srw;

	ASSERT(vd->file);
	ASSERT(len > 0);

	if (slice == VD_SLICE_NONE) {
		/* raw disk access */
		offset = blk * DEV_BSIZE;
	} else {
		ASSERT(slice >= 0 && slice < V_NUMPAR);
		if (blk >= vd->vtoc.v_part[slice].p_size) {
			/* address past the end of the slice */
			PR0("req_addr (0x%lx) > psize (0x%lx)",
			    blk, vd->vtoc.v_part[slice].p_size);
			return (0);
		}

		offset = (vd->vtoc.v_part[slice].p_start + blk) * DEV_BSIZE;

		/*
		 * If the requested size is greater than the size
		 * of the partition, truncate the read/write.
		 */
		maxlen = (vd->vtoc.v_part[slice].p_size - blk) * DEV_BSIZE;

		if (len > maxlen) {
			PR0("I/O size truncated to %lu bytes from %lu bytes",
			    maxlen, len);
			len = maxlen;
		}
	}

	/*
	 * We have to ensure that we are reading/writing into the mmap
	 * range. If we have a partial disk image (e.g. an image of
	 * s0 instead of s2) the system can try to access slices that
	 * are not included in the disk image.
	 */
	if ((offset + len) >= vd->file_size) {
		PR0("offset + nbytes (0x%lx + 0x%lx) >= "
		    "file_size (0x%lx)", offset, len, vd->file_size);
		return (-1);
	}

	srw = (operation == VD_OP_BREAD)? S_READ : S_WRITE;
	smflags = (operation == VD_OP_BREAD)? 0 :
	    (SM_WRITE | vd_file_write_flags);
	n = len;

	do {
		/*
		 * segmap_getmapflt() returns a MAXBSIZE chunk which is
		 * MAXBSIZE aligned.
		 */
		moffset = offset & MAXBOFFSET;
		mlen = MIN(MAXBSIZE - moffset, n);
		maddr = segmap_getmapflt(segkmap, vd->file_vnode, offset,
		    mlen, 1, srw);
		/*
		 * Fault in the pages so we can check for error and ensure
		 * that we can safely use the mapped address.
		 */
		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
		    F_SOFTLOCK, srw) != 0) {
			(void) segmap_release(segkmap, maddr, 0);
			return (-1);
		}

		if (operation == VD_OP_BREAD)
			bcopy(maddr + moffset, data, mlen);
		else
			bcopy(data, maddr + moffset, mlen);

		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
		    F_SOFTUNLOCK, srw) != 0) {
			(void) segmap_release(segkmap, maddr, 0);
			return (-1);
		}
		if (segmap_release(segkmap, maddr, smflags) != 0)
			return (-1);
		n -= mlen;
		offset += mlen;
		data += mlen;

	} while (n > 0);

	return (len);
}
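
/*
 * Illustration (added comment, not in the original): with MAXBSIZE of
 * 8192, a 12288-byte read starting at file offset 4096 is serviced as
 * two segmap windows: the first copies 4096 bytes starting at window
 * offset 4096, the second copies the remaining 8192 bytes from window
 * offset 0.
 */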

/*
 * Function:
 *	vd_file_set_vtoc
 *
 * Description:
 *	Set the vtoc of a disk image by writing the label and backup
 *	labels into the disk image backend.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	label		- the data to be written.
 *
 * Return Code:
 *	0		- success.
 *	n > 0		- error, n indicates the errno code.
 */
static int
vd_file_set_vtoc(vd_t *vd, struct dk_label *label)
{
	int blk, sec, cyl, head, cnt;

	ASSERT(vd->file);

	if (VD_FILE_LABEL_WRITE(vd, label) < 0) {
		PR0("failed to write disk label");
		return (EIO);
	}

	/*
	 * Backup labels are on the last alternate cylinder's
	 * first five odd sectors.
	 */
	if (label->dkl_acyl == 0) {
		PR0("no alternate cylinder, cannot store backup labels");
		return (0);
	}

	cyl = label->dkl_ncyl + label->dkl_acyl - 1;
	head = label->dkl_nhead - 1;

	blk = (cyl * ((label->dkl_nhead * label->dkl_nsect) - label->dkl_apc)) +
	    (head * label->dkl_nsect);

	/*
	 * Write the backup labels. Make sure we don't try to write past
	 * the last cylinder.
	 */
	sec = 1;

	for (cnt = 0; cnt < VD_FILE_NUM_BACKUP; cnt++) {

		if (sec >= label->dkl_nsect) {
			PR0("not enough sectors to store all backup labels");
			return (0);
		}

		if (vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)label,
		    blk + sec, sizeof (struct dk_label)) < 0) {
			PR0("error writing backup label at block %d\n",
			    blk + sec);
			return (EIO);
		}

		PR1("wrote backup label at block %d\n", blk + sec);

		sec += 2;
	}

	return (0);
}

/*
 * Function:
 *	vd_file_get_devid_block
 *
 * Description:
 *	Return the block number where the device id is stored.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	blkp		- pointer to the block number
 *
 * Return Code:
 *	0		- success
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_get_devid_block(vd_t *vd, size_t *blkp)
{
	diskaddr_t spc, head, cyl;

	ASSERT(vd->file);
	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);

	/* this geometry doesn't allow us to have a devid */
	if (vd->dk_geom.dkg_acyl < 2) {
		PR0("not enough alternate cylinders available for devid "
		    "(acyl=%u)", vd->dk_geom.dkg_acyl);
		return (ENOSPC);
	}

	/* the devid is on the track next to the last cylinder */
	cyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl - 2;
	spc = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
	head = vd->dk_geom.dkg_nhead - 1;

	*blkp = (cyl * (spc - vd->dk_geom.dkg_apc)) +
	    (head * vd->dk_geom.dkg_nsect) + 1;

	return (0);
}
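
/*
 * Worked example (added comment, illustrative geometry only): with
 * ncyl=100, acyl=2, nhead=4, nsect=10 and apc=0, the devid block is
 * (100 * 40) + (3 * 10) + 1 = 4031, i.e. the second sector of the last
 * track of the first alternate cylinder.
 */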

/*
 * Return the checksum of a disk block containing an on-disk devid.
 */
static uint_t
vd_dkdevid2cksum(struct dk_devid *dkdevid)
{
	uint_t chksum, *ip;
	int i;

	chksum = 0;
	ip = (uint_t *)dkdevid;
	for (i = 0; i < ((DEV_BSIZE - sizeof (int)) / sizeof (int)); i++)
		chksum ^= ip[i];

	return (chksum);
}

/*
 * Function:
 *	vd_file_read_devid
 *
 * Description:
 *	Read the device id stored on a disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	devid		- the return address of the device ID.
 *
 * Return Code:
 *	0		- success
 *	EIO		- I/O error while trying to access the disk image
 *	EINVAL		- no valid device id was found
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_read_devid(vd_t *vd, ddi_devid_t *devid)
{
	struct dk_devid *dkdevid;
	size_t blk;
	uint_t chksum;
	int status, sz;

	if ((status = vd_file_get_devid_block(vd, &blk)) != 0)
		return (status);

	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);

	/* get the devid */
	if ((vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)dkdevid, blk,
	    DEV_BSIZE)) < 0) {
		PR0("error reading devid block at %lu", blk);
		status = EIO;
		goto done;
	}

	/* validate the revision */
	if ((dkdevid->dkd_rev_hi != DK_DEVID_REV_MSB) ||
	    (dkdevid->dkd_rev_lo != DK_DEVID_REV_LSB)) {
		PR0("invalid devid found at block %lu (bad revision)", blk);
		status = EINVAL;
		goto done;
	}

	/* compute checksum */
	chksum = vd_dkdevid2cksum(dkdevid);

	/* compare the checksums */
	if (DKD_GETCHKSUM(dkdevid) != chksum) {
		PR0("invalid devid found at block %lu (bad checksum)", blk);
		status = EINVAL;
		goto done;
	}

	/* validate the device id */
	if (ddi_devid_valid((ddi_devid_t)&dkdevid->dkd_devid) != DDI_SUCCESS) {
		PR0("invalid devid found at block %lu", blk);
		status = EINVAL;
		goto done;
	}

	PR1("devid read at block %lu", blk);

	sz = ddi_devid_sizeof((ddi_devid_t)&dkdevid->dkd_devid);
	*devid = kmem_alloc(sz, KM_SLEEP);
	bcopy(&dkdevid->dkd_devid, *devid, sz);

done:
	kmem_free(dkdevid, DEV_BSIZE);
	return (status);

}

/*
 * Function:
 *	vd_file_write_devid
 *
 * Description:
 *	Write a device id into the disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	devid		- the device ID to store.
 *
 * Return Code:
 *	0		- success
 *	EIO		- I/O error while trying to access the disk image
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_write_devid(vd_t *vd, ddi_devid_t devid)
{
	struct dk_devid *dkdevid;
	uint_t chksum;
	size_t blk;
	int status;

	if ((status = vd_file_get_devid_block(vd, &blk)) != 0)
		return (status);

	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);

	/* set revision */
	dkdevid->dkd_rev_hi = DK_DEVID_REV_MSB;
	dkdevid->dkd_rev_lo = DK_DEVID_REV_LSB;

	/* copy devid */
	bcopy(devid, &dkdevid->dkd_devid, ddi_devid_sizeof(devid));

	/* compute checksum */
	chksum = vd_dkdevid2cksum(dkdevid);

	/* set checksum */
	DKD_FORMCHKSUM(chksum, dkdevid);

	/* store the devid */
	if ((status = vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
	    (caddr_t)dkdevid, blk, DEV_BSIZE)) < 0) {
		PR0("Error writing devid block at %lu", blk);
		status = EIO;
	} else {
		PR1("devid written at block %lu", blk);
		status = 0;
	}

	kmem_free(dkdevid, DEV_BSIZE);
	return (status);
}
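
/*
 * Note (added for clarity): the on-disk dk_devid layout places the
 * checksum in the last four bytes of the DEV_BSIZE block, which is why
 * vd_dkdevid2cksum() XORs every integer of the block except the final
 * one.
 */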

/*
 * Function:
 *	vd_scsi_rdwr
 *
 * Description:
 *	Read or write to a SCSI disk using an absolute disk offset.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer where data are read to or written from.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	0		- success
 *	n != 0		- error.
 */
static int
vd_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t blk, size_t len)
{
	struct uscsi_cmd ucmd;
	union scsi_cdb cdb;
	int nsectors, nblk;
	int max_sectors;
	int status, rval;

	ASSERT(!vd->file);

	max_sectors = vd->max_xfer_sz;
	nblk = (len / DEV_BSIZE);

	if (len % DEV_BSIZE != 0)
		return (EINVAL);

	/*
	 * Build and execute the uscsi ioctl. We build a group0, group1
	 * or group4 command as necessary, since some targets
	 * do not support group1 commands.
	 */
	while (nblk) {

		bzero(&ucmd, sizeof (ucmd));
		bzero(&cdb, sizeof (cdb));

		nsectors = (max_sectors < nblk) ? max_sectors : nblk;

		if (blk < (2 << 20) && nsectors <= 0xff) {
			FORMG0ADDR(&cdb, blk);
			FORMG0COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP0;
		} else if (blk > 0xffffffff) {
			FORMG4LONGADDR(&cdb, blk);
			FORMG4COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP4;
			cdb.scc_cmd |= SCMD_GROUP4;
		} else {
			FORMG1ADDR(&cdb, blk);
			FORMG1COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP1;
			cdb.scc_cmd |= SCMD_GROUP1;
		}

		ucmd.uscsi_cdb = (caddr_t)&cdb;
		ucmd.uscsi_bufaddr = data;
		ucmd.uscsi_buflen = nsectors * DEV_BSIZE;
		ucmd.uscsi_timeout = vd_scsi_rdwr_timeout;
		/*
		 * Set flags so that the command is isolated from normal
		 * commands and no error message is printed.
		 */
		ucmd.uscsi_flags = USCSI_ISOLATE | USCSI_SILENT;

		if (operation == VD_OP_BREAD) {
			cdb.scc_cmd |= SCMD_READ;
			ucmd.uscsi_flags |= USCSI_READ;
		} else {
			cdb.scc_cmd |= SCMD_WRITE;
		}

		status = ldi_ioctl(vd->ldi_handle[VD_ENTIRE_DISK_SLICE],
		    USCSICMD, (intptr_t)&ucmd, (vd_open_flags | FKIOCTL),
		    kcred, &rval);

		if (status == 0)
			status = ucmd.uscsi_status;

		if (status != 0)
			break;

		/*
		 * Check if partial DMA breakup is required. If so, reduce
		 * the request size by half and retry the last request.
		 */
		if (ucmd.uscsi_resid == ucmd.uscsi_buflen) {
			max_sectors >>= 1;
			if (max_sectors <= 0) {
				status = EIO;
				break;
			}
			continue;
		}

		if (ucmd.uscsi_resid != 0) {
			status = EIO;
			break;
		}

		blk += nsectors;
		nblk -= nsectors;
		data += nsectors * DEV_BSIZE; /* SECSIZE */
	}

	return (status);
}
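
/*
 * Illustration (added comment): a request starting at block 0x200000
 * (2^21) no longer fits the 21-bit LBA of a Group 0 CDB, so a Group 1
 * (10-byte) command is built instead; blocks beyond 0xffffffff require
 * the 64-bit LBA of a Group 4 command.
 */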
"Read" : "Write", 934 request->nbytes, request->addr); 935 936 bioinit(buf); 937 buf->b_flags = B_BUSY; 938 buf->b_bcount = request->nbytes; 939 buf->b_lblkno = request->addr; 940 buf->b_edev = (slice == VD_SLICE_NONE)? NODEV : vd->dev[slice]; 941 942 mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP; 943 944 /* Map memory exported by client */ 945 status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies, 946 mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R, 947 &(buf->b_un.b_addr), NULL); 948 if (status != 0) { 949 PR0("ldc_mem_map() returned err %d ", status); 950 biofini(buf); 951 return (EIO); 952 } 953 954 status = ldc_mem_acquire(task->mhdl, 0, buf->b_bcount); 955 if (status != 0) { 956 (void) ldc_mem_unmap(task->mhdl); 957 PR0("ldc_mem_acquire() returned err %d ", status); 958 biofini(buf); 959 return (EIO); 960 } 961 962 buf->b_flags |= (request->operation == VD_OP_BREAD) ? B_READ : B_WRITE; 963 964 /* Start the block I/O */ 965 if (vd->file) { 966 rv = vd_file_rw(vd, slice, request->operation, buf->b_un.b_addr, 967 request->addr, request->nbytes); 968 if (rv < 0) { 969 request->nbytes = 0; 970 request->status = EIO; 971 } else { 972 request->nbytes = rv; 973 request->status = 0; 974 } 975 } else { 976 if (slice == VD_SLICE_NONE) { 977 /* 978 * This is not a disk image so it is a real disk. We 979 * assume that the underlying device driver supports 980 * USCSICMD ioctls. This is the case of all SCSI devices 981 * (sd, ssd...). 982 * 983 * In the future if we have non-SCSI disks we would need 984 * to invoke the appropriate function to do I/O using an 985 * absolute disk offset (for example using DKIOCTL_RWCMD 986 * for IDE disks). 987 */ 988 rv = vd_scsi_rdwr(vd, request->operation, 989 buf->b_un.b_addr, request->addr, request->nbytes); 990 if (rv != 0) { 991 request->nbytes = 0; 992 request->status = EIO; 993 } else { 994 request->status = 0; 995 } 996 } else { 997 request->status = 998 ldi_strategy(vd->ldi_handle[slice], buf); 999 1000 /* 1001 * This is to indicate to the caller that the request 1002 * needs to be finished by vd_complete_bio() by calling 1003 * biowait() there and waiting for that to return before 1004 * triggering the notification of the vDisk client. 1005 * 1006 * This is necessary when writing to real disks as 1007 * otherwise calls to ldi_strategy() would be serialized 1008 * behind the calls to biowait() and performance would 1009 * suffer. 1010 */ 1011 if (request->status == 0) 1012 return (EINPROGRESS); 1013 } 1014 } 1015 1016 /* Clean up after error */ 1017 rv = ldc_mem_release(task->mhdl, 0, buf->b_bcount); 1018 if (rv) { 1019 PR0("ldc_mem_release() returned err %d ", rv); 1020 status = EIO; 1021 } 1022 rv = ldc_mem_unmap(task->mhdl); 1023 if (rv) { 1024 PR0("ldc_mem_unmap() returned err %d ", rv); 1025 status = EIO; 1026 } 1027 1028 biofini(buf); 1029 1030 return (status); 1031 } 1032 1033 /* 1034 * This function should only be called from vd_notify to ensure that requests 1035 * are responded to in the order that they are received. 

/*
 * This function should only be called from vd_notify to ensure that requests
 * are responded to in the order that they are received.
 */
static int
send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
{
	int	status;
	size_t	nbytes;

	do {
		nbytes = msglen;
		status = ldc_write(ldc_handle, msg, &nbytes);
		if (status != EWOULDBLOCK)
			break;
		drv_usecwait(vds_ldc_delay);
	} while (status == EWOULDBLOCK);

	if (status != 0) {
		if (status != ECONNRESET)
			PR0("ldc_write() returned errno %d", status);
		return (status);
	} else if (nbytes != msglen) {
		PR0("ldc_write() performed only partial write");
		return (EIO);
	}

	PR1("SENT %lu bytes", msglen);
	return (0);
}

static void
vd_need_reset(vd_t *vd, boolean_t reset_ldc)
{
	mutex_enter(&vd->lock);
	vd->reset_state	= B_TRUE;
	vd->reset_ldc	= reset_ldc;
	mutex_exit(&vd->lock);
}

/*
 * Reset the state of the connection with a client, if needed; reset the LDC
 * transport as well, if needed. This function should only be called from
 * vd_recv_msg(), as it waits for tasks to complete; calling it from elsewhere
 * can cause a deadlock.
 */
static void
vd_reset_if_needed(vd_t *vd)
{
	int	status = 0;

	mutex_enter(&vd->lock);
	if (!vd->reset_state) {
		ASSERT(!vd->reset_ldc);
		mutex_exit(&vd->lock);
		return;
	}
	mutex_exit(&vd->lock);

	PR0("Resetting connection state with %s", VD_CLIENT(vd));

	/*
	 * Let any asynchronous I/O complete before possibly pulling the rug
	 * out from under it; defer checking vd->reset_ldc, as one of the
	 * asynchronous tasks might set it
	 */
	ddi_taskq_wait(vd->completionq);

	if (vd->file) {
		status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred);
		if (status) {
			PR0("VOP_FSYNC returned errno %d", status);
		}
	}

	if ((vd->initialized & VD_DRING) &&
	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
		PR0("ldc_mem_dring_unmap() returned errno %d", status);

	vd_free_dring_task(vd);

	/* Free the staging buffer for msgs */
	if (vd->vio_msgp != NULL) {
		kmem_free(vd->vio_msgp, vd->max_msglen);
		vd->vio_msgp = NULL;
	}

	/* Free the inband message buffer */
	if (vd->inband_task.msg != NULL) {
		kmem_free(vd->inband_task.msg, vd->max_msglen);
		vd->inband_task.msg = NULL;
	}

	mutex_enter(&vd->lock);

	if (vd->reset_ldc)
		PR0("taking down LDC channel");
	if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0))
		PR0("ldc_down() returned errno %d", status);

	vd->initialized	&= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
	vd->state	= VD_STATE_INIT;
	vd->max_msglen	= sizeof (vio_msg_t);	/* baseline vio message size */

	/* Allocate the staging buffer */
	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);

	PR0("calling ldc_up\n");
	(void) ldc_up(vd->ldc_handle);

	vd->reset_state	= B_FALSE;
	vd->reset_ldc	= B_FALSE;

	mutex_exit(&vd->lock);
}

static void vd_recv_msg(void *arg);

static void
vd_mark_in_reset(vd_t *vd)
{
	int status;

	PR0("vd_mark_in_reset: marking vd in reset\n");

	vd_need_reset(vd, B_FALSE);
	status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP);
	if (status == DDI_FAILURE) {
		PR0("cannot schedule task to recv msg\n");
		vd_need_reset(vd, B_TRUE);
		return;
	}
}

static int
vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes)
{
	boolean_t		accepted;
	int			status;
	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);

	if (vd->reset_state)
		return (0);

	/* Acquire the element */
	if (!vd->reset_state &&
	    (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
			return (0);
		} else {
			PR0("ldc_mem_dring_acquire() returned errno %d",
			    status);
			return (status);
		}
	}

	/* Set the element's status and mark it done */
	accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED);
	if (accepted) {
		elem->payload.nbytes	= elem_nbytes;
		elem->payload.status	= elem_status;
		elem->hdr.dstate	= VIO_DESC_DONE;
	} else {
		/* Perhaps client timed out waiting for I/O... */
		PR0("element %u no longer \"accepted\"", idx);
		VD_DUMP_DRING_ELEM(elem);
	}
	/* Release the element */
	if (!vd->reset_state &&
	    (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
			return (0);
		} else {
			PR0("ldc_mem_dring_release() returned errno %d",
			    status);
			return (status);
		}
	}

	return (accepted ? 0 : EINVAL);
}

/*
 * Return Values
 *	0	- operation completed successfully
 *	EIO	- encountered LDC / task error
 *
 * Side Effect
 *	sets request->status = <disk operation status>
 */
static int
vd_complete_bio(vd_task_t *task)
{
	int			status		= 0;
	int			rv		= 0;
	vd_t			*vd		= task->vd;
	vd_dring_payload_t	*request	= task->request;
	struct buf		*buf		= &task->buf;


	ASSERT(vd != NULL);
	ASSERT(request != NULL);
	ASSERT(task->msg != NULL);
	ASSERT(task->msglen >= sizeof (*task->msg));
	ASSERT(!vd->file);
	ASSERT(request->slice != VD_SLICE_NONE);

	/* Wait for the I/O to complete [ call to ldi_strategy(9f) ] */
	request->status = biowait(buf);

	/* return back the number of bytes read/written */
	request->nbytes = buf->b_bcount - buf->b_resid;

	/* Release the buffer */
	if (!vd->reset_state)
		status = ldc_mem_release(task->mhdl, 0, buf->b_bcount);
	if (status) {
		PR0("ldc_mem_release() returned errno %d copying to "
		    "client", status);
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
		}
		rv = EIO;
	}

	/* Unmap the memory, even if in reset */
	status = ldc_mem_unmap(task->mhdl);
	if (status) {
		PR0("ldc_mem_unmap() returned errno %d copying to client",
		    status);
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
		}
		rv = EIO;
	}

	biofini(buf);

	return (rv);
}
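
/*
 * Example (added comment, illustrative numbers): for a 16384-byte write
 * where the device transfers only 8192 bytes, b_resid is 8192 and the
 * client is told nbytes = 16384 - 8192 = 8192.
 */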

/*
 * Description:
 *	This function is called by the two functions called by a taskq
 *	[ vd_complete_notify() and vd_serial_notify() ] to send the
 *	message to the client.
 *
 * Parameters:
 *	task	- task whose message is to be sent to the client
 *
 * Return Values
 *	None
 */
static void
vd_notify(vd_task_t *task)
{
	int	status;

	ASSERT(task != NULL);
	ASSERT(task->vd != NULL);

	if (task->vd->reset_state)
		return;

	/*
	 * Send the "ack" or "nack" back to the client; if sending the message
	 * via LDC fails, arrange to reset both the connection state and LDC
	 * itself
	 */
	PR2("Sending %s",
	    (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");

	status = send_msg(task->vd->ldc_handle, task->msg, task->msglen);
	switch (status) {
	case 0:
		break;
	case ECONNRESET:
		vd_mark_in_reset(task->vd);
		break;
	default:
		PR0("initiating full reset");
		vd_need_reset(task->vd, B_TRUE);
		break;
	}

	DTRACE_PROBE1(task__end, vd_task_t *, task);
}

/*
 * Description:
 *	Mark the Dring entry as Done and (if necessary) send an ACK/NACK to
 *	the vDisk client
 *
 * Parameters:
 *	task		- structure containing the request sent from client
 *
 * Return Values
 *	None
 */
static void
vd_complete_notify(vd_task_t *task)
{
	int			status		= 0;
	vd_t			*vd		= task->vd;
	vd_dring_payload_t	*request	= task->request;

	/* Update the dring element for a dring client */
	if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE)) {
		status = vd_mark_elem_done(vd, task->index,
		    request->status, request->nbytes);
		if (status == ECONNRESET)
			vd_mark_in_reset(vd);
	}

	/*
	 * If a transport error occurred while marking the element done or
	 * previously while executing the task, arrange to "nack" the message
	 * when the final task in the descriptor element range completes
	 */
	if ((status != 0) || (task->status != 0))
		task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK;

	/*
	 * Only the final task for a range of elements will respond to and
	 * free the message
	 */
	if (task->type == VD_NONFINAL_RANGE_TASK) {
		return;
	}

	vd_notify(task);
}
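
/*
 * Illustration (added comment): for a range of three accepted
 * descriptors, the first two tasks are VD_NONFINAL_RANGE_TASK and
 * return above without notifying; only the VD_FINAL_RANGE_TASK for the
 * last descriptor sends the ACK (or NACK) for the whole range.
 */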

/*
 * Description:
 *	This is the basic completion function called to handle inband data
 *	requests and handshake messages. All it needs to do is trigger a
 *	message to the client that the request is completed.
 *
 * Parameters:
 *	arg	- opaque pointer to structure containing task to be completed
 *
 * Return Values
 *	None
 */
static void
vd_serial_notify(void *arg)
{
	vd_task_t		*task = (vd_task_t *)arg;

	ASSERT(task != NULL);
	vd_notify(task);
}

static void
vd_geom2dk_geom(void *vd_buf, void *ioctl_arg)
{
	VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg);
}

static void
vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg)
{
	VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg);
}

static void
dk_geom2vd_geom(void *ioctl_arg, void *vd_buf)
{
	DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf);
}

static void
vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf)
{
	VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf);
}

static void
vd_get_efi_in(void *vd_buf, void *ioctl_arg)
{
	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;

	dk_efi->dki_lba = vd_efi->lba;
	dk_efi->dki_length = vd_efi->length;
	dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP);
}

static void
vd_get_efi_out(void *ioctl_arg, void *vd_buf)
{
	int len;
	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;

	len = vd_efi->length;
	DK_EFI2VD_EFI(dk_efi, vd_efi);
	kmem_free(dk_efi->dki_data, len);
}

static void
vd_set_efi_in(void *vd_buf, void *ioctl_arg)
{
	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;

	dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP);
	VD_EFI2DK_EFI(vd_efi, dk_efi);
}

static void
vd_set_efi_out(void *ioctl_arg, void *vd_buf)
{
	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;

	kmem_free(dk_efi->dki_data, vd_efi->length);
}

static int
vd_read_vtoc(ldi_handle_t handle, struct vtoc *vtoc, vd_disk_label_t *label)
{
	int status, rval;
	struct dk_gpt *efi;
	size_t efi_len;

	*label = VD_DISK_LABEL_UNK;

	status = ldi_ioctl(handle, DKIOCGVTOC, (intptr_t)vtoc,
	    (vd_open_flags | FKIOCTL), kcred, &rval);

	if (status == 0) {
		*label = VD_DISK_LABEL_VTOC;
		return (0);
	} else if (status != ENOTSUP) {
		PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status);
		return (status);
	}

	status = vds_efi_alloc_and_read(handle, &efi, &efi_len);

	if (status) {
		PR0("vds_efi_alloc_and_read returned error %d", status);
		return (status);
	}

	*label = VD_DISK_LABEL_EFI;
	vd_efi_to_vtoc(efi, vtoc);
	vd_efi_free(efi, efi_len);

	return (0);
}

static ushort_t
vd_lbl2cksum(struct dk_label *label)
{
	int	count;
	ushort_t sum, *sp;

	count =	(sizeof (struct dk_label)) / (sizeof (short)) - 1;
	sp = (ushort_t *)label;
	sum = 0;
	while (count--) {
		sum ^= *sp++;
	}

	return (sum);
}
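
/*
 * Note (added for clarity): the label checksum is the XOR of all 16-bit
 * words of the label except the last one, which is where the checksum
 * itself (dkl_cksum) is stored; hence the "- 1" in the count above.
 */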

/*
 * Handle ioctls to a disk slice.
 *
 * Return Values
 *	0	- Indicates that there are no errors in disk operations
 *	ENOTSUP	- Unknown disk label type or unsupported DKIO ioctl
 *	EINVAL	- Not enough room to copy the EFI label
 *
 */
static int
vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
{
	dk_efi_t *dk_ioc;

	switch (vd->vdisk_label) {

	/* ioctls for a slice from a disk with a VTOC label */
	case VD_DISK_LABEL_VTOC:

		switch (cmd) {
		case DKIOCGGEOM:
			ASSERT(ioctl_arg != NULL);
			bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom));
			return (0);
		case DKIOCGVTOC:
			ASSERT(ioctl_arg != NULL);
			bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc));
			return (0);
		default:
			return (ENOTSUP);
		}

	/* ioctls for a slice from a disk with an EFI label */
	case VD_DISK_LABEL_EFI:

		switch (cmd) {
		case DKIOCGETEFI:
			ASSERT(ioctl_arg != NULL);
			dk_ioc = (dk_efi_t *)ioctl_arg;
			if (dk_ioc->dki_length < vd->dk_efi.dki_length)
				return (EINVAL);
			bcopy(vd->dk_efi.dki_data, dk_ioc->dki_data,
			    vd->dk_efi.dki_length);
			return (0);
		default:
			return (ENOTSUP);
		}

	default:
		/* Unknown disk label type */
		return (ENOTSUP);
	}
}

/*
 * Handle ioctls to a disk image (file-based).
 *
 * Return Values
 *	0	- Indicates that there are no errors
 *	!= 0	- Disk operation returned an error
 */
static int
vd_do_file_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
{
	struct dk_label label;
	struct dk_geom *geom;
	struct vtoc *vtoc;
	int i, rc;

	ASSERT(vd->file);
	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);

	switch (cmd) {

	case DKIOCGGEOM:
		ASSERT(ioctl_arg != NULL);
		geom = (struct dk_geom *)ioctl_arg;

		if (VD_FILE_LABEL_READ(vd, &label) < 0)
			return (EIO);

		if (label.dkl_magic != DKL_MAGIC ||
		    label.dkl_cksum != vd_lbl2cksum(&label))
			return (EINVAL);

		bzero(geom, sizeof (struct dk_geom));
		geom->dkg_ncyl = label.dkl_ncyl;
		geom->dkg_acyl = label.dkl_acyl;
		geom->dkg_nhead = label.dkl_nhead;
		geom->dkg_nsect = label.dkl_nsect;
		geom->dkg_intrlv = label.dkl_intrlv;
		geom->dkg_apc = label.dkl_apc;
		geom->dkg_rpm = label.dkl_rpm;
		geom->dkg_pcyl = label.dkl_pcyl;
		geom->dkg_write_reinstruct = label.dkl_write_reinstruct;
		geom->dkg_read_reinstruct = label.dkl_read_reinstruct;

		return (0);

	case DKIOCGVTOC:
		ASSERT(ioctl_arg != NULL);
		vtoc = (struct vtoc *)ioctl_arg;

		if (VD_FILE_LABEL_READ(vd, &label) < 0)
			return (EIO);

		if (label.dkl_magic != DKL_MAGIC ||
		    label.dkl_cksum != vd_lbl2cksum(&label))
			return (EINVAL);

		bzero(vtoc, sizeof (struct vtoc));

		vtoc->v_sanity = label.dkl_vtoc.v_sanity;
		vtoc->v_version = label.dkl_vtoc.v_version;
		vtoc->v_sectorsz = DEV_BSIZE;
		vtoc->v_nparts = label.dkl_vtoc.v_nparts;

		for (i = 0; i < vtoc->v_nparts; i++) {
			vtoc->v_part[i].p_tag =
			    label.dkl_vtoc.v_part[i].p_tag;
			vtoc->v_part[i].p_flag =
			    label.dkl_vtoc.v_part[i].p_flag;
			vtoc->v_part[i].p_start =
			    label.dkl_map[i].dkl_cylno *
			    (label.dkl_nhead * label.dkl_nsect);
			vtoc->v_part[i].p_size = label.dkl_map[i].dkl_nblk;
			vtoc->timestamp[i] =
			    label.dkl_vtoc.v_timestamp[i];
		}
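		/*
		 * Illustration (added comment): with nhead = 4 and
		 * nsect = 10, a partition whose dkl_cylno is 5 starts
		 * at absolute block 5 * (4 * 10) = 200.
		 */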
		/*
		 * The bootinfo array cannot be copied with bcopy() because
		 * elements are of type long in vtoc (so 64-bit) and of type
		 * int in dk_vtoc (so 32-bit).
		 */
		vtoc->v_bootinfo[0] = label.dkl_vtoc.v_bootinfo[0];
		vtoc->v_bootinfo[1] = label.dkl_vtoc.v_bootinfo[1];
		vtoc->v_bootinfo[2] = label.dkl_vtoc.v_bootinfo[2];
		bcopy(label.dkl_asciilabel, vtoc->v_asciilabel,
		    LEN_DKL_ASCII);
		bcopy(label.dkl_vtoc.v_volume, vtoc->v_volume,
		    LEN_DKL_VVOL);

		return (0);

	case DKIOCSGEOM:
		ASSERT(ioctl_arg != NULL);
		geom = (struct dk_geom *)ioctl_arg;

		if (geom->dkg_nhead == 0 || geom->dkg_nsect == 0)
			return (EINVAL);

		/*
		 * The current device geometry is not updated, just the driver
		 * "notion" of it. The device geometry will be effectively
		 * updated when a label is written to the device during a
		 * subsequent DKIOCSVTOC.
		 */
		bcopy(ioctl_arg, &vd->dk_geom, sizeof (vd->dk_geom));
		return (0);

	case DKIOCSVTOC:
		ASSERT(ioctl_arg != NULL);
		ASSERT(vd->dk_geom.dkg_nhead != 0 &&
		    vd->dk_geom.dkg_nsect != 0);
		vtoc = (struct vtoc *)ioctl_arg;

		if (vtoc->v_sanity != VTOC_SANE ||
		    vtoc->v_sectorsz != DEV_BSIZE ||
		    vtoc->v_nparts != V_NUMPAR)
			return (EINVAL);

		bzero(&label, sizeof (label));
		label.dkl_ncyl = vd->dk_geom.dkg_ncyl;
		label.dkl_acyl = vd->dk_geom.dkg_acyl;
		label.dkl_pcyl = vd->dk_geom.dkg_pcyl;
		label.dkl_nhead = vd->dk_geom.dkg_nhead;
		label.dkl_nsect = vd->dk_geom.dkg_nsect;
		label.dkl_intrlv = vd->dk_geom.dkg_intrlv;
		label.dkl_apc = vd->dk_geom.dkg_apc;
		label.dkl_rpm = vd->dk_geom.dkg_rpm;
		label.dkl_write_reinstruct = vd->dk_geom.dkg_write_reinstruct;
		label.dkl_read_reinstruct = vd->dk_geom.dkg_read_reinstruct;

		label.dkl_vtoc.v_nparts = V_NUMPAR;
		label.dkl_vtoc.v_sanity = VTOC_SANE;
		label.dkl_vtoc.v_version = vtoc->v_version;
		for (i = 0; i < V_NUMPAR; i++) {
			label.dkl_vtoc.v_timestamp[i] =
			    vtoc->timestamp[i];
			label.dkl_vtoc.v_part[i].p_tag =
			    vtoc->v_part[i].p_tag;
			label.dkl_vtoc.v_part[i].p_flag =
			    vtoc->v_part[i].p_flag;
			label.dkl_map[i].dkl_cylno =
			    vtoc->v_part[i].p_start /
			    (label.dkl_nhead * label.dkl_nsect);
			label.dkl_map[i].dkl_nblk =
			    vtoc->v_part[i].p_size;
		}
		/*
		 * The bootinfo array cannot be copied with bcopy() because
		 * elements are of type long in vtoc (so 64-bit) and of type
		 * int in dk_vtoc (so 32-bit).
		 */
		label.dkl_vtoc.v_bootinfo[0] = vtoc->v_bootinfo[0];
		label.dkl_vtoc.v_bootinfo[1] = vtoc->v_bootinfo[1];
		label.dkl_vtoc.v_bootinfo[2] = vtoc->v_bootinfo[2];
		bcopy(vtoc->v_asciilabel, label.dkl_asciilabel,
		    LEN_DKL_ASCII);
		bcopy(vtoc->v_volume, label.dkl_vtoc.v_volume,
		    LEN_DKL_VVOL);

		/* re-compute checksum */
		label.dkl_magic = DKL_MAGIC;
		label.dkl_cksum = vd_lbl2cksum(&label);

		/* write label to the disk image */
		if ((rc = vd_file_set_vtoc(vd, &label)) != 0)
			return (rc);

		/* update the cached vdisk VTOC */
		bcopy(vtoc, &vd->vtoc, sizeof (vd->vtoc));

		/*
		 * The disk geometry may have changed, so we need to write
		 * the devid (if there is one) so that it is stored at the
		 * right location.
		 */
		if (vd->file_devid != NULL &&
		    vd_file_write_devid(vd, vd->file_devid) != 0) {
			PR0("Failed to write devid");
		}

		return (0);

	default:
		return (ENOTSUP);
	}
}

/*
 * Description:
 *	This is the function that processes the ioctl requests (farming it
 *	out to functions that handle slices, files or whole disks)
 *
 * Return Values
 *	0	- ioctl operation completed successfully
 *	!= 0	- The LDC error value encountered
 *		  (propagated back up the call stack as a task error)
 *
 * Side Effect
 *	sets request->status to the return value of the ioctl function.
 */
static int
vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl)
{
	int	rval = 0, status = 0;
	size_t	nbytes = request->nbytes;	/* modifiable copy */


	ASSERT(request->slice < vd->nslices);
	PR0("Performing %s", ioctl->operation_name);

	/* Get data from client and convert, if necessary */
	if (ioctl->copyin != NULL) {
		ASSERT(nbytes != 0 && buf != NULL);
		PR1("Getting \"arg\" data from client");
		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
		    request->cookie, request->ncookies,
		    LDC_COPY_IN)) != 0) {
			PR0("ldc_mem_copy() returned errno %d "
			    "copying from client", status);
			return (status);
		}

		/* Convert client's data, if necessary */
		if (ioctl->copyin == VD_IDENTITY)	/* use client buffer */
			ioctl->arg = buf;
		else	/* convert client vdisk operation data to ioctl data */
			(ioctl->copyin)(buf, (void *)ioctl->arg);
	}

	/*
	 * Handle single-slice block devices internally; otherwise, have the
	 * real driver perform the ioctl()
	 */
	if (vd->file) {
		request->status =
		    vd_do_file_ioctl(vd, ioctl->cmd, (void *)ioctl->arg);

	} else if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) {
		request->status =
		    vd_do_slice_ioctl(vd, ioctl->cmd, (void *)ioctl->arg);

	} else {
		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
		    ioctl->cmd, (intptr_t)ioctl->arg, (vd_open_flags | FKIOCTL),
		    kcred, &rval);

#ifdef DEBUG
		if (rval != 0) {
			PR0("%s set rval = %d, which is not being returned to"
			    " client", ioctl->cmd_name, rval);
		}
#endif /* DEBUG */
	}

	if (request->status != 0) {
		PR0("ioctl(%s) = errno %d", ioctl->cmd_name, request->status);
		return (0);
	}

	/* Convert data and send to client, if necessary */
	if (ioctl->copyout != NULL) {
		ASSERT(nbytes != 0 && buf != NULL);
		PR1("Sending \"arg\" data to client");

		/* Convert ioctl data to vdisk operation data, if necessary */
		if (ioctl->copyout != VD_IDENTITY)
			(ioctl->copyout)((void *)ioctl->arg, buf);

		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
		    request->cookie, request->ncookies,
		    LDC_COPY_OUT)) != 0) {
			PR0("ldc_mem_copy() returned errno %d "
			    "copying to client", status);
			return (status);
		}
	}

	return (status);
}

#define	RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t))
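
/*
 * Example (added comment): RNDSIZE(int) rounds sizeof (int) up to
 * 8 bytes, so even a 4-byte write-cache-enable flag is transferred as
 * an 8-byte buffer, satisfying the LDC 8-byte alignment requirement.
 */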

/*
 * Description:
 *	This generic function is called by the task queue to complete
 *	the processing of the tasks. The specific completion function
 *	is passed in as a field in the task pointer.
 *
 * Parameters:
 *	arg	- opaque pointer to structure containing task to be completed
 *
 * Return Values
 *	None
 */
static void
vd_complete(void *arg)
{
	vd_task_t	*task = (vd_task_t *)arg;

	ASSERT(task != NULL);
	ASSERT(task->status == EINPROGRESS);
	ASSERT(task->completef != NULL);

	task->status = task->completef(task);
	if (task->status)
		PR0("%s: Error %d completing task", __func__, task->status);

	/* Now notify the vDisk client */
	vd_complete_notify(task);
}

static int
vd_ioctl(vd_task_t *task)
{
	int			i, status;
	void			*buf = NULL;
	struct dk_geom		dk_geom = {0};
	struct vtoc		vtoc = {0};
	struct dk_efi		dk_efi = {0};
	vd_t			*vd		= task->vd;
	vd_dring_payload_t	*request	= task->request;
	vd_ioctl_t		ioctl[] = {
		/* Command (no-copy) operations */
		{VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0,
		    DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE),
		    NULL, NULL, NULL},

		/* "Get" (copy-out) operations */
		{VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int),
		    DKIOCGETWCE, STRINGIZE(DKIOCGETWCE),
		    NULL, VD_IDENTITY, VD_IDENTITY},
		{VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM),
		    RNDSIZE(vd_geom_t),
		    DKIOCGGEOM, STRINGIZE(DKIOCGGEOM),
		    &dk_geom, NULL, dk_geom2vd_geom},
		{VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t),
		    DKIOCGVTOC, STRINGIZE(DKIOCGVTOC),
		    &vtoc, NULL, vtoc2vd_vtoc},
		{VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t),
		    DKIOCGETEFI, STRINGIZE(DKIOCGETEFI),
		    &dk_efi, vd_get_efi_in, vd_get_efi_out},

		/* "Set" (copy-in) operations */
		{VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int),
		    DKIOCSETWCE, STRINGIZE(DKIOCSETWCE),
		    NULL, VD_IDENTITY, VD_IDENTITY},
		{VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM),
		    RNDSIZE(vd_geom_t),
		    DKIOCSGEOM, STRINGIZE(DKIOCSGEOM),
		    &dk_geom, vd_geom2dk_geom, NULL},
		{VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t),
		    DKIOCSVTOC, STRINGIZE(DKIOCSVTOC),
		    &vtoc, vd_vtoc2vtoc, NULL},
		{VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t),
		    DKIOCSETEFI, STRINGIZE(DKIOCSETEFI),
		    &dk_efi, vd_set_efi_in, vd_set_efi_out},
	};
	size_t		nioctls = (sizeof (ioctl))/(sizeof (ioctl[0]));


	ASSERT(vd != NULL);
	ASSERT(request != NULL);
	ASSERT(request->slice < vd->nslices);

	/*
	 * Determine ioctl corresponding to caller's "operation" and
	 * validate caller's "nbytes"
	 */
	for (i = 0; i < nioctls; i++) {
		if (request->operation == ioctl[i].operation) {
			/* LDC memory operations require 8-byte multiples */
			ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0);

			if (request->operation == VD_OP_GET_EFI ||
			    request->operation == VD_OP_SET_EFI) {
				if (request->nbytes >= ioctl[i].nbytes)
					break;
				PR0("%s: Expected at least nbytes = %lu, "
				    "got %lu", ioctl[i].operation_name,
				    ioctl[i].nbytes, request->nbytes);
				return (EINVAL);
			}

			if (request->nbytes != ioctl[i].nbytes) {
				PR0("%s: Expected nbytes = %lu, got %lu",
				    ioctl[i].operation_name, ioctl[i].nbytes,
				    request->nbytes);
				return (EINVAL);
			}

			break;
		}
	}
	ASSERT(i < nioctls);	/* because "operation" already validated */

	if (request->nbytes)
		buf = kmem_zalloc(request->nbytes, KM_SLEEP);
	status = vd_do_ioctl(vd, request, buf, &ioctl[i]);
	if (request->nbytes)
		kmem_free(buf, request->nbytes);

	return (status);
}
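
/*
 * Note (added for clarity): fixed-size operations such as
 * VD_OP_GET_VTOC must specify nbytes exactly equal to
 * RNDSIZE(vd_vtoc_t), whereas VD_OP_GET_EFI/VD_OP_SET_EFI only require
 * nbytes to be at least RNDSIZE(vd_efi_t), since EFI data is
 * variable-length.
 */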

static int
vd_get_devid(vd_task_t *task)
{
	vd_t *vd = task->vd;
	vd_dring_payload_t *request = task->request;
	vd_devid_t *vd_devid;
	impl_devid_t *devid;
	int status, bufid_len, devid_len, len, sz;
	int bufbytes;

	PR1("Get Device ID, nbytes=%ld", request->nbytes);

	if (vd->file) {
		if (vd->file_devid == NULL) {
			PR2("No Device ID");
			request->status = ENOENT;
			return (0);
		} else {
			sz = ddi_devid_sizeof(vd->file_devid);
			devid = kmem_alloc(sz, KM_SLEEP);
			bcopy(vd->file_devid, devid, sz);
		}
	} else {
		if (ddi_lyr_get_devid(vd->dev[request->slice],
		    (ddi_devid_t *)&devid) != DDI_SUCCESS) {
			PR2("No Device ID");
			request->status = ENOENT;
			return (0);
		}
	}

	bufid_len = request->nbytes - sizeof (vd_devid_t) + 1;
	devid_len = DEVID_GETLEN(devid);

	/*
	 * Save the buffer size here for use in deallocation.
	 * The actual number of bytes copied is returned in
	 * the 'nbytes' field of the request structure.
	 */
	bufbytes = request->nbytes;

	vd_devid = kmem_zalloc(bufbytes, KM_SLEEP);
	vd_devid->length = devid_len;
	vd_devid->type = DEVID_GETTYPE(devid);

	len = (devid_len > bufid_len)? bufid_len : devid_len;

	bcopy(devid->did_id, vd_devid->id, len);

	/* LDC memory operations require 8-byte multiples */
	ASSERT(request->nbytes % sizeof (uint64_t) == 0);

	if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0,
	    &request->nbytes, request->cookie, request->ncookies,
	    LDC_COPY_OUT)) != 0) {
		PR0("ldc_mem_copy() returned errno %d copying to client",
		    status);
	}
	PR1("post mem_copy: nbytes=%ld", request->nbytes);

	kmem_free(vd_devid, bufbytes);
	ddi_devid_free((ddi_devid_t)devid);

	return (status);
}

/*
 * Define the supported operations once the functions for performing them have
 * been defined
 */
static const vds_operation_t	vds_operation[] = {
#define	X(_s)	#_s, _s
	{X(VD_OP_BREAD),	vd_start_bio,	vd_complete_bio},
	{X(VD_OP_BWRITE),	vd_start_bio,	vd_complete_bio},
	{X(VD_OP_FLUSH),	vd_ioctl,	NULL},
	{X(VD_OP_GET_WCE),	vd_ioctl,	NULL},
	{X(VD_OP_SET_WCE),	vd_ioctl,	NULL},
	{X(VD_OP_GET_VTOC),	vd_ioctl,	NULL},
	{X(VD_OP_SET_VTOC),	vd_ioctl,	NULL},
	{X(VD_OP_GET_DISKGEOM),	vd_ioctl,	NULL},
	{X(VD_OP_SET_DISKGEOM),	vd_ioctl,	NULL},
	{X(VD_OP_GET_EFI),	vd_ioctl,	NULL},
	{X(VD_OP_SET_EFI),	vd_ioctl,	NULL},
	{X(VD_OP_GET_DEVID),	vd_get_devid,	NULL},
#undef	X
};

static const size_t	vds_noperations =
    (sizeof (vds_operation))/(sizeof (vds_operation[0]));
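
/*
 * Illustration (added comment): per the table above, a VD_OP_BREAD
 * request is started by vd_start_bio() and, when serviced via
 * ldi_strategy(), finished asynchronously by vd_complete_bio(); the
 * ioctl-based operations have no completion function (NULL) because
 * they complete synchronously in their start function.
 */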
vds_noperations; i++) {
2075 		if (request->operation == vds_operation[i].operation) {
2076 			/* all operations should have a start func */
2077 			ASSERT(vds_operation[i].start != NULL);
2078 
2079 			task->completef = vds_operation[i].complete;
2080 			break;
2081 		}
2082 	}
2083 	if (i == vds_noperations) {
2084 		PR0("Unsupported operation %u", request->operation);
2085 		return (ENOTSUP);
2086 	}
2087 
2088 	/* Range-check slice */
2089 	if (request->slice >= vd->nslices &&
2090 	    (vd->vdisk_type != VD_DISK_TYPE_DISK ||
2091 	    request->slice != VD_SLICE_NONE)) {
2092 		PR0("Invalid \"slice\" %u (max %u) for virtual disk",
2093 		    request->slice, (vd->nslices - 1));
2094 		return (EINVAL);
2095 	}
2096 
2097 	/*
2098 	 * Call the function pointer that starts the operation.
2099 	 */
2100 	return (vds_operation[i].start(task));
2101 }
2102 
2103 /*
2104  * Description:
2105  *	This function is called by both the in-band and descriptor ring
2106  *	message processing paths to actually execute the task
2107  *	requested by the vDisk client. It in turn calls its worker
2108  *	function, vd_do_process_task(), to carry out the request.
2109  *
2110  *	Any transport errors (e.g. LDC errors, vDisk protocol errors) are
2111  *	saved in the 'status' field of the task and are propagated back
2112  *	up the call stack to trigger a NACK
2113  *
2114  *	Any request errors (e.g. ENOTTY from an ioctl) are saved in
2115  *	the 'status' field of the request and result in an ACK being sent
2116  *	by the completion handler.
2117  *
2118  * Parameters:
2119  *	task		- structure containing the request sent from client
2120  *
2121  * Return Value
2122  *	0		- successful synchronous request.
2123  *	!= 0		- transport error (e.g. LDC errors, vDisk protocol)
2124  *	EINPROGRESS	- task will be finished in a completion handler
2125  */
2126 static int
2127 vd_process_task(vd_task_t *task)
2128 {
2129 	vd_t *vd = task->vd;
2130 	int status;
2131 
2132 	DTRACE_PROBE1(task__start, vd_task_t *, task);
2133 
2134 	task->status = vd_do_process_task(task);
2135 
2136 	/*
2137 	 * If the task processing function returned EINPROGRESS indicating
2138 	 * that the task needs completing then schedule a taskq entry to
2139 	 * finish it now.
2140 	 *
2141 	 * Otherwise the task processing function returned either zero
2142 	 * indicating that the task was finished in the start function (and we
2143 	 * don't need to wait in a completion function) or the start function
2144 	 * returned an error - in both cases all that needs to happen is the
2145 	 * notification to the vDisk client higher up the call stack.
2146 	 * If the task was using a Descriptor Ring, we need to mark it as done
2147 	 * at this stage.
2148 	 */
2149 	if (task->status == EINPROGRESS) {
2150 		/* Queue a task to complete the operation */
2151 		(void) ddi_taskq_dispatch(vd->completionq, vd_complete,
2152 		    task, DDI_SLEEP);
2153 
2154 	} else if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE)) {
2155 		/* Update the dring element if it's a dring client */
2156 		status = vd_mark_elem_done(vd, task->index,
2157 		    task->request->status, task->request->nbytes);
2158 		if (status == ECONNRESET)
2159 			vd_mark_in_reset(vd);
2160 	}
2161 
2162 	return (task->status);
2163 }
2164 
2165 /*
2166  * Return true if the "type", "subtype", and "env" fields of the first
2167  * ("tag") argument match the corresponding remaining arguments; otherwise,
2168  * return false
2169  */
2169 boolean_t
2170 vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env)
2171 {
2172 	return ((tag->vio_msgtype == type) &&
2173 	    (tag->vio_subtype == subtype) &&
2174 	    (tag->vio_subtype_env == env)) ?
B_TRUE : B_FALSE; 2175 } 2176 2177 /* 2178 * Check whether the major/minor version specified in "ver_msg" is supported 2179 * by this server. 2180 */ 2181 static boolean_t 2182 vds_supported_version(vio_ver_msg_t *ver_msg) 2183 { 2184 for (int i = 0; i < vds_num_versions; i++) { 2185 ASSERT(vds_version[i].major > 0); 2186 ASSERT((i == 0) || 2187 (vds_version[i].major < vds_version[i-1].major)); 2188 2189 /* 2190 * If the major versions match, adjust the minor version, if 2191 * necessary, down to the highest value supported by this 2192 * server and return true so this message will get "ack"ed; 2193 * the client should also support all minor versions lower 2194 * than the value it sent 2195 */ 2196 if (ver_msg->ver_major == vds_version[i].major) { 2197 if (ver_msg->ver_minor > vds_version[i].minor) { 2198 PR0("Adjusting minor version from %u to %u", 2199 ver_msg->ver_minor, vds_version[i].minor); 2200 ver_msg->ver_minor = vds_version[i].minor; 2201 } 2202 return (B_TRUE); 2203 } 2204 2205 /* 2206 * If the message contains a higher major version number, set 2207 * the message's major/minor versions to the current values 2208 * and return false, so this message will get "nack"ed with 2209 * these values, and the client will potentially try again 2210 * with the same or a lower version 2211 */ 2212 if (ver_msg->ver_major > vds_version[i].major) { 2213 ver_msg->ver_major = vds_version[i].major; 2214 ver_msg->ver_minor = vds_version[i].minor; 2215 return (B_FALSE); 2216 } 2217 2218 /* 2219 * Otherwise, the message's major version is less than the 2220 * current major version, so continue the loop to the next 2221 * (lower) supported version 2222 */ 2223 } 2224 2225 /* 2226 * No common version was found; "ground" the version pair in the 2227 * message to terminate negotiation 2228 */ 2229 ver_msg->ver_major = 0; 2230 ver_msg->ver_minor = 0; 2231 return (B_FALSE); 2232 } 2233 2234 /* 2235 * Process a version message from a client. vds expects to receive version 2236 * messages from clients seeking service, but never issues version messages 2237 * itself; therefore, vds can ACK or NACK client version messages, but does 2238 * not expect to receive version-message ACKs or NACKs (and will treat such 2239 * messages as invalid). 2240 */ 2241 static int 2242 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2243 { 2244 vio_ver_msg_t *ver_msg = (vio_ver_msg_t *)msg; 2245 2246 2247 ASSERT(msglen >= sizeof (msg->tag)); 2248 2249 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2250 VIO_VER_INFO)) { 2251 return (ENOMSG); /* not a version message */ 2252 } 2253 2254 if (msglen != sizeof (*ver_msg)) { 2255 PR0("Expected %lu-byte version message; " 2256 "received %lu bytes", sizeof (*ver_msg), msglen); 2257 return (EBADMSG); 2258 } 2259 2260 if (ver_msg->dev_class != VDEV_DISK) { 2261 PR0("Expected device class %u (disk); received %u", 2262 VDEV_DISK, ver_msg->dev_class); 2263 return (EBADMSG); 2264 } 2265 2266 /* 2267 * We're talking to the expected kind of client; set our device class 2268 * for "ack/nack" back to the client 2269 */ 2270 ver_msg->dev_class = VDEV_DISK_SERVER; 2271 2272 /* 2273 * Check whether the (valid) version message specifies a version 2274 * supported by this server. 
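 * For example, with a single supported version of, say, 1.0, a client
 * offering 1.1 has its minor version adjusted down to 0 and the message
 * is "ack"ed, while a client offering 2.0 has 1.0 written back into the
 * message and is "nack"ed, inviting a retry at or below that version.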
 * If the version is not supported, return
2275  * EBADMSG so the message will get "nack"ed; vds_supported_version()
2276  * will have updated the message with a supported version for the
2277  * client to consider
2278  */
2279 	if (!vds_supported_version(ver_msg))
2280 		return (EBADMSG);
2281 
2282 
2283 	/*
2284 	 * A version has been agreed upon; use the client's SID for
2285 	 * communication on this channel now
2286 	 */
2287 	ASSERT(!(vd->initialized & VD_SID));
2288 	vd->sid = ver_msg->tag.vio_sid;
2289 	vd->initialized |= VD_SID;
2290 
2291 	/*
2292 	 * When multiple versions are supported, this function should store
2293 	 * the negotiated major and minor version values in the "vd" data
2294 	 * structure to govern further communication; in particular, note that
2295 	 * the client might have specified a lower minor version for the
2296 	 * agreed major version than specified in the vds_version[] array. The
2297 	 * following assertions should help remind future maintainers to make
2298 	 * the appropriate changes to support multiple versions.
2299 	 */
2300 	ASSERT(vds_num_versions == 1);
2301 	ASSERT(ver_msg->ver_major == vds_version[0].major);
2302 	ASSERT(ver_msg->ver_minor == vds_version[0].minor);
2303 
2304 	PR0("Using major version %u, minor version %u",
2305 	    ver_msg->ver_major, ver_msg->ver_minor);
2306 	return (0);
2307 }
2308 
2309 static int
2310 vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
2311 {
2312 	vd_attr_msg_t *attr_msg = (vd_attr_msg_t *)msg;
2313 	int status, retry = 0;
2314 
2315 
2316 	ASSERT(msglen >= sizeof (msg->tag));
2317 
2318 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
2319 	    VIO_ATTR_INFO)) {
2320 		PR0("Message is not an attribute message");
2321 		return (ENOMSG);
2322 	}
2323 
2324 	if (msglen != sizeof (*attr_msg)) {
2325 		PR0("Expected %lu-byte attribute message; "
2326 		    "received %lu bytes", sizeof (*attr_msg), msglen);
2327 		return (EBADMSG);
2328 	}
2329 
2330 	if (attr_msg->max_xfer_sz == 0) {
2331 		PR0("Received maximum transfer size of 0 from client");
2332 		return (EBADMSG);
2333 	}
2334 
2335 	if ((attr_msg->xfer_mode != VIO_DESC_MODE) &&
2336 	    (attr_msg->xfer_mode != VIO_DRING_MODE)) {
2337 		PR0("Client requested unsupported transfer mode");
2338 		return (EBADMSG);
2339 	}
2340 
2341 	/*
2342 	 * Check whether the underlying disk is ready and, if not, try
2343 	 * accessing the device again. Open the vdisk device and extract
2344 	 * info about it, as this is needed to respond to the attr info msg
2345 	 */
2346 	if ((vd->initialized & VD_DISK_READY) == 0) {
2347 		PR0("Retry setting up disk (%s)", vd->device_path);
2348 		do {
2349 			status = vd_setup_vd(vd);
2350 			if (status != EAGAIN || ++retry > vds_dev_retries)
2351 				break;
2352 
2353 			/* incremental delay */
2354 			delay(drv_usectohz(vds_dev_delay));
2355 
2356 			/* if vdisk is no longer enabled - return error */
2357 			if (!vd_enabled(vd))
2358 				return (ENXIO);
2359 
2360 		} while (status == EAGAIN);
2361 
2362 		if (status)
2363 			return (ENXIO);
2364 
2365 		vd->initialized |= VD_DISK_READY;
2366 		ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
2367 		PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u",
2368 		    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
2369 		    (vd->pseudo ? "yes" : "no"),
2370 		    (vd->file ? "yes" : "no"),
2371 		    vd->nslices);
2372 	}
2373 
2374 	/* Success: valid message and transfer mode */
2375 	vd->xfer_mode = attr_msg->xfer_mode;
2376 
2377 	if (vd->xfer_mode == VIO_DESC_MODE) {
2378 
2379 		/*
2380 		 * The vd_dring_inband_msg_t contains one cookie; need room
2381 		 * for up to n-1 more cookies, where "n" is the number of full
2382 		 * pages plus possibly one partial page required to cover
2383 		 * "max_xfer_sz". Add room for one more cookie if
2384 		 * "max_xfer_sz" isn't an integral multiple of the page size.
2385 		 * Must first get the maximum transfer size in bytes.
2386 		 */
2387 		size_t max_xfer_bytes = attr_msg->vdisk_block_size ?
2388 		    attr_msg->vdisk_block_size*attr_msg->max_xfer_sz :
2389 		    attr_msg->max_xfer_sz;
2390 		size_t max_inband_msglen =
2391 		    sizeof (vd_dring_inband_msg_t) +
2392 		    ((max_xfer_bytes/PAGESIZE +
2393 		    ((max_xfer_bytes % PAGESIZE) ? 1 : 0))*
2394 		    (sizeof (ldc_mem_cookie_t)));
2395 
2396 		/*
2397 		 * Set the maximum expected message length to
2398 		 * accommodate in-band-descriptor messages with all
2399 		 * their cookies
2400 		 */
2401 		vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen);
2402 
2403 		/*
2404 		 * Initialize the data structure for processing in-band I/O
2405 		 * request descriptors
2406 		 */
2407 		vd->inband_task.vd = vd;
2408 		vd->inband_task.msg = kmem_alloc(vd->max_msglen, KM_SLEEP);
2409 		vd->inband_task.index = 0;
2410 		vd->inband_task.type = VD_FINAL_RANGE_TASK;	/* range == 1 */
2411 	}
2412 
2413 	/* Return the device's block size and max transfer size to the client */
2414 	attr_msg->vdisk_block_size = DEV_BSIZE;
2415 	attr_msg->max_xfer_sz = vd->max_xfer_sz;
2416 
2417 	attr_msg->vdisk_size = vd->vdisk_size;
2418 	attr_msg->vdisk_type = vd->vdisk_type;
2419 	attr_msg->operations = vds_operations;
2420 	PR0("%s", VD_CLIENT(vd));
2421 
2422 	ASSERT(vd->dring_task == NULL);
2423 
2424 	return (0);
2425 }
2426 
2427 static int
2428 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
2429 {
2430 	int status;
2431 	size_t expected;
2432 	ldc_mem_info_t dring_minfo;
2433 	vio_dring_reg_msg_t *reg_msg = (vio_dring_reg_msg_t *)msg;
2434 
2435 
2436 	ASSERT(msglen >= sizeof (msg->tag));
2437 
2438 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
2439 	    VIO_DRING_REG)) {
2440 		PR0("Message is not a register-dring message");
2441 		return (ENOMSG);
2442 	}
2443 
2444 	if (msglen < sizeof (*reg_msg)) {
2445 		PR0("Expected at least %lu-byte register-dring message; "
2446 		    "received %lu bytes", sizeof (*reg_msg), msglen);
2447 		return (EBADMSG);
2448 	}
2449 
2450 	expected = sizeof (*reg_msg) +
2451 	    (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0]));
2452 	if (msglen != expected) {
2453 		PR0("Expected %lu-byte register-dring message; "
2454 		    "received %lu bytes", expected, msglen);
2455 		return (EBADMSG);
2456 	}
2457 
2458 	if (vd->initialized & VD_DRING) {
2459 		PR0("A dring was previously registered; only support one");
2460 		return (EBADMSG);
2461 	}
2462 
2463 	if (reg_msg->num_descriptors > INT32_MAX) {
2464 		PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)",
2465 		    reg_msg->num_descriptors, INT32_MAX, STRINGIZE(INT32_MAX));
2466 		return (EBADMSG);
2467 	}
2468 
2469 	if (reg_msg->ncookies != 1) {
2470 		/*
2471 		 * In addition to fixing the assertion in the success case
2472 		 * below, supporting drings which require more than one
2473 		 * "cookie" requires increasing the value of vd->max_msglen
2474 		 * somewhere in the code path prior to receiving the message
2475 		 * which results in calling this function.
		 * Note that without
2476 		 * making this change, the larger message size required to
2477 		 * accommodate multiple cookies cannot be successfully
2478 		 * received, so this function will not even get called.
2479 		 * Gracefully accommodating more dring cookies might
2480 		 * reasonably demand exchanging an additional attribute or
2481 		 * making a minor protocol adjustment
2482 		 */
2483 		PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies);
2484 		return (EBADMSG);
2485 	}
2486 
2487 	status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie,
2488 	    reg_msg->ncookies, reg_msg->num_descriptors,
2489 	    reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle);
2490 	if (status != 0) {
2491 		PR0("ldc_mem_dring_map() returned errno %d", status);
2492 		return (status);
2493 	}
2494 
2495 	/*
2496 	 * To remove the need for this assertion, must call
2497 	 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a
2498 	 * successful call to ldc_mem_dring_map()
2499 	 */
2500 	ASSERT(reg_msg->ncookies == 1);
2501 
2502 	if ((status =
2503 	    ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) {
2504 		PR0("ldc_mem_dring_info() returned errno %d", status);
2505 		if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)
2506 			PR0("ldc_mem_dring_unmap() returned errno %d", status);
2507 		return (status);
2508 	}
2509 
2510 	if (dring_minfo.vaddr == NULL) {
2511 		PR0("Descriptor ring virtual address is NULL");
2512 		return (ENXIO);
2513 	}
2514 
2515 
2516 	/* Initialize for valid message and mapped dring */
2517 	PR1("descriptor size = %u, dring length = %u",
2518 	    reg_msg->descriptor_size, reg_msg->num_descriptors);
2519 	vd->initialized |= VD_DRING;
2520 	vd->dring_ident = 1;	/* "There Can Be Only One" */
2521 	vd->dring = dring_minfo.vaddr;
2522 	vd->descriptor_size = reg_msg->descriptor_size;
2523 	vd->dring_len = reg_msg->num_descriptors;
2524 	reg_msg->dring_ident = vd->dring_ident;
2525 
2526 	/*
2527 	 * Allocate and initialize a "shadow" array of data structures for
2528 	 * tasks to process I/O requests in dring elements
2529 	 */
2530 	vd->dring_task =
2531 	    kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP);
2532 	for (int i = 0; i < vd->dring_len; i++) {
2533 		vd->dring_task[i].vd = vd;
2534 		vd->dring_task[i].index = i;
2535 		vd->dring_task[i].request = &VD_DRING_ELEM(i)->payload;
2536 
2537 		status = ldc_mem_alloc_handle(vd->ldc_handle,
2538 		    &(vd->dring_task[i].mhdl));
2539 		if (status) {
2540 			PR0("ldc_mem_alloc_handle() returned err %d", status);
2541 			return (ENXIO);
2542 		}
2543 
2544 		vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP);
2545 	}
2546 
2547 	return (0);
2548 }
2549 
2550 static int
2551 vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
2552 {
2553 	vio_dring_unreg_msg_t *unreg_msg = (vio_dring_unreg_msg_t *)msg;
2554 
2555 
2556 	ASSERT(msglen >= sizeof (msg->tag));
2557 
2558 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
2559 	    VIO_DRING_UNREG)) {
2560 		PR0("Message is not an unregister-dring message");
2561 		return (ENOMSG);
2562 	}
2563 
2564 	if (msglen != sizeof (*unreg_msg)) {
2565 		PR0("Expected %lu-byte unregister-dring message; "
2566 		    "received %lu bytes", sizeof (*unreg_msg), msglen);
2567 		return (EBADMSG);
2568 	}
2569 
2570 	if (unreg_msg->dring_ident != vd->dring_ident) {
2571 		PR0("Expected dring ident %lu; received %lu",
2572 		    vd->dring_ident, unreg_msg->dring_ident);
2573 		return (EBADMSG);
2574 	}
2575 
2576 	return (0);
2577 }
2578 
2579 static int
2580 process_rdx_msg(vio_msg_t *msg, size_t msglen)
2581 {
2582 	ASSERT(msglen >= sizeof (msg->tag));
2583 
2584 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL,
VIO_SUBTYPE_INFO, VIO_RDX)) { 2585 PR0("Message is not an RDX message"); 2586 return (ENOMSG); 2587 } 2588 2589 if (msglen != sizeof (vio_rdx_msg_t)) { 2590 PR0("Expected %lu-byte RDX message; received %lu bytes", 2591 sizeof (vio_rdx_msg_t), msglen); 2592 return (EBADMSG); 2593 } 2594 2595 PR0("Valid RDX message"); 2596 return (0); 2597 } 2598 2599 static int 2600 vd_check_seq_num(vd_t *vd, uint64_t seq_num) 2601 { 2602 if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) { 2603 PR0("Received seq_num %lu; expected %lu", 2604 seq_num, (vd->seq_num + 1)); 2605 PR0("initiating soft reset"); 2606 vd_need_reset(vd, B_FALSE); 2607 return (1); 2608 } 2609 2610 vd->seq_num = seq_num; 2611 vd->initialized |= VD_SEQ_NUM; /* superfluous after first time... */ 2612 return (0); 2613 } 2614 2615 /* 2616 * Return the expected size of an inband-descriptor message with all the 2617 * cookies it claims to include 2618 */ 2619 static size_t 2620 expected_inband_size(vd_dring_inband_msg_t *msg) 2621 { 2622 return ((sizeof (*msg)) + 2623 (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0]))); 2624 } 2625 2626 /* 2627 * Process an in-band descriptor message: used with clients like OBP, with 2628 * which vds exchanges descriptors within VIO message payloads, rather than 2629 * operating on them within a descriptor ring 2630 */ 2631 static int 2632 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2633 { 2634 size_t expected; 2635 vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg; 2636 2637 2638 ASSERT(msglen >= sizeof (msg->tag)); 2639 2640 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 2641 VIO_DESC_DATA)) { 2642 PR1("Message is not an in-band-descriptor message"); 2643 return (ENOMSG); 2644 } 2645 2646 if (msglen < sizeof (*desc_msg)) { 2647 PR0("Expected at least %lu-byte descriptor message; " 2648 "received %lu bytes", sizeof (*desc_msg), msglen); 2649 return (EBADMSG); 2650 } 2651 2652 if (msglen != (expected = expected_inband_size(desc_msg))) { 2653 PR0("Expected %lu-byte descriptor message; " 2654 "received %lu bytes", expected, msglen); 2655 return (EBADMSG); 2656 } 2657 2658 if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) 2659 return (EBADMSG); 2660 2661 /* 2662 * Valid message: Set up the in-band descriptor task and process the 2663 * request. Arrange to acknowledge the client's message, unless an 2664 * error processing the descriptor task results in setting 2665 * VIO_SUBTYPE_NACK 2666 */ 2667 PR1("Valid in-band-descriptor message"); 2668 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 2669 2670 ASSERT(vd->inband_task.msg != NULL); 2671 2672 bcopy(msg, vd->inband_task.msg, msglen); 2673 vd->inband_task.msglen = msglen; 2674 2675 /* 2676 * The task request is now the payload of the message 2677 * that was just copied into the body of the task. 
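 * (That is, inband_task.request points into inband_task.msg itself,
 * so the request stays valid for exactly as long as the copied
 * message buffer does.)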
2678 	 */
2679 	desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg;
2680 	vd->inband_task.request	= &desc_msg->payload;
2681 
2682 	return (vd_process_task(&vd->inband_task));
2683 }
2684 
2685 static int
2686 vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx,
2687     vio_msg_t *msg, size_t msglen)
2688 {
2689 	int		status;
2690 	boolean_t	ready;
2691 	vd_dring_entry_t *elem = VD_DRING_ELEM(idx);
2692 
2693 
2694 	/* Accept the updated dring element */
2695 	if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
2696 		PR0("ldc_mem_dring_acquire() returned errno %d", status);
2697 		return (status);
2698 	}
2699 	ready = (elem->hdr.dstate == VIO_DESC_READY);
2700 	if (ready) {
2701 		elem->hdr.dstate = VIO_DESC_ACCEPTED;
2702 	} else {
2703 		PR0("descriptor %u not ready", idx);
2704 		VD_DUMP_DRING_ELEM(elem);
2705 	}
2706 	if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
2707 		PR0("ldc_mem_dring_release() returned errno %d", status);
2708 		return (status);
2709 	}
2710 	if (!ready)
2711 		return (EBUSY);
2712 
2713 
2714 	/* Initialize a task and process the accepted element */
2715 	PR1("Processing dring element %u", idx);
2716 	vd->dring_task[idx].type = type;
2717 
2718 	/* duplicate msg buf for cookies etc. */
2719 	bcopy(msg, vd->dring_task[idx].msg, msglen);
2720 
2721 	vd->dring_task[idx].msglen = msglen;
2722 	return (vd_process_task(&vd->dring_task[idx]));
2723 }
2724 
2725 static int
2726 vd_process_element_range(vd_t *vd, int start, int end,
2727     vio_msg_t *msg, size_t msglen)
2728 {
2729 	int		i, n, nelem, status = 0;
2730 	boolean_t	inprogress = B_FALSE;
2731 	vd_task_type_t	type;
2732 
2733 
2734 	ASSERT(start >= 0);
2735 	ASSERT(end >= 0);
2736 
2737 	/*
2738 	 * Arrange to acknowledge the client's message, unless an error
2739 	 * processing one of the dring elements results in setting
2740 	 * VIO_SUBTYPE_NACK
2741 	 */
2742 	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
2743 
2744 	/*
2745 	 * Process the dring elements in the range
2746 	 */
2747 	nelem = ((end < start) ? end + vd->dring_len : end) - start + 1;
2748 	for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) {
2749 		((vio_dring_msg_t *)msg)->end_idx = i;
2750 		type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK;
2751 		status = vd_process_element(vd, type, i, msg, msglen);
2752 		if (status == EINPROGRESS)
2753 			inprogress = B_TRUE;
2754 		else if (status != 0)
2755 			break;
2756 	}
2757 
2758 	/*
2759 	 * If some, but not all, operations of a multi-element range are in
2760 	 * progress, wait for other operations to complete before returning
2761 	 * (which will result in "ack" or "nack" of the message). Note that
2762 	 * all outstanding operations will need to complete, not just the ones
2763 	 * corresponding to the current range of dring elements; however, as
2764 	 * this situation is an error case, performance is less critical.
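	 * (For example, if elements 3..5 were submitted and element 4
	 * returned EINPROGRESS while element 5 completed synchronously,
	 * the ddi_taskq_wait() below drains the completion queue before
	 * the range is acknowledged.)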
2765 */ 2766 if ((nelem > 1) && (status != EINPROGRESS) && inprogress) 2767 ddi_taskq_wait(vd->completionq); 2768 2769 return (status); 2770 } 2771 2772 static int 2773 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2774 { 2775 vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg; 2776 2777 2778 ASSERT(msglen >= sizeof (msg->tag)); 2779 2780 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 2781 VIO_DRING_DATA)) { 2782 PR1("Message is not a dring-data message"); 2783 return (ENOMSG); 2784 } 2785 2786 if (msglen != sizeof (*dring_msg)) { 2787 PR0("Expected %lu-byte dring message; received %lu bytes", 2788 sizeof (*dring_msg), msglen); 2789 return (EBADMSG); 2790 } 2791 2792 if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) 2793 return (EBADMSG); 2794 2795 if (dring_msg->dring_ident != vd->dring_ident) { 2796 PR0("Expected dring ident %lu; received ident %lu", 2797 vd->dring_ident, dring_msg->dring_ident); 2798 return (EBADMSG); 2799 } 2800 2801 if (dring_msg->start_idx >= vd->dring_len) { 2802 PR0("\"start_idx\" = %u; must be less than %u", 2803 dring_msg->start_idx, vd->dring_len); 2804 return (EBADMSG); 2805 } 2806 2807 if ((dring_msg->end_idx < 0) || 2808 (dring_msg->end_idx >= vd->dring_len)) { 2809 PR0("\"end_idx\" = %u; must be >= 0 and less than %u", 2810 dring_msg->end_idx, vd->dring_len); 2811 return (EBADMSG); 2812 } 2813 2814 /* Valid message; process range of updated dring elements */ 2815 PR1("Processing descriptor range, start = %u, end = %u", 2816 dring_msg->start_idx, dring_msg->end_idx); 2817 return (vd_process_element_range(vd, dring_msg->start_idx, 2818 dring_msg->end_idx, msg, msglen)); 2819 } 2820 2821 static int 2822 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes) 2823 { 2824 int retry, status; 2825 size_t size = *nbytes; 2826 2827 2828 for (retry = 0, status = ETIMEDOUT; 2829 retry < vds_ldc_retries && status == ETIMEDOUT; 2830 retry++) { 2831 PR1("ldc_read() attempt %d", (retry + 1)); 2832 *nbytes = size; 2833 status = ldc_read(ldc_handle, msg, nbytes); 2834 } 2835 2836 if (status) { 2837 PR0("ldc_read() returned errno %d", status); 2838 if (status != ECONNRESET) 2839 return (ENOMSG); 2840 return (status); 2841 } else if (*nbytes == 0) { 2842 PR1("ldc_read() returned 0 and no message read"); 2843 return (ENOMSG); 2844 } 2845 2846 PR1("RCVD %lu-byte message", *nbytes); 2847 return (0); 2848 } 2849 2850 static int 2851 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2852 { 2853 int status; 2854 2855 2856 PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype, 2857 msg->tag.vio_subtype, msg->tag.vio_subtype_env); 2858 #ifdef DEBUG 2859 vd_decode_tag(msg); 2860 #endif 2861 2862 /* 2863 * Validate session ID up front, since it applies to all messages 2864 * once set 2865 */ 2866 if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) { 2867 PR0("Expected SID %u, received %u", vd->sid, 2868 msg->tag.vio_sid); 2869 return (EBADMSG); 2870 } 2871 2872 PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state)); 2873 2874 /* 2875 * Process the received message based on connection state 2876 */ 2877 switch (vd->state) { 2878 case VD_STATE_INIT: /* expect version message */ 2879 if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0) 2880 return (status); 2881 2882 /* Version negotiated, move to that state */ 2883 vd->state = VD_STATE_VER; 2884 return (0); 2885 2886 case VD_STATE_VER: /* expect attribute message */ 2887 if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0) 2888 return (status); 2889 
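		/*
		 * (A zero return above means the attribute message was valid
		 * and vd->xfer_mode is now set; that value selects between
		 * the in-band and dring paths in the VD_STATE_ATTR case
		 * below.)
		 */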
2890 /* Attributes exchanged, move to that state */ 2891 vd->state = VD_STATE_ATTR; 2892 return (0); 2893 2894 case VD_STATE_ATTR: 2895 switch (vd->xfer_mode) { 2896 case VIO_DESC_MODE: /* expect RDX message */ 2897 if ((status = process_rdx_msg(msg, msglen)) != 0) 2898 return (status); 2899 2900 /* Ready to receive in-band descriptors */ 2901 vd->state = VD_STATE_DATA; 2902 return (0); 2903 2904 case VIO_DRING_MODE: /* expect register-dring message */ 2905 if ((status = 2906 vd_process_dring_reg_msg(vd, msg, msglen)) != 0) 2907 return (status); 2908 2909 /* One dring negotiated, move to that state */ 2910 vd->state = VD_STATE_DRING; 2911 return (0); 2912 2913 default: 2914 ASSERT("Unsupported transfer mode"); 2915 PR0("Unsupported transfer mode"); 2916 return (ENOTSUP); 2917 } 2918 2919 case VD_STATE_DRING: /* expect RDX, register-dring, or unreg-dring */ 2920 if ((status = process_rdx_msg(msg, msglen)) == 0) { 2921 /* Ready to receive data */ 2922 vd->state = VD_STATE_DATA; 2923 return (0); 2924 } else if (status != ENOMSG) { 2925 return (status); 2926 } 2927 2928 2929 /* 2930 * If another register-dring message is received, stay in 2931 * dring state in case the client sends RDX; although the 2932 * protocol allows multiple drings, this server does not 2933 * support using more than one 2934 */ 2935 if ((status = 2936 vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG) 2937 return (status); 2938 2939 /* 2940 * Acknowledge an unregister-dring message, but reset the 2941 * connection anyway: Although the protocol allows 2942 * unregistering drings, this server cannot serve a vdisk 2943 * without its only dring 2944 */ 2945 status = vd_process_dring_unreg_msg(vd, msg, msglen); 2946 return ((status == 0) ? ENOTSUP : status); 2947 2948 case VD_STATE_DATA: 2949 switch (vd->xfer_mode) { 2950 case VIO_DESC_MODE: /* expect in-band-descriptor message */ 2951 return (vd_process_desc_msg(vd, msg, msglen)); 2952 2953 case VIO_DRING_MODE: /* expect dring-data or unreg-dring */ 2954 /* 2955 * Typically expect dring-data messages, so handle 2956 * them first 2957 */ 2958 if ((status = vd_process_dring_msg(vd, msg, 2959 msglen)) != ENOMSG) 2960 return (status); 2961 2962 /* 2963 * Acknowledge an unregister-dring message, but reset 2964 * the connection anyway: Although the protocol 2965 * allows unregistering drings, this server cannot 2966 * serve a vdisk without its only dring 2967 */ 2968 status = vd_process_dring_unreg_msg(vd, msg, msglen); 2969 return ((status == 0) ? 
ENOTSUP : status); 2970 2971 default: 2972 ASSERT("Unsupported transfer mode"); 2973 PR0("Unsupported transfer mode"); 2974 return (ENOTSUP); 2975 } 2976 2977 default: 2978 ASSERT("Invalid client connection state"); 2979 PR0("Invalid client connection state"); 2980 return (ENOTSUP); 2981 } 2982 } 2983 2984 static int 2985 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2986 { 2987 int status; 2988 boolean_t reset_ldc = B_FALSE; 2989 vd_task_t task; 2990 2991 /* 2992 * Check that the message is at least big enough for a "tag", so that 2993 * message processing can proceed based on tag-specified message type 2994 */ 2995 if (msglen < sizeof (vio_msg_tag_t)) { 2996 PR0("Received short (%lu-byte) message", msglen); 2997 /* Can't "nack" short message, so drop the big hammer */ 2998 PR0("initiating full reset"); 2999 vd_need_reset(vd, B_TRUE); 3000 return (EBADMSG); 3001 } 3002 3003 /* 3004 * Process the message 3005 */ 3006 switch (status = vd_do_process_msg(vd, msg, msglen)) { 3007 case 0: 3008 /* "ack" valid, successfully-processed messages */ 3009 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 3010 break; 3011 3012 case EINPROGRESS: 3013 /* The completion handler will "ack" or "nack" the message */ 3014 return (EINPROGRESS); 3015 case ENOMSG: 3016 PR0("Received unexpected message"); 3017 _NOTE(FALLTHROUGH); 3018 case EBADMSG: 3019 case ENOTSUP: 3020 /* "transport" error will cause NACK of invalid messages */ 3021 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 3022 break; 3023 3024 default: 3025 /* "transport" error will cause NACK of invalid messages */ 3026 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 3027 /* An LDC error probably occurred, so try resetting it */ 3028 reset_ldc = B_TRUE; 3029 break; 3030 } 3031 3032 PR1("\tResulting in state %d (%s)", vd->state, 3033 vd_decode_state(vd->state)); 3034 3035 /* populate the task so we can dispatch it on the taskq */ 3036 task.vd = vd; 3037 task.msg = msg; 3038 task.msglen = msglen; 3039 3040 /* 3041 * Queue a task to send the notification that the operation completed. 3042 * We need to ensure that requests are responded to in the correct 3043 * order and since the taskq is processed serially this ordering 3044 * is maintained. 3045 */ 3046 (void) ddi_taskq_dispatch(vd->completionq, vd_serial_notify, 3047 &task, DDI_SLEEP); 3048 3049 /* 3050 * To ensure handshake negotiations do not happen out of order, such 3051 * requests that come through this path should not be done in parallel 3052 * so we need to wait here until the response is sent to the client. 3053 */ 3054 ddi_taskq_wait(vd->completionq); 3055 3056 /* Arrange to reset the connection for nack'ed or failed messages */ 3057 if ((status != 0) || reset_ldc) { 3058 PR0("initiating %s reset", 3059 (reset_ldc) ? 
"full" : "soft"); 3060 vd_need_reset(vd, reset_ldc); 3061 } 3062 3063 return (status); 3064 } 3065 3066 static boolean_t 3067 vd_enabled(vd_t *vd) 3068 { 3069 boolean_t enabled; 3070 3071 mutex_enter(&vd->lock); 3072 enabled = vd->enabled; 3073 mutex_exit(&vd->lock); 3074 return (enabled); 3075 } 3076 3077 static void 3078 vd_recv_msg(void *arg) 3079 { 3080 vd_t *vd = (vd_t *)arg; 3081 int rv = 0, status = 0; 3082 3083 ASSERT(vd != NULL); 3084 3085 PR2("New task to receive incoming message(s)"); 3086 3087 3088 while (vd_enabled(vd) && status == 0) { 3089 size_t msglen, msgsize; 3090 ldc_status_t lstatus; 3091 3092 /* 3093 * Receive and process a message 3094 */ 3095 vd_reset_if_needed(vd); /* can change vd->max_msglen */ 3096 3097 /* 3098 * check if channel is UP - else break out of loop 3099 */ 3100 status = ldc_status(vd->ldc_handle, &lstatus); 3101 if (lstatus != LDC_UP) { 3102 PR0("channel not up (status=%d), exiting recv loop\n", 3103 lstatus); 3104 break; 3105 } 3106 3107 ASSERT(vd->max_msglen != 0); 3108 3109 msgsize = vd->max_msglen; /* stable copy for alloc/free */ 3110 msglen = msgsize; /* actual len after recv_msg() */ 3111 3112 status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen); 3113 switch (status) { 3114 case 0: 3115 rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp, 3116 msglen); 3117 /* check if max_msglen changed */ 3118 if (msgsize != vd->max_msglen) { 3119 PR0("max_msglen changed 0x%lx to 0x%lx bytes\n", 3120 msgsize, vd->max_msglen); 3121 kmem_free(vd->vio_msgp, msgsize); 3122 vd->vio_msgp = 3123 kmem_alloc(vd->max_msglen, KM_SLEEP); 3124 } 3125 if (rv == EINPROGRESS) 3126 continue; 3127 break; 3128 3129 case ENOMSG: 3130 break; 3131 3132 case ECONNRESET: 3133 PR0("initiating soft reset (ECONNRESET)\n"); 3134 vd_need_reset(vd, B_FALSE); 3135 status = 0; 3136 break; 3137 3138 default: 3139 /* Probably an LDC failure; arrange to reset it */ 3140 PR0("initiating full reset (status=0x%x)", status); 3141 vd_need_reset(vd, B_TRUE); 3142 break; 3143 } 3144 } 3145 3146 PR2("Task finished"); 3147 } 3148 3149 static uint_t 3150 vd_handle_ldc_events(uint64_t event, caddr_t arg) 3151 { 3152 vd_t *vd = (vd_t *)(void *)arg; 3153 int status; 3154 3155 ASSERT(vd != NULL); 3156 3157 if (!vd_enabled(vd)) 3158 return (LDC_SUCCESS); 3159 3160 if (event & LDC_EVT_DOWN) { 3161 PR0("LDC_EVT_DOWN: LDC channel went down"); 3162 3163 vd_need_reset(vd, B_TRUE); 3164 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 3165 DDI_SLEEP); 3166 if (status == DDI_FAILURE) { 3167 PR0("cannot schedule task to recv msg\n"); 3168 vd_need_reset(vd, B_TRUE); 3169 } 3170 } 3171 3172 if (event & LDC_EVT_RESET) { 3173 PR0("LDC_EVT_RESET: LDC channel was reset"); 3174 3175 if (vd->state != VD_STATE_INIT) { 3176 PR0("scheduling full reset"); 3177 vd_need_reset(vd, B_FALSE); 3178 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 3179 vd, DDI_SLEEP); 3180 if (status == DDI_FAILURE) { 3181 PR0("cannot schedule task to recv msg\n"); 3182 vd_need_reset(vd, B_TRUE); 3183 } 3184 3185 } else { 3186 PR0("channel already reset, ignoring...\n"); 3187 PR0("doing ldc up...\n"); 3188 (void) ldc_up(vd->ldc_handle); 3189 } 3190 3191 return (LDC_SUCCESS); 3192 } 3193 3194 if (event & LDC_EVT_UP) { 3195 PR0("EVT_UP: LDC is up\nResetting client connection state"); 3196 PR0("initiating soft reset"); 3197 vd_need_reset(vd, B_FALSE); 3198 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 3199 vd, DDI_SLEEP); 3200 if (status == DDI_FAILURE) { 3201 PR0("cannot schedule task to recv msg\n"); 3202 vd_need_reset(vd, 
			    B_TRUE);
3203 			return (LDC_SUCCESS);
3204 		}
3205 	}
3206 
3207 	if (event & LDC_EVT_READ) {
3208 		int status;
3209 
3210 		PR1("New data available");
3211 		/* Queue a task to receive the new data */
3212 		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
3213 		    DDI_SLEEP);
3214 
3215 		if (status == DDI_FAILURE) {
3216 			PR0("cannot schedule task to recv msg\n");
3217 			vd_need_reset(vd, B_TRUE);
3218 		}
3219 	}
3220 
3221 	return (LDC_SUCCESS);
3222 }
3223 
3224 static uint_t
3225 vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
3226 {
3227 	_NOTE(ARGUNUSED(key, val))
3228 	(*((uint_t *)arg))++;
3229 	return (MH_WALK_TERMINATE);
3230 }
3231 
3232 
3233 static int
3234 vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3235 {
3236 	uint_t	vd_present = 0;
3237 	minor_t	instance;
3238 	vds_t	*vds;
3239 
3240 
3241 	switch (cmd) {
3242 	case DDI_DETACH:
3243 		/* the real work happens below */
3244 		break;
3245 	case DDI_SUSPEND:
3246 		PR0("No action required for DDI_SUSPEND");
3247 		return (DDI_SUCCESS);
3248 	default:
3249 		PR0("Unrecognized \"cmd\"");
3250 		return (DDI_FAILURE);
3251 	}
3252 
3253 	ASSERT(cmd == DDI_DETACH);
3254 	instance = ddi_get_instance(dip);
3255 	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
3256 		PR0("Could not get state for instance %u", instance);
3257 		ddi_soft_state_free(vds_state, instance);
3258 		return (DDI_FAILURE);
3259 	}
3260 
3261 	/* Do not detach while serving any vdisks */
3262 	mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present);
3263 	if (vd_present) {
3264 		PR0("Not detaching because serving vdisks");
3265 		return (DDI_FAILURE);
3266 	}
3267 
3268 	PR0("Detaching");
3269 	if (vds->initialized & VDS_MDEG) {
3270 		(void) mdeg_unregister(vds->mdeg);
3271 		kmem_free(vds->ispecp->specp, sizeof (vds_prop_template));
3272 		kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t));
3273 		vds->ispecp = NULL;
3274 		vds->mdeg = NULL;
3275 	}
3276 
3277 	if (vds->initialized & VDS_LDI)
3278 		(void) ldi_ident_release(vds->ldi_ident);
3279 	mod_hash_destroy_hash(vds->vd_table);
3280 	ddi_soft_state_free(vds_state, instance);
3281 	return (DDI_SUCCESS);
3282 }
3283 
3284 static boolean_t
3285 is_pseudo_device(dev_info_t *dip)
3286 {
3287 	dev_info_t	*parent, *root = ddi_root_node();
3288 
3289 
3290 	for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root);
3291 	    parent = ddi_get_parent(parent)) {
3292 		if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0)
3293 			return (B_TRUE);
3294 	}
3295 
3296 	return (B_FALSE);
3297 }
3298 
3299 static int
3300 vd_setup_full_disk(vd_t *vd)
3301 {
3302 	int		rval, status;
3303 	major_t		major = getmajor(vd->dev[0]);
3304 	minor_t		minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE;
3305 	struct dk_minfo	dk_minfo;
3306 
3307 	/*
3308 	 * At this point, vdisk_size is set to the size of partition 2 but
3309 	 * this does not represent the size of the disk because partition 2
3310 	 * may not cover the entire disk and its size does not include reserved
3311 	 * blocks. So we update vdisk_size to be the size of the entire disk.
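	 * (DKIOCGMEDIAINFO reports the capacity of the whole medium in
	 * blocks, so the dki_capacity value used below already accounts
	 * for any blocks that partition 2 leaves out.)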
3312 */ 3313 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO, 3314 (intptr_t)&dk_minfo, (vd_open_flags | FKIOCTL), 3315 kcred, &rval)) != 0) { 3316 PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d", 3317 status); 3318 return (status); 3319 } 3320 vd->vdisk_size = dk_minfo.dki_capacity; 3321 3322 /* Set full-disk parameters */ 3323 vd->vdisk_type = VD_DISK_TYPE_DISK; 3324 vd->nslices = (sizeof (vd->dev))/(sizeof (vd->dev[0])); 3325 3326 /* Move dev number and LDI handle to entire-disk-slice array elements */ 3327 vd->dev[VD_ENTIRE_DISK_SLICE] = vd->dev[0]; 3328 vd->dev[0] = 0; 3329 vd->ldi_handle[VD_ENTIRE_DISK_SLICE] = vd->ldi_handle[0]; 3330 vd->ldi_handle[0] = NULL; 3331 3332 /* Initialize device numbers for remaining slices and open them */ 3333 for (int slice = 0; slice < vd->nslices; slice++) { 3334 /* 3335 * Skip the entire-disk slice, as it's already open and its 3336 * device known 3337 */ 3338 if (slice == VD_ENTIRE_DISK_SLICE) 3339 continue; 3340 ASSERT(vd->dev[slice] == 0); 3341 ASSERT(vd->ldi_handle[slice] == NULL); 3342 3343 /* 3344 * Construct the device number for the current slice 3345 */ 3346 vd->dev[slice] = makedevice(major, (minor + slice)); 3347 3348 /* 3349 * Open all slices of the disk to serve them to the client. 3350 * Slices are opened exclusively to prevent other threads or 3351 * processes in the service domain from performing I/O to 3352 * slices being accessed by a client. Failure to open a slice 3353 * results in vds not serving this disk, as the client could 3354 * attempt (and should be able) to access any slice immediately. 3355 * Any slices successfully opened before a failure will get 3356 * closed by vds_destroy_vd() as a result of the error returned 3357 * by this function. 3358 * 3359 * We need to do the open with FNDELAY so that opening an empty 3360 * slice does not fail. 
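		 * (Without FNDELAY, opening a zero-length slice would
		 * typically fail with ENXIO, even though a client may later
		 * write a label that gives that slice a size.)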
3361 */ 3362 PR0("Opening device major %u, minor %u = slice %u", 3363 major, minor, slice); 3364 if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 3365 vd_open_flags | FNDELAY, kcred, &vd->ldi_handle[slice], 3366 vd->vds->ldi_ident)) != 0) { 3367 PRN("ldi_open_by_dev() returned errno %d " 3368 "for slice %u", status, slice); 3369 /* vds_destroy_vd() will close any open slices */ 3370 vd->ldi_handle[slice] = NULL; 3371 return (status); 3372 } 3373 } 3374 3375 return (0); 3376 } 3377 3378 static int 3379 vd_setup_partition_efi(vd_t *vd) 3380 { 3381 efi_gpt_t *gpt; 3382 efi_gpe_t *gpe; 3383 struct uuid uuid = EFI_RESERVED; 3384 uint32_t crc; 3385 int length; 3386 3387 length = sizeof (efi_gpt_t) + sizeof (efi_gpe_t); 3388 3389 gpt = kmem_zalloc(length, KM_SLEEP); 3390 gpe = (efi_gpe_t *)(gpt + 1); 3391 3392 gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE); 3393 gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); 3394 gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t)); 3395 gpt->efi_gpt_FirstUsableLBA = LE_64(0ULL); 3396 gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1); 3397 gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1); 3398 gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t)); 3399 3400 UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid); 3401 gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA; 3402 gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA; 3403 3404 CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table); 3405 gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); 3406 3407 CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table); 3408 gpt->efi_gpt_HeaderCRC32 = LE_32(~crc); 3409 3410 vd->dk_efi.dki_lba = 0; 3411 vd->dk_efi.dki_length = length; 3412 vd->dk_efi.dki_data = gpt; 3413 3414 return (0); 3415 } 3416 3417 static int 3418 vd_setup_file(vd_t *vd) 3419 { 3420 int i, rval, status; 3421 ushort_t sum; 3422 vattr_t vattr; 3423 dev_t dev; 3424 size_t size; 3425 char *file_path = vd->device_path; 3426 char dev_path[MAXPATHLEN + 1]; 3427 char prefix; 3428 ldi_handle_t lhandle; 3429 struct dk_cinfo dk_cinfo; 3430 struct dk_label label; 3431 3432 /* make sure the file is valid */ 3433 if ((status = lookupname(file_path, UIO_SYSSPACE, FOLLOW, 3434 NULLVPP, &vd->file_vnode)) != 0) { 3435 PRN("Cannot lookup file(%s) errno %d", file_path, status); 3436 return (status); 3437 } 3438 3439 if (vd->file_vnode->v_type != VREG) { 3440 PRN("Invalid file type (%s)\n", file_path); 3441 VN_RELE(vd->file_vnode); 3442 return (EBADF); 3443 } 3444 VN_RELE(vd->file_vnode); 3445 3446 if ((status = vn_open(file_path, UIO_SYSSPACE, vd_open_flags | FOFFMAX, 3447 0, &vd->file_vnode, 0, 0)) != 0) { 3448 PRN("vn_open(%s) = errno %d", file_path, status); 3449 return (status); 3450 } 3451 3452 /* 3453 * We set vd->file now so that vds_destroy_vd will take care of 3454 * closing the file and releasing the vnode in case of an error. 
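	 * (vds_destroy_vd() keys off vd->file to decide between VOP_CLOSE()
	 * plus VN_RELE() on the vnode and ldi_close() on device slices, so
	 * setting it before any failure point below keeps cleanup correct.)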
3455 	 */
3456 	vd->file = B_TRUE;
3457 	vd->pseudo = B_FALSE;
3458 
3459 	vattr.va_mask = AT_SIZE;
3460 	if ((status = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred)) != 0) {
3461 		PRN("VOP_GETATTR(%s) = errno %d", file_path, status);
3462 		return (EIO);
3463 	}
3464 
3465 	vd->file_size = vattr.va_size;
3466 	/* size should be at least sizeof(dk_label) */
3467 	if (vd->file_size < sizeof (struct dk_label)) {
3468 		PRN("Size of file has to be at least %ld bytes",
3469 		    sizeof (struct dk_label));
3470 		return (EIO);
3471 	}
3472 
3473 	if (vd->file_vnode->v_flag & VNOMAP) {
3474 		PRN("File %s cannot be mapped", file_path);
3475 		return (EIO);
3476 	}
3477 
3478 	/* read label from file */
3479 	if (VD_FILE_LABEL_READ(vd, &label) < 0) {
3480 		PRN("Can't read label from %s", file_path);
3481 		return (EIO);
3482 	}
3483 
3484 	/* label checksum */
3485 	sum = vd_lbl2cksum(&label);
3486 
3487 	if (label.dkl_magic != DKL_MAGIC || label.dkl_cksum != sum) {
3488 		PR0("%s has an invalid disk label "
3489 		    "(magic=%x cksum=%x (expect %x))",
3490 		    file_path, label.dkl_magic, label.dkl_cksum, sum);
3491 
3492 		/* default label */
3493 		bzero(&label, sizeof (struct dk_label));
3494 
3495 		/*
3496 		 * We must have a reasonable number of cylinders and sectors so
3497 		 * that newfs can run using default values.
3498 		 *
3499 		 * if (disk_size < 2MB)
3500 		 *	phys_cylinders = disk_size / 100K
3501 		 * else
3502 		 *	phys_cylinders = disk_size / 300K
3503 		 *
3504 		 * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders
3505 		 * alt_cylinders = (phys_cylinders > 2) ? 2 : 0;
3506 		 * data_cylinders = phys_cylinders - alt_cylinders
3507 		 *
3508 		 * sectors = disk_size / (phys_cylinders * blk_size)
3509 		 */
3510 		if (vd->file_size < (2 * 1024 * 1024))
3511 			label.dkl_pcyl = vd->file_size / (100 * 1024);
3512 		else
3513 			label.dkl_pcyl = vd->file_size / (300 * 1024);
3514 
3515 		if (label.dkl_pcyl == 0)
3516 			label.dkl_pcyl = 1;
3517 
3518 		if (label.dkl_pcyl > 2)
3519 			label.dkl_acyl = 2;
3520 		else
3521 			label.dkl_acyl = 0;
3522 
3523 		label.dkl_nsect = vd->file_size /
3524 		    (DEV_BSIZE * label.dkl_pcyl);
3525 		label.dkl_ncyl = label.dkl_pcyl - label.dkl_acyl;
3526 		label.dkl_nhead = 1;
3527 		label.dkl_write_reinstruct = 0;
3528 		label.dkl_read_reinstruct = 0;
3529 		label.dkl_rpm = 7200;
3530 		label.dkl_apc = 0;
3531 		label.dkl_intrlv = 0;
3532 		label.dkl_magic = DKL_MAGIC;
3533 
3534 		PR0("requested disk size: %ld bytes\n", vd->file_size);
3535 		PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label.dkl_pcyl,
3536 		    label.dkl_nhead, label.dkl_nsect);
3537 		PR0("provided disk size: %ld bytes\n", (uint64_t)
3538 		    (label.dkl_pcyl *
3539 		    label.dkl_nhead * label.dkl_nsect * DEV_BSIZE));
3540 
3541 		if (vd->file_size < (1ULL << 20)) {
3542 			size = vd->file_size >> 10;
3543 			prefix = 'K'; /* Kilobyte */
3544 		} else if (vd->file_size < (1ULL << 30)) {
3545 			size = vd->file_size >> 20;
3546 			prefix = 'M'; /* Megabyte */
3547 		} else if (vd->file_size < (1ULL << 40)) {
3548 			size = vd->file_size >> 30;
3549 			prefix = 'G'; /* Gigabyte */
3550 		} else {
3551 			size = vd->file_size >> 40;
3552 			prefix = 'T'; /* Terabyte */
3553 		}
3554 
3555 		/*
3556 		 * We must have a correct label name otherwise format(1m) will
3557 		 * not recognize the disk as labeled.
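		 * (For example, a 100MB backing file run through the defaults
		 * above ends up with pcyl 341, acyl 2, nhead 1, nsect 600,
		 * yielding an ascii label of
		 * "SUN-DiskImage-100MB cyl 339 alt 2 hd 1 sec 600".)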
3558 		 */
3559 		(void) snprintf(label.dkl_asciilabel, LEN_DKL_ASCII,
3560 		    "SUN-DiskImage-%ld%cB cyl %d alt %d hd %d sec %d",
3561 		    size, prefix,
3562 		    label.dkl_ncyl, label.dkl_acyl, label.dkl_nhead,
3563 		    label.dkl_nsect);
3564 
3565 		/* default VTOC */
3566 		label.dkl_vtoc.v_version = V_VERSION;
3567 		label.dkl_vtoc.v_nparts = V_NUMPAR;
3568 		label.dkl_vtoc.v_sanity = VTOC_SANE;
3569 		label.dkl_vtoc.v_part[2].p_tag = V_BACKUP;
3570 		label.dkl_map[2].dkl_cylno = 0;
3571 		label.dkl_map[2].dkl_nblk = label.dkl_ncyl *
3572 		    label.dkl_nhead * label.dkl_nsect;
3573 		label.dkl_map[0] = label.dkl_map[2];
3575 		label.dkl_cksum = vd_lbl2cksum(&label);
3576 
3577 		/* write default label to file */
3578 		if ((rval = vd_file_set_vtoc(vd, &label)) != 0) {
3579 			PRN("Can't write label to %s", file_path);
3580 			return (rval);
3581 		}
3582 	}
3583 
3584 	vd->nslices = label.dkl_vtoc.v_nparts;
3585 
3586 	/* sector size = block size = DEV_BSIZE */
3587 	vd->vdisk_size = vd->file_size / DEV_BSIZE;
3588 	vd->vdisk_type = VD_DISK_TYPE_DISK;
3589 	vd->vdisk_label = VD_DISK_LABEL_VTOC;
3590 	vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */
3591 
3592 	/* Get max_xfer_sz from the device where the file is */
3593 	dev = vd->file_vnode->v_vfsp->vfs_dev;
3594 	dev_path[0] = '\0';
3595 	if (ddi_dev_pathname(dev, S_IFBLK, dev_path) == DDI_SUCCESS) {
3596 		PR0("underlying device = %s\n", dev_path);
3597 	}
3598 
3599 	if ((status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD,
3600 	    kcred, &lhandle, vd->vds->ldi_ident)) != 0) {
3601 		PR0("ldi_open_by_dev() returned errno %d for device %s",
3602 		    status, dev_path);
3603 	} else {
3604 		if ((status = ldi_ioctl(lhandle, DKIOCINFO,
3605 		    (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred,
3606 		    &rval)) != 0) {
3607 			PR0("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
3608 			    status, dev_path);
3609 		} else {
3610 			/*
3611 			 * Store the device's max transfer size for
3612 			 * return to the client
3613 			 */
3614 			vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
3615 		}
3616 
3617 		PR0("close the device %s", dev_path);
3618 		(void) ldi_close(lhandle, FREAD, kcred);
3619 	}
3620 
3621 	PR0("using file %s, dev %s, max_xfer = %u blks",
3622 	    file_path, dev_path, vd->max_xfer_sz);
3623 
3624 	vd->dk_geom.dkg_ncyl = label.dkl_ncyl;
3625 	vd->dk_geom.dkg_acyl = label.dkl_acyl;
3626 	vd->dk_geom.dkg_pcyl = label.dkl_pcyl;
3627 	vd->dk_geom.dkg_nhead = label.dkl_nhead;
3628 	vd->dk_geom.dkg_nsect = label.dkl_nsect;
3629 	vd->dk_geom.dkg_intrlv = label.dkl_intrlv;
3630 	vd->dk_geom.dkg_apc = label.dkl_apc;
3631 	vd->dk_geom.dkg_rpm = label.dkl_rpm;
3632 	vd->dk_geom.dkg_write_reinstruct = label.dkl_write_reinstruct;
3633 	vd->dk_geom.dkg_read_reinstruct = label.dkl_read_reinstruct;
3634 
3635 	vd->vtoc.v_sanity = label.dkl_vtoc.v_sanity;
3636 	vd->vtoc.v_version = label.dkl_vtoc.v_version;
3637 	vd->vtoc.v_sectorsz = DEV_BSIZE;
3638 	vd->vtoc.v_nparts = label.dkl_vtoc.v_nparts;
3639 
3640 	bcopy(label.dkl_vtoc.v_volume, vd->vtoc.v_volume,
3641 	    LEN_DKL_VVOL);
3642 	bcopy(label.dkl_asciilabel, vd->vtoc.v_asciilabel,
3643 	    LEN_DKL_ASCII);
3644 
3645 	for (i = 0; i < vd->nslices; i++) {
3646 		vd->vtoc.timestamp[i] = label.dkl_vtoc.v_timestamp[i];
3647 		vd->vtoc.v_part[i].p_tag = label.dkl_vtoc.v_part[i].p_tag;
3648 		vd->vtoc.v_part[i].p_flag = label.dkl_vtoc.v_part[i].p_flag;
3649 		vd->vtoc.v_part[i].p_start = label.dkl_map[i].dkl_cylno *
3650 		    label.dkl_nhead * label.dkl_nsect;
3651 		vd->vtoc.v_part[i].p_size = label.dkl_map[i].dkl_nblk;
3652 		vd->ldi_handle[i] = NULL;
3653 		vd->dev[i] = NULL;
3654 	}
3655 
3656 	/* Setup devid for the disk image */
3657 
3658 	status = vd_file_read_devid(vd, &vd->file_devid);
3659 
3660 	if (status == 0) {
3661 		/* a valid devid was found */
3662 		return (0);
3663 	}
3664 
3665 	if (status != EINVAL) {
3666 		/*
3667 		 * There was an error while trying to read the devid. So this
3668 		 * disk image may have a devid but we are unable to read it.
3669 		 */
3670 		PR0("cannot read devid for %s", file_path);
3671 		vd->file_devid = NULL;
3672 		return (0);
3673 	}
3674 
3675 	/*
3676 	 * No valid device id was found so we create one. Note that a failure
3677 	 * to create a device id is not fatal and does not prevent the disk
3678 	 * image from being attached.
3679 	 */
3680 	PR1("creating devid for %s", file_path);
3681 
3682 	if (ddi_devid_init(vd->vds->dip, DEVID_FAB, NULL, 0,
3683 	    &vd->file_devid) != DDI_SUCCESS) {
3684 		PR0("failed to create devid for %s", file_path);
3685 		vd->file_devid = NULL;
3686 		return (0);
3687 	}
3688 
3689 	/* write devid to the disk image */
3690 	if (vd_file_write_devid(vd, vd->file_devid) != 0) {
3691 		PR0("failed to write devid for %s", file_path);
3692 		ddi_devid_free(vd->file_devid);
3693 		vd->file_devid = NULL;
3694 	}
3695 
3696 	return (0);
3697 }
3698 
3699 static int
3700 vd_setup_vd(vd_t *vd)
3701 {
3702 	int		rval, status;
3703 	dev_info_t	*dip;
3704 	struct dk_cinfo	dk_cinfo;
3705 	char		*device_path = vd->device_path;
3706 
3707 	/*
3708 	 * We need to open with FNDELAY so that opening an empty partition
3709 	 * does not fail.
3710 	 */
3711 	if ((status = ldi_open_by_name(device_path, vd_open_flags | FNDELAY,
3712 	    kcred, &vd->ldi_handle[0], vd->vds->ldi_ident)) != 0) {
3713 		PR0("ldi_open_by_name(%s) = errno %d", device_path, status);
3714 		vd->ldi_handle[0] = NULL;
3715 
3716 		/* this may not be a device; try opening it as a file */
3717 		if (status == ENXIO || status == ENODEV)
3718 			status = vd_setup_file(vd);
3719 		if (status) {
3720 			PRN("Cannot use device/file (%s), errno=%d\n",
3721 			    device_path, status);
3722 			if (status == ENXIO || status == ENODEV ||
3723 			    status == ENOENT) {
3724 				return (EAGAIN);
3725 			}
3726 		}
3727 		return (status);
3728 	}
3729 
3730 	/*
3731 	 * nslices must be updated now so that vds_destroy_vd() will close
3732 	 * the slice we have just opened in case of an error.
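	 * (vds_destroy_vd() walks ldi_handle[0 .. nslices - 1] and closes
	 * any non-NULL entries, so nslices = 1 is sufficient to cover the
	 * single handle opened above.)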
3733 	 */
3734 	vd->nslices = 1;
3735 	vd->file = B_FALSE;
3736 
3737 	/* Get device number and size of backing device */
3738 	if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) {
3739 		PRN("ldi_get_dev() returned errno %d for %s",
3740 		    status, device_path);
3741 		return (status);
3742 	}
3743 	if (ldi_get_size(vd->ldi_handle[0], &vd->vdisk_size) != DDI_SUCCESS) {
3744 		PRN("ldi_get_size() failed for %s", device_path);
3745 		return (EIO);
3746 	}
3747 	vd->vdisk_size = lbtodb(vd->vdisk_size);	/* convert to blocks */
3748 
3749 	/* Verify backing device supports dk_cinfo, dk_geom, and vtoc */
3750 	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO,
3751 	    (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred,
3752 	    &rval)) != 0) {
3753 		PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
3754 		    status, device_path);
3755 		return (status);
3756 	}
3757 	if (dk_cinfo.dki_partition >= V_NUMPAR) {
3758 		PRN("slice %u >= maximum slice %u for %s",
3759 		    dk_cinfo.dki_partition, V_NUMPAR, device_path);
3760 		return (EIO);
3761 	}
3762 
3763 	status = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc, &vd->vdisk_label);
3764 
3765 	if (status != 0) {
3766 		PRN("vd_read_vtoc returned errno %d for %s",
3767 		    status, device_path);
3768 		return (status);
3769 	}
3770 
3771 	if (vd->vdisk_label == VD_DISK_LABEL_VTOC &&
3772 	    (status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM,
3773 	    (intptr_t)&vd->dk_geom, (vd_open_flags | FKIOCTL),
3774 	    kcred, &rval)) != 0) {
3775 		PRN("ldi_ioctl(DKIOCGGEOM) returned errno %d for %s",
3776 		    status, device_path);
3777 		return (status);
3778 	}
3779 
3780 	/* Store the device's max transfer size for return to the client */
3781 	vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
3782 
3783 	/* Determine if backing device is a pseudo device */
3784 	if ((dip = ddi_hold_devi_by_instance(getmajor(vd->dev[0]),
3785 	    dev_to_instance(vd->dev[0]), 0)) == NULL) {
3786 		PRN("%s is no longer accessible", device_path);
3787 		return (EIO);
3788 	}
3789 	vd->pseudo = is_pseudo_device(dip);
3790 	ddi_release_devi(dip);
3791 	if (vd->pseudo) {
3792 		vd->vdisk_type = VD_DISK_TYPE_SLICE;
3793 		vd->nslices = 1;
3794 		return (0);	/* ...and we're done */
3795 	}
3796 
3797 	/* If slice is entire-disk slice, initialize for full disk */
3798 	if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE)
3799 		return (vd_setup_full_disk(vd));
3800 
3801 
3802 	/* Otherwise, we have a non-entire slice of a device */
3803 	vd->vdisk_type = VD_DISK_TYPE_SLICE;
3804 	vd->nslices = 1;
3805 
3806 	if (vd->vdisk_label == VD_DISK_LABEL_EFI) {
3807 		status = vd_setup_partition_efi(vd);
3808 		return (status);
3809 	}
3810 
3811 	/* Initialize dk_geom structure for single-slice device */
3812 	if (vd->dk_geom.dkg_nsect == 0) {
3813 		PRN("%s geometry claims 0 sectors per track", device_path);
3814 		return (EIO);
3815 	}
3816 	if (vd->dk_geom.dkg_nhead == 0) {
3817 		PRN("%s geometry claims 0 heads", device_path);
3818 		return (EIO);
3819 	}
3820 	vd->dk_geom.dkg_ncyl =
3821 	    vd->vdisk_size/vd->dk_geom.dkg_nsect/vd->dk_geom.dkg_nhead;
3822 	vd->dk_geom.dkg_acyl = 0;
3823 	vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl;
3824 
3825 
3826 	/* Initialize vtoc structure for single-slice device */
3827 	bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume,
3828 	    MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume)));
3829 	bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part));
3830 	vd->vtoc.v_nparts = 1;
3831 	vd->vtoc.v_part[0].p_tag = V_UNASSIGNED;
3832 	vd->vtoc.v_part[0].p_flag = 0;
3833 	vd->vtoc.v_part[0].p_start = 0;
3834 	vd->vtoc.v_part[0].p_size = vd->vdisk_size;
3835 	bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel,
3836 	    MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel)));
3837 
3838 
3839 	return (0);
3840 }
3841 
3842 static int
3843 vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id,
3844     vd_t **vdp)
3845 {
3846 	char			tq_name[TASKQ_NAMELEN];
3847 	int			status;
3848 	ddi_iblock_cookie_t	iblock = NULL;
3849 	ldc_attr_t		ldc_attr;
3850 	vd_t			*vd;
3851 
3852 
3853 	ASSERT(vds != NULL);
3854 	ASSERT(device_path != NULL);
3855 	ASSERT(vdp != NULL);
3856 	PR0("Adding vdisk for %s", device_path);
3857 
3858 	if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) {
3859 		PRN("No memory for virtual disk");
3860 		return (EAGAIN);
3861 	}
3862 	*vdp = vd;	/* assign here so vds_destroy_vd() can clean up later */
3863 	vd->vds = vds;
3864 	(void) strncpy(vd->device_path, device_path, MAXPATHLEN);
3865 
3866 	/* Open vdisk and initialize parameters */
3867 	if ((status = vd_setup_vd(vd)) == 0) {
3868 		vd->initialized |= VD_DISK_READY;
3869 
3870 		ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
3871 		PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u",
3872 		    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
3873 		    (vd->pseudo ? "yes" : "no"), (vd->file ? "yes" : "no"),
3874 		    vd->nslices);
3875 	} else {
3876 		if (status != EAGAIN)
3877 			return (status);
3878 	}
3879 
3880 	/* Initialize locking */
3881 	if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED,
3882 	    &iblock) != DDI_SUCCESS) {
3883 		PRN("Could not get iblock cookie.");
3884 		return (EIO);
3885 	}
3886 
3887 	mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock);
3888 	vd->initialized |= VD_LOCKING;
3889 
3890 
3891 	/* Create start and completion task queues for the vdisk */
3892 	(void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id);
3893 	PR1("tq_name = %s", tq_name);
3894 	if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1,
3895 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
3896 		PRN("Could not create task queue");
3897 		return (EIO);
3898 	}
3899 	(void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id);
3900 	PR1("tq_name = %s", tq_name);
3901 	if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1,
3902 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
3903 		PRN("Could not create task queue");
3904 		return (EIO);
3905 	}
3906 	vd->enabled = 1;	/* before callback can dispatch to startq */
3907 
3908 
3909 	/* Bring up LDC */
3910 	ldc_attr.devclass = LDC_DEV_BLK_SVC;
3911 	ldc_attr.instance = ddi_get_instance(vds->dip);
3912 	ldc_attr.mode = LDC_MODE_UNRELIABLE;
3913 	ldc_attr.mtu = VD_LDC_MTU;
3914 	if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) {
3915 		PRN("Could not initialize LDC channel %lu, "
3916 		    "init failed with error %d", ldc_id, status);
3917 		return (status);
3918 	}
3919 	vd->initialized |= VD_LDC;
3920 
3921 	if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events,
3922 	    (caddr_t)vd)) != 0) {
3923 		PRN("Could not initialize LDC channel %lu, "
3924 		    "reg_callback failed with error %d", ldc_id, status);
3925 		return (status);
3926 	}
3927 
3928 	if ((status = ldc_open(vd->ldc_handle)) != 0) {
3929 		PRN("Could not initialize LDC channel %lu, "
3930 		    "open failed with error %d", ldc_id, status);
3931 		return (status);
3932 	}
3933 
3934 	if ((status = ldc_up(vd->ldc_handle)) != 0) {
3935 		PR0("ldc_up() returned errno %d", status);
3936 	}
3937 
3938 	/* Allocate the inband task memory handle */
3939 	status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl));
3940 	if (status) {
3941 		PRN("Could not initialize LDC channel %lu, "
3942 		    "alloc_handle failed with error %d", ldc_id, status);
3943 		return (ENXIO);
3944 	}
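	/*
	 * (At this point the channel is initialized, its event callback is
	 * registered, and ldc_up() has been attempted; any failure from
	 * here on is unwound by vds_destroy_vd() via the caller.)
	 */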
        /* Add the successfully-initialized vdisk to the server's table */
        if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) {
                PRN("Error adding vdisk ID %lu to table", id);
                return (EIO);
        }

        /* Allocate the staging buffer */
        vd->max_msglen = sizeof (vio_msg_t);    /* baseline vio message size */
        vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);

        /* store initial state */
        vd->state = VD_STATE_INIT;

        return (0);
}

static void
vd_free_dring_task(vd_t *vdp)
{
        if (vdp->dring_task != NULL) {
                ASSERT(vdp->dring_len != 0);
                /* Free all dring_task memory handles */
                for (int i = 0; i < vdp->dring_len; i++) {
                        (void) ldc_mem_free_handle(vdp->dring_task[i].mhdl);
                        kmem_free(vdp->dring_task[i].msg, vdp->max_msglen);
                        vdp->dring_task[i].msg = NULL;
                }
                kmem_free(vdp->dring_task,
                    (sizeof (*vdp->dring_task)) * vdp->dring_len);
                vdp->dring_task = NULL;
        }
}

/*
 * Destroy the state associated with a virtual disk
 */
static void
vds_destroy_vd(void *arg)
{
        vd_t    *vd = (vd_t *)arg;
        int     retry = 0, rv;

        if (vd == NULL)
                return;

        PR0("Destroying vdisk state");

        if (vd->dk_efi.dki_data != NULL)
                kmem_free(vd->dk_efi.dki_data, vd->dk_efi.dki_length);

        /* Disable queuing requests for the vdisk */
        if (vd->initialized & VD_LOCKING) {
                mutex_enter(&vd->lock);
                vd->enabled = 0;
                mutex_exit(&vd->lock);
        }

        /* Drain and destroy start queue (*before* destroying completionq) */
        if (vd->startq != NULL)
                ddi_taskq_destroy(vd->startq);  /* waits for queued tasks */

        /* Drain and destroy completion queue (*before* shutting down LDC) */
        if (vd->completionq != NULL)
                ddi_taskq_destroy(vd->completionq);     /* waits for tasks */

        vd_free_dring_task(vd);

        /* Free the inband task memory handle */
        (void) ldc_mem_free_handle(vd->inband_task.mhdl);

        /* Shut down LDC */
        if (vd->initialized & VD_LDC) {
                /* unmap the dring */
                if (vd->initialized & VD_DRING)
                        (void) ldc_mem_dring_unmap(vd->dring_handle);

                /* close LDC channel - retry on EAGAIN */
                while ((rv = ldc_close(vd->ldc_handle)) == EAGAIN) {
                        if (++retry > vds_ldc_retries) {
                                PR0("Timed out closing channel");
                                break;
                        }
                        drv_usecwait(vds_ldc_delay);
                }
                if (rv == 0) {
                        (void) ldc_unreg_callback(vd->ldc_handle);
                        (void) ldc_fini(vd->ldc_handle);
                } else {
                        /*
                         * Closing the LDC channel has failed. Ideally we
                         * should fail here but there is no Zeus level
                         * infrastructure to handle this. The MD has already
                         * been changed and we have to do the close. So we
                         * try to do as much clean up as we can.
                         */
                        (void) ldc_set_cb_mode(vd->ldc_handle, LDC_CB_DISABLE);
                        while (ldc_unreg_callback(vd->ldc_handle) == EAGAIN)
                                drv_usecwait(vds_ldc_delay);
                }
        }

        /* Free the staging buffer for msgs */
        if (vd->vio_msgp != NULL) {
                kmem_free(vd->vio_msgp, vd->max_msglen);
                vd->vio_msgp = NULL;
        }

        /* Free the inband message buffer */
        if (vd->inband_task.msg != NULL) {
                kmem_free(vd->inband_task.msg, vd->max_msglen);
                vd->inband_task.msg = NULL;
        }

        if (vd->file) {
                /* Close file */
                (void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1,
                    0, kcred);
                VN_RELE(vd->file_vnode);
                if (vd->file_devid != NULL)
                        ddi_devid_free(vd->file_devid);
        } else {
                /* Close any open backing-device slices */
                for (uint_t slice = 0; slice < vd->nslices; slice++) {
                        if (vd->ldi_handle[slice] != NULL) {
                                PR0("Closing slice %u", slice);
                                (void) ldi_close(vd->ldi_handle[slice],
                                    vd_open_flags | FNDELAY, kcred);
                        }
                }
        }

        /* Free lock */
        if (vd->initialized & VD_LOCKING)
                mutex_destroy(&vd->lock);

        /* Finally, free the vdisk structure itself */
        kmem_free(vd, sizeof (*vd));
}

static int
vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id)
{
        int     status;
        vd_t    *vd = NULL;


        if ((status = vds_do_init_vd(vds, id, device_path, ldc_id, &vd)) != 0)
                vds_destroy_vd(vd);

        return (status);
}

static int
vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel,
    uint64_t *ldc_id)
{
        int num_channels;


        /* Look for channel endpoint child(ren) of the vdisk MD node */
        if ((num_channels = md_scan_dag(md, vd_node,
            md_find_name(md, VD_CHANNEL_ENDPOINT),
            md_find_name(md, "fwd"), channel)) <= 0) {
                PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT);
                return (-1);
        }

        /* Get the "id" value for the first channel endpoint node */
        if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) {
                PRN("No \"%s\" property found for \"%s\" of vdisk",
                    VD_ID_PROP, VD_CHANNEL_ENDPOINT);
                return (-1);
        }

        if (num_channels > 1) {
                PRN("Using ID of first of multiple channels for this vdisk");
        }

        return (0);
}

static int
vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id)
{
        int             num_nodes, status;
        size_t          size;
        mde_cookie_t    *channel;


        if ((num_nodes = md_node_count(md)) <= 0) {
                PRN("Invalid node count in Machine Description subtree");
                return (-1);
        }
        size = num_nodes * (sizeof (*channel));
        channel = kmem_zalloc(size, KM_SLEEP);
        status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id);
        kmem_free(channel, size);

        return (status);
}

static void
vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
{
        char            *device_path = NULL;
        uint64_t        id = 0, ldc_id = 0;


        if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
                PRN("Error getting vdisk \"%s\"", VD_ID_PROP);
                return;
        }
        PR0("Adding vdisk ID %lu", id);
        if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP,
            &device_path) != 0) {
                PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
                return;
        }

        if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) {
                PRN("Error getting LDC ID for vdisk %lu", id);
                return;
        }
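
        /*
         * Explanatory note (no new behavior): device_path returned by
         * md_get_prop_str() points into the MD's own storage;
         * vds_do_init_vd() takes its own copy into vd->device_path, so the
         * vdisk does not depend on this MD snapshot remaining around.
         */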
        if (vds_init_vd(vds, id, device_path, ldc_id) != 0) {
                PRN("Failed to add vdisk ID %lu", id);
                return;
        }
}

static void
vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
{
        uint64_t        id = 0;


        if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
                PRN("Unable to get \"%s\" property from vdisk's MD node",
                    VD_ID_PROP);
                return;
        }
        PR0("Removing vdisk ID %lu", id);
        if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
                PRN("No vdisk entry found for vdisk ID %lu", id);
}

static void
vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node,
    md_t *curr_md, mde_cookie_t curr_vd_node)
{
        char            *curr_dev, *prev_dev;
        uint64_t        curr_id = 0, curr_ldc_id = 0;
        uint64_t        prev_id = 0, prev_ldc_id = 0;
        size_t          len;


        /* Validate that vdisk ID has not changed */
        if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) {
                PRN("Error getting previous vdisk \"%s\" property",
                    VD_ID_PROP);
                return;
        }
        if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) {
                PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP);
                return;
        }
        if (curr_id != prev_id) {
                PRN("Not changing vdisk: ID changed from %lu to %lu",
                    prev_id, curr_id);
                return;
        }

        /* Validate that LDC ID has not changed */
        if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) {
                PRN("Error getting LDC ID for vdisk %lu", prev_id);
                return;
        }
        if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) {
                PRN("Error getting LDC ID for vdisk %lu", curr_id);
                return;
        }
        if (curr_ldc_id != prev_ldc_id) {
                _NOTE(NOTREACHED);      /* lint is confused */
                PRN("Not changing vdisk: "
                    "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id);
                return;
        }

        /* Determine whether device path has changed */
        if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP,
            &prev_dev) != 0) {
                PRN("Error getting previous vdisk \"%s\"",
                    VD_BLOCK_DEVICE_PROP);
                return;
        }
        if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP,
            &curr_dev) != 0) {
                PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
                return;
        }
        if (((len = strlen(curr_dev)) == strlen(prev_dev)) &&
            (strncmp(curr_dev, prev_dev, len) == 0))
                return; /* no relevant (supported) change */

        PR0("Changing vdisk ID %lu", prev_id);

        /* Remove old state, which will close vdisk and reset */
        if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0)
                PRN("No entry found for vdisk ID %lu", prev_id);

        /* Re-initialize vdisk with new state */
        if (vds_init_vd(vds, curr_id, curr_dev, curr_ldc_id) != 0) {
                PRN("Failed to change vdisk ID %lu", curr_id);
                return;
        }
}

static int
vds_process_md(void *arg, mdeg_result_t *md)
{
        int     i;
        vds_t   *vds = arg;


        if (md == NULL)
                return (MDEG_FAILURE);
        ASSERT(vds != NULL);

        for (i = 0; i < md->removed.nelem; i++)
                vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]);
        for (i = 0; i < md->match_curr.nelem; i++)
                vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i],
                    md->match_curr.mdp, md->match_curr.mdep[i]);
        for (i = 0; i < md->added.nelem; i++)
                vds_add_vd(vds, md->added.mdp, md->added.mdep[i]);

        return (MDEG_SUCCESS);
}
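
/*
 * Explanatory note on vds_process_md() (a reading aid, not new behavior):
 * removals are processed before changes, and changes before additions,
 * presumably so that a vdisk whose resources (e.g. its LDC channel) are
 * being reused is torn down before its replacement is created.  Note that
 * mod_hash_destroy() invokes the table's value destructor,
 * vds_destroy_vd(), which is why removing the hash entry alone is enough
 * to close the backing device and shut down the channel.
 */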
static int
vds_do_attach(dev_info_t *dip)
{
        int                     status, sz;
        int                     cfg_handle;
        minor_t                 instance = ddi_get_instance(dip);
        vds_t                   *vds;
        mdeg_prop_spec_t        *pspecp;
        mdeg_node_spec_t        *ispecp;

        /*
         * The "cfg-handle" property of a vds node in an MD contains the MD's
         * notion of "instance", or unique identifier, for that node; OBP
         * stores the value of the "cfg-handle" MD property as the value of
         * the "reg" property on the node in the device tree it builds from
         * the MD and passes to Solaris.  Thus, we look up the devinfo node's
         * "reg" property value to uniquely identify this device instance when
         * registering with the MD event-generation framework.  If the "reg"
         * property cannot be found, the device tree state is presumably so
         * broken that there is no point in continuing.
         */
        if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
            VD_REG_PROP)) {
                PRN("vds \"%s\" property does not exist", VD_REG_PROP);
                return (DDI_FAILURE);
        }

        /* Get the MD instance for later MDEG registration */
        cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
            VD_REG_PROP, -1);

        if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) {
                PRN("Could not allocate state for instance %u", instance);
                return (DDI_FAILURE);
        }

        if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
                PRN("Could not get state for instance %u", instance);
                ddi_soft_state_free(vds_state, instance);
                return (DDI_FAILURE);
        }

        vds->dip = dip;
        vds->vd_table = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS,
            vds_destroy_vd, sizeof (void *));
        ASSERT(vds->vd_table != NULL);

        if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
                PRN("ldi_ident_from_dip() returned errno %d", status);
                return (DDI_FAILURE);
        }
        vds->initialized |= VDS_LDI;

        /* Register for MD updates */
        sz = sizeof (vds_prop_template);
        pspecp = kmem_alloc(sz, KM_SLEEP);
        bcopy(vds_prop_template, pspecp, sz);

        VDS_SET_MDEG_PROP_INST(pspecp, cfg_handle);

        /* initialize the complete prop spec structure */
        ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
        ispecp->namep = "virtual-device";
        ispecp->specp = pspecp;

        if (mdeg_register(ispecp, &vd_match, vds_process_md, vds,
            &vds->mdeg) != MDEG_SUCCESS) {
                PRN("Unable to register for MD updates");
                kmem_free(ispecp, sizeof (mdeg_node_spec_t));
                kmem_free(pspecp, sz);
                return (DDI_FAILURE);
        }

        vds->ispecp = ispecp;
        vds->initialized |= VDS_MDEG;

        /* Prevent auto-detaching so driver is available whenever MD changes */
        if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
            DDI_PROP_SUCCESS) {
                PRN("failed to set \"%s\" property for instance %u",
                    DDI_NO_AUTODETACH, instance);
        }

        ddi_report_dev(dip);
        return (DDI_SUCCESS);
}

static int
vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        int     status;

        switch (cmd) {
        case DDI_ATTACH:
                PR0("Attaching");
                if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
                        (void) vds_detach(dip, DDI_DETACH);
                return (status);
        case DDI_RESUME:
                PR0("No action required for DDI_RESUME");
                return (DDI_SUCCESS);
        default:
                return (DDI_FAILURE);
        }
}
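
/*
 * Explanatory note on vds_do_attach() (a reading aid, not new behavior):
 * vds_prop_template is copied per instance because VDS_SET_MDEG_PROP_INST()
 * patches the instance-specific "cfg-handle" value into the copy before
 * registration.  Once mdeg_register() succeeds, ispecp/pspecp are recorded
 * in the soft state (vds->ispecp), presumably so they can be freed when
 * the instance later unregisters from the MDEG during detach.
 */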
static struct dev_ops vds_ops = {
        DEVO_REV,       /* devo_rev */
        0,              /* devo_refcnt */
        ddi_no_info,    /* devo_getinfo */
        nulldev,        /* devo_identify */
        nulldev,        /* devo_probe */
        vds_attach,     /* devo_attach */
        vds_detach,     /* devo_detach */
        nodev,          /* devo_reset */
        NULL,           /* devo_cb_ops */
        NULL,           /* devo_bus_ops */
        nulldev         /* devo_power */
};

static struct modldrv modldrv = {
        &mod_driverops,
        "virtual disk server",
        &vds_ops,
};

static struct modlinkage modlinkage = {
        MODREV_1,
        &modldrv,
        NULL
};


int
_init(void)
{
        int     i, status;


        if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
                return (status);
        if ((status = mod_install(&modlinkage)) != 0) {
                ddi_soft_state_fini(&vds_state);
                return (status);
        }

        /* Fill in the bit-mask of server-supported operations */
        for (i = 0; i < vds_noperations; i++)
                vds_operations |= 1 << (vds_operation[i].operation - 1);

        return (0);
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
        int     status;


        if ((status = mod_remove(&modlinkage)) != 0)
                return (status);
        ddi_soft_state_fini(&vds_state);
        return (0);
}
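
/*
 * Illustrative example of the bit-mask computed in _init() (the operation
 * codes used here are assumed values for illustration only; the real set
 * comes from vds_operation[]): if the supported operation codes were
 * 1, 2, and 7, then
 *
 *	vds_operations = (1 << 0) | (1 << 1) | (1 << 6) = 0x43
 *
 * i.e. bit (op - 1) is set for each supported operation code, giving a
 * compact mask of the operations this server implements.
 */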