/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Virtual disk server
 */


#include <sys/types.h>
#include <sys/conf.h>
#include <sys/crc32.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/file.h>
#include <sys/mdeg.h>
#include <sys/modhash.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/sysmacros.h>
#include <sys/vio_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdsk_common.h>
#include <sys/vtoc.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/scsi/impl/uscsi.h>
#include <vm/seg_map.h>

/* Virtual disk server initialization flags */
#define	VDS_LDI			0x01
#define	VDS_MDEG		0x02

/* Virtual disk server tunable parameters */
#define	VDS_RETRIES		5
#define	VDS_LDC_DELAY		1000	/* 1 msecs */
#define	VDS_DEV_DELAY		10000000	/* 10 secs */
#define	VDS_NCHAINS		32

/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
#define	VDS_NAME		"virtual-disk-server"

#define	VD_NAME			"vd"
#define	VD_VOLUME_NAME		"vdisk"
#define	VD_ASCIILABEL		"Virtual Disk"

#define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
#define	VD_ID_PROP		"id"
#define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
#define	VD_REG_PROP		"reg"

/* Virtual disk initialization flags */
#define	VD_DISK_READY		0x01
#define	VD_LOCKING		0x02
#define	VD_LDC			0x04
#define	VD_DRING		0x08
#define	VD_SID			0x10
#define	VD_SEQ_NUM		0x20

/* Flags for opening/closing backing devices via LDI */
#define	VD_OPEN_FLAGS		(FEXCL | FREAD | FWRITE)

/* Flags for writing to a vdisk which is a file */
#define	VD_FILE_WRITE_FLAGS	SM_ASYNC

/* Number of backup labels */
#define	VD_FILE_NUM_BACKUP	5

/* Timeout for SCSI I/O */
#define	VD_SCSI_RDWR_TIMEOUT	30	/* 30 secs */

/*
 * By Solaris convention, slice/partition 2 represents the entire disk;
 * unfortunately, this convention does not appear to be codified.
 */
#define	VD_ENTIRE_DISK_SLICE	2

/* Return a cpp token as a string */
#define	STRINGIZE(token)	#token

/*
 * Print a message prefixed with the current function name to the message log
 * (and optionally to the console for verbose boots); these macros use cpp's
 * concatenation of string literals and C99 variable-length-argument-list
 * macros
 */
#define	PRN(...)	_PRN("?%s(): "__VA_ARGS__, "")
#define	_PRN(format, ...)					\
	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)

/* Return a pointer to the "i"th vdisk dring element */
#define	VD_DRING_ELEM(i) ((vd_dring_entry_t *)(void *)	\
	    (vd->dring + (i)*vd->descriptor_size))

/* Return the virtual disk client's type as a string (for use in messages) */
#define	VD_CLIENT(vd)							\
	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
	    (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" :	\
		(((vd)->xfer_mode == 0) ? "null client" :		\
		    "unsupported client")))

/* Read disk label from a disk on file */
#define	VD_FILE_LABEL_READ(vd, labelp) \
	vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/* Write disk label to a disk on file */
#define	VD_FILE_LABEL_WRITE(vd, labelp)	\
	vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vds instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t	vds_prop_template[] = {
	{ MDET_PROP_STR,	"name",		VDS_NAME },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};

#define	VDS_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t	vd_prop_match[] = {
	{ MDET_PROP_VAL, VD_ID_PROP },
	{ MDET_LIST_END, NULL }
};

static mdeg_node_match_t vd_match = {"virtual-device-port",
				    vd_prop_match};

/* Debugging macros */
#ifdef DEBUG

static int	vd_msglevel = 0;

#define	PR0 if (vd_msglevel > 0)	PRN
#define	PR1 if (vd_msglevel > 1)	PRN
#define	PR2 if (vd_msglevel > 2)	PRN

#define	VD_DUMP_DRING_ELEM(elem)					\
	PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
	    elem->hdr.dstate,						\
	    elem->payload.operation,					\
	    elem->payload.status,					\
	    elem->payload.nbytes,					\
	    elem->payload.addr,						\
	    elem->payload.ncookies);

char *
vd_decode_state(int state)
{
	char *str;

#define	CASE_STATE(_s)	case _s: str = #_s; break;

	switch (state) {
	CASE_STATE(VD_STATE_INIT)
	CASE_STATE(VD_STATE_VER)
	CASE_STATE(VD_STATE_ATTR)
	CASE_STATE(VD_STATE_DRING)
	CASE_STATE(VD_STATE_RDX)
	CASE_STATE(VD_STATE_DATA)
	default: str = "unknown"; break;
	}

#undef CASE_STATE

	return (str);
}

void
vd_decode_tag(vio_msg_t *msg)
{
	char *tstr, *sstr, *estr;

#define	CASE_TYPE(_s)	case _s: tstr = #_s; break;

	switch (msg->tag.vio_msgtype) {
	CASE_TYPE(VIO_TYPE_CTRL)
	CASE_TYPE(VIO_TYPE_DATA)
	CASE_TYPE(VIO_TYPE_ERR)
	default: tstr = "unknown"; break;
	}

#undef CASE_TYPE

#define	CASE_SUBTYPE(_s) case _s: sstr = #_s; break;

	switch (msg->tag.vio_subtype) {
	CASE_SUBTYPE(VIO_SUBTYPE_INFO)
	CASE_SUBTYPE(VIO_SUBTYPE_ACK)
	CASE_SUBTYPE(VIO_SUBTYPE_NACK)
	default: sstr = "unknown"; break;
	}

#undef CASE_SUBTYPE

#define	CASE_ENV(_s)	case _s: estr = #_s; break;

	switch (msg->tag.vio_subtype_env) {
	CASE_ENV(VIO_VER_INFO)
	CASE_ENV(VIO_ATTR_INFO)
	CASE_ENV(VIO_DRING_REG)
	CASE_ENV(VIO_DRING_UNREG)
	CASE_ENV(VIO_RDX)
	CASE_ENV(VIO_PKT_DATA)
	CASE_ENV(VIO_DESC_DATA)
	CASE_ENV(VIO_DRING_DATA)
	default: estr = "unknown"; break;
	}

#undef CASE_ENV

	PR1("(%x/%x/%x) message : (%s/%s/%s)",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, tstr, sstr, estr);
}

#else	/* !DEBUG */

#define	PR0(...)
#define	PR1(...)
#define	PR2(...)

#define	VD_DUMP_DRING_ELEM(elem)

#define	vd_decode_state(_s)	(NULL)
#define	vd_decode_tag(_s)	(NULL)

#endif	/* DEBUG */


/*
 * Soft state structure for a vds instance
 */
typedef struct vds {
	uint_t		initialized;	/* driver inst initialization flags */
	dev_info_t	*dip;		/* driver inst devinfo pointer */
	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
	mod_hash_t	*vd_table;	/* table of virtual disks served */
	mdeg_node_spec_t *ispecp;	/* mdeg node specification */
	mdeg_handle_t	mdeg;		/* handle for MDEG operations */
} vds_t;

/*
 * Types of descriptor-processing tasks
 */
typedef enum vd_task_type {
	VD_NONFINAL_RANGE_TASK,	/* task for intermediate descriptor in range */
	VD_FINAL_RANGE_TASK,	/* task for last in a range of descriptors */
} vd_task_type_t;

/*
 * Structure describing the task for processing a descriptor
 */
typedef struct vd_task {
	struct vd		*vd;		/* vd instance task is for */
	vd_task_type_t		type;		/* type of descriptor task */
	int			index;		/* dring elem index for task */
	vio_msg_t		*msg;		/* VIO message task is for */
	size_t			msglen;		/* length of message content */
	vd_dring_payload_t	*request;	/* request task will perform */
	struct buf		buf;		/* buf(9s) for I/O request */
	ldc_mem_handle_t	mhdl;		/* task memory handle */
} vd_task_t;

/*
 * Soft state structure for a virtual disk instance
 */
typedef struct vd {
	uint_t			initialized;	/* vdisk initialization flags */
	vds_t			*vds;		/* server for this vdisk */
	ddi_taskq_t		*startq;	/* queue for I/O start tasks */
	ddi_taskq_t		*completionq;	/* queue for completion tasks */
	ldi_handle_t		ldi_handle[V_NUMPAR];	/* LDI slice handles */
	char			device_path[MAXPATHLEN + 1]; /* vdisk device */
	dev_t			dev[V_NUMPAR];	/* dev numbers for slices */
	uint_t			nslices;	/* number of slices */
	size_t			vdisk_size;	/* number of blocks in vdisk */
	vd_disk_type_t		vdisk_type;	/* slice or entire disk */
	vd_disk_label_t		vdisk_label;	/* EFI or VTOC label */
	ushort_t		max_xfer_sz;	/* max xfer size in DEV_BSIZE */
	boolean_t		pseudo;		/* underlying pseudo dev */
	boolean_t		file;		/* underlying file */
	vnode_t			*file_vnode;	/* file vnode */
	size_t			file_size;	/* file size */
	ddi_devid_t		file_devid;	/* devid for disk image */
	struct dk_efi		dk_efi;		/* synthetic for slice type */
	struct dk_geom		dk_geom;	/* synthetic for slice type */
	struct vtoc		vtoc;		/* synthetic for slice type */
	ldc_status_t		ldc_state;	/* LDC connection state */
	ldc_handle_t		ldc_handle;	/* handle for LDC comm */
	size_t			max_msglen;	/* largest LDC message len */
	vd_state_t		state;		/* client handshake state */
	uint8_t			xfer_mode;	/* transfer mode with client */
	uint32_t		sid;		/* client's session ID */
	uint64_t		seq_num;	/* message sequence number */
	uint64_t		dring_ident;	/* identifier of dring */
	ldc_dring_handle_t	dring_handle;	/* handle for dring ops */
	uint32_t		descriptor_size;	/* num bytes in desc */
	uint32_t		dring_len;	/* number of dring elements */
	caddr_t			dring;		/* address of dring */
	caddr_t			vio_msgp;	/* vio msg staging buffer */
	vd_task_t		inband_task;	/* task for inband descriptor */
	vd_task_t		*dring_task;	/* tasks dring elements */

	kmutex_t		lock;		/* protects variables below */
	boolean_t		enabled;	/* is vdisk enabled? */
	boolean_t		reset_state;	/* reset connection state? */
	boolean_t		reset_ldc;	/* reset LDC channel? */
} vd_t;

typedef struct vds_operation {
	char	*namep;
	uint8_t	operation;
	int	(*start)(vd_task_t *task);
	void	(*complete)(void *arg);
} vds_operation_t;

typedef struct vd_ioctl {
	uint8_t		operation;		/* vdisk operation */
	const char	*operation_name;	/* vdisk operation name */
	size_t		nbytes;			/* size of operation buffer */
	int		cmd;			/* corresponding ioctl cmd */
	const char	*cmd_name;		/* ioctl cmd name */
	void		*arg;			/* ioctl cmd argument */
	/* convert input vd_buf to output ioctl_arg */
	void		(*copyin)(void *vd_buf, void *ioctl_arg);
	/* convert input ioctl_arg to output vd_buf */
	void		(*copyout)(void *ioctl_arg, void *vd_buf);
} vd_ioctl_t;

/* Define trivial copyin/copyout conversion function flag */
#define	VD_IDENTITY	((void (*)(void *, void *))-1)


static int	vds_ldc_retries = VDS_RETRIES;
static int	vds_ldc_delay = VDS_LDC_DELAY;
static int	vds_dev_retries = VDS_RETRIES;
static int	vds_dev_delay = VDS_DEV_DELAY;
static void	*vds_state;
static uint64_t	vds_operations;	/* see vds_operation[] definition below */

static int	vd_open_flags = VD_OPEN_FLAGS;

static uint_t	vd_file_write_flags = VD_FILE_WRITE_FLAGS;

static short	vd_scsi_rdwr_timeout = VD_SCSI_RDWR_TIMEOUT;

/*
 * Supported protocol version pairs, from highest (newest) to lowest (oldest)
 *
 * Each supported major version should appear only once, paired with (and only
 * with) its highest supported minor version number (as the protocol requires
 * supporting all lower minor version numbers as well)
 */
static const vio_ver_t	vds_version[] = {{1, 0}};
static const size_t	vds_num_versions =
    sizeof (vds_version)/sizeof (vds_version[0]);

static void vd_free_dring_task(vd_t *vdp);
static int vd_setup_vd(vd_t *vd);
static boolean_t vd_enabled(vd_t *vd);

/*
 * Function:
 *	vd_file_rw
 *
 * Description:
 *	Read or write to a disk on file.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	slice		- slice on which the operation is performed,
 *			  VD_SLICE_NONE indicates that the operation
 *			  is done using an absolute disk offset.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer where data are read to or written from.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	n >= 0		- success, n indicates the number of bytes read
 *			  or written.
 *	-1		- error.
 */
static ssize_t
vd_file_rw(vd_t *vd, int slice, int operation, caddr_t data, size_t blk,
    size_t len)
{
	caddr_t	maddr;
	size_t offset, maxlen, moffset, mlen, n;
	uint_t smflags;
	enum seg_rw srw;

	ASSERT(vd->file);
	ASSERT(len > 0);

	if (slice == VD_SLICE_NONE) {
		/* raw disk access */
		offset = blk * DEV_BSIZE;
	} else {
		ASSERT(slice >= 0 && slice < V_NUMPAR);
		if (blk >= vd->vtoc.v_part[slice].p_size) {
			/* address past the end of the slice */
			PR0("req_addr (0x%lx) > psize (0x%lx)",
			    blk, vd->vtoc.v_part[slice].p_size);
			return (0);
		}

		offset = (vd->vtoc.v_part[slice].p_start + blk) * DEV_BSIZE;

		/*
		 * If the requested size is greater than the size
		 * of the partition, truncate the read/write.
		 */
		maxlen = (vd->vtoc.v_part[slice].p_size - blk) * DEV_BSIZE;

		if (len > maxlen) {
			PR0("I/O size truncated to %lu bytes from %lu bytes",
			    maxlen, len);
			len = maxlen;
		}
	}

	/*
	 * We have to ensure that we are reading/writing into the mmap
	 * range. If we have a partial disk image (e.g. an image of
	 * s0 instead of s2) the system can try to access slices that
	 * are not included in the disk image.
	 */
	if ((offset + len) >= vd->file_size) {
		PR0("offset + nbytes (0x%lx + 0x%lx) >= "
		    "file_size (0x%lx)", offset, len, vd->file_size);
		return (-1);
	}

	srw = (operation == VD_OP_BREAD)? S_READ : S_WRITE;
	smflags = (operation == VD_OP_BREAD)? 0 :
	    (SM_WRITE | vd_file_write_flags);
	n = len;

	do {
		/*
		 * segmap_getmapflt() returns a MAXBSIZE chunk which is
		 * MAXBSIZE aligned.
		 */
		moffset = offset & MAXBOFFSET;
		mlen = MIN(MAXBSIZE - moffset, n);
		maddr = segmap_getmapflt(segkmap, vd->file_vnode, offset,
		    mlen, 1, srw);
		/*
		 * Fault in the pages so we can check for error and ensure
		 * that we can safely use the mapped address.
		 */
		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
		    F_SOFTLOCK, srw) != 0) {
			(void) segmap_release(segkmap, maddr, 0);
			return (-1);
		}

		if (operation == VD_OP_BREAD)
			bcopy(maddr + moffset, data, mlen);
		else
			bcopy(data, maddr + moffset, mlen);

		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
		    F_SOFTUNLOCK, srw) != 0) {
			(void) segmap_release(segkmap, maddr, 0);
			return (-1);
		}
		if (segmap_release(segkmap, maddr, smflags) != 0)
			return (-1);
		n -= mlen;
		offset += mlen;
		data += mlen;

	} while (n > 0);

	return (len);
}

/*
 * Function:
 *	vd_file_set_vtoc
 *
 * Description:
 *	Set the vtoc of a disk image by writing the label and backup
 *	labels into the disk image backend.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	label		- the data to be written.
 *
 * Return Code:
 *	0		- success.
 *	n > 0		- error, n indicates the errno code.
 */
static int
vd_file_set_vtoc(vd_t *vd, struct dk_label *label)
{
	int blk, sec, cyl, head, cnt;

	ASSERT(vd->file);

	if (VD_FILE_LABEL_WRITE(vd, label) < 0) {
		PR0("fail to write disk label");
		return (EIO);
	}

	/*
	 * Backup labels are on the last alternate cylinder's
	 * first five odd sectors.
	 */
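	/*
	 * The loop below writes the VD_FILE_NUM_BACKUP copies on sectors
	 * 1, 3, 5, 7 and 9 of that track (sec starts at 1 and advances by
	 * 2), stopping early if the track does not have enough sectors.
	 */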
	if (label->dkl_acyl == 0) {
		PR0("no alternate cylinder, can not store backup labels");
		return (0);
	}

	cyl = label->dkl_ncyl + label->dkl_acyl - 1;
	head = label->dkl_nhead - 1;

	blk = (cyl * ((label->dkl_nhead * label->dkl_nsect) - label->dkl_apc)) +
	    (head * label->dkl_nsect);

	/*
	 * Write the backup labels. Make sure we don't try to write past
	 * the last cylinder.
	 */
	sec = 1;

	for (cnt = 0; cnt < VD_FILE_NUM_BACKUP; cnt++) {

		if (sec >= label->dkl_nsect) {
			PR0("not enough sector to store all backup labels");
			return (0);
		}

		if (vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)label,
		    blk + sec, sizeof (struct dk_label)) < 0) {
			PR0("error writing backup label at block %d\n",
			    blk + sec);
			return (EIO);
		}

		PR1("wrote backup label at block %d\n", blk + sec);

		sec += 2;
	}

	return (0);
}

/*
 * Function:
 *	vd_file_get_devid_block
 *
 * Description:
 *	Return the block number where the device id is stored.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	blkp		- pointer to the block number
 *
 * Return Code:
 *	0		- success
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_get_devid_block(vd_t *vd, size_t *blkp)
{
	diskaddr_t spc, head, cyl;

	ASSERT(vd->file);
	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);

	/* this geometry doesn't allow us to have a devid */
	if (vd->dk_geom.dkg_acyl < 2) {
		PR0("not enough alternate cylinder available for devid "
		    "(acyl=%u)", vd->dk_geom.dkg_acyl);
		return (ENOSPC);
	}

	/* the devid is on the track next to the last cylinder */
	cyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl - 2;
	spc = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
	head = vd->dk_geom.dkg_nhead - 1;

	*blkp = (cyl * (spc - vd->dk_geom.dkg_apc)) +
	    (head * vd->dk_geom.dkg_nsect) + 1;

	return (0);
}

/*
 * Return the checksum of a disk block containing an on-disk devid.
 */
static uint_t
vd_dkdevid2cksum(struct dk_devid *dkdevid)
{
	uint_t chksum, *ip;
	int i;

	chksum = 0;
	ip = (uint_t *)dkdevid;
	for (i = 0; i < ((DEV_BSIZE - sizeof (int)) / sizeof (int)); i++)
		chksum ^= ip[i];

	return (chksum);
}

/*
 * Function:
 *	vd_file_read_devid
 *
 * Description:
 *	Read the device id stored on a disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	devid		- the return address of the device ID.
 *
 * Return Code:
 *	0		- success
 *	EIO		- I/O error while trying to access the disk image
 *	EINVAL		- no valid device id was found
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_read_devid(vd_t *vd, ddi_devid_t *devid)
{
	struct dk_devid *dkdevid;
	size_t blk;
	uint_t chksum;
	int status, sz;

	if ((status = vd_file_get_devid_block(vd, &blk)) != 0)
		return (status);

	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);

	/* get the devid */
	if ((vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)dkdevid, blk,
	    DEV_BSIZE)) < 0) {
		PR0("error reading devid block at %lu", blk);
		status = EIO;
		goto done;
	}

	/* validate the revision */
	if ((dkdevid->dkd_rev_hi != DK_DEVID_REV_MSB) ||
	    (dkdevid->dkd_rev_lo != DK_DEVID_REV_LSB)) {
		PR0("invalid devid found at block %lu (bad revision)", blk);
		status = EINVAL;
		goto done;
	}

	/* compute checksum */
	chksum = vd_dkdevid2cksum(dkdevid);

	/* compare the checksums */
	if (DKD_GETCHKSUM(dkdevid) != chksum) {
		PR0("invalid devid found at block %lu (bad checksum)", blk);
		status = EINVAL;
		goto done;
	}

	/* validate the device id */
	if (ddi_devid_valid((ddi_devid_t)&dkdevid->dkd_devid) != DDI_SUCCESS) {
		PR0("invalid devid found at block %lu", blk);
		status = EINVAL;
		goto done;
	}

	PR1("devid read at block %lu", blk);

	sz = ddi_devid_sizeof((ddi_devid_t)&dkdevid->dkd_devid);
	*devid = kmem_alloc(sz, KM_SLEEP);
	bcopy(&dkdevid->dkd_devid, *devid, sz);

done:
	kmem_free(dkdevid, DEV_BSIZE);
	return (status);

}

/*
 * Function:
 *	vd_file_write_devid
 *
 * Description:
 *	Write a device id into a disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	devid		- the device ID to store.
 *
 * Return Code:
 *	0		- success
 *	EIO		- I/O error while trying to access the disk image
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_write_devid(vd_t *vd, ddi_devid_t devid)
{
	struct dk_devid *dkdevid;
	uint_t chksum;
	size_t blk;
	int status;

	if ((status = vd_file_get_devid_block(vd, &blk)) != 0)
		return (status);

	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);

	/* set revision */
	dkdevid->dkd_rev_hi = DK_DEVID_REV_MSB;
	dkdevid->dkd_rev_lo = DK_DEVID_REV_LSB;

	/* copy devid */
	bcopy(devid, &dkdevid->dkd_devid, ddi_devid_sizeof(devid));

	/* compute checksum */
	chksum = vd_dkdevid2cksum(dkdevid);

	/* set checksum */
	DKD_FORMCHKSUM(chksum, dkdevid);

	/* store the devid */
	if ((status = vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
	    (caddr_t)dkdevid, blk, DEV_BSIZE)) < 0) {
		PR0("Error writing devid block at %lu", blk);
		status = EIO;
	} else {
		PR1("devid written at block %lu", blk);
		status = 0;
	}

	kmem_free(dkdevid, DEV_BSIZE);
	return (status);
}

/*
 * Function:
 *	vd_scsi_rdwr
 *
 * Description:
 *	Read or write to a SCSI disk using an absolute disk offset.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer where data are read to or written from.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	0		- success
 *	n != 0		- error.
 */
static int
vd_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t blk, size_t len)
{
	struct uscsi_cmd ucmd;
	union scsi_cdb cdb;
	int nsectors, nblk;
	int max_sectors;
	int status, rval;

	ASSERT(!vd->file);

	max_sectors = vd->max_xfer_sz;
	nblk = (len / DEV_BSIZE);

	if (len % DEV_BSIZE != 0)
		return (EINVAL);

	/*
	 * Build and execute the uscsi ioctl. We build a group0, group1
	 * or group4 command as necessary, since some targets
	 * do not support group1 commands.
	 */
	while (nblk) {

		bzero(&ucmd, sizeof (ucmd));
		bzero(&cdb, sizeof (cdb));

		nsectors = (max_sectors < nblk) ? max_sectors : nblk;

		if (blk < (2 << 20) && nsectors <= 0xff) {
			FORMG0ADDR(&cdb, blk);
			FORMG0COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP0;
		} else if (blk > 0xffffffff) {
			FORMG4LONGADDR(&cdb, blk);
			FORMG4COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP4;
			cdb.scc_cmd |= SCMD_GROUP4;
		} else {
			FORMG1ADDR(&cdb, blk);
			FORMG1COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP1;
			cdb.scc_cmd |= SCMD_GROUP1;
		}

		ucmd.uscsi_cdb = (caddr_t)&cdb;
		ucmd.uscsi_bufaddr = data;
		ucmd.uscsi_buflen = nsectors * DEV_BSIZE;
		ucmd.uscsi_timeout = vd_scsi_rdwr_timeout;
		/*
		 * Set flags so that the command is isolated from normal
		 * commands and no error message is printed.
		 */
		ucmd.uscsi_flags = USCSI_ISOLATE | USCSI_SILENT;

		if (operation == VD_OP_BREAD) {
			cdb.scc_cmd |= SCMD_READ;
			ucmd.uscsi_flags |= USCSI_READ;
		} else {
			cdb.scc_cmd |= SCMD_WRITE;
		}

		status = ldi_ioctl(vd->ldi_handle[VD_ENTIRE_DISK_SLICE],
		    USCSICMD, (intptr_t)&ucmd, (vd_open_flags | FKIOCTL),
		    kcred, &rval);

		if (status == 0)
			status = ucmd.uscsi_status;

		if (status != 0)
			break;

		/*
		 * Check if partial DMA breakup is required. If so, reduce
		 * the request size by half and retry the last request.
		 */
		if (ucmd.uscsi_resid == ucmd.uscsi_buflen) {
			max_sectors >>= 1;
			if (max_sectors <= 0) {
				status = EIO;
				break;
			}
			continue;
		}

		if (ucmd.uscsi_resid != 0) {
			status = EIO;
			break;
		}

		blk += nsectors;
		nblk -= nsectors;
		data += nsectors * DEV_BSIZE; /* SECSIZE */
	}

	return (status);
}

static int
vd_start_bio(vd_task_t *task)
{
	int			rv, status = 0;
	vd_t			*vd = task->vd;
	vd_dring_payload_t	*request = task->request;
	struct buf		*buf = &task->buf;
	uint8_t			mtype;
	int			slice;

	ASSERT(vd != NULL);
	ASSERT(request != NULL);

	slice = request->slice;

	ASSERT(slice == VD_SLICE_NONE || slice < vd->nslices);
	ASSERT((request->operation == VD_OP_BREAD) ||
	    (request->operation == VD_OP_BWRITE));

	if (request->nbytes == 0)
		return (EINVAL);	/* no service for trivial requests */

	PR1("%s %lu bytes at block %lu",
	    (request->operation == VD_OP_BREAD) ? "Read" : "Write",
	    request->nbytes, request->addr);

	bioinit(buf);
	buf->b_flags = B_BUSY;
	buf->b_bcount = request->nbytes;
	buf->b_lblkno = request->addr;
	buf->b_edev = (slice == VD_SLICE_NONE)? NODEV : vd->dev[slice];

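	/*
	 * In-band (descriptor mode) requests are handled through the single
	 * inband_task and use a shadow mapping of the client's buffer;
	 * dring-mode requests can map the exported memory directly.
	 */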
	mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP;

	/* Map memory exported by client */
	status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies,
	    mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R,
	    &(buf->b_un.b_addr), NULL);
	if (status != 0) {
		PR0("ldc_mem_map() returned err %d ", status);
		biofini(buf);
		return (status);
	}

	status = ldc_mem_acquire(task->mhdl, 0, buf->b_bcount);
	if (status != 0) {
		(void) ldc_mem_unmap(task->mhdl);
		PR0("ldc_mem_acquire() returned err %d ", status);
		biofini(buf);
		return (status);
	}

	buf->b_flags |= (request->operation == VD_OP_BREAD) ? B_READ : B_WRITE;

	/* Start the block I/O */
	if (vd->file) {
		rv = vd_file_rw(vd, slice, request->operation, buf->b_un.b_addr,
		    request->addr, request->nbytes);
		if (rv < 0) {
			request->nbytes = 0;
			status = EIO;
		} else {
			request->nbytes = rv;
			status = 0;
		}
	} else {
		if (slice == VD_SLICE_NONE) {
			/*
			 * This is not a disk image so it is a real disk. We
			 * assume that the underlying device driver supports
			 * USCSICMD ioctls. This is the case for all SCSI
			 * devices (sd, ssd...).
			 *
			 * In the future if we have non-SCSI disks we would need
			 * to invoke the appropriate function to do I/O using an
			 * absolute disk offset (for example using DKIOCTL_RWCMD
			 * for IDE disks).
			 */
			rv = vd_scsi_rdwr(vd, request->operation,
			    buf->b_un.b_addr, request->addr, request->nbytes);
			if (rv != 0) {
				request->nbytes = 0;
				status = EIO;
			} else {
				status = 0;
			}
		} else {
			status = ldi_strategy(vd->ldi_handle[slice], buf);
			if (status == 0)
				/* will complete on completionq */
				return (EINPROGRESS);
		}
	}

	/* Clean up after error */
	rv = ldc_mem_release(task->mhdl, 0, buf->b_bcount);
	if (rv) {
		PR0("ldc_mem_release() returned err %d ", rv);
	}
	rv = ldc_mem_unmap(task->mhdl);
	if (rv) {
		PR0("ldc_mem_unmap() returned err %d ", rv);
	}

	biofini(buf);
	return (status);
}

static int
send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
{
	int	status;
	size_t	nbytes;

	do {
		nbytes = msglen;
		status = ldc_write(ldc_handle, msg, &nbytes);
		if (status != EWOULDBLOCK)
			break;
		drv_usecwait(vds_ldc_delay);
	} while (status == EWOULDBLOCK);

	if (status != 0) {
		if (status != ECONNRESET)
			PR0("ldc_write() returned errno %d", status);
		return (status);
	} else if (nbytes != msglen) {
		PR0("ldc_write() performed only partial write");
		return (EIO);
	}

	PR1("SENT %lu bytes", msglen);
	return (0);
}

static void
vd_need_reset(vd_t *vd, boolean_t reset_ldc)
{
	mutex_enter(&vd->lock);
	vd->reset_state = B_TRUE;
	vd->reset_ldc = reset_ldc;
	mutex_exit(&vd->lock);
}

/*
 * Reset the state of the connection with a client, if needed; reset the LDC
 * transport as well, if needed. This function should only be called from the
 * "vd_recv_msg", as it waits for tasks - otherwise a deadlock can occur.
 */
static void
vd_reset_if_needed(vd_t *vd)
{
	int	status = 0;

	mutex_enter(&vd->lock);
	if (!vd->reset_state) {
		ASSERT(!vd->reset_ldc);
		mutex_exit(&vd->lock);
		return;
	}
	mutex_exit(&vd->lock);

	PR0("Resetting connection state with %s", VD_CLIENT(vd));

	/*
	 * Let any asynchronous I/O complete before possibly pulling the rug
	 * out from under it; defer checking vd->reset_ldc, as one of the
	 * asynchronous tasks might set it
	 */
	ddi_taskq_wait(vd->completionq);

	if (vd->file) {
		status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred);
		if (status) {
			PR0("VOP_FSYNC returned errno %d", status);
		}
	}

	if ((vd->initialized & VD_DRING) &&
	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
		PR0("ldc_mem_dring_unmap() returned errno %d", status);

	vd_free_dring_task(vd);

	/* Free the staging buffer for msgs */
	if (vd->vio_msgp != NULL) {
		kmem_free(vd->vio_msgp, vd->max_msglen);
		vd->vio_msgp = NULL;
	}

	/* Free the inband message buffer */
	if (vd->inband_task.msg != NULL) {
		kmem_free(vd->inband_task.msg, vd->max_msglen);
		vd->inband_task.msg = NULL;
	}

	mutex_enter(&vd->lock);

	if (vd->reset_ldc)
		PR0("taking down LDC channel");
	if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0))
		PR0("ldc_down() returned errno %d", status);

	vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
	vd->state = VD_STATE_INIT;
	vd->max_msglen = sizeof (vio_msg_t);	/* baseline vio message size */

	/* Allocate the staging buffer */
	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);

	PR0("calling ldc_up\n");
	(void) ldc_up(vd->ldc_handle);

	vd->reset_state = B_FALSE;
	vd->reset_ldc = B_FALSE;

	mutex_exit(&vd->lock);
}

static void vd_recv_msg(void *arg);

static void
vd_mark_in_reset(vd_t *vd)
{
	int status;

	PR0("vd_mark_in_reset: marking vd in reset\n");

	vd_need_reset(vd, B_FALSE);
	status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP);
	if (status == DDI_FAILURE) {
		PR0("cannot schedule task to recv msg\n");
		vd_need_reset(vd, B_TRUE);
		return;
	}
}

static int
vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes)
{
	boolean_t		accepted;
	int			status;
	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);

	if (vd->reset_state)
		return (0);

	/* Acquire the element */
	if (!vd->reset_state &&
	    (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
			return (0);
		} else {
			PR0("ldc_mem_dring_acquire() returned errno %d",
			    status);
			return (status);
		}
	}

	/* Set the element's status and mark it done */
	accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED);
	if (accepted) {
		elem->payload.nbytes = elem_nbytes;
		elem->payload.status = elem_status;
		elem->hdr.dstate = VIO_DESC_DONE;
	} else {
		/* Perhaps client timed out waiting for I/O... */
		PR0("element %u no longer \"accepted\"", idx);
		VD_DUMP_DRING_ELEM(elem);
	}
	/* Release the element */
	if (!vd->reset_state &&
	    (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
			return (0);
		} else {
			PR0("ldc_mem_dring_release() returned errno %d",
			    status);
			return (status);
		}
	}

	return (accepted ? 0 : EINVAL);
}

static void
vd_complete_bio(void *arg)
{
	int			status = 0;
	vd_task_t		*task = (vd_task_t *)arg;
	vd_t			*vd = task->vd;
	vd_dring_payload_t	*request = task->request;
	struct buf		*buf = &task->buf;


	ASSERT(vd != NULL);
	ASSERT(request != NULL);
	ASSERT(task->msg != NULL);
	ASSERT(task->msglen >= sizeof (*task->msg));
	ASSERT(!vd->file);

	/* Wait for the I/O to complete */
	request->status = biowait(buf);

	/* return back the number of bytes read/written */
	request->nbytes = buf->b_bcount - buf->b_resid;

	/* Release the buffer */
	if (!vd->reset_state)
		status = ldc_mem_release(task->mhdl, 0, buf->b_bcount);
	if (status) {
		PR0("ldc_mem_release() returned errno %d copying to "
		    "client", status);
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
		}
	}

	/* Unmap the memory, even if in reset */
	status = ldc_mem_unmap(task->mhdl);
	if (status) {
		PR0("ldc_mem_unmap() returned errno %d copying to client",
		    status);
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
		}
	}

	biofini(buf);

	/* Update the dring element for a dring client */
	if (!vd->reset_state && (status == 0) &&
	    (vd->xfer_mode == VIO_DRING_MODE)) {
		status = vd_mark_elem_done(vd, task->index,
		    request->status, request->nbytes);
		if (status == ECONNRESET)
			vd_mark_in_reset(vd);
	}

	/*
	 * If a transport error occurred, arrange to "nack" the message when
	 * the final task in the descriptor element range completes
	 */
	if (status != 0)
		task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK;

	/*
	 * Only the final task for a range of elements will respond to and
	 * free the message
	 */
	if (task->type == VD_NONFINAL_RANGE_TASK) {
		return;
	}

	/*
	 * Send the "ack" or "nack" back to the client; if sending the message
	 * via LDC fails, arrange to reset both the connection state and LDC
	 * itself
	 */
	PR1("Sending %s",
	    (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
"ACK" : "NACK"); 1258 if (!vd->reset_state) { 1259 status = send_msg(vd->ldc_handle, task->msg, task->msglen); 1260 switch (status) { 1261 case 0: 1262 break; 1263 case ECONNRESET: 1264 vd_mark_in_reset(vd); 1265 break; 1266 default: 1267 PR0("initiating full reset"); 1268 vd_need_reset(vd, B_TRUE); 1269 break; 1270 } 1271 } 1272 } 1273 1274 static void 1275 vd_geom2dk_geom(void *vd_buf, void *ioctl_arg) 1276 { 1277 VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg); 1278 } 1279 1280 static void 1281 vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg) 1282 { 1283 VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg); 1284 } 1285 1286 static void 1287 dk_geom2vd_geom(void *ioctl_arg, void *vd_buf) 1288 { 1289 DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf); 1290 } 1291 1292 static void 1293 vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf) 1294 { 1295 VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf); 1296 } 1297 1298 static void 1299 vd_get_efi_in(void *vd_buf, void *ioctl_arg) 1300 { 1301 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 1302 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 1303 1304 dk_efi->dki_lba = vd_efi->lba; 1305 dk_efi->dki_length = vd_efi->length; 1306 dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP); 1307 } 1308 1309 static void 1310 vd_get_efi_out(void *ioctl_arg, void *vd_buf) 1311 { 1312 int len; 1313 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 1314 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 1315 1316 len = vd_efi->length; 1317 DK_EFI2VD_EFI(dk_efi, vd_efi); 1318 kmem_free(dk_efi->dki_data, len); 1319 } 1320 1321 static void 1322 vd_set_efi_in(void *vd_buf, void *ioctl_arg) 1323 { 1324 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 1325 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 1326 1327 dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP); 1328 VD_EFI2DK_EFI(vd_efi, dk_efi); 1329 } 1330 1331 static void 1332 vd_set_efi_out(void *ioctl_arg, void *vd_buf) 1333 { 1334 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 1335 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 1336 1337 kmem_free(dk_efi->dki_data, vd_efi->length); 1338 } 1339 1340 static int 1341 vd_read_vtoc(ldi_handle_t handle, struct vtoc *vtoc, vd_disk_label_t *label) 1342 { 1343 int status, rval; 1344 struct dk_gpt *efi; 1345 size_t efi_len; 1346 1347 *label = VD_DISK_LABEL_UNK; 1348 1349 status = ldi_ioctl(handle, DKIOCGVTOC, (intptr_t)vtoc, 1350 (vd_open_flags | FKIOCTL), kcred, &rval); 1351 1352 if (status == 0) { 1353 *label = VD_DISK_LABEL_VTOC; 1354 return (0); 1355 } else if (status != ENOTSUP) { 1356 PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status); 1357 return (status); 1358 } 1359 1360 status = vds_efi_alloc_and_read(handle, &efi, &efi_len); 1361 1362 if (status) { 1363 PR0("vds_efi_alloc_and_read returned error %d", status); 1364 return (status); 1365 } 1366 1367 *label = VD_DISK_LABEL_EFI; 1368 vd_efi_to_vtoc(efi, vtoc); 1369 vd_efi_free(efi, efi_len); 1370 1371 return (0); 1372 } 1373 1374 static ushort_t 1375 vd_lbl2cksum(struct dk_label *label) 1376 { 1377 int count; 1378 ushort_t sum, *sp; 1379 1380 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 1381 sp = (ushort_t *)label; 1382 sum = 0; 1383 while (count--) { 1384 sum ^= *sp++; 1385 } 1386 1387 return (sum); 1388 } 1389 1390 /* 1391 * Handle ioctls to a disk slice. 
 */
static int
vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
{
	dk_efi_t *dk_ioc;

	switch (vd->vdisk_label) {

	/* ioctls for a slice from a disk with a VTOC label */
	case VD_DISK_LABEL_VTOC:

		switch (cmd) {
		case DKIOCGGEOM:
			ASSERT(ioctl_arg != NULL);
			bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom));
			return (0);
		case DKIOCGVTOC:
			ASSERT(ioctl_arg != NULL);
			bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc));
			return (0);
		default:
			return (ENOTSUP);
		}

	/* ioctls for a slice from a disk with an EFI label */
	case VD_DISK_LABEL_EFI:

		switch (cmd) {
		case DKIOCGETEFI:
			ASSERT(ioctl_arg != NULL);
			dk_ioc = (dk_efi_t *)ioctl_arg;
			if (dk_ioc->dki_length < vd->dk_efi.dki_length)
				return (EINVAL);
			bcopy(vd->dk_efi.dki_data, dk_ioc->dki_data,
			    vd->dk_efi.dki_length);
			return (0);
		default:
			return (ENOTSUP);
		}

	default:
		return (ENOTSUP);
	}
}

/*
 * Handle ioctls to a disk image.
 */
static int
vd_do_file_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
{
	struct dk_label label;
	struct dk_geom *geom;
	struct vtoc *vtoc;
	int i, rc;

	ASSERT(vd->file);
	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);

	switch (cmd) {

	case DKIOCGGEOM:
		ASSERT(ioctl_arg != NULL);
		geom = (struct dk_geom *)ioctl_arg;

		if (VD_FILE_LABEL_READ(vd, &label) < 0)
			return (EIO);

		if (label.dkl_magic != DKL_MAGIC ||
		    label.dkl_cksum != vd_lbl2cksum(&label))
			return (EINVAL);

		bzero(geom, sizeof (struct dk_geom));
		geom->dkg_ncyl = label.dkl_ncyl;
		geom->dkg_acyl = label.dkl_acyl;
		geom->dkg_nhead = label.dkl_nhead;
		geom->dkg_nsect = label.dkl_nsect;
		geom->dkg_intrlv = label.dkl_intrlv;
		geom->dkg_apc = label.dkl_apc;
		geom->dkg_rpm = label.dkl_rpm;
		geom->dkg_pcyl = label.dkl_pcyl;
		geom->dkg_write_reinstruct = label.dkl_write_reinstruct;
		geom->dkg_read_reinstruct = label.dkl_read_reinstruct;

		return (0);

	case DKIOCGVTOC:
		ASSERT(ioctl_arg != NULL);
		vtoc = (struct vtoc *)ioctl_arg;

		if (VD_FILE_LABEL_READ(vd, &label) < 0)
			return (EIO);

		if (label.dkl_magic != DKL_MAGIC ||
		    label.dkl_cksum != vd_lbl2cksum(&label))
			return (EINVAL);

		bzero(vtoc, sizeof (struct vtoc));

		vtoc->v_sanity = label.dkl_vtoc.v_sanity;
		vtoc->v_version = label.dkl_vtoc.v_version;
		vtoc->v_sectorsz = DEV_BSIZE;
		vtoc->v_nparts = label.dkl_vtoc.v_nparts;

		for (i = 0; i < vtoc->v_nparts; i++) {
			vtoc->v_part[i].p_tag =
			    label.dkl_vtoc.v_part[i].p_tag;
			vtoc->v_part[i].p_flag =
			    label.dkl_vtoc.v_part[i].p_flag;
			vtoc->v_part[i].p_start =
			    label.dkl_map[i].dkl_cylno *
			    (label.dkl_nhead * label.dkl_nsect);
			vtoc->v_part[i].p_size = label.dkl_map[i].dkl_nblk;
			vtoc->timestamp[i] =
			    label.dkl_vtoc.v_timestamp[i];
		}
		/*
		 * The bootinfo array can not be copied with bcopy() because
		 * elements are of type long in vtoc (so 64-bit) and of type
		 * int in dk_vtoc (so 32-bit).
		 */
		vtoc->v_bootinfo[0] = label.dkl_vtoc.v_bootinfo[0];
		vtoc->v_bootinfo[1] = label.dkl_vtoc.v_bootinfo[1];
		vtoc->v_bootinfo[2] = label.dkl_vtoc.v_bootinfo[2];
		bcopy(label.dkl_asciilabel, vtoc->v_asciilabel,
		    LEN_DKL_ASCII);
		bcopy(label.dkl_vtoc.v_volume, vtoc->v_volume,
		    LEN_DKL_VVOL);

		return (0);

	case DKIOCSGEOM:
		ASSERT(ioctl_arg != NULL);
		geom = (struct dk_geom *)ioctl_arg;

		if (geom->dkg_nhead == 0 || geom->dkg_nsect == 0)
			return (EINVAL);

		/*
		 * The current device geometry is not updated, just the driver
		 * "notion" of it. The device geometry will be effectively
		 * updated when a label is written to the device during the
		 * next DKIOCSVTOC.
		 */
		bcopy(ioctl_arg, &vd->dk_geom, sizeof (vd->dk_geom));
		return (0);

	case DKIOCSVTOC:
		ASSERT(ioctl_arg != NULL);
		ASSERT(vd->dk_geom.dkg_nhead != 0 &&
		    vd->dk_geom.dkg_nsect != 0);
		vtoc = (struct vtoc *)ioctl_arg;

		if (vtoc->v_sanity != VTOC_SANE ||
		    vtoc->v_sectorsz != DEV_BSIZE ||
		    vtoc->v_nparts != V_NUMPAR)
			return (EINVAL);

		bzero(&label, sizeof (label));
		label.dkl_ncyl = vd->dk_geom.dkg_ncyl;
		label.dkl_acyl = vd->dk_geom.dkg_acyl;
		label.dkl_pcyl = vd->dk_geom.dkg_pcyl;
		label.dkl_nhead = vd->dk_geom.dkg_nhead;
		label.dkl_nsect = vd->dk_geom.dkg_nsect;
		label.dkl_intrlv = vd->dk_geom.dkg_intrlv;
		label.dkl_apc = vd->dk_geom.dkg_apc;
		label.dkl_rpm = vd->dk_geom.dkg_rpm;
		label.dkl_write_reinstruct = vd->dk_geom.dkg_write_reinstruct;
		label.dkl_read_reinstruct = vd->dk_geom.dkg_read_reinstruct;

		label.dkl_vtoc.v_nparts = V_NUMPAR;
		label.dkl_vtoc.v_sanity = VTOC_SANE;
		label.dkl_vtoc.v_version = vtoc->v_version;
		for (i = 0; i < V_NUMPAR; i++) {
			label.dkl_vtoc.v_timestamp[i] =
			    vtoc->timestamp[i];
			label.dkl_vtoc.v_part[i].p_tag =
			    vtoc->v_part[i].p_tag;
			label.dkl_vtoc.v_part[i].p_flag =
			    vtoc->v_part[i].p_flag;
			label.dkl_map[i].dkl_cylno =
			    vtoc->v_part[i].p_start /
			    (label.dkl_nhead * label.dkl_nsect);
			label.dkl_map[i].dkl_nblk =
			    vtoc->v_part[i].p_size;
		}
		/*
		 * The bootinfo array can not be copied with bcopy() because
		 * elements are of type long in vtoc (so 64-bit) and of type
		 * int in dk_vtoc (so 32-bit).
		 */
		label.dkl_vtoc.v_bootinfo[0] = vtoc->v_bootinfo[0];
		label.dkl_vtoc.v_bootinfo[1] = vtoc->v_bootinfo[1];
		label.dkl_vtoc.v_bootinfo[2] = vtoc->v_bootinfo[2];
		bcopy(vtoc->v_asciilabel, label.dkl_asciilabel,
		    LEN_DKL_ASCII);
		bcopy(vtoc->v_volume, label.dkl_vtoc.v_volume,
		    LEN_DKL_VVOL);

		/* re-compute checksum */
		label.dkl_magic = DKL_MAGIC;
		label.dkl_cksum = vd_lbl2cksum(&label);

		/* write label to the disk image */
		if ((rc = vd_file_set_vtoc(vd, &label)) != 0)
			return (rc);

		/* update the cached vdisk VTOC */
		bcopy(vtoc, &vd->vtoc, sizeof (vd->vtoc));

		/*
		 * The disk geometry may have changed, so we need to write
		 * the devid (if there is one) so that it is stored at the
		 * right location.
		 */
		if (vd->file_devid != NULL &&
		    vd_file_write_devid(vd, vd->file_devid) != 0) {
			PR0("Fail to write devid");
		}

		return (0);

	default:
		return (ENOTSUP);
	}
}

static int
vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl)
{
	int	rval = 0, status;
	size_t	nbytes = request->nbytes;	/* modifiable copy */


	ASSERT(request->slice < vd->nslices);
	PR0("Performing %s", ioctl->operation_name);

	/* Get data from client and convert, if necessary */
	if (ioctl->copyin != NULL) {
		ASSERT(nbytes != 0 && buf != NULL);
		PR1("Getting \"arg\" data from client");
		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
		    request->cookie, request->ncookies,
		    LDC_COPY_IN)) != 0) {
			PR0("ldc_mem_copy() returned errno %d "
			    "copying from client", status);
			return (status);
		}

		/* Convert client's data, if necessary */
		if (ioctl->copyin == VD_IDENTITY)	/* use client buffer */
			ioctl->arg = buf;
		else	/* convert client vdisk operation data to ioctl data */
			(ioctl->copyin)(buf, (void *)ioctl->arg);
	}

	/*
	 * Handle single-slice block devices internally; otherwise, have the
	 * real driver perform the ioctl()
	 */
	if (vd->file) {
		if ((status = vd_do_file_ioctl(vd, ioctl->cmd,
		    (void *)ioctl->arg)) != 0)
			return (status);
	} else if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) {
		if ((status = vd_do_slice_ioctl(vd, ioctl->cmd,
		    (void *)ioctl->arg)) != 0)
			return (status);
	} else if ((status = ldi_ioctl(vd->ldi_handle[request->slice],
	    ioctl->cmd, (intptr_t)ioctl->arg, (vd_open_flags | FKIOCTL),
	    kcred, &rval)) != 0) {
		PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status);
		return (status);
	}
#ifdef DEBUG
	if (rval != 0) {
		PR0("%s set rval = %d, which is not being returned to client",
		    ioctl->cmd_name, rval);
	}
#endif /* DEBUG */

	/* Convert data and send to client, if necessary */
	if (ioctl->copyout != NULL) {
		ASSERT(nbytes != 0 && buf != NULL);
		PR1("Sending \"arg\" data to client");

		/* Convert ioctl data to vdisk operation data, if necessary */
		if (ioctl->copyout != VD_IDENTITY)
			(ioctl->copyout)((void *)ioctl->arg, buf);

		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
		    request->cookie, request->ncookies,
		    LDC_COPY_OUT)) != 0) {
			PR0("ldc_mem_copy() returned errno %d "
			    "copying to client", status);
			return (status);
		}
	}

	return (status);
}

#define	RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t))
static int
vd_ioctl(vd_task_t *task)
{
	int			i, status;
	void			*buf = NULL;
	struct dk_geom		dk_geom = {0};
	struct vtoc		vtoc = {0};
	struct dk_efi		dk_efi = {0};
	vd_t			*vd = task->vd;
	vd_dring_payload_t	*request = task->request;
	vd_ioctl_t		ioctl[] = {
		/* Command (no-copy) operations */
		{VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0,
		    DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE),
		    NULL, NULL, NULL},

		/* "Get" (copy-out) operations */
		{VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int),
		    DKIOCGETWCE, STRINGIZE(DKIOCGETWCE),
		    NULL, VD_IDENTITY, VD_IDENTITY},
		{VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM),
		    RNDSIZE(vd_geom_t),
		    DKIOCGGEOM, STRINGIZE(DKIOCGGEOM),
		    &dk_geom, NULL, dk_geom2vd_geom},
		{VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t),
		    DKIOCGVTOC, STRINGIZE(DKIOCGVTOC),
		    &vtoc, NULL, vtoc2vd_vtoc},
		{VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t),
		    DKIOCGETEFI, STRINGIZE(DKIOCGETEFI),
		    &dk_efi, vd_get_efi_in, vd_get_efi_out},

		/* "Set" (copy-in) operations */
		{VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int),
		    DKIOCSETWCE, STRINGIZE(DKIOCSETWCE),
		    NULL, VD_IDENTITY, VD_IDENTITY},
		{VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM),
		    RNDSIZE(vd_geom_t),
		    DKIOCSGEOM, STRINGIZE(DKIOCSGEOM),
		    &dk_geom, vd_geom2dk_geom, NULL},
		{VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t),
		    DKIOCSVTOC, STRINGIZE(DKIOCSVTOC),
		    &vtoc, vd_vtoc2vtoc, NULL},
		{VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t),
		    DKIOCSETEFI, STRINGIZE(DKIOCSETEFI),
		    &dk_efi, vd_set_efi_in, vd_set_efi_out},
	};
	size_t		nioctls = (sizeof (ioctl))/(sizeof (ioctl[0]));


	ASSERT(vd != NULL);
	ASSERT(request != NULL);
	ASSERT(request->slice < vd->nslices);

	/*
	 * Determine ioctl corresponding to caller's "operation" and
	 * validate caller's "nbytes"
	 */
	for (i = 0; i < nioctls; i++) {
		if (request->operation == ioctl[i].operation) {
			/* LDC memory operations require 8-byte multiples */
			ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0);

			if (request->operation == VD_OP_GET_EFI ||
			    request->operation == VD_OP_SET_EFI) {
				if (request->nbytes >= ioctl[i].nbytes)
					break;
				PR0("%s: Expected at least nbytes = %lu, "
				    "got %lu", ioctl[i].operation_name,
				    ioctl[i].nbytes, request->nbytes);
				return (EINVAL);
			}

			if (request->nbytes != ioctl[i].nbytes) {
				PR0("%s: Expected nbytes = %lu, got %lu",
				    ioctl[i].operation_name, ioctl[i].nbytes,
				    request->nbytes);
				return (EINVAL);
			}

			break;
		}
	}
	ASSERT(i < nioctls);	/* because "operation" already validated */

	if (request->nbytes)
		buf = kmem_zalloc(request->nbytes, KM_SLEEP);
	status = vd_do_ioctl(vd, request, buf, &ioctl[i]);
	if (request->nbytes)
		kmem_free(buf, request->nbytes);

	PR0("Returning %d", status);
	return (status);
}

static int
vd_get_devid(vd_task_t *task)
{
	vd_t *vd = task->vd;
	vd_dring_payload_t *request = task->request;
	vd_devid_t *vd_devid;
	impl_devid_t *devid;
	int status, bufid_len, devid_len, len, sz;
	int bufbytes;

	PR1("Get Device ID, nbytes=%ld", request->nbytes);

	if (vd->file) {
		if (vd->file_devid == NULL) {
			PR2("No Device ID");
			return (ENOENT);
		} else {
			sz = ddi_devid_sizeof(vd->file_devid);
			devid = kmem_alloc(sz, KM_SLEEP);
			bcopy(vd->file_devid, devid, sz);
		}
	} else {
		if (ddi_lyr_get_devid(vd->dev[request->slice],
		    (ddi_devid_t *)&devid) != DDI_SUCCESS) {
			PR2("No Device ID");
			return (ENOENT);
		}
	}

	bufid_len = request->nbytes - sizeof (vd_devid_t) + 1;
	devid_len = DEVID_GETLEN(devid);

	/*
	 * Save the buffer size here for use in deallocation.
	 * The actual number of bytes copied is returned in
	 * the 'nbytes' field of the request structure.
	 */
	bufbytes = request->nbytes;

	vd_devid = kmem_zalloc(bufbytes, KM_SLEEP);
	vd_devid->length = devid_len;
	vd_devid->type = DEVID_GETTYPE(devid);

	len = (devid_len > bufid_len)? bufid_len : devid_len;

	bcopy(devid->did_id, vd_devid->id, len);

	/* LDC memory operations require 8-byte multiples */
	ASSERT(request->nbytes % sizeof (uint64_t) == 0);

	if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0,
	    &request->nbytes, request->cookie, request->ncookies,
	    LDC_COPY_OUT)) != 0) {
		PR0("ldc_mem_copy() returned errno %d copying to client",
		    status);
	}
	PR1("post mem_copy: nbytes=%ld", request->nbytes);

	kmem_free(vd_devid, bufbytes);
	ddi_devid_free((ddi_devid_t)devid);

	return (status);
}

/*
 * Define the supported operations once the functions for performing them have
 * been defined
 */
static const vds_operation_t	vds_operation[] = {
#define	X(_s)	#_s, _s
	{X(VD_OP_BREAD),	vd_start_bio,	vd_complete_bio},
	{X(VD_OP_BWRITE),	vd_start_bio,	vd_complete_bio},
	{X(VD_OP_FLUSH),	vd_ioctl,	NULL},
	{X(VD_OP_GET_WCE),	vd_ioctl,	NULL},
	{X(VD_OP_SET_WCE),	vd_ioctl,	NULL},
	{X(VD_OP_GET_VTOC),	vd_ioctl,	NULL},
	{X(VD_OP_SET_VTOC),	vd_ioctl,	NULL},
	{X(VD_OP_GET_DISKGEOM),	vd_ioctl,	NULL},
	{X(VD_OP_SET_DISKGEOM),	vd_ioctl,	NULL},
	{X(VD_OP_GET_EFI),	vd_ioctl,	NULL},
	{X(VD_OP_SET_EFI),	vd_ioctl,	NULL},
	{X(VD_OP_GET_DEVID),	vd_get_devid,	NULL},
#undef	X
};

static const size_t	vds_noperations =
    (sizeof (vds_operation))/(sizeof (vds_operation[0]));

/*
 * Process a task specifying a client I/O request
 */
static int
vd_process_task(vd_task_t *task)
{
	int			i, status;
	vd_t			*vd = task->vd;
	vd_dring_payload_t	*request = task->request;


	ASSERT(vd != NULL);
	ASSERT(request != NULL);

	/* Find the requested operation */
	for (i = 0; i < vds_noperations; i++)
		if (request->operation == vds_operation[i].operation)
			break;
	if (i == vds_noperations) {
		PR0("Unsupported operation %u", request->operation);
		return (ENOTSUP);
	}

	/* Range-check slice */
	if (request->slice >= vd->nslices &&
	    (vd->vdisk_type != VD_DISK_TYPE_DISK ||
	    request->slice != VD_SLICE_NONE)) {
		PR0("Invalid \"slice\" %u (max %u) for virtual disk",
		    request->slice, (vd->nslices - 1));
		return (EINVAL);
	}

	PR1("operation : %s", vds_operation[i].namep);

	/* Start the operation */
	if ((status = vds_operation[i].start(task)) != EINPROGRESS) {
		PR0("operation : %s returned status %d",
		    vds_operation[i].namep, status);
		request->status = status;	/* op succeeded or failed */
		return (0);	/* but request completed */
	}

	ASSERT(vds_operation[i].complete != NULL);	/* debug case */
	if (vds_operation[i].complete == NULL) {	/* non-debug case */
		PR0("Unexpected return of EINPROGRESS "
		    "with no I/O completion handler");
		request->status = EIO;	/* operation failed */
		return (0);	/* but request completed */
	}

	PR1("operation : kick off taskq entry for %s", vds_operation[i].namep);

	/* Queue a task to complete the operation */
	status = ddi_taskq_dispatch(vd->completionq, vds_operation[i].complete,
	    task, DDI_SLEEP);
	/* ddi_taskq_dispatch(9f) guarantees success with DDI_SLEEP */
	ASSERT(status == DDI_SUCCESS);

	PR1("Operation in progress");
	return (EINPROGRESS);	/* completion handler will finish request */
}

/*
 * Return true if the "type", "subtype", and "env" fields of the "tag" first
 * argument match the corresponding remaining arguments; otherwise, return false
 */
boolean_t
vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env)
{
	return ((tag->vio_msgtype == type) &&
	    (tag->vio_subtype == subtype) &&
	    (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE;
}

/*
 * Check whether the major/minor version specified in "ver_msg" is supported
 * by this server.
 */
static boolean_t
vds_supported_version(vio_ver_msg_t *ver_msg)
{
	for (int i = 0; i < vds_num_versions; i++) {
		ASSERT(vds_version[i].major > 0);
		ASSERT((i == 0) ||
		    (vds_version[i].major < vds_version[i-1].major));

		/*
		 * If the major versions match, adjust the minor version, if
		 * necessary, down to the highest value supported by this
		 * server and return true so this message will get "ack"ed;
		 * the client should also support all minor versions lower
		 * than the value it sent
		 */
		if (ver_msg->ver_major == vds_version[i].major) {
			if (ver_msg->ver_minor > vds_version[i].minor) {
				PR0("Adjusting minor version from %u to %u",
				    ver_msg->ver_minor, vds_version[i].minor);
				ver_msg->ver_minor = vds_version[i].minor;
			}
			return (B_TRUE);
		}

		/*
		 * If the message contains a higher major version number, set
		 * the message's major/minor versions to the current values
		 * and return false, so this message will get "nack"ed with
		 * these values, and the client will potentially try again
		 * with the same or a lower version
		 */
		if (ver_msg->ver_major > vds_version[i].major) {
			ver_msg->ver_major = vds_version[i].major;
			ver_msg->ver_minor = vds_version[i].minor;
			return (B_FALSE);
		}

		/*
		 * Otherwise, the message's major version is less than the
		 * current major version, so continue the loop to the next
		 * (lower) supported version
		 */
	}

	/*
	 * No common version was found; "ground" the version pair in the
	 * message to terminate negotiation
	 */
	ver_msg->ver_major = 0;
	ver_msg->ver_minor = 0;
	return (B_FALSE);
}

/*
 * Process a version message from a client.  vds expects to receive version
 * messages from clients seeking service, but never issues version messages
 * itself; therefore, vds can ACK or NACK client version messages, but does
 * not expect to receive version-message ACKs or NACKs (and will treat such
 * messages as invalid).
2014 */ 2015 static int 2016 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2017 { 2018 vio_ver_msg_t *ver_msg = (vio_ver_msg_t *)msg; 2019 2020 2021 ASSERT(msglen >= sizeof (msg->tag)); 2022 2023 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2024 VIO_VER_INFO)) { 2025 return (ENOMSG); /* not a version message */ 2026 } 2027 2028 if (msglen != sizeof (*ver_msg)) { 2029 PR0("Expected %lu-byte version message; " 2030 "received %lu bytes", sizeof (*ver_msg), msglen); 2031 return (EBADMSG); 2032 } 2033 2034 if (ver_msg->dev_class != VDEV_DISK) { 2035 PR0("Expected device class %u (disk); received %u", 2036 VDEV_DISK, ver_msg->dev_class); 2037 return (EBADMSG); 2038 } 2039 2040 /* 2041 * We're talking to the expected kind of client; set our device class 2042 * for "ack/nack" back to the client 2043 */ 2044 ver_msg->dev_class = VDEV_DISK_SERVER; 2045 2046 /* 2047 * Check whether the (valid) version message specifies a version 2048 * supported by this server. If the version is not supported, return 2049 * EBADMSG so the message will get "nack"ed; vds_supported_version() 2050 * will have updated the message with a supported version for the 2051 * client to consider 2052 */ 2053 if (!vds_supported_version(ver_msg)) 2054 return (EBADMSG); 2055 2056 2057 /* 2058 * A version has been agreed upon; use the client's SID for 2059 * communication on this channel now 2060 */ 2061 ASSERT(!(vd->initialized & VD_SID)); 2062 vd->sid = ver_msg->tag.vio_sid; 2063 vd->initialized |= VD_SID; 2064 2065 /* 2066 * When multiple versions are supported, this function should store 2067 * the negotiated major and minor version values in the "vd" data 2068 * structure to govern further communication; in particular, note that 2069 * the client might have specified a lower minor version for the 2070 * agreed major version than specifed in the vds_version[] array. The 2071 * following assertions should help remind future maintainers to make 2072 * the appropriate changes to support multiple versions. 2073 */ 2074 ASSERT(vds_num_versions == 1); 2075 ASSERT(ver_msg->ver_major == vds_version[0].major); 2076 ASSERT(ver_msg->ver_minor == vds_version[0].minor); 2077 2078 PR0("Using major version %u, minor version %u", 2079 ver_msg->ver_major, ver_msg->ver_minor); 2080 return (0); 2081 } 2082 2083 static int 2084 vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2085 { 2086 vd_attr_msg_t *attr_msg = (vd_attr_msg_t *)msg; 2087 int status, retry = 0; 2088 2089 2090 ASSERT(msglen >= sizeof (msg->tag)); 2091 2092 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2093 VIO_ATTR_INFO)) { 2094 PR0("Message is not an attribute message"); 2095 return (ENOMSG); 2096 } 2097 2098 if (msglen != sizeof (*attr_msg)) { 2099 PR0("Expected %lu-byte attribute message; " 2100 "received %lu bytes", sizeof (*attr_msg), msglen); 2101 return (EBADMSG); 2102 } 2103 2104 if (attr_msg->max_xfer_sz == 0) { 2105 PR0("Received maximum transfer size of 0 from client"); 2106 return (EBADMSG); 2107 } 2108 2109 if ((attr_msg->xfer_mode != VIO_DESC_MODE) && 2110 (attr_msg->xfer_mode != VIO_DRING_MODE)) { 2111 PR0("Client requested unsupported transfer mode"); 2112 return (EBADMSG); 2113 } 2114 2115 /* 2116 * check if the underlying disk is ready, if not try accessing 2117 * the device again. 
Open the vdisk device and extract info 2118 * about it, as this is needed to respond to the attr info msg 2119 */ 2120 if ((vd->initialized & VD_DISK_READY) == 0) { 2121 PR0("Retry setting up disk (%s)", vd->device_path); 2122 do { 2123 status = vd_setup_vd(vd); 2124 if (status != EAGAIN || ++retry > vds_dev_retries) 2125 break; 2126 2127 /* incremental delay */ 2128 delay(drv_usectohz(vds_dev_delay)); 2129 2130 /* if vdisk is no longer enabled - return error */ 2131 if (!vd_enabled(vd)) 2132 return (ENXIO); 2133 2134 } while (status == EAGAIN); 2135 2136 if (status) 2137 return (ENXIO); 2138 2139 vd->initialized |= VD_DISK_READY; 2140 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); 2141 PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u", 2142 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), 2143 (vd->pseudo ? "yes" : "no"), 2144 (vd->file ? "yes" : "no"), 2145 vd->nslices); 2146 } 2147 2148 /* Success: valid message and transfer mode */ 2149 vd->xfer_mode = attr_msg->xfer_mode; 2150 2151 if (vd->xfer_mode == VIO_DESC_MODE) { 2152 2153 /* 2154 * The vd_dring_inband_msg_t contains one cookie; need room 2155 * for up to n-1 more cookies, where "n" is the number of full 2156 * pages plus possibly one partial page required to cover 2157 * "max_xfer_sz". Add room for one more cookie if 2158 * "max_xfer_sz" isn't an integral multiple of the page size. 2159 * Must first get the maximum transfer size in bytes. 2160 */ 2161 size_t max_xfer_bytes = attr_msg->vdisk_block_size ? 2162 attr_msg->vdisk_block_size*attr_msg->max_xfer_sz : 2163 attr_msg->max_xfer_sz; 2164 size_t max_inband_msglen = 2165 sizeof (vd_dring_inband_msg_t) + 2166 ((max_xfer_bytes/PAGESIZE + 2167 ((max_xfer_bytes % PAGESIZE) ? 1 : 0))* 2168 (sizeof (ldc_mem_cookie_t))); 2169 2170 /* 2171 * Set the maximum expected message length to 2172 * accommodate in-band-descriptor messages with all 2173 * their cookies 2174 */ 2175 vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen); 2176 2177 /* 2178 * Initialize the data structure for processing in-band I/O 2179 * request descriptors 2180 */ 2181 vd->inband_task.vd = vd; 2182 vd->inband_task.msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 2183 vd->inband_task.index = 0; 2184 vd->inband_task.type = VD_FINAL_RANGE_TASK; /* range == 1 */ 2185 } 2186 2187 /* Return the device's block size and max transfer size to the client */ 2188 attr_msg->vdisk_block_size = DEV_BSIZE; 2189 attr_msg->max_xfer_sz = vd->max_xfer_sz; 2190 2191 attr_msg->vdisk_size = vd->vdisk_size; 2192 attr_msg->vdisk_type = vd->vdisk_type; 2193 attr_msg->operations = vds_operations; 2194 PR0("%s", VD_CLIENT(vd)); 2195 2196 ASSERT(vd->dring_task == NULL); 2197 2198 return (0); 2199 } 2200 2201 static int 2202 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2203 { 2204 int status; 2205 size_t expected; 2206 ldc_mem_info_t dring_minfo; 2207 vio_dring_reg_msg_t *reg_msg = (vio_dring_reg_msg_t *)msg; 2208 2209 2210 ASSERT(msglen >= sizeof (msg->tag)); 2211 2212 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2213 VIO_DRING_REG)) { 2214 PR0("Message is not a register-dring message"); 2215 return (ENOMSG); 2216 } 2217 2218 if (msglen < sizeof (*reg_msg)) { 2219 PR0("Expected at least %lu-byte register-dring message; " 2220 "received %lu bytes", sizeof (*reg_msg), msglen); 2221 return (EBADMSG); 2222 } 2223 2224 expected = sizeof (*reg_msg) + 2225 (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0])); 2226 if (msglen != expected) { 2227 PR0("Expected %lu-byte register-dring 
message; " 2228 "received %lu bytes", expected, msglen); 2229 return (EBADMSG); 2230 } 2231 2232 if (vd->initialized & VD_DRING) { 2233 PR0("A dring was previously registered; only support one"); 2234 return (EBADMSG); 2235 } 2236 2237 if (reg_msg->num_descriptors > INT32_MAX) { 2238 PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)", 2239 reg_msg->ncookies, INT32_MAX, STRINGIZE(INT32_MAX)); 2240 return (EBADMSG); 2241 } 2242 2243 if (reg_msg->ncookies != 1) { 2244 /* 2245 * In addition to fixing the assertion in the success case 2246 * below, supporting drings which require more than one 2247 * "cookie" requires increasing the value of vd->max_msglen 2248 * somewhere in the code path prior to receiving the message 2249 * which results in calling this function. Note that without 2250 * making this change, the larger message size required to 2251 * accommodate multiple cookies cannot be successfully 2252 * received, so this function will not even get called. 2253 * Gracefully accommodating more dring cookies might 2254 * reasonably demand exchanging an additional attribute or 2255 * making a minor protocol adjustment 2256 */ 2257 PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies); 2258 return (EBADMSG); 2259 } 2260 2261 status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie, 2262 reg_msg->ncookies, reg_msg->num_descriptors, 2263 reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle); 2264 if (status != 0) { 2265 PR0("ldc_mem_dring_map() returned errno %d", status); 2266 return (status); 2267 } 2268 2269 /* 2270 * To remove the need for this assertion, must call 2271 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a 2272 * successful call to ldc_mem_dring_map() 2273 */ 2274 ASSERT(reg_msg->ncookies == 1); 2275 2276 if ((status = 2277 ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) { 2278 PR0("ldc_mem_dring_info() returned errno %d", status); 2279 if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0) 2280 PR0("ldc_mem_dring_unmap() returned errno %d", status); 2281 return (status); 2282 } 2283 2284 if (dring_minfo.vaddr == NULL) { 2285 PR0("Descriptor ring virtual address is NULL"); 2286 return (ENXIO); 2287 } 2288 2289 2290 /* Initialize for valid message and mapped dring */ 2291 PR1("descriptor size = %u, dring length = %u", 2292 vd->descriptor_size, vd->dring_len); 2293 vd->initialized |= VD_DRING; 2294 vd->dring_ident = 1; /* "There Can Be Only One" */ 2295 vd->dring = dring_minfo.vaddr; 2296 vd->descriptor_size = reg_msg->descriptor_size; 2297 vd->dring_len = reg_msg->num_descriptors; 2298 reg_msg->dring_ident = vd->dring_ident; 2299 2300 /* 2301 * Allocate and initialize a "shadow" array of data structures for 2302 * tasks to process I/O requests in dring elements 2303 */ 2304 vd->dring_task = 2305 kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP); 2306 for (int i = 0; i < vd->dring_len; i++) { 2307 vd->dring_task[i].vd = vd; 2308 vd->dring_task[i].index = i; 2309 vd->dring_task[i].request = &VD_DRING_ELEM(i)->payload; 2310 2311 status = ldc_mem_alloc_handle(vd->ldc_handle, 2312 &(vd->dring_task[i].mhdl)); 2313 if (status) { 2314 PR0("ldc_mem_alloc_handle() returned err %d ", status); 2315 return (ENXIO); 2316 } 2317 2318 vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 2319 } 2320 2321 return (0); 2322 } 2323 2324 static int 2325 vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2326 { 2327 vio_dring_unreg_msg_t *unreg_msg = (vio_dring_unreg_msg_t *)msg; 2328 2329 2330 ASSERT(msglen >= 
sizeof (msg->tag)); 2331 2332 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2333 VIO_DRING_UNREG)) { 2334 PR0("Message is not an unregister-dring message"); 2335 return (ENOMSG); 2336 } 2337 2338 if (msglen != sizeof (*unreg_msg)) { 2339 PR0("Expected %lu-byte unregister-dring message; " 2340 "received %lu bytes", sizeof (*unreg_msg), msglen); 2341 return (EBADMSG); 2342 } 2343 2344 if (unreg_msg->dring_ident != vd->dring_ident) { 2345 PR0("Expected dring ident %lu; received %lu", 2346 vd->dring_ident, unreg_msg->dring_ident); 2347 return (EBADMSG); 2348 } 2349 2350 return (0); 2351 } 2352 2353 static int 2354 process_rdx_msg(vio_msg_t *msg, size_t msglen) 2355 { 2356 ASSERT(msglen >= sizeof (msg->tag)); 2357 2358 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) { 2359 PR0("Message is not an RDX message"); 2360 return (ENOMSG); 2361 } 2362 2363 if (msglen != sizeof (vio_rdx_msg_t)) { 2364 PR0("Expected %lu-byte RDX message; received %lu bytes", 2365 sizeof (vio_rdx_msg_t), msglen); 2366 return (EBADMSG); 2367 } 2368 2369 PR0("Valid RDX message"); 2370 return (0); 2371 } 2372 2373 static int 2374 vd_check_seq_num(vd_t *vd, uint64_t seq_num) 2375 { 2376 if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) { 2377 PR0("Received seq_num %lu; expected %lu", 2378 seq_num, (vd->seq_num + 1)); 2379 PR0("initiating soft reset"); 2380 vd_need_reset(vd, B_FALSE); 2381 return (1); 2382 } 2383 2384 vd->seq_num = seq_num; 2385 vd->initialized |= VD_SEQ_NUM; /* superfluous after first time... */ 2386 return (0); 2387 } 2388 2389 /* 2390 * Return the expected size of an inband-descriptor message with all the 2391 * cookies it claims to include 2392 */ 2393 static size_t 2394 expected_inband_size(vd_dring_inband_msg_t *msg) 2395 { 2396 return ((sizeof (*msg)) + 2397 (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0]))); 2398 } 2399 2400 /* 2401 * Process an in-band descriptor message: used with clients like OBP, with 2402 * which vds exchanges descriptors within VIO message payloads, rather than 2403 * operating on them within a descriptor ring 2404 */ 2405 static int 2406 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2407 { 2408 size_t expected; 2409 vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg; 2410 2411 2412 ASSERT(msglen >= sizeof (msg->tag)); 2413 2414 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 2415 VIO_DESC_DATA)) { 2416 PR1("Message is not an in-band-descriptor message"); 2417 return (ENOMSG); 2418 } 2419 2420 if (msglen < sizeof (*desc_msg)) { 2421 PR0("Expected at least %lu-byte descriptor message; " 2422 "received %lu bytes", sizeof (*desc_msg), msglen); 2423 return (EBADMSG); 2424 } 2425 2426 if (msglen != (expected = expected_inband_size(desc_msg))) { 2427 PR0("Expected %lu-byte descriptor message; " 2428 "received %lu bytes", expected, msglen); 2429 return (EBADMSG); 2430 } 2431 2432 if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) 2433 return (EBADMSG); 2434 2435 /* 2436 * Valid message: Set up the in-band descriptor task and process the 2437 * request. 
Arrange to acknowledge the client's message, unless an 2438 * error processing the descriptor task results in setting 2439 * VIO_SUBTYPE_NACK 2440 */ 2441 PR1("Valid in-band-descriptor message"); 2442 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 2443 2444 ASSERT(vd->inband_task.msg != NULL); 2445 2446 bcopy(msg, vd->inband_task.msg, msglen); 2447 vd->inband_task.msglen = msglen; 2448 2449 /* 2450 * The task request is now the payload of the message 2451 * that was just copied into the body of the task. 2452 */ 2453 desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg; 2454 vd->inband_task.request = &desc_msg->payload; 2455 2456 return (vd_process_task(&vd->inband_task)); 2457 } 2458 2459 static int 2460 vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx, 2461 vio_msg_t *msg, size_t msglen) 2462 { 2463 int status; 2464 boolean_t ready; 2465 vd_dring_entry_t *elem = VD_DRING_ELEM(idx); 2466 2467 2468 /* Accept the updated dring element */ 2469 if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { 2470 PR0("ldc_mem_dring_acquire() returned errno %d", status); 2471 return (status); 2472 } 2473 ready = (elem->hdr.dstate == VIO_DESC_READY); 2474 if (ready) { 2475 elem->hdr.dstate = VIO_DESC_ACCEPTED; 2476 } else { 2477 PR0("descriptor %u not ready", idx); 2478 VD_DUMP_DRING_ELEM(elem); 2479 } 2480 if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 2481 PR0("ldc_mem_dring_release() returned errno %d", status); 2482 return (status); 2483 } 2484 if (!ready) 2485 return (EBUSY); 2486 2487 2488 /* Initialize a task and process the accepted element */ 2489 PR1("Processing dring element %u", idx); 2490 vd->dring_task[idx].type = type; 2491 2492 /* duplicate msg buf for cookies etc. */ 2493 bcopy(msg, vd->dring_task[idx].msg, msglen); 2494 2495 vd->dring_task[idx].msglen = msglen; 2496 if ((status = vd_process_task(&vd->dring_task[idx])) != EINPROGRESS) 2497 status = vd_mark_elem_done(vd, idx, 2498 vd->dring_task[idx].request->status, 2499 vd->dring_task[idx].request->nbytes); 2500 2501 return (status); 2502 } 2503 2504 static int 2505 vd_process_element_range(vd_t *vd, int start, int end, 2506 vio_msg_t *msg, size_t msglen) 2507 { 2508 int i, n, nelem, status = 0; 2509 boolean_t inprogress = B_FALSE; 2510 vd_task_type_t type; 2511 2512 2513 ASSERT(start >= 0); 2514 ASSERT(end >= 0); 2515 2516 /* 2517 * Arrange to acknowledge the client's message, unless an error 2518 * processing one of the dring elements results in setting 2519 * VIO_SUBTYPE_NACK 2520 */ 2521 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 2522 2523 /* 2524 * Process the dring elements in the range 2525 */ 2526 nelem = ((end < start) ? end + vd->dring_len : end) - start + 1; 2527 for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) { 2528 ((vio_dring_msg_t *)msg)->end_idx = i; 2529 type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK; 2530 status = vd_process_element(vd, type, i, msg, msglen); 2531 if (status == EINPROGRESS) 2532 inprogress = B_TRUE; 2533 else if (status != 0) 2534 break; 2535 } 2536 2537 /* 2538 * If some, but not all, operations of a multi-element range are in 2539 * progress, wait for other operations to complete before returning 2540 * (which will result in "ack" or "nack" of the message). Note that 2541 * all outstanding operations will need to complete, not just the ones 2542 * corresponding to the current range of dring elements; howevever, as 2543 * this situation is an error case, performance is less critical. 
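	 *
	 * Illustrative scenario only: with a two-element range in which the
	 * first descriptor's I/O goes asynchronous (EINPROGRESS) and the
	 * second descriptor turns out not to be READY (EBUSY), the loop
	 * above breaks with the error and the ddi_taskq_wait() below drains
	 * the completion queue, so the still-outstanding first operation
	 * finishes before the message is nacked.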
2544 */ 2545 if ((nelem > 1) && (status != EINPROGRESS) && inprogress) 2546 ddi_taskq_wait(vd->completionq); 2547 2548 return (status); 2549 } 2550 2551 static int 2552 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2553 { 2554 vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg; 2555 2556 2557 ASSERT(msglen >= sizeof (msg->tag)); 2558 2559 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 2560 VIO_DRING_DATA)) { 2561 PR1("Message is not a dring-data message"); 2562 return (ENOMSG); 2563 } 2564 2565 if (msglen != sizeof (*dring_msg)) { 2566 PR0("Expected %lu-byte dring message; received %lu bytes", 2567 sizeof (*dring_msg), msglen); 2568 return (EBADMSG); 2569 } 2570 2571 if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) 2572 return (EBADMSG); 2573 2574 if (dring_msg->dring_ident != vd->dring_ident) { 2575 PR0("Expected dring ident %lu; received ident %lu", 2576 vd->dring_ident, dring_msg->dring_ident); 2577 return (EBADMSG); 2578 } 2579 2580 if (dring_msg->start_idx >= vd->dring_len) { 2581 PR0("\"start_idx\" = %u; must be less than %u", 2582 dring_msg->start_idx, vd->dring_len); 2583 return (EBADMSG); 2584 } 2585 2586 if ((dring_msg->end_idx < 0) || 2587 (dring_msg->end_idx >= vd->dring_len)) { 2588 PR0("\"end_idx\" = %u; must be >= 0 and less than %u", 2589 dring_msg->end_idx, vd->dring_len); 2590 return (EBADMSG); 2591 } 2592 2593 /* Valid message; process range of updated dring elements */ 2594 PR1("Processing descriptor range, start = %u, end = %u", 2595 dring_msg->start_idx, dring_msg->end_idx); 2596 return (vd_process_element_range(vd, dring_msg->start_idx, 2597 dring_msg->end_idx, msg, msglen)); 2598 } 2599 2600 static int 2601 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes) 2602 { 2603 int retry, status; 2604 size_t size = *nbytes; 2605 2606 2607 for (retry = 0, status = ETIMEDOUT; 2608 retry < vds_ldc_retries && status == ETIMEDOUT; 2609 retry++) { 2610 PR1("ldc_read() attempt %d", (retry + 1)); 2611 *nbytes = size; 2612 status = ldc_read(ldc_handle, msg, nbytes); 2613 } 2614 2615 if (status) { 2616 PR0("ldc_read() returned errno %d", status); 2617 if (status != ECONNRESET) 2618 return (ENOMSG); 2619 return (status); 2620 } else if (*nbytes == 0) { 2621 PR1("ldc_read() returned 0 and no message read"); 2622 return (ENOMSG); 2623 } 2624 2625 PR1("RCVD %lu-byte message", *nbytes); 2626 return (0); 2627 } 2628 2629 static int 2630 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2631 { 2632 int status; 2633 2634 2635 PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype, 2636 msg->tag.vio_subtype, msg->tag.vio_subtype_env); 2637 #ifdef DEBUG 2638 vd_decode_tag(msg); 2639 #endif 2640 2641 /* 2642 * Validate session ID up front, since it applies to all messages 2643 * once set 2644 */ 2645 if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) { 2646 PR0("Expected SID %u, received %u", vd->sid, 2647 msg->tag.vio_sid); 2648 return (EBADMSG); 2649 } 2650 2651 PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state)); 2652 2653 /* 2654 * Process the received message based on connection state 2655 */ 2656 switch (vd->state) { 2657 case VD_STATE_INIT: /* expect version message */ 2658 if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0) 2659 return (status); 2660 2661 /* Version negotiated, move to that state */ 2662 vd->state = VD_STATE_VER; 2663 return (0); 2664 2665 case VD_STATE_VER: /* expect attribute message */ 2666 if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0) 2667 return (status); 2668 
2669 /* Attributes exchanged, move to that state */ 2670 vd->state = VD_STATE_ATTR; 2671 return (0); 2672 2673 case VD_STATE_ATTR: 2674 switch (vd->xfer_mode) { 2675 case VIO_DESC_MODE: /* expect RDX message */ 2676 if ((status = process_rdx_msg(msg, msglen)) != 0) 2677 return (status); 2678 2679 /* Ready to receive in-band descriptors */ 2680 vd->state = VD_STATE_DATA; 2681 return (0); 2682 2683 case VIO_DRING_MODE: /* expect register-dring message */ 2684 if ((status = 2685 vd_process_dring_reg_msg(vd, msg, msglen)) != 0) 2686 return (status); 2687 2688 /* One dring negotiated, move to that state */ 2689 vd->state = VD_STATE_DRING; 2690 return (0); 2691 2692 default: 2693 ASSERT("Unsupported transfer mode"); 2694 PR0("Unsupported transfer mode"); 2695 return (ENOTSUP); 2696 } 2697 2698 case VD_STATE_DRING: /* expect RDX, register-dring, or unreg-dring */ 2699 if ((status = process_rdx_msg(msg, msglen)) == 0) { 2700 /* Ready to receive data */ 2701 vd->state = VD_STATE_DATA; 2702 return (0); 2703 } else if (status != ENOMSG) { 2704 return (status); 2705 } 2706 2707 2708 /* 2709 * If another register-dring message is received, stay in 2710 * dring state in case the client sends RDX; although the 2711 * protocol allows multiple drings, this server does not 2712 * support using more than one 2713 */ 2714 if ((status = 2715 vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG) 2716 return (status); 2717 2718 /* 2719 * Acknowledge an unregister-dring message, but reset the 2720 * connection anyway: Although the protocol allows 2721 * unregistering drings, this server cannot serve a vdisk 2722 * without its only dring 2723 */ 2724 status = vd_process_dring_unreg_msg(vd, msg, msglen); 2725 return ((status == 0) ? ENOTSUP : status); 2726 2727 case VD_STATE_DATA: 2728 switch (vd->xfer_mode) { 2729 case VIO_DESC_MODE: /* expect in-band-descriptor message */ 2730 return (vd_process_desc_msg(vd, msg, msglen)); 2731 2732 case VIO_DRING_MODE: /* expect dring-data or unreg-dring */ 2733 /* 2734 * Typically expect dring-data messages, so handle 2735 * them first 2736 */ 2737 if ((status = vd_process_dring_msg(vd, msg, 2738 msglen)) != ENOMSG) 2739 return (status); 2740 2741 /* 2742 * Acknowledge an unregister-dring message, but reset 2743 * the connection anyway: Although the protocol 2744 * allows unregistering drings, this server cannot 2745 * serve a vdisk without its only dring 2746 */ 2747 status = vd_process_dring_unreg_msg(vd, msg, msglen); 2748 return ((status == 0) ? 
ENOTSUP : status); 2749 2750 default: 2751 ASSERT("Unsupported transfer mode"); 2752 PR0("Unsupported transfer mode"); 2753 return (ENOTSUP); 2754 } 2755 2756 default: 2757 ASSERT("Invalid client connection state"); 2758 PR0("Invalid client connection state"); 2759 return (ENOTSUP); 2760 } 2761 } 2762 2763 static int 2764 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2765 { 2766 int status; 2767 boolean_t reset_ldc = B_FALSE; 2768 2769 2770 /* 2771 * Check that the message is at least big enough for a "tag", so that 2772 * message processing can proceed based on tag-specified message type 2773 */ 2774 if (msglen < sizeof (vio_msg_tag_t)) { 2775 PR0("Received short (%lu-byte) message", msglen); 2776 /* Can't "nack" short message, so drop the big hammer */ 2777 PR0("initiating full reset"); 2778 vd_need_reset(vd, B_TRUE); 2779 return (EBADMSG); 2780 } 2781 2782 /* 2783 * Process the message 2784 */ 2785 switch (status = vd_do_process_msg(vd, msg, msglen)) { 2786 case 0: 2787 /* "ack" valid, successfully-processed messages */ 2788 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 2789 break; 2790 2791 case EINPROGRESS: 2792 /* The completion handler will "ack" or "nack" the message */ 2793 return (EINPROGRESS); 2794 case ENOMSG: 2795 PR0("Received unexpected message"); 2796 _NOTE(FALLTHROUGH); 2797 case EBADMSG: 2798 case ENOTSUP: 2799 /* "nack" invalid messages */ 2800 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 2801 break; 2802 2803 default: 2804 /* "nack" failed messages */ 2805 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 2806 /* An LDC error probably occurred, so try resetting it */ 2807 reset_ldc = B_TRUE; 2808 break; 2809 } 2810 2811 PR1("\tResulting in state %d (%s)", vd->state, 2812 vd_decode_state(vd->state)); 2813 2814 /* Send the "ack" or "nack" to the client */ 2815 PR1("Sending %s", 2816 (msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK"); 2817 if (send_msg(vd->ldc_handle, msg, msglen) != 0) 2818 reset_ldc = B_TRUE; 2819 2820 /* Arrange to reset the connection for nack'ed or failed messages */ 2821 if ((status != 0) || reset_ldc) { 2822 PR0("initiating %s reset", 2823 (reset_ldc) ? 
"full" : "soft"); 2824 vd_need_reset(vd, reset_ldc); 2825 } 2826 2827 return (status); 2828 } 2829 2830 static boolean_t 2831 vd_enabled(vd_t *vd) 2832 { 2833 boolean_t enabled; 2834 2835 2836 mutex_enter(&vd->lock); 2837 enabled = vd->enabled; 2838 mutex_exit(&vd->lock); 2839 return (enabled); 2840 } 2841 2842 static void 2843 vd_recv_msg(void *arg) 2844 { 2845 vd_t *vd = (vd_t *)arg; 2846 int rv = 0, status = 0; 2847 2848 ASSERT(vd != NULL); 2849 2850 PR2("New task to receive incoming message(s)"); 2851 2852 2853 while (vd_enabled(vd) && status == 0) { 2854 size_t msglen, msgsize; 2855 ldc_status_t lstatus; 2856 2857 /* 2858 * Receive and process a message 2859 */ 2860 vd_reset_if_needed(vd); /* can change vd->max_msglen */ 2861 2862 /* 2863 * check if channel is UP - else break out of loop 2864 */ 2865 status = ldc_status(vd->ldc_handle, &lstatus); 2866 if (lstatus != LDC_UP) { 2867 PR0("channel not up (status=%d), exiting recv loop\n", 2868 lstatus); 2869 break; 2870 } 2871 2872 ASSERT(vd->max_msglen != 0); 2873 2874 msgsize = vd->max_msglen; /* stable copy for alloc/free */ 2875 msglen = msgsize; /* actual len after recv_msg() */ 2876 2877 status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen); 2878 switch (status) { 2879 case 0: 2880 rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp, 2881 msglen); 2882 /* check if max_msglen changed */ 2883 if (msgsize != vd->max_msglen) { 2884 PR0("max_msglen changed 0x%lx to 0x%lx bytes\n", 2885 msgsize, vd->max_msglen); 2886 kmem_free(vd->vio_msgp, msgsize); 2887 vd->vio_msgp = 2888 kmem_alloc(vd->max_msglen, KM_SLEEP); 2889 } 2890 if (rv == EINPROGRESS) 2891 continue; 2892 break; 2893 2894 case ENOMSG: 2895 break; 2896 2897 case ECONNRESET: 2898 PR0("initiating soft reset (ECONNRESET)\n"); 2899 vd_need_reset(vd, B_FALSE); 2900 status = 0; 2901 break; 2902 2903 default: 2904 /* Probably an LDC failure; arrange to reset it */ 2905 PR0("initiating full reset (status=0x%x)", status); 2906 vd_need_reset(vd, B_TRUE); 2907 break; 2908 } 2909 } 2910 2911 PR2("Task finished"); 2912 } 2913 2914 static uint_t 2915 vd_handle_ldc_events(uint64_t event, caddr_t arg) 2916 { 2917 vd_t *vd = (vd_t *)(void *)arg; 2918 int status; 2919 2920 ASSERT(vd != NULL); 2921 2922 if (!vd_enabled(vd)) 2923 return (LDC_SUCCESS); 2924 2925 if (event & LDC_EVT_DOWN) { 2926 PR0("LDC_EVT_DOWN: LDC channel went down"); 2927 2928 vd_need_reset(vd, B_TRUE); 2929 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 2930 DDI_SLEEP); 2931 if (status == DDI_FAILURE) { 2932 PR0("cannot schedule task to recv msg\n"); 2933 vd_need_reset(vd, B_TRUE); 2934 } 2935 } 2936 2937 if (event & LDC_EVT_RESET) { 2938 PR0("LDC_EVT_RESET: LDC channel was reset"); 2939 2940 if (vd->state != VD_STATE_INIT) { 2941 PR0("scheduling full reset"); 2942 vd_need_reset(vd, B_FALSE); 2943 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 2944 vd, DDI_SLEEP); 2945 if (status == DDI_FAILURE) { 2946 PR0("cannot schedule task to recv msg\n"); 2947 vd_need_reset(vd, B_TRUE); 2948 } 2949 2950 } else { 2951 PR0("channel already reset, ignoring...\n"); 2952 PR0("doing ldc up...\n"); 2953 (void) ldc_up(vd->ldc_handle); 2954 } 2955 2956 return (LDC_SUCCESS); 2957 } 2958 2959 if (event & LDC_EVT_UP) { 2960 PR0("EVT_UP: LDC is up\nResetting client connection state"); 2961 PR0("initiating soft reset"); 2962 vd_need_reset(vd, B_FALSE); 2963 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 2964 vd, DDI_SLEEP); 2965 if (status == DDI_FAILURE) { 2966 PR0("cannot schedule task to recv msg\n"); 2967 vd_need_reset(vd, 
B_TRUE); 2968 return (LDC_SUCCESS); 2969 } 2970 } 2971 2972 if (event & LDC_EVT_READ) { 2973 int status; 2974 2975 PR1("New data available"); 2976 /* Queue a task to receive the new data */ 2977 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 2978 DDI_SLEEP); 2979 2980 if (status == DDI_FAILURE) { 2981 PR0("cannot schedule task to recv msg\n"); 2982 vd_need_reset(vd, B_TRUE); 2983 } 2984 } 2985 2986 return (LDC_SUCCESS); 2987 } 2988 2989 static uint_t 2990 vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg) 2991 { 2992 _NOTE(ARGUNUSED(key, val)) 2993 (*((uint_t *)arg))++; 2994 return (MH_WALK_TERMINATE); 2995 } 2996 2997 2998 static int 2999 vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 3000 { 3001 uint_t vd_present = 0; 3002 minor_t instance; 3003 vds_t *vds; 3004 3005 3006 switch (cmd) { 3007 case DDI_DETACH: 3008 /* the real work happens below */ 3009 break; 3010 case DDI_SUSPEND: 3011 PR0("No action required for DDI_SUSPEND"); 3012 return (DDI_SUCCESS); 3013 default: 3014 PR0("Unrecognized \"cmd\""); 3015 return (DDI_FAILURE); 3016 } 3017 3018 ASSERT(cmd == DDI_DETACH); 3019 instance = ddi_get_instance(dip); 3020 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { 3021 PR0("Could not get state for instance %u", instance); 3022 ddi_soft_state_free(vds_state, instance); 3023 return (DDI_FAILURE); 3024 } 3025 3026 /* Do no detach when serving any vdisks */ 3027 mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present); 3028 if (vd_present) { 3029 PR0("Not detaching because serving vdisks"); 3030 return (DDI_FAILURE); 3031 } 3032 3033 PR0("Detaching"); 3034 if (vds->initialized & VDS_MDEG) { 3035 (void) mdeg_unregister(vds->mdeg); 3036 kmem_free(vds->ispecp->specp, sizeof (vds_prop_template)); 3037 kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t)); 3038 vds->ispecp = NULL; 3039 vds->mdeg = NULL; 3040 } 3041 3042 if (vds->initialized & VDS_LDI) 3043 (void) ldi_ident_release(vds->ldi_ident); 3044 mod_hash_destroy_hash(vds->vd_table); 3045 ddi_soft_state_free(vds_state, instance); 3046 return (DDI_SUCCESS); 3047 } 3048 3049 static boolean_t 3050 is_pseudo_device(dev_info_t *dip) 3051 { 3052 dev_info_t *parent, *root = ddi_root_node(); 3053 3054 3055 for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root); 3056 parent = ddi_get_parent(parent)) { 3057 if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0) 3058 return (B_TRUE); 3059 } 3060 3061 return (B_FALSE); 3062 } 3063 3064 static int 3065 vd_setup_full_disk(vd_t *vd) 3066 { 3067 int rval, status; 3068 major_t major = getmajor(vd->dev[0]); 3069 minor_t minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE; 3070 struct dk_minfo dk_minfo; 3071 3072 /* 3073 * At this point, vdisk_size is set to the size of partition 2 but 3074 * this does not represent the size of the disk because partition 2 3075 * may not cover the entire disk and its size does not include reserved 3076 * blocks. So we update vdisk_size to be the size of the entire disk. 
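	 *
	 * For example: when the VTOC defines slice 2 over only the data
	 * cylinders, the ldi_get_size() value picked up earlier in
	 * vd_setup_vd() reflects just that slice, whereas the
	 * DKIOCGMEDIAINFO capacity queried below covers every block on
	 * the media, including any blocks that lie outside slice 2.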
3077 */ 3078 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO, 3079 (intptr_t)&dk_minfo, (vd_open_flags | FKIOCTL), 3080 kcred, &rval)) != 0) { 3081 PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d", 3082 status); 3083 return (status); 3084 } 3085 vd->vdisk_size = dk_minfo.dki_capacity; 3086 3087 /* Set full-disk parameters */ 3088 vd->vdisk_type = VD_DISK_TYPE_DISK; 3089 vd->nslices = (sizeof (vd->dev))/(sizeof (vd->dev[0])); 3090 3091 /* Move dev number and LDI handle to entire-disk-slice array elements */ 3092 vd->dev[VD_ENTIRE_DISK_SLICE] = vd->dev[0]; 3093 vd->dev[0] = 0; 3094 vd->ldi_handle[VD_ENTIRE_DISK_SLICE] = vd->ldi_handle[0]; 3095 vd->ldi_handle[0] = NULL; 3096 3097 /* Initialize device numbers for remaining slices and open them */ 3098 for (int slice = 0; slice < vd->nslices; slice++) { 3099 /* 3100 * Skip the entire-disk slice, as it's already open and its 3101 * device known 3102 */ 3103 if (slice == VD_ENTIRE_DISK_SLICE) 3104 continue; 3105 ASSERT(vd->dev[slice] == 0); 3106 ASSERT(vd->ldi_handle[slice] == NULL); 3107 3108 /* 3109 * Construct the device number for the current slice 3110 */ 3111 vd->dev[slice] = makedevice(major, (minor + slice)); 3112 3113 /* 3114 * Open all slices of the disk to serve them to the client. 3115 * Slices are opened exclusively to prevent other threads or 3116 * processes in the service domain from performing I/O to 3117 * slices being accessed by a client. Failure to open a slice 3118 * results in vds not serving this disk, as the client could 3119 * attempt (and should be able) to access any slice immediately. 3120 * Any slices successfully opened before a failure will get 3121 * closed by vds_destroy_vd() as a result of the error returned 3122 * by this function. 3123 * 3124 * We need to do the open with FNDELAY so that opening an empty 3125 * slice does not fail. 
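	 *
	 * (Without FNDELAY the open of a zero-length slice would typically
	 * fail with something like ENXIO, and a single empty slice would
	 * then keep the entire disk from being exported.)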
3126 */ 3127 PR0("Opening device major %u, minor %u = slice %u", 3128 major, minor, slice); 3129 if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 3130 vd_open_flags | FNDELAY, kcred, &vd->ldi_handle[slice], 3131 vd->vds->ldi_ident)) != 0) { 3132 PRN("ldi_open_by_dev() returned errno %d " 3133 "for slice %u", status, slice); 3134 /* vds_destroy_vd() will close any open slices */ 3135 vd->ldi_handle[slice] = NULL; 3136 return (status); 3137 } 3138 } 3139 3140 return (0); 3141 } 3142 3143 static int 3144 vd_setup_partition_efi(vd_t *vd) 3145 { 3146 efi_gpt_t *gpt; 3147 efi_gpe_t *gpe; 3148 struct uuid uuid = EFI_RESERVED; 3149 uint32_t crc; 3150 int length; 3151 3152 length = sizeof (efi_gpt_t) + sizeof (efi_gpe_t); 3153 3154 gpt = kmem_zalloc(length, KM_SLEEP); 3155 gpe = (efi_gpe_t *)(gpt + 1); 3156 3157 gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE); 3158 gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); 3159 gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t)); 3160 gpt->efi_gpt_FirstUsableLBA = LE_64(0ULL); 3161 gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1); 3162 gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1); 3163 gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t)); 3164 3165 UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid); 3166 gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA; 3167 gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA; 3168 3169 CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table); 3170 gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); 3171 3172 CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table); 3173 gpt->efi_gpt_HeaderCRC32 = LE_32(~crc); 3174 3175 vd->dk_efi.dki_lba = 0; 3176 vd->dk_efi.dki_length = length; 3177 vd->dk_efi.dki_data = gpt; 3178 3179 return (0); 3180 } 3181 3182 static int 3183 vd_setup_file(vd_t *vd) 3184 { 3185 int i, rval, status; 3186 ushort_t sum; 3187 vattr_t vattr; 3188 dev_t dev; 3189 size_t size; 3190 char *file_path = vd->device_path; 3191 char dev_path[MAXPATHLEN + 1]; 3192 char prefix; 3193 ldi_handle_t lhandle; 3194 struct dk_cinfo dk_cinfo; 3195 struct dk_label label; 3196 3197 /* make sure the file is valid */ 3198 if ((status = lookupname(file_path, UIO_SYSSPACE, FOLLOW, 3199 NULLVPP, &vd->file_vnode)) != 0) { 3200 PRN("Cannot lookup file(%s) errno %d", file_path, status); 3201 return (status); 3202 } 3203 3204 if (vd->file_vnode->v_type != VREG) { 3205 PRN("Invalid file type (%s)\n", file_path); 3206 VN_RELE(vd->file_vnode); 3207 return (EBADF); 3208 } 3209 VN_RELE(vd->file_vnode); 3210 3211 if ((status = vn_open(file_path, UIO_SYSSPACE, vd_open_flags | FOFFMAX, 3212 0, &vd->file_vnode, 0, 0)) != 0) { 3213 PRN("vn_open(%s) = errno %d", file_path, status); 3214 return (status); 3215 } 3216 3217 /* 3218 * We set vd->file now so that vds_destroy_vd will take care of 3219 * closing the file and releasing the vnode in case of an error. 
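	 *
	 * (Once vd->file is set, vds_destroy_vd() performs the
	 * VOP_CLOSE()/VN_RELE() and frees any devid that was read or
	 * created, so the error paths below can simply return.)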
3220 */ 3221 vd->file = B_TRUE; 3222 vd->pseudo = B_FALSE; 3223 3224 vattr.va_mask = AT_SIZE; 3225 if ((status = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred)) != 0) { 3226 PRN("VOP_GETATTR(%s) = errno %d", file_path, status); 3227 return (EIO); 3228 } 3229 3230 vd->file_size = vattr.va_size; 3231 /* size should be at least sizeof(dk_label) */ 3232 if (vd->file_size < sizeof (struct dk_label)) { 3233 PRN("Size of file has to be at least %ld bytes", 3234 sizeof (struct dk_label)); 3235 return (EIO); 3236 } 3237 3238 if (vd->file_vnode->v_flag & VNOMAP) { 3239 PRN("File %s cannot be mapped", file_path); 3240 return (EIO); 3241 } 3242 3243 /* read label from file */ 3244 if (VD_FILE_LABEL_READ(vd, &label) < 0) { 3245 PRN("Can't read label from %s", file_path); 3246 return (EIO); 3247 } 3248 3249 /* label checksum */ 3250 sum = vd_lbl2cksum(&label); 3251 3252 if (label.dkl_magic != DKL_MAGIC || label.dkl_cksum != sum) { 3253 PR0("%s has an invalid disk label " 3254 "(magic=%x cksum=%x (expect %x))", 3255 file_path, label.dkl_magic, label.dkl_cksum, sum); 3256 3257 /* default label */ 3258 bzero(&label, sizeof (struct dk_label)); 3259 3260 /* 3261 * We must have a resonable number of cylinders and sectors so 3262 * that newfs can run using default values. 3263 * 3264 * if (disk_size < 2MB) 3265 * phys_cylinders = disk_size / 100K 3266 * else 3267 * phys_cylinders = disk_size / 300K 3268 * 3269 * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders 3270 * alt_cylinders = (phys_cylinders > 2) ? 2 : 0; 3271 * data_cylinders = phys_cylinders - alt_cylinders 3272 * 3273 * sectors = disk_size / (phys_cylinders * blk_size) 3274 */ 3275 if (vd->file_size < (2 * 1024 * 1024)) 3276 label.dkl_pcyl = vd->file_size / (100 * 1024); 3277 else 3278 label.dkl_pcyl = vd->file_size / (300 * 1024); 3279 3280 if (label.dkl_pcyl == 0) 3281 label.dkl_pcyl = 1; 3282 3283 if (label.dkl_pcyl > 2) 3284 label.dkl_acyl = 2; 3285 else 3286 label.dkl_acyl = 0; 3287 3288 label.dkl_nsect = vd->file_size / 3289 (DEV_BSIZE * label.dkl_pcyl); 3290 label.dkl_ncyl = label.dkl_pcyl - label.dkl_acyl; 3291 label.dkl_nhead = 1; 3292 label.dkl_write_reinstruct = 0; 3293 label.dkl_read_reinstruct = 0; 3294 label.dkl_rpm = 7200; 3295 label.dkl_apc = 0; 3296 label.dkl_intrlv = 0; 3297 label.dkl_magic = DKL_MAGIC; 3298 3299 PR0("requested disk size: %ld bytes\n", vd->file_size); 3300 PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label.dkl_pcyl, 3301 label.dkl_nhead, label.dkl_nsect); 3302 PR0("provided disk size: %ld bytes\n", (uint64_t) 3303 (label.dkl_pcyl * 3304 label.dkl_nhead * label.dkl_nsect * DEV_BSIZE)); 3305 3306 if (vd->file_size < (1ULL << 20)) { 3307 size = vd->file_size >> 10; 3308 prefix = 'K'; /* Kilobyte */ 3309 } else if (vd->file_size < (1ULL << 30)) { 3310 size = vd->file_size >> 20; 3311 prefix = 'M'; /* Megabyte */ 3312 } else if (vd->file_size < (1ULL << 40)) { 3313 size = vd->file_size >> 30; 3314 prefix = 'G'; /* Gigabyte */ 3315 } else { 3316 size = vd->file_size >> 40; 3317 prefix = 'T'; /* Terabyte */ 3318 } 3319 3320 /* 3321 * We must have a correct label name otherwise format(1m) will 3322 * not recognized the disk as labeled. 
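	 *
	 * The generated name follows the pattern below; the figures are an
	 * illustrative example only (roughly what an 8 GB image would get
	 * from the synthetic geometry computed above):
	 *
	 *	SUN-DiskImage-8GB cyl 27960 alt 2 hd 1 sec 600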
3323 */ 3324 (void) snprintf(label.dkl_asciilabel, LEN_DKL_ASCII, 3325 "SUN-DiskImage-%ld%cB cyl %d alt %d hd %d sec %d", 3326 size, prefix, 3327 label.dkl_ncyl, label.dkl_acyl, label.dkl_nhead, 3328 label.dkl_nsect); 3329 3330 /* default VTOC */ 3331 label.dkl_vtoc.v_version = V_VERSION; 3332 label.dkl_vtoc.v_nparts = V_NUMPAR; 3333 label.dkl_vtoc.v_sanity = VTOC_SANE; 3334 label.dkl_vtoc.v_part[2].p_tag = V_BACKUP; 3335 label.dkl_map[2].dkl_cylno = 0; 3336 label.dkl_map[2].dkl_nblk = label.dkl_ncyl * 3337 label.dkl_nhead * label.dkl_nsect; 3338 label.dkl_map[0] = label.dkl_map[2]; 3339 label.dkl_map[0] = label.dkl_map[2]; 3340 label.dkl_cksum = vd_lbl2cksum(&label); 3341 3342 /* write default label to file */ 3343 if ((rval = vd_file_set_vtoc(vd, &label)) != 0) { 3344 PRN("Can't write label to %s", file_path); 3345 return (rval); 3346 } 3347 } 3348 3349 vd->nslices = label.dkl_vtoc.v_nparts; 3350 3351 /* sector size = block size = DEV_BSIZE */ 3352 vd->vdisk_size = vd->file_size / DEV_BSIZE; 3353 vd->vdisk_type = VD_DISK_TYPE_DISK; 3354 vd->vdisk_label = VD_DISK_LABEL_VTOC; 3355 vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */ 3356 3357 /* Get max_xfer_sz from the device where the file is */ 3358 dev = vd->file_vnode->v_vfsp->vfs_dev; 3359 dev_path[0] = NULL; 3360 if (ddi_dev_pathname(dev, S_IFBLK, dev_path) == DDI_SUCCESS) { 3361 PR0("underlying device = %s\n", dev_path); 3362 } 3363 3364 if ((status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD, 3365 kcred, &lhandle, vd->vds->ldi_ident)) != 0) { 3366 PR0("ldi_open_by_dev() returned errno %d for device %s", 3367 status, dev_path); 3368 } else { 3369 if ((status = ldi_ioctl(lhandle, DKIOCINFO, 3370 (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred, 3371 &rval)) != 0) { 3372 PR0("ldi_ioctl(DKIOCINFO) returned errno %d for %s", 3373 status, dev_path); 3374 } else { 3375 /* 3376 * Store the device's max transfer size for 3377 * return to the client 3378 */ 3379 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; 3380 } 3381 3382 PR0("close the device %s", dev_path); 3383 (void) ldi_close(lhandle, FREAD, kcred); 3384 } 3385 3386 PR0("using for file %s, dev %s, max_xfer = %u blks", 3387 file_path, dev_path, vd->max_xfer_sz); 3388 3389 vd->dk_geom.dkg_ncyl = label.dkl_ncyl; 3390 vd->dk_geom.dkg_acyl = label.dkl_acyl; 3391 vd->dk_geom.dkg_pcyl = label.dkl_pcyl; 3392 vd->dk_geom.dkg_nhead = label.dkl_nhead; 3393 vd->dk_geom.dkg_nsect = label.dkl_nsect; 3394 vd->dk_geom.dkg_intrlv = label.dkl_intrlv; 3395 vd->dk_geom.dkg_apc = label.dkl_apc; 3396 vd->dk_geom.dkg_rpm = label.dkl_rpm; 3397 vd->dk_geom.dkg_write_reinstruct = label.dkl_write_reinstruct; 3398 vd->dk_geom.dkg_read_reinstruct = label.dkl_read_reinstruct; 3399 3400 vd->vtoc.v_sanity = label.dkl_vtoc.v_sanity; 3401 vd->vtoc.v_version = label.dkl_vtoc.v_version; 3402 vd->vtoc.v_sectorsz = DEV_BSIZE; 3403 vd->vtoc.v_nparts = label.dkl_vtoc.v_nparts; 3404 3405 bcopy(label.dkl_vtoc.v_volume, vd->vtoc.v_volume, 3406 LEN_DKL_VVOL); 3407 bcopy(label.dkl_asciilabel, vd->vtoc.v_asciilabel, 3408 LEN_DKL_ASCII); 3409 3410 for (i = 0; i < vd->nslices; i++) { 3411 vd->vtoc.timestamp[i] = label.dkl_vtoc.v_timestamp[i]; 3412 vd->vtoc.v_part[i].p_tag = label.dkl_vtoc.v_part[i].p_tag; 3413 vd->vtoc.v_part[i].p_flag = label.dkl_vtoc.v_part[i].p_flag; 3414 vd->vtoc.v_part[i].p_start = label.dkl_map[i].dkl_cylno * 3415 label.dkl_nhead * label.dkl_nsect; 3416 vd->vtoc.v_part[i].p_size = label.dkl_map[i].dkl_nblk; 3417 vd->ldi_handle[i] = NULL; 3418 vd->dev[i] = NULL; 3419 } 3420 3421 /* Setup devid 
for the disk image */ 3422 3423 status = vd_file_read_devid(vd, &vd->file_devid); 3424 3425 if (status == 0) { 3426 /* a valid devid was found */ 3427 return (0); 3428 } 3429 3430 if (status != EINVAL) { 3431 /* 3432 * There was an error while trying to read the devid. So this 3433 * disk image may have a devid but we are unable to read it. 3434 */ 3435 PR0("can not read devid for %s", file_path); 3436 vd->file_devid = NULL; 3437 return (0); 3438 } 3439 3440 /* 3441 * No valid device id was found so we create one. Note that a failure 3442 * to create a device id is not fatal and does not prevent the disk 3443 * image from being attached. 3444 */ 3445 PR1("creating devid for %s", file_path); 3446 3447 if (ddi_devid_init(vd->vds->dip, DEVID_FAB, NULL, 0, 3448 &vd->file_devid) != DDI_SUCCESS) { 3449 PR0("fail to create devid for %s", file_path); 3450 vd->file_devid = NULL; 3451 return (0); 3452 } 3453 3454 /* write devid to the disk image */ 3455 if (vd_file_write_devid(vd, vd->file_devid) != 0) { 3456 PR0("fail to write devid for %s", file_path); 3457 ddi_devid_free(vd->file_devid); 3458 vd->file_devid = NULL; 3459 } 3460 3461 return (0); 3462 } 3463 3464 static int 3465 vd_setup_vd(vd_t *vd) 3466 { 3467 int rval, status; 3468 dev_info_t *dip; 3469 struct dk_cinfo dk_cinfo; 3470 char *device_path = vd->device_path; 3471 3472 /* 3473 * We need to open with FNDELAY so that opening an empty partition 3474 * does not fail. 3475 */ 3476 if ((status = ldi_open_by_name(device_path, vd_open_flags | FNDELAY, 3477 kcred, &vd->ldi_handle[0], vd->vds->ldi_ident)) != 0) { 3478 PR0("ldi_open_by_name(%s) = errno %d", device_path, status); 3479 vd->ldi_handle[0] = NULL; 3480 3481 /* this may not be a device try opening as a file */ 3482 if (status == ENXIO || status == ENODEV) 3483 status = vd_setup_file(vd); 3484 if (status) { 3485 PRN("Cannot use device/file (%s), errno=%d\n", 3486 device_path, status); 3487 if (status == ENXIO || status == ENODEV || 3488 status == ENOENT) { 3489 return (EAGAIN); 3490 } 3491 } 3492 return (status); 3493 } 3494 3495 /* 3496 * nslices must be updated now so that vds_destroy_vd() will close 3497 * the slice we have just opened in case of an error. 
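	 *
	 * (vds_destroy_vd() only walks ldi_handle[0 .. nslices - 1], so
	 * leaving nslices at zero here would leak the handle that was just
	 * opened if a later step fails.)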
3498 */ 3499 vd->nslices = 1; 3500 vd->file = B_FALSE; 3501 3502 /* Get device number and size of backing device */ 3503 if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) { 3504 PRN("ldi_get_dev() returned errno %d for %s", 3505 status, device_path); 3506 return (status); 3507 } 3508 if (ldi_get_size(vd->ldi_handle[0], &vd->vdisk_size) != DDI_SUCCESS) { 3509 PRN("ldi_get_size() failed for %s", device_path); 3510 return (EIO); 3511 } 3512 vd->vdisk_size = lbtodb(vd->vdisk_size); /* convert to blocks */ 3513 3514 /* Verify backing device supports dk_cinfo, dk_geom, and vtoc */ 3515 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO, 3516 (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred, 3517 &rval)) != 0) { 3518 PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s", 3519 status, device_path); 3520 return (status); 3521 } 3522 if (dk_cinfo.dki_partition >= V_NUMPAR) { 3523 PRN("slice %u >= maximum slice %u for %s", 3524 dk_cinfo.dki_partition, V_NUMPAR, device_path); 3525 return (EIO); 3526 } 3527 3528 status = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc, &vd->vdisk_label); 3529 3530 if (status != 0) { 3531 PRN("vd_read_vtoc returned errno %d for %s", 3532 status, device_path); 3533 return (status); 3534 } 3535 3536 if (vd->vdisk_label == VD_DISK_LABEL_VTOC && 3537 (status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM, 3538 (intptr_t)&vd->dk_geom, (vd_open_flags | FKIOCTL), 3539 kcred, &rval)) != 0) { 3540 PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s", 3541 status, device_path); 3542 return (status); 3543 } 3544 3545 /* Store the device's max transfer size for return to the client */ 3546 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; 3547 3548 /* Determine if backing device is a pseudo device */ 3549 if ((dip = ddi_hold_devi_by_instance(getmajor(vd->dev[0]), 3550 dev_to_instance(vd->dev[0]), 0)) == NULL) { 3551 PRN("%s is no longer accessible", device_path); 3552 return (EIO); 3553 } 3554 vd->pseudo = is_pseudo_device(dip); 3555 ddi_release_devi(dip); 3556 if (vd->pseudo) { 3557 vd->vdisk_type = VD_DISK_TYPE_SLICE; 3558 vd->nslices = 1; 3559 return (0); /* ...and we're done */ 3560 } 3561 3562 /* If slice is entire-disk slice, initialize for full disk */ 3563 if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE) 3564 return (vd_setup_full_disk(vd)); 3565 3566 3567 /* Otherwise, we have a non-entire slice of a device */ 3568 vd->vdisk_type = VD_DISK_TYPE_SLICE; 3569 vd->nslices = 1; 3570 3571 if (vd->vdisk_label == VD_DISK_LABEL_EFI) { 3572 status = vd_setup_partition_efi(vd); 3573 return (status); 3574 } 3575 3576 /* Initialize dk_geom structure for single-slice device */ 3577 if (vd->dk_geom.dkg_nsect == 0) { 3578 PRN("%s geometry claims 0 sectors per track", device_path); 3579 return (EIO); 3580 } 3581 if (vd->dk_geom.dkg_nhead == 0) { 3582 PRN("%s geometry claims 0 heads", device_path); 3583 return (EIO); 3584 } 3585 vd->dk_geom.dkg_ncyl = 3586 vd->vdisk_size/vd->dk_geom.dkg_nsect/vd->dk_geom.dkg_nhead; 3587 vd->dk_geom.dkg_acyl = 0; 3588 vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl; 3589 3590 3591 /* Initialize vtoc structure for single-slice device */ 3592 bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume, 3593 MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume))); 3594 bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part)); 3595 vd->vtoc.v_nparts = 1; 3596 vd->vtoc.v_part[0].p_tag = V_UNASSIGNED; 3597 vd->vtoc.v_part[0].p_flag = 0; 3598 vd->vtoc.v_part[0].p_start = 0; 3599 vd->vtoc.v_part[0].p_size = vd->vdisk_size; 3600 bcopy(VD_ASCIILABEL, 
vd->vtoc.v_asciilabel, 3601 MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel))); 3602 3603 3604 return (0); 3605 } 3606 3607 static int 3608 vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id, 3609 vd_t **vdp) 3610 { 3611 char tq_name[TASKQ_NAMELEN]; 3612 int status; 3613 ddi_iblock_cookie_t iblock = NULL; 3614 ldc_attr_t ldc_attr; 3615 vd_t *vd; 3616 3617 3618 ASSERT(vds != NULL); 3619 ASSERT(device_path != NULL); 3620 ASSERT(vdp != NULL); 3621 PR0("Adding vdisk for %s", device_path); 3622 3623 if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) { 3624 PRN("No memory for virtual disk"); 3625 return (EAGAIN); 3626 } 3627 *vdp = vd; /* assign here so vds_destroy_vd() can cleanup later */ 3628 vd->vds = vds; 3629 (void) strncpy(vd->device_path, device_path, MAXPATHLEN); 3630 3631 /* Open vdisk and initialize parameters */ 3632 if ((status = vd_setup_vd(vd)) == 0) { 3633 vd->initialized |= VD_DISK_READY; 3634 3635 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); 3636 PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u", 3637 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), 3638 (vd->pseudo ? "yes" : "no"), (vd->file ? "yes" : "no"), 3639 vd->nslices); 3640 } else { 3641 if (status != EAGAIN) 3642 return (status); 3643 } 3644 3645 /* Initialize locking */ 3646 if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED, 3647 &iblock) != DDI_SUCCESS) { 3648 PRN("Could not get iblock cookie."); 3649 return (EIO); 3650 } 3651 3652 mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock); 3653 vd->initialized |= VD_LOCKING; 3654 3655 3656 /* Create start and completion task queues for the vdisk */ 3657 (void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id); 3658 PR1("tq_name = %s", tq_name); 3659 if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1, 3660 TASKQ_DEFAULTPRI, 0)) == NULL) { 3661 PRN("Could not create task queue"); 3662 return (EIO); 3663 } 3664 (void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id); 3665 PR1("tq_name = %s", tq_name); 3666 if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1, 3667 TASKQ_DEFAULTPRI, 0)) == NULL) { 3668 PRN("Could not create task queue"); 3669 return (EIO); 3670 } 3671 vd->enabled = 1; /* before callback can dispatch to startq */ 3672 3673 3674 /* Bring up LDC */ 3675 ldc_attr.devclass = LDC_DEV_BLK_SVC; 3676 ldc_attr.instance = ddi_get_instance(vds->dip); 3677 ldc_attr.mode = LDC_MODE_UNRELIABLE; 3678 ldc_attr.mtu = VD_LDC_MTU; 3679 if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) { 3680 PRN("Could not initialize LDC channel %lu, " 3681 "init failed with error %d", ldc_id, status); 3682 return (status); 3683 } 3684 vd->initialized |= VD_LDC; 3685 3686 if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events, 3687 (caddr_t)vd)) != 0) { 3688 PRN("Could not initialize LDC channel %lu," 3689 "reg_callback failed with error %d", ldc_id, status); 3690 return (status); 3691 } 3692 3693 if ((status = ldc_open(vd->ldc_handle)) != 0) { 3694 PRN("Could not initialize LDC channel %lu," 3695 "open failed with error %d", ldc_id, status); 3696 return (status); 3697 } 3698 3699 if ((status = ldc_up(vd->ldc_handle)) != 0) { 3700 PR0("ldc_up() returned errno %d", status); 3701 } 3702 3703 /* Allocate the inband task memory handle */ 3704 status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl)); 3705 if (status) { 3706 PRN("Could not initialize LDC channel %lu," 3707 "alloc_handle failed with error %d", ldc_id, status); 3708 return (ENXIO); 3709 } 
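	/*
	 * Illustrative recap of the LDC bring-up sequence above (the calls
	 * themselves are authoritative; this only summarizes the order):
	 *
	 *	ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle);
	 *	ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events, vd);
	 *	ldc_open(vd->ldc_handle);
	 *	ldc_up(vd->ldc_handle);	(failure tolerated; the channel may
	 *				come up once the client end is ready)
	 *	ldc_mem_alloc_handle(vd->ldc_handle, &vd->inband_task.mhdl);
	 */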
3710 3711 /* Add the successfully-initialized vdisk to the server's table */ 3712 if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) { 3713 PRN("Error adding vdisk ID %lu to table", id); 3714 return (EIO); 3715 } 3716 3717 /* Allocate the staging buffer */ 3718 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 3719 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 3720 3721 /* store initial state */ 3722 vd->state = VD_STATE_INIT; 3723 3724 return (0); 3725 } 3726 3727 static void 3728 vd_free_dring_task(vd_t *vdp) 3729 { 3730 if (vdp->dring_task != NULL) { 3731 ASSERT(vdp->dring_len != 0); 3732 /* Free all dring_task memory handles */ 3733 for (int i = 0; i < vdp->dring_len; i++) { 3734 (void) ldc_mem_free_handle(vdp->dring_task[i].mhdl); 3735 kmem_free(vdp->dring_task[i].msg, vdp->max_msglen); 3736 vdp->dring_task[i].msg = NULL; 3737 } 3738 kmem_free(vdp->dring_task, 3739 (sizeof (*vdp->dring_task)) * vdp->dring_len); 3740 vdp->dring_task = NULL; 3741 } 3742 } 3743 3744 /* 3745 * Destroy the state associated with a virtual disk 3746 */ 3747 static void 3748 vds_destroy_vd(void *arg) 3749 { 3750 vd_t *vd = (vd_t *)arg; 3751 int retry = 0, rv; 3752 3753 if (vd == NULL) 3754 return; 3755 3756 PR0("Destroying vdisk state"); 3757 3758 if (vd->dk_efi.dki_data != NULL) 3759 kmem_free(vd->dk_efi.dki_data, vd->dk_efi.dki_length); 3760 3761 /* Disable queuing requests for the vdisk */ 3762 if (vd->initialized & VD_LOCKING) { 3763 mutex_enter(&vd->lock); 3764 vd->enabled = 0; 3765 mutex_exit(&vd->lock); 3766 } 3767 3768 /* Drain and destroy start queue (*before* destroying completionq) */ 3769 if (vd->startq != NULL) 3770 ddi_taskq_destroy(vd->startq); /* waits for queued tasks */ 3771 3772 /* Drain and destroy completion queue (*before* shutting down LDC) */ 3773 if (vd->completionq != NULL) 3774 ddi_taskq_destroy(vd->completionq); /* waits for tasks */ 3775 3776 vd_free_dring_task(vd); 3777 3778 /* Free the inband task memory handle */ 3779 (void) ldc_mem_free_handle(vd->inband_task.mhdl); 3780 3781 /* Shut down LDC */ 3782 if (vd->initialized & VD_LDC) { 3783 /* unmap the dring */ 3784 if (vd->initialized & VD_DRING) 3785 (void) ldc_mem_dring_unmap(vd->dring_handle); 3786 3787 /* close LDC channel - retry on EAGAIN */ 3788 while ((rv = ldc_close(vd->ldc_handle)) == EAGAIN) { 3789 if (++retry > vds_ldc_retries) { 3790 PR0("Timed out closing channel"); 3791 break; 3792 } 3793 drv_usecwait(vds_ldc_delay); 3794 } 3795 if (rv == 0) { 3796 (void) ldc_unreg_callback(vd->ldc_handle); 3797 (void) ldc_fini(vd->ldc_handle); 3798 } else { 3799 /* 3800 * Closing the LDC channel has failed. Ideally we should 3801 * fail here but there is no Zeus level infrastructure 3802 * to handle this. The MD has already been changed and 3803 * we have to do the close. So we try to do as much 3804 * clean up as we can. 
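		 *
		 * (Best effort below: callbacks are disabled with
		 * LDC_CB_DISABLE and ldc_unreg_callback() is retried, so a
		 * channel that could not be closed at least stops delivering
		 * events into state that is about to be freed.)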
3805 */ 3806 (void) ldc_set_cb_mode(vd->ldc_handle, LDC_CB_DISABLE); 3807 while (ldc_unreg_callback(vd->ldc_handle) == EAGAIN) 3808 drv_usecwait(vds_ldc_delay); 3809 } 3810 } 3811 3812 /* Free the staging buffer for msgs */ 3813 if (vd->vio_msgp != NULL) { 3814 kmem_free(vd->vio_msgp, vd->max_msglen); 3815 vd->vio_msgp = NULL; 3816 } 3817 3818 /* Free the inband message buffer */ 3819 if (vd->inband_task.msg != NULL) { 3820 kmem_free(vd->inband_task.msg, vd->max_msglen); 3821 vd->inband_task.msg = NULL; 3822 } 3823 if (vd->file) { 3824 /* Close file */ 3825 (void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1, 3826 0, kcred); 3827 VN_RELE(vd->file_vnode); 3828 if (vd->file_devid != NULL) 3829 ddi_devid_free(vd->file_devid); 3830 } else { 3831 /* Close any open backing-device slices */ 3832 for (uint_t slice = 0; slice < vd->nslices; slice++) { 3833 if (vd->ldi_handle[slice] != NULL) { 3834 PR0("Closing slice %u", slice); 3835 (void) ldi_close(vd->ldi_handle[slice], 3836 vd_open_flags | FNDELAY, kcred); 3837 } 3838 } 3839 } 3840 3841 /* Free lock */ 3842 if (vd->initialized & VD_LOCKING) 3843 mutex_destroy(&vd->lock); 3844 3845 /* Finally, free the vdisk structure itself */ 3846 kmem_free(vd, sizeof (*vd)); 3847 } 3848 3849 static int 3850 vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id) 3851 { 3852 int status; 3853 vd_t *vd = NULL; 3854 3855 3856 if ((status = vds_do_init_vd(vds, id, device_path, ldc_id, &vd)) != 0) 3857 vds_destroy_vd(vd); 3858 3859 return (status); 3860 } 3861 3862 static int 3863 vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel, 3864 uint64_t *ldc_id) 3865 { 3866 int num_channels; 3867 3868 3869 /* Look for channel endpoint child(ren) of the vdisk MD node */ 3870 if ((num_channels = md_scan_dag(md, vd_node, 3871 md_find_name(md, VD_CHANNEL_ENDPOINT), 3872 md_find_name(md, "fwd"), channel)) <= 0) { 3873 PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT); 3874 return (-1); 3875 } 3876 3877 /* Get the "id" value for the first channel endpoint node */ 3878 if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) { 3879 PRN("No \"%s\" property found for \"%s\" of vdisk", 3880 VD_ID_PROP, VD_CHANNEL_ENDPOINT); 3881 return (-1); 3882 } 3883 3884 if (num_channels > 1) { 3885 PRN("Using ID of first of multiple channels for this vdisk"); 3886 } 3887 3888 return (0); 3889 } 3890 3891 static int 3892 vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id) 3893 { 3894 int num_nodes, status; 3895 size_t size; 3896 mde_cookie_t *channel; 3897 3898 3899 if ((num_nodes = md_node_count(md)) <= 0) { 3900 PRN("Invalid node count in Machine Description subtree"); 3901 return (-1); 3902 } 3903 size = num_nodes*(sizeof (*channel)); 3904 channel = kmem_zalloc(size, KM_SLEEP); 3905 status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id); 3906 kmem_free(channel, size); 3907 3908 return (status); 3909 } 3910 3911 static void 3912 vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) 3913 { 3914 char *device_path = NULL; 3915 uint64_t id = 0, ldc_id = 0; 3916 3917 3918 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { 3919 PRN("Error getting vdisk \"%s\"", VD_ID_PROP); 3920 return; 3921 } 3922 PR0("Adding vdisk ID %lu", id); 3923 if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP, 3924 &device_path) != 0) { 3925 PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); 3926 return; 3927 } 3928 3929 if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) { 3930 PRN("Error getting LDC ID for vdisk %lu", id); 3931 return; 3932 } 
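	/*
	 * Hypothetical shape of the MD data consumed above (the property
	 * and node names are the ones actually queried; the values shown
	 * are invented for illustration):
	 *
	 *	virtual-device-port node
	 *	    id = 0				(VD_ID_PROP)
	 *	    vds-block-device = "/dev/dsk/..."	(VD_BLOCK_DEVICE_PROP)
	 *	    channel-endpoint node, via "fwd" arc
	 *		id = <LDC channel ID>		(read by vds_get_ldc_id())
	 */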
static void
vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
{
        char            *device_path = NULL;
        uint64_t        id = 0, ldc_id = 0;

        if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
                PRN("Error getting vdisk \"%s\"", VD_ID_PROP);
                return;
        }
        PR0("Adding vdisk ID %lu", id);
        if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP,
            &device_path) != 0) {
                PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
                return;
        }

        if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) {
                PRN("Error getting LDC ID for vdisk %lu", id);
                return;
        }

        if (vds_init_vd(vds, id, device_path, ldc_id) != 0) {
                PRN("Failed to add vdisk ID %lu", id);
                return;
        }
}

static void
vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
{
        uint64_t        id = 0;

        if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
                PRN("Unable to get \"%s\" property from vdisk's MD node",
                    VD_ID_PROP);
                return;
        }
        PR0("Removing vdisk ID %lu", id);
        if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
                PRN("No vdisk entry found for vdisk ID %lu", id);
}

static void
vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node,
    md_t *curr_md, mde_cookie_t curr_vd_node)
{
        char            *curr_dev, *prev_dev;
        uint64_t        curr_id = 0, curr_ldc_id = 0;
        uint64_t        prev_id = 0, prev_ldc_id = 0;
        size_t          len;

        /* Validate that vdisk ID has not changed */
        if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) {
                PRN("Error getting previous vdisk \"%s\" property",
                    VD_ID_PROP);
                return;
        }
        if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) {
                PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP);
                return;
        }
        if (curr_id != prev_id) {
                PRN("Not changing vdisk: ID changed from %lu to %lu",
                    prev_id, curr_id);
                return;
        }

        /* Validate that LDC ID has not changed */
        if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) {
                PRN("Error getting LDC ID for vdisk %lu", prev_id);
                return;
        }
        if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) {
                PRN("Error getting LDC ID for vdisk %lu", curr_id);
                return;
        }
        if (curr_ldc_id != prev_ldc_id) {
                _NOTE(NOTREACHED);      /* lint is confused */
                PRN("Not changing vdisk: "
                    "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id);
                return;
        }

        /* Determine whether device path has changed */
        if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP,
            &prev_dev) != 0) {
                PRN("Error getting previous vdisk \"%s\"",
                    VD_BLOCK_DEVICE_PROP);
                return;
        }
        if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP,
            &curr_dev) != 0) {
                PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
                return;
        }
        if (((len = strlen(curr_dev)) == strlen(prev_dev)) &&
            (strncmp(curr_dev, prev_dev, len) == 0))
                return; /* no relevant (supported) change */

        PR0("Changing vdisk ID %lu", prev_id);

        /* Remove old state, which will close vdisk and reset */
        if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0)
                PRN("No entry found for vdisk ID %lu", prev_id);

        /* Re-initialize vdisk with new state */
        if (vds_init_vd(vds, curr_id, curr_dev, curr_ldc_id) != 0) {
                PRN("Failed to change vdisk ID %lu", curr_id);
                return;
        }
}
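
/*
 * MDEG callback invoked when the set of matching 'virtual-device-port' nodes
 * changes: remove vdisks for deleted nodes, reconfigure vdisks whose nodes
 * changed, and create vdisks for newly added nodes.
 */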
static int
vds_process_md(void *arg, mdeg_result_t *md)
{
        int     i;
        vds_t   *vds = arg;

        if (md == NULL)
                return (MDEG_FAILURE);
        ASSERT(vds != NULL);

        for (i = 0; i < md->removed.nelem; i++)
                vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]);
        for (i = 0; i < md->match_curr.nelem; i++)
                vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i],
                    md->match_curr.mdp, md->match_curr.mdep[i]);
        for (i = 0; i < md->added.nelem; i++)
                vds_add_vd(vds, md->added.mdp, md->added.mdep[i]);

        return (MDEG_SUCCESS);
}

static int
vds_do_attach(dev_info_t *dip)
{
        int                     status, sz;
        int                     cfg_handle;
        minor_t                 instance = ddi_get_instance(dip);
        vds_t                   *vds;
        mdeg_prop_spec_t        *pspecp;
        mdeg_node_spec_t        *ispecp;

        /*
         * The "cfg-handle" property of a vds node in an MD contains the MD's
         * notion of "instance", or unique identifier, for that node; OBP
         * stores the value of the "cfg-handle" MD property as the value of
         * the "reg" property on the node in the device tree it builds from
         * the MD and passes to Solaris.  Thus, we look up the devinfo node's
         * "reg" property value to uniquely identify this device instance when
         * registering with the MD event-generation framework.  If the "reg"
         * property cannot be found, the device tree state is presumably so
         * broken that there is no point in continuing.
         */
        if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
            VD_REG_PROP)) {
                PRN("vds \"%s\" property does not exist", VD_REG_PROP);
                return (DDI_FAILURE);
        }

        /* Get the MD instance for later MDEG registration */
        cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
            VD_REG_PROP, -1);

        if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) {
                PRN("Could not allocate state for instance %u", instance);
                return (DDI_FAILURE);
        }

        if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
                PRN("Could not get state for instance %u", instance);
                ddi_soft_state_free(vds_state, instance);
                return (DDI_FAILURE);
        }

        vds->dip = dip;
        vds->vd_table = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS,
            vds_destroy_vd, sizeof (void *));

        ASSERT(vds->vd_table != NULL);

        if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
                PRN("ldi_ident_from_dip() returned errno %d", status);
                return (DDI_FAILURE);
        }
        vds->initialized |= VDS_LDI;

        /* Register for MD updates */
        sz = sizeof (vds_prop_template);
        pspecp = kmem_alloc(sz, KM_SLEEP);
        bcopy(vds_prop_template, pspecp, sz);

        VDS_SET_MDEG_PROP_INST(pspecp, cfg_handle);

        /* Initialize the complete prop spec structure */
        ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
        ispecp->namep = "virtual-device";
        ispecp->specp = pspecp;

        if (mdeg_register(ispecp, &vd_match, vds_process_md, vds,
            &vds->mdeg) != MDEG_SUCCESS) {
                PRN("Unable to register for MD updates");
                kmem_free(ispecp, sizeof (mdeg_node_spec_t));
                kmem_free(pspecp, sz);
                return (DDI_FAILURE);
        }

        vds->ispecp = ispecp;
        vds->initialized |= VDS_MDEG;

        /* Prevent auto-detaching so driver is available whenever MD changes */
        if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
            DDI_PROP_SUCCESS) {
                PRN("failed to set \"%s\" property for instance %u",
                    DDI_NO_AUTODETACH, instance);
        }

        ddi_report_dev(dip);
        return (DDI_SUCCESS);
}

static int
vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        int     status;

        switch (cmd) {
        case DDI_ATTACH:
                PR0("Attaching");
                if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
                        (void) vds_detach(dip, DDI_DETACH);
                return (status);
        case DDI_RESUME:
                PR0("No action required for DDI_RESUME");
                return (DDI_SUCCESS);
        default:
                return (DDI_FAILURE);
        }
}
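
/*
 * DDI and loadable-module linkage.  The server exports no cb_ops entry
 * points; client I/O is received over LDC channels set up for the vdisks
 * registered through the MDEG callback above.
 */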
static struct dev_ops vds_ops = {
        DEVO_REV,       /* devo_rev */
        0,              /* devo_refcnt */
        ddi_no_info,    /* devo_getinfo */
        nulldev,        /* devo_identify */
        nulldev,        /* devo_probe */
        vds_attach,     /* devo_attach */
        vds_detach,     /* devo_detach */
        nodev,          /* devo_reset */
        NULL,           /* devo_cb_ops */
        NULL,           /* devo_bus_ops */
        nulldev         /* devo_power */
};

static struct modldrv modldrv = {
        &mod_driverops,
        "virtual disk server v%I%",
        &vds_ops,
};

static struct modlinkage modlinkage = {
        MODREV_1,
        &modldrv,
        NULL
};

int
_init(void)
{
        int     i, status;

        if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
                return (status);
        if ((status = mod_install(&modlinkage)) != 0) {
                ddi_soft_state_fini(&vds_state);
                return (status);
        }

        /* Fill in the bit-mask of server-supported operations */
        for (i = 0; i < vds_noperations; i++)
                vds_operations |= 1 << (vds_operation[i].operation - 1);

        return (0);
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
        int     status;

        if ((status = mod_remove(&modlinkage)) != 0)
                return (status);
        ddi_soft_state_fini(&vds_state);
        return (0);
}