/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Virtual disk server
 */


#include <sys/types.h>
#include <sys/conf.h>
#include <sys/crc32.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/file.h>
#include <sys/fs/hsfs_isospec.h>
#include <sys/mdeg.h>
#include <sys/modhash.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/sysmacros.h>
#include <sys/vio_common.h>
#include <sys/vio_util.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdsk_common.h>
#include <sys/vtoc.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/scsi/impl/uscsi.h>
#include <vm/seg_map.h>

/* Virtual disk server initialization flags */
#define	VDS_LDI			0x01
#define	VDS_MDEG		0x02

/* Virtual disk server tunable parameters */
#define	VDS_RETRIES		5
#define	VDS_LDC_DELAY		1000	/* 1 msecs */
#define	VDS_DEV_DELAY		10000000	/* 10 secs */
#define	VDS_NCHAINS		32

/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
#define	VDS_NAME		"virtual-disk-server"

#define	VD_NAME			"vd"
#define	VD_VOLUME_NAME		"vdisk"
#define	VD_ASCIILABEL		"Virtual Disk"

#define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
#define	VD_ID_PROP		"id"
#define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
#define	VD_BLOCK_DEVICE_OPTS	"vds-block-device-opts"
#define	VD_REG_PROP		"reg"

/* Virtual disk initialization flags */
#define	VD_DISK_READY		0x01
#define	VD_LOCKING		0x02
#define	VD_LDC			0x04
#define	VD_DRING		0x08
#define	VD_SID			0x10
#define	VD_SEQ_NUM		0x20
#define	VD_SETUP_ERROR		0x40

/* Flags for writing to a vdisk which is a file */
#define	VD_FILE_WRITE_FLAGS	SM_ASYNC

/* Number of backup labels */
#define	VD_FILE_NUM_BACKUP	5

/* Timeout for SCSI I/O */
#define	VD_SCSI_RDWR_TIMEOUT	30	/* 30 secs */

/*
 * By Solaris convention, slice/partition 2 represents the entire disk;
 * unfortunately, this convention does not appear to be codified.
 */
#define	VD_ENTIRE_DISK_SLICE	2

/* Return a cpp token as a string */
#define	STRINGIZE(token)	#token

/*
 * Print a message prefixed with the current function name to the message log
 * (and optionally to the console for verbose boots); these macros use cpp's
 * concatenation of string literals and C99 variable-length-argument-list
 * macros
 */
#define	PRN(...)	_PRN("?%s(): "__VA_ARGS__, "")
#define	_PRN(format, ...)	\
	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)
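
/*
 * For example (illustrative call), PRN("no such device: %s", name)
 * expands to
 *	cmn_err(CE_CONT, "?%s(): " "no such device: %s" "%s",
 *	    __func__, name, "");
 * i.e. the message is logged prefixed with the calling function's
 * name; the trailing empty string both satisfies the final "%s" and
 * keeps __VA_ARGS__ non-empty when PRN() is passed a lone format
 * string.
 */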

/* Return a pointer to the "i"th vdisk dring element */
#define	VD_DRING_ELEM(i)	((vd_dring_entry_t *)(void *)	\
	    (vd->dring + (i)*vd->descriptor_size))

/* Return the virtual disk client's type as a string (for use in messages) */
#define	VD_CLIENT(vd)							\
	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
	    (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" :	\
		(((vd)->xfer_mode == 0) ? "null client" :		\
		    "unsupported client")))

/* Read disk label from a disk on file */
#define	VD_FILE_LABEL_READ(vd, labelp) \
	vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/* Write disk label to a disk on file */
#define	VD_FILE_LABEL_WRITE(vd, labelp)	\
	vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vds instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t	vds_prop_template[] = {
	{ MDET_PROP_STR,	"name",		VDS_NAME },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};

#define	VDS_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t	vd_prop_match[] = {
	{ MDET_PROP_VAL,	VD_ID_PROP },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vd_match = {"virtual-device-port",
				    vd_prop_match};

/*
 * Options for the VD_BLOCK_DEVICE_OPTS property.
 */
#define	VD_OPT_RDONLY		0x1	/* read-only */
#define	VD_OPT_SLICE		0x2	/* single slice */
#define	VD_OPT_EXCLUSIVE	0x4	/* exclusive access */

#define	VD_OPTION_NLEN	128

typedef struct vd_option {
	char vdo_name[VD_OPTION_NLEN];
	uint64_t vdo_value;
} vd_option_t;

vd_option_t vd_bdev_options[] = {
	{ "ro",		VD_OPT_RDONLY },
	{ "slice",	VD_OPT_SLICE },
	{ "excl",	VD_OPT_EXCLUSIVE }
};

/* Debugging macros */
#ifdef DEBUG

static int	vd_msglevel = 0;

#define	PR0 if (vd_msglevel > 0)	PRN
#define	PR1 if (vd_msglevel > 1)	PRN
#define	PR2 if (vd_msglevel > 2)	PRN

#define	VD_DUMP_DRING_ELEM(elem)					\
	PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
	    elem->hdr.dstate,						\
	    elem->payload.operation,					\
	    elem->payload.status,					\
	    elem->payload.nbytes,					\
	    elem->payload.addr,						\
	    elem->payload.ncookies);

char *
vd_decode_state(int state)
{
	char *str;

#define	CASE_STATE(_s)	case _s: str = #_s; break;

	switch (state) {
	CASE_STATE(VD_STATE_INIT)
	CASE_STATE(VD_STATE_VER)
	CASE_STATE(VD_STATE_ATTR)
	CASE_STATE(VD_STATE_DRING)
	CASE_STATE(VD_STATE_RDX)
	CASE_STATE(VD_STATE_DATA)
	default: str = "unknown"; break;
	}

#undef CASE_STATE

	return (str);
}

void
vd_decode_tag(vio_msg_t *msg)
{
	char *tstr, *sstr, *estr;

#define	CASE_TYPE(_s)	case _s: tstr = #_s; break;

	switch (msg->tag.vio_msgtype) {
	CASE_TYPE(VIO_TYPE_CTRL)
	CASE_TYPE(VIO_TYPE_DATA)
	CASE_TYPE(VIO_TYPE_ERR)
	default: tstr = "unknown"; break;
	}

#undef CASE_TYPE

#define	CASE_SUBTYPE(_s) case _s: sstr = #_s; break;

	switch (msg->tag.vio_subtype) {
	CASE_SUBTYPE(VIO_SUBTYPE_INFO)
	CASE_SUBTYPE(VIO_SUBTYPE_ACK)
	CASE_SUBTYPE(VIO_SUBTYPE_NACK)
	default: sstr = "unknown"; break;
	}

#undef CASE_SUBTYPE

#define	CASE_ENV(_s)	case _s: estr = #_s; break;

	switch (msg->tag.vio_subtype_env) {
	CASE_ENV(VIO_VER_INFO)
	CASE_ENV(VIO_ATTR_INFO)
	CASE_ENV(VIO_DRING_REG)
	CASE_ENV(VIO_DRING_UNREG)
	CASE_ENV(VIO_RDX)
	CASE_ENV(VIO_PKT_DATA)
	CASE_ENV(VIO_DESC_DATA)
	CASE_ENV(VIO_DRING_DATA)
	default: estr = "unknown"; break;
	}

#undef CASE_ENV

	PR1("(%x/%x/%x) message : (%s/%s/%s)",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, tstr, sstr, estr);
}

#else	/* !DEBUG */

#define	PR0(...)
#define	PR1(...)
#define	PR2(...)

#define	VD_DUMP_DRING_ELEM(elem)

#define	vd_decode_state(_s)	(NULL)
#define	vd_decode_tag(_s)	(NULL)

#endif	/* DEBUG */


/*
 * Soft state structure for a vds instance
 */
typedef struct vds {
	uint_t		initialized;	/* driver inst initialization flags */
	dev_info_t	*dip;		/* driver inst devinfo pointer */
	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
	mod_hash_t	*vd_table;	/* table of virtual disks served */
	mdeg_node_spec_t *ispecp;	/* mdeg node specification */
	mdeg_handle_t	mdeg;		/* handle for MDEG operations */
} vds_t;

/*
 * Types of descriptor-processing tasks
 */
typedef enum vd_task_type {
	VD_NONFINAL_RANGE_TASK,	/* task for intermediate descriptor in range */
	VD_FINAL_RANGE_TASK,	/* task for last in a range of descriptors */
} vd_task_type_t;

/*
 * Structure describing the task for processing a descriptor
 */
typedef struct vd_task {
	struct vd		*vd;		/* vd instance task is for */
	vd_task_type_t		type;		/* type of descriptor task */
	int			index;		/* dring elem index for task */
	vio_msg_t		*msg;		/* VIO message task is for */
	size_t			msglen;		/* length of message content */
	vd_dring_payload_t	*request;	/* request task will perform */
	struct buf		buf;		/* buf(9s) for I/O request */
	ldc_mem_handle_t	mhdl;		/* task memory handle */
	int			status;		/* status of processing task */
	int	(*completef)(struct vd_task *task); /* completion func ptr */
} vd_task_t;

/*
 * Soft state structure for a virtual disk instance
 */
typedef struct vd {
	uint_t		initialized;	/* vdisk initialization flags */
	uint64_t	operations;	/* bitmask of VD_OPs exported */
	vio_ver_t	version;	/* ver negotiated with client */
	vds_t		*vds;		/* server for this vdisk */
	ddi_taskq_t	*startq;	/* queue for I/O start tasks */
	ddi_taskq_t	*completionq;	/* queue for completion tasks */
	ldi_handle_t	ldi_handle[V_NUMPAR];	/* LDI slice handles */
	char		device_path[MAXPATHLEN + 1]; /* vdisk device */
	dev_t		dev[V_NUMPAR];	/* dev numbers for slices */
	int		open_flags;	/* open flags */
	uint_t		nslices;	/* number of slices */
	size_t		vdisk_size;	/* number of blocks in vdisk */
	size_t		vdisk_block_size; /* size of each vdisk block */
	vd_disk_type_t	vdisk_type;	/* slice or entire disk */
	vd_disk_label_t	vdisk_label;	/* EFI or VTOC label */
	vd_media_t	vdisk_media;	/* media type of backing dev. */
	boolean_t	is_atapi_dev;	/* Is this an IDE CD-ROM dev? */
	ushort_t	max_xfer_sz;	/* max xfer size in DEV_BSIZE */
	size_t		block_size;	/* blk size of actual device */
	boolean_t	pseudo;		/* underlying pseudo dev */
	boolean_t	file;		/* is vDisk backed by a file? */
	vnode_t		*file_vnode;	/* file vnode */
	size_t		file_size;	/* file size */
	ddi_devid_t	file_devid;	/* devid for disk image */
	struct dk_efi	dk_efi;		/* synthetic for slice type */
	struct dk_geom	dk_geom;	/* synthetic for slice type */
	struct dk_minfo	dk_minfo;	/* synthetic for slice type */
	struct vtoc	vtoc;		/* synthetic for slice type */
	ldc_status_t	ldc_state;	/* LDC connection state */
	ldc_handle_t	ldc_handle;	/* handle for LDC comm */
	size_t		max_msglen;	/* largest LDC message len */
	vd_state_t	state;		/* client handshake state */
	uint8_t		xfer_mode;	/* transfer mode with client */
	uint32_t	sid;		/* client's session ID */
	uint64_t	seq_num;	/* message sequence number */
	uint64_t	dring_ident;	/* identifier of dring */
	ldc_dring_handle_t dring_handle; /* handle for dring ops */
	uint32_t	descriptor_size; /* num bytes in desc */
	uint32_t	dring_len;	/* number of dring elements */
	caddr_t		dring;		/* address of dring */
	caddr_t		vio_msgp;	/* vio msg staging buffer */
	vd_task_t	inband_task;	/* task for inband descriptor */
	vd_task_t	*dring_task;	/* tasks for dring elements */

	kmutex_t	lock;		/* protects variables below */
	boolean_t	enabled;	/* is vdisk enabled? */
	boolean_t	reset_state;	/* reset connection state? */
	boolean_t	reset_ldc;	/* reset LDC channel? */
} vd_t;

typedef struct vds_operation {
	char	*namep;
	uint8_t	operation;
	int	(*start)(vd_task_t *task);
	int	(*complete)(vd_task_t *task);
} vds_operation_t;

typedef struct vd_ioctl {
	uint8_t		operation;	/* vdisk operation */
	const char	*operation_name; /* vdisk operation name */
	size_t		nbytes;		/* size of operation buffer */
	int		cmd;		/* corresponding ioctl cmd */
	const char	*cmd_name;	/* ioctl cmd name */
	void		*arg;		/* ioctl cmd argument */
	/* convert input vd_buf to output ioctl_arg */
	void		(*copyin)(void *vd_buf, void *ioctl_arg);
	/* convert input ioctl_arg to output vd_buf */
	void		(*copyout)(void *ioctl_arg, void *vd_buf);
	/* write is true if the operation writes any data to the backend */
	boolean_t	write;
} vd_ioctl_t;

/* Define trivial copyin/copyout conversion function flag */
#define	VD_IDENTITY	((void (*)(void *, void *))-1)


static int	vds_ldc_retries = VDS_RETRIES;
static int	vds_ldc_delay = VDS_LDC_DELAY;
static int	vds_dev_retries = VDS_RETRIES;
static int	vds_dev_delay = VDS_DEV_DELAY;
static void	*vds_state;

static uint_t	vd_file_write_flags = VD_FILE_WRITE_FLAGS;

static short	vd_scsi_rdwr_timeout = VD_SCSI_RDWR_TIMEOUT;

/*
 * Supported protocol version pairs, from highest (newest) to lowest (oldest)
 *
 * Each supported major version should appear only once, paired with (and only
 * with) its highest supported minor version number (as the protocol requires
 * supporting all lower minor version numbers as well)
 */
static const vio_ver_t	vds_version[] = {{1, 1}};
static const size_t	vds_num_versions =
    sizeof (vds_version)/sizeof (vds_version[0]);

static void vd_free_dring_task(vd_t *vdp);
static int vd_setup_vd(vd_t *vd);
static int vd_setup_single_slice_disk(vd_t *vd);
static boolean_t vd_enabled(vd_t *vd);
static ushort_t vd_lbl2cksum(struct dk_label *label);
static int vd_file_validate_geometry(vd_t *vd);
static boolean_t vd_file_is_iso_image(vd_t *vd);
static void vd_set_exported_operations(vd_t *vd);

/*
 * Function:
 *	vd_file_rw
 *
 * Description:
 *	Read or write to a disk on file.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	slice		- slice on which the operation is performed,
 *			  VD_SLICE_NONE indicates that the operation
 *			  is done using an absolute disk offset.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer where data are read to or written from.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	n >= 0		- success, n indicates the number of bytes read
 *			  or written.
 *	-1		- error.
 */
static ssize_t
vd_file_rw(vd_t *vd, int slice, int operation, caddr_t data, size_t blk,
    size_t len)
{
	caddr_t	maddr;
	size_t offset, maxlen, moffset, mlen, n;
	uint_t smflags;
	enum seg_rw srw;

	ASSERT(vd->file);
	ASSERT(len > 0);

	/*
	 * If a file is exported as a slice then we don't care about the vtoc.
	 * In that case, the vtoc is a fake mainly to make newfs happy and we
	 * handle any I/O as a raw disk access so that we can have access to
	 * the entire backend.
	 */
	if (vd->vdisk_type == VD_DISK_TYPE_SLICE || slice == VD_SLICE_NONE) {
		/* raw disk access */
		offset = blk * DEV_BSIZE;
	} else {
		ASSERT(slice >= 0 && slice < V_NUMPAR);

		/*
		 * v1.0 vDisk clients depended on the server not verifying
		 * the label of an unformatted disk. This "feature" is
		 * maintained for backward compatibility but all versions
		 * from v1.1 onwards must do the right thing.
		 */
		if (vd->vdisk_label == VD_DISK_LABEL_UNK &&
		    vio_ver_is_supported(vd->version, 1, 1) &&
		    vd_file_validate_geometry(vd) != 0) {
			PR0("Unknown disk label, can't do I/O from slice %d",
			    slice);
			return (-1);
		}

		if (blk >= vd->vtoc.v_part[slice].p_size) {
			/* address past the end of the slice */
			PR0("req_addr (0x%lx) > psize (0x%lx)",
			    blk, vd->vtoc.v_part[slice].p_size);
			return (0);
		}

		offset = (vd->vtoc.v_part[slice].p_start + blk) * DEV_BSIZE;

		/*
		 * If the requested size is greater than the size
		 * of the partition, truncate the read/write.
		 */
		maxlen = (vd->vtoc.v_part[slice].p_size - blk) * DEV_BSIZE;

		if (len > maxlen) {
			PR0("I/O size truncated to %lu bytes from %lu bytes",
			    maxlen, len);
			len = maxlen;
		}
	}

	/*
	 * We have to ensure that we are reading/writing into the mmap
	 * range. If we have a partial disk image (e.g. an image of
	 * s0 instead of s2) the system can try to access slices that
	 * are not included in the disk image.
	 */
	if ((offset + len) >= vd->file_size) {
		PR0("offset + nbytes (0x%lx + 0x%lx) >= "
		    "file_size (0x%lx)", offset, len, vd->file_size);
		return (-1);
	}

	srw = (operation == VD_OP_BREAD)? S_READ : S_WRITE;
	smflags = (operation == VD_OP_BREAD)? 0 :
	    (SM_WRITE | vd_file_write_flags);
	n = len;
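
	/*
	 * Copy loop: the backing file is mapped and copied one MAXBSIZE
	 * chunk at a time. Illustrative example, assuming MAXBSIZE is
	 * 8K: a 10K request starting at offset 6K is satisfied by two
	 * mappings, 2K from the first 8K-aligned chunk (moffset = 6K)
	 * and then 8K from the following chunk (moffset = 0).
	 */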

	do {
		/*
		 * segmap_getmapflt() returns a MAXBSIZE chunk which is
		 * MAXBSIZE aligned.
		 */
		moffset = offset & MAXBOFFSET;
		mlen = MIN(MAXBSIZE - moffset, n);
		maddr = segmap_getmapflt(segkmap, vd->file_vnode, offset,
		    mlen, 1, srw);
		/*
		 * Fault in the pages so we can check for error and ensure
		 * that we can safely use the mapped address.
		 */
		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
		    F_SOFTLOCK, srw) != 0) {
			(void) segmap_release(segkmap, maddr, 0);
			return (-1);
		}

		if (operation == VD_OP_BREAD)
			bcopy(maddr + moffset, data, mlen);
		else
			bcopy(data, maddr + moffset, mlen);

		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
		    F_SOFTUNLOCK, srw) != 0) {
			(void) segmap_release(segkmap, maddr, 0);
			return (-1);
		}
		if (segmap_release(segkmap, maddr, smflags) != 0)
			return (-1);
		n -= mlen;
		offset += mlen;
		data += mlen;

	} while (n > 0);

	return (len);
}

/*
 * Function:
 *	vd_file_build_default_label
 *
 * Description:
 *	Return a default label for the given disk. This is used when the disk
 *	does not have a valid VTOC so that the user can get a valid default
 *	configuration. The default label has all slice sizes set to 0 (except
 *	slice 2 which is the entire disk) to force the user to write a valid
 *	label onto the disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	label		- the returned default label.
 *
 * Return Code:
 *	none.
 */
static void
vd_file_build_default_label(vd_t *vd, struct dk_label *label)
{
	size_t size;
	char prefix;
	int slice, nparts;
	uint16_t tag;

	ASSERT(vd->file);

	/*
	 * We must have a reasonable number of cylinders and sectors so
	 * that newfs can run using default values.
	 *
	 * if (disk_size < 2MB)
	 *	phys_cylinders = disk_size / 100K
	 * else
	 *	phys_cylinders = disk_size / 300K
	 *
	 * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders
	 * alt_cylinders = (phys_cylinders > 2) ? 2 : 0;
	 * data_cylinders = phys_cylinders - alt_cylinders
	 *
	 * sectors = disk_size / (phys_cylinders * blk_size)
	 *
	 * The file size test is an attempt to not have too few cylinders
	 * for a small file, or so many on a big file that you waste space
	 * for backup superblocks or cylinder group structures.
	 */
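	/*
	 * Worked example (illustrative numbers): for a 100 MB disk
	 * image, phys_cylinders = 100 MB / 300K = 341, alt_cylinders
	 * = 2 and data_cylinders = 339; with the single head set up
	 * below, sectors = 100 MB / (341 * 512) = 600.
	 */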
	if (vd->file_size < (2 * 1024 * 1024))
		label->dkl_pcyl = vd->file_size / (100 * 1024);
	else
		label->dkl_pcyl = vd->file_size / (300 * 1024);

	if (label->dkl_pcyl == 0)
		label->dkl_pcyl = 1;

	label->dkl_acyl = 0;

	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
		nparts = 1;
		slice = 0;
		tag = V_UNASSIGNED;
	} else {
		if (label->dkl_pcyl > 2)
			label->dkl_acyl = 2;
		nparts = V_NUMPAR;
		slice = VD_ENTIRE_DISK_SLICE;
		tag = V_BACKUP;
	}

	label->dkl_nsect = vd->file_size /
	    (DEV_BSIZE * label->dkl_pcyl);
	label->dkl_ncyl = label->dkl_pcyl - label->dkl_acyl;
	label->dkl_nhead = 1;
	label->dkl_write_reinstruct = 0;
	label->dkl_read_reinstruct = 0;
	label->dkl_rpm = 7200;
	label->dkl_apc = 0;
	label->dkl_intrlv = 0;

	PR0("requested disk size: %ld bytes\n", vd->file_size);
	PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label->dkl_pcyl,
	    label->dkl_nhead, label->dkl_nsect);
	PR0("provided disk size: %ld bytes\n", (uint64_t)
	    (label->dkl_pcyl * label->dkl_nhead *
	    label->dkl_nsect * DEV_BSIZE));

	if (vd->file_size < (1ULL << 20)) {
		size = vd->file_size >> 10;
		prefix = 'K'; /* Kilobyte */
	} else if (vd->file_size < (1ULL << 30)) {
		size = vd->file_size >> 20;
		prefix = 'M'; /* Megabyte */
	} else if (vd->file_size < (1ULL << 40)) {
		size = vd->file_size >> 30;
		prefix = 'G'; /* Gigabyte */
	} else {
		size = vd->file_size >> 40;
		prefix = 'T'; /* Terabyte */
	}

	/*
	 * We must have a correct label name otherwise format(1m) will
	 * not recognize the disk as labeled.
	 */
	(void) snprintf(label->dkl_asciilabel, LEN_DKL_ASCII,
	    "SUN-DiskImage-%ld%cB cyl %d alt %d hd %d sec %d",
	    size, prefix,
	    label->dkl_ncyl, label->dkl_acyl, label->dkl_nhead,
	    label->dkl_nsect);

	/* default VTOC */
	label->dkl_vtoc.v_version = V_VERSION;
	label->dkl_vtoc.v_nparts = nparts;
	label->dkl_vtoc.v_sanity = VTOC_SANE;
	label->dkl_vtoc.v_part[slice].p_tag = tag;
	label->dkl_map[slice].dkl_cylno = 0;
	label->dkl_map[slice].dkl_nblk = label->dkl_ncyl *
	    label->dkl_nhead * label->dkl_nsect;
	label->dkl_cksum = vd_lbl2cksum(label);
}

/*
 * Function:
 *	vd_file_set_vtoc
 *
 * Description:
 *	Set the vtoc of a disk image by writing the label and backup
 *	labels into the disk image backend.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	label		- the data to be written.
 *
 * Return Code:
 *	0		- success.
 *	n > 0		- error, n indicates the errno code.
 */
static int
vd_file_set_vtoc(vd_t *vd, struct dk_label *label)
{
	int blk, sec, cyl, head, cnt;

	ASSERT(vd->file);

	if (VD_FILE_LABEL_WRITE(vd, label) < 0) {
		PR0("failed to write disk label");
		return (EIO);
	}

	/*
	 * Backup labels are on the last alternate cylinder's
	 * first five odd sectors.
	 */
	if (label->dkl_acyl == 0) {
		PR0("no alternate cylinder, cannot store backup labels");
		return (0);
	}

	cyl = label->dkl_ncyl + label->dkl_acyl - 1;
	head = label->dkl_nhead - 1;

	blk = (cyl * ((label->dkl_nhead * label->dkl_nsect) - label->dkl_apc)) +
	    (head * label->dkl_nsect);

	/*
	 * Write the backup labels. Make sure we don't try to write past
	 * the last cylinder.
	 */
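	/*
	 * The loop below places the backup labels in the odd sectors
	 * (1, 3, 5, 7, 9) of the first track of the last alternate
	 * cylinder, matching the conventional VTOC backup label
	 * locations.
	 */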
	sec = 1;

	for (cnt = 0; cnt < VD_FILE_NUM_BACKUP; cnt++) {

		if (sec >= label->dkl_nsect) {
			PR0("not enough sectors to store all backup labels");
			return (0);
		}

		if (vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)label,
		    blk + sec, sizeof (struct dk_label)) < 0) {
			PR0("error writing backup label at block %d\n",
			    blk + sec);
			return (EIO);
		}

		PR1("wrote backup label at block %d\n", blk + sec);

		sec += 2;
	}

	return (0);
}

/*
 * Function:
 *	vd_file_get_devid_block
 *
 * Description:
 *	Return the block number where the device id is stored.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	blkp		- pointer to the block number
 *
 * Return Code:
 *	0		- success
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_get_devid_block(vd_t *vd, size_t *blkp)
{
	diskaddr_t spc, head, cyl;

	ASSERT(vd->file);
	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);

	/* this geometry doesn't allow us to have a devid */
	if (vd->dk_geom.dkg_acyl < 2) {
		PR0("not enough alternate cylinders available for devid "
		    "(acyl=%u)", vd->dk_geom.dkg_acyl);
		return (ENOSPC);
	}

	/* the devid is on the track next to the last cylinder */
	cyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl - 2;
	spc = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
	head = vd->dk_geom.dkg_nhead - 1;

	*blkp = (cyl * (spc - vd->dk_geom.dkg_apc)) +
	    (head * vd->dk_geom.dkg_nsect) + 1;

	return (0);
}

/*
 * Return the checksum of a disk block containing an on-disk devid.
 */
static uint_t
vd_dkdevid2cksum(struct dk_devid *dkdevid)
{
	uint_t chksum, *ip;
	int i;

	chksum = 0;
	ip = (uint_t *)dkdevid;
	for (i = 0; i < ((DEV_BSIZE - sizeof (int)) / sizeof (int)); i++)
		chksum ^= ip[i];

	return (chksum);
}
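
/*
 * Note that the XOR above covers every 32-bit word of the DEV_BSIZE
 * block except the last one, which is where the checksum itself is
 * stored (see DKD_GETCHKSUM() and DKD_FORMCHKSUM() used below).
 */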

/*
 * Function:
 *	vd_file_read_devid
 *
 * Description:
 *	Read the device id stored on a disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	devid		- the return address of the device ID.
 *
 * Return Code:
 *	0		- success
 *	EIO		- I/O error while trying to access the disk image
 *	EINVAL		- no valid device id was found
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_read_devid(vd_t *vd, ddi_devid_t *devid)
{
	struct dk_devid *dkdevid;
	size_t blk;
	uint_t chksum;
	int status, sz;

	if ((status = vd_file_get_devid_block(vd, &blk)) != 0)
		return (status);

	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);

	/* get the devid */
	if ((vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)dkdevid, blk,
	    DEV_BSIZE)) < 0) {
		PR0("error reading devid block at %lu", blk);
		status = EIO;
		goto done;
	}

	/* validate the revision */
	if ((dkdevid->dkd_rev_hi != DK_DEVID_REV_MSB) ||
	    (dkdevid->dkd_rev_lo != DK_DEVID_REV_LSB)) {
		PR0("invalid devid found at block %lu (bad revision)", blk);
		status = EINVAL;
		goto done;
	}

	/* compute checksum */
	chksum = vd_dkdevid2cksum(dkdevid);

	/* compare the checksums */
	if (DKD_GETCHKSUM(dkdevid) != chksum) {
		PR0("invalid devid found at block %lu (bad checksum)", blk);
		status = EINVAL;
		goto done;
	}

	/* validate the device id */
	if (ddi_devid_valid((ddi_devid_t)&dkdevid->dkd_devid) != DDI_SUCCESS) {
		PR0("invalid devid found at block %lu", blk);
		status = EINVAL;
		goto done;
	}

	PR1("devid read at block %lu", blk);

	sz = ddi_devid_sizeof((ddi_devid_t)&dkdevid->dkd_devid);
	*devid = kmem_alloc(sz, KM_SLEEP);
	bcopy(&dkdevid->dkd_devid, *devid, sz);

done:
	kmem_free(dkdevid, DEV_BSIZE);
	return (status);

}

/*
 * Function:
 *	vd_file_write_devid
 *
 * Description:
 *	Write a device id into a disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	devid		- the device ID to store.
 *
 * Return Code:
 *	0		- success
 *	EIO		- I/O error while trying to access the disk image
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_write_devid(vd_t *vd, ddi_devid_t devid)
{
	struct dk_devid *dkdevid;
	uint_t chksum;
	size_t blk;
	int status;

	if ((status = vd_file_get_devid_block(vd, &blk)) != 0)
		return (status);

	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);

	/* set revision */
	dkdevid->dkd_rev_hi = DK_DEVID_REV_MSB;
	dkdevid->dkd_rev_lo = DK_DEVID_REV_LSB;

	/* copy devid */
	bcopy(devid, &dkdevid->dkd_devid, ddi_devid_sizeof(devid));

	/* compute checksum */
	chksum = vd_dkdevid2cksum(dkdevid);

	/* set checksum */
	DKD_FORMCHKSUM(chksum, dkdevid);

	/* store the devid */
	if ((status = vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
	    (caddr_t)dkdevid, blk, DEV_BSIZE)) < 0) {
		PR0("Error writing devid block at %lu", blk);
		status = EIO;
	} else {
		PR1("devid written at block %lu", blk);
		status = 0;
	}

	kmem_free(dkdevid, DEV_BSIZE);
	return (status);
}
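
/*
 * For reference (layout as defined by struct dk_devid): the devid
 * block read and written above is a single DEV_BSIZE sector holding a
 * two-byte revision, the packed ddi_devid_t payload, and a 32-bit XOR
 * checksum kept in the last four bytes of the sector.
 */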

/*
 * Function:
 *	vd_do_scsi_rdwr
 *
 * Description:
 *	Read or write to a SCSI disk using an absolute disk offset.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer where data are read to or written from.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	0		- success
 *	n != 0		- error.
 */
static int
vd_do_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t blk, size_t len)
{
	struct uscsi_cmd ucmd;
	union scsi_cdb cdb;
	int nsectors, nblk;
	int max_sectors;
	int status, rval;

	ASSERT(!vd->file);
	ASSERT(vd->vdisk_block_size > 0);

	max_sectors = vd->max_xfer_sz;
	nblk = (len / vd->vdisk_block_size);

	if (len % vd->vdisk_block_size != 0)
		return (EINVAL);

	/*
	 * Build and execute the uscsi ioctl. We build a group0, group1
	 * or group4 command as necessary, since some targets
	 * do not support group1 commands.
	 */
	while (nblk) {

		bzero(&ucmd, sizeof (ucmd));
		bzero(&cdb, sizeof (cdb));

		nsectors = (max_sectors < nblk) ? max_sectors : nblk;

		/*
		 * Some of the optical drives on sun4v machines are ATAPI
		 * devices which use Group 1 Read/Write commands so we need
		 * to explicitly check a flag which is set when a domain
		 * is bound.
		 */
		if (blk < (2 << 20) && nsectors <= 0xff && !vd->is_atapi_dev) {
			FORMG0ADDR(&cdb, blk);
			FORMG0COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP0;
		} else if (blk > 0xffffffff) {
			FORMG4LONGADDR(&cdb, blk);
			FORMG4COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP4;
			cdb.scc_cmd |= SCMD_GROUP4;
		} else {
			FORMG1ADDR(&cdb, blk);
			FORMG1COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP1;
			cdb.scc_cmd |= SCMD_GROUP1;
		}
		ucmd.uscsi_cdb = (caddr_t)&cdb;
		ucmd.uscsi_bufaddr = data;
		ucmd.uscsi_buflen = nsectors * vd->block_size;
		ucmd.uscsi_timeout = vd_scsi_rdwr_timeout;
		/*
		 * Set flags so that the command is isolated from normal
		 * commands and no error message is printed.
		 */
		ucmd.uscsi_flags = USCSI_ISOLATE | USCSI_SILENT;

		if (operation == VD_OP_BREAD) {
			cdb.scc_cmd |= SCMD_READ;
			ucmd.uscsi_flags |= USCSI_READ;
		} else {
			cdb.scc_cmd |= SCMD_WRITE;
		}

		status = ldi_ioctl(vd->ldi_handle[VD_ENTIRE_DISK_SLICE],
		    USCSICMD, (intptr_t)&ucmd, (vd->open_flags | FKIOCTL),
		    kcred, &rval);

		if (status == 0)
			status = ucmd.uscsi_status;

		if (status != 0)
			break;

		/*
		 * Check if partial DMA breakup is required. If so, reduce
		 * the request size by half and retry the last request.
		 */
		if (ucmd.uscsi_resid == ucmd.uscsi_buflen) {
			max_sectors >>= 1;
			if (max_sectors <= 0) {
				status = EIO;
				break;
			}
			continue;
		}

		if (ucmd.uscsi_resid != 0) {
			status = EIO;
			break;
		}

		blk += nsectors;
		nblk -= nsectors;
		data += nsectors * vd->vdisk_block_size; /* SECSIZE */
	}

	return (status);
}
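
/*
 * For reference, the CDB group selection above follows the SCSI
 * command set: Group 0 (6-byte) commands address only 21 bits of LBA
 * (hence the blk < (2 << 20) test) and at most 0xff blocks per
 * transfer, while Group 4 (16-byte) commands are needed once the LBA
 * no longer fits in the 32-bit address field of a Group 1 (10-byte)
 * command.
 */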

/*
 * Function:
 *	vd_scsi_rdwr
 *
 * Description:
 *	Wrapper function to read or write to a SCSI disk using an absolute
 *	disk offset. It checks the blocksize of the underlying device and,
 *	if necessary, adjusts the buffers accordingly before calling
 *	vd_do_scsi_rdwr() to do the actual read or write.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer where data are read to or written from.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	0		- success
 *	n != 0		- error.
 */
static int
vd_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t vblk, size_t vlen)
{
	int rv;

	size_t pblk;	/* physical device block number of data on device */
	size_t delta;	/* relative offset between pblk and vblk */
	size_t pnblk;	/* number of physical blocks to be read from device */
	size_t plen;	/* length of data to be read from physical device */
	char *buf;	/* buffer area to fit physical device's block size */

	/*
	 * If the vdisk block size and the block size of the underlying device
	 * match we can skip straight to vd_do_scsi_rdwr(), otherwise we need
	 * to create a buffer large enough to handle the device's block size
	 * and adjust the block to be read from and the amount of data to
	 * read to correspond with the device's block size.
	 */
	if (vd->vdisk_block_size == vd->block_size)
		return (vd_do_scsi_rdwr(vd, operation, data, vblk, vlen));

	if (vd->vdisk_block_size > vd->block_size)
		return (EINVAL);

	/*
	 * Writing of physical block sizes larger than the virtual block size
	 * is not supported. This would be added if/when support for guests
	 * writing to DVDs is implemented.
	 */
	if (operation == VD_OP_BWRITE)
		return (ENOTSUP);

	/* BEGIN CSTYLED */
	/*
	 * Below is a diagram showing the relationship between the physical
	 * and virtual blocks. If the virtual blocks marked by 'X' below are
	 * requested, then the physical blocks denoted by 'Y' are read.
	 *
	 *           vblk
	 *             |      vlen
	 *             |<--------------->|
	 *             v                 v
	 *  --+--+--+--+--+--+--+--+--+--+--+--+--+--+--+-   virtual disk:
	 *    |  |  |  |XX|XX|XX|XX|XX|XX|  |  |  |  |  |  } block size is
	 *  --+--+--+--+--+--+--+--+--+--+--+--+--+--+--+-  vd->vdisk_block_size
	 *          :  :                 :  :
	 *         >:==:< delta          :  :
	 *          :  :                 :  :
	 *  --+-----+-----+-----+-----+-----+-----+-----+--   physical disk:
	 *    |     |YY:YY|YYYYY|YYYYY|YY:YY|     |     |  } block size is
	 *  --+-----+-----+-----+-----+-----+-----+-----+--  vd->block_size
	 *          ^                       ^
	 *          |<--------------------->|
	 *          |          plen
	 *         pblk
	 */
	/* END CSTYLED */
	pblk = (vblk * vd->vdisk_block_size) / vd->block_size;
	delta = (vblk * vd->vdisk_block_size) - (pblk * vd->block_size);
	pnblk = ((delta + vlen - 1) / vd->block_size) + 1;
	plen = pnblk * vd->block_size;

	PR2("vblk %lx:pblk %lx: vlen %ld:plen %ld", vblk, pblk, vlen, plen);

	buf = kmem_zalloc(sizeof (caddr_t) * plen, KM_SLEEP);
	rv = vd_do_scsi_rdwr(vd, operation, (caddr_t)buf, pblk, plen);
	bcopy(buf + delta, data, vlen);

	kmem_free(buf, sizeof (caddr_t) * plen);

	return (rv);
}
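
/*
 * Worked example for the conversion above (illustrative numbers): with
 * vdisk_block_size = 512 and block_size = 2048 (e.g. a CD-ROM), a read
 * of vlen = 1024 bytes at vblk = 5 gives pblk = 2560 / 2048 = 1,
 * delta = 2560 - 2048 = 512, pnblk = ((512 + 1024 - 1) / 2048) + 1 = 1
 * and plen = 2048; the requested bytes are then copied out of the
 * staging buffer starting at buf + delta.
 */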

/*
 * Return Values
 *	EINPROGRESS	- operation was successfully started
 *	EIO		- encountered LDC (aka. task error)
 *	0		- operation completed successfully
 *
 * Side Effect
 *	sets request->status = <disk operation status>
 */
static int
vd_start_bio(vd_task_t *task)
{
	int			rv, status = 0;
	vd_t			*vd = task->vd;
	vd_dring_payload_t	*request = task->request;
	struct buf		*buf = &task->buf;
	uint8_t			mtype;
	int			slice;
	char			*bufaddr = 0;
	size_t			buflen;

	ASSERT(vd != NULL);
	ASSERT(request != NULL);

	slice = request->slice;

	ASSERT(slice == VD_SLICE_NONE || slice < vd->nslices);
	ASSERT((request->operation == VD_OP_BREAD) ||
	    (request->operation == VD_OP_BWRITE));

	if (request->nbytes == 0) {
		/* no service for trivial requests */
		request->status = EINVAL;
		return (0);
	}

	PR1("%s %lu bytes at block %lu",
	    (request->operation == VD_OP_BREAD) ? "Read" : "Write",
	    request->nbytes, request->addr);

	/*
	 * We have to check the open flags because the functions processing
	 * the read/write request will not do it.
	 */
	if (request->operation == VD_OP_BWRITE && !(vd->open_flags & FWRITE)) {
		PR0("write fails because backend is opened read-only");
		request->nbytes = 0;
		request->status = EROFS;
		return (0);
	}

	mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP;

	/* Map memory exported by client */
	status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies,
	    mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R,
	    &bufaddr, NULL);
	if (status != 0) {
		PR0("ldc_mem_map() returned err %d ", status);
		return (EIO);
	}

	buflen = request->nbytes;

	status = ldc_mem_acquire(task->mhdl, 0, buflen);
	if (status != 0) {
		(void) ldc_mem_unmap(task->mhdl);
		PR0("ldc_mem_acquire() returned err %d ", status);
		return (EIO);
	}

	/* Start the block I/O */
	if (vd->file) {
		rv = vd_file_rw(vd, slice, request->operation, bufaddr,
		    request->addr, request->nbytes);
		if (rv < 0) {
			request->nbytes = 0;
			request->status = EIO;
		} else {
			request->nbytes = rv;
			request->status = 0;
		}
	} else {
		if (slice == VD_SLICE_NONE) {
			/*
			 * This is not a disk image so it is a real disk. We
			 * assume that the underlying device driver supports
			 * USCSICMD ioctls. This is the case of all SCSI devices
			 * (sd, ssd...).
			 *
			 * In the future if we have non-SCSI disks we would need
			 * to invoke the appropriate function to do I/O using an
			 * absolute disk offset (for example using DIOCTL_RWCMD
			 * for IDE disks).
			 */
			rv = vd_scsi_rdwr(vd, request->operation, bufaddr,
			    request->addr, request->nbytes);
			if (rv != 0) {
				request->nbytes = 0;
				request->status = EIO;
			} else {
				request->status = 0;
			}
		} else {
			bioinit(buf);
			buf->b_flags	= B_BUSY;
			buf->b_bcount	= request->nbytes;
			buf->b_lblkno	= request->addr;
			buf->b_edev	= vd->dev[slice];
			buf->b_un.b_addr = bufaddr;
			buf->b_flags	|= (request->operation == VD_OP_BREAD)?
			    B_READ : B_WRITE;

			request->status =
			    ldi_strategy(vd->ldi_handle[slice], buf);

			/*
			 * This is to indicate to the caller that the request
			 * needs to be finished by vd_complete_bio() by calling
			 * biowait() there and waiting for that to return before
			 * triggering the notification of the vDisk client.
			 *
			 * This is necessary when writing to real disks as
			 * otherwise calls to ldi_strategy() would be serialized
			 * behind the calls to biowait() and performance would
			 * suffer.
			 */
			if (request->status == 0)
				return (EINPROGRESS);

			biofini(buf);
		}
	}

	/* Clean up after error */
	rv = ldc_mem_release(task->mhdl, 0, buflen);
	if (rv) {
		PR0("ldc_mem_release() returned err %d ", rv);
		status = EIO;
	}
	rv = ldc_mem_unmap(task->mhdl);
	if (rv) {
		PR0("ldc_mem_unmap() returned err %d ", rv);
		status = EIO;
	}

	return (status);
}
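
/*
 * Note that when vd_start_bio() returns EINPROGRESS above, it does not
 * call ldc_mem_release() or ldc_mem_unmap(): vd_complete_bio() performs
 * both once biowait() returns, so the client's mapping remains valid
 * for the duration of the asynchronous I/O.
 */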

/*
 * This function should only be called from vd_notify() to ensure that
 * requests are responded to in the order that they are received.
 */
static int
send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
{
	int	status;
	size_t	nbytes;

	do {
		nbytes = msglen;
		status = ldc_write(ldc_handle, msg, &nbytes);
		if (status != EWOULDBLOCK)
			break;
		drv_usecwait(vds_ldc_delay);
	} while (status == EWOULDBLOCK);

	if (status != 0) {
		if (status != ECONNRESET)
			PR0("ldc_write() returned errno %d", status);
		return (status);
	} else if (nbytes != msglen) {
		PR0("ldc_write() performed only a partial write");
		return (EIO);
	}

	PR1("SENT %lu bytes", msglen);
	return (0);
}

static void
vd_need_reset(vd_t *vd, boolean_t reset_ldc)
{
	mutex_enter(&vd->lock);
	vd->reset_state	= B_TRUE;
	vd->reset_ldc	= reset_ldc;
	mutex_exit(&vd->lock);
}

/*
 * Reset the state of the connection with a client, if needed; reset the LDC
 * transport as well, if needed. This function should only be called from
 * vd_recv_msg(), as it waits for tasks - otherwise a deadlock can occur.
 */
static void
vd_reset_if_needed(vd_t *vd)
{
	int	status = 0;

	mutex_enter(&vd->lock);
	if (!vd->reset_state) {
		ASSERT(!vd->reset_ldc);
		mutex_exit(&vd->lock);
		return;
	}
	mutex_exit(&vd->lock);

	PR0("Resetting connection state with %s", VD_CLIENT(vd));

	/*
	 * Let any asynchronous I/O complete before possibly pulling the rug
	 * out from under it; defer checking vd->reset_ldc, as one of the
	 * asynchronous tasks might set it
	 */
	ddi_taskq_wait(vd->completionq);

	if (vd->file) {
		status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred, NULL);
		if (status) {
			PR0("VOP_FSYNC returned errno %d", status);
		}
	}

	if ((vd->initialized & VD_DRING) &&
	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
		PR0("ldc_mem_dring_unmap() returned errno %d", status);

	vd_free_dring_task(vd);

	/* Free the staging buffer for msgs */
	if (vd->vio_msgp != NULL) {
		kmem_free(vd->vio_msgp, vd->max_msglen);
		vd->vio_msgp = NULL;
	}

	/* Free the inband message buffer */
	if (vd->inband_task.msg != NULL) {
		kmem_free(vd->inband_task.msg, vd->max_msglen);
		vd->inband_task.msg = NULL;
	}

	mutex_enter(&vd->lock);

	if (vd->reset_ldc)
		PR0("taking down LDC channel");
	if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0))
		PR0("ldc_down() returned errno %d", status);

	vd->initialized	&= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
	vd->state	= VD_STATE_INIT;
	vd->max_msglen	= sizeof (vio_msg_t);	/* baseline vio message size */

	/* Allocate the staging buffer */
	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);

	PR0("calling ldc_up\n");
	(void) ldc_up(vd->ldc_handle);

	vd->reset_state	= B_FALSE;
	vd->reset_ldc	= B_FALSE;

	mutex_exit(&vd->lock);
}

static void vd_recv_msg(void *arg);

static void
vd_mark_in_reset(vd_t *vd)
{
	int status;

	PR0("vd_mark_in_reset: marking vd in reset\n");

	vd_need_reset(vd, B_FALSE);
	status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP);
	if (status == DDI_FAILURE) {
		PR0("cannot schedule task to recv msg\n");
		vd_need_reset(vd, B_TRUE);
		return;
	}
}
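
/*
 * Descriptor ring element life cycle, for reference: the client marks
 * an element VIO_DESC_ACCEPTED when it hands the request to the
 * server; vd_mark_elem_done() below moves it to VIO_DESC_DONE once the
 * request has completed, successfully or not.
 */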

static int
vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes)
{
	boolean_t accepted;
	int status;
	vd_dring_entry_t *elem = VD_DRING_ELEM(idx);

	if (vd->reset_state)
		return (0);

	/* Acquire the element */
	if (!vd->reset_state &&
	    (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
			return (0);
		} else {
			PR0("ldc_mem_dring_acquire() returned errno %d",
			    status);
			return (status);
		}
	}

	/* Set the element's status and mark it done */
	accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED);
	if (accepted) {
		elem->payload.nbytes	= elem_nbytes;
		elem->payload.status	= elem_status;
		elem->hdr.dstate	= VIO_DESC_DONE;
	} else {
		/* Perhaps client timed out waiting for I/O... */
		PR0("element %u no longer \"accepted\"", idx);
		VD_DUMP_DRING_ELEM(elem);
	}
	/* Release the element */
	if (!vd->reset_state &&
	    (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
			return (0);
		} else {
			PR0("ldc_mem_dring_release() returned errno %d",
			    status);
			return (status);
		}
	}

	return (accepted ? 0 : EINVAL);
}

/*
 * Return Values
 *	0	- operation completed successfully
 *	EIO	- encountered LDC / task error
 *
 * Side Effect
 *	sets request->status = <disk operation status>
 */
static int
vd_complete_bio(vd_task_t *task)
{
	int			status		= 0;
	int			rv		= 0;
	vd_t			*vd		= task->vd;
	vd_dring_payload_t	*request	= task->request;
	struct buf		*buf		= &task->buf;


	ASSERT(vd != NULL);
	ASSERT(request != NULL);
	ASSERT(task->msg != NULL);
	ASSERT(task->msglen >= sizeof (*task->msg));
	ASSERT(!vd->file);
	ASSERT(request->slice != VD_SLICE_NONE);

	/* Wait for the I/O to complete [ call to ldi_strategy(9f) ] */
	request->status = biowait(buf);

	/* return back the number of bytes read/written */
	request->nbytes = buf->b_bcount - buf->b_resid;

	/* Release the buffer */
	if (!vd->reset_state)
		status = ldc_mem_release(task->mhdl, 0, buf->b_bcount);
	if (status) {
		PR0("ldc_mem_release() returned errno %d copying to "
		    "client", status);
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
		}
		rv = EIO;
	}

	/* Unmap the memory, even if in reset */
	status = ldc_mem_unmap(task->mhdl);
	if (status) {
		PR0("ldc_mem_unmap() returned errno %d copying to client",
		    status);
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
		}
		rv = EIO;
	}

	biofini(buf);

	return (rv);
}

/*
 * Description:
 *	This function is called by the two functions called by a taskq
 *	[ vd_complete_notify() and vd_serial_notify() ] to send the
 *	message to the client.
 *
 * Parameters:
 *	arg	- opaque pointer to structure containing task to be completed
 *
 * Return Values
 *	None
 */
static void
vd_notify(vd_task_t *task)
{
	int status;

	ASSERT(task != NULL);
	ASSERT(task->vd != NULL);

	if (task->vd->reset_state)
		return;

	/*
	 * Send the "ack" or "nack" back to the client; if sending the message
	 * via LDC fails, arrange to reset both the connection state and LDC
	 * itself
	 */
	PR2("Sending %s",
	    (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");

	status = send_msg(task->vd->ldc_handle, task->msg, task->msglen);
	switch (status) {
	case 0:
		break;
	case ECONNRESET:
		vd_mark_in_reset(task->vd);
		break;
	default:
		PR0("initiating full reset");
		vd_need_reset(task->vd, B_TRUE);
		break;
	}

	DTRACE_PROBE1(task__end, vd_task_t *, task);
}

/*
 * Description:
 *	Mark the Dring entry as Done and (if necessary) send an ACK/NACK to
 *	the vDisk client
 *
 * Parameters:
 *	task		- structure containing the request sent from client
 *
 * Return Values
 *	None
 */
static void
vd_complete_notify(vd_task_t *task)
{
	int			status		= 0;
	vd_t			*vd		= task->vd;
	vd_dring_payload_t	*request	= task->request;

	/* Update the dring element for a dring client */
	if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE)) {
		status = vd_mark_elem_done(vd, task->index,
		    request->status, request->nbytes);
		if (status == ECONNRESET)
			vd_mark_in_reset(vd);
	}

	/*
	 * If a transport error occurred while marking the element done or
	 * previously while executing the task, arrange to "nack" the message
	 * when the final task in the descriptor element range completes
	 */
	if ((status != 0) || (task->status != 0))
		task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK;

	/*
	 * Only the final task for a range of elements will respond to and
	 * free the message
	 */
	if (task->type == VD_NONFINAL_RANGE_TASK) {
		return;
	}

	vd_notify(task);
}

/*
 * Description:
 *	This is the basic completion function called to handle inband data
 *	requests and handshake messages. All it needs to do is trigger a
 *	message to the client that the request is completed.
 *
 * Parameters:
 *	arg	- opaque pointer to structure containing task to be completed
 *
 * Return Values
 *	None
 */
static void
vd_serial_notify(void *arg)
{
	vd_task_t	*task = (vd_task_t *)arg;

	ASSERT(task != NULL);
	vd_notify(task);
}

static void
vd_geom2dk_geom(void *vd_buf, void *ioctl_arg)
{
	VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg);
}

static void
vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg)
{
	VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg);
}

static void
dk_geom2vd_geom(void *ioctl_arg, void *vd_buf)
{
	DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf);
}

static void
vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf)
{
	VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf);
}

static void
vd_get_efi_in(void *vd_buf, void *ioctl_arg)
{
	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;

	dk_efi->dki_lba = vd_efi->lba;
	dk_efi->dki_length = vd_efi->length;
	dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP);
}

static void
vd_get_efi_out(void *ioctl_arg, void *vd_buf)
{
	int len;
	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;

	len = vd_efi->length;
	DK_EFI2VD_EFI(dk_efi, vd_efi);
	kmem_free(dk_efi->dki_data, len);
}

static void
vd_set_efi_in(void *vd_buf, void *ioctl_arg)
{
	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;

	dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP);
	VD_EFI2DK_EFI(vd_efi, dk_efi);
}

static void
vd_set_efi_out(void *ioctl_arg, void *vd_buf)
{
	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;

	kmem_free(dk_efi->dki_data, vd_efi->length);
}

static vd_disk_label_t
vd_read_vtoc(vd_t *vd, struct vtoc *vtoc)
{
	int status, rval;
	struct dk_gpt *efi;
	size_t efi_len;

	ASSERT(vd->ldi_handle[0] != NULL);

	status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, (intptr_t)vtoc,
	    (vd->open_flags | FKIOCTL), kcred, &rval);

	if (status == 0) {
		return (VD_DISK_LABEL_VTOC);
	} else if (status != ENOTSUP) {
		PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status);
		return (VD_DISK_LABEL_UNK);
	}

	status = vds_efi_alloc_and_read(vd->ldi_handle[0], &efi, &efi_len);

	if (status) {
		PR0("vds_efi_alloc_and_read returned error %d", status);
		return (VD_DISK_LABEL_UNK);
	}

	vd_efi_to_vtoc(efi, vtoc);
	vd_efi_free(efi, efi_len);

	return (VD_DISK_LABEL_EFI);
}

static ushort_t
vd_lbl2cksum(struct dk_label *label)
{
	int	count;
	ushort_t sum, *sp;

	count = (sizeof (struct dk_label)) / (sizeof (short)) - 1;
	sp = (ushort_t *)label;
	sum = 0;
	while (count--) {
		sum ^= *sp++;
	}

	return (sum);
}
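
/*
 * The checksum above is the XOR of all 16-bit words of the dk_label
 * except the last one, which holds dkl_cksum itself. A label is valid
 * when dkl_cksum equals this XOR; equivalently, XORing every word of
 * the sector, checksum included, yields 0.
 */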

/*
 * Handle ioctls to a disk slice.
 *
 * Return Values
 *	0	- Indicates that there are no errors in disk operations
 *	ENOTSUP	- Unknown disk label type or unsupported DKIO ioctl
 *	EINVAL	- Not enough room to copy the EFI label
 *
 */
static int
vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
{
	dk_efi_t *dk_ioc;

	switch (vd->vdisk_label) {

	/* ioctls for a slice from a disk with a VTOC label */
	case VD_DISK_LABEL_VTOC:

		switch (cmd) {
		case DKIOCGGEOM:
			ASSERT(ioctl_arg != NULL);
			bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom));
			return (0);
		case DKIOCGVTOC:
			ASSERT(ioctl_arg != NULL);
			bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc));
			return (0);
		default:
			return (ENOTSUP);
		}

	/* ioctls for a slice from a disk with an EFI label */
	case VD_DISK_LABEL_EFI:

		switch (cmd) {
		case DKIOCGETEFI:
			ASSERT(ioctl_arg != NULL);
			dk_ioc = (dk_efi_t *)ioctl_arg;
			if (dk_ioc->dki_length < vd->dk_efi.dki_length)
				return (EINVAL);
			bcopy(vd->dk_efi.dki_data, dk_ioc->dki_data,
			    vd->dk_efi.dki_length);
			return (0);
		default:
			return (ENOTSUP);
		}

	default:
		/* Unknown disk label type */
		return (ENOTSUP);
	}
}

/*
 * Function:
 *	vd_file_validate_geometry
 *
 * Description:
 *	Read the label and validate the geometry of a disk image. The driver
 *	label, vtoc and geometry information are updated according to the
 *	label read from the disk image.
 *
 *	If no valid label is found, the label is set to unknown and the
 *	function returns EINVAL, but a default vtoc and geometry are provided
 *	to the driver.
 *
 * Parameters:
 *	vd	- disk on which the operation is performed.
 *
 * Return Code:
 *	0	- success.
 *	EIO	- error reading the label from the disk image.
 *	EINVAL	- unknown disk label.
 */
static int
vd_file_validate_geometry(vd_t *vd)
{
	struct dk_label label;
	struct dk_geom *geom = &vd->dk_geom;
	struct vtoc *vtoc = &vd->vtoc;
	int i;
	int status = 0;

	ASSERT(vd->file);

	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
		/*
		 * For a single-slice disk we always fake the geometry, and
		 * we only need to do it once because the geometry will
		 * never change.
		 */
		if (vd->vdisk_label == VD_DISK_LABEL_VTOC)
			/* geometry was already validated */
			return (0);

		ASSERT(vd->vdisk_label == VD_DISK_LABEL_UNK);
		vd_file_build_default_label(vd, &label);
		vd->vdisk_label = VD_DISK_LABEL_VTOC;
	} else {
		if (VD_FILE_LABEL_READ(vd, &label) < 0)
			return (EIO);

		if (label.dkl_magic != DKL_MAGIC ||
		    label.dkl_cksum != vd_lbl2cksum(&label) ||
		    label.dkl_vtoc.v_sanity != VTOC_SANE ||
		    label.dkl_vtoc.v_nparts != V_NUMPAR) {
			vd->vdisk_label = VD_DISK_LABEL_UNK;
			vd_file_build_default_label(vd, &label);
			status = EINVAL;
		} else {
			vd->vdisk_label = VD_DISK_LABEL_VTOC;
		}
	}

	/* Update the driver geometry */
	bzero(geom, sizeof (struct dk_geom));

	geom->dkg_ncyl = label.dkl_ncyl;
	geom->dkg_acyl = label.dkl_acyl;
	geom->dkg_nhead = label.dkl_nhead;
	geom->dkg_nsect = label.dkl_nsect;
	geom->dkg_intrlv = label.dkl_intrlv;
	geom->dkg_apc = label.dkl_apc;
	geom->dkg_rpm = label.dkl_rpm;
	geom->dkg_pcyl = label.dkl_pcyl;
	geom->dkg_write_reinstruct = label.dkl_write_reinstruct;
	geom->dkg_read_reinstruct = label.dkl_read_reinstruct;

	/* Update the driver vtoc */
	bzero(vtoc, sizeof (struct vtoc));

	vtoc->v_sanity = label.dkl_vtoc.v_sanity;
	vtoc->v_version = label.dkl_vtoc.v_version;
	vtoc->v_sectorsz = DEV_BSIZE;
	vtoc->v_nparts = label.dkl_vtoc.v_nparts;

	for (i = 0; i < vtoc->v_nparts; i++) {
		vtoc->v_part[i].p_tag =
		    label.dkl_vtoc.v_part[i].p_tag;
		vtoc->v_part[i].p_flag =
		    label.dkl_vtoc.v_part[i].p_flag;
		vtoc->v_part[i].p_start =
		    label.dkl_map[i].dkl_cylno *
		    (label.dkl_nhead * label.dkl_nsect);
		vtoc->v_part[i].p_size = label.dkl_map[i].dkl_nblk;
		vtoc->timestamp[i] =
		    label.dkl_vtoc.v_timestamp[i];
	}
	/*
	 * The bootinfo array cannot be copied with bcopy() because
	 * elements are of type long in vtoc (so 64-bit) and of type
	 * int in dk_vtoc (so 32-bit).
	 */
	vtoc->v_bootinfo[0] = label.dkl_vtoc.v_bootinfo[0];
	vtoc->v_bootinfo[1] = label.dkl_vtoc.v_bootinfo[1];
	vtoc->v_bootinfo[2] = label.dkl_vtoc.v_bootinfo[2];
	bcopy(label.dkl_asciilabel, vtoc->v_asciilabel,
	    LEN_DKL_ASCII);
	bcopy(label.dkl_vtoc.v_volume, vtoc->v_volume,
	    LEN_DKL_VVOL);

	return (status);
}
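
/*
 * Note that even when vd_file_validate_geometry() returns EINVAL, it
 * still installs a usable default geometry and vtoc. This is why the
 * "get" ioctls in vd_do_file_ioctl() below tolerate EINVAL and proceed
 * with the default values.
 */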
1959 * 1960 * Return Values 1961 * 0 - Indicates that there are no errors 1962 * != 0 - Disk operation returned an error 1963 */ 1964 static int 1965 vd_do_file_ioctl(vd_t *vd, int cmd, void *ioctl_arg) 1966 { 1967 struct dk_label label; 1968 struct dk_geom *geom; 1969 struct vtoc *vtoc; 1970 int i, rc; 1971 1972 ASSERT(vd->file); 1973 1974 switch (cmd) { 1975 1976 case DKIOCGGEOM: 1977 ASSERT(ioctl_arg != NULL); 1978 geom = (struct dk_geom *)ioctl_arg; 1979 1980 rc = vd_file_validate_geometry(vd); 1981 if (rc != 0 && rc != EINVAL) { 1982 ASSERT(vd->vdisk_type != VD_DISK_TYPE_SLICE); 1983 return (rc); 1984 } 1985 1986 bcopy(&vd->dk_geom, geom, sizeof (struct dk_geom)); 1987 return (0); 1988 1989 case DKIOCGVTOC: 1990 ASSERT(ioctl_arg != NULL); 1991 vtoc = (struct vtoc *)ioctl_arg; 1992 1993 rc = vd_file_validate_geometry(vd); 1994 if (rc != 0 && rc != EINVAL) { 1995 ASSERT(vd->vdisk_type != VD_DISK_TYPE_SLICE); 1996 return (rc); 1997 } 1998 1999 bcopy(&vd->vtoc, vtoc, sizeof (struct vtoc)); 2000 return (0); 2001 2002 case DKIOCSGEOM: 2003 ASSERT(ioctl_arg != NULL); 2004 geom = (struct dk_geom *)ioctl_arg; 2005 2006 /* geometry can only be changed for full disk */ 2007 if (vd->vdisk_type != VD_DISK_TYPE_DISK) 2008 return (ENOTSUP); 2009 2010 if (geom->dkg_nhead == 0 || geom->dkg_nsect == 0) 2011 return (EINVAL); 2012 2013 /* 2014 * The current device geometry is not updated, just the driver 2015 * "notion" of it. The device geometry will be effectively 2016 * updated when a label is written to the device during a next 2017 * DKIOCSVTOC. 2018 */ 2019 bcopy(ioctl_arg, &vd->dk_geom, sizeof (vd->dk_geom)); 2020 return (0); 2021 2022 case DKIOCSVTOC: 2023 ASSERT(ioctl_arg != NULL); 2024 ASSERT(vd->dk_geom.dkg_nhead != 0 && 2025 vd->dk_geom.dkg_nsect != 0); 2026 vtoc = (struct vtoc *)ioctl_arg; 2027 2028 /* vtoc can only be changed for full disk */ 2029 if (vd->vdisk_type != VD_DISK_TYPE_DISK) 2030 return (ENOTSUP); 2031 2032 if (vtoc->v_sanity != VTOC_SANE || 2033 vtoc->v_sectorsz != DEV_BSIZE || 2034 vtoc->v_nparts != V_NUMPAR) 2035 return (EINVAL); 2036 2037 bzero(&label, sizeof (label)); 2038 label.dkl_ncyl = vd->dk_geom.dkg_ncyl; 2039 label.dkl_acyl = vd->dk_geom.dkg_acyl; 2040 label.dkl_pcyl = vd->dk_geom.dkg_pcyl; 2041 label.dkl_nhead = vd->dk_geom.dkg_nhead; 2042 label.dkl_nsect = vd->dk_geom.dkg_nsect; 2043 label.dkl_intrlv = vd->dk_geom.dkg_intrlv; 2044 label.dkl_apc = vd->dk_geom.dkg_apc; 2045 label.dkl_rpm = vd->dk_geom.dkg_rpm; 2046 label.dkl_write_reinstruct = vd->dk_geom.dkg_write_reinstruct; 2047 label.dkl_read_reinstruct = vd->dk_geom.dkg_read_reinstruct; 2048 2049 label.dkl_vtoc.v_nparts = V_NUMPAR; 2050 label.dkl_vtoc.v_sanity = VTOC_SANE; 2051 label.dkl_vtoc.v_version = vtoc->v_version; 2052 for (i = 0; i < V_NUMPAR; i++) { 2053 label.dkl_vtoc.v_timestamp[i] = 2054 vtoc->timestamp[i]; 2055 label.dkl_vtoc.v_part[i].p_tag = 2056 vtoc->v_part[i].p_tag; 2057 label.dkl_vtoc.v_part[i].p_flag = 2058 vtoc->v_part[i].p_flag; 2059 label.dkl_map[i].dkl_cylno = 2060 vtoc->v_part[i].p_start / 2061 (label.dkl_nhead * label.dkl_nsect); 2062 label.dkl_map[i].dkl_nblk = 2063 vtoc->v_part[i].p_size; 2064 } 2065 /* 2066 * The bootinfo array can not be copied with bcopy() because 2067 * elements are of type long in vtoc (so 64-bit) and of type 2068 * int in dk_vtoc (so 32-bit). 
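 *
 * For example, on a 64-bit kernel sizeof (vtoc->v_bootinfo) is
 * 3 * 8 = 24 bytes while sizeof (label.dkl_vtoc.v_bootinfo) is
 * 3 * 4 = 12 bytes; a flat byte copy would not perform the
 * element-by-element narrowing, so each entry is assigned (and
 * converted) individually.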
2069 */ 2070 label.dkl_vtoc.v_bootinfo[0] = vtoc->v_bootinfo[0]; 2071 label.dkl_vtoc.v_bootinfo[1] = vtoc->v_bootinfo[1]; 2072 label.dkl_vtoc.v_bootinfo[2] = vtoc->v_bootinfo[2]; 2073 bcopy(vtoc->v_asciilabel, label.dkl_asciilabel, 2074 LEN_DKL_ASCII); 2075 bcopy(vtoc->v_volume, label.dkl_vtoc.v_volume, 2076 LEN_DKL_VVOL); 2077 2078 /* re-compute checksum */ 2079 label.dkl_magic = DKL_MAGIC; 2080 label.dkl_cksum = vd_lbl2cksum(&label); 2081 2082 /* write label to the disk image */ 2083 if ((rc = vd_file_set_vtoc(vd, &label)) != 0) 2084 return (rc); 2085 2086 /* check the geometry and update the driver info */ 2087 if ((rc = vd_file_validate_geometry(vd)) != 0) 2088 return (rc); 2089 2090 /* 2091 * The disk geometry may have changed, so we need to write 2092 * the devid (if there is one) so that it is stored at the 2093 * right location. 2094 */ 2095 if (vd->file_devid != NULL && 2096 vd_file_write_devid(vd, vd->file_devid) != 0) { 2097 PR0("Failed to write devid"); 2098 } 2099 2100 return (0); 2101 2102 case DKIOCFLUSHWRITECACHE: 2103 return (VOP_FSYNC(vd->file_vnode, FSYNC, kcred, NULL)); 2104 2105 default: 2106 return (ENOTSUP); 2107 } 2108 } 2109 2110 /* 2111 * Description: 2112 * This is the function that processes the ioctl requests (farming it 2113 * out to functions that handle slices, files or whole disks) 2114 * 2115 * Return Values 2116 * 0 - ioctl operation completed successfully 2117 * != 0 - The LDC error value encountered 2118 * (propagated back up the call stack as a task error) 2119 * 2120 * Side Effect 2121 * sets request->status to the return value of the ioctl function. 2122 */ 2123 static int 2124 vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl) 2125 { 2126 int rval = 0, status = 0; 2127 size_t nbytes = request->nbytes; /* modifiable copy */ 2128 2129 2130 ASSERT(request->slice < vd->nslices); 2131 PR0("Performing %s", ioctl->operation_name); 2132 2133 /* Get data from client and convert, if necessary */ 2134 if (ioctl->copyin != NULL) { 2135 ASSERT(nbytes != 0 && buf != NULL); 2136 PR1("Getting \"arg\" data from client"); 2137 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 2138 request->cookie, request->ncookies, 2139 LDC_COPY_IN)) != 0) { 2140 PR0("ldc_mem_copy() returned errno %d " 2141 "copying from client", status); 2142 return (status); 2143 } 2144 2145 /* Convert client's data, if necessary */ 2146 if (ioctl->copyin == VD_IDENTITY) /* use client buffer */ 2147 ioctl->arg = buf; 2148 else /* convert client vdisk operation data to ioctl data */ 2149 (ioctl->copyin)(buf, (void *)ioctl->arg); 2150 } 2151 2152 /* 2153 * Handle disk images and single-slice block devices internally; 2154 * otherwise, have the real driver perform the ioctl() 2155 */ 2156 if (vd->file) { 2157 request->status = 2158 vd_do_file_ioctl(vd, ioctl->cmd, (void *)ioctl->arg); 2159 2160 } else if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) { 2161 request->status = 2162 vd_do_slice_ioctl(vd, ioctl->cmd, (void *)ioctl->arg); 2163 2164 } else { 2165 request->status = ldi_ioctl(vd->ldi_handle[request->slice], 2166 ioctl->cmd, (intptr_t)ioctl->arg, vd->open_flags | FKIOCTL, 2167 kcred, &rval); 2168 2169 #ifdef DEBUG 2170 if (rval != 0) { 2171 PR0("%s set rval = %d, which is not being returned to" 2172 " client", ioctl->cmd_name, rval); 2173 } 2174 #endif /* DEBUG */ 2175 } 2176 2177 if (request->status != 0) { 2178 PR0("ioctl(%s) = errno %d", ioctl->cmd_name, request->status); 2179 return (0); 2180 } 2181 2182 /* Convert data and send to client, if necessary */ 2183
if (ioctl->copyout != NULL) { 2184 ASSERT(nbytes != 0 && buf != NULL); 2185 PR1("Sending \"arg\" data to client"); 2186 2187 /* Convert ioctl data to vdisk operation data, if necessary */ 2188 if (ioctl->copyout != VD_IDENTITY) 2189 (ioctl->copyout)((void *)ioctl->arg, buf); 2190 2191 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 2192 request->cookie, request->ncookies, 2193 LDC_COPY_OUT)) != 0) { 2194 PR0("ldc_mem_copy() returned errno %d " 2195 "copying to client", status); 2196 return (status); 2197 } 2198 } 2199 2200 return (status); 2201 } 2202 2203 #define RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t)) 2204 2205 /* 2206 * Description: 2207 * This generic function is called by the task queue to complete 2208 * the processing of the tasks. The specific completion function 2209 * is passed in as a field in the task pointer. 2210 * 2211 * Parameters: 2212 * arg - opaque pointer to structure containing task to be completed 2213 * 2214 * Return Values 2215 * None 2216 */ 2217 static void 2218 vd_complete(void *arg) 2219 { 2220 vd_task_t *task = (vd_task_t *)arg; 2221 2222 ASSERT(task != NULL); 2223 ASSERT(task->status == EINPROGRESS); 2224 ASSERT(task->completef != NULL); 2225 2226 task->status = task->completef(task); 2227 if (task->status) 2228 PR0("%s: Error %d completing task", __func__, task->status); 2229 2230 /* Now notify the vDisk client */ 2231 vd_complete_notify(task); 2232 } 2233 2234 static int 2235 vd_ioctl(vd_task_t *task) 2236 { 2237 int i, status; 2238 void *buf = NULL; 2239 struct dk_geom dk_geom = {0}; 2240 struct vtoc vtoc = {0}; 2241 struct dk_efi dk_efi = {0}; 2242 vd_t *vd = task->vd; 2243 vd_dring_payload_t *request = task->request; 2244 vd_ioctl_t ioctl[] = { 2245 /* Command (no-copy) operations */ 2246 {VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0, 2247 DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE), 2248 NULL, NULL, NULL, B_TRUE}, 2249 2250 /* "Get" (copy-out) operations */ 2251 {VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int), 2252 DKIOCGETWCE, STRINGIZE(DKIOCGETWCE), 2253 NULL, VD_IDENTITY, VD_IDENTITY, B_FALSE}, 2254 {VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM), 2255 RNDSIZE(vd_geom_t), 2256 DKIOCGGEOM, STRINGIZE(DKIOCGGEOM), 2257 &dk_geom, NULL, dk_geom2vd_geom, B_FALSE}, 2258 {VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t), 2259 DKIOCGVTOC, STRINGIZE(DKIOCGVTOC), 2260 &vtoc, NULL, vtoc2vd_vtoc, B_FALSE}, 2261 {VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t), 2262 DKIOCGETEFI, STRINGIZE(DKIOCGETEFI), 2263 &dk_efi, vd_get_efi_in, vd_get_efi_out, B_FALSE}, 2264 2265 /* "Set" (copy-in) operations */ 2266 {VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int), 2267 DKIOCSETWCE, STRINGIZE(DKIOCSETWCE), 2268 NULL, VD_IDENTITY, VD_IDENTITY, B_TRUE}, 2269 {VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM), 2270 RNDSIZE(vd_geom_t), 2271 DKIOCSGEOM, STRINGIZE(DKIOCSGEOM), 2272 &dk_geom, vd_geom2dk_geom, NULL, B_TRUE}, 2273 {VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t), 2274 DKIOCSVTOC, STRINGIZE(DKIOCSVTOC), 2275 &vtoc, vd_vtoc2vtoc, NULL, B_TRUE}, 2276 {VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t), 2277 DKIOCSETEFI, STRINGIZE(DKIOCSETEFI), 2278 &dk_efi, vd_set_efi_in, vd_set_efi_out, B_TRUE}, 2279 }; 2280 size_t nioctls = (sizeof (ioctl))/(sizeof (ioctl[0])); 2281 2282 2283 ASSERT(vd != NULL); 2284 ASSERT(request != NULL); 2285 ASSERT(request->slice < vd->nslices); 2286 2287 /* 2288 * Determine ioctl corresponding to caller's "operation" and 2289 * validate caller's "nbytes" 
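 *
 * "nbytes" must exactly match the ioctl's payload size, which is
 * rounded up to the 8-byte multiple required by LDC memory
 * operations; for example, VD_OP_GET_WCE must carry exactly
 * RNDSIZE(int) = P2ROUNDUP(4, 8) = 8 bytes. The EFI operations are
 * the exception: their data is variable-length, so "nbytes" only
 * needs to be at least RNDSIZE(vd_efi_t).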
2290 */ 2291 for (i = 0; i < nioctls; i++) { 2292 if (request->operation == ioctl[i].operation) { 2293 /* LDC memory operations require 8-byte multiples */ 2294 ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0); 2295 2296 if (request->operation == VD_OP_GET_EFI || 2297 request->operation == VD_OP_SET_EFI) { 2298 if (request->nbytes >= ioctl[i].nbytes) 2299 break; 2300 PR0("%s: Expected at least nbytes = %lu, " 2301 "got %lu", ioctl[i].operation_name, 2302 ioctl[i].nbytes, request->nbytes); 2303 return (EINVAL); 2304 } 2305 2306 if (request->nbytes != ioctl[i].nbytes) { 2307 PR0("%s: Expected nbytes = %lu, got %lu", 2308 ioctl[i].operation_name, ioctl[i].nbytes, 2309 request->nbytes); 2310 return (EINVAL); 2311 } 2312 2313 break; 2314 } 2315 } 2316 ASSERT(i < nioctls); /* because "operation" already validated */ 2317 2318 if (!(vd->open_flags & FWRITE) && ioctl[i].write) { 2319 PR0("%s fails because backend is opened read-only", 2320 ioctl[i].operation_name); 2321 request->status = EROFS; 2322 return (0); 2323 } 2324 2325 if (request->nbytes) 2326 buf = kmem_zalloc(request->nbytes, KM_SLEEP); 2327 status = vd_do_ioctl(vd, request, buf, &ioctl[i]); 2328 if (request->nbytes) 2329 kmem_free(buf, request->nbytes); 2330 2331 return (status); 2332 } 2333 2334 static int 2335 vd_get_devid(vd_task_t *task) 2336 { 2337 vd_t *vd = task->vd; 2338 vd_dring_payload_t *request = task->request; 2339 vd_devid_t *vd_devid; 2340 impl_devid_t *devid; 2341 int status, bufid_len, devid_len, len, sz; 2342 int bufbytes; 2343 2344 PR1("Get Device ID, nbytes=%ld", request->nbytes); 2345 2346 if (vd->file) { 2347 if (vd->file_devid == NULL) { 2348 PR2("No Device ID"); 2349 request->status = ENOENT; 2350 return (0); 2351 } else { 2352 sz = ddi_devid_sizeof(vd->file_devid); 2353 devid = kmem_alloc(sz, KM_SLEEP); 2354 bcopy(vd->file_devid, devid, sz); 2355 } 2356 } else { 2357 if (ddi_lyr_get_devid(vd->dev[request->slice], 2358 (ddi_devid_t *)&devid) != DDI_SUCCESS) { 2359 PR2("No Device ID"); 2360 request->status = ENOENT; 2361 return (0); 2362 } 2363 } 2364 2365 bufid_len = request->nbytes - sizeof (vd_devid_t) + 1; 2366 devid_len = DEVID_GETLEN(devid); 2367 2368 /* 2369 * Save the buffer size here for use in deallocation. 2370 * The actual number of bytes copied is returned in 2371 * the 'nbytes' field of the request structure. 2372 */ 2373 bufbytes = request->nbytes; 2374 2375 vd_devid = kmem_zalloc(bufbytes, KM_SLEEP); 2376 vd_devid->length = devid_len; 2377 vd_devid->type = DEVID_GETTYPE(devid); 2378 2379 len = (devid_len > bufid_len)? 
bufid_len : devid_len; 2380 2381 bcopy(devid->did_id, vd_devid->id, len); 2382 2383 request->status = 0; 2384 2385 /* LDC memory operations require 8-byte multiples */ 2386 ASSERT(request->nbytes % sizeof (uint64_t) == 0); 2387 2388 if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0, 2389 &request->nbytes, request->cookie, request->ncookies, 2390 LDC_COPY_OUT)) != 0) { 2391 PR0("ldc_mem_copy() returned errno %d copying to client", 2392 status); 2393 } 2394 PR1("post mem_copy: nbytes=%ld", request->nbytes); 2395 2396 kmem_free(vd_devid, bufbytes); 2397 ddi_devid_free((ddi_devid_t)devid); 2398 2399 return (status); 2400 } 2401 2402 /* 2403 * Define the supported operations once the functions for performing them have 2404 * been defined 2405 */ 2406 static const vds_operation_t vds_operation[] = { 2407 #define X(_s) #_s, _s 2408 {X(VD_OP_BREAD), vd_start_bio, vd_complete_bio}, 2409 {X(VD_OP_BWRITE), vd_start_bio, vd_complete_bio}, 2410 {X(VD_OP_FLUSH), vd_ioctl, NULL}, 2411 {X(VD_OP_GET_WCE), vd_ioctl, NULL}, 2412 {X(VD_OP_SET_WCE), vd_ioctl, NULL}, 2413 {X(VD_OP_GET_VTOC), vd_ioctl, NULL}, 2414 {X(VD_OP_SET_VTOC), vd_ioctl, NULL}, 2415 {X(VD_OP_GET_DISKGEOM), vd_ioctl, NULL}, 2416 {X(VD_OP_SET_DISKGEOM), vd_ioctl, NULL}, 2417 {X(VD_OP_GET_EFI), vd_ioctl, NULL}, 2418 {X(VD_OP_SET_EFI), vd_ioctl, NULL}, 2419 {X(VD_OP_GET_DEVID), vd_get_devid, NULL}, 2420 #undef X 2421 }; 2422 2423 static const size_t vds_noperations = 2424 (sizeof (vds_operation))/(sizeof (vds_operation[0])); 2425 2426 /* 2427 * Process a task specifying a client I/O request 2428 * 2429 * Parameters: 2430 * task - structure containing the request sent from client 2431 * 2432 * Return Value 2433 * 0 - success 2434 * ENOTSUP - Unknown/Unsupported VD_OP_XXX operation 2435 * EINVAL - Invalid disk slice 2436 * != 0 - some other non-zero return value from start function 2437 */ 2438 static int 2439 vd_do_process_task(vd_task_t *task) 2440 { 2441 int i; 2442 vd_t *vd = task->vd; 2443 vd_dring_payload_t *request = task->request; 2444 2445 ASSERT(vd != NULL); 2446 ASSERT(request != NULL); 2447 2448 /* Find the requested operation */ 2449 for (i = 0; i < vds_noperations; i++) { 2450 if (request->operation == vds_operation[i].operation) { 2451 /* all operations should have a start func */ 2452 ASSERT(vds_operation[i].start != NULL); 2453 2454 task->completef = vds_operation[i].complete; 2455 break; 2456 } 2457 } 2458 2459 /* 2460 * Fail the request if the operation is not permitted for the 2461 * particular client that sent it, or if the loop above completed 2462 * without finding the operation type (indicating that the 2463 * requested operation is unknown/unimplemented) 2464 */ 2465 if ((VD_OP_SUPPORTED(vd->operations, request->operation) == B_FALSE) || 2466 (i == vds_noperations)) { 2467 PR0("Unsupported operation %u", request->operation); 2468 request->status = ENOTSUP; 2469 return (0); 2470 } 2471 2472 /* Range-check slice */ 2473 if (request->slice >= vd->nslices && 2474 (vd->vdisk_type != VD_DISK_TYPE_DISK || 2475 request->slice != VD_SLICE_NONE)) { 2476 PR0("Invalid \"slice\" %u (max %u) for virtual disk", 2477 request->slice, (vd->nslices - 1)); 2478 return (EINVAL); 2479 } 2480 2481 /* 2482 * Call the function pointer that starts the operation.
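 * Synchronous operations return zero or an error here; VD_OP_BREAD
 * and VD_OP_BWRITE normally return EINPROGRESS instead, and are
 * finished later by their "complete" function (vd_complete_bio)
 * via the completion taskq (see vd_process_task()).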
2483 */ 2484 return (vds_operation[i].start(task)); 2485 } 2486 2487 /* 2488 * Description: 2489 * This function is called by both the in-band and descriptor ring 2490 * message processing paths to actually execute the task 2491 * requested by the vDisk client. It in turn calls its worker 2492 * function, vd_do_process_task(), to carry out the request. 2493 * 2494 * Any transport errors (e.g. LDC errors, vDisk protocol errors) are 2495 * saved in the 'status' field of the task and are propagated back 2496 * up the call stack to trigger a NACK 2497 * 2498 * Any request errors (e.g. ENOTTY from an ioctl) are saved in 2499 * the 'status' field of the request and result in an ACK being sent 2500 * by the completion handler. 2501 * 2502 * Parameters: 2503 * task - structure containing the request sent from client 2504 * 2505 * Return Value 2506 * 0 - successful synchronous request. 2507 * != 0 - transport error (e.g. LDC errors, vDisk protocol) 2508 * EINPROGRESS - task will be finished in a completion handler 2509 */ 2510 static int 2511 vd_process_task(vd_task_t *task) 2512 { 2513 vd_t *vd = task->vd; 2514 int status; 2515 2516 DTRACE_PROBE1(task__start, vd_task_t *, task); 2517 2518 task->status = vd_do_process_task(task); 2519 2520 /* 2521 * If the task processing function returned EINPROGRESS indicating 2522 * that the task needs completing then schedule a taskq entry to 2523 * finish it now. 2524 * 2525 * Otherwise the task processing function returned either zero 2526 * indicating that the task was finished in the start function (and we 2527 * don't need to wait in a completion function) or the start function 2528 * returned an error - in both cases all that needs to happen is the 2529 * notification to the vDisk client higher up the call stack. 2530 * If the task was using a Descriptor Ring, we need to mark it as done 2531 * at this stage. 2532 */ 2533 if (task->status == EINPROGRESS) { 2534 /* Queue a task to complete the operation */ 2535 (void) ddi_taskq_dispatch(vd->completionq, vd_complete, 2536 task, DDI_SLEEP); 2537 2538 } else if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE)) { 2539 /* Update the dring element if it's a dring client */ 2540 status = vd_mark_elem_done(vd, task->index, 2541 task->request->status, task->request->nbytes); 2542 if (status == ECONNRESET) 2543 vd_mark_in_reset(vd); 2544 } 2545 2546 return (task->status); 2547 } 2548 2549 /* 2550 * Return true if the "type", "subtype", and "env" fields of the "tag" first 2551 * argument match the corresponding remaining arguments; otherwise, return false 2552 */ 2553 boolean_t 2554 vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env) 2555 { 2556 return ((tag->vio_msgtype == type) && 2557 (tag->vio_subtype == subtype) && 2558 (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE; 2559 } 2560 2561 /* 2562 * Check whether the major/minor version specified in "ver_msg" is supported 2563 * by this server.
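 *
 * For example, if a client proposes version 1.3 and this server
 * supports at most 1.1, the minor number is adjusted down to 1 and
 * the message is "ack"ed; a proposal of 2.0 would instead be
 * "nack"ed back carrying the server's highest supported pair, so
 * the client can retry with a lower version.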
2564 */ 2565 static boolean_t 2566 vds_supported_version(vio_ver_msg_t *ver_msg) 2567 { 2568 for (int i = 0; i < vds_num_versions; i++) { 2569 ASSERT(vds_version[i].major > 0); 2570 ASSERT((i == 0) || 2571 (vds_version[i].major < vds_version[i-1].major)); 2572 2573 /* 2574 * If the major versions match, adjust the minor version, if 2575 * necessary, down to the highest value supported by this 2576 * server and return true so this message will get "ack"ed; 2577 * the client should also support all minor versions lower 2578 * than the value it sent 2579 */ 2580 if (ver_msg->ver_major == vds_version[i].major) { 2581 if (ver_msg->ver_minor > vds_version[i].minor) { 2582 PR0("Adjusting minor version from %u to %u", 2583 ver_msg->ver_minor, vds_version[i].minor); 2584 ver_msg->ver_minor = vds_version[i].minor; 2585 } 2586 return (B_TRUE); 2587 } 2588 2589 /* 2590 * If the message contains a higher major version number, set 2591 * the message's major/minor versions to the current values 2592 * and return false, so this message will get "nack"ed with 2593 * these values, and the client will potentially try again 2594 * with the same or a lower version 2595 */ 2596 if (ver_msg->ver_major > vds_version[i].major) { 2597 ver_msg->ver_major = vds_version[i].major; 2598 ver_msg->ver_minor = vds_version[i].minor; 2599 return (B_FALSE); 2600 } 2601 2602 /* 2603 * Otherwise, the message's major version is less than the 2604 * current major version, so continue the loop to the next 2605 * (lower) supported version 2606 */ 2607 } 2608 2609 /* 2610 * No common version was found; "ground" the version pair in the 2611 * message to terminate negotiation 2612 */ 2613 ver_msg->ver_major = 0; 2614 ver_msg->ver_minor = 0; 2615 return (B_FALSE); 2616 } 2617 2618 /* 2619 * Process a version message from a client. vds expects to receive version 2620 * messages from clients seeking service, but never issues version messages 2621 * itself; therefore, vds can ACK or NACK client version messages, but does 2622 * not expect to receive version-message ACKs or NACKs (and will treat such 2623 * messages as invalid). 2624 */ 2625 static int 2626 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2627 { 2628 vio_ver_msg_t *ver_msg = (vio_ver_msg_t *)msg; 2629 2630 2631 ASSERT(msglen >= sizeof (msg->tag)); 2632 2633 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2634 VIO_VER_INFO)) { 2635 return (ENOMSG); /* not a version message */ 2636 } 2637 2638 if (msglen != sizeof (*ver_msg)) { 2639 PR0("Expected %lu-byte version message; " 2640 "received %lu bytes", sizeof (*ver_msg), msglen); 2641 return (EBADMSG); 2642 } 2643 2644 if (ver_msg->dev_class != VDEV_DISK) { 2645 PR0("Expected device class %u (disk); received %u", 2646 VDEV_DISK, ver_msg->dev_class); 2647 return (EBADMSG); 2648 } 2649 2650 /* 2651 * We're talking to the expected kind of client; set our device class 2652 * for "ack/nack" back to the client 2653 */ 2654 ver_msg->dev_class = VDEV_DISK_SERVER; 2655 2656 /* 2657 * Check whether the (valid) version message specifies a version 2658 * supported by this server. 
If the version is not supported, return 2659 * EBADMSG so the message will get "nack"ed; vds_supported_version() 2660 * will have updated the message with a supported version for the 2661 * client to consider 2662 */ 2663 if (!vds_supported_version(ver_msg)) 2664 return (EBADMSG); 2665 2666 2667 /* 2668 * A version has been agreed upon; use the client's SID for 2669 * communication on this channel now 2670 */ 2671 ASSERT(!(vd->initialized & VD_SID)); 2672 vd->sid = ver_msg->tag.vio_sid; 2673 vd->initialized |= VD_SID; 2674 2675 /* 2676 * Store the negotiated major and minor version values in the "vd" data 2677 * structure so that we can check if certain operations are supported 2678 * by the client. 2679 */ 2680 vd->version.major = ver_msg->ver_major; 2681 vd->version.minor = ver_msg->ver_minor; 2682 2683 PR0("Using major version %u, minor version %u", 2684 ver_msg->ver_major, ver_msg->ver_minor); 2685 return (0); 2686 } 2687 2688 static void 2689 vd_set_exported_operations(vd_t *vd) 2690 { 2691 vd->operations = 0; /* clear field */ 2692 2693 /* 2694 * We need to check from the highest version supported to the 2695 * lowest because versions with a higher minor number implicitly 2696 * support versions with a lower minor number. 2697 */ 2698 if (vio_ver_is_supported(vd->version, 1, 1)) { 2699 ASSERT(vd->open_flags & FREAD); 2700 vd->operations |= VD_OP_MASK_READ; 2701 2702 if (vd->open_flags & FWRITE) 2703 vd->operations |= VD_OP_MASK_WRITE; 2704 2705 if (vd->file && vd_file_is_iso_image(vd)) { 2706 /* 2707 * can't write to ISO images, make sure that write 2708 * support is not set in case administrator did not 2709 * use "options=ro" when doing an ldm add-vdsdev 2710 */ 2711 vd->operations &= ~VD_OP_MASK_WRITE; 2712 } 2713 } else if (vio_ver_is_supported(vd->version, 1, 0)) { 2714 vd->operations = VD_OP_MASK_READ | VD_OP_MASK_WRITE; 2715 } 2716 2717 /* we should have already agreed on a version */ 2718 ASSERT(vd->operations != 0); 2719 } 2720 2721 static int 2722 vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2723 { 2724 vd_attr_msg_t *attr_msg = (vd_attr_msg_t *)msg; 2725 int status, retry = 0; 2726 2727 2728 ASSERT(msglen >= sizeof (msg->tag)); 2729 2730 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2731 VIO_ATTR_INFO)) { 2732 PR0("Message is not an attribute message"); 2733 return (ENOMSG); 2734 } 2735 2736 if (msglen != sizeof (*attr_msg)) { 2737 PR0("Expected %lu-byte attribute message; " 2738 "received %lu bytes", sizeof (*attr_msg), msglen); 2739 return (EBADMSG); 2740 } 2741 2742 if (attr_msg->max_xfer_sz == 0) { 2743 PR0("Received maximum transfer size of 0 from client"); 2744 return (EBADMSG); 2745 } 2746 2747 if ((attr_msg->xfer_mode != VIO_DESC_MODE) && 2748 (attr_msg->xfer_mode != VIO_DRING_MODE)) { 2749 PR0("Client requested unsupported transfer mode"); 2750 return (EBADMSG); 2751 } 2752 2753 /* 2754 * check if the underlying disk is ready, if not try accessing 2755 * the device again. 
Open the vdisk device and extract info 2756 * about it, as this is needed to respond to the attr info msg 2757 */ 2758 if ((vd->initialized & VD_DISK_READY) == 0) { 2759 PR0("Retry setting up disk (%s)", vd->device_path); 2760 do { 2761 status = vd_setup_vd(vd); 2762 if (status != EAGAIN || ++retry > vds_dev_retries) 2763 break; 2764 2765 /* delay before retrying */ 2766 delay(drv_usectohz(vds_dev_delay)); 2767 2768 /* if vdisk is no longer enabled - return error */ 2769 if (!vd_enabled(vd)) 2770 return (ENXIO); 2771 2772 } while (status == EAGAIN); 2773 2774 if (status) 2775 return (ENXIO); 2776 2777 vd->initialized |= VD_DISK_READY; 2778 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); 2779 PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u", 2780 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), 2781 (vd->pseudo ? "yes" : "no"), 2782 (vd->file ? "yes" : "no"), 2783 vd->nslices); 2784 } 2785 2786 /* Success: valid message and transfer mode */ 2787 vd->xfer_mode = attr_msg->xfer_mode; 2788 2789 if (vd->xfer_mode == VIO_DESC_MODE) { 2790 2791 /* 2792 * The vd_dring_inband_msg_t contains one cookie; need room 2793 * for up to n-1 more cookies, where "n" is the number of full 2794 * pages plus possibly one partial page required to cover 2795 * "max_xfer_sz". Add room for one more cookie if 2796 * "max_xfer_sz" isn't an integral multiple of the page size. 2797 * Must first get the maximum transfer size in bytes. 2798 */ 2799 size_t max_xfer_bytes = attr_msg->vdisk_block_size ? 2800 attr_msg->vdisk_block_size*attr_msg->max_xfer_sz : 2801 attr_msg->max_xfer_sz; 2802 size_t max_inband_msglen = 2803 sizeof (vd_dring_inband_msg_t) + 2804 ((max_xfer_bytes/PAGESIZE + 2805 ((max_xfer_bytes % PAGESIZE) ? 1 : 0))* 2806 (sizeof (ldc_mem_cookie_t))); 2807 2808 /* 2809 * Set the maximum expected message length to 2810 * accommodate in-band-descriptor messages with all 2811 * their cookies 2812 */ 2813 vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen); 2814 2815 /* 2816 * Initialize the data structure for processing in-band I/O 2817 * request descriptors 2818 */ 2819 vd->inband_task.vd = vd; 2820 vd->inband_task.msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 2821 vd->inband_task.index = 0; 2822 vd->inband_task.type = VD_FINAL_RANGE_TASK; /* range == 1 */ 2823 } 2824 2825 /* Return the device's block size and max transfer size to the client */ 2826 attr_msg->vdisk_block_size = vd->block_size; 2827 2828 attr_msg->max_xfer_sz = vd->max_xfer_sz; 2829 2830 attr_msg->vdisk_size = vd->vdisk_size; 2831 attr_msg->vdisk_type = vd->vdisk_type; 2832 attr_msg->vdisk_media = vd->vdisk_media; 2833 2834 /* Discover and save the list of supported VD_OP_XXX operations */ 2835 vd_set_exported_operations(vd); 2836 attr_msg->operations = vd->operations; 2837 2838 PR0("%s", VD_CLIENT(vd)); 2839 2840 ASSERT(vd->dring_task == NULL); 2841 2842 return (0); 2843 } 2844 2845 static int 2846 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2847 { 2848 int status; 2849 size_t expected; 2850 ldc_mem_info_t dring_minfo; 2851 vio_dring_reg_msg_t *reg_msg = (vio_dring_reg_msg_t *)msg; 2852 2853 2854 ASSERT(msglen >= sizeof (msg->tag)); 2855 2856 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2857 VIO_DRING_REG)) { 2858 PR0("Message is not a register-dring message"); 2859 return (ENOMSG); 2860 } 2861 2862 if (msglen < sizeof (*reg_msg)) { 2863 PR0("Expected at least %lu-byte register-dring message; " 2864 "received %lu bytes", sizeof (*reg_msg),
msglen); 2865 return (EBADMSG); 2866 } 2867 2868 expected = sizeof (*reg_msg) + 2869 (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0])); 2870 if (msglen != expected) { 2871 PR0("Expected %lu-byte register-dring message; " 2872 "received %lu bytes", expected, msglen); 2873 return (EBADMSG); 2874 } 2875 2876 if (vd->initialized & VD_DRING) { 2877 PR0("A dring was previously registered; only support one"); 2878 return (EBADMSG); 2879 } 2880 2881 if (reg_msg->num_descriptors > INT32_MAX) { 2882 PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)", 2883 reg_msg->num_descriptors, INT32_MAX, STRINGIZE(INT32_MAX)); 2884 return (EBADMSG); 2885 } 2886 2887 if (reg_msg->ncookies != 1) { 2888 /* 2889 * In addition to fixing the assertion in the success case 2890 * below, supporting drings which require more than one 2891 * "cookie" requires increasing the value of vd->max_msglen 2892 * somewhere in the code path prior to receiving the message 2893 * which results in calling this function. Note that without 2894 * making this change, the larger message size required to 2895 * accommodate multiple cookies cannot be successfully 2896 * received, so this function will not even get called. 2897 * Gracefully accommodating more dring cookies might 2898 * reasonably demand exchanging an additional attribute or 2899 * making a minor protocol adjustment 2900 */ 2901 PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies); 2902 return (EBADMSG); 2903 } 2904 2905 status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie, 2906 reg_msg->ncookies, reg_msg->num_descriptors, 2907 reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle); 2908 if (status != 0) { 2909 PR0("ldc_mem_dring_map() returned errno %d", status); 2910 return (status); 2911 } 2912 2913 /* 2914 * To remove the need for this assertion, must call 2915 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a 2916 * successful call to ldc_mem_dring_map() 2917 */ 2918 ASSERT(reg_msg->ncookies == 1); 2919 2920 if ((status = 2921 ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) { 2922 PR0("ldc_mem_dring_info() returned errno %d", status); 2923 if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0) 2924 PR0("ldc_mem_dring_unmap() returned errno %d", status); 2925 return (status); 2926 } 2927 2928 if (dring_minfo.vaddr == NULL) { 2929 PR0("Descriptor ring virtual address is NULL"); 2930 return (ENXIO); 2931 } 2932 2933 2934 /* Initialize for valid message and mapped dring */ 2935 PR1("descriptor size = %u, dring length = %u", 2936 reg_msg->descriptor_size, reg_msg->num_descriptors); 2937 vd->initialized |= VD_DRING; 2938 vd->dring_ident = 1; /* "There Can Be Only One" */ 2939 vd->dring = dring_minfo.vaddr; 2940 vd->descriptor_size = reg_msg->descriptor_size; 2941 vd->dring_len = reg_msg->num_descriptors; 2942 reg_msg->dring_ident = vd->dring_ident; 2943 2944 /* 2945 * Allocate and initialize a "shadow" array of data structures for 2946 * tasks to process I/O requests in dring elements 2947 */ 2948 vd->dring_task = 2949 kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP); 2950 for (int i = 0; i < vd->dring_len; i++) { 2951 vd->dring_task[i].vd = vd; 2952 vd->dring_task[i].index = i; 2953 vd->dring_task[i].request = &VD_DRING_ELEM(i)->payload; 2954 2955 status = ldc_mem_alloc_handle(vd->ldc_handle, 2956 &(vd->dring_task[i].mhdl)); 2957 if (status) { 2958 PR0("ldc_mem_alloc_handle() returned err %d ", status); 2959 return (ENXIO); 2960 } 2961 2962 vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 2963 } 2964 2965 return (0);
2966 } 2967 2968 static int 2969 vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2970 { 2971 vio_dring_unreg_msg_t *unreg_msg = (vio_dring_unreg_msg_t *)msg; 2972 2973 2974 ASSERT(msglen >= sizeof (msg->tag)); 2975 2976 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2977 VIO_DRING_UNREG)) { 2978 PR0("Message is not an unregister-dring message"); 2979 return (ENOMSG); 2980 } 2981 2982 if (msglen != sizeof (*unreg_msg)) { 2983 PR0("Expected %lu-byte unregister-dring message; " 2984 "received %lu bytes", sizeof (*unreg_msg), msglen); 2985 return (EBADMSG); 2986 } 2987 2988 if (unreg_msg->dring_ident != vd->dring_ident) { 2989 PR0("Expected dring ident %lu; received %lu", 2990 vd->dring_ident, unreg_msg->dring_ident); 2991 return (EBADMSG); 2992 } 2993 2994 return (0); 2995 } 2996 2997 static int 2998 process_rdx_msg(vio_msg_t *msg, size_t msglen) 2999 { 3000 ASSERT(msglen >= sizeof (msg->tag)); 3001 3002 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) { 3003 PR0("Message is not an RDX message"); 3004 return (ENOMSG); 3005 } 3006 3007 if (msglen != sizeof (vio_rdx_msg_t)) { 3008 PR0("Expected %lu-byte RDX message; received %lu bytes", 3009 sizeof (vio_rdx_msg_t), msglen); 3010 return (EBADMSG); 3011 } 3012 3013 PR0("Valid RDX message"); 3014 return (0); 3015 } 3016 3017 static int 3018 vd_check_seq_num(vd_t *vd, uint64_t seq_num) 3019 { 3020 if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) { 3021 PR0("Received seq_num %lu; expected %lu", 3022 seq_num, (vd->seq_num + 1)); 3023 PR0("initiating soft reset"); 3024 vd_need_reset(vd, B_FALSE); 3025 return (1); 3026 } 3027 3028 vd->seq_num = seq_num; 3029 vd->initialized |= VD_SEQ_NUM; /* superfluous after first time... */ 3030 return (0); 3031 } 3032 3033 /* 3034 * Return the expected size of an inband-descriptor message with all the 3035 * cookies it claims to include 3036 */ 3037 static size_t 3038 expected_inband_size(vd_dring_inband_msg_t *msg) 3039 { 3040 return ((sizeof (*msg)) + 3041 (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0]))); 3042 } 3043 3044 /* 3045 * Process an in-band descriptor message: used with clients like OBP, with 3046 * which vds exchanges descriptors within VIO message payloads, rather than 3047 * operating on them within a descriptor ring 3048 */ 3049 static int 3050 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3051 { 3052 size_t expected; 3053 vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg; 3054 3055 3056 ASSERT(msglen >= sizeof (msg->tag)); 3057 3058 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 3059 VIO_DESC_DATA)) { 3060 PR1("Message is not an in-band-descriptor message"); 3061 return (ENOMSG); 3062 } 3063 3064 if (msglen < sizeof (*desc_msg)) { 3065 PR0("Expected at least %lu-byte descriptor message; " 3066 "received %lu bytes", sizeof (*desc_msg), msglen); 3067 return (EBADMSG); 3068 } 3069 3070 if (msglen != (expected = expected_inband_size(desc_msg))) { 3071 PR0("Expected %lu-byte descriptor message; " 3072 "received %lu bytes", expected, msglen); 3073 return (EBADMSG); 3074 } 3075 3076 if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) 3077 return (EBADMSG); 3078 3079 /* 3080 * Valid message: Set up the in-band descriptor task and process the 3081 * request. 
Arrange to acknowledge the client's message, unless an 3082 * error processing the descriptor task results in setting 3083 * VIO_SUBTYPE_NACK 3084 */ 3085 PR1("Valid in-band-descriptor message"); 3086 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 3087 3088 ASSERT(vd->inband_task.msg != NULL); 3089 3090 bcopy(msg, vd->inband_task.msg, msglen); 3091 vd->inband_task.msglen = msglen; 3092 3093 /* 3094 * The task request is now the payload of the message 3095 * that was just copied into the body of the task. 3096 */ 3097 desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg; 3098 vd->inband_task.request = &desc_msg->payload; 3099 3100 return (vd_process_task(&vd->inband_task)); 3101 } 3102 3103 static int 3104 vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx, 3105 vio_msg_t *msg, size_t msglen) 3106 { 3107 int status; 3108 boolean_t ready; 3109 vd_dring_entry_t *elem = VD_DRING_ELEM(idx); 3110 3111 3112 /* Accept the updated dring element */ 3113 if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { 3114 PR0("ldc_mem_dring_acquire() returned errno %d", status); 3115 return (status); 3116 } 3117 ready = (elem->hdr.dstate == VIO_DESC_READY); 3118 if (ready) { 3119 elem->hdr.dstate = VIO_DESC_ACCEPTED; 3120 } else { 3121 PR0("descriptor %u not ready", idx); 3122 VD_DUMP_DRING_ELEM(elem); 3123 } 3124 if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 3125 PR0("ldc_mem_dring_release() returned errno %d", status); 3126 return (status); 3127 } 3128 if (!ready) 3129 return (EBUSY); 3130 3131 3132 /* Initialize a task and process the accepted element */ 3133 PR1("Processing dring element %u", idx); 3134 vd->dring_task[idx].type = type; 3135 3136 /* duplicate msg buf for cookies etc. */ 3137 bcopy(msg, vd->dring_task[idx].msg, msglen); 3138 3139 vd->dring_task[idx].msglen = msglen; 3140 return (vd_process_task(&vd->dring_task[idx])); 3141 } 3142 3143 static int 3144 vd_process_element_range(vd_t *vd, int start, int end, 3145 vio_msg_t *msg, size_t msglen) 3146 { 3147 int i, n, nelem, status = 0; 3148 boolean_t inprogress = B_FALSE; 3149 vd_task_type_t type; 3150 3151 3152 ASSERT(start >= 0); 3153 ASSERT(end >= 0); 3154 3155 /* 3156 * Arrange to acknowledge the client's message, unless an error 3157 * processing one of the dring elements results in setting 3158 * VIO_SUBTYPE_NACK 3159 */ 3160 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 3161 3162 /* 3163 * Process the dring elements in the range 3164 */ 3165 nelem = ((end < start) ? end + vd->dring_len : end) - start + 1; 3166 for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) { 3167 ((vio_dring_msg_t *)msg)->end_idx = i; 3168 type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK; 3169 status = vd_process_element(vd, type, i, msg, msglen); 3170 if (status == EINPROGRESS) 3171 inprogress = B_TRUE; 3172 else if (status != 0) 3173 break; 3174 } 3175 3176 /* 3177 * If some, but not all, operations of a multi-element range are in 3178 * progress, wait for other operations to complete before returning 3179 * (which will result in "ack" or "nack" of the message). Note that 3180 * all outstanding operations will need to complete, not just the ones 3181 * corresponding to the current range of dring elements; however, as 3182 * this situation is an error case, performance is less critical.
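 *
 * For example, if elements 5-8 were submitted, element 5 is still
 * EINPROGRESS and element 7 then fails synchronously, the loop
 * above exits early and ddi_taskq_wait() below drains the
 * completion queue (finishing element 5) before the message is
 * "nack"ed.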
3183 */ 3184 if ((nelem > 1) && (status != EINPROGRESS) && inprogress) 3185 ddi_taskq_wait(vd->completionq); 3186 3187 return (status); 3188 } 3189 3190 static int 3191 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3192 { 3193 vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg; 3194 3195 3196 ASSERT(msglen >= sizeof (msg->tag)); 3197 3198 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 3199 VIO_DRING_DATA)) { 3200 PR1("Message is not a dring-data message"); 3201 return (ENOMSG); 3202 } 3203 3204 if (msglen != sizeof (*dring_msg)) { 3205 PR0("Expected %lu-byte dring message; received %lu bytes", 3206 sizeof (*dring_msg), msglen); 3207 return (EBADMSG); 3208 } 3209 3210 if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) 3211 return (EBADMSG); 3212 3213 if (dring_msg->dring_ident != vd->dring_ident) { 3214 PR0("Expected dring ident %lu; received ident %lu", 3215 vd->dring_ident, dring_msg->dring_ident); 3216 return (EBADMSG); 3217 } 3218 3219 if (dring_msg->start_idx >= vd->dring_len) { 3220 PR0("\"start_idx\" = %u; must be less than %u", 3221 dring_msg->start_idx, vd->dring_len); 3222 return (EBADMSG); 3223 } 3224 3225 if ((dring_msg->end_idx < 0) || 3226 (dring_msg->end_idx >= vd->dring_len)) { 3227 PR0("\"end_idx\" = %u; must be >= 0 and less than %u", 3228 dring_msg->end_idx, vd->dring_len); 3229 return (EBADMSG); 3230 } 3231 3232 /* Valid message; process range of updated dring elements */ 3233 PR1("Processing descriptor range, start = %u, end = %u", 3234 dring_msg->start_idx, dring_msg->end_idx); 3235 return (vd_process_element_range(vd, dring_msg->start_idx, 3236 dring_msg->end_idx, msg, msglen)); 3237 } 3238 3239 static int 3240 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes) 3241 { 3242 int retry, status; 3243 size_t size = *nbytes; 3244 3245 3246 for (retry = 0, status = ETIMEDOUT; 3247 retry < vds_ldc_retries && status == ETIMEDOUT; 3248 retry++) { 3249 PR1("ldc_read() attempt %d", (retry + 1)); 3250 *nbytes = size; 3251 status = ldc_read(ldc_handle, msg, nbytes); 3252 } 3253 3254 if (status) { 3255 PR0("ldc_read() returned errno %d", status); 3256 if (status != ECONNRESET) 3257 return (ENOMSG); 3258 return (status); 3259 } else if (*nbytes == 0) { 3260 PR1("ldc_read() returned 0 and no message read"); 3261 return (ENOMSG); 3262 } 3263 3264 PR1("RCVD %lu-byte message", *nbytes); 3265 return (0); 3266 } 3267 3268 static int 3269 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3270 { 3271 int status; 3272 3273 3274 PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype, 3275 msg->tag.vio_subtype, msg->tag.vio_subtype_env); 3276 #ifdef DEBUG 3277 vd_decode_tag(msg); 3278 #endif 3279 3280 /* 3281 * Validate session ID up front, since it applies to all messages 3282 * once set 3283 */ 3284 if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) { 3285 PR0("Expected SID %u, received %u", vd->sid, 3286 msg->tag.vio_sid); 3287 return (EBADMSG); 3288 } 3289 3290 PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state)); 3291 3292 /* 3293 * Process the received message based on connection state 3294 */ 3295 switch (vd->state) { 3296 case VD_STATE_INIT: /* expect version message */ 3297 if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0) 3298 return (status); 3299 3300 /* Version negotiated, move to that state */ 3301 vd->state = VD_STATE_VER; 3302 return (0); 3303 3304 case VD_STATE_VER: /* expect attribute message */ 3305 if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0) 3306 return (status); 3307 
3308 /* Attributes exchanged, move to that state */ 3309 vd->state = VD_STATE_ATTR; 3310 return (0); 3311 3312 case VD_STATE_ATTR: 3313 switch (vd->xfer_mode) { 3314 case VIO_DESC_MODE: /* expect RDX message */ 3315 if ((status = process_rdx_msg(msg, msglen)) != 0) 3316 return (status); 3317 3318 /* Ready to receive in-band descriptors */ 3319 vd->state = VD_STATE_DATA; 3320 return (0); 3321 3322 case VIO_DRING_MODE: /* expect register-dring message */ 3323 if ((status = 3324 vd_process_dring_reg_msg(vd, msg, msglen)) != 0) 3325 return (status); 3326 3327 /* One dring negotiated, move to that state */ 3328 vd->state = VD_STATE_DRING; 3329 return (0); 3330 3331 default: 3332 ASSERT("Unsupported transfer mode"); 3333 PR0("Unsupported transfer mode"); 3334 return (ENOTSUP); 3335 } 3336 3337 case VD_STATE_DRING: /* expect RDX, register-dring, or unreg-dring */ 3338 if ((status = process_rdx_msg(msg, msglen)) == 0) { 3339 /* Ready to receive data */ 3340 vd->state = VD_STATE_DATA; 3341 return (0); 3342 } else if (status != ENOMSG) { 3343 return (status); 3344 } 3345 3346 3347 /* 3348 * If another register-dring message is received, stay in 3349 * dring state in case the client sends RDX; although the 3350 * protocol allows multiple drings, this server does not 3351 * support using more than one 3352 */ 3353 if ((status = 3354 vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG) 3355 return (status); 3356 3357 /* 3358 * Acknowledge an unregister-dring message, but reset the 3359 * connection anyway: Although the protocol allows 3360 * unregistering drings, this server cannot serve a vdisk 3361 * without its only dring 3362 */ 3363 status = vd_process_dring_unreg_msg(vd, msg, msglen); 3364 return ((status == 0) ? ENOTSUP : status); 3365 3366 case VD_STATE_DATA: 3367 switch (vd->xfer_mode) { 3368 case VIO_DESC_MODE: /* expect in-band-descriptor message */ 3369 return (vd_process_desc_msg(vd, msg, msglen)); 3370 3371 case VIO_DRING_MODE: /* expect dring-data or unreg-dring */ 3372 /* 3373 * Typically expect dring-data messages, so handle 3374 * them first 3375 */ 3376 if ((status = vd_process_dring_msg(vd, msg, 3377 msglen)) != ENOMSG) 3378 return (status); 3379 3380 /* 3381 * Acknowledge an unregister-dring message, but reset 3382 * the connection anyway: Although the protocol 3383 * allows unregistering drings, this server cannot 3384 * serve a vdisk without its only dring 3385 */ 3386 status = vd_process_dring_unreg_msg(vd, msg, msglen); 3387 return ((status == 0) ? 
ENOTSUP : status); 3388 3389 default: 3390 ASSERT("Unsupported transfer mode"); 3391 PR0("Unsupported transfer mode"); 3392 return (ENOTSUP); 3393 } 3394 3395 default: 3396 ASSERT("Invalid client connection state"); 3397 PR0("Invalid client connection state"); 3398 return (ENOTSUP); 3399 } 3400 } 3401 3402 static int 3403 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3404 { 3405 int status; 3406 boolean_t reset_ldc = B_FALSE; 3407 vd_task_t task; 3408 3409 /* 3410 * Check that the message is at least big enough for a "tag", so that 3411 * message processing can proceed based on tag-specified message type 3412 */ 3413 if (msglen < sizeof (vio_msg_tag_t)) { 3414 PR0("Received short (%lu-byte) message", msglen); 3415 /* Can't "nack" short message, so drop the big hammer */ 3416 PR0("initiating full reset"); 3417 vd_need_reset(vd, B_TRUE); 3418 return (EBADMSG); 3419 } 3420 3421 /* 3422 * Process the message 3423 */ 3424 switch (status = vd_do_process_msg(vd, msg, msglen)) { 3425 case 0: 3426 /* "ack" valid, successfully-processed messages */ 3427 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 3428 break; 3429 3430 case EINPROGRESS: 3431 /* The completion handler will "ack" or "nack" the message */ 3432 return (EINPROGRESS); 3433 case ENOMSG: 3434 PR0("Received unexpected message"); 3435 _NOTE(FALLTHROUGH); 3436 case EBADMSG: 3437 case ENOTSUP: 3438 /* "transport" error will cause NACK of invalid messages */ 3439 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 3440 break; 3441 3442 default: 3443 /* "transport" error will cause NACK of invalid messages */ 3444 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 3445 /* An LDC error probably occurred, so try resetting it */ 3446 reset_ldc = B_TRUE; 3447 break; 3448 } 3449 3450 PR1("\tResulting in state %d (%s)", vd->state, 3451 vd_decode_state(vd->state)); 3452 3453 /* populate the task so we can dispatch it on the taskq */ 3454 task.vd = vd; 3455 task.msg = msg; 3456 task.msglen = msglen; 3457 3458 /* 3459 * Queue a task to send the notification that the operation completed. 3460 * We need to ensure that requests are responded to in the correct 3461 * order and since the taskq is processed serially this ordering 3462 * is maintained. 3463 */ 3464 (void) ddi_taskq_dispatch(vd->completionq, vd_serial_notify, 3465 &task, DDI_SLEEP); 3466 3467 /* 3468 * To ensure handshake negotiations do not happen out of order, such 3469 * requests that come through this path should not be done in parallel 3470 * so we need to wait here until the response is sent to the client. 3471 */ 3472 ddi_taskq_wait(vd->completionq); 3473 3474 /* Arrange to reset the connection for nack'ed or failed messages */ 3475 if ((status != 0) || reset_ldc) { 3476 PR0("initiating %s reset", 3477 (reset_ldc) ? 
"full" : "soft"); 3478 vd_need_reset(vd, reset_ldc); 3479 } 3480 3481 return (status); 3482 } 3483 3484 static boolean_t 3485 vd_enabled(vd_t *vd) 3486 { 3487 boolean_t enabled; 3488 3489 mutex_enter(&vd->lock); 3490 enabled = vd->enabled; 3491 mutex_exit(&vd->lock); 3492 return (enabled); 3493 } 3494 3495 static void 3496 vd_recv_msg(void *arg) 3497 { 3498 vd_t *vd = (vd_t *)arg; 3499 int rv = 0, status = 0; 3500 3501 ASSERT(vd != NULL); 3502 3503 PR2("New task to receive incoming message(s)"); 3504 3505 3506 while (vd_enabled(vd) && status == 0) { 3507 size_t msglen, msgsize; 3508 ldc_status_t lstatus; 3509 3510 /* 3511 * Receive and process a message 3512 */ 3513 vd_reset_if_needed(vd); /* can change vd->max_msglen */ 3514 3515 /* 3516 * check if channel is UP - else break out of loop 3517 */ 3518 status = ldc_status(vd->ldc_handle, &lstatus); 3519 if (lstatus != LDC_UP) { 3520 PR0("channel not up (status=%d), exiting recv loop\n", 3521 lstatus); 3522 break; 3523 } 3524 3525 ASSERT(vd->max_msglen != 0); 3526 3527 msgsize = vd->max_msglen; /* stable copy for alloc/free */ 3528 msglen = msgsize; /* actual len after recv_msg() */ 3529 3530 status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen); 3531 switch (status) { 3532 case 0: 3533 rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp, 3534 msglen); 3535 /* check if max_msglen changed */ 3536 if (msgsize != vd->max_msglen) { 3537 PR0("max_msglen changed 0x%lx to 0x%lx bytes\n", 3538 msgsize, vd->max_msglen); 3539 kmem_free(vd->vio_msgp, msgsize); 3540 vd->vio_msgp = 3541 kmem_alloc(vd->max_msglen, KM_SLEEP); 3542 } 3543 if (rv == EINPROGRESS) 3544 continue; 3545 break; 3546 3547 case ENOMSG: 3548 break; 3549 3550 case ECONNRESET: 3551 PR0("initiating soft reset (ECONNRESET)\n"); 3552 vd_need_reset(vd, B_FALSE); 3553 status = 0; 3554 break; 3555 3556 default: 3557 /* Probably an LDC failure; arrange to reset it */ 3558 PR0("initiating full reset (status=0x%x)", status); 3559 vd_need_reset(vd, B_TRUE); 3560 break; 3561 } 3562 } 3563 3564 PR2("Task finished"); 3565 } 3566 3567 static uint_t 3568 vd_handle_ldc_events(uint64_t event, caddr_t arg) 3569 { 3570 vd_t *vd = (vd_t *)(void *)arg; 3571 int status; 3572 3573 ASSERT(vd != NULL); 3574 3575 if (!vd_enabled(vd)) 3576 return (LDC_SUCCESS); 3577 3578 if (event & LDC_EVT_DOWN) { 3579 PR0("LDC_EVT_DOWN: LDC channel went down"); 3580 3581 vd_need_reset(vd, B_TRUE); 3582 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 3583 DDI_SLEEP); 3584 if (status == DDI_FAILURE) { 3585 PR0("cannot schedule task to recv msg\n"); 3586 vd_need_reset(vd, B_TRUE); 3587 } 3588 } 3589 3590 if (event & LDC_EVT_RESET) { 3591 PR0("LDC_EVT_RESET: LDC channel was reset"); 3592 3593 if (vd->state != VD_STATE_INIT) { 3594 PR0("scheduling full reset"); 3595 vd_need_reset(vd, B_FALSE); 3596 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 3597 vd, DDI_SLEEP); 3598 if (status == DDI_FAILURE) { 3599 PR0("cannot schedule task to recv msg\n"); 3600 vd_need_reset(vd, B_TRUE); 3601 } 3602 3603 } else { 3604 PR0("channel already reset, ignoring...\n"); 3605 PR0("doing ldc up...\n"); 3606 (void) ldc_up(vd->ldc_handle); 3607 } 3608 3609 return (LDC_SUCCESS); 3610 } 3611 3612 if (event & LDC_EVT_UP) { 3613 PR0("EVT_UP: LDC is up\nResetting client connection state"); 3614 PR0("initiating soft reset"); 3615 vd_need_reset(vd, B_FALSE); 3616 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 3617 vd, DDI_SLEEP); 3618 if (status == DDI_FAILURE) { 3619 PR0("cannot schedule task to recv msg\n"); 3620 vd_need_reset(vd, 
B_TRUE); 3621 return (LDC_SUCCESS); 3622 } 3623 } 3624 3625 if (event & LDC_EVT_READ) { 3626 int status; 3627 3628 PR1("New data available"); 3629 /* Queue a task to receive the new data */ 3630 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 3631 DDI_SLEEP); 3632 3633 if (status == DDI_FAILURE) { 3634 PR0("cannot schedule task to recv msg\n"); 3635 vd_need_reset(vd, B_TRUE); 3636 } 3637 } 3638 3639 return (LDC_SUCCESS); 3640 } 3641 3642 static uint_t 3643 vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg) 3644 { 3645 _NOTE(ARGUNUSED(key, val)) 3646 (*((uint_t *)arg))++; 3647 return (MH_WALK_TERMINATE); 3648 } 3649 3650 3651 static int 3652 vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 3653 { 3654 uint_t vd_present = 0; 3655 minor_t instance; 3656 vds_t *vds; 3657 3658 3659 switch (cmd) { 3660 case DDI_DETACH: 3661 /* the real work happens below */ 3662 break; 3663 case DDI_SUSPEND: 3664 PR0("No action required for DDI_SUSPEND"); 3665 return (DDI_SUCCESS); 3666 default: 3667 PR0("Unrecognized \"cmd\""); 3668 return (DDI_FAILURE); 3669 } 3670 3671 ASSERT(cmd == DDI_DETACH); 3672 instance = ddi_get_instance(dip); 3673 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { 3674 PR0("Could not get state for instance %u", instance); 3675 ddi_soft_state_free(vds_state, instance); 3676 return (DDI_FAILURE); 3677 } 3678 3679 /* Do not detach when serving any vdisks */ 3680 mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present); 3681 if (vd_present) { 3682 PR0("Not detaching because serving vdisks"); 3683 return (DDI_FAILURE); 3684 } 3685 3686 PR0("Detaching"); 3687 if (vds->initialized & VDS_MDEG) { 3688 (void) mdeg_unregister(vds->mdeg); 3689 kmem_free(vds->ispecp->specp, sizeof (vds_prop_template)); 3690 kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t)); 3691 vds->ispecp = NULL; 3692 vds->mdeg = NULL; 3693 } 3694 3695 if (vds->initialized & VDS_LDI) 3696 (void) ldi_ident_release(vds->ldi_ident); 3697 mod_hash_destroy_hash(vds->vd_table); 3698 ddi_soft_state_free(vds_state, instance); 3699 return (DDI_SUCCESS); 3700 } 3701 3702 static boolean_t 3703 is_pseudo_device(dev_info_t *dip) 3704 { 3705 dev_info_t *parent, *root = ddi_root_node(); 3706 3707 3708 for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root); 3709 parent = ddi_get_parent(parent)) { 3710 if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0) 3711 return (B_TRUE); 3712 } 3713 3714 return (B_FALSE); 3715 } 3716 3717 /* 3718 * Description: 3719 * This function checks to see if the file being used as a 3720 * virtual disk is an ISO image. An ISO image is a special 3721 * case which can be booted or installed from, like a CD/DVD 3722 * 3723 * Parameters: 3724 * vd - disk on which the operation is performed. 3725 * 3726 * Return Code: 3727 * B_TRUE - The file is an ISO 9660 compliant image 3728 * B_FALSE - just a regular disk image file 3729 */ 3730 static boolean_t 3731 vd_file_is_iso_image(vd_t *vd) 3732 { 3733 char iso_buf[ISO_SECTOR_SIZE]; 3734 int i, rv; 3735 uint_t sec; 3736 3737 ASSERT(vd->file); 3738 3739 /* 3740 * If we have already discovered and saved this info we can 3741 * short-circuit the check and avoid reading the file. 3742 */ 3743 if (vd->vdisk_media == VD_MEDIA_DVD || vd->vdisk_media == VD_MEDIA_CD) 3744 return (B_TRUE); 3745 3746 /* 3747 * We wish to read the sector that should contain the 2nd ISO volume 3748 * descriptor.
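(With a 512-byte vdisk_block_size, that works out below to sec = (ISO_VOLDESC_SEC * ISO_SECTOR_SIZE) / 512 = (16 * 2048) / 512 = 64.)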
The second field in this descriptor is called the 3749 * Standard Identifier and is set to CD001 for a CD-ROM compliant 3750 * to the ISO 9660 standard. 3751 */ 3752 sec = (ISO_VOLDESC_SEC * ISO_SECTOR_SIZE) / vd->vdisk_block_size; 3753 rv = vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)iso_buf, 3754 sec, ISO_SECTOR_SIZE); 3755 3756 if (rv < 0) 3757 return (B_FALSE); 3758 3759 for (i = 0; i < ISO_ID_STRLEN; i++) { 3760 if (ISO_STD_ID(iso_buf)[i] != ISO_ID_STRING[i]) 3761 return (B_FALSE); 3762 } 3763 3764 return (B_TRUE); 3765 } 3766 3767 /* 3768 * Description: 3769 * This function checks to see if the virtual device is an ATAPI 3770 * device. ATAPI devices use Group 1 Read/Write commands, so 3771 * any USCSI calls vds makes need to take this into account. 3772 * 3773 * Parameters: 3774 * vd - disk on which the operation is performed. 3775 * 3776 * Return Code: 3777 * B_TRUE - The virtual disk is backed by an ATAPI device 3778 * B_FALSE - not an ATAPI device (presumably SCSI) 3779 */ 3780 static boolean_t 3781 vd_is_atapi_device(vd_t *vd) 3782 { 3783 boolean_t is_atapi = B_FALSE; 3784 char *variantp; 3785 int rv; 3786 3787 ASSERT(vd->ldi_handle[0] != NULL); 3788 ASSERT(!vd->file); 3789 3790 rv = ldi_prop_lookup_string(vd->ldi_handle[0], 3791 (LDI_DEV_T_ANY | DDI_PROP_DONTPASS), "variant", &variantp); 3792 if (rv == DDI_PROP_SUCCESS) { 3793 PR0("'variant' property exists for %s", vd->device_path); 3794 if (strcmp(variantp, "atapi") == 0) 3795 is_atapi = B_TRUE; 3796 ddi_prop_free(variantp); 3797 } 3798 3799 rv = ldi_prop_exists(vd->ldi_handle[0], LDI_DEV_T_ANY, "atapi"); 3800 if (rv) { 3801 PR0("'atapi' property exists for %s", vd->device_path); 3802 is_atapi = B_TRUE; 3803 } 3804 3805 return (is_atapi); 3806 } 3807 3808 static int 3809 vd_setup_full_disk(vd_t *vd) 3810 { 3811 int rval, status; 3812 major_t major = getmajor(vd->dev[0]); 3813 minor_t minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE; 3814 struct dk_minfo dk_minfo; 3815 3816 ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK); 3817 3818 /* 3819 * At this point, vdisk_size is set to the size of partition 2 but 3820 * this does not represent the size of the disk because partition 2 3821 * may not cover the entire disk and its size does not include reserved 3822 * blocks. So we update vdisk_size to be the size of the entire disk. 
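 *
 * (DKIOCGMEDIAINFO reports the capacity of the whole medium:
 * dki_capacity is a block count in units of dki_lbsize bytes, so
 * e.g. dki_capacity = 71132959 with dki_lbsize = 512 describes a
 * disk of roughly 36 GB.)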
3823 */
3824 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO,
3825 (intptr_t)&dk_minfo, (vd->open_flags | FKIOCTL),
3826 kcred, &rval)) != 0) {
3827 PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d",
3828 status);
3829 return (status);
3830 }
3831 vd->vdisk_size = dk_minfo.dki_capacity;
3832 vd->block_size = dk_minfo.dki_lbsize;
3833 vd->vdisk_media = DK_MEDIATYPE2VD_MEDIATYPE(dk_minfo.dki_media_type);
3834 vd->vdisk_block_size = DEV_BSIZE;
3835
3836 /* Move dev number and LDI handle to entire-disk-slice array elements */
3837 vd->dev[VD_ENTIRE_DISK_SLICE] = vd->dev[0];
3838 vd->dev[0] = 0;
3839 vd->ldi_handle[VD_ENTIRE_DISK_SLICE] = vd->ldi_handle[0];
3840 vd->ldi_handle[0] = NULL;
3841
3842 /* Initialize device numbers for remaining slices and open them */
3843 for (int slice = 0; slice < vd->nslices; slice++) {
3844 /*
3845 * Skip the entire-disk slice, as it's already open and its
3846 * device known
3847 */
3848 if (slice == VD_ENTIRE_DISK_SLICE)
3849 continue;
3850 ASSERT(vd->dev[slice] == 0);
3851 ASSERT(vd->ldi_handle[slice] == NULL);
3852
3853 /*
3854 * Construct the device number for the current slice
3855 */
3856 vd->dev[slice] = makedevice(major, (minor + slice));
3857
3858 /*
3859 * Open all slices of the disk to serve them to the client.
3860 * Slices are opened exclusively to prevent other threads or
3861 * processes in the service domain from performing I/O to
3862 * slices being accessed by a client. Failure to open a slice
3863 * results in vds not serving this disk, as the client could
3864 * attempt (and should be able to) access any slice immediately.
3865 * Any slices successfully opened before a failure will get
3866 * closed by vds_destroy_vd() as a result of the error returned
3867 * by this function.
3868 *
3869 * We may need to do the open with FNDELAY so that opening an
3870 * empty slice does not fail (see below).
3871 */
3872 PR0("Opening device major %u, minor %u = slice %u",
3873 major, minor, slice);
3874
3875 /*
3876 * Try to open the device. This can fail for example if we are
3877 * opening an empty slice. So in case of a failure, we try the
3878 * open again but this time with the FNDELAY flag.
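 *
 * (Opening a zero-length slice without FNDELAY typically fails
 * with ENXIO, so the FNDELAY retry below is what allows disks
 * containing unconfigured slices to still be served.)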
3879 */
3880 status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
3881 vd->open_flags, kcred, &vd->ldi_handle[slice],
3882 vd->vds->ldi_ident);
3883
3884 if (status != 0) {
3885 status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
3886 vd->open_flags | FNDELAY, kcred,
3887 &vd->ldi_handle[slice], vd->vds->ldi_ident);
3888 }
3889
3890 if (status != 0) {
3891 PRN("ldi_open_by_dev() returned errno %d "
3892 "for slice %u", status, slice);
3893 /* vds_destroy_vd() will close any open slices */
3894 vd->ldi_handle[slice] = NULL;
3895 return (status);
3896 }
3897 }
3898
3899 return (0);
3900 }
3901
3902 static int
3903 vd_setup_partition_vtoc(vd_t *vd)
3904 {
3905 int rval, status;
3906 char *device_path = vd->device_path;
3907
3908 status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM,
3909 (intptr_t)&vd->dk_geom, (vd->open_flags | FKIOCTL), kcred, &rval);
3910
3911 if (status != 0) {
3912 PRN("ldi_ioctl(DKIOCGGEOM) returned errno %d for %s",
3913 status, device_path);
3914 return (status);
3915 }
3916
3917 /* Initialize dk_geom structure for single-slice device */
3918 if (vd->dk_geom.dkg_nsect == 0) {
3919 PRN("%s geometry claims 0 sectors per track", device_path);
3920 return (EIO);
3921 }
3922 if (vd->dk_geom.dkg_nhead == 0) {
3923 PRN("%s geometry claims 0 heads", device_path);
3924 return (EIO);
3925 }
3926 vd->dk_geom.dkg_ncyl = vd->vdisk_size / vd->dk_geom.dkg_nsect /
3927 vd->dk_geom.dkg_nhead;
3928 vd->dk_geom.dkg_acyl = 0;
3929 vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl;
3930
3931
3932 /* Initialize vtoc structure for single-slice device */
3933 bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume,
3934 MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume)));
3935 bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part));
3936 vd->vtoc.v_nparts = 1;
3937 vd->vtoc.v_part[0].p_tag = V_UNASSIGNED;
3938 vd->vtoc.v_part[0].p_flag = 0;
3939 vd->vtoc.v_part[0].p_start = 0;
3940 vd->vtoc.v_part[0].p_size = vd->vdisk_size;
3941 bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel,
3942 MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel)));
3943
3944 return (0);
3945 }
3946
3947 static int
3948 vd_setup_partition_efi(vd_t *vd)
3949 {
3950 efi_gpt_t *gpt;
3951 efi_gpe_t *gpe;
3952 struct uuid uuid = EFI_RESERVED;
3953 uint32_t crc;
3954 int length;
3955
3956 length = sizeof (efi_gpt_t) + sizeof (efi_gpe_t);
3957
3958 gpt = kmem_zalloc(length, KM_SLEEP);
3959 gpe = (efi_gpe_t *)(gpt + 1);
3960
3961 gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE);
3962 gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
3963 gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t));
3964 gpt->efi_gpt_FirstUsableLBA = LE_64(0ULL);
3965 gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1);
3966 gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1);
3967 gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t));
3968
3969 UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid);
3970 gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA;
3971 gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA;
3972
3973 CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table);
3974 gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
3975
3976 CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table);
3977 gpt->efi_gpt_HeaderCRC32 = LE_32(~crc);
3978
3979 vd->dk_efi.dki_lba = 0;
3980 vd->dk_efi.dki_length = length;
3981 vd->dk_efi.dki_data = gpt;
3982
3983 return (0);
3984 }
3985
3986 /*
3987 * Setup for a virtual disk whose backend is a file (exported as a single slice
3988 * or as a full disk) or a pseudo device (for example a ZFS, SVM or VxVM volume)
3989 * exported as a full disk. In these cases, the backend is accessed using the
3990 * vnode interface.
3991 */
3992 static int
3993 vd_setup_backend_vnode(vd_t *vd)
3994 {
3995 int rval, status;
3996 vattr_t vattr;
3997 dev_t dev;
3998 char *file_path = vd->device_path;
3999 char dev_path[MAXPATHLEN + 1];
4000 ldi_handle_t lhandle;
4001 struct dk_cinfo dk_cinfo;
4002
4003 if ((status = vn_open(file_path, UIO_SYSSPACE, vd->open_flags | FOFFMAX,
4004 0, &vd->file_vnode, 0, 0)) != 0) {
4005 PRN("vn_open(%s) = errno %d", file_path, status);
4006 return (status);
4007 }
4008
4009 /*
4010 * We set vd->file now so that vds_destroy_vd will take care of
4011 * closing the file and releasing the vnode in case of an error.
4012 */
4013 vd->file = B_TRUE;
4014
4015 vattr.va_mask = AT_SIZE;
4016 if ((status = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred, NULL))
4017 != 0) {
4018 PRN("VOP_GETATTR(%s) = errno %d", file_path, status);
4019 return (EIO);
4020 }
4021
4022 vd->file_size = vattr.va_size;
4023 /* size should be at least sizeof(dk_label) */
4024 if (vd->file_size < sizeof (struct dk_label)) {
4025 PRN("Size of file has to be at least %ld bytes",
4026 sizeof (struct dk_label));
4027 return (EIO);
4028 }
4029
4030 if (vd->file_vnode->v_flag & VNOMAP) {
4031 PRN("File %s cannot be mapped", file_path);
4032 return (EIO);
4033 }
4034
4035 /*
4036 * Find and validate the geometry of a disk image. For a single slice
4037 * disk image, this will build a fake geometry and vtoc.
4038 */
4039 status = vd_file_validate_geometry(vd);
4040 if (status != 0 && status != EINVAL) {
4041 PRN("Failed to read label from %s", file_path);
4042 return (EIO);
4043 }
4044
4045 /* sector size = block size = DEV_BSIZE */
4046 vd->block_size = DEV_BSIZE;
4047 vd->vdisk_block_size = DEV_BSIZE;
4048 vd->vdisk_size = vd->file_size / DEV_BSIZE;
4049 vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */
4050
4051 if (vd_file_is_iso_image(vd)) {
4052 /*
4053 * Indicate whether to call this a CD or DVD from the size
4054 * of the ISO image (images for both drive types are stored
4055 * in the ISO 9660 format). CDs can store up to just under 1 GB.
4056 */
4057 if ((vd->vdisk_size * vd->vdisk_block_size) >
4058 (1024 * 1024 * 1024))
4059 vd->vdisk_media = VD_MEDIA_DVD;
4060 else
4061 vd->vdisk_media = VD_MEDIA_CD;
4062 } else {
4063 vd->vdisk_media = VD_MEDIA_FIXED;
4064 }
4065
4066 /*
4067 * Get max_xfer_sz from the device on which the file resides, or
4068 * from the device itself if we have a pseudo device.
4069 */
4070 dev_path[0] = '\0';
4071
4072 if (vd->pseudo) {
4073 status = ldi_open_by_name(file_path, FREAD, kcred, &lhandle,
4074 vd->vds->ldi_ident);
4075 } else {
4076 dev = vd->file_vnode->v_vfsp->vfs_dev;
4077 if (ddi_dev_pathname(dev, S_IFBLK, dev_path) == DDI_SUCCESS) {
4078 PR0("underlying device = %s\n", dev_path);
4079 }
4080
4081 status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD, kcred, &lhandle,
4082 vd->vds->ldi_ident);
4083 }
4084
4085 if (status != 0) {
4086 PR0("ldi_open() returned errno %d for device %s",
4087 status, (dev_path[0] == '\0') ? file_path : dev_path);
4088 } else {
4089 if ((status = ldi_ioctl(lhandle, DKIOCINFO,
4090 (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred,
4091 &rval)) != 0) {
4092 PR0("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
4093 status, dev_path);
4094 } else {
4095 /*
4096 * Store the device's max transfer size for
4097 * return to the client
4098 */
4099 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
4100 }
4101
4102 PR0("close the device %s", dev_path);
4103 (void) ldi_close(lhandle, FREAD, kcred);
4104 }
4105
4106 PR0("using file %s, dev %s, max_xfer = %u blks",
4107 file_path, dev_path, vd->max_xfer_sz);
4108
4109 /* Setup devid for the disk image */
4110
4111 if (vd->vdisk_type == VD_DISK_TYPE_SLICE)
4112 return (0);
4113
4114 if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
4115
4116 status = vd_file_read_devid(vd, &vd->file_devid);
4117
4118 if (status == 0) {
4119 /* a valid devid was found */
4120 return (0);
4121 }
4122
4123 if (status != EINVAL) {
4124 /*
4125 * There was an error while trying to read the devid.
4126 * So this disk image may have a devid but we are
4127 * unable to read it.
4128 */
4129 PR0("cannot read devid for %s", file_path);
4130 vd->file_devid = NULL;
4131 return (0);
4132 }
4133 }
4134
4135 /*
4136 * No valid device id was found so we create one. Note that a failure
4137 * to create a device id is not fatal and does not prevent the disk
4138 * image from being attached.
4139 */
4140 PR1("creating devid for %s", file_path);
4141
4142 if (ddi_devid_init(vd->vds->dip, DEVID_FAB, NULL, 0,
4143 &vd->file_devid) != DDI_SUCCESS) {
4144 PR0("failed to create devid for %s", file_path);
4145 vd->file_devid = NULL;
4146 return (0);
4147 }
4148
4149 /*
4150 * Write devid to the disk image. The devid is stored into the disk
4151 * image if we have a valid label; otherwise the devid will be stored
4152 * when the user writes a valid label.
4153 */
4154 if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
4155 if (vd_file_write_devid(vd, vd->file_devid) != 0) {
4156 PR0("failed to write devid for %s", file_path);
4157 ddi_devid_free(vd->file_devid);
4158 vd->file_devid = NULL;
4159 }
4160 }
4161
4162 return (0);
4163 }
4164
4165
4166 /*
4167 * Description:
4168 * Open a device using its device path (supplied by ldm(1m))
4169 *
4170 * Parameters:
4171 * vd - pointer to structure containing the vDisk info
4172 *
4173 * Return Code:
4174 * 0 - success
4175 * EIO - Invalid number of partitions
4176 * != 0 - some other non-zero return value from ldi(9F) functions
4177 */
4178 static int
4179 vd_open_using_ldi_by_name(vd_t *vd)
4180 {
4181 int rval, status, open_flags;
4182 struct dk_cinfo dk_cinfo;
4183 char *device_path = vd->device_path;
4184
4185 /*
4186 * Try to open the device. If the flags indicate that the device should
4187 * be opened write-enabled, we first try to open it "read-only"
4188 * to see if we have an optical device such as a CD-ROM which, for
4189 * now, we do not permit writes to and thus should not export write
4190 * operations to the client.
4191 *
4192 * Future: if/when we implement support for guest domains writing to
4193 * optical devices we will need to do further checking of the media type
4194 * to distinguish between read-only and writable discs.
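 *
 * As an illustration (hypothetical backend path): if
 * /dev/dsk/c0t1d0s2 turns out to be a CD-ROM drive while write
 * access was requested, the code below simply drops FWRITE from
 * the open flags instead of failing the export, so the client
 * sees a read-only device.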
4195 */
4196 if (vd->open_flags & FWRITE) {
4197 open_flags = vd->open_flags & ~FWRITE;
4198 status = ldi_open_by_name(device_path, open_flags, kcred,
4199 &vd->ldi_handle[0], vd->vds->ldi_ident);
4200
4201 if (status == 0) {
4202 /* Verify backing device supports dk_cinfo */
4203 status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO,
4204 (intptr_t)&dk_cinfo, (open_flags | FKIOCTL),
4205 kcred, &rval);
4206 if (status != 0) {
4207 PRN("ldi_ioctl(DKIOCINFO) returned errno %d for"
4208 " %s opened as RO", status, device_path);
4209 return (status);
4210 }
4211
4212 if (dk_cinfo.dki_partition >= V_NUMPAR) {
4213 PRN("slice %u >= maximum slice %u for %s",
4214 dk_cinfo.dki_partition, V_NUMPAR,
4215 device_path);
4216 return (EIO);
4217 }
4218
4219 /*
4220 * If this is an optical device then we disable
4221 * write access and return, otherwise we close
4222 * the device and try again with writes enabled.
4223 */
4224 if (dk_cinfo.dki_ctype == DKC_CDROM) {
4225 vd->open_flags = open_flags;
4226 return (0);
4227 } else {
4228 (void) ldi_close(vd->ldi_handle[0],
4229 open_flags, kcred);
4230 }
4231 }
4232 }
4233
4234 /* Attempt to (re)open device with the original open flags */
4235 status = ldi_open_by_name(device_path, vd->open_flags, kcred,
4236 &vd->ldi_handle[0], vd->vds->ldi_ident);
4237
4238 /*
4239 * The open can fail for example if we are opening an empty slice.
4240 * In case of a failure, we try the open again but this time with
4241 * the FNDELAY flag.
4242 */
4243 if (status != 0)
4244 status = ldi_open_by_name(device_path, vd->open_flags | FNDELAY,
4245 kcred, &vd->ldi_handle[0], vd->vds->ldi_ident);
4246
4247 if (status != 0) {
4248 PR0("ldi_open_by_name(%s) = errno %d", device_path, status);
4249 vd->ldi_handle[0] = NULL;
4250 return (status);
4251 }
4252
4253 /* Verify backing device supports dk_cinfo */
4254 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO,
4255 (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred,
4256 &rval)) != 0) {
4257 PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
4258 status, device_path);
4259 return (status);
4260 }
4261 if (dk_cinfo.dki_partition >= V_NUMPAR) {
4262 PRN("slice %u >= maximum slice %u for %s",
4263 dk_cinfo.dki_partition, V_NUMPAR, device_path);
4264 return (EIO);
4265 }
4266
4267 return (0);
4268 }
4269
4270
4271 /*
4272 * Setup for a virtual disk whose backend is a device (a physical disk,
4273 * slice or pseudo device) that is directly exported either as a full disk
4274 * for a physical disk or as a slice for a pseudo device or a disk slice.
4275 * In these cases, the backend is accessed using the LDI interface.
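 *
 * For example (hypothetical paths): /dev/dsk/c1t0d0s2 would be
 * exported as a full disk, /dev/dsk/c1t0d0s0 as a single slice
 * disk, and a pseudo device exported with the "slice" option
 * would also be set up here as a single slice disk.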
4276 */
4277 static int
4278 vd_setup_backend_ldi(vd_t *vd)
4279 {
4280 int rval, status;
4281 struct dk_cinfo dk_cinfo;
4282 char *device_path = vd->device_path;
4283
4284 status = vd_open_using_ldi_by_name(vd);
4285 if (status != 0) {
4286 PR0("Failed to open (%s) = errno %d", device_path, status);
4287 return (status);
4288 }
4289
4290 vd->file = B_FALSE;
4291
4292 /* Get device number of backing device */
4293 if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) {
4294 PRN("ldi_get_dev() returned errno %d for %s",
4295 status, device_path);
4296 return (status);
4297 }
4298
4299 /* Verify backing device supports dk_cinfo */
4300 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO,
4301 (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred,
4302 &rval)) != 0) {
4303 PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
4304 status, device_path);
4305 return (status);
4306 }
4307 if (dk_cinfo.dki_partition >= V_NUMPAR) {
4308 PRN("slice %u >= maximum slice %u for %s",
4309 dk_cinfo.dki_partition, V_NUMPAR, device_path);
4310 return (EIO);
4311 }
4312
4313 vd->vdisk_label = vd_read_vtoc(vd, &vd->vtoc);
4314
4315 /* Store the device's max transfer size for return to the client */
4316 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
4317
4318 /*
4319 * We need to work out if it's an ATAPI (IDE CD-ROM) or SCSI device so
4320 * that we can use the correct CDB group when sending USCSI commands.
4321 */
4322 vd->is_atapi_dev = vd_is_atapi_device(vd);
4323
4324 /*
4325 * Export a full disk.
4326 *
4327 * When we use the LDI interface, we export a device as a full disk
4328 * if we have an entire disk slice (slice 2) and if this slice is
4329 * exported as a full disk and not as a single slice disk.
4330 * Similarly, we want to use LDI if we are accessing a CD or DVD
4331 * device (even if it isn't s2).
4332 *
4333 * Note that pseudo devices are exported as full disks using the vnode
4334 * interface, not the LDI interface.
4335 */
4336 if ((dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE &&
4337 vd->vdisk_type == VD_DISK_TYPE_DISK) ||
4338 dk_cinfo.dki_ctype == DKC_CDROM) {
4339 ASSERT(!vd->pseudo);
4340 return (vd_setup_full_disk(vd));
4341 }
4342
4343 /*
4344 * Export a single slice disk.
4345 *
4346 * The exported device can be either a pseudo device or a disk slice. If
4347 * it is a disk slice different from slice 2 then it is always exported
4348 * as a single slice disk even if the "slice" option is not specified.
4349 * If it is disk slice 2 or a pseudo device then it is exported as a
4350 * single slice disk only if the "slice" option is specified.
4351 */
4352 ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE ||
4353 dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE);
4354 return (vd_setup_single_slice_disk(vd));
4355 }
4356
4357 static int
4358 vd_setup_single_slice_disk(vd_t *vd)
4359 {
4360 int status;
4361 char *device_path = vd->device_path;
4362
4363 /* Get size of backing device */
4364 if (ldi_get_size(vd->ldi_handle[0], &vd->vdisk_size) != DDI_SUCCESS) {
4365 PRN("ldi_get_size() failed for %s", device_path);
4366 return (EIO);
4367 }
4368 vd->vdisk_size = lbtodb(vd->vdisk_size); /* convert to blocks */
4369 vd->block_size = DEV_BSIZE;
4370 vd->vdisk_block_size = DEV_BSIZE;
4371 vd->vdisk_media = VD_MEDIA_FIXED;
4372
4373 if (vd->pseudo) {
4374
4375 ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
4376
4377 /*
4378 * Currently we only support exporting pseudo devices which
4379 * provide a valid disk label.
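 *
 * For example, a volume such as the hypothetical
 * /dev/zvol/dsk/pool/vol must already contain a VTOC or EFI
 * label before being exported with the "slice" option; an
 * unlabeled volume is rejected with EINVAL below.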
4380 */
4381 if (vd->vdisk_label == VD_DISK_LABEL_UNK) {
4382 PRN("%s is a pseudo device with an invalid disk "
4383 "label", device_path);
4384 return (EINVAL);
4385 }
4386 return (0); /* ...and we're done */
4387 }
4388
4389 /* We can only export a slice if the disk has a valid label */
4390 if (vd->vdisk_label == VD_DISK_LABEL_UNK) {
4391 PRN("%s is a slice from a disk with an unknown disk label",
4392 device_path);
4393 return (EINVAL);
4394 }
4395
4396 /*
4397 * We export the slice as a single slice disk even if the "slice"
4398 * option was not specified.
4399 */
4400 vd->vdisk_type = VD_DISK_TYPE_SLICE;
4401 vd->nslices = 1;
4402
4403 if (vd->vdisk_label == VD_DISK_LABEL_EFI) {
4404 /* Slice from a disk with an EFI label */
4405 status = vd_setup_partition_efi(vd);
4406 } else {
4407 /* Slice from a disk with a VTOC label */
4408 ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);
4409 status = vd_setup_partition_vtoc(vd);
4410 }
4411
4412 return (status);
4413 }
4414
4415 static int
4416 vd_setup_vd(vd_t *vd)
4417 {
4418 int status;
4419 dev_info_t *dip;
4420 vnode_t *vnp;
4421 char *path = vd->device_path;
4422
4423 /* make sure the vdisk backend is valid */
4424 if ((status = lookupname(path, UIO_SYSSPACE,
4425 FOLLOW, NULLVPP, &vnp)) != 0) {
4426 PR0("Cannot lookup %s errno %d", path, status);
4427 goto done;
4428 }
4429
4430 switch (vnp->v_type) {
4431 case VREG:
4432 /*
4433 * Backend is a file so it is exported as a full disk or as a
4434 * single slice disk using the vnode interface.
4435 */
4436 VN_RELE(vnp);
4437 vd->pseudo = B_FALSE;
4438 status = vd_setup_backend_vnode(vd);
4439 break;
4440
4441 case VBLK:
4442 case VCHR:
4443 /*
4444 * Backend is a device. The way it is exported depends on the
4445 * type of the device.
4446 *
4447 * - A pseudo device is exported as a full disk using the vnode
4448 * interface or as a single slice disk using the LDI
4449 * interface.
4450 *
4451 * - A disk (represented by the slice 2 of that disk) is
4452 * exported as a full disk using the LDI interface.
4453 *
4454 * - A disk slice (different from slice 2) is always exported
4455 * as a single slice disk using the LDI interface.
4456 *
4457 * - The slice 2 of a disk is exported as a single slice disk
4458 * if the "slice" option is specified, otherwise the entire
4459 * disk will be exported. In any case, the LDI interface is
4460 * used.
4461 */
4462
4463 /* check if this is a pseudo device */
4464 if ((dip = ddi_hold_devi_by_instance(getmajor(vnp->v_rdev),
4465 dev_to_instance(vnp->v_rdev), 0)) == NULL) {
4466 PRN("%s is no longer accessible", path);
4467 VN_RELE(vnp);
4468 status = EIO;
4469 break;
4470 }
4471 vd->pseudo = is_pseudo_device(dip);
4472 ddi_release_devi(dip);
4473 VN_RELE(vnp);
4474
4475 /*
4476 * If this is a pseudo device then how it is exported depends
4477 * on whether the "slice" option is set. If the "slice" option
4478 * is set then the pseudo device will be exported as a single
4479 * slice, otherwise it will be exported as a full disk.
4480 */
4481 if (vd->pseudo && vd->vdisk_type == VD_DISK_TYPE_DISK)
4482 status = vd_setup_backend_vnode(vd);
4483 else
4484 status = vd_setup_backend_ldi(vd);
4485 break;
4486
4487 default:
4488 PRN("Unsupported vdisk backend %s", path);
4489 VN_RELE(vnp);
4490 status = EBADF;
4491 }
4492
4493 done:
4494 if (status != 0) {
4495 /*
4496 * If the error is retryable, print an error message only
4497 * during the first try.
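 *
 * (Retryable errors are mapped to EAGAIN below, and
 * vds_do_init_vd() treats EAGAIN as non-fatal, so the vdisk is
 * still created and this setup can be attempted again later,
 * once the backend becomes accessible.)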
4498 */
4499 if (status == ENXIO || status == ENODEV ||
4500 status == ENOENT || status == EROFS) {
4501 if (!(vd->initialized & VD_SETUP_ERROR)) {
4502 PRN("%s is currently inaccessible (error %d)",
4503 path, status);
4504 }
4505 status = EAGAIN;
4506 } else {
4507 PRN("%s cannot be exported as a virtual disk "
4508 "(error %d)", path, status);
4509 }
4510 vd->initialized |= VD_SETUP_ERROR;
4511
4512 } else if (vd->initialized & VD_SETUP_ERROR) {
4513 /* print a message only if we previously had an error */
4514 PRN("%s is now online", path);
4515 vd->initialized &= ~VD_SETUP_ERROR;
4516 }
4517
4518 return (status);
4519 }
4520
4521 static int
4522 vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t options,
4523 uint64_t ldc_id, vd_t **vdp)
4524 {
4525 char tq_name[TASKQ_NAMELEN];
4526 int status;
4527 ddi_iblock_cookie_t iblock = NULL;
4528 ldc_attr_t ldc_attr;
4529 vd_t *vd;
4530
4531
4532 ASSERT(vds != NULL);
4533 ASSERT(device_path != NULL);
4534 ASSERT(vdp != NULL);
4535 PR0("Adding vdisk for %s", device_path);
4536
4537 if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) {
4538 PRN("No memory for virtual disk");
4539 return (EAGAIN);
4540 }
4541 *vdp = vd; /* assign here so vds_destroy_vd() can cleanup later */
4542 vd->vds = vds;
4543 (void) strncpy(vd->device_path, device_path, MAXPATHLEN);
4544
4545 /* Setup open flags */
4546 vd->open_flags = FREAD;
4547
4548 if (!(options & VD_OPT_RDONLY))
4549 vd->open_flags |= FWRITE;
4550
4551 if (options & VD_OPT_EXCLUSIVE)
4552 vd->open_flags |= FEXCL;
4553
4554 /* Setup disk type */
4555 if (options & VD_OPT_SLICE) {
4556 vd->vdisk_type = VD_DISK_TYPE_SLICE;
4557 vd->nslices = 1;
4558 } else {
4559 vd->vdisk_type = VD_DISK_TYPE_DISK;
4560 vd->nslices = V_NUMPAR;
4561 }
4562
4563 /* default disk label */
4564 vd->vdisk_label = VD_DISK_LABEL_UNK;
4565
4566 /* Open vdisk and initialize parameters */
4567 if ((status = vd_setup_vd(vd)) == 0) {
4568 vd->initialized |= VD_DISK_READY;
4569
4570 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
4571 PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u",
4572 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
4573 (vd->pseudo ? "yes" : "no"), (vd->file ? "yes" : "no"),
4574 vd->nslices);
4575 } else {
4576 if (status != EAGAIN)
4577 return (status);
4578 }
4579
4580 /* Initialize locking */
4581 if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED,
4582 &iblock) != DDI_SUCCESS) {
4583 PRN("Could not get iblock cookie.");
4584 return (EIO);
4585 }
4586
4587 mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock);
4588 vd->initialized |= VD_LOCKING;
4589
4590
4591 /* Create start and completion task queues for the vdisk */
4592 (void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id);
4593 PR1("tq_name = %s", tq_name);
4594 if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1,
4595 TASKQ_DEFAULTPRI, 0)) == NULL) {
4596 PRN("Could not create task queue");
4597 return (EIO);
4598 }
4599 (void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id);
4600 PR1("tq_name = %s", tq_name);
4601 if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1,
4602 TASKQ_DEFAULTPRI, 0)) == NULL) {
4603 PRN("Could not create task queue");
4604 return (EIO);
4605 }
4606 vd->enabled = 1; /* before callback can dispatch to startq */
4607
4608
4609 /* Bring up LDC */
4610 ldc_attr.devclass = LDC_DEV_BLK_SVC;
4611 ldc_attr.instance = ddi_get_instance(vds->dip);
4612 ldc_attr.mode = LDC_MODE_UNRELIABLE;
4613 ldc_attr.mtu = VD_LDC_MTU;
4614 if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) {
4615 PRN("Could not initialize LDC channel %lu, "
4616 "init failed with error %d", ldc_id, status);
4617 return (status);
4618 }
4619 vd->initialized |= VD_LDC;
4620
4621 if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events,
4622 (caddr_t)vd)) != 0) {
4623 PRN("Could not initialize LDC channel %lu, "
4624 "reg_callback failed with error %d", ldc_id, status);
4625 return (status);
4626 }
4627
4628 if ((status = ldc_open(vd->ldc_handle)) != 0) {
4629 PRN("Could not initialize LDC channel %lu, "
4630 "open failed with error %d", ldc_id, status);
4631 return (status);
4632 }
4633
4634 if ((status = ldc_up(vd->ldc_handle)) != 0) {
4635 PR0("ldc_up() returned errno %d", status);
4636 }
4637
4638 /* Allocate the inband task memory handle */
4639 status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl));
4640 if (status) {
4641 PRN("Could not initialize LDC channel %lu, "
4642 "alloc_handle failed with error %d", ldc_id, status);
4643 return (ENXIO);
4644 }
4645
4646 /* Add the successfully-initialized vdisk to the server's table */
4647 if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) {
4648 PRN("Error adding vdisk ID %lu to table", id);
4649 return (EIO);
4650 }
4651
4652 /* Allocate the staging buffer */
4653 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */
4654 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);
4655
4656 /* store initial state */
4657 vd->state = VD_STATE_INIT;
4658
4659 return (0);
4660 }
4661
4662 static void
4663 vd_free_dring_task(vd_t *vdp)
4664 {
4665 if (vdp->dring_task != NULL) {
4666 ASSERT(vdp->dring_len != 0);
4667 /* Free all dring_task memory handles */
4668 for (int i = 0; i < vdp->dring_len; i++) {
4669 (void) ldc_mem_free_handle(vdp->dring_task[i].mhdl);
4670 kmem_free(vdp->dring_task[i].msg, vdp->max_msglen);
4671 vdp->dring_task[i].msg = NULL;
4672 }
4673 kmem_free(vdp->dring_task,
4674 (sizeof (*vdp->dring_task)) * vdp->dring_len);
4675 vdp->dring_task = NULL;
4676 }
4677 }
4678
4679 /*
4680 * Destroy the state associated with a virtual disk
4681 */
4682 static void
4683 vds_destroy_vd(void *arg)
4684 {
4685 vd_t *vd = (vd_t *)arg;
4686 int retry = 0, rv;
4687
4688 if (vd == NULL)
4689 return;
4690
4691 PR0("Destroying vdisk state");
4692
4693 if (vd->dk_efi.dki_data != NULL)
4694 kmem_free(vd->dk_efi.dki_data, vd->dk_efi.dki_length);
4695
4696 /* Disable queuing requests for the vdisk */
4697 if (vd->initialized & VD_LOCKING) {
4698 mutex_enter(&vd->lock);
4699 vd->enabled = 0;
4700 mutex_exit(&vd->lock);
4701 }
4702
4703 /* Drain and destroy start queue (*before* destroying completionq) */
4704 if (vd->startq != NULL)
4705 ddi_taskq_destroy(vd->startq); /* waits for queued tasks */
4706
4707 /* Drain and destroy completion queue (*before* shutting down LDC) */
4708 if (vd->completionq != NULL)
4709 ddi_taskq_destroy(vd->completionq); /* waits for tasks */
4710
4711 vd_free_dring_task(vd);
4712
4713 /* Free the inband task memory handle */
4714 (void) ldc_mem_free_handle(vd->inband_task.mhdl);
4715
4716 /* Shut down LDC */
4717 if (vd->initialized & VD_LDC) {
4718 /* unmap the dring */
4719 if (vd->initialized & VD_DRING)
4720 (void) ldc_mem_dring_unmap(vd->dring_handle);
4721
4722 /* close LDC channel - retry on EAGAIN */
4723 while ((rv = ldc_close(vd->ldc_handle)) == EAGAIN) {
4724 if (++retry > vds_ldc_retries) {
4725 PR0("Timed out closing channel");
4726 break;
4727 }
4728 drv_usecwait(vds_ldc_delay);
4729 }
4730 if (rv == 0) {
4731 (void) ldc_unreg_callback(vd->ldc_handle);
4732 (void) ldc_fini(vd->ldc_handle);
4733 } else {
4734 /*
4735 * Closing the LDC channel has failed. Ideally we should
4736 * fail here but there is no Zeus-level infrastructure
4737 * to handle this. The MD has already been changed and
4738 * we have to do the close. So we try to do as much
4739 * cleanup as we can.
4740 */
4741 (void) ldc_set_cb_mode(vd->ldc_handle, LDC_CB_DISABLE);
4742 while (ldc_unreg_callback(vd->ldc_handle) == EAGAIN)
4743 drv_usecwait(vds_ldc_delay);
4744 }
4745 }
4746
4747 /* Free the staging buffer for msgs */
4748 if (vd->vio_msgp != NULL) {
4749 kmem_free(vd->vio_msgp, vd->max_msglen);
4750 vd->vio_msgp = NULL;
4751 }
4752
4753 /* Free the inband message buffer */
4754 if (vd->inband_task.msg != NULL) {
4755 kmem_free(vd->inband_task.msg, vd->max_msglen);
4756 vd->inband_task.msg = NULL;
4757 }
4758
4759 if (vd->file) {
4760 /* Close file */
4761 (void) VOP_CLOSE(vd->file_vnode, vd->open_flags, 1,
4762 0, kcred, NULL);
4763 VN_RELE(vd->file_vnode);
4764 if (vd->file_devid != NULL)
4765 ddi_devid_free(vd->file_devid);
4766 } else {
4767 /* Close any open backing-device slices */
4768 for (uint_t slice = 0; slice < vd->nslices; slice++) {
4769 if (vd->ldi_handle[slice] != NULL) {
4770 PR0("Closing slice %u", slice);
4771 (void) ldi_close(vd->ldi_handle[slice],
4772 vd->open_flags, kcred);
4773 }
4774 }
4775 }
4776
4777 /* Free lock */
4778 if (vd->initialized & VD_LOCKING)
4779 mutex_destroy(&vd->lock);
4780
4781 /* Finally, free the vdisk structure itself */
4782 kmem_free(vd, sizeof (*vd));
4783 }
4784
4785 static int
4786 vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t options,
4787 uint64_t ldc_id)
4788 {
4789 int status;
4790 vd_t *vd = NULL;
4791
4792
4793 if ((status = vds_do_init_vd(vds, id, device_path, options,
4794 ldc_id, &vd)) != 0)
4795 vds_destroy_vd(vd);
4796
4797 return (status);
4798 }
4799
4800 static int
4801 vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel,
4802 uint64_t *ldc_id)
4803 {
4804 int num_channels;
4805
4806
4807 /* Look for channel endpoint child(ren) of the vdisk MD node */
4808 if ((num_channels = md_scan_dag(md, vd_node,
4809 md_find_name(md, VD_CHANNEL_ENDPOINT),
4810 md_find_name(md, "fwd"), channel)) <= 0) {
4811 PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT);
4812 return (-1);
4813 }
4814
4815 /* Get the "id" value for the first channel endpoint node */
4816 if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) {
4817 PRN("No \"%s\" property found for \"%s\" of vdisk",
4818 VD_ID_PROP, VD_CHANNEL_ENDPOINT);
4819 return (-1);
4820 }
4821
4822 if (num_channels > 1) {
4823 PRN("Using ID of first of multiple channels for this vdisk");
4824 }
4825
4826 return (0);
4827 }
4828
4829 static int
4830 vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id)
4831 {
4832 int num_nodes, status;
4833 size_t size;
4834 mde_cookie_t *channel;
4835
4836
4837 if ((num_nodes = md_node_count(md)) <= 0) {
4838 PRN("Invalid node count in Machine Description subtree");
4839 return (-1);
4840 }
4841 size = num_nodes * (sizeof (*channel));
4842 channel = kmem_zalloc(size, KM_SLEEP);
4843 status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id);
4844 kmem_free(channel, size);
4845
4846 return (status);
4847 }
4848
4849 /*
4850 * Function:
4851 * vds_get_options
4852 *
4853 * Description:
4854 * Parse the options of a vds node. Options are defined as an array
4855 * of strings in the vds-block-device-opts property of the vds node
4856 * in the machine description. Options are returned as a bitmask. The
4857 * mapping between the bitmask options and the options strings from the
4858 * machine description is defined in the vd_bdev_options[] array.
4859 *
4860 * The vds-block-device-opts property is optional. If a vds node has no
4861 * such property then no options are defined.
4862 *
4863 * Parameters:
4864 * md - machine description.
4865 * vd_node - vds node in the machine description for which
4866 * options have to be parsed.
4867 * options - the returned options.
4868 *
4869 * Return Code:
4870 * none.
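 *
 * Example:
 * Assuming vd_bdev_options[] maps the string "ro" to
 * VD_OPT_RDONLY and "slice" to VD_OPT_SLICE, a
 * vds-block-device-opts property holding those two strings
 * would be returned as VD_OPT_RDONLY | VD_OPT_SLICE.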
4871 */
4872 static void
4873 vds_get_options(md_t *md, mde_cookie_t vd_node, uint64_t *options)
4874 {
4875 char *optstr, *opt;
4876 int len, n, i;
4877
4878 *options = 0;
4879
4880 if (md_get_prop_data(md, vd_node, VD_BLOCK_DEVICE_OPTS,
4881 (uint8_t **)&optstr, &len) != 0) {
4882 PR0("No options found");
4883 return;
4884 }
4885
4886 /* parse options */
4887 opt = optstr;
4888 n = sizeof (vd_bdev_options) / sizeof (vd_option_t);
4889
4890 while (opt < optstr + len) {
4891 for (i = 0; i < n; i++) {
4892 if (strncmp(vd_bdev_options[i].vdo_name,
4893 opt, VD_OPTION_NLEN) == 0) {
4894 *options |= vd_bdev_options[i].vdo_value;
4895 break;
4896 }
4897 }
4898
4899 if (i < n) {
4900 PR0("option: %s", opt);
4901 } else {
4902 PRN("option %s is unknown or unsupported", opt);
4903 }
4904
4905 opt += strlen(opt) + 1;
4906 }
4907 }
4908
4909 static void
4910 vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
4911 {
4912 char *device_path = NULL;
4913 uint64_t id = 0, ldc_id = 0, options = 0;
4914
4915 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
4916 PRN("Error getting vdisk \"%s\"", VD_ID_PROP);
4917 return;
4918 }
4919 PR0("Adding vdisk ID %lu", id);
4920 if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP,
4921 &device_path) != 0) {
4922 PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
4923 return;
4924 }
4925
4926 vds_get_options(md, vd_node, &options);
4927
4928 if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) {
4929 PRN("Error getting LDC ID for vdisk %lu", id);
4930 return;
4931 }
4932
4933 if (vds_init_vd(vds, id, device_path, options, ldc_id) != 0) {
4934 PRN("Failed to add vdisk ID %lu", id);
4935 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
4936 PRN("No vdisk entry found for vdisk ID %lu", id);
4937 return;
4938 }
4939 }
4940
4941 static void
4942 vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
4943 {
4944 uint64_t id = 0;
4945
4946
4947 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
4948 PRN("Unable to get \"%s\" property from vdisk's MD node",
4949 VD_ID_PROP);
4950 return;
4951 }
4952 PR0("Removing vdisk ID %lu", id);
4953 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
4954 PRN("No vdisk entry found for vdisk ID %lu", id);
4955 }
4956
4957 static void
4958 vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node,
4959 md_t *curr_md, mde_cookie_t curr_vd_node)
4960 {
4961 char *curr_dev, *prev_dev;
4962 uint64_t curr_id = 0, curr_ldc_id = 0, curr_options = 0;
4963 uint64_t prev_id = 0, prev_ldc_id = 0, prev_options = 0;
4964 size_t len;
4965
4966
4967 /* Validate that vdisk ID has not changed */
4968 if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) {
4969 PRN("Error getting previous vdisk \"%s\" property",
4970 VD_ID_PROP);
4971 return;
4972 }
4973 if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) {
4974 PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP);
4975 return;
4976 }
4977 if (curr_id != prev_id) {
4978 PRN("Not changing vdisk: ID changed from %lu to %lu",
4979 prev_id, curr_id);
4980 return;
4981 }
4982
4983 /* Validate that LDC ID has not changed */
4984 if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) {
4985 PRN("Error getting LDC ID for vdisk %lu", prev_id);
4986 return;
4987 }
4988
4989 if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) {
4990 PRN("Error getting LDC ID for vdisk %lu", curr_id);
4991 return;
4992 }
4993 if (curr_ldc_id != prev_ldc_id) {
4994 _NOTE(NOTREACHED); /* lint is confused */
4995 PRN("Not changing vdisk: "
4996 "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id);
4997 return;
4998 }
4999
5000 /* Determine whether device path has changed */
5001 if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP,
5002 &prev_dev) != 0) {
5003 PRN("Error getting previous vdisk \"%s\"",
5004 VD_BLOCK_DEVICE_PROP);
5005 return;
5006 }
5007 if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP,
5008 &curr_dev) != 0) {
5009 PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
5010 return;
5011 }
5012 if (((len = strlen(curr_dev)) == strlen(prev_dev)) &&
5013 (strncmp(curr_dev, prev_dev, len) == 0))
5014 return; /* no relevant (supported) change */
5015
5016 /* Validate that options have not changed */
5017 vds_get_options(prev_md, prev_vd_node, &prev_options);
5018 vds_get_options(curr_md, curr_vd_node, &curr_options);
5019 if (prev_options != curr_options) {
5020 PRN("Not changing vdisk: options changed from %lx to %lx",
5021 prev_options, curr_options);
5022 return;
5023 }
5024
5025 PR0("Changing vdisk ID %lu", prev_id);
5026
5027 /* Remove old state, which will close vdisk and reset */
5028 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0)
5029 PRN("No entry found for vdisk ID %lu", prev_id);
5030
5031 /* Re-initialize vdisk with new state */
5032 if (vds_init_vd(vds, curr_id, curr_dev, curr_options,
5033 curr_ldc_id) != 0) {
5034 PRN("Failed to change vdisk ID %lu", curr_id);
5035 return;
5036 }
5037 }
5038
5039 static int
5040 vds_process_md(void *arg, mdeg_result_t *md)
5041 {
5042 int i;
5043 vds_t *vds = arg;
5044
5045
5046 if (md == NULL)
5047 return (MDEG_FAILURE);
5048 ASSERT(vds != NULL);
5049
5050 for (i = 0; i < md->removed.nelem; i++)
5051 vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]);
5052 for (i = 0; i < md->match_curr.nelem; i++)
5053 vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i],
5054 md->match_curr.mdp, md->match_curr.mdep[i]);
5055 for (i = 0; i < md->added.nelem; i++)
5056 vds_add_vd(vds, md->added.mdp, md->added.mdep[i]);
5057
5058 return (MDEG_SUCCESS);
5059 }
5060
5061
5062 static int
5063 vds_do_attach(dev_info_t *dip)
5064 {
5065 int status, sz;
5066 int cfg_handle;
5067 minor_t instance = ddi_get_instance(dip);
5068 vds_t *vds;
5069 mdeg_prop_spec_t *pspecp;
5070 mdeg_node_spec_t *ispecp;
5071
5072 /*
5073 * The "cfg-handle" property of a vds node in an MD contains the MD's
5074 * notion of "instance", or unique identifier, for that node; OBP
5075 * stores the value of the "cfg-handle" MD property as the value of
5076 * the "reg" property on the node in the device tree it builds from
5077 * the MD and passes to Solaris. Thus, we look up the devinfo node's
5078 * "reg" property value to uniquely identify this device instance when
5079 * registering with the MD event-generation framework. If the "reg"
5080 * property cannot be found, the device tree state is presumably so
5081 * broken that there is no point in continuing.
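 *
 * For example (illustrative value): a vds node whose MD
 * "cfg-handle" is 0 appears with "reg" = 0 in the device tree;
 * that value is patched into the copied vds_prop_template below
 * via VDS_SET_MDEG_PROP_INST() so that MDEG callbacks fire only
 * for 'vport' nodes belonging to this vds instance.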
5082 */
5083 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
5084 VD_REG_PROP)) {
5085 PRN("vds \"%s\" property does not exist", VD_REG_PROP);
5086 return (DDI_FAILURE);
5087 }
5088
5089 /* Get the MD instance for later MDEG registration */
5090 cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
5091 VD_REG_PROP, -1);
5092
5093 if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) {
5094 PRN("Could not allocate state for instance %u", instance);
5095 return (DDI_FAILURE);
5096 }
5097
5098 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
5099 PRN("Could not get state for instance %u", instance);
5100 ddi_soft_state_free(vds_state, instance);
5101 return (DDI_FAILURE);
5102 }
5103
5104 vds->dip = dip;
5105 vds->vd_table = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS,
5106 vds_destroy_vd, sizeof (void *));
5107
5108 ASSERT(vds->vd_table != NULL);
5109
5110 if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
5111 PRN("ldi_ident_from_dip() returned errno %d", status);
5112 return (DDI_FAILURE);
5113 }
5114 vds->initialized |= VDS_LDI;
5115
5116 /* Register for MD updates */
5117 sz = sizeof (vds_prop_template);
5118 pspecp = kmem_alloc(sz, KM_SLEEP);
5119 bcopy(vds_prop_template, pspecp, sz);
5120
5121 VDS_SET_MDEG_PROP_INST(pspecp, cfg_handle);
5122
5123 /* initialize the complete prop spec structure */
5124 ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
5125 ispecp->namep = "virtual-device";
5126 ispecp->specp = pspecp;
5127
5128 if (mdeg_register(ispecp, &vd_match, vds_process_md, vds,
5129 &vds->mdeg) != MDEG_SUCCESS) {
5130 PRN("Unable to register for MD updates");
5131 kmem_free(ispecp, sizeof (mdeg_node_spec_t));
5132 kmem_free(pspecp, sz);
5133 return (DDI_FAILURE);
5134 }
5135
5136 vds->ispecp = ispecp;
5137 vds->initialized |= VDS_MDEG;
5138
5139 /* Prevent auto-detaching so driver is available whenever MD changes */
5140 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
5141 DDI_PROP_SUCCESS) {
5142 PRN("failed to set \"%s\" property for instance %u",
5143 DDI_NO_AUTODETACH, instance);
5144 }
5145
5146 ddi_report_dev(dip);
5147 return (DDI_SUCCESS);
5148 }
5149
5150 static int
5151 vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5152 {
5153 int status;
5154
5155 switch (cmd) {
5156 case DDI_ATTACH:
5157 PR0("Attaching");
5158 if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
5159 (void) vds_detach(dip, DDI_DETACH);
5160 return (status);
5161 case DDI_RESUME:
5162 PR0("No action required for DDI_RESUME");
5163 return (DDI_SUCCESS);
5164 default:
5165 return (DDI_FAILURE);
5166 }
5167 }
5168
5169 static struct dev_ops vds_ops = {
5170 DEVO_REV, /* devo_rev */
5171 0, /* devo_refcnt */
5172 ddi_no_info, /* devo_getinfo */
5173 nulldev, /* devo_identify */
5174 nulldev, /* devo_probe */
5175 vds_attach, /* devo_attach */
5176 vds_detach, /* devo_detach */
5177 nodev, /* devo_reset */
5178 NULL, /* devo_cb_ops */
5179 NULL, /* devo_bus_ops */
5180 nulldev /* devo_power */
5181 };
5182
5183 static struct modldrv modldrv = {
5184 &mod_driverops,
5185 "virtual disk server",
5186 &vds_ops,
5187 };
5188
5189 static struct modlinkage modlinkage = {
5190 MODREV_1,
5191 &modldrv,
5192 NULL
5193 };
5194
5195
5196 int
5197 _init(void)
5198 {
5199 int status;
5200
5201 if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
5202 return (status);
5203
5204 if ((status = mod_install(&modlinkage)) != 0) {
5205 ddi_soft_state_fini(&vds_state);
5206 return (status);
5207 }
5208
5209 return (0);
5210 }
5211
5212 int
5213 _info(struct modinfo *modinfop)
5214 {
5215 return (mod_info(&modlinkage, modinfop));
5216 }
5217
5218 int
5219 _fini(void)
5220 {
5221 int status;
5222
5223 if ((status = mod_remove(&modlinkage)) != 0)
5224 return (status);
5225 ddi_soft_state_fini(&vds_state);
5226 return (0);
5227 }
5228