/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * Virtual disk server
 */


#include <sys/types.h>
#include <sys/conf.h>
#include <sys/crc32.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/file.h>
#include <sys/mdeg.h>
#include <sys/modhash.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/sysmacros.h>
#include <sys/vio_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdsk_common.h>
#include <sys/vtoc.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/scsi/impl/uscsi.h>
#include <vm/seg_map.h>

/* Virtual disk server initialization flags */
#define	VDS_LDI			0x01
#define	VDS_MDEG		0x02

/* Virtual disk server tunable parameters */
#define	VDS_RETRIES		5
#define	VDS_LDC_DELAY		1000	/* 1 msec */
#define	VDS_DEV_DELAY		10000000 /* 10 secs */
#define	VDS_NCHAINS		32

/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
#define	VDS_NAME		"virtual-disk-server"

#define	VD_NAME			"vd"
#define	VD_VOLUME_NAME		"vdisk"
#define	VD_ASCIILABEL		"Virtual Disk"

#define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
#define	VD_ID_PROP		"id"
#define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
#define	VD_BLOCK_DEVICE_OPTS	"vds-block-device-opts"
#define	VD_REG_PROP		"reg"

/* Virtual disk initialization flags */
#define	VD_DISK_READY		0x01
#define	VD_LOCKING		0x02
#define	VD_LDC			0x04
#define	VD_DRING		0x08
#define	VD_SID			0x10
#define	VD_SEQ_NUM		0x20
#define	VD_SETUP_ERROR		0x40

/* Flags for writing to a vdisk which is a file */
#define	VD_FILE_WRITE_FLAGS	SM_ASYNC

/* Number of backup labels */
#define	VD_FILE_NUM_BACKUP	5

/* Timeout for SCSI I/O */
#define	VD_SCSI_RDWR_TIMEOUT	30	/* 30 secs */

/*
 * By Solaris convention, slice/partition 2 represents the entire disk;
 * unfortunately, this convention does not appear to be codified.
 */
#define	VD_ENTIRE_DISK_SLICE	2

/* Return a cpp token as a string */
#define	STRINGIZE(token)	#token

/*
 * Print a message prefixed with the current function name to the message log
 * (and optionally to the console for verbose boots); these macros use cpp's
 * concatenation of string literals and C99 variable-length-argument-list
 * macros
 */
#define	PRN(...)	_PRN("?%s(): "__VA_ARGS__, "")
#define	_PRN(format, ...)					\
	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)
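
/*
 * Note (illustrative, not in the original source): the trailing "" argument
 * in PRN() guarantees that _PRN() always receives at least one variadic
 * argument, so a call such as
 *	PRN("attach failed")
 * expands (roughly) to
 *	cmn_err(CE_CONT, "?%s(): " "attach failed" "%s", __func__, "");
 * The leading '?' asks cmn_err(9F) to print to the console only on verbose
 * boots.
 */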

/* Return a pointer to the "i"th vdisk dring element */
#define	VD_DRING_ELEM(i)	((vd_dring_entry_t *)(void *)	\
	    (vd->dring + (i)*vd->descriptor_size))

/* Return the virtual disk client's type as a string (for use in messages) */
#define	VD_CLIENT(vd)							\
	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
	    (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" :	\
		(((vd)->xfer_mode == 0) ? "null client" :		\
		    "unsupported client")))

/* Read disk label from a disk on file */
#define	VD_FILE_LABEL_READ(vd, labelp) \
	vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/* Write disk label to a disk on file */
#define	VD_FILE_LABEL_WRITE(vd, labelp)	\
	vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vds instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t	vds_prop_template[] = {
	{ MDET_PROP_STR,	"name",		VDS_NAME },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};

#define	VDS_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t	vd_prop_match[] = {
	{ MDET_PROP_VAL, VD_ID_PROP },
	{ MDET_LIST_END, NULL }
};

static mdeg_node_match_t vd_match = {"virtual-device-port",
				    vd_prop_match};

/*
 * Options for the VD_BLOCK_DEVICE_OPTS property.
 */
#define	VD_OPT_RDONLY		0x1	/* read-only */
#define	VD_OPT_SLICE		0x2	/* single slice */
#define	VD_OPT_EXCLUSIVE	0x4	/* exclusive access */

#define	VD_OPTION_NLEN	128

typedef struct vd_option {
	char		vdo_name[VD_OPTION_NLEN];
	uint64_t	vdo_value;
} vd_option_t;

vd_option_t vd_bdev_options[] = {
	{ "ro",		VD_OPT_RDONLY },
	{ "slice",	VD_OPT_SLICE },
	{ "excl",	VD_OPT_EXCLUSIVE }
};
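
/*
 * Illustrative example (assumed MD syntax, not part of this excerpt): a
 * backend exported with a "vds-block-device-opts" property value of
 * "ro,excl" would be looked up entry by entry in vd_bdev_options[] and
 * mapped to (VD_OPT_RDONLY | VD_OPT_EXCLUSIVE); the option-string parsing
 * code itself appears elsewhere in the driver.
 */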

/* Debugging macros */
#ifdef DEBUG

static int	vd_msglevel = 0;

#define	PR0 if (vd_msglevel > 0)	PRN
#define	PR1 if (vd_msglevel > 1)	PRN
#define	PR2 if (vd_msglevel > 2)	PRN

#define	VD_DUMP_DRING_ELEM(elem)					\
	PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
	    elem->hdr.dstate,						\
	    elem->payload.operation,					\
	    elem->payload.status,					\
	    elem->payload.nbytes,					\
	    elem->payload.addr,						\
	    elem->payload.ncookies);

char *
vd_decode_state(int state)
{
	char *str;

#define	CASE_STATE(_s)	case _s: str = #_s; break;

	switch (state) {
	CASE_STATE(VD_STATE_INIT)
	CASE_STATE(VD_STATE_VER)
	CASE_STATE(VD_STATE_ATTR)
	CASE_STATE(VD_STATE_DRING)
	CASE_STATE(VD_STATE_RDX)
	CASE_STATE(VD_STATE_DATA)
	default: str = "unknown"; break;
	}

#undef CASE_STATE

	return (str);
}

void
vd_decode_tag(vio_msg_t *msg)
{
	char *tstr, *sstr, *estr;

#define	CASE_TYPE(_s)	case _s: tstr = #_s; break;

	switch (msg->tag.vio_msgtype) {
	CASE_TYPE(VIO_TYPE_CTRL)
	CASE_TYPE(VIO_TYPE_DATA)
	CASE_TYPE(VIO_TYPE_ERR)
	default: tstr = "unknown"; break;
	}

#undef	CASE_TYPE

#define	CASE_SUBTYPE(_s) case _s: sstr = #_s; break;

	switch (msg->tag.vio_subtype) {
	CASE_SUBTYPE(VIO_SUBTYPE_INFO)
	CASE_SUBTYPE(VIO_SUBTYPE_ACK)
	CASE_SUBTYPE(VIO_SUBTYPE_NACK)
	default: sstr = "unknown"; break;
	}

#undef	CASE_SUBTYPE

#define	CASE_ENV(_s)	case _s: estr = #_s; break;

	switch (msg->tag.vio_subtype_env) {
	CASE_ENV(VIO_VER_INFO)
	CASE_ENV(VIO_ATTR_INFO)
	CASE_ENV(VIO_DRING_REG)
	CASE_ENV(VIO_DRING_UNREG)
	CASE_ENV(VIO_RDX)
	CASE_ENV(VIO_PKT_DATA)
	CASE_ENV(VIO_DESC_DATA)
	CASE_ENV(VIO_DRING_DATA)
	default: estr = "unknown"; break;
	}

#undef	CASE_ENV

	PR1("(%x/%x/%x) message : (%s/%s/%s)",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, tstr, sstr, estr);
}

#else	/* !DEBUG */

#define	PR0(...)
#define	PR1(...)
#define	PR2(...)

#define	VD_DUMP_DRING_ELEM(elem)

#define	vd_decode_state(_s)	(NULL)
#define	vd_decode_tag(_s)	(NULL)

#endif	/* DEBUG */


/*
 * Soft state structure for a vds instance
 */
typedef struct vds {
	uint_t		initialized;	/* driver inst initialization flags */
	dev_info_t	*dip;		/* driver inst devinfo pointer */
	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
	mod_hash_t	*vd_table;	/* table of virtual disks served */
	mdeg_node_spec_t *ispecp;	/* mdeg node specification */
	mdeg_handle_t	mdeg;		/* handle for MDEG operations */
} vds_t;

/*
 * Types of descriptor-processing tasks
 */
typedef enum vd_task_type {
	VD_NONFINAL_RANGE_TASK,	/* task for intermediate descriptor in range */
	VD_FINAL_RANGE_TASK,	/* task for last in a range of descriptors */
} vd_task_type_t;

/*
 * Structure describing the task for processing a descriptor
 */
typedef struct vd_task {
	struct vd		*vd;		/* vd instance task is for */
	vd_task_type_t		type;		/* type of descriptor task */
	int			index;		/* dring elem index for task */
	vio_msg_t		*msg;		/* VIO message task is for */
	size_t			msglen;		/* length of message content */
	vd_dring_payload_t	*request;	/* request task will perform */
	struct buf		buf;		/* buf(9s) for I/O request */
	ldc_mem_handle_t	mhdl;		/* task memory handle */
	int			status;		/* status of processing task */
	int	(*completef)(struct vd_task *task); /* completion func ptr */
} vd_task_t;

/*
 * Soft state structure for a virtual disk instance
 */
typedef struct vd {
	uint_t		initialized;	/* vdisk initialization flags */
	vds_t		*vds;		/* server for this vdisk */
	ddi_taskq_t	*startq;	/* queue for I/O start tasks */
	ddi_taskq_t	*completionq;	/* queue for completion tasks */
	ldi_handle_t	ldi_handle[V_NUMPAR];	/* LDI slice handles */
	char		device_path[MAXPATHLEN + 1]; /* vdisk device */
	dev_t		dev[V_NUMPAR];	/* dev numbers for slices */
	int		open_flags;	/* open flags */
	uint_t		nslices;	/* number of slices */
	size_t		vdisk_size;	/* number of blocks in vdisk */
	vd_disk_type_t	vdisk_type;	/* slice or entire disk */
	vd_disk_label_t	vdisk_label;	/* EFI or VTOC label */
	ushort_t	max_xfer_sz;	/* max xfer size in DEV_BSIZE */
	boolean_t	pseudo;		/* underlying pseudo dev */
	boolean_t	file;		/* underlying file */
	vnode_t		*file_vnode;	/* file vnode */
	size_t		file_size;	/* file size */
	ddi_devid_t	file_devid;	/* devid for disk image */
	struct dk_efi	dk_efi;		/* synthetic for slice type */
	struct dk_geom	dk_geom;	/* synthetic for slice type */
	struct vtoc	vtoc;		/* synthetic for slice type */
	ldc_status_t	ldc_state;	/* LDC connection state */
	ldc_handle_t	ldc_handle;	/* handle for LDC comm */
	size_t		max_msglen;	/* largest LDC message len */
	vd_state_t	state;		/* client handshake state */
	uint8_t		xfer_mode;	/* transfer mode with client */
	uint32_t	sid;		/* client's session ID */
	uint64_t	seq_num;	/* message sequence number */
	uint64_t	dring_ident;	/* identifier of dring */
	ldc_dring_handle_t dring_handle; /* handle for dring ops */
	uint32_t	descriptor_size; /* num bytes in desc */
	uint32_t	dring_len;	/* number of dring elements */
	caddr_t		dring;		/* address of dring */
	caddr_t		vio_msgp;	/* vio msg staging buffer */
	vd_task_t	inband_task;	/* task for inband descriptor */
	vd_task_t	*dring_task;	/* tasks for dring elements */

	kmutex_t	lock;		/* protects variables below */
	boolean_t	enabled;	/* is vdisk enabled? */
	boolean_t	reset_state;	/* reset connection state? */
	boolean_t	reset_ldc;	/* reset LDC channel? */
} vd_t;

typedef struct vds_operation {
	char	*namep;
	uint8_t	operation;
	int	(*start)(vd_task_t *task);
	int	(*complete)(vd_task_t *task);
} vds_operation_t;

typedef struct vd_ioctl {
	uint8_t		operation;	/* vdisk operation */
	const char	*operation_name;	/* vdisk operation name */
	size_t		nbytes;		/* size of operation buffer */
	int		cmd;		/* corresponding ioctl cmd */
	const char	*cmd_name;	/* ioctl cmd name */
	void		*arg;		/* ioctl cmd argument */
	/* convert input vd_buf to output ioctl_arg */
	void		(*copyin)(void *vd_buf, void *ioctl_arg);
	/* convert input ioctl_arg to output vd_buf */
	void		(*copyout)(void *ioctl_arg, void *vd_buf);
	/* write is true if the operation writes any data to the backend */
	boolean_t	write;
} vd_ioctl_t;

/* Define trivial copyin/copyout conversion function flag */
#define	VD_IDENTITY	((void (*)(void *, void *))-1)


static int	vds_ldc_retries = VDS_RETRIES;
static int	vds_ldc_delay = VDS_LDC_DELAY;
static int	vds_dev_retries = VDS_RETRIES;
static int	vds_dev_delay = VDS_DEV_DELAY;
static void	*vds_state;
static uint64_t	vds_operations;	/* see vds_operation[] definition below */

static uint_t	vd_file_write_flags = VD_FILE_WRITE_FLAGS;

static short	vd_scsi_rdwr_timeout = VD_SCSI_RDWR_TIMEOUT;

/*
 * Supported protocol version pairs, from highest (newest) to lowest (oldest)
 *
 * Each supported major version should appear only once, paired with (and only
 * with) its highest supported minor version number (as the protocol requires
 * supporting all lower minor version numbers as well)
 */
static const vio_ver_t	vds_version[] = {{1, 0}};
static const size_t	vds_num_versions =
    sizeof (vds_version)/sizeof (vds_version[0]);

static void vd_free_dring_task(vd_t *vdp);
static int vd_setup_vd(vd_t *vd);
static int vd_setup_single_slice_disk(vd_t *vd);
static boolean_t vd_enabled(vd_t *vd);
static ushort_t vd_lbl2cksum(struct dk_label *label);
static int vd_file_validate_geometry(vd_t *vd);

/*
 * Function:
 *	vd_file_rw
 *
 * Description:
 *	Read or write to a disk on file.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	slice		- slice on which the operation is performed,
 *			  VD_SLICE_NONE indicates that the operation
 *			  is done using an absolute disk offset.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer where data are read to or written from.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	n >= 0		- success, n indicates the number of bytes read
 *			  or written.
 *	-1		- error.
 */
static ssize_t
vd_file_rw(vd_t *vd, int slice, int operation, caddr_t data, size_t blk,
    size_t len)
{
	caddr_t	maddr;
	size_t offset, maxlen, moffset, mlen, n;
	uint_t smflags;
	enum seg_rw srw;

	ASSERT(vd->file);
	ASSERT(len > 0);

	/*
	 * If a file is exported as a slice then we don't care about the vtoc.
	 * In that case, the vtoc is fake, mainly to make newfs happy, and we
	 * handle any I/O as a raw disk access so that we can have access to
	 * the entire backend.
	 */
	if (vd->vdisk_type == VD_DISK_TYPE_SLICE || slice == VD_SLICE_NONE) {
		/* raw disk access */
		offset = blk * DEV_BSIZE;
	} else {
		ASSERT(slice >= 0 && slice < V_NUMPAR);

		if (vd->vdisk_label == VD_DISK_LABEL_UNK &&
		    vd_file_validate_geometry(vd) != 0) {
			PR0("Unknown disk label, can't do I/O from slice %d",
			    slice);
			return (-1);
		}

		if (blk >= vd->vtoc.v_part[slice].p_size) {
			/* address past the end of the slice */
			PR0("req_addr (0x%lx) > psize (0x%lx)",
			    blk, vd->vtoc.v_part[slice].p_size);
			return (0);
		}

		offset = (vd->vtoc.v_part[slice].p_start + blk) * DEV_BSIZE;

		/*
		 * If the requested size is greater than the size
		 * of the partition, truncate the read/write.
		 */
		maxlen = (vd->vtoc.v_part[slice].p_size - blk) * DEV_BSIZE;

		if (len > maxlen) {
			PR0("I/O size truncated to %lu bytes from %lu bytes",
			    maxlen, len);
			len = maxlen;
		}
	}

	/*
	 * We have to ensure that we are reading/writing into the mmap
	 * range. If we have a partial disk image (e.g. an image of
	 * s0 instead of s2) the system can try to access slices that
	 * are not included in the disk image.
	 */
	if ((offset + len) >= vd->file_size) {
		PR0("offset + nbytes (0x%lx + 0x%lx) >= "
		    "file_size (0x%lx)", offset, len, vd->file_size);
		return (-1);
	}

	srw = (operation == VD_OP_BREAD)? S_READ : S_WRITE;
	smflags = (operation == VD_OP_BREAD)? 0 :
	    (SM_WRITE | vd_file_write_flags);
	n = len;

	do {
		/*
		 * segmap_getmapflt() returns a MAXBSIZE chunk which is
		 * MAXBSIZE aligned.
		 */
		moffset = offset & MAXBOFFSET;
		mlen = MIN(MAXBSIZE - moffset, n);
		maddr = segmap_getmapflt(segkmap, vd->file_vnode, offset,
		    mlen, 1, srw);
		/*
		 * Fault in the pages so we can check for error and ensure
		 * that we can safely use the mapped address.
		 */
		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
		    F_SOFTLOCK, srw) != 0) {
			(void) segmap_release(segkmap, maddr, 0);
			return (-1);
		}

		if (operation == VD_OP_BREAD)
			bcopy(maddr + moffset, data, mlen);
		else
			bcopy(data, maddr + moffset, mlen);

		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
		    F_SOFTUNLOCK, srw) != 0) {
			(void) segmap_release(segkmap, maddr, 0);
			return (-1);
		}
		if (segmap_release(segkmap, maddr, smflags) != 0)
			return (-1);
		n -= mlen;
		offset += mlen;
		data += mlen;

	} while (n > 0);

	return (len);
}
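
/*
 * Illustrative example (not from the original source): with MAXBSIZE at its
 * traditional value of 8192, a 12K read starting at absolute offset 4096
 * takes two passes through the loop above: the first maps the window
 * covering offsets 0-8191 (moffset = 4096, mlen = 4096), the second maps
 * the window covering offsets 8192-16383 (moffset = 0, mlen = 8192).
 */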

/*
 * Function:
 *	vd_file_build_default_label
 *
 * Description:
 *	Return a default label for the given disk. This is used when the disk
 *	does not have a valid VTOC so that the user can get a valid default
 *	configuration. The default label has all slice sizes set to 0 (except
 *	slice 2 which is the entire disk) to force the user to write a valid
 *	label onto the disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	label		- the returned default label.
 *
 * Return Code:
 *	none.
 */
static void
vd_file_build_default_label(vd_t *vd, struct dk_label *label)
{
	size_t size;
	char prefix;
	int slice, nparts;
	uint16_t tag;

	ASSERT(vd->file);

	/*
	 * We must have a reasonable number of cylinders and sectors so
	 * that newfs can run using default values.
	 *
	 * if (disk_size < 2MB)
	 *	phys_cylinders = disk_size / 100K
	 * else
	 *	phys_cylinders = disk_size / 300K
	 *
	 * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders
	 * alt_cylinders = (phys_cylinders > 2) ? 2 : 0;
	 * data_cylinders = phys_cylinders - alt_cylinders
	 *
	 * sectors = disk_size / (phys_cylinders * blk_size)
	 *
	 * The file size test is an attempt to not have too few cylinders
	 * for a small file, or so many on a big file that you waste space
	 * for backup superblocks or cylinder group structures.
	 */
	if (vd->file_size < (2 * 1024 * 1024))
		label->dkl_pcyl = vd->file_size / (100 * 1024);
	else
		label->dkl_pcyl = vd->file_size / (300 * 1024);

	if (label->dkl_pcyl == 0)
		label->dkl_pcyl = 1;

	label->dkl_acyl = 0;

	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
		nparts = 1;
		slice = 0;
		tag = V_UNASSIGNED;
	} else {
		if (label->dkl_pcyl > 2)
			label->dkl_acyl = 2;
		nparts = V_NUMPAR;
		slice = VD_ENTIRE_DISK_SLICE;
		tag = V_BACKUP;
	}

	label->dkl_nsect = vd->file_size /
	    (DEV_BSIZE * label->dkl_pcyl);
	label->dkl_ncyl = label->dkl_pcyl - label->dkl_acyl;
	label->dkl_nhead = 1;
	label->dkl_write_reinstruct = 0;
	label->dkl_read_reinstruct = 0;
	label->dkl_rpm = 7200;
	label->dkl_apc = 0;
	label->dkl_intrlv = 0;

	PR0("requested disk size: %ld bytes\n", vd->file_size);
	PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label->dkl_pcyl,
	    label->dkl_nhead, label->dkl_nsect);
	PR0("provided disk size: %ld bytes\n", (uint64_t)
	    (label->dkl_pcyl * label->dkl_nhead *
	    label->dkl_nsect * DEV_BSIZE));

	if (vd->file_size < (1ULL << 20)) {
		size = vd->file_size >> 10;
		prefix = 'K'; /* Kilobyte */
	} else if (vd->file_size < (1ULL << 30)) {
		size = vd->file_size >> 20;
		prefix = 'M'; /* Megabyte */
	} else if (vd->file_size < (1ULL << 40)) {
		size = vd->file_size >> 30;
		prefix = 'G'; /* Gigabyte */
	} else {
		size = vd->file_size >> 40;
		prefix = 'T'; /* Terabyte */
	}

	/*
	 * We must have a correct label name otherwise format(1m) will
	 * not recognize the disk as labeled.
	 */
	(void) snprintf(label->dkl_asciilabel, LEN_DKL_ASCII,
	    "SUN-DiskImage-%ld%cB cyl %d alt %d hd %d sec %d",
	    size, prefix,
	    label->dkl_ncyl, label->dkl_acyl, label->dkl_nhead,
	    label->dkl_nsect);

	/* default VTOC */
	label->dkl_vtoc.v_version = V_VERSION;
	label->dkl_vtoc.v_nparts = nparts;
	label->dkl_vtoc.v_sanity = VTOC_SANE;
	label->dkl_vtoc.v_part[slice].p_tag = tag;
	label->dkl_map[slice].dkl_cylno = 0;
	label->dkl_map[slice].dkl_nblk = label->dkl_ncyl *
	    label->dkl_nhead * label->dkl_nsect;
	label->dkl_cksum = vd_lbl2cksum(label);
}
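
/*
 * Worked example (not from the original source): for a 10 MB (10485760
 * byte) full-disk image, the formula above gives dkl_pcyl =
 * 10485760 / 307200 = 34, dkl_acyl = 2, dkl_ncyl = 32, dkl_nhead = 1 and
 * dkl_nsect = 10485760 / (512 * 34) = 602, i.e. a usable capacity of
 * 32 * 1 * 602 * 512 = 9863168 bytes, slightly less than the file size as
 * required.
 */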

/*
 * Function:
 *	vd_file_set_vtoc
 *
 * Description:
 *	Set the vtoc of a disk image by writing the label and backup
 *	labels into the disk image backend.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	label		- the data to be written.
 *
 * Return Code:
 *	0		- success.
 *	n > 0		- error, n indicates the errno code.
 */
static int
vd_file_set_vtoc(vd_t *vd, struct dk_label *label)
{
	int blk, sec, cyl, head, cnt;

	ASSERT(vd->file);

	if (VD_FILE_LABEL_WRITE(vd, label) < 0) {
		PR0("failed to write disk label");
		return (EIO);
	}

	/*
	 * Backup labels are on the last alternate cylinder's
	 * first five odd sectors.
	 */
	if (label->dkl_acyl == 0) {
		PR0("no alternate cylinder, cannot store backup labels");
		return (0);
	}

	cyl = label->dkl_ncyl + label->dkl_acyl - 1;
	head = label->dkl_nhead - 1;

	blk = (cyl * ((label->dkl_nhead * label->dkl_nsect) - label->dkl_apc)) +
	    (head * label->dkl_nsect);

	/*
	 * Write the backup labels. Make sure we don't try to write past
	 * the last cylinder.
	 */
	sec = 1;

	for (cnt = 0; cnt < VD_FILE_NUM_BACKUP; cnt++) {

		if (sec >= label->dkl_nsect) {
			PR0("not enough sectors to store all backup labels");
			return (0);
		}

		if (vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)label,
		    blk + sec, sizeof (struct dk_label)) < 0) {
			PR0("error writing backup label at block %d\n",
			    blk + sec);
			return (EIO);
		}

		PR1("wrote backup label at block %d\n", blk + sec);

		sec += 2;
	}

	return (0);
}

/*
 * Function:
 *	vd_file_get_devid_block
 *
 * Description:
 *	Return the block number where the device id is stored.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	blkp		- pointer to the block number
 *
 * Return Code:
 *	0		- success
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_get_devid_block(vd_t *vd, size_t *blkp)
{
	diskaddr_t spc, head, cyl;

	ASSERT(vd->file);
	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);

	/* this geometry doesn't allow us to have a devid */
	if (vd->dk_geom.dkg_acyl < 2) {
		PR0("not enough alternate cylinders available for devid "
		    "(acyl=%u)", vd->dk_geom.dkg_acyl);
		return (ENOSPC);
	}

	/* the devid is on the track next to the last cylinder */
	cyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl - 2;
	spc = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
	head = vd->dk_geom.dkg_nhead - 1;

	*blkp = (cyl * (spc - vd->dk_geom.dkg_apc)) +
	    (head * vd->dk_geom.dkg_nsect) + 1;

	return (0);
}

/*
 * Return the checksum of a disk block containing an on-disk devid.
 */
static uint_t
vd_dkdevid2cksum(struct dk_devid *dkdevid)
{
	uint_t chksum, *ip;
	int i;

	chksum = 0;
	ip = (uint_t *)dkdevid;
	for (i = 0; i < ((DEV_BSIZE - sizeof (int)) / sizeof (int)); i++)
		chksum ^= ip[i];

	return (chksum);
}

/*
 * Function:
 *	vd_file_read_devid
 *
 * Description:
 *	Read the device id stored on a disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	devid		- the return address of the device ID.
 *
 * Return Code:
 *	0		- success
 *	EIO		- I/O error while trying to access the disk image
 *	EINVAL		- no valid device id was found
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_read_devid(vd_t *vd, ddi_devid_t *devid)
{
	struct dk_devid *dkdevid;
	size_t blk;
	uint_t chksum;
	int status, sz;

	if ((status = vd_file_get_devid_block(vd, &blk)) != 0)
		return (status);

	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);

	/* get the devid */
	if ((vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)dkdevid, blk,
	    DEV_BSIZE)) < 0) {
		PR0("error reading devid block at %lu", blk);
		status = EIO;
		goto done;
	}

	/* validate the revision */
	if ((dkdevid->dkd_rev_hi != DK_DEVID_REV_MSB) ||
	    (dkdevid->dkd_rev_lo != DK_DEVID_REV_LSB)) {
		PR0("invalid devid found at block %lu (bad revision)", blk);
		status = EINVAL;
		goto done;
	}

	/* compute checksum */
	chksum = vd_dkdevid2cksum(dkdevid);

	/* compare the checksums */
	if (DKD_GETCHKSUM(dkdevid) != chksum) {
		PR0("invalid devid found at block %lu (bad checksum)", blk);
		status = EINVAL;
		goto done;
	}

	/* validate the device id */
	if (ddi_devid_valid((ddi_devid_t)&dkdevid->dkd_devid) != DDI_SUCCESS) {
		PR0("invalid devid found at block %lu", blk);
		status = EINVAL;
		goto done;
	}

	PR1("devid read at block %lu", blk);

	sz = ddi_devid_sizeof((ddi_devid_t)&dkdevid->dkd_devid);
	*devid = kmem_alloc(sz, KM_SLEEP);
	bcopy(&dkdevid->dkd_devid, *devid, sz);

done:
	kmem_free(dkdevid, DEV_BSIZE);
	return (status);

}

/*
 * Function:
 *	vd_file_write_devid
 *
 * Description:
 *	Write a device id into the disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	devid		- the device ID to store.
 *
 * Return Code:
 *	0		- success
 *	EIO		- I/O error while trying to access the disk image
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_write_devid(vd_t *vd, ddi_devid_t devid)
{
	struct dk_devid *dkdevid;
	uint_t chksum;
	size_t blk;
	int status;

	if ((status = vd_file_get_devid_block(vd, &blk)) != 0)
		return (status);

	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);

	/* set revision */
	dkdevid->dkd_rev_hi = DK_DEVID_REV_MSB;
	dkdevid->dkd_rev_lo = DK_DEVID_REV_LSB;

	/* copy devid */
	bcopy(devid, &dkdevid->dkd_devid, ddi_devid_sizeof(devid));

	/* compute checksum */
	chksum = vd_dkdevid2cksum(dkdevid);

	/* set checksum */
	DKD_FORMCHKSUM(chksum, dkdevid);

	/* store the devid */
	if ((status = vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
	    (caddr_t)dkdevid, blk, DEV_BSIZE)) < 0) {
		PR0("Error writing devid block at %lu", blk);
		status = EIO;
	} else {
		PR1("devid written at block %lu", blk);
		status = 0;
	}

	kmem_free(dkdevid, DEV_BSIZE);
	return (status);
}
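
/*
 * Illustrative example (not from the original source): with the default
 * geometry built above for a 10 MB image (ncyl=32, acyl=2, nhead=1,
 * nsect=602, apc=0), vd_file_get_devid_block() places the devid at block
 * (32 + 2 - 2) * 602 + 0 * 602 + 1 = 19265, i.e. the second sector of the
 * first alternate cylinder.
 */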
 *
 * Return Code:
 *	0		- success
 *	n != 0		- error.
 */
static int
vd_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t blk, size_t len)
{
	struct uscsi_cmd ucmd;
	union scsi_cdb cdb;
	int nsectors, nblk;
	int max_sectors;
	int status, rval;

	ASSERT(!vd->file);

	max_sectors = vd->max_xfer_sz;
	nblk = (len / DEV_BSIZE);

	if (len % DEV_BSIZE != 0)
		return (EINVAL);

	/*
	 * Build and execute the uscsi ioctl. We build a group0, group1
	 * or group4 command as necessary, since some targets
	 * do not support group1 commands.
	 */
	while (nblk) {

		bzero(&ucmd, sizeof (ucmd));
		bzero(&cdb, sizeof (cdb));

		nsectors = (max_sectors < nblk) ? max_sectors : nblk;

		if (blk < (2 << 20) && nsectors <= 0xff) {
			FORMG0ADDR(&cdb, blk);
			FORMG0COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP0;
		} else if (blk > 0xffffffff) {
			FORMG4LONGADDR(&cdb, blk);
			FORMG4COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP4;
			cdb.scc_cmd |= SCMD_GROUP4;
		} else {
			FORMG1ADDR(&cdb, blk);
			FORMG1COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP1;
			cdb.scc_cmd |= SCMD_GROUP1;
		}

		ucmd.uscsi_cdb = (caddr_t)&cdb;
		ucmd.uscsi_bufaddr = data;
		ucmd.uscsi_buflen = nsectors * DEV_BSIZE;
		ucmd.uscsi_timeout = vd_scsi_rdwr_timeout;
		/*
		 * Set flags so that the command is isolated from normal
		 * commands and no error message is printed.
		 */
		ucmd.uscsi_flags = USCSI_ISOLATE | USCSI_SILENT;

		if (operation == VD_OP_BREAD) {
			cdb.scc_cmd |= SCMD_READ;
			ucmd.uscsi_flags |= USCSI_READ;
		} else {
			cdb.scc_cmd |= SCMD_WRITE;
		}

		status = ldi_ioctl(vd->ldi_handle[VD_ENTIRE_DISK_SLICE],
		    USCSICMD, (intptr_t)&ucmd, (vd->open_flags | FKIOCTL),
		    kcred, &rval);

		if (status == 0)
			status = ucmd.uscsi_status;

		if (status != 0)
			break;

		/*
		 * Check if partial DMA breakup is required. If so, reduce
		 * the request size by half and retry the last request.
		 */
		if (ucmd.uscsi_resid == ucmd.uscsi_buflen) {
			max_sectors >>= 1;
			if (max_sectors <= 0) {
				status = EIO;
				break;
			}
			continue;
		}

		if (ucmd.uscsi_resid != 0) {
			status = EIO;
			break;
		}

		blk += nsectors;
		nblk -= nsectors;
		data += nsectors * DEV_BSIZE; /* SECSIZE */
	}

	return (status);
}

/*
 * Return Values
 *	EINPROGRESS	- operation was successfully started
 *	EIO		- encountered an LDC error (reported as a task error)
 *	0		- operation completed successfully
 *
 * Side Effect
 *	sets request->status = <disk operation status>
 */
static int
vd_start_bio(vd_task_t *task)
{
	int			rv, status = 0;
	vd_t			*vd = task->vd;
	vd_dring_payload_t	*request = task->request;
	struct buf		*buf = &task->buf;
	uint8_t			mtype;
	int			slice;
	char			*bufaddr = 0;
	size_t			buflen;

	ASSERT(vd != NULL);
	ASSERT(request != NULL);

	slice = request->slice;

	ASSERT(slice == VD_SLICE_NONE || slice < vd->nslices);
	ASSERT((request->operation == VD_OP_BREAD) ||
	    (request->operation == VD_OP_BWRITE));

	if (request->nbytes == 0) {
		/* no service for trivial requests */
		request->status = EINVAL;
		return (0);
	}

	PR1("%s %lu bytes at block %lu",
	    (request->operation == VD_OP_BREAD) ? "Read" : "Write",
"Read" : "Write", 1094 request->nbytes, request->addr); 1095 1096 /* 1097 * We have to check the open flags because the functions processing 1098 * the read/write request will not do it. 1099 */ 1100 if (request->operation == VD_OP_BWRITE && !(vd->open_flags & FWRITE)) { 1101 PR0("write fails because backend is opened read-only"); 1102 request->nbytes = 0; 1103 request->status = EROFS; 1104 return (0); 1105 } 1106 1107 mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP; 1108 1109 /* Map memory exported by client */ 1110 status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies, 1111 mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R, 1112 &bufaddr, NULL); 1113 if (status != 0) { 1114 PR0("ldc_mem_map() returned err %d ", status); 1115 return (EIO); 1116 } 1117 1118 buflen = request->nbytes; 1119 1120 status = ldc_mem_acquire(task->mhdl, 0, buflen); 1121 if (status != 0) { 1122 (void) ldc_mem_unmap(task->mhdl); 1123 PR0("ldc_mem_acquire() returned err %d ", status); 1124 return (EIO); 1125 } 1126 1127 /* Start the block I/O */ 1128 if (vd->file) { 1129 rv = vd_file_rw(vd, slice, request->operation, bufaddr, 1130 request->addr, request->nbytes); 1131 if (rv < 0) { 1132 request->nbytes = 0; 1133 request->status = EIO; 1134 } else { 1135 request->nbytes = rv; 1136 request->status = 0; 1137 } 1138 } else { 1139 if (slice == VD_SLICE_NONE) { 1140 /* 1141 * This is not a disk image so it is a real disk. We 1142 * assume that the underlying device driver supports 1143 * USCSICMD ioctls. This is the case of all SCSI devices 1144 * (sd, ssd...). 1145 * 1146 * In the future if we have non-SCSI disks we would need 1147 * to invoke the appropriate function to do I/O using an 1148 * absolute disk offset (for example using DKIOCTL_RWCMD 1149 * for IDE disks). 1150 */ 1151 rv = vd_scsi_rdwr(vd, request->operation, bufaddr, 1152 request->addr, request->nbytes); 1153 if (rv != 0) { 1154 request->nbytes = 0; 1155 request->status = EIO; 1156 } else { 1157 request->status = 0; 1158 } 1159 } else { 1160 bioinit(buf); 1161 buf->b_flags = B_BUSY; 1162 buf->b_bcount = request->nbytes; 1163 buf->b_lblkno = request->addr; 1164 buf->b_edev = vd->dev[slice]; 1165 buf->b_un.b_addr = bufaddr; 1166 buf->b_flags |= (request->operation == VD_OP_BREAD)? 1167 B_READ : B_WRITE; 1168 1169 request->status = 1170 ldi_strategy(vd->ldi_handle[slice], buf); 1171 1172 /* 1173 * This is to indicate to the caller that the request 1174 * needs to be finished by vd_complete_bio() by calling 1175 * biowait() there and waiting for that to return before 1176 * triggering the notification of the vDisk client. 1177 * 1178 * This is necessary when writing to real disks as 1179 * otherwise calls to ldi_strategy() would be serialized 1180 * behind the calls to biowait() and performance would 1181 * suffer. 1182 */ 1183 if (request->status == 0) 1184 return (EINPROGRESS); 1185 1186 biofini(buf); 1187 } 1188 } 1189 1190 /* Clean up after error */ 1191 rv = ldc_mem_release(task->mhdl, 0, buflen); 1192 if (rv) { 1193 PR0("ldc_mem_release() returned err %d ", rv); 1194 status = EIO; 1195 } 1196 rv = ldc_mem_unmap(task->mhdl); 1197 if (rv) { 1198 PR0("ldc_mem_unmap() returned err %d ", rv); 1199 status = EIO; 1200 } 1201 1202 return (status); 1203 } 1204 1205 /* 1206 * This function should only be called from vd_notify to ensure that requests 1207 * are responded to in the order that they are received. 

/*
 * This function should only be called from vd_notify() to ensure that
 * requests are responded to in the order that they are received.
 */
static int
send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
{
	int	status;
	size_t	nbytes;

	do {
		nbytes = msglen;
		status = ldc_write(ldc_handle, msg, &nbytes);
		if (status != EWOULDBLOCK)
			break;
		drv_usecwait(vds_ldc_delay);
	} while (status == EWOULDBLOCK);

	if (status != 0) {
		if (status != ECONNRESET)
			PR0("ldc_write() returned errno %d", status);
		return (status);
	} else if (nbytes != msglen) {
		PR0("ldc_write() performed only partial write");
		return (EIO);
	}

	PR1("SENT %lu bytes", msglen);
	return (0);
}

static void
vd_need_reset(vd_t *vd, boolean_t reset_ldc)
{
	mutex_enter(&vd->lock);
	vd->reset_state	= B_TRUE;
	vd->reset_ldc	= reset_ldc;
	mutex_exit(&vd->lock);
}

/*
 * Reset the state of the connection with a client, if needed; reset the LDC
 * transport as well, if needed. This function should only be called from
 * vd_recv_msg(), as it waits for tasks - otherwise a deadlock can occur.
 */
static void
vd_reset_if_needed(vd_t *vd)
{
	int	status = 0;

	mutex_enter(&vd->lock);
	if (!vd->reset_state) {
		ASSERT(!vd->reset_ldc);
		mutex_exit(&vd->lock);
		return;
	}
	mutex_exit(&vd->lock);

	PR0("Resetting connection state with %s", VD_CLIENT(vd));

	/*
	 * Let any asynchronous I/O complete before possibly pulling the rug
	 * out from under it; defer checking vd->reset_ldc, as one of the
	 * asynchronous tasks might set it
	 */
	ddi_taskq_wait(vd->completionq);

	if (vd->file) {
		status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred);
		if (status) {
			PR0("VOP_FSYNC returned errno %d", status);
		}
	}

	if ((vd->initialized & VD_DRING) &&
	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
		PR0("ldc_mem_dring_unmap() returned errno %d", status);

	vd_free_dring_task(vd);

	/* Free the staging buffer for msgs */
	if (vd->vio_msgp != NULL) {
		kmem_free(vd->vio_msgp, vd->max_msglen);
		vd->vio_msgp = NULL;
	}

	/* Free the inband message buffer */
	if (vd->inband_task.msg != NULL) {
		kmem_free(vd->inband_task.msg, vd->max_msglen);
		vd->inband_task.msg = NULL;
	}

	mutex_enter(&vd->lock);

	if (vd->reset_ldc)
		PR0("taking down LDC channel");
	if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0))
		PR0("ldc_down() returned errno %d", status);

	vd->initialized	&= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
	vd->state	= VD_STATE_INIT;
	vd->max_msglen	= sizeof (vio_msg_t);	/* baseline vio message size */

	/* Allocate the staging buffer */
	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);

	PR0("calling ldc_up\n");
	(void) ldc_up(vd->ldc_handle);

	vd->reset_state	= B_FALSE;
	vd->reset_ldc	= B_FALSE;

	mutex_exit(&vd->lock);
}

static void vd_recv_msg(void *arg);

static void
vd_mark_in_reset(vd_t *vd)
{
	int status;

	PR0("vd_mark_in_reset: marking vd in reset\n");

	vd_need_reset(vd, B_FALSE);
	status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP);
	if (status == DDI_FAILURE) {
		PR0("cannot schedule task to recv msg\n");
		vd_need_reset(vd, B_TRUE);
		return;
	}
}

static int
vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes)
{
	boolean_t		accepted;
	int			status;
	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);

	if (vd->reset_state)
		return (0);

	/* Acquire the element */
	if (!vd->reset_state &&
	    (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
			return (0);
		} else {
			PR0("ldc_mem_dring_acquire() returned errno %d",
			    status);
			return (status);
		}
	}

	/* Set the element's status and mark it done */
	accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED);
	if (accepted) {
		elem->payload.nbytes	= elem_nbytes;
		elem->payload.status	= elem_status;
		elem->hdr.dstate	= VIO_DESC_DONE;
	} else {
		/* Perhaps client timed out waiting for I/O... */
		PR0("element %u no longer \"accepted\"", idx);
		VD_DUMP_DRING_ELEM(elem);
	}
	/* Release the element */
	if (!vd->reset_state &&
	    (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
			return (0);
		} else {
			PR0("ldc_mem_dring_release() returned errno %d",
			    status);
			return (status);
		}
	}

	return (accepted ? 0 : EINVAL);
}

/*
 * Return Values
 *	0	- operation completed successfully
 *	EIO	- encountered LDC / task error
 *
 * Side Effect
 *	sets request->status = <disk operation status>
 */
static int
vd_complete_bio(vd_task_t *task)
{
	int			status = 0;
	int			rv = 0;
	vd_t			*vd = task->vd;
	vd_dring_payload_t	*request = task->request;
	struct buf		*buf = &task->buf;


	ASSERT(vd != NULL);
	ASSERT(request != NULL);
	ASSERT(task->msg != NULL);
	ASSERT(task->msglen >= sizeof (*task->msg));
	ASSERT(!vd->file);
	ASSERT(request->slice != VD_SLICE_NONE);

	/* Wait for the I/O to complete [ call to ldi_strategy(9f) ] */
	request->status = biowait(buf);

	/* return back the number of bytes read/written */
	request->nbytes = buf->b_bcount - buf->b_resid;

	/* Release the buffer */
	if (!vd->reset_state)
		status = ldc_mem_release(task->mhdl, 0, buf->b_bcount);
	if (status) {
		PR0("ldc_mem_release() returned errno %d copying to "
		    "client", status);
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
		}
		rv = EIO;
	}

	/* Unmap the memory, even if in reset */
	status = ldc_mem_unmap(task->mhdl);
	if (status) {
		PR0("ldc_mem_unmap() returned errno %d copying to client",
		    status);
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
		}
		rv = EIO;
	}

	biofini(buf);

	return (rv);
}

/*
 * Description:
 *	This function is called by the two functions called by a taskq
 *	[ vd_complete_notify() and vd_serial_notify() ] to send the
 *	message to the client.
 *
 * Parameters:
 *	arg	- opaque pointer to structure containing task to be completed
 *
 * Return Values
 *	None
 */
static void
vd_notify(vd_task_t *task)
{
	int status;

	ASSERT(task != NULL);
	ASSERT(task->vd != NULL);

	if (task->vd->reset_state)
		return;

	/*
	 * Send the "ack" or "nack" back to the client; if sending the message
	 * via LDC fails, arrange to reset both the connection state and LDC
	 * itself
	 */
	PR2("Sending %s",
	    (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");

	status = send_msg(task->vd->ldc_handle, task->msg, task->msglen);
	switch (status) {
	case 0:
		break;
	case ECONNRESET:
		vd_mark_in_reset(task->vd);
		break;
	default:
		PR0("initiating full reset");
		vd_need_reset(task->vd, B_TRUE);
		break;
	}

	DTRACE_PROBE1(task__end, vd_task_t *, task);
}

/*
 * Description:
 *	Mark the Dring entry as Done and (if necessary) send an ACK/NACK to
 *	the vDisk client
 *
 * Parameters:
 *	task		- structure containing the request sent from client
 *
 * Return Values
 *	None
 */
static void
vd_complete_notify(vd_task_t *task)
{
	int			status = 0;
	vd_t			*vd = task->vd;
	vd_dring_payload_t	*request = task->request;

	/* Update the dring element for a dring client */
	if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE)) {
		status = vd_mark_elem_done(vd, task->index,
		    request->status, request->nbytes);
		if (status == ECONNRESET)
			vd_mark_in_reset(vd);
	}

	/*
	 * If a transport error occurred while marking the element done or
	 * previously while executing the task, arrange to "nack" the message
	 * when the final task in the descriptor element range completes
	 */
	if ((status != 0) || (task->status != 0))
		task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK;

	/*
	 * Only the final task for a range of elements will respond to and
	 * free the message
	 */
	if (task->type == VD_NONFINAL_RANGE_TASK) {
		return;
	}

	vd_notify(task);
}

/*
 * Description:
 *	This is the basic completion function called to handle inband data
 *	requests and handshake messages. All it needs to do is trigger a
 *	message to the client that the request is completed.
 *
 * Parameters:
 *	arg	- opaque pointer to structure containing task to be completed
 *
 * Return Values
 *	None
 */
static void
vd_serial_notify(void *arg)
{
	vd_task_t		*task = (vd_task_t *)arg;

	ASSERT(task != NULL);
	vd_notify(task);
}

static void
vd_geom2dk_geom(void *vd_buf, void *ioctl_arg)
{
	VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg);
}

static void
vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg)
{
	VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg);
}

static void
dk_geom2vd_geom(void *ioctl_arg, void *vd_buf)
{
	DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf);
}

static void
vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf)
{
	VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf);
}

static void
vd_get_efi_in(void *vd_buf, void *ioctl_arg)
{
	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;

	dk_efi->dki_lba = vd_efi->lba;
	dk_efi->dki_length = vd_efi->length;
	dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP);
}

static void
vd_get_efi_out(void *ioctl_arg, void *vd_buf)
{
	int len;
	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;

	len = vd_efi->length;
	DK_EFI2VD_EFI(dk_efi, vd_efi);
	kmem_free(dk_efi->dki_data, len);
}

static void
vd_set_efi_in(void *vd_buf, void *ioctl_arg)
{
	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;

	dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP);
	VD_EFI2DK_EFI(vd_efi, dk_efi);
}

static void
vd_set_efi_out(void *ioctl_arg, void *vd_buf)
{
	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;

	kmem_free(dk_efi->dki_data, vd_efi->length);
}

static vd_disk_label_t
vd_read_vtoc(vd_t *vd, struct vtoc *vtoc)
{
	int status, rval;
	struct dk_gpt *efi;
	size_t efi_len;

	ASSERT(vd->ldi_handle[0] != NULL);

	status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, (intptr_t)vtoc,
	    (vd->open_flags | FKIOCTL), kcred, &rval);

	if (status == 0) {
		return (VD_DISK_LABEL_VTOC);
	} else if (status != ENOTSUP) {
		PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status);
		return (VD_DISK_LABEL_UNK);
	}

	status = vds_efi_alloc_and_read(vd->ldi_handle[0], &efi, &efi_len);

	if (status) {
		PR0("vds_efi_alloc_and_read returned error %d", status);
		return (VD_DISK_LABEL_UNK);
	}

	vd_efi_to_vtoc(efi, vtoc);
	vd_efi_free(efi, efi_len);

	return (VD_DISK_LABEL_EFI);
}

static ushort_t
vd_lbl2cksum(struct dk_label *label)
{
	int	count;
	ushort_t sum, *sp;

	count = (sizeof (struct dk_label)) / (sizeof (short)) - 1;
	sp = (ushort_t *)label;
	sum = 0;
	while (count--) {
		sum ^= *sp++;
	}

	return (sum);
}
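
/*
 * Note (added for clarity): vd_lbl2cksum() XORs every 16-bit word of the
 * label except the last one, which is the dkl_cksum field itself. Storing
 * the result there makes the XOR of the complete label equal zero, the
 * usual VTOC checksum invariant; vd_file_validate_geometry() relies on this
 * when it recomputes and compares the checksum.
 */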

/*
 * Handle ioctls to a disk slice.
 *
 * Return Values
 *	0	- Indicates that there are no errors in disk operations
 *	ENOTSUP	- Unknown disk label type or unsupported DKIO ioctl
 *	EINVAL	- Not enough room to copy the EFI label
 *
 */
static int
vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
{
	dk_efi_t *dk_ioc;

	switch (vd->vdisk_label) {

	/* ioctls for a slice from a disk with a VTOC label */
	case VD_DISK_LABEL_VTOC:

		switch (cmd) {
		case DKIOCGGEOM:
			ASSERT(ioctl_arg != NULL);
			bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom));
			return (0);
		case DKIOCGVTOC:
			ASSERT(ioctl_arg != NULL);
			bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc));
			return (0);
		default:
			return (ENOTSUP);
		}

	/* ioctls for a slice from a disk with an EFI label */
	case VD_DISK_LABEL_EFI:

		switch (cmd) {
		case DKIOCGETEFI:
			ASSERT(ioctl_arg != NULL);
			dk_ioc = (dk_efi_t *)ioctl_arg;
			if (dk_ioc->dki_length < vd->dk_efi.dki_length)
				return (EINVAL);
			bcopy(vd->dk_efi.dki_data, dk_ioc->dki_data,
			    vd->dk_efi.dki_length);
			return (0);
		default:
			return (ENOTSUP);
		}

	default:
		/* Unknown disk label type */
		return (ENOTSUP);
	}
}

/*
 * Function:
 *	vd_file_validate_geometry
 *
 * Description:
 *	Read the label and validate the geometry of a disk image. The driver
 *	label, vtoc and geometry information are updated according to the
 *	label read from the disk image.
 *
 *	If no valid label is found, the label is set to unknown and the
 *	function returns EINVAL, but a default vtoc and geometry are provided
 *	to the driver.
 *
 * Parameters:
 *	vd	- disk on which the operation is performed.
 *
 * Return Code:
 *	0	- success.
 *	EIO	- error reading the label from the disk image.
 *	EINVAL	- unknown disk label.
 */
static int
vd_file_validate_geometry(vd_t *vd)
{
	struct dk_label label;
	struct dk_geom *geom = &vd->dk_geom;
	struct vtoc *vtoc = &vd->vtoc;
	int i;
	int status = 0;

	ASSERT(vd->file);

	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
		/*
		 * For a single-slice disk we always fake the geometry, and we
		 * only need to do it once because the geometry will never
		 * change.
		 */
		if (vd->vdisk_label == VD_DISK_LABEL_VTOC)
			/* geometry was already validated */
			return (0);

		ASSERT(vd->vdisk_label == VD_DISK_LABEL_UNK);
		vd_file_build_default_label(vd, &label);
		vd->vdisk_label = VD_DISK_LABEL_VTOC;
	} else {
		if (VD_FILE_LABEL_READ(vd, &label) < 0)
			return (EIO);

		if (label.dkl_magic != DKL_MAGIC ||
		    label.dkl_cksum != vd_lbl2cksum(&label) ||
		    label.dkl_vtoc.v_sanity != VTOC_SANE ||
		    label.dkl_vtoc.v_nparts != V_NUMPAR) {
			vd->vdisk_label = VD_DISK_LABEL_UNK;
			vd_file_build_default_label(vd, &label);
			status = EINVAL;
		} else {
			vd->vdisk_label = VD_DISK_LABEL_VTOC;
		}
	}

	/* Update the driver geometry */
	bzero(geom, sizeof (struct dk_geom));

	geom->dkg_ncyl = label.dkl_ncyl;
	geom->dkg_acyl = label.dkl_acyl;
	geom->dkg_nhead = label.dkl_nhead;
	geom->dkg_nsect = label.dkl_nsect;
	geom->dkg_intrlv = label.dkl_intrlv;
	geom->dkg_apc = label.dkl_apc;
	geom->dkg_rpm = label.dkl_rpm;
	geom->dkg_pcyl = label.dkl_pcyl;
	geom->dkg_write_reinstruct = label.dkl_write_reinstruct;
	geom->dkg_read_reinstruct = label.dkl_read_reinstruct;

	/* Update the driver vtoc */
	bzero(vtoc, sizeof (struct vtoc));

	vtoc->v_sanity = label.dkl_vtoc.v_sanity;
	vtoc->v_version = label.dkl_vtoc.v_version;
	vtoc->v_sectorsz = DEV_BSIZE;
	vtoc->v_nparts = label.dkl_vtoc.v_nparts;

	for (i = 0; i < vtoc->v_nparts; i++) {
		vtoc->v_part[i].p_tag =
		    label.dkl_vtoc.v_part[i].p_tag;
		vtoc->v_part[i].p_flag =
		    label.dkl_vtoc.v_part[i].p_flag;
		vtoc->v_part[i].p_start =
		    label.dkl_map[i].dkl_cylno *
		    (label.dkl_nhead * label.dkl_nsect);
		vtoc->v_part[i].p_size = label.dkl_map[i].dkl_nblk;
		vtoc->timestamp[i] =
		    label.dkl_vtoc.v_timestamp[i];
	}
	/*
	 * The bootinfo array cannot be copied with bcopy() because
	 * elements are of type long in vtoc (so 64-bit) and of type
	 * int in dk_vtoc (so 32-bit).
	 */
	vtoc->v_bootinfo[0] = label.dkl_vtoc.v_bootinfo[0];
	vtoc->v_bootinfo[1] = label.dkl_vtoc.v_bootinfo[1];
	vtoc->v_bootinfo[2] = label.dkl_vtoc.v_bootinfo[2];
	bcopy(label.dkl_asciilabel, vtoc->v_asciilabel,
	    LEN_DKL_ASCII);
	bcopy(label.dkl_vtoc.v_volume, vtoc->v_volume,
	    LEN_DKL_VVOL);

	return (status);
}
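
/*
 * Illustrative example (not from the original source): in the dk_label
 * format slice starts are stored as cylinder numbers, so p_start is
 * reconstructed above as dkl_cylno * nhead * nsect; with the default 10 MB
 * geometry (nhead=1, nsect=602), a slice starting at cylinder 10 would get
 * p_start = 6020.
 */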

/*
 * Handle ioctls to a disk image (file-based).
 *
 * Return Values
 *	0	- Indicates that there are no errors
 *	!= 0	- Disk operation returned an error
 */
static int
vd_do_file_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
{
	struct dk_label label;
	struct dk_geom *geom;
	struct vtoc *vtoc;
	int i, rc;

	ASSERT(vd->file);

	switch (cmd) {

	case DKIOCGGEOM:
		ASSERT(ioctl_arg != NULL);
		geom = (struct dk_geom *)ioctl_arg;

		rc = vd_file_validate_geometry(vd);
		if (rc != 0 && rc != EINVAL) {
			ASSERT(vd->vdisk_type != VD_DISK_TYPE_SLICE);
			return (rc);
		}

		bcopy(&vd->dk_geom, geom, sizeof (struct dk_geom));
		return (0);

	case DKIOCGVTOC:
		ASSERT(ioctl_arg != NULL);
		vtoc = (struct vtoc *)ioctl_arg;

		rc = vd_file_validate_geometry(vd);
		if (rc != 0 && rc != EINVAL) {
			ASSERT(vd->vdisk_type != VD_DISK_TYPE_SLICE);
			return (rc);
		}

		bcopy(&vd->vtoc, vtoc, sizeof (struct vtoc));
		return (0);

	case DKIOCSGEOM:
		ASSERT(ioctl_arg != NULL);
		geom = (struct dk_geom *)ioctl_arg;

		/* geometry can only be changed for a full disk */
		if (vd->vdisk_type != VD_DISK_TYPE_DISK)
			return (ENOTSUP);

		if (geom->dkg_nhead == 0 || geom->dkg_nsect == 0)
			return (EINVAL);

		/*
		 * The current device geometry is not updated, just the driver
		 * "notion" of it. The device geometry will be effectively
		 * updated when a label is written to the device during the
		 * next DKIOCSVTOC.
		 */
		bcopy(ioctl_arg, &vd->dk_geom, sizeof (vd->dk_geom));
		return (0);

	case DKIOCSVTOC:
		ASSERT(ioctl_arg != NULL);
		ASSERT(vd->dk_geom.dkg_nhead != 0 &&
		    vd->dk_geom.dkg_nsect != 0);
		vtoc = (struct vtoc *)ioctl_arg;

		/* vtoc can only be changed for a full disk */
		if (vd->vdisk_type != VD_DISK_TYPE_DISK)
			return (ENOTSUP);

		if (vtoc->v_sanity != VTOC_SANE ||
		    vtoc->v_sectorsz != DEV_BSIZE ||
		    vtoc->v_nparts != V_NUMPAR)
			return (EINVAL);

		bzero(&label, sizeof (label));
		label.dkl_ncyl = vd->dk_geom.dkg_ncyl;
		label.dkl_acyl = vd->dk_geom.dkg_acyl;
		label.dkl_pcyl = vd->dk_geom.dkg_pcyl;
		label.dkl_nhead = vd->dk_geom.dkg_nhead;
		label.dkl_nsect = vd->dk_geom.dkg_nsect;
		label.dkl_intrlv = vd->dk_geom.dkg_intrlv;
		label.dkl_apc = vd->dk_geom.dkg_apc;
		label.dkl_rpm = vd->dk_geom.dkg_rpm;
		label.dkl_write_reinstruct = vd->dk_geom.dkg_write_reinstruct;
		label.dkl_read_reinstruct = vd->dk_geom.dkg_read_reinstruct;

		label.dkl_vtoc.v_nparts = V_NUMPAR;
		label.dkl_vtoc.v_sanity = VTOC_SANE;
		label.dkl_vtoc.v_version = vtoc->v_version;
		for (i = 0; i < V_NUMPAR; i++) {
			label.dkl_vtoc.v_timestamp[i] =
			    vtoc->timestamp[i];
			label.dkl_vtoc.v_part[i].p_tag =
			    vtoc->v_part[i].p_tag;
			label.dkl_vtoc.v_part[i].p_flag =
			    vtoc->v_part[i].p_flag;
			label.dkl_map[i].dkl_cylno =
			    vtoc->v_part[i].p_start /
			    (label.dkl_nhead * label.dkl_nsect);
			label.dkl_map[i].dkl_nblk =
			    vtoc->v_part[i].p_size;
		}
		/*
		 * The bootinfo array cannot be copied with bcopy() because
		 * elements are of type long in vtoc (so 64-bit) and of type
		 * int in dk_vtoc (so 32-bit).
		 */
		label.dkl_vtoc.v_bootinfo[0] = vtoc->v_bootinfo[0];
		label.dkl_vtoc.v_bootinfo[1] = vtoc->v_bootinfo[1];
		label.dkl_vtoc.v_bootinfo[2] = vtoc->v_bootinfo[2];
		bcopy(vtoc->v_asciilabel, label.dkl_asciilabel,
		    LEN_DKL_ASCII);
		bcopy(vtoc->v_volume, label.dkl_vtoc.v_volume,
		    LEN_DKL_VVOL);

		/* re-compute checksum */
		label.dkl_magic = DKL_MAGIC;
		label.dkl_cksum = vd_lbl2cksum(&label);

		/* write label to the disk image */
		if ((rc = vd_file_set_vtoc(vd, &label)) != 0)
			return (rc);

		/* check the geometry and update the driver info */
		if ((rc = vd_file_validate_geometry(vd)) != 0)
			return (rc);

		/*
		 * The disk geometry may have changed, so we need to write
		 * the devid (if there is one) so that it is stored at the
		 * right location.
		 */
		if (vd->file_devid != NULL &&
		    vd_file_write_devid(vd, vd->file_devid) != 0) {
			PR0("Failed to write devid");
		}

		return (0);

	case DKIOCFLUSHWRITECACHE:
		return (VOP_FSYNC(vd->file_vnode, FSYNC, kcred));

	default:
		return (ENOTSUP);
	}
}

/*
 * Description:
 *	This is the function that processes the ioctl requests (farming it
 *	out to functions that handle slices, files or whole disks)
 *
 * Return Values
 *	0	- ioctl operation completed successfully
 *	!= 0	- The LDC error value encountered
 *		  (propagated back up the call stack as a task error)
 *
 * Side Effect
 *	sets request->status to the return value of the ioctl function.
 */
static int
vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl)
{
	int	rval = 0, status = 0;
	size_t	nbytes = request->nbytes;	/* modifiable copy */


	ASSERT(request->slice < vd->nslices);
	PR0("Performing %s", ioctl->operation_name);

	/* Get data from client and convert, if necessary */
	if (ioctl->copyin != NULL) {
		ASSERT(nbytes != 0 && buf != NULL);
		PR1("Getting \"arg\" data from client");
		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
		    request->cookie, request->ncookies,
		    LDC_COPY_IN)) != 0) {
			PR0("ldc_mem_copy() returned errno %d "
			    "copying from client", status);
			return (status);
		}

		/* Convert client's data, if necessary */
		if (ioctl->copyin == VD_IDENTITY)	/* use client buffer */
			ioctl->arg = buf;
		else	/* convert client vdisk operation data to ioctl data */
			(ioctl->copyin)(buf, (void *)ioctl->arg);
	}

	/*
	 * Handle single-slice block devices internally; otherwise, have the
	 * real driver perform the ioctl()
	 */
	if (vd->file) {
		request->status =
		    vd_do_file_ioctl(vd, ioctl->cmd, (void *)ioctl->arg);

	} else if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) {
		request->status =
		    vd_do_slice_ioctl(vd, ioctl->cmd, (void *)ioctl->arg);

	} else {
		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
		    ioctl->cmd, (intptr_t)ioctl->arg, vd->open_flags | FKIOCTL,
		    kcred, &rval);

#ifdef DEBUG
		if (rval != 0) {
			PR0("%s set rval = %d, which is not being returned to"
			    " client", ioctl->cmd_name, rval);
		}
#endif /* DEBUG */
	}

	if (request->status != 0) {
		PR0("ioctl(%s) = errno %d", ioctl->cmd_name, request->status);
		return (0);
	}

	/* Convert data and send to client, if necessary */
	if (ioctl->copyout != NULL) {
		ASSERT(nbytes != 0 && buf != NULL);
		PR1("Sending \"arg\" data to client");

		/* Convert ioctl data to vdisk operation data, if necessary */
		if (ioctl->copyout != VD_IDENTITY)
			(ioctl->copyout)((void *)ioctl->arg, buf);

		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
		    request->cookie, request->ncookies,
		    LDC_COPY_OUT)) != 0) {
			PR0("ldc_mem_copy() returned errno %d "
			    "copying to client", status);
			return (status);
		}
	}

	return (status);
}

#define	RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t))
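
/*
 * Example (added for clarity): RNDSIZE() rounds a size up to a multiple of
 * 8 bytes because LDC memory copies operate on 64-bit granularity (see the
 * ASSERT in vd_ioctl() below); e.g. RNDSIZE(int) is 8 even though
 * sizeof (int) is 4.
 */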
(ioctl->copyout != NULL) { 2066 ASSERT(nbytes != 0 && buf != NULL); 2067 PR1("Sending \"arg\" data to client"); 2068 2069 /* Convert ioctl data to vdisk operation data, if necessary */ 2070 if (ioctl->copyout != VD_IDENTITY) 2071 (ioctl->copyout)((void *)ioctl->arg, buf); 2072 2073 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 2074 request->cookie, request->ncookies, 2075 LDC_COPY_OUT)) != 0) { 2076 PR0("ldc_mem_copy() returned errno %d " 2077 "copying to client", status); 2078 return (status); 2079 } 2080 } 2081 2082 return (status); 2083 } 2084 2085 #define RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t)) 2086 2087 /* 2088 * Description: 2089 * This generic function is called by the task queue to complete 2090 * the processing of the tasks. The specific completion function 2091 * is passed in as a field in the task pointer. 2092 * 2093 * Parameters: 2094 * arg - opaque pointer to structure containing task to be completed 2095 * 2096 * Return Values 2097 * None 2098 */ 2099 static void 2100 vd_complete(void *arg) 2101 { 2102 vd_task_t *task = (vd_task_t *)arg; 2103 2104 ASSERT(task != NULL); 2105 ASSERT(task->status == EINPROGRESS); 2106 ASSERT(task->completef != NULL); 2107 2108 task->status = task->completef(task); 2109 if (task->status) 2110 PR0("%s: Error %d completing task", __func__, task->status); 2111 2112 /* Now notify the vDisk client */ 2113 vd_complete_notify(task); 2114 } 2115 2116 static int 2117 vd_ioctl(vd_task_t *task) 2118 { 2119 int i, status; 2120 void *buf = NULL; 2121 struct dk_geom dk_geom = {0}; 2122 struct vtoc vtoc = {0}; 2123 struct dk_efi dk_efi = {0}; 2124 vd_t *vd = task->vd; 2125 vd_dring_payload_t *request = task->request; 2126 vd_ioctl_t ioctl[] = { 2127 /* Command (no-copy) operations */ 2128 {VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0, 2129 DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE), 2130 NULL, NULL, NULL, B_TRUE}, 2131 2132 /* "Get" (copy-out) operations */ 2133 {VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int), 2134 DKIOCGETWCE, STRINGIZE(DKIOCGETWCE), 2135 NULL, VD_IDENTITY, VD_IDENTITY, B_FALSE}, 2136 {VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM), 2137 RNDSIZE(vd_geom_t), 2138 DKIOCGGEOM, STRINGIZE(DKIOCGGEOM), 2139 &dk_geom, NULL, dk_geom2vd_geom, B_FALSE}, 2140 {VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t), 2141 DKIOCGVTOC, STRINGIZE(DKIOCGVTOC), 2142 &vtoc, NULL, vtoc2vd_vtoc, B_FALSE}, 2143 {VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t), 2144 DKIOCGETEFI, STRINGIZE(DKIOCGETEFI), 2145 &dk_efi, vd_get_efi_in, vd_get_efi_out, B_FALSE}, 2146 2147 /* "Set" (copy-in) operations */ 2148 {VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int), 2149 DKIOCSETWCE, STRINGIZE(DKIOCSETWCE), 2150 NULL, VD_IDENTITY, VD_IDENTITY, B_TRUE}, 2151 {VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM), 2152 RNDSIZE(vd_geom_t), 2153 DKIOCSGEOM, STRINGIZE(DKIOCSGEOM), 2154 &dk_geom, vd_geom2dk_geom, NULL, B_TRUE}, 2155 {VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t), 2156 DKIOCSVTOC, STRINGIZE(DKIOCSVTOC), 2157 &vtoc, vd_vtoc2vtoc, NULL, B_TRUE}, 2158 {VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t), 2159 DKIOCSETEFI, STRINGIZE(DKIOCSETEFI), 2160 &dk_efi, vd_set_efi_in, vd_set_efi_out, B_TRUE}, 2161 }; 2162 size_t nioctls = (sizeof (ioctl))/(sizeof (ioctl[0])); 2163 2164 2165 ASSERT(vd != NULL); 2166 ASSERT(request != NULL); 2167 ASSERT(request->slice < vd->nslices); 2168 2169 /* 2170 * Determine ioctl corresponding to caller's "operation" and 2171 * validate caller's "nbytes" 
2172 */ 2173 for (i = 0; i < nioctls; i++) { 2174 if (request->operation == ioctl[i].operation) { 2175 /* LDC memory operations require 8-byte multiples */ 2176 ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0); 2177 2178 if (request->operation == VD_OP_GET_EFI || 2179 request->operation == VD_OP_SET_EFI) { 2180 if (request->nbytes >= ioctl[i].nbytes) 2181 break; 2182 PR0("%s: Expected at least nbytes = %lu, " 2183 "got %lu", ioctl[i].operation_name, 2184 ioctl[i].nbytes, request->nbytes); 2185 return (EINVAL); 2186 } 2187 2188 if (request->nbytes != ioctl[i].nbytes) { 2189 PR0("%s: Expected nbytes = %lu, got %lu", 2190 ioctl[i].operation_name, ioctl[i].nbytes, 2191 request->nbytes); 2192 return (EINVAL); 2193 } 2194 2195 break; 2196 } 2197 } 2198 ASSERT(i < nioctls); /* because "operation" already validated */ 2199 2200 if (!(vd->open_flags & FWRITE) && ioctl[i].write) { 2201 PR0("%s fails because backend is opened read-only", 2202 ioctl[i].operation_name); 2203 request->status = EROFS; 2204 return (0); 2205 } 2206 2207 if (request->nbytes) 2208 buf = kmem_zalloc(request->nbytes, KM_SLEEP); 2209 status = vd_do_ioctl(vd, request, buf, &ioctl[i]); 2210 if (request->nbytes) 2211 kmem_free(buf, request->nbytes); 2212 2213 return (status); 2214 } 2215 2216 static int 2217 vd_get_devid(vd_task_t *task) 2218 { 2219 vd_t *vd = task->vd; 2220 vd_dring_payload_t *request = task->request; 2221 vd_devid_t *vd_devid; 2222 impl_devid_t *devid; 2223 int status, bufid_len, devid_len, len, sz; 2224 int bufbytes; 2225 2226 PR1("Get Device ID, nbytes=%ld", request->nbytes); 2227 2228 if (vd->file) { 2229 if (vd->file_devid == NULL) { 2230 PR2("No Device ID"); 2231 request->status = ENOENT; 2232 return (0); 2233 } else { 2234 sz = ddi_devid_sizeof(vd->file_devid); 2235 devid = kmem_alloc(sz, KM_SLEEP); 2236 bcopy(vd->file_devid, devid, sz); 2237 } 2238 } else { 2239 if (ddi_lyr_get_devid(vd->dev[request->slice], 2240 (ddi_devid_t *)&devid) != DDI_SUCCESS) { 2241 PR2("No Device ID"); 2242 request->status = ENOENT; 2243 return (0); 2244 } 2245 } 2246 2247 bufid_len = request->nbytes - sizeof (vd_devid_t) + 1; 2248 devid_len = DEVID_GETLEN(devid); 2249 2250 /* 2251 * Save the buffer size here for use in deallocation. 2252 * The actual number of bytes copied is returned in 2253 * the 'nbytes' field of the request structure. 2254 */ 2255 bufbytes = request->nbytes; 2256 2257 vd_devid = kmem_zalloc(bufbytes, KM_SLEEP); 2258 vd_devid->length = devid_len; 2259 vd_devid->type = DEVID_GETTYPE(devid); 2260 2261 len = (devid_len > bufid_len)? 
	    bufid_len : devid_len;

	bcopy(devid->did_id, vd_devid->id, len);

	request->status = 0;

	/* LDC memory operations require 8-byte multiples */
	ASSERT(request->nbytes % sizeof (uint64_t) == 0);

	if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0,
	    &request->nbytes, request->cookie, request->ncookies,
	    LDC_COPY_OUT)) != 0) {
		PR0("ldc_mem_copy() returned errno %d copying to client",
		    status);
	}
	PR1("post mem_copy: nbytes=%ld", request->nbytes);

	kmem_free(vd_devid, bufbytes);
	ddi_devid_free((ddi_devid_t)devid);

	return (status);
}

/*
 * Define the supported operations once the functions for performing them have
 * been defined
 */
static const vds_operation_t	vds_operation[] = {
#define	X(_s)	#_s, _s
	{X(VD_OP_BREAD),	vd_start_bio,	vd_complete_bio},
	{X(VD_OP_BWRITE),	vd_start_bio,	vd_complete_bio},
	{X(VD_OP_FLUSH),	vd_ioctl,	NULL},
	{X(VD_OP_GET_WCE),	vd_ioctl,	NULL},
	{X(VD_OP_SET_WCE),	vd_ioctl,	NULL},
	{X(VD_OP_GET_VTOC),	vd_ioctl,	NULL},
	{X(VD_OP_SET_VTOC),	vd_ioctl,	NULL},
	{X(VD_OP_GET_DISKGEOM),	vd_ioctl,	NULL},
	{X(VD_OP_SET_DISKGEOM),	vd_ioctl,	NULL},
	{X(VD_OP_GET_EFI),	vd_ioctl,	NULL},
	{X(VD_OP_SET_EFI),	vd_ioctl,	NULL},
	{X(VD_OP_GET_DEVID),	vd_get_devid,	NULL},
#undef	X
};

static const size_t	vds_noperations =
	(sizeof (vds_operation))/(sizeof (vds_operation[0]));

/*
 * Process a task specifying a client I/O request
 *
 * Parameters:
 *	task		- structure containing the request sent from client
 *
 * Return Value
 *	0	- success
 *	ENOTSUP	- Unknown/Unsupported VD_OP_XXX operation
 *	EINVAL	- Invalid disk slice
 *	!= 0	- some other non-zero return value from start function
 */
static int
vd_do_process_task(vd_task_t *task)
{
	int			i;
	vd_t			*vd		= task->vd;
	vd_dring_payload_t	*request	= task->request;

	ASSERT(vd != NULL);
	ASSERT(request != NULL);

	/* Find the requested operation */
	for (i = 0; i < vds_noperations; i++) {
		if (request->operation == vds_operation[i].operation) {
			/* all operations should have a start func */
			ASSERT(vds_operation[i].start != NULL);

			task->completef = vds_operation[i].complete;
			break;
		}
	}
	if (i == vds_noperations) {
		PR0("Unsupported operation %u", request->operation);
		return (ENOTSUP);
	}

	/* Range-check slice */
	if (request->slice >= vd->nslices &&
	    (vd->vdisk_type != VD_DISK_TYPE_DISK ||
	    request->slice != VD_SLICE_NONE)) {
		PR0("Invalid \"slice\" %u (max %u) for virtual disk",
		    request->slice, (vd->nslices - 1));
		return (EINVAL);
	}

	/*
	 * Call the function pointer that starts the operation.
	 */
	return (vds_operation[i].start(task));
}
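
/*
 * Editor's note (illustration): the X() macro above relies on cpp
 * stringizing. For instance, the initializer line
 *
 *	{X(VD_OP_BREAD),	vd_start_bio,	vd_complete_bio},
 *
 * expands to
 *
 *	{"VD_OP_BREAD", VD_OP_BREAD,	vd_start_bio,	vd_complete_bio},
 *
 * so a single token supplies both the human-readable operation name used
 * in log messages and the numeric operation code that vd_do_process_task()
 * matches against request->operation.
 */
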
/*
 * Description:
 *	This function is called by both the in-band and descriptor ring
 *	message processing paths to actually execute the task requested
 *	by the vDisk client. It in turn calls its worker function,
 *	vd_do_process_task(), to carry out the request.
 *
 *	Any transport errors (e.g. LDC errors, vDisk protocol errors) are
 *	saved in the 'status' field of the task and are propagated back
 *	up the call stack to trigger a NACK.
 *
 *	Any request errors (e.g. ENOTTY from an ioctl) are saved in
 *	the 'status' field of the request and result in an ACK being sent
 *	by the completion handler.
 *
 * Parameters:
 *	task		- structure containing the request sent from client
 *
 * Return Value
 *	0		- successful synchronous request.
 *	!= 0		- transport error (e.g. LDC errors, vDisk protocol)
 *	EINPROGRESS	- task will be finished in a completion handler
 */
static int
vd_process_task(vd_task_t *task)
{
	vd_t	*vd = task->vd;
	int	status;

	DTRACE_PROBE1(task__start, vd_task_t *, task);

	task->status = vd_do_process_task(task);

	/*
	 * If the task processing function returned EINPROGRESS, indicating
	 * that the task needs completing, then schedule a taskq entry to
	 * finish it now.
	 *
	 * Otherwise the task processing function returned either zero,
	 * indicating that the task was finished in the start function (and
	 * we don't need to wait in a completion function), or an error - in
	 * both cases all that needs to happen is the notification to the
	 * vDisk client higher up the call stack. If the task was using a
	 * Descriptor Ring, we need to mark it as done at this stage.
	 */
	if (task->status == EINPROGRESS) {
		/* Queue a task to complete the operation */
		(void) ddi_taskq_dispatch(vd->completionq, vd_complete,
		    task, DDI_SLEEP);

	} else if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE)) {
		/* Update the dring element if it's a dring client */
		status = vd_mark_elem_done(vd, task->index,
		    task->request->status, task->request->nbytes);
		if (status == ECONNRESET)
			vd_mark_in_reset(vd);
	}

	return (task->status);
}

/*
 * Return true if the "type", "subtype", and "env" fields of the "tag" first
 * argument match the corresponding remaining arguments; otherwise, return false
 */
boolean_t
vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env)
{
	return ((tag->vio_msgtype == type) &&
	    (tag->vio_subtype == subtype) &&
	    (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE;
}

/*
 * Check whether the major/minor version specified in "ver_msg" is supported
 * by this server.
2437 */ 2438 static boolean_t 2439 vds_supported_version(vio_ver_msg_t *ver_msg) 2440 { 2441 for (int i = 0; i < vds_num_versions; i++) { 2442 ASSERT(vds_version[i].major > 0); 2443 ASSERT((i == 0) || 2444 (vds_version[i].major < vds_version[i-1].major)); 2445 2446 /* 2447 * If the major versions match, adjust the minor version, if 2448 * necessary, down to the highest value supported by this 2449 * server and return true so this message will get "ack"ed; 2450 * the client should also support all minor versions lower 2451 * than the value it sent 2452 */ 2453 if (ver_msg->ver_major == vds_version[i].major) { 2454 if (ver_msg->ver_minor > vds_version[i].minor) { 2455 PR0("Adjusting minor version from %u to %u", 2456 ver_msg->ver_minor, vds_version[i].minor); 2457 ver_msg->ver_minor = vds_version[i].minor; 2458 } 2459 return (B_TRUE); 2460 } 2461 2462 /* 2463 * If the message contains a higher major version number, set 2464 * the message's major/minor versions to the current values 2465 * and return false, so this message will get "nack"ed with 2466 * these values, and the client will potentially try again 2467 * with the same or a lower version 2468 */ 2469 if (ver_msg->ver_major > vds_version[i].major) { 2470 ver_msg->ver_major = vds_version[i].major; 2471 ver_msg->ver_minor = vds_version[i].minor; 2472 return (B_FALSE); 2473 } 2474 2475 /* 2476 * Otherwise, the message's major version is less than the 2477 * current major version, so continue the loop to the next 2478 * (lower) supported version 2479 */ 2480 } 2481 2482 /* 2483 * No common version was found; "ground" the version pair in the 2484 * message to terminate negotiation 2485 */ 2486 ver_msg->ver_major = 0; 2487 ver_msg->ver_minor = 0; 2488 return (B_FALSE); 2489 } 2490 2491 /* 2492 * Process a version message from a client. vds expects to receive version 2493 * messages from clients seeking service, but never issues version messages 2494 * itself; therefore, vds can ACK or NACK client version messages, but does 2495 * not expect to receive version-message ACKs or NACKs (and will treat such 2496 * messages as invalid). 2497 */ 2498 static int 2499 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2500 { 2501 vio_ver_msg_t *ver_msg = (vio_ver_msg_t *)msg; 2502 2503 2504 ASSERT(msglen >= sizeof (msg->tag)); 2505 2506 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2507 VIO_VER_INFO)) { 2508 return (ENOMSG); /* not a version message */ 2509 } 2510 2511 if (msglen != sizeof (*ver_msg)) { 2512 PR0("Expected %lu-byte version message; " 2513 "received %lu bytes", sizeof (*ver_msg), msglen); 2514 return (EBADMSG); 2515 } 2516 2517 if (ver_msg->dev_class != VDEV_DISK) { 2518 PR0("Expected device class %u (disk); received %u", 2519 VDEV_DISK, ver_msg->dev_class); 2520 return (EBADMSG); 2521 } 2522 2523 /* 2524 * We're talking to the expected kind of client; set our device class 2525 * for "ack/nack" back to the client 2526 */ 2527 ver_msg->dev_class = VDEV_DISK_SERVER; 2528 2529 /* 2530 * Check whether the (valid) version message specifies a version 2531 * supported by this server. 
If the version is not supported, return
 * EBADMSG so the message will get "nack"ed; vds_supported_version()
 * will have updated the message with a supported version for the
 * client to consider
 */
	if (!vds_supported_version(ver_msg))
		return (EBADMSG);


	/*
	 * A version has been agreed upon; use the client's SID for
	 * communication on this channel now
	 */
	ASSERT(!(vd->initialized & VD_SID));
	vd->sid = ver_msg->tag.vio_sid;
	vd->initialized |= VD_SID;

	/*
	 * When multiple versions are supported, this function should store
	 * the negotiated major and minor version values in the "vd" data
	 * structure to govern further communication; in particular, note that
	 * the client might have specified a lower minor version for the
	 * agreed major version than specified in the vds_version[] array. The
	 * following assertions should help remind future maintainers to make
	 * the appropriate changes to support multiple versions.
	 */
	ASSERT(vds_num_versions == 1);
	ASSERT(ver_msg->ver_major == vds_version[0].major);
	ASSERT(ver_msg->ver_minor == vds_version[0].minor);

	PR0("Using major version %u, minor version %u",
	    ver_msg->ver_major, ver_msg->ver_minor);
	return (0);
}
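
/*
 * Editor's note: a worked example of the negotiation above, assuming a
 * hypothetical vds_version[] of {{2, 1}, {1, 3}} (major numbers strictly
 * descending, as the ASSERTs in vds_supported_version() require):
 *
 *	client sends 2.4 -> major match; minor clamped, message ACKed as 2.1
 *	client sends 1.2 -> major match; 1.2 <= 1.3, ACKed unchanged
 *	client sends 3.0 -> message rewritten to 2.1 and NACKed; the client
 *			    may retry with that or a lower version
 *	client sends 0.9 -> no match; grounded to 0.0 and NACKed to
 *			    terminate negotiation
 */
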
"yes" : "no"), 2628 vd->nslices); 2629 } 2630 2631 /* Success: valid message and transfer mode */ 2632 vd->xfer_mode = attr_msg->xfer_mode; 2633 2634 if (vd->xfer_mode == VIO_DESC_MODE) { 2635 2636 /* 2637 * The vd_dring_inband_msg_t contains one cookie; need room 2638 * for up to n-1 more cookies, where "n" is the number of full 2639 * pages plus possibly one partial page required to cover 2640 * "max_xfer_sz". Add room for one more cookie if 2641 * "max_xfer_sz" isn't an integral multiple of the page size. 2642 * Must first get the maximum transfer size in bytes. 2643 */ 2644 size_t max_xfer_bytes = attr_msg->vdisk_block_size ? 2645 attr_msg->vdisk_block_size*attr_msg->max_xfer_sz : 2646 attr_msg->max_xfer_sz; 2647 size_t max_inband_msglen = 2648 sizeof (vd_dring_inband_msg_t) + 2649 ((max_xfer_bytes/PAGESIZE + 2650 ((max_xfer_bytes % PAGESIZE) ? 1 : 0))* 2651 (sizeof (ldc_mem_cookie_t))); 2652 2653 /* 2654 * Set the maximum expected message length to 2655 * accommodate in-band-descriptor messages with all 2656 * their cookies 2657 */ 2658 vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen); 2659 2660 /* 2661 * Initialize the data structure for processing in-band I/O 2662 * request descriptors 2663 */ 2664 vd->inband_task.vd = vd; 2665 vd->inband_task.msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 2666 vd->inband_task.index = 0; 2667 vd->inband_task.type = VD_FINAL_RANGE_TASK; /* range == 1 */ 2668 } 2669 2670 /* Return the device's block size and max transfer size to the client */ 2671 attr_msg->vdisk_block_size = DEV_BSIZE; 2672 attr_msg->max_xfer_sz = vd->max_xfer_sz; 2673 2674 attr_msg->vdisk_size = vd->vdisk_size; 2675 attr_msg->vdisk_type = vd->vdisk_type; 2676 attr_msg->operations = vds_operations; 2677 PR0("%s", VD_CLIENT(vd)); 2678 2679 ASSERT(vd->dring_task == NULL); 2680 2681 return (0); 2682 } 2683 2684 static int 2685 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2686 { 2687 int status; 2688 size_t expected; 2689 ldc_mem_info_t dring_minfo; 2690 vio_dring_reg_msg_t *reg_msg = (vio_dring_reg_msg_t *)msg; 2691 2692 2693 ASSERT(msglen >= sizeof (msg->tag)); 2694 2695 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2696 VIO_DRING_REG)) { 2697 PR0("Message is not a register-dring message"); 2698 return (ENOMSG); 2699 } 2700 2701 if (msglen < sizeof (*reg_msg)) { 2702 PR0("Expected at least %lu-byte register-dring message; " 2703 "received %lu bytes", sizeof (*reg_msg), msglen); 2704 return (EBADMSG); 2705 } 2706 2707 expected = sizeof (*reg_msg) + 2708 (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0])); 2709 if (msglen != expected) { 2710 PR0("Expected %lu-byte register-dring message; " 2711 "received %lu bytes", expected, msglen); 2712 return (EBADMSG); 2713 } 2714 2715 if (vd->initialized & VD_DRING) { 2716 PR0("A dring was previously registered; only support one"); 2717 return (EBADMSG); 2718 } 2719 2720 if (reg_msg->num_descriptors > INT32_MAX) { 2721 PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)", 2722 reg_msg->ncookies, INT32_MAX, STRINGIZE(INT32_MAX)); 2723 return (EBADMSG); 2724 } 2725 2726 if (reg_msg->ncookies != 1) { 2727 /* 2728 * In addition to fixing the assertion in the success case 2729 * below, supporting drings which require more than one 2730 * "cookie" requires increasing the value of vd->max_msglen 2731 * somewhere in the code path prior to receiving the message 2732 * which results in calling this function. 
static int
vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	int		status;
	size_t		expected;
	ldc_mem_info_t	dring_minfo;
	vio_dring_reg_msg_t	*reg_msg = (vio_dring_reg_msg_t *)msg;


	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
	    VIO_DRING_REG)) {
		PR0("Message is not a register-dring message");
		return (ENOMSG);
	}

	if (msglen < sizeof (*reg_msg)) {
		PR0("Expected at least %lu-byte register-dring message; "
		    "received %lu bytes", sizeof (*reg_msg), msglen);
		return (EBADMSG);
	}

	expected = sizeof (*reg_msg) +
	    (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0]));
	if (msglen != expected) {
		PR0("Expected %lu-byte register-dring message; "
		    "received %lu bytes", expected, msglen);
		return (EBADMSG);
	}

	if (vd->initialized & VD_DRING) {
		PR0("A dring was previously registered; only support one");
		return (EBADMSG);
	}

	if (reg_msg->num_descriptors > INT32_MAX) {
		PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)",
		    reg_msg->num_descriptors, INT32_MAX, STRINGIZE(INT32_MAX));
		return (EBADMSG);
	}

	if (reg_msg->ncookies != 1) {
		/*
		 * In addition to fixing the assertion in the success case
		 * below, supporting drings which require more than one
		 * "cookie" requires increasing the value of vd->max_msglen
		 * somewhere in the code path prior to receiving the message
		 * which results in calling this function. Note that without
		 * making this change, the larger message size required to
		 * accommodate multiple cookies cannot be successfully
		 * received, so this function will not even get called.
		 * Gracefully accommodating more dring cookies might
		 * reasonably demand exchanging an additional attribute or
		 * making a minor protocol adjustment.
		 */
		PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies);
		return (EBADMSG);
	}

	status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie,
	    reg_msg->ncookies, reg_msg->num_descriptors,
	    reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle);
	if (status != 0) {
		PR0("ldc_mem_dring_map() returned errno %d", status);
		return (status);
	}

	/*
	 * To remove the need for this assertion, must call
	 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a
	 * successful call to ldc_mem_dring_map()
	 */
	ASSERT(reg_msg->ncookies == 1);

	if ((status =
	    ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) {
		PR0("ldc_mem_dring_info() returned errno %d", status);
		if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)
			PR0("ldc_mem_dring_unmap() returned errno %d", status);
		return (status);
	}

	if (dring_minfo.vaddr == NULL) {
		PR0("Descriptor ring virtual address is NULL");
		return (ENXIO);
	}


	/* Initialize for valid message and mapped dring */
	vd->initialized |= VD_DRING;
	vd->dring_ident = 1;	/* "There Can Be Only One" */
	vd->dring = dring_minfo.vaddr;
	vd->descriptor_size = reg_msg->descriptor_size;
	vd->dring_len = reg_msg->num_descriptors;
	reg_msg->dring_ident = vd->dring_ident;
	PR1("descriptor size = %u, dring length = %u",
	    vd->descriptor_size, vd->dring_len);

	/*
	 * Allocate and initialize a "shadow" array of data structures for
	 * tasks to process I/O requests in dring elements
	 */
	vd->dring_task =
	    kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP);
	for (int i = 0; i < vd->dring_len; i++) {
		vd->dring_task[i].vd		= vd;
		vd->dring_task[i].index		= i;
		vd->dring_task[i].request	= &VD_DRING_ELEM(i)->payload;

		status = ldc_mem_alloc_handle(vd->ldc_handle,
		    &(vd->dring_task[i].mhdl));
		if (status) {
			PR0("ldc_mem_alloc_handle() returned err %d ", status);
			return (ENXIO);
		}

		vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP);
	}

	return (0);
}

static int
vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	vio_dring_unreg_msg_t	*unreg_msg = (vio_dring_unreg_msg_t *)msg;


	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
	    VIO_DRING_UNREG)) {
		PR0("Message is not an unregister-dring message");
		return (ENOMSG);
	}

	if (msglen != sizeof (*unreg_msg)) {
		PR0("Expected %lu-byte unregister-dring message; "
		    "received %lu bytes", sizeof (*unreg_msg), msglen);
		return (EBADMSG);
	}

	if (unreg_msg->dring_ident != vd->dring_ident) {
		PR0("Expected dring ident %lu; received %lu",
		    vd->dring_ident, unreg_msg->dring_ident);
		return (EBADMSG);
	}

	return (0);
}

static int
process_rdx_msg(vio_msg_t *msg, size_t msglen)
{
	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL,
VIO_SUBTYPE_INFO, VIO_RDX)) { 2842 PR0("Message is not an RDX message"); 2843 return (ENOMSG); 2844 } 2845 2846 if (msglen != sizeof (vio_rdx_msg_t)) { 2847 PR0("Expected %lu-byte RDX message; received %lu bytes", 2848 sizeof (vio_rdx_msg_t), msglen); 2849 return (EBADMSG); 2850 } 2851 2852 PR0("Valid RDX message"); 2853 return (0); 2854 } 2855 2856 static int 2857 vd_check_seq_num(vd_t *vd, uint64_t seq_num) 2858 { 2859 if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) { 2860 PR0("Received seq_num %lu; expected %lu", 2861 seq_num, (vd->seq_num + 1)); 2862 PR0("initiating soft reset"); 2863 vd_need_reset(vd, B_FALSE); 2864 return (1); 2865 } 2866 2867 vd->seq_num = seq_num; 2868 vd->initialized |= VD_SEQ_NUM; /* superfluous after first time... */ 2869 return (0); 2870 } 2871 2872 /* 2873 * Return the expected size of an inband-descriptor message with all the 2874 * cookies it claims to include 2875 */ 2876 static size_t 2877 expected_inband_size(vd_dring_inband_msg_t *msg) 2878 { 2879 return ((sizeof (*msg)) + 2880 (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0]))); 2881 } 2882 2883 /* 2884 * Process an in-band descriptor message: used with clients like OBP, with 2885 * which vds exchanges descriptors within VIO message payloads, rather than 2886 * operating on them within a descriptor ring 2887 */ 2888 static int 2889 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2890 { 2891 size_t expected; 2892 vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg; 2893 2894 2895 ASSERT(msglen >= sizeof (msg->tag)); 2896 2897 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 2898 VIO_DESC_DATA)) { 2899 PR1("Message is not an in-band-descriptor message"); 2900 return (ENOMSG); 2901 } 2902 2903 if (msglen < sizeof (*desc_msg)) { 2904 PR0("Expected at least %lu-byte descriptor message; " 2905 "received %lu bytes", sizeof (*desc_msg), msglen); 2906 return (EBADMSG); 2907 } 2908 2909 if (msglen != (expected = expected_inband_size(desc_msg))) { 2910 PR0("Expected %lu-byte descriptor message; " 2911 "received %lu bytes", expected, msglen); 2912 return (EBADMSG); 2913 } 2914 2915 if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) 2916 return (EBADMSG); 2917 2918 /* 2919 * Valid message: Set up the in-band descriptor task and process the 2920 * request. Arrange to acknowledge the client's message, unless an 2921 * error processing the descriptor task results in setting 2922 * VIO_SUBTYPE_NACK 2923 */ 2924 PR1("Valid in-band-descriptor message"); 2925 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 2926 2927 ASSERT(vd->inband_task.msg != NULL); 2928 2929 bcopy(msg, vd->inband_task.msg, msglen); 2930 vd->inband_task.msglen = msglen; 2931 2932 /* 2933 * The task request is now the payload of the message 2934 * that was just copied into the body of the task. 
	 */
	desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg;
	vd->inband_task.request	= &desc_msg->payload;

	return (vd_process_task(&vd->inband_task));
}

static int
vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx,
    vio_msg_t *msg, size_t msglen)
{
	int			status;
	boolean_t		ready;
	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);


	/* Accept the updated dring element */
	if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
		PR0("ldc_mem_dring_acquire() returned errno %d", status);
		return (status);
	}
	ready = (elem->hdr.dstate == VIO_DESC_READY);
	if (ready) {
		elem->hdr.dstate = VIO_DESC_ACCEPTED;
	} else {
		PR0("descriptor %u not ready", idx);
		VD_DUMP_DRING_ELEM(elem);
	}
	if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
		PR0("ldc_mem_dring_release() returned errno %d", status);
		return (status);
	}
	if (!ready)
		return (EBUSY);


	/* Initialize a task and process the accepted element */
	PR1("Processing dring element %u", idx);
	vd->dring_task[idx].type	= type;

	/* duplicate msg buf for cookies etc. */
	bcopy(msg, vd->dring_task[idx].msg, msglen);

	vd->dring_task[idx].msglen	= msglen;
	return (vd_process_task(&vd->dring_task[idx]));
}

static int
vd_process_element_range(vd_t *vd, int start, int end,
    vio_msg_t *msg, size_t msglen)
{
	int		i, n, nelem, status = 0;
	boolean_t	inprogress = B_FALSE;
	vd_task_type_t	type;


	ASSERT(start >= 0);
	ASSERT(end >= 0);

	/*
	 * Arrange to acknowledge the client's message, unless an error
	 * processing one of the dring elements results in setting
	 * VIO_SUBTYPE_NACK
	 */
	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;

	/*
	 * Process the dring elements in the range
	 */
	nelem = ((end < start) ? end + vd->dring_len : end) - start + 1;
	for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) {
		((vio_dring_msg_t *)msg)->end_idx = i;
		type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK;
		status = vd_process_element(vd, type, i, msg, msglen);
		if (status == EINPROGRESS)
			inprogress = B_TRUE;
		else if (status != 0)
			break;
	}

	/*
	 * If some, but not all, operations of a multi-element range are in
	 * progress, wait for other operations to complete before returning
	 * (which will result in "ack" or "nack" of the message). Note that
	 * all outstanding operations will need to complete, not just the ones
	 * corresponding to the current range of dring elements; however, as
	 * this situation is an error case, performance is less critical.
3022 */ 3023 if ((nelem > 1) && (status != EINPROGRESS) && inprogress) 3024 ddi_taskq_wait(vd->completionq); 3025 3026 return (status); 3027 } 3028 3029 static int 3030 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3031 { 3032 vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg; 3033 3034 3035 ASSERT(msglen >= sizeof (msg->tag)); 3036 3037 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 3038 VIO_DRING_DATA)) { 3039 PR1("Message is not a dring-data message"); 3040 return (ENOMSG); 3041 } 3042 3043 if (msglen != sizeof (*dring_msg)) { 3044 PR0("Expected %lu-byte dring message; received %lu bytes", 3045 sizeof (*dring_msg), msglen); 3046 return (EBADMSG); 3047 } 3048 3049 if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) 3050 return (EBADMSG); 3051 3052 if (dring_msg->dring_ident != vd->dring_ident) { 3053 PR0("Expected dring ident %lu; received ident %lu", 3054 vd->dring_ident, dring_msg->dring_ident); 3055 return (EBADMSG); 3056 } 3057 3058 if (dring_msg->start_idx >= vd->dring_len) { 3059 PR0("\"start_idx\" = %u; must be less than %u", 3060 dring_msg->start_idx, vd->dring_len); 3061 return (EBADMSG); 3062 } 3063 3064 if ((dring_msg->end_idx < 0) || 3065 (dring_msg->end_idx >= vd->dring_len)) { 3066 PR0("\"end_idx\" = %u; must be >= 0 and less than %u", 3067 dring_msg->end_idx, vd->dring_len); 3068 return (EBADMSG); 3069 } 3070 3071 /* Valid message; process range of updated dring elements */ 3072 PR1("Processing descriptor range, start = %u, end = %u", 3073 dring_msg->start_idx, dring_msg->end_idx); 3074 return (vd_process_element_range(vd, dring_msg->start_idx, 3075 dring_msg->end_idx, msg, msglen)); 3076 } 3077 3078 static int 3079 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes) 3080 { 3081 int retry, status; 3082 size_t size = *nbytes; 3083 3084 3085 for (retry = 0, status = ETIMEDOUT; 3086 retry < vds_ldc_retries && status == ETIMEDOUT; 3087 retry++) { 3088 PR1("ldc_read() attempt %d", (retry + 1)); 3089 *nbytes = size; 3090 status = ldc_read(ldc_handle, msg, nbytes); 3091 } 3092 3093 if (status) { 3094 PR0("ldc_read() returned errno %d", status); 3095 if (status != ECONNRESET) 3096 return (ENOMSG); 3097 return (status); 3098 } else if (*nbytes == 0) { 3099 PR1("ldc_read() returned 0 and no message read"); 3100 return (ENOMSG); 3101 } 3102 3103 PR1("RCVD %lu-byte message", *nbytes); 3104 return (0); 3105 } 3106 3107 static int 3108 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3109 { 3110 int status; 3111 3112 3113 PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype, 3114 msg->tag.vio_subtype, msg->tag.vio_subtype_env); 3115 #ifdef DEBUG 3116 vd_decode_tag(msg); 3117 #endif 3118 3119 /* 3120 * Validate session ID up front, since it applies to all messages 3121 * once set 3122 */ 3123 if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) { 3124 PR0("Expected SID %u, received %u", vd->sid, 3125 msg->tag.vio_sid); 3126 return (EBADMSG); 3127 } 3128 3129 PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state)); 3130 3131 /* 3132 * Process the received message based on connection state 3133 */ 3134 switch (vd->state) { 3135 case VD_STATE_INIT: /* expect version message */ 3136 if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0) 3137 return (status); 3138 3139 /* Version negotiated, move to that state */ 3140 vd->state = VD_STATE_VER; 3141 return (0); 3142 3143 case VD_STATE_VER: /* expect attribute message */ 3144 if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0) 3145 return (status); 3146 
3147 /* Attributes exchanged, move to that state */ 3148 vd->state = VD_STATE_ATTR; 3149 return (0); 3150 3151 case VD_STATE_ATTR: 3152 switch (vd->xfer_mode) { 3153 case VIO_DESC_MODE: /* expect RDX message */ 3154 if ((status = process_rdx_msg(msg, msglen)) != 0) 3155 return (status); 3156 3157 /* Ready to receive in-band descriptors */ 3158 vd->state = VD_STATE_DATA; 3159 return (0); 3160 3161 case VIO_DRING_MODE: /* expect register-dring message */ 3162 if ((status = 3163 vd_process_dring_reg_msg(vd, msg, msglen)) != 0) 3164 return (status); 3165 3166 /* One dring negotiated, move to that state */ 3167 vd->state = VD_STATE_DRING; 3168 return (0); 3169 3170 default: 3171 ASSERT("Unsupported transfer mode"); 3172 PR0("Unsupported transfer mode"); 3173 return (ENOTSUP); 3174 } 3175 3176 case VD_STATE_DRING: /* expect RDX, register-dring, or unreg-dring */ 3177 if ((status = process_rdx_msg(msg, msglen)) == 0) { 3178 /* Ready to receive data */ 3179 vd->state = VD_STATE_DATA; 3180 return (0); 3181 } else if (status != ENOMSG) { 3182 return (status); 3183 } 3184 3185 3186 /* 3187 * If another register-dring message is received, stay in 3188 * dring state in case the client sends RDX; although the 3189 * protocol allows multiple drings, this server does not 3190 * support using more than one 3191 */ 3192 if ((status = 3193 vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG) 3194 return (status); 3195 3196 /* 3197 * Acknowledge an unregister-dring message, but reset the 3198 * connection anyway: Although the protocol allows 3199 * unregistering drings, this server cannot serve a vdisk 3200 * without its only dring 3201 */ 3202 status = vd_process_dring_unreg_msg(vd, msg, msglen); 3203 return ((status == 0) ? ENOTSUP : status); 3204 3205 case VD_STATE_DATA: 3206 switch (vd->xfer_mode) { 3207 case VIO_DESC_MODE: /* expect in-band-descriptor message */ 3208 return (vd_process_desc_msg(vd, msg, msglen)); 3209 3210 case VIO_DRING_MODE: /* expect dring-data or unreg-dring */ 3211 /* 3212 * Typically expect dring-data messages, so handle 3213 * them first 3214 */ 3215 if ((status = vd_process_dring_msg(vd, msg, 3216 msglen)) != ENOMSG) 3217 return (status); 3218 3219 /* 3220 * Acknowledge an unregister-dring message, but reset 3221 * the connection anyway: Although the protocol 3222 * allows unregistering drings, this server cannot 3223 * serve a vdisk without its only dring 3224 */ 3225 status = vd_process_dring_unreg_msg(vd, msg, msglen); 3226 return ((status == 0) ? 
ENOTSUP : status); 3227 3228 default: 3229 ASSERT("Unsupported transfer mode"); 3230 PR0("Unsupported transfer mode"); 3231 return (ENOTSUP); 3232 } 3233 3234 default: 3235 ASSERT("Invalid client connection state"); 3236 PR0("Invalid client connection state"); 3237 return (ENOTSUP); 3238 } 3239 } 3240 3241 static int 3242 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3243 { 3244 int status; 3245 boolean_t reset_ldc = B_FALSE; 3246 vd_task_t task; 3247 3248 /* 3249 * Check that the message is at least big enough for a "tag", so that 3250 * message processing can proceed based on tag-specified message type 3251 */ 3252 if (msglen < sizeof (vio_msg_tag_t)) { 3253 PR0("Received short (%lu-byte) message", msglen); 3254 /* Can't "nack" short message, so drop the big hammer */ 3255 PR0("initiating full reset"); 3256 vd_need_reset(vd, B_TRUE); 3257 return (EBADMSG); 3258 } 3259 3260 /* 3261 * Process the message 3262 */ 3263 switch (status = vd_do_process_msg(vd, msg, msglen)) { 3264 case 0: 3265 /* "ack" valid, successfully-processed messages */ 3266 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 3267 break; 3268 3269 case EINPROGRESS: 3270 /* The completion handler will "ack" or "nack" the message */ 3271 return (EINPROGRESS); 3272 case ENOMSG: 3273 PR0("Received unexpected message"); 3274 _NOTE(FALLTHROUGH); 3275 case EBADMSG: 3276 case ENOTSUP: 3277 /* "transport" error will cause NACK of invalid messages */ 3278 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 3279 break; 3280 3281 default: 3282 /* "transport" error will cause NACK of invalid messages */ 3283 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 3284 /* An LDC error probably occurred, so try resetting it */ 3285 reset_ldc = B_TRUE; 3286 break; 3287 } 3288 3289 PR1("\tResulting in state %d (%s)", vd->state, 3290 vd_decode_state(vd->state)); 3291 3292 /* populate the task so we can dispatch it on the taskq */ 3293 task.vd = vd; 3294 task.msg = msg; 3295 task.msglen = msglen; 3296 3297 /* 3298 * Queue a task to send the notification that the operation completed. 3299 * We need to ensure that requests are responded to in the correct 3300 * order and since the taskq is processed serially this ordering 3301 * is maintained. 3302 */ 3303 (void) ddi_taskq_dispatch(vd->completionq, vd_serial_notify, 3304 &task, DDI_SLEEP); 3305 3306 /* 3307 * To ensure handshake negotiations do not happen out of order, such 3308 * requests that come through this path should not be done in parallel 3309 * so we need to wait here until the response is sent to the client. 3310 */ 3311 ddi_taskq_wait(vd->completionq); 3312 3313 /* Arrange to reset the connection for nack'ed or failed messages */ 3314 if ((status != 0) || reset_ldc) { 3315 PR0("initiating %s reset", 3316 (reset_ldc) ? 
"full" : "soft"); 3317 vd_need_reset(vd, reset_ldc); 3318 } 3319 3320 return (status); 3321 } 3322 3323 static boolean_t 3324 vd_enabled(vd_t *vd) 3325 { 3326 boolean_t enabled; 3327 3328 mutex_enter(&vd->lock); 3329 enabled = vd->enabled; 3330 mutex_exit(&vd->lock); 3331 return (enabled); 3332 } 3333 3334 static void 3335 vd_recv_msg(void *arg) 3336 { 3337 vd_t *vd = (vd_t *)arg; 3338 int rv = 0, status = 0; 3339 3340 ASSERT(vd != NULL); 3341 3342 PR2("New task to receive incoming message(s)"); 3343 3344 3345 while (vd_enabled(vd) && status == 0) { 3346 size_t msglen, msgsize; 3347 ldc_status_t lstatus; 3348 3349 /* 3350 * Receive and process a message 3351 */ 3352 vd_reset_if_needed(vd); /* can change vd->max_msglen */ 3353 3354 /* 3355 * check if channel is UP - else break out of loop 3356 */ 3357 status = ldc_status(vd->ldc_handle, &lstatus); 3358 if (lstatus != LDC_UP) { 3359 PR0("channel not up (status=%d), exiting recv loop\n", 3360 lstatus); 3361 break; 3362 } 3363 3364 ASSERT(vd->max_msglen != 0); 3365 3366 msgsize = vd->max_msglen; /* stable copy for alloc/free */ 3367 msglen = msgsize; /* actual len after recv_msg() */ 3368 3369 status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen); 3370 switch (status) { 3371 case 0: 3372 rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp, 3373 msglen); 3374 /* check if max_msglen changed */ 3375 if (msgsize != vd->max_msglen) { 3376 PR0("max_msglen changed 0x%lx to 0x%lx bytes\n", 3377 msgsize, vd->max_msglen); 3378 kmem_free(vd->vio_msgp, msgsize); 3379 vd->vio_msgp = 3380 kmem_alloc(vd->max_msglen, KM_SLEEP); 3381 } 3382 if (rv == EINPROGRESS) 3383 continue; 3384 break; 3385 3386 case ENOMSG: 3387 break; 3388 3389 case ECONNRESET: 3390 PR0("initiating soft reset (ECONNRESET)\n"); 3391 vd_need_reset(vd, B_FALSE); 3392 status = 0; 3393 break; 3394 3395 default: 3396 /* Probably an LDC failure; arrange to reset it */ 3397 PR0("initiating full reset (status=0x%x)", status); 3398 vd_need_reset(vd, B_TRUE); 3399 break; 3400 } 3401 } 3402 3403 PR2("Task finished"); 3404 } 3405 3406 static uint_t 3407 vd_handle_ldc_events(uint64_t event, caddr_t arg) 3408 { 3409 vd_t *vd = (vd_t *)(void *)arg; 3410 int status; 3411 3412 ASSERT(vd != NULL); 3413 3414 if (!vd_enabled(vd)) 3415 return (LDC_SUCCESS); 3416 3417 if (event & LDC_EVT_DOWN) { 3418 PR0("LDC_EVT_DOWN: LDC channel went down"); 3419 3420 vd_need_reset(vd, B_TRUE); 3421 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 3422 DDI_SLEEP); 3423 if (status == DDI_FAILURE) { 3424 PR0("cannot schedule task to recv msg\n"); 3425 vd_need_reset(vd, B_TRUE); 3426 } 3427 } 3428 3429 if (event & LDC_EVT_RESET) { 3430 PR0("LDC_EVT_RESET: LDC channel was reset"); 3431 3432 if (vd->state != VD_STATE_INIT) { 3433 PR0("scheduling full reset"); 3434 vd_need_reset(vd, B_FALSE); 3435 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 3436 vd, DDI_SLEEP); 3437 if (status == DDI_FAILURE) { 3438 PR0("cannot schedule task to recv msg\n"); 3439 vd_need_reset(vd, B_TRUE); 3440 } 3441 3442 } else { 3443 PR0("channel already reset, ignoring...\n"); 3444 PR0("doing ldc up...\n"); 3445 (void) ldc_up(vd->ldc_handle); 3446 } 3447 3448 return (LDC_SUCCESS); 3449 } 3450 3451 if (event & LDC_EVT_UP) { 3452 PR0("EVT_UP: LDC is up\nResetting client connection state"); 3453 PR0("initiating soft reset"); 3454 vd_need_reset(vd, B_FALSE); 3455 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 3456 vd, DDI_SLEEP); 3457 if (status == DDI_FAILURE) { 3458 PR0("cannot schedule task to recv msg\n"); 3459 vd_need_reset(vd, 
			    B_TRUE);
			return (LDC_SUCCESS);
		}
	}

	if (event & LDC_EVT_READ) {
		int status;

		PR1("New data available");
		/* Queue a task to receive the new data */
		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
		    DDI_SLEEP);

		if (status == DDI_FAILURE) {
			PR0("cannot schedule task to recv msg\n");
			vd_need_reset(vd, B_TRUE);
		}
	}

	return (LDC_SUCCESS);
}

static uint_t
vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
{
	_NOTE(ARGUNUSED(key, val))
	(*((uint_t *)arg))++;
	return (MH_WALK_TERMINATE);
}


static int
vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	uint_t	vd_present = 0;
	minor_t	instance;
	vds_t	*vds;


	switch (cmd) {
	case DDI_DETACH:
		/* the real work happens below */
		break;
	case DDI_SUSPEND:
		PR0("No action required for DDI_SUSPEND");
		return (DDI_SUCCESS);
	default:
		PR0("Unrecognized \"cmd\"");
		return (DDI_FAILURE);
	}

	ASSERT(cmd == DDI_DETACH);
	instance = ddi_get_instance(dip);
	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
		PR0("Could not get state for instance %u", instance);
		ddi_soft_state_free(vds_state, instance);
		return (DDI_FAILURE);
	}

	/* Do not detach when serving any vdisks */
	mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present);
	if (vd_present) {
		PR0("Not detaching because serving vdisks");
		return (DDI_FAILURE);
	}

	PR0("Detaching");
	if (vds->initialized & VDS_MDEG) {
		(void) mdeg_unregister(vds->mdeg);
		kmem_free(vds->ispecp->specp, sizeof (vds_prop_template));
		kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t));
		vds->ispecp = NULL;
		vds->mdeg = NULL;
	}

	if (vds->initialized & VDS_LDI)
		(void) ldi_ident_release(vds->ldi_ident);
	mod_hash_destroy_hash(vds->vd_table);
	ddi_soft_state_free(vds_state, instance);
	return (DDI_SUCCESS);
}

static boolean_t
is_pseudo_device(dev_info_t *dip)
{
	dev_info_t	*parent, *root = ddi_root_node();


	for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root);
	    parent = ddi_get_parent(parent)) {
		if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}
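
/*
 * Editor's note (illustration): the walk above climbs the devinfo tree
 * toward the root looking for an ancestor nexus named DEVI_PSEUDO_NEXNAME.
 * For example, a volume exported by a pseudo driver under a path such as
 * /devices/pseudo/zfs@0 (hypothetical path, for illustration only) has a
 * "pseudo" ancestor and yields B_TRUE, whereas a physical disk slice under
 * a pci nexus, e.g. /devices/pci@0/scsi@1/disk@0,0:c, does not and yields
 * B_FALSE.
 */
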
static int
vd_setup_full_disk(vd_t *vd)
{
	int		rval, status;
	major_t		major = getmajor(vd->dev[0]);
	minor_t		minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE;
	struct dk_minfo	dk_minfo;

	ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK);

	/*
	 * At this point, vdisk_size is set to the size of partition 2 but
	 * this does not represent the size of the disk because partition 2
	 * may not cover the entire disk and its size does not include reserved
	 * blocks. So we update vdisk_size to be the size of the entire disk.
	 */
	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO,
	    (intptr_t)&dk_minfo, (vd->open_flags | FKIOCTL),
	    kcred, &rval)) != 0) {
		PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d",
		    status);
		return (status);
	}
	vd->vdisk_size = dk_minfo.dki_capacity;

	/* Move dev number and LDI handle to entire-disk-slice array elements */
	vd->dev[VD_ENTIRE_DISK_SLICE]		= vd->dev[0];
	vd->dev[0]				= 0;
	vd->ldi_handle[VD_ENTIRE_DISK_SLICE]	= vd->ldi_handle[0];
	vd->ldi_handle[0]			= NULL;

	/* Initialize device numbers for remaining slices and open them */
	for (int slice = 0; slice < vd->nslices; slice++) {
		/*
		 * Skip the entire-disk slice, as it's already open and its
		 * device known
		 */
		if (slice == VD_ENTIRE_DISK_SLICE)
			continue;
		ASSERT(vd->dev[slice] == 0);
		ASSERT(vd->ldi_handle[slice] == NULL);

		/*
		 * Construct the device number for the current slice
		 */
		vd->dev[slice] = makedevice(major, (minor + slice));

		/*
		 * Open all slices of the disk to serve them to the client.
		 * Slices are opened exclusively to prevent other threads or
		 * processes in the service domain from performing I/O to
		 * slices being accessed by a client. Failure to open a slice
		 * results in vds not serving this disk, as the client could
		 * attempt (and should be able) to access any slice immediately.
		 * Any slices successfully opened before a failure will get
		 * closed by vds_destroy_vd() as a result of the error returned
		 * by this function.
		 *
		 * If the initial open fails, it is retried with the FNDELAY
		 * flag so that opening an empty slice does not fail.
		 */
		PR0("Opening device major %u, minor %u = slice %u",
		    major, minor, slice);

		/*
		 * Try to open the device. This can fail for example if we are
		 * opening an empty slice. So in case of a failure, we try the
		 * open again but this time with the FNDELAY flag.
3624 */ 3625 status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 3626 vd->open_flags, kcred, &vd->ldi_handle[slice], 3627 vd->vds->ldi_ident); 3628 3629 if (status != 0) { 3630 status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 3631 vd->open_flags | FNDELAY, kcred, 3632 &vd->ldi_handle[slice], vd->vds->ldi_ident); 3633 } 3634 3635 if (status != 0) { 3636 PRN("ldi_open_by_dev() returned errno %d " 3637 "for slice %u", status, slice); 3638 /* vds_destroy_vd() will close any open slices */ 3639 vd->ldi_handle[slice] = NULL; 3640 return (status); 3641 } 3642 } 3643 3644 return (0); 3645 } 3646 3647 static int 3648 vd_setup_partition_vtoc(vd_t *vd) 3649 { 3650 int rval, status; 3651 char *device_path = vd->device_path; 3652 3653 status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM, 3654 (intptr_t)&vd->dk_geom, (vd->open_flags | FKIOCTL), kcred, &rval); 3655 3656 if (status != 0) { 3657 PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s", 3658 status, device_path); 3659 return (status); 3660 } 3661 3662 /* Initialize dk_geom structure for single-slice device */ 3663 if (vd->dk_geom.dkg_nsect == 0) { 3664 PRN("%s geometry claims 0 sectors per track", device_path); 3665 return (EIO); 3666 } 3667 if (vd->dk_geom.dkg_nhead == 0) { 3668 PRN("%s geometry claims 0 heads", device_path); 3669 return (EIO); 3670 } 3671 vd->dk_geom.dkg_ncyl = vd->vdisk_size / vd->dk_geom.dkg_nsect / 3672 vd->dk_geom.dkg_nhead; 3673 vd->dk_geom.dkg_acyl = 0; 3674 vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl; 3675 3676 3677 /* Initialize vtoc structure for single-slice device */ 3678 bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume, 3679 MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume))); 3680 bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part)); 3681 vd->vtoc.v_nparts = 1; 3682 vd->vtoc.v_part[0].p_tag = V_UNASSIGNED; 3683 vd->vtoc.v_part[0].p_flag = 0; 3684 vd->vtoc.v_part[0].p_start = 0; 3685 vd->vtoc.v_part[0].p_size = vd->vdisk_size; 3686 bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel, 3687 MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel))); 3688 3689 return (0); 3690 } 3691 3692 static int 3693 vd_setup_partition_efi(vd_t *vd) 3694 { 3695 efi_gpt_t *gpt; 3696 efi_gpe_t *gpe; 3697 struct uuid uuid = EFI_RESERVED; 3698 uint32_t crc; 3699 int length; 3700 3701 length = sizeof (efi_gpt_t) + sizeof (efi_gpe_t); 3702 3703 gpt = kmem_zalloc(length, KM_SLEEP); 3704 gpe = (efi_gpe_t *)(gpt + 1); 3705 3706 gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE); 3707 gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); 3708 gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t)); 3709 gpt->efi_gpt_FirstUsableLBA = LE_64(0ULL); 3710 gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1); 3711 gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1); 3712 gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t)); 3713 3714 UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid); 3715 gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA; 3716 gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA; 3717 3718 CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table); 3719 gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); 3720 3721 CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table); 3722 gpt->efi_gpt_HeaderCRC32 = LE_32(~crc); 3723 3724 vd->dk_efi.dki_lba = 0; 3725 vd->dk_efi.dki_length = length; 3726 vd->dk_efi.dki_data = gpt; 3727 3728 return (0); 3729 } 3730 3731 /* 3732 * Setup for a virtual disk whose backend is a file (exported as a single slice 3733 * or as a full disk) or a pseudo device (for example a ZFS, SVM or VxVM volume) 
3734 * exported as a full disk. In these cases, the backend is accessed using the 3735 * vnode interface. 3736 */ 3737 static int 3738 vd_setup_backend_vnode(vd_t *vd) 3739 { 3740 int rval, status; 3741 vattr_t vattr; 3742 dev_t dev; 3743 char *file_path = vd->device_path; 3744 char dev_path[MAXPATHLEN + 1]; 3745 ldi_handle_t lhandle; 3746 struct dk_cinfo dk_cinfo; 3747 3748 if ((status = vn_open(file_path, UIO_SYSSPACE, vd->open_flags | FOFFMAX, 3749 0, &vd->file_vnode, 0, 0)) != 0) { 3750 PRN("vn_open(%s) = errno %d", file_path, status); 3751 return (status); 3752 } 3753 3754 /* 3755 * We set vd->file now so that vds_destroy_vd will take care of 3756 * closing the file and releasing the vnode in case of an error. 3757 */ 3758 vd->file = B_TRUE; 3759 3760 vattr.va_mask = AT_SIZE; 3761 if ((status = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred)) != 0) { 3762 PRN("VOP_GETATTR(%s) = errno %d", file_path, status); 3763 return (EIO); 3764 } 3765 3766 vd->file_size = vattr.va_size; 3767 /* size should be at least sizeof(dk_label) */ 3768 if (vd->file_size < sizeof (struct dk_label)) { 3769 PRN("Size of file has to be at least %ld bytes", 3770 sizeof (struct dk_label)); 3771 return (EIO); 3772 } 3773 3774 if (vd->file_vnode->v_flag & VNOMAP) { 3775 PRN("File %s cannot be mapped", file_path); 3776 return (EIO); 3777 } 3778 3779 /* 3780 * Find and validate the geometry of a disk image. For a single slice 3781 * disk image, this will build a fake geometry and vtoc. 3782 */ 3783 status = vd_file_validate_geometry(vd); 3784 if (status != 0 && status != EINVAL) { 3785 PRN("Fail to read label from %s", file_path); 3786 return (EIO); 3787 } 3788 3789 /* sector size = block size = DEV_BSIZE */ 3790 vd->vdisk_size = vd->file_size / DEV_BSIZE; 3791 vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */ 3792 3793 /* 3794 * Get max_xfer_sz from the device where the file is or from the device 3795 * itself if we have a pseudo device. 3796 */ 3797 dev_path[0] = '\0'; 3798 3799 if (vd->pseudo) { 3800 status = ldi_open_by_name(file_path, FREAD, kcred, &lhandle, 3801 vd->vds->ldi_ident); 3802 } else { 3803 dev = vd->file_vnode->v_vfsp->vfs_dev; 3804 if (ddi_dev_pathname(dev, S_IFBLK, dev_path) == DDI_SUCCESS) { 3805 PR0("underlying device = %s\n", dev_path); 3806 } 3807 3808 status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD, kcred, &lhandle, 3809 vd->vds->ldi_ident); 3810 } 3811 3812 if (status != 0) { 3813 PR0("ldi_open() returned errno %d for device %s", 3814 status, (dev_path[0] == '\0')? 
		    file_path : dev_path);
	} else {
		if ((status = ldi_ioctl(lhandle, DKIOCINFO,
		    (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred,
		    &rval)) != 0) {
			PR0("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
			    status, dev_path);
		} else {
			/*
			 * Store the device's max transfer size for
			 * return to the client
			 */
			vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
		}

		PR0("close the device %s", dev_path);
		(void) ldi_close(lhandle, FREAD, kcred);
	}

	PR0("using file %s, dev %s, max_xfer = %u blks",
	    file_path, dev_path, vd->max_xfer_sz);

	/* Setup devid for the disk image */

	if (vd->vdisk_type == VD_DISK_TYPE_SLICE)
		return (0);

	if (vd->vdisk_label != VD_DISK_LABEL_UNK) {

		status = vd_file_read_devid(vd, &vd->file_devid);

		if (status == 0) {
			/* a valid devid was found */
			return (0);
		}

		if (status != EINVAL) {
			/*
			 * There was an error while trying to read the devid.
			 * So this disk image may have a devid but we are
			 * unable to read it.
			 */
			PR0("cannot read devid for %s", file_path);
			vd->file_devid = NULL;
			return (0);
		}
	}

	/*
	 * No valid device id was found so we create one. Note that a failure
	 * to create a device id is not fatal and does not prevent the disk
	 * image from being attached.
	 */
	PR1("creating devid for %s", file_path);

	if (ddi_devid_init(vd->vds->dip, DEVID_FAB, NULL, 0,
	    &vd->file_devid) != DDI_SUCCESS) {
		PR0("failed to create devid for %s", file_path);
		vd->file_devid = NULL;
		return (0);
	}

	/*
	 * Write devid to the disk image. The devid is stored into the disk
	 * image if we have a valid label; otherwise the devid will be stored
	 * when the user writes a valid label.
	 */
	if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
		if (vd_file_write_devid(vd, vd->file_devid) != 0) {
			PR0("failed to write devid for %s", file_path);
			ddi_devid_free(vd->file_devid);
			vd->file_devid = NULL;
		}
	}

	return (0);
}

/*
 * Setup for a virtual disk whose backend is a device (a physical disk,
 * slice or pseudo device) that is directly exported either as a full disk
 * for a physical disk or as a slice for a pseudo device or a disk slice.
 * In these cases, the backend is accessed using the LDI interface.
 */
static int
vd_setup_backend_ldi(vd_t *vd)
{
	int	rval, status;
	struct dk_cinfo	dk_cinfo;
	char	*device_path = vd->device_path;

	/*
	 * Try to open the device. This can fail for example if we are opening
	 * an empty slice. So in case of a failure, we try the open again but
	 * this time with the FNDELAY flag.

static int
vd_setup_single_slice_disk(vd_t *vd)
{
	int	status;
	char	*device_path = vd->device_path;

	/* Get size of backing device */
	if (ldi_get_size(vd->ldi_handle[0], &vd->vdisk_size) != DDI_SUCCESS) {
		PRN("ldi_get_size() failed for %s", device_path);
		return (EIO);
	}
	vd->vdisk_size = lbtodb(vd->vdisk_size);	/* convert to blocks */

	if (vd->pseudo) {

		ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);

		/*
		 * Currently we only support exporting pseudo devices which
		 * provide a valid disk label.
		 */
		if (vd->vdisk_label == VD_DISK_LABEL_UNK) {
			PRN("%s is a pseudo device with an invalid disk "
			    "label\n", device_path);
			return (EINVAL);
		}
		return (0);	/* ...and we're done */
	}

	/* We can only export a slice if the disk has a valid label */
	if (vd->vdisk_label == VD_DISK_LABEL_UNK) {
		PRN("%s is a slice from a disk with an unknown disk label\n",
		    device_path);
		return (EINVAL);
	}

	/*
	 * We export the slice as a single slice disk even if the "slice"
	 * option was not specified.
	 */
	vd->vdisk_type = VD_DISK_TYPE_SLICE;
	vd->nslices = 1;

	if (vd->vdisk_label == VD_DISK_LABEL_EFI) {
		/* Slice from a disk with an EFI label */
		status = vd_setup_partition_efi(vd);
	} else {
		/* Slice from a disk with a VTOC label */
		ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);
		status = vd_setup_partition_vtoc(vd);
	}

	return (status);
}

static int
vd_setup_vd(vd_t *vd)
{
	int		status;
	dev_info_t	*dip;
	vnode_t		*vnp;
	char		*path = vd->device_path;

	/* make sure the vdisk backend is valid */
	if ((status = lookupname(path, UIO_SYSSPACE,
	    FOLLOW, NULLVPP, &vnp)) != 0) {
		PR0("Cannot lookup %s errno %d", path, status);
		goto done;
	}

	switch (vnp->v_type) {
	case VREG:
		/*
		 * Backend is a file so it is exported as a full disk or as a
		 * single slice disk using the vnode interface.
		 */
		VN_RELE(vnp);
		vd->pseudo = B_FALSE;
		status = vd_setup_backend_vnode(vd);
		break;

	case VBLK:
	case VCHR:
		/*
		 * Backend is a device. The way it is exported depends on the
		 * type of the device.
		 *
		 * - A pseudo device is exported as a full disk using the
		 *   vnode interface or as a single slice disk using the LDI
		 *   interface.
		 *
		 * - A disk (represented by slice 2 of that disk) is exported
		 *   as a full disk using the LDI interface.
		 *
		 * - A disk slice (other than slice 2) is always exported as
		 *   a single slice disk using the LDI interface.
		 *
		 * - Slice 2 of a disk is exported as a single slice disk if
		 *   the "slice" option is specified, otherwise the entire
		 *   disk is exported. In either case, the LDI interface is
		 *   used.
		 */

		/* check if this is a pseudo device */
		if ((dip = ddi_hold_devi_by_instance(getmajor(vnp->v_rdev),
		    dev_to_instance(vnp->v_rdev), 0)) == NULL) {
			PRN("%s is no longer accessible", path);
			VN_RELE(vnp);
			status = EIO;
			break;
		}
		vd->pseudo = is_pseudo_device(dip);
		ddi_release_devi(dip);
		VN_RELE(vnp);

		/*
		 * If this is a pseudo device then how it is exported depends
		 * on whether the "slice" option is set. If the "slice"
		 * option is set then the pseudo device is exported as a
		 * single slice disk, otherwise it is exported as a full
		 * disk.
		 */
		if (vd->pseudo && vd->vdisk_type == VD_DISK_TYPE_DISK)
			status = vd_setup_backend_vnode(vd);
		else
			status = vd_setup_backend_ldi(vd);
		break;

	default:
		PRN("Unsupported vdisk backend %s", path);
		VN_RELE(vnp);
		status = EBADF;
	}

done:
	if (status != 0) {
		/*
		 * If the error is retryable, print an error message only
		 * on the first try.
		 */
		if (status == ENXIO || status == ENODEV ||
		    status == ENOENT || status == EROFS) {
			if (!(vd->initialized & VD_SETUP_ERROR)) {
				PRN("%s is currently inaccessible (error %d)",
				    path, status);
			}
			status = EAGAIN;
		} else {
			PRN("%s cannot be exported as a virtual disk "
			    "(error %d)", path, status);
		}
		vd->initialized |= VD_SETUP_ERROR;

	} else if (vd->initialized & VD_SETUP_ERROR) {
		/* print a message only if we previously had an error */
		PRN("%s is now online", path);
		vd->initialized &= ~VD_SETUP_ERROR;
	}

	return (status);
}
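
/*
 * Summary of the export rules implemented by vd_setup_vd() above:
 *
 *	backend			"slice" option	interface	exported as
 *	-------			--------------	---------	-----------
 *	file (VREG)		unset		vnode		full disk
 *	file (VREG)		set		vnode		single slice
 *	pseudo device		unset		vnode		full disk
 *	pseudo device		set		LDI		single slice
 *	disk (slice 2)		unset		LDI		full disk
 *	disk slice 2		set		LDI		single slice
 *	other disk slice	any		LDI		single slice
 */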

static int
vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t options,
    uint64_t ldc_id, vd_t **vdp)
{
	char			tq_name[TASKQ_NAMELEN];
	int			status;
	ddi_iblock_cookie_t	iblock = NULL;
	ldc_attr_t		ldc_attr;
	vd_t			*vd;


	ASSERT(vds != NULL);
	ASSERT(device_path != NULL);
	ASSERT(vdp != NULL);
	PR0("Adding vdisk for %s", device_path);

	if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) {
		PRN("No memory for virtual disk");
		return (EAGAIN);
	}
	*vdp = vd;	/* assign here so vds_destroy_vd() can cleanup later */
	vd->vds = vds;
	(void) strncpy(vd->device_path, device_path, MAXPATHLEN);

	/* Setup open flags */
	vd->open_flags = FREAD;

	if (!(options & VD_OPT_RDONLY))
		vd->open_flags |= FWRITE;

	if (options & VD_OPT_EXCLUSIVE)
		vd->open_flags |= FEXCL;

	/* Setup disk type */
	if (options & VD_OPT_SLICE) {
		vd->vdisk_type = VD_DISK_TYPE_SLICE;
		vd->nslices = 1;
	} else {
		vd->vdisk_type = VD_DISK_TYPE_DISK;
		vd->nslices = V_NUMPAR;
	}

	/* default disk label */
	vd->vdisk_label = VD_DISK_LABEL_UNK;

	/* Open vdisk and initialize parameters */
	if ((status = vd_setup_vd(vd)) == 0) {
		vd->initialized |= VD_DISK_READY;

		ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
		PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u",
		    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
		    (vd->pseudo ? "yes" : "no"), (vd->file ? "yes" : "no"),
		    vd->nslices);
	} else {
		if (status != EAGAIN)
			return (status);
	}

	/* Initialize locking */
	if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED,
	    &iblock) != DDI_SUCCESS) {
		PRN("Could not get iblock cookie.");
		return (EIO);
	}

	mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock);
	vd->initialized |= VD_LOCKING;


	/* Create start and completion task queues for the vdisk */
	(void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id);
	PR1("tq_name = %s", tq_name);
	if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1,
	    TASKQ_DEFAULTPRI, 0)) == NULL) {
		PRN("Could not create task queue");
		return (EIO);
	}
	(void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id);
	PR1("tq_name = %s", tq_name);
	if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1,
	    TASKQ_DEFAULTPRI, 0)) == NULL) {
		PRN("Could not create task queue");
		return (EIO);
	}
	vd->enabled = 1;	/* before callback can dispatch to startq */


	/* Bring up LDC */
	ldc_attr.devclass = LDC_DEV_BLK_SVC;
	ldc_attr.instance = ddi_get_instance(vds->dip);
	ldc_attr.mode = LDC_MODE_UNRELIABLE;
	ldc_attr.mtu = VD_LDC_MTU;
	if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) {
		PRN("Could not initialize LDC channel %lu, "
		    "init failed with error %d", ldc_id, status);
		return (status);
	}
	vd->initialized |= VD_LDC;

	if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events,
	    (caddr_t)vd)) != 0) {
		PRN("Could not initialize LDC channel %lu, "
		    "reg_callback failed with error %d", ldc_id, status);
		return (status);
	}

	if ((status = ldc_open(vd->ldc_handle)) != 0) {
		PRN("Could not initialize LDC channel %lu, "
		    "open failed with error %d", ldc_id, status);
		return (status);
	}

	if ((status = ldc_up(vd->ldc_handle)) != 0) {
		PR0("ldc_up() returned errno %d", status);
	}

	/* Allocate the inband task memory handle */
	status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl));
	if (status) {
		PRN("Could not initialize LDC channel %lu, "
		    "alloc_handle failed with error %d", ldc_id, status);
		return (ENXIO);
	}

	/* Add the successfully-initialized vdisk to the server's table */
	if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) {
		PRN("Error adding vdisk ID %lu to table", id);
		return (EIO);
	}

	/* Allocate the staging buffer */
	vd->max_msglen = sizeof (vio_msg_t);	/* baseline vio message size */
	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);

	/* store initial state */
	vd->state = VD_STATE_INIT;

	return (0);
}
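
/*
 * Error handling in vds_do_init_vd(): each setup step records its
 * progress in vd->initialized (VD_DISK_READY, VD_LOCKING, VD_LDC, ...)
 * so that, on failure, vds_destroy_vd() can unwind exactly what was set
 * up; see vds_init_vd() below for the call pattern. The LDC bring-up
 * order is ldc_init(), ldc_reg_callback(), ldc_open(), then ldc_up();
 * note that an ldc_up() failure alone is not fatal, as the channel can
 * still come up later (e.g. once the client end is ready).
 */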
"yes" : "no"), 4195 vd->nslices); 4196 } else { 4197 if (status != EAGAIN) 4198 return (status); 4199 } 4200 4201 /* Initialize locking */ 4202 if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED, 4203 &iblock) != DDI_SUCCESS) { 4204 PRN("Could not get iblock cookie."); 4205 return (EIO); 4206 } 4207 4208 mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock); 4209 vd->initialized |= VD_LOCKING; 4210 4211 4212 /* Create start and completion task queues for the vdisk */ 4213 (void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id); 4214 PR1("tq_name = %s", tq_name); 4215 if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1, 4216 TASKQ_DEFAULTPRI, 0)) == NULL) { 4217 PRN("Could not create task queue"); 4218 return (EIO); 4219 } 4220 (void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id); 4221 PR1("tq_name = %s", tq_name); 4222 if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1, 4223 TASKQ_DEFAULTPRI, 0)) == NULL) { 4224 PRN("Could not create task queue"); 4225 return (EIO); 4226 } 4227 vd->enabled = 1; /* before callback can dispatch to startq */ 4228 4229 4230 /* Bring up LDC */ 4231 ldc_attr.devclass = LDC_DEV_BLK_SVC; 4232 ldc_attr.instance = ddi_get_instance(vds->dip); 4233 ldc_attr.mode = LDC_MODE_UNRELIABLE; 4234 ldc_attr.mtu = VD_LDC_MTU; 4235 if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) { 4236 PRN("Could not initialize LDC channel %lu, " 4237 "init failed with error %d", ldc_id, status); 4238 return (status); 4239 } 4240 vd->initialized |= VD_LDC; 4241 4242 if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events, 4243 (caddr_t)vd)) != 0) { 4244 PRN("Could not initialize LDC channel %lu," 4245 "reg_callback failed with error %d", ldc_id, status); 4246 return (status); 4247 } 4248 4249 if ((status = ldc_open(vd->ldc_handle)) != 0) { 4250 PRN("Could not initialize LDC channel %lu," 4251 "open failed with error %d", ldc_id, status); 4252 return (status); 4253 } 4254 4255 if ((status = ldc_up(vd->ldc_handle)) != 0) { 4256 PR0("ldc_up() returned errno %d", status); 4257 } 4258 4259 /* Allocate the inband task memory handle */ 4260 status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl)); 4261 if (status) { 4262 PRN("Could not initialize LDC channel %lu," 4263 "alloc_handle failed with error %d", ldc_id, status); 4264 return (ENXIO); 4265 } 4266 4267 /* Add the successfully-initialized vdisk to the server's table */ 4268 if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) { 4269 PRN("Error adding vdisk ID %lu to table", id); 4270 return (EIO); 4271 } 4272 4273 /* Allocate the staging buffer */ 4274 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 4275 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 4276 4277 /* store initial state */ 4278 vd->state = VD_STATE_INIT; 4279 4280 return (0); 4281 } 4282 4283 static void 4284 vd_free_dring_task(vd_t *vdp) 4285 { 4286 if (vdp->dring_task != NULL) { 4287 ASSERT(vdp->dring_len != 0); 4288 /* Free all dring_task memory handles */ 4289 for (int i = 0; i < vdp->dring_len; i++) { 4290 (void) ldc_mem_free_handle(vdp->dring_task[i].mhdl); 4291 kmem_free(vdp->dring_task[i].msg, vdp->max_msglen); 4292 vdp->dring_task[i].msg = NULL; 4293 } 4294 kmem_free(vdp->dring_task, 4295 (sizeof (*vdp->dring_task)) * vdp->dring_len); 4296 vdp->dring_task = NULL; 4297 } 4298 } 4299 4300 /* 4301 * Destroy the state associated with a virtual disk 4302 */ 4303 static void 4304 vds_destroy_vd(void *arg) 4305 { 4306 vd_t *vd = (vd_t *)arg; 4307 int retry = 0, 

static int
vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t options,
    uint64_t ldc_id)
{
	int	status;
	vd_t	*vd = NULL;


	if ((status = vds_do_init_vd(vds, id, device_path, options,
	    ldc_id, &vd)) != 0)
		vds_destroy_vd(vd);

	return (status);
}

static int
vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel,
    uint64_t *ldc_id)
{
	int	num_channels;


	/* Look for channel endpoint child(ren) of the vdisk MD node */
	if ((num_channels = md_scan_dag(md, vd_node,
	    md_find_name(md, VD_CHANNEL_ENDPOINT),
	    md_find_name(md, "fwd"), channel)) <= 0) {
		PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT);
		return (-1);
	}

	/* Get the "id" value for the first channel endpoint node */
	if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) {
		PRN("No \"%s\" property found for \"%s\" of vdisk",
		    VD_ID_PROP, VD_CHANNEL_ENDPOINT);
		return (-1);
	}

	if (num_channels > 1) {
		PRN("Using ID of first of multiple channels for this vdisk");
	}

	return (0);
}

static int
vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id)
{
	int		num_nodes, status;
	size_t		size;
	mde_cookie_t	*channel;


	if ((num_nodes = md_node_count(md)) <= 0) {
		PRN("Invalid node count in Machine Description subtree");
		return (-1);
	}
	size = num_nodes * sizeof (*channel);
	channel = kmem_zalloc(size, KM_SLEEP);
	status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id);
	kmem_free(channel, size);

	return (status);
}

/*
 * Function:
 *	vds_get_options
 *
 * Description:
 *	Parse the options of a vds node. Options are defined as an array
 *	of strings in the vds-block-device-opts property of the vds node
 *	in the machine description. Options are returned as a bitmask. The
 *	mapping between the bitmask options and the options strings from
 *	the machine description is defined in the vd_bdev_options[] array.
 *
 *	The vds-block-device-opts property is optional. If a vds has no
 *	such property then no option is defined.
 *
 * Parameters:
 *	md		- machine description.
 *	vd_node		- vds node in the machine description for which
 *			  options have to be parsed.
 *	options		- the returned options.
 *
 * Return Code:
 *	none.
 */
static void
vds_get_options(md_t *md, mde_cookie_t vd_node, uint64_t *options)
{
	char	*optstr, *opt;
	int	len, n, i;

	*options = 0;

	if (md_get_prop_data(md, vd_node, VD_BLOCK_DEVICE_OPTS,
	    (uint8_t **)&optstr, &len) != 0) {
		PR0("No options found");
		return;
	}

	/* parse options */
	opt = optstr;
	n = sizeof (vd_bdev_options) / sizeof (vd_option_t);

	while (opt < optstr + len) {
		for (i = 0; i < n; i++) {
			if (strncmp(vd_bdev_options[i].vdo_name,
			    opt, VD_OPTION_NLEN) == 0) {
				*options |= vd_bdev_options[i].vdo_value;
				break;
			}
		}

		if (i < n) {
			PR0("option: %s", opt);
		} else {
			PRN("option %s is unknown or unsupported", opt);
		}

		opt += strlen(opt) + 1;
	}
}

static void
vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
{
	char		*device_path = NULL;
	uint64_t	id = 0, ldc_id = 0, options = 0;

	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
		PRN("Error getting vdisk \"%s\"", VD_ID_PROP);
		return;
	}
	PR0("Adding vdisk ID %lu", id);
	if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP,
	    &device_path) != 0) {
		PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
		return;
	}

	vds_get_options(md, vd_node, &options);

	if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) {
		PRN("Error getting LDC ID for vdisk %lu", id);
		return;
	}

	if (vds_init_vd(vds, id, device_path, options, ldc_id) != 0) {
		PRN("Failed to add vdisk ID %lu", id);
		return;
	}
}

static void
vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
{
	uint64_t	id = 0;


	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
		PRN("Unable to get \"%s\" property from vdisk's MD node",
		    VD_ID_PROP);
		return;
	}
	PR0("Removing vdisk ID %lu", id);
	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
		PRN("No vdisk entry found for vdisk ID %lu", id);
}
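
/*
 * For illustration of vds_get_options(): the vds-block-device-opts
 * property is a packed sequence of NUL-terminated strings (the "slice"
 * string is one of them; "ro" is assumed here for the example). The
 * options "ro" and "slice" would be encoded as the 9-byte array
 * "ro\0slice\0", and a stand-alone sketch of the walk is:
 *
 *	const char buf[] = "ro\0slice\0";
 *	const char *opt;
 *
 *	for (opt = buf; opt < buf + sizeof (buf) - 1;
 *	    opt += strlen(opt) + 1)
 *		(void) printf("option: %s\n", opt);
 *
 * which visits "ro" then "slice"; the real code ORs in the matching
 * vdo_value bit from vd_bdev_options[] instead of printing.
 */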

static void
vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node,
    md_t *curr_md, mde_cookie_t curr_vd_node)
{
	char		*curr_dev, *prev_dev;
	uint64_t	curr_id = 0, curr_ldc_id = 0, curr_options = 0;
	uint64_t	prev_id = 0, prev_ldc_id = 0, prev_options = 0;
	size_t		len;


	/* Validate that vdisk ID has not changed */
	if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP,
	    &prev_id) != 0) {
		PRN("Error getting previous vdisk \"%s\" property",
		    VD_ID_PROP);
		return;
	}
	if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP,
	    &curr_id) != 0) {
		PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP);
		return;
	}
	if (curr_id != prev_id) {
		PRN("Not changing vdisk: ID changed from %lu to %lu",
		    prev_id, curr_id);
		return;
	}

	/* Validate that LDC ID has not changed */
	if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) {
		PRN("Error getting LDC ID for vdisk %lu", prev_id);
		return;
	}

	if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) {
		PRN("Error getting LDC ID for vdisk %lu", curr_id);
		return;
	}
	if (curr_ldc_id != prev_ldc_id) {
		_NOTE(NOTREACHED);	/* lint is confused */
		PRN("Not changing vdisk: "
		    "LDC ID changed from %lu to %lu", prev_ldc_id,
		    curr_ldc_id);
		return;
	}

	/* Determine whether device path has changed */
	if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP,
	    &prev_dev) != 0) {
		PRN("Error getting previous vdisk \"%s\"",
		    VD_BLOCK_DEVICE_PROP);
		return;
	}
	if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP,
	    &curr_dev) != 0) {
		PRN("Error getting current vdisk \"%s\"",
		    VD_BLOCK_DEVICE_PROP);
		return;
	}
	if (((len = strlen(curr_dev)) == strlen(prev_dev)) &&
	    (strncmp(curr_dev, prev_dev, len) == 0))
		return;	/* no relevant (supported) change */

	/* Validate that options have not changed */
	vds_get_options(prev_md, prev_vd_node, &prev_options);
	vds_get_options(curr_md, curr_vd_node, &curr_options);
	if (prev_options != curr_options) {
		PRN("Not changing vdisk: options changed from %lx to %lx",
		    prev_options, curr_options);
		return;
	}

	PR0("Changing vdisk ID %lu", prev_id);

	/* Remove old state, which will close vdisk and reset */
	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0)
		PRN("No entry found for vdisk ID %lu", prev_id);

	/* Re-initialize vdisk with new state */
	if (vds_init_vd(vds, curr_id, curr_dev, curr_options,
	    curr_ldc_id) != 0) {
		PRN("Failed to change vdisk ID %lu", curr_id);
		return;
	}
}

static int
vds_process_md(void *arg, mdeg_result_t *md)
{
	int	i;
	vds_t	*vds = arg;


	if (md == NULL)
		return (MDEG_FAILURE);
	ASSERT(vds != NULL);

	for (i = 0; i < md->removed.nelem; i++)
		vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]);
	for (i = 0; i < md->match_curr.nelem; i++)
		vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i],
		    md->match_curr.mdp, md->match_curr.mdep[i]);
	for (i = 0; i < md->added.nelem; i++)
		vds_add_vd(vds, md->added.mdp, md->added.mdep[i]);

	return (MDEG_SUCCESS);
}
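
/*
 * Note on the ordering in vds_process_md(): removed nodes are handled
 * first, then matched (possibly changed) nodes, then added nodes, so a
 * vdisk ID that disappears and reappears within a single MD update is
 * fully torn down before being re-created. The only change that
 * vds_change_vd() applies is a new backing device path; a change of
 * ID, LDC ID or options is rejected and leaves the existing vdisk in
 * place.
 */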

static int
vds_do_attach(dev_info_t *dip)
{
	int			status, sz;
	int			cfg_handle;
	minor_t			instance = ddi_get_instance(dip);
	vds_t			*vds;
	mdeg_prop_spec_t	*pspecp;
	mdeg_node_spec_t	*ispecp;

	/*
	 * The "cfg-handle" property of a vds node in an MD contains the MD's
	 * notion of "instance", or unique identifier, for that node; OBP
	 * stores the value of the "cfg-handle" MD property as the value of
	 * the "reg" property on the node in the device tree it builds from
	 * the MD and passes to Solaris. Thus, we look up the devinfo node's
	 * "reg" property value to uniquely identify this device instance
	 * when registering with the MD event-generation framework. If the
	 * "reg" property cannot be found, the device tree state is
	 * presumably so broken that there is no point in continuing.
	 */
	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    VD_REG_PROP)) {
		PRN("vds \"%s\" property does not exist", VD_REG_PROP);
		return (DDI_FAILURE);
	}

	/* Get the MD instance for later MDEG registration */
	cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    VD_REG_PROP, -1);

	if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) {
		PRN("Could not allocate state for instance %u", instance);
		return (DDI_FAILURE);
	}

	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
		PRN("Could not get state for instance %u", instance);
		ddi_soft_state_free(vds_state, instance);
		return (DDI_FAILURE);
	}

	vds->dip = dip;
	vds->vd_table = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS,
	    vds_destroy_vd, sizeof (void *));

	ASSERT(vds->vd_table != NULL);

	if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
		PRN("ldi_ident_from_dip() returned errno %d", status);
		return (DDI_FAILURE);
	}
	vds->initialized |= VDS_LDI;

	/* Register for MD updates */
	sz = sizeof (vds_prop_template);
	pspecp = kmem_alloc(sz, KM_SLEEP);
	bcopy(vds_prop_template, pspecp, sz);

	VDS_SET_MDEG_PROP_INST(pspecp, cfg_handle);

	/* initialize the complete prop spec structure */
	ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
	ispecp->namep = "virtual-device";
	ispecp->specp = pspecp;

	if (mdeg_register(ispecp, &vd_match, vds_process_md, vds,
	    &vds->mdeg) != MDEG_SUCCESS) {
		PRN("Unable to register for MD updates");
		kmem_free(ispecp, sizeof (mdeg_node_spec_t));
		kmem_free(pspecp, sz);
		return (DDI_FAILURE);
	}

	vds->ispecp = ispecp;
	vds->initialized |= VDS_MDEG;

	/* Prevent auto-detaching so driver is available whenever MD changes */
	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
	    DDI_PROP_SUCCESS) {
		PRN("failed to set \"%s\" property for instance %u",
		    DDI_NO_AUTODETACH, instance);
	}

	ddi_report_dev(dip);
	return (DDI_SUCCESS);
}

static int
vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	status;

	switch (cmd) {
	case DDI_ATTACH:
		PR0("Attaching");
		if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
			(void) vds_detach(dip, DDI_DETACH);
		return (status);
	case DDI_RESUME:
		PR0("No action required for DDI_RESUME");
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}

static struct dev_ops vds_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	ddi_no_info,	/* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	vds_attach,	/* devo_attach */
	vds_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	NULL,		/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	nulldev		/* devo_power */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"virtual disk server",
	&vds_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};
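
/*
 * Loadable module entry points. For illustration of the bit-mask built
 * in _init() below (with hypothetical operation numbers): if
 * vds_operation[] listed operations 1, 2 and 8, the loop would compute
 *
 *	vds_operations = (1 << 0) | (1 << 1) | (1 << 7) = 0x83
 *
 * i.e. bit (operation - 1) is set for each server-supported operation.
 */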

int
_init(void)
{
	int	i, status;


	if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
		return (status);
	if ((status = mod_install(&modlinkage)) != 0) {
		ddi_soft_state_fini(&vds_state);
		return (status);
	}

	/* Fill in the bit-mask of server-supported operations */
	for (i = 0; i < vds_noperations; i++)
		vds_operations |= 1 << (vds_operation[i].operation - 1);

	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int	status;


	if ((status = mod_remove(&modlinkage)) != 0)
		return (status);
	ddi_soft_state_fini(&vds_state);
	return (0);
}