1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Virtual disk server 31 */ 32 33 34 #include <sys/types.h> 35 #include <sys/conf.h> 36 #include <sys/crc32.h> 37 #include <sys/ddi.h> 38 #include <sys/dkio.h> 39 #include <sys/file.h> 40 #include <sys/mdeg.h> 41 #include <sys/modhash.h> 42 #include <sys/note.h> 43 #include <sys/pathname.h> 44 #include <sys/sdt.h> 45 #include <sys/sunddi.h> 46 #include <sys/sunldi.h> 47 #include <sys/sysmacros.h> 48 #include <sys/vio_common.h> 49 #include <sys/vdsk_mailbox.h> 50 #include <sys/vdsk_common.h> 51 #include <sys/vtoc.h> 52 #include <sys/vfs.h> 53 #include <sys/stat.h> 54 #include <sys/scsi/impl/uscsi.h> 55 #include <vm/seg_map.h> 56 57 /* Virtual disk server initialization flags */ 58 #define VDS_LDI 0x01 59 #define VDS_MDEG 0x02 60 61 /* Virtual disk server tunable parameters */ 62 #define VDS_RETRIES 5 63 #define VDS_LDC_DELAY 1000 /* 1 msecs */ 64 #define VDS_DEV_DELAY 10000000 /* 10 secs */ 65 #define VDS_NCHAINS 32 66 67 /* Identification parameters for MD, synthetic dkio(7i) structures, etc. */ 68 #define VDS_NAME "virtual-disk-server" 69 70 #define VD_NAME "vd" 71 #define VD_VOLUME_NAME "vdisk" 72 #define VD_ASCIILABEL "Virtual Disk" 73 74 #define VD_CHANNEL_ENDPOINT "channel-endpoint" 75 #define VD_ID_PROP "id" 76 #define VD_BLOCK_DEVICE_PROP "vds-block-device" 77 #define VD_BLOCK_DEVICE_OPTS "vds-block-device-opts" 78 #define VD_REG_PROP "reg" 79 80 /* Virtual disk initialization flags */ 81 #define VD_DISK_READY 0x01 82 #define VD_LOCKING 0x02 83 #define VD_LDC 0x04 84 #define VD_DRING 0x08 85 #define VD_SID 0x10 86 #define VD_SEQ_NUM 0x20 87 #define VD_SETUP_ERROR 0x40 88 89 /* Flags for writing to a vdisk which is a file */ 90 #define VD_FILE_WRITE_FLAGS SM_ASYNC 91 92 /* Number of backup labels */ 93 #define VD_FILE_NUM_BACKUP 5 94 95 /* Timeout for SCSI I/O */ 96 #define VD_SCSI_RDWR_TIMEOUT 30 /* 30 secs */ 97 98 /* 99 * By Solaris convention, slice/partition 2 represents the entire disk; 100 * unfortunately, this convention does not appear to be codified. 101 */ 102 #define VD_ENTIRE_DISK_SLICE 2 103 104 /* Return a cpp token as a string */ 105 #define STRINGIZE(token) #token 106 107 /* 108 * Print a message prefixed with the current function name to the message log 109 * (and optionally to the console for verbose boots); these macros use cpp's 110 * concatenation of string literals and C99 variable-length-argument-list 111 * macros 112 */ 113 #define PRN(...) _PRN("?%s(): "__VA_ARGS__, "") 114 #define _PRN(format, ...) \ 115 cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__) 116 117 /* Return a pointer to the "i"th vdisk dring element */ 118 #define VD_DRING_ELEM(i) ((vd_dring_entry_t *)(void *) \ 119 (vd->dring + (i)*vd->descriptor_size)) 120 121 /* Return the virtual disk client's type as a string (for use in messages) */ 122 #define VD_CLIENT(vd) \ 123 (((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" : \ 124 (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" : \ 125 (((vd)->xfer_mode == 0) ? "null client" : \ 126 "unsupported client"))) 127 128 /* Read disk label from a disk on file */ 129 #define VD_FILE_LABEL_READ(vd, labelp) \ 130 vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)labelp, \ 131 0, sizeof (struct dk_label)) 132 133 /* Write disk label to a disk on file */ 134 #define VD_FILE_LABEL_WRITE(vd, labelp) \ 135 vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)labelp, \ 136 0, sizeof (struct dk_label)) 137 138 /* 139 * Specification of an MD node passed to the MDEG to filter any 140 * 'vport' nodes that do not belong to the specified node. This 141 * template is copied for each vds instance and filled in with 142 * the appropriate 'cfg-handle' value before being passed to the MDEG. 143 */ 144 static mdeg_prop_spec_t vds_prop_template[] = { 145 { MDET_PROP_STR, "name", VDS_NAME }, 146 { MDET_PROP_VAL, "cfg-handle", NULL }, 147 { MDET_LIST_END, NULL, NULL } 148 }; 149 150 #define VDS_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val); 151 152 /* 153 * Matching criteria passed to the MDEG to register interest 154 * in changes to 'virtual-device-port' nodes identified by their 155 * 'id' property. 156 */ 157 static md_prop_match_t vd_prop_match[] = { 158 { MDET_PROP_VAL, VD_ID_PROP }, 159 { MDET_LIST_END, NULL } 160 }; 161 162 static mdeg_node_match_t vd_match = {"virtual-device-port", 163 vd_prop_match}; 164 165 /* 166 * Options for the VD_BLOCK_DEVICE_OPTS property. 167 */ 168 #define VD_OPT_RDONLY 0x1 /* read-only */ 169 #define VD_OPT_SLICE 0x2 /* single slice */ 170 #define VD_OPT_EXCLUSIVE 0x4 /* exclusive access */ 171 172 #define VD_OPTION_NLEN 128 173 174 typedef struct vd_option { 175 char vdo_name[VD_OPTION_NLEN]; 176 uint64_t vdo_value; 177 } vd_option_t; 178 179 vd_option_t vd_bdev_options[] = { 180 { "ro", VD_OPT_RDONLY }, 181 { "slice", VD_OPT_SLICE }, 182 { "excl", VD_OPT_EXCLUSIVE } 183 }; 184 185 /* Debugging macros */ 186 #ifdef DEBUG 187 188 static int vd_msglevel = 0; 189 190 #define PR0 if (vd_msglevel > 0) PRN 191 #define PR1 if (vd_msglevel > 1) PRN 192 #define PR2 if (vd_msglevel > 2) PRN 193 194 #define VD_DUMP_DRING_ELEM(elem) \ 195 PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n", \ 196 elem->hdr.dstate, \ 197 elem->payload.operation, \ 198 elem->payload.status, \ 199 elem->payload.nbytes, \ 200 elem->payload.addr, \ 201 elem->payload.ncookies); 202 203 char * 204 vd_decode_state(int state) 205 { 206 char *str; 207 208 #define CASE_STATE(_s) case _s: str = #_s; break; 209 210 switch (state) { 211 CASE_STATE(VD_STATE_INIT) 212 CASE_STATE(VD_STATE_VER) 213 CASE_STATE(VD_STATE_ATTR) 214 CASE_STATE(VD_STATE_DRING) 215 CASE_STATE(VD_STATE_RDX) 216 CASE_STATE(VD_STATE_DATA) 217 default: str = "unknown"; break; 218 } 219 220 #undef CASE_STATE 221 222 return (str); 223 } 224 225 void 226 vd_decode_tag(vio_msg_t *msg) 227 { 228 char *tstr, *sstr, *estr; 229 230 #define CASE_TYPE(_s) case _s: tstr = #_s; break; 231 232 switch (msg->tag.vio_msgtype) { 233 CASE_TYPE(VIO_TYPE_CTRL) 234 CASE_TYPE(VIO_TYPE_DATA) 235 CASE_TYPE(VIO_TYPE_ERR) 236 default: tstr = "unknown"; break; 237 } 238 239 #undef CASE_TYPE 240 241 #define CASE_SUBTYPE(_s) case _s: sstr = #_s; break; 242 243 switch (msg->tag.vio_subtype) { 244 CASE_SUBTYPE(VIO_SUBTYPE_INFO) 245 CASE_SUBTYPE(VIO_SUBTYPE_ACK) 246 CASE_SUBTYPE(VIO_SUBTYPE_NACK) 247 default: sstr = "unknown"; break; 248 } 249 250 #undef CASE_SUBTYPE 251 252 #define CASE_ENV(_s) case _s: estr = #_s; break; 253 254 switch (msg->tag.vio_subtype_env) { 255 CASE_ENV(VIO_VER_INFO) 256 CASE_ENV(VIO_ATTR_INFO) 257 CASE_ENV(VIO_DRING_REG) 258 CASE_ENV(VIO_DRING_UNREG) 259 CASE_ENV(VIO_RDX) 260 CASE_ENV(VIO_PKT_DATA) 261 CASE_ENV(VIO_DESC_DATA) 262 CASE_ENV(VIO_DRING_DATA) 263 default: estr = "unknown"; break; 264 } 265 266 #undef CASE_ENV 267 268 PR1("(%x/%x/%x) message : (%s/%s/%s)", 269 msg->tag.vio_msgtype, msg->tag.vio_subtype, 270 msg->tag.vio_subtype_env, tstr, sstr, estr); 271 } 272 273 #else /* !DEBUG */ 274 275 #define PR0(...) 276 #define PR1(...) 277 #define PR2(...) 278 279 #define VD_DUMP_DRING_ELEM(elem) 280 281 #define vd_decode_state(_s) (NULL) 282 #define vd_decode_tag(_s) (NULL) 283 284 #endif /* DEBUG */ 285 286 287 /* 288 * Soft state structure for a vds instance 289 */ 290 typedef struct vds { 291 uint_t initialized; /* driver inst initialization flags */ 292 dev_info_t *dip; /* driver inst devinfo pointer */ 293 ldi_ident_t ldi_ident; /* driver's identifier for LDI */ 294 mod_hash_t *vd_table; /* table of virtual disks served */ 295 mdeg_node_spec_t *ispecp; /* mdeg node specification */ 296 mdeg_handle_t mdeg; /* handle for MDEG operations */ 297 } vds_t; 298 299 /* 300 * Types of descriptor-processing tasks 301 */ 302 typedef enum vd_task_type { 303 VD_NONFINAL_RANGE_TASK, /* task for intermediate descriptor in range */ 304 VD_FINAL_RANGE_TASK, /* task for last in a range of descriptors */ 305 } vd_task_type_t; 306 307 /* 308 * Structure describing the task for processing a descriptor 309 */ 310 typedef struct vd_task { 311 struct vd *vd; /* vd instance task is for */ 312 vd_task_type_t type; /* type of descriptor task */ 313 int index; /* dring elem index for task */ 314 vio_msg_t *msg; /* VIO message task is for */ 315 size_t msglen; /* length of message content */ 316 vd_dring_payload_t *request; /* request task will perform */ 317 struct buf buf; /* buf(9s) for I/O request */ 318 ldc_mem_handle_t mhdl; /* task memory handle */ 319 int status; /* status of processing task */ 320 int (*completef)(struct vd_task *task); /* completion func ptr */ 321 } vd_task_t; 322 323 /* 324 * Soft state structure for a virtual disk instance 325 */ 326 typedef struct vd { 327 uint_t initialized; /* vdisk initialization flags */ 328 vds_t *vds; /* server for this vdisk */ 329 ddi_taskq_t *startq; /* queue for I/O start tasks */ 330 ddi_taskq_t *completionq; /* queue for completion tasks */ 331 ldi_handle_t ldi_handle[V_NUMPAR]; /* LDI slice handles */ 332 char device_path[MAXPATHLEN + 1]; /* vdisk device */ 333 dev_t dev[V_NUMPAR]; /* dev numbers for slices */ 334 int open_flags; /* open flags */ 335 uint_t nslices; /* number of slices */ 336 size_t vdisk_size; /* number of blocks in vdisk */ 337 vd_disk_type_t vdisk_type; /* slice or entire disk */ 338 vd_disk_label_t vdisk_label; /* EFI or VTOC label */ 339 ushort_t max_xfer_sz; /* max xfer size in DEV_BSIZE */ 340 boolean_t pseudo; /* underlying pseudo dev */ 341 boolean_t file; /* underlying file */ 342 vnode_t *file_vnode; /* file vnode */ 343 size_t file_size; /* file size */ 344 ddi_devid_t file_devid; /* devid for disk image */ 345 struct dk_efi dk_efi; /* synthetic for slice type */ 346 struct dk_geom dk_geom; /* synthetic for slice type */ 347 struct vtoc vtoc; /* synthetic for slice type */ 348 ldc_status_t ldc_state; /* LDC connection state */ 349 ldc_handle_t ldc_handle; /* handle for LDC comm */ 350 size_t max_msglen; /* largest LDC message len */ 351 vd_state_t state; /* client handshake state */ 352 uint8_t xfer_mode; /* transfer mode with client */ 353 uint32_t sid; /* client's session ID */ 354 uint64_t seq_num; /* message sequence number */ 355 uint64_t dring_ident; /* identifier of dring */ 356 ldc_dring_handle_t dring_handle; /* handle for dring ops */ 357 uint32_t descriptor_size; /* num bytes in desc */ 358 uint32_t dring_len; /* number of dring elements */ 359 caddr_t dring; /* address of dring */ 360 caddr_t vio_msgp; /* vio msg staging buffer */ 361 vd_task_t inband_task; /* task for inband descriptor */ 362 vd_task_t *dring_task; /* tasks dring elements */ 363 364 kmutex_t lock; /* protects variables below */ 365 boolean_t enabled; /* is vdisk enabled? */ 366 boolean_t reset_state; /* reset connection state? */ 367 boolean_t reset_ldc; /* reset LDC channel? */ 368 } vd_t; 369 370 typedef struct vds_operation { 371 char *namep; 372 uint8_t operation; 373 int (*start)(vd_task_t *task); 374 int (*complete)(vd_task_t *task); 375 } vds_operation_t; 376 377 typedef struct vd_ioctl { 378 uint8_t operation; /* vdisk operation */ 379 const char *operation_name; /* vdisk operation name */ 380 size_t nbytes; /* size of operation buffer */ 381 int cmd; /* corresponding ioctl cmd */ 382 const char *cmd_name; /* ioctl cmd name */ 383 void *arg; /* ioctl cmd argument */ 384 /* convert input vd_buf to output ioctl_arg */ 385 void (*copyin)(void *vd_buf, void *ioctl_arg); 386 /* convert input ioctl_arg to output vd_buf */ 387 void (*copyout)(void *ioctl_arg, void *vd_buf); 388 /* write is true if the operation writes any data to the backend */ 389 boolean_t write; 390 } vd_ioctl_t; 391 392 /* Define trivial copyin/copyout conversion function flag */ 393 #define VD_IDENTITY ((void (*)(void *, void *))-1) 394 395 396 static int vds_ldc_retries = VDS_RETRIES; 397 static int vds_ldc_delay = VDS_LDC_DELAY; 398 static int vds_dev_retries = VDS_RETRIES; 399 static int vds_dev_delay = VDS_DEV_DELAY; 400 static void *vds_state; 401 static uint64_t vds_operations; /* see vds_operation[] definition below */ 402 403 static uint_t vd_file_write_flags = VD_FILE_WRITE_FLAGS; 404 405 static short vd_scsi_rdwr_timeout = VD_SCSI_RDWR_TIMEOUT; 406 407 /* 408 * Supported protocol version pairs, from highest (newest) to lowest (oldest) 409 * 410 * Each supported major version should appear only once, paired with (and only 411 * with) its highest supported minor version number (as the protocol requires 412 * supporting all lower minor version numbers as well) 413 */ 414 static const vio_ver_t vds_version[] = {{1, 0}}; 415 static const size_t vds_num_versions = 416 sizeof (vds_version)/sizeof (vds_version[0]); 417 418 static void vd_free_dring_task(vd_t *vdp); 419 static int vd_setup_vd(vd_t *vd); 420 static int vd_setup_single_slice_disk(vd_t *vd); 421 static boolean_t vd_enabled(vd_t *vd); 422 static ushort_t vd_lbl2cksum(struct dk_label *label); 423 static int vd_file_validate_geometry(vd_t *vd); 424 425 /* 426 * Function: 427 * vd_file_rw 428 * 429 * Description: 430 * Read or write to a disk on file. 431 * 432 * Parameters: 433 * vd - disk on which the operation is performed. 434 * slice - slice on which the operation is performed, 435 * VD_SLICE_NONE indicates that the operation 436 * is done using an absolute disk offset. 437 * operation - operation to execute: read (VD_OP_BREAD) or 438 * write (VD_OP_BWRITE). 439 * data - buffer where data are read to or written from. 440 * blk - starting block for the operation. 441 * len - number of bytes to read or write. 442 * 443 * Return Code: 444 * n >= 0 - success, n indicates the number of bytes read 445 * or written. 446 * -1 - error. 447 */ 448 static ssize_t 449 vd_file_rw(vd_t *vd, int slice, int operation, caddr_t data, size_t blk, 450 size_t len) 451 { 452 caddr_t maddr; 453 size_t offset, maxlen, moffset, mlen, n; 454 uint_t smflags; 455 enum seg_rw srw; 456 457 ASSERT(vd->file); 458 ASSERT(len > 0); 459 460 /* 461 * If a file is exported as a slice then we don't care about the vtoc. 462 * In that case, the vtoc is a fake mainly to make newfs happy and we 463 * handle any I/O as a raw disk access so that we can have access to the 464 * entire backend. 465 */ 466 if (vd->vdisk_type == VD_DISK_TYPE_SLICE || slice == VD_SLICE_NONE) { 467 /* raw disk access */ 468 offset = blk * DEV_BSIZE; 469 } else { 470 ASSERT(slice >= 0 && slice < V_NUMPAR); 471 472 if (vd->vdisk_label == VD_DISK_LABEL_UNK && 473 vd_file_validate_geometry(vd) != 0) { 474 PR0("Unknown disk label, can't do I/O from slice %d", 475 slice); 476 return (-1); 477 } 478 479 if (blk >= vd->vtoc.v_part[slice].p_size) { 480 /* address past the end of the slice */ 481 PR0("req_addr (0x%lx) > psize (0x%lx)", 482 blk, vd->vtoc.v_part[slice].p_size); 483 return (0); 484 } 485 486 offset = (vd->vtoc.v_part[slice].p_start + blk) * DEV_BSIZE; 487 488 /* 489 * If the requested size is greater than the size 490 * of the partition, truncate the read/write. 491 */ 492 maxlen = (vd->vtoc.v_part[slice].p_size - blk) * DEV_BSIZE; 493 494 if (len > maxlen) { 495 PR0("I/O size truncated to %lu bytes from %lu bytes", 496 maxlen, len); 497 len = maxlen; 498 } 499 } 500 501 /* 502 * We have to ensure that we are reading/writing into the mmap 503 * range. If we have a partial disk image (e.g. an image of 504 * s0 instead s2) the system can try to access slices that 505 * are not included into the disk image. 506 */ 507 if ((offset + len) >= vd->file_size) { 508 PR0("offset + nbytes (0x%lx + 0x%lx) >= " 509 "file_size (0x%lx)", offset, len, vd->file_size); 510 return (-1); 511 } 512 513 srw = (operation == VD_OP_BREAD)? S_READ : S_WRITE; 514 smflags = (operation == VD_OP_BREAD)? 0 : 515 (SM_WRITE | vd_file_write_flags); 516 n = len; 517 518 do { 519 /* 520 * segmap_getmapflt() returns a MAXBSIZE chunk which is 521 * MAXBSIZE aligned. 522 */ 523 moffset = offset & MAXBOFFSET; 524 mlen = MIN(MAXBSIZE - moffset, n); 525 maddr = segmap_getmapflt(segkmap, vd->file_vnode, offset, 526 mlen, 1, srw); 527 /* 528 * Fault in the pages so we can check for error and ensure 529 * that we can safely used the mapped address. 530 */ 531 if (segmap_fault(kas.a_hat, segkmap, maddr, mlen, 532 F_SOFTLOCK, srw) != 0) { 533 (void) segmap_release(segkmap, maddr, 0); 534 return (-1); 535 } 536 537 if (operation == VD_OP_BREAD) 538 bcopy(maddr + moffset, data, mlen); 539 else 540 bcopy(data, maddr + moffset, mlen); 541 542 if (segmap_fault(kas.a_hat, segkmap, maddr, mlen, 543 F_SOFTUNLOCK, srw) != 0) { 544 (void) segmap_release(segkmap, maddr, 0); 545 return (-1); 546 } 547 if (segmap_release(segkmap, maddr, smflags) != 0) 548 return (-1); 549 n -= mlen; 550 offset += mlen; 551 data += mlen; 552 553 } while (n > 0); 554 555 return (len); 556 } 557 558 /* 559 * Function: 560 * vd_file_build_default_label 561 * 562 * Description: 563 * Return a default label for the given disk. This is used when the disk 564 * does not have a valid VTOC so that the user can get a valid default 565 * configuration. The default label have all slices size set to 0 (except 566 * slice 2 which is the entire disk) to force the user to write a valid 567 * label onto the disk image. 568 * 569 * Parameters: 570 * vd - disk on which the operation is performed. 571 * label - the returned default label. 572 * 573 * Return Code: 574 * none. 575 */ 576 static void 577 vd_file_build_default_label(vd_t *vd, struct dk_label *label) 578 { 579 size_t size; 580 char prefix; 581 int slice, nparts; 582 uint16_t tag; 583 584 ASSERT(vd->file); 585 586 /* 587 * We must have a resonable number of cylinders and sectors so 588 * that newfs can run using default values. 589 * 590 * if (disk_size < 2MB) 591 * phys_cylinders = disk_size / 100K 592 * else 593 * phys_cylinders = disk_size / 300K 594 * 595 * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders 596 * alt_cylinders = (phys_cylinders > 2) ? 2 : 0; 597 * data_cylinders = phys_cylinders - alt_cylinders 598 * 599 * sectors = disk_size / (phys_cylinders * blk_size) 600 * 601 * The file size test is an attempt to not have too few cylinders 602 * for a small file, or so many on a big file that you waste space 603 * for backup superblocks or cylinder group structures. 604 */ 605 if (vd->file_size < (2 * 1024 * 1024)) 606 label->dkl_pcyl = vd->file_size / (100 * 1024); 607 else 608 label->dkl_pcyl = vd->file_size / (300 * 1024); 609 610 if (label->dkl_pcyl == 0) 611 label->dkl_pcyl = 1; 612 613 label->dkl_acyl = 0; 614 615 if (vd->vdisk_type == VD_DISK_TYPE_SLICE) { 616 nparts = 1; 617 slice = 0; 618 tag = V_UNASSIGNED; 619 } else { 620 if (label->dkl_pcyl > 2) 621 label->dkl_acyl = 2; 622 nparts = V_NUMPAR; 623 slice = VD_ENTIRE_DISK_SLICE; 624 tag = V_BACKUP; 625 } 626 627 label->dkl_nsect = vd->file_size / 628 (DEV_BSIZE * label->dkl_pcyl); 629 label->dkl_ncyl = label->dkl_pcyl - label->dkl_acyl; 630 label->dkl_nhead = 1; 631 label->dkl_write_reinstruct = 0; 632 label->dkl_read_reinstruct = 0; 633 label->dkl_rpm = 7200; 634 label->dkl_apc = 0; 635 label->dkl_intrlv = 0; 636 637 PR0("requested disk size: %ld bytes\n", vd->file_size); 638 PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label->dkl_pcyl, 639 label->dkl_nhead, label->dkl_nsect); 640 PR0("provided disk size: %ld bytes\n", (uint64_t) 641 (label->dkl_pcyl * label->dkl_nhead * 642 label->dkl_nsect * DEV_BSIZE)); 643 644 if (vd->file_size < (1ULL << 20)) { 645 size = vd->file_size >> 10; 646 prefix = 'K'; /* Kilobyte */ 647 } else if (vd->file_size < (1ULL << 30)) { 648 size = vd->file_size >> 20; 649 prefix = 'M'; /* Megabyte */ 650 } else if (vd->file_size < (1ULL << 40)) { 651 size = vd->file_size >> 30; 652 prefix = 'G'; /* Gigabyte */ 653 } else { 654 size = vd->file_size >> 40; 655 prefix = 'T'; /* Terabyte */ 656 } 657 658 /* 659 * We must have a correct label name otherwise format(1m) will 660 * not recognized the disk as labeled. 661 */ 662 (void) snprintf(label->dkl_asciilabel, LEN_DKL_ASCII, 663 "SUN-DiskImage-%ld%cB cyl %d alt %d hd %d sec %d", 664 size, prefix, 665 label->dkl_ncyl, label->dkl_acyl, label->dkl_nhead, 666 label->dkl_nsect); 667 668 /* default VTOC */ 669 label->dkl_vtoc.v_version = V_VERSION; 670 label->dkl_vtoc.v_nparts = nparts; 671 label->dkl_vtoc.v_sanity = VTOC_SANE; 672 label->dkl_vtoc.v_part[slice].p_tag = tag; 673 label->dkl_map[slice].dkl_cylno = 0; 674 label->dkl_map[slice].dkl_nblk = label->dkl_ncyl * 675 label->dkl_nhead * label->dkl_nsect; 676 label->dkl_cksum = vd_lbl2cksum(label); 677 } 678 679 /* 680 * Function: 681 * vd_file_set_vtoc 682 * 683 * Description: 684 * Set the vtoc of a disk image by writing the label and backup 685 * labels into the disk image backend. 686 * 687 * Parameters: 688 * vd - disk on which the operation is performed. 689 * label - the data to be written. 690 * 691 * Return Code: 692 * 0 - success. 693 * n > 0 - error, n indicates the errno code. 694 */ 695 static int 696 vd_file_set_vtoc(vd_t *vd, struct dk_label *label) 697 { 698 int blk, sec, cyl, head, cnt; 699 700 ASSERT(vd->file); 701 702 if (VD_FILE_LABEL_WRITE(vd, label) < 0) { 703 PR0("fail to write disk label"); 704 return (EIO); 705 } 706 707 /* 708 * Backup labels are on the last alternate cylinder's 709 * first five odd sectors. 710 */ 711 if (label->dkl_acyl == 0) { 712 PR0("no alternate cylinder, can not store backup labels"); 713 return (0); 714 } 715 716 cyl = label->dkl_ncyl + label->dkl_acyl - 1; 717 head = label->dkl_nhead - 1; 718 719 blk = (cyl * ((label->dkl_nhead * label->dkl_nsect) - label->dkl_apc)) + 720 (head * label->dkl_nsect); 721 722 /* 723 * Write the backup labels. Make sure we don't try to write past 724 * the last cylinder. 725 */ 726 sec = 1; 727 728 for (cnt = 0; cnt < VD_FILE_NUM_BACKUP; cnt++) { 729 730 if (sec >= label->dkl_nsect) { 731 PR0("not enough sector to store all backup labels"); 732 return (0); 733 } 734 735 if (vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)label, 736 blk + sec, sizeof (struct dk_label)) < 0) { 737 PR0("error writing backup label at block %d\n", 738 blk + sec); 739 return (EIO); 740 } 741 742 PR1("wrote backup label at block %d\n", blk + sec); 743 744 sec += 2; 745 } 746 747 return (0); 748 } 749 750 /* 751 * Function: 752 * vd_file_get_devid_block 753 * 754 * Description: 755 * Return the block number where the device id is stored. 756 * 757 * Parameters: 758 * vd - disk on which the operation is performed. 759 * blkp - pointer to the block number 760 * 761 * Return Code: 762 * 0 - success 763 * ENOSPC - disk has no space to store a device id 764 */ 765 static int 766 vd_file_get_devid_block(vd_t *vd, size_t *blkp) 767 { 768 diskaddr_t spc, head, cyl; 769 770 ASSERT(vd->file); 771 ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC); 772 773 /* this geometry doesn't allow us to have a devid */ 774 if (vd->dk_geom.dkg_acyl < 2) { 775 PR0("not enough alternate cylinder available for devid " 776 "(acyl=%u)", vd->dk_geom.dkg_acyl); 777 return (ENOSPC); 778 } 779 780 /* the devid is in on the track next to the last cylinder */ 781 cyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl - 2; 782 spc = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect; 783 head = vd->dk_geom.dkg_nhead - 1; 784 785 *blkp = (cyl * (spc - vd->dk_geom.dkg_apc)) + 786 (head * vd->dk_geom.dkg_nsect) + 1; 787 788 return (0); 789 } 790 791 /* 792 * Return the checksum of a disk block containing an on-disk devid. 793 */ 794 static uint_t 795 vd_dkdevid2cksum(struct dk_devid *dkdevid) 796 { 797 uint_t chksum, *ip; 798 int i; 799 800 chksum = 0; 801 ip = (uint_t *)dkdevid; 802 for (i = 0; i < ((DEV_BSIZE - sizeof (int)) / sizeof (int)); i++) 803 chksum ^= ip[i]; 804 805 return (chksum); 806 } 807 808 /* 809 * Function: 810 * vd_file_read_devid 811 * 812 * Description: 813 * Read the device id stored on a disk image. 814 * 815 * Parameters: 816 * vd - disk on which the operation is performed. 817 * devid - the return address of the device ID. 818 * 819 * Return Code: 820 * 0 - success 821 * EIO - I/O error while trying to access the disk image 822 * EINVAL - no valid device id was found 823 * ENOSPC - disk has no space to store a device id 824 */ 825 static int 826 vd_file_read_devid(vd_t *vd, ddi_devid_t *devid) 827 { 828 struct dk_devid *dkdevid; 829 size_t blk; 830 uint_t chksum; 831 int status, sz; 832 833 if ((status = vd_file_get_devid_block(vd, &blk)) != 0) 834 return (status); 835 836 dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP); 837 838 /* get the devid */ 839 if ((vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)dkdevid, blk, 840 DEV_BSIZE)) < 0) { 841 PR0("error reading devid block at %lu", blk); 842 status = EIO; 843 goto done; 844 } 845 846 /* validate the revision */ 847 if ((dkdevid->dkd_rev_hi != DK_DEVID_REV_MSB) || 848 (dkdevid->dkd_rev_lo != DK_DEVID_REV_LSB)) { 849 PR0("invalid devid found at block %lu (bad revision)", blk); 850 status = EINVAL; 851 goto done; 852 } 853 854 /* compute checksum */ 855 chksum = vd_dkdevid2cksum(dkdevid); 856 857 /* compare the checksums */ 858 if (DKD_GETCHKSUM(dkdevid) != chksum) { 859 PR0("invalid devid found at block %lu (bad checksum)", blk); 860 status = EINVAL; 861 goto done; 862 } 863 864 /* validate the device id */ 865 if (ddi_devid_valid((ddi_devid_t)&dkdevid->dkd_devid) != DDI_SUCCESS) { 866 PR0("invalid devid found at block %lu", blk); 867 status = EINVAL; 868 goto done; 869 } 870 871 PR1("devid read at block %lu", blk); 872 873 sz = ddi_devid_sizeof((ddi_devid_t)&dkdevid->dkd_devid); 874 *devid = kmem_alloc(sz, KM_SLEEP); 875 bcopy(&dkdevid->dkd_devid, *devid, sz); 876 877 done: 878 kmem_free(dkdevid, DEV_BSIZE); 879 return (status); 880 881 } 882 883 /* 884 * Function: 885 * vd_file_write_devid 886 * 887 * Description: 888 * Write a device id into disk image. 889 * 890 * Parameters: 891 * vd - disk on which the operation is performed. 892 * devid - the device ID to store. 893 * 894 * Return Code: 895 * 0 - success 896 * EIO - I/O error while trying to access the disk image 897 * ENOSPC - disk has no space to store a device id 898 */ 899 static int 900 vd_file_write_devid(vd_t *vd, ddi_devid_t devid) 901 { 902 struct dk_devid *dkdevid; 903 uint_t chksum; 904 size_t blk; 905 int status; 906 907 if ((status = vd_file_get_devid_block(vd, &blk)) != 0) 908 return (status); 909 910 dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP); 911 912 /* set revision */ 913 dkdevid->dkd_rev_hi = DK_DEVID_REV_MSB; 914 dkdevid->dkd_rev_lo = DK_DEVID_REV_LSB; 915 916 /* copy devid */ 917 bcopy(devid, &dkdevid->dkd_devid, ddi_devid_sizeof(devid)); 918 919 /* compute checksum */ 920 chksum = vd_dkdevid2cksum(dkdevid); 921 922 /* set checksum */ 923 DKD_FORMCHKSUM(chksum, dkdevid); 924 925 /* store the devid */ 926 if ((status = vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, 927 (caddr_t)dkdevid, blk, DEV_BSIZE)) < 0) { 928 PR0("Error writing devid block at %lu", blk); 929 status = EIO; 930 } else { 931 PR1("devid written at block %lu", blk); 932 status = 0; 933 } 934 935 kmem_free(dkdevid, DEV_BSIZE); 936 return (status); 937 } 938 939 /* 940 * Function: 941 * vd_scsi_rdwr 942 * 943 * Description: 944 * Read or write to a SCSI disk using an absolute disk offset. 945 * 946 * Parameters: 947 * vd - disk on which the operation is performed. 948 * operation - operation to execute: read (VD_OP_BREAD) or 949 * write (VD_OP_BWRITE). 950 * data - buffer where data are read to or written from. 951 * blk - starting block for the operation. 952 * len - number of bytes to read or write. 953 * 954 * Return Code: 955 * 0 - success 956 * n != 0 - error. 957 */ 958 static int 959 vd_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t blk, size_t len) 960 { 961 struct uscsi_cmd ucmd; 962 union scsi_cdb cdb; 963 int nsectors, nblk; 964 int max_sectors; 965 int status, rval; 966 967 ASSERT(!vd->file); 968 969 max_sectors = vd->max_xfer_sz; 970 nblk = (len / DEV_BSIZE); 971 972 if (len % DEV_BSIZE != 0) 973 return (EINVAL); 974 975 /* 976 * Build and execute the uscsi ioctl. We build a group0, group1 977 * or group4 command as necessary, since some targets 978 * do not support group1 commands. 979 */ 980 while (nblk) { 981 982 bzero(&ucmd, sizeof (ucmd)); 983 bzero(&cdb, sizeof (cdb)); 984 985 nsectors = (max_sectors < nblk) ? max_sectors : nblk; 986 987 if (blk < (2 << 20) && nsectors <= 0xff) { 988 FORMG0ADDR(&cdb, blk); 989 FORMG0COUNT(&cdb, nsectors); 990 ucmd.uscsi_cdblen = CDB_GROUP0; 991 } else if (blk > 0xffffffff) { 992 FORMG4LONGADDR(&cdb, blk); 993 FORMG4COUNT(&cdb, nsectors); 994 ucmd.uscsi_cdblen = CDB_GROUP4; 995 cdb.scc_cmd |= SCMD_GROUP4; 996 } else { 997 FORMG1ADDR(&cdb, blk); 998 FORMG1COUNT(&cdb, nsectors); 999 ucmd.uscsi_cdblen = CDB_GROUP1; 1000 cdb.scc_cmd |= SCMD_GROUP1; 1001 } 1002 1003 ucmd.uscsi_cdb = (caddr_t)&cdb; 1004 ucmd.uscsi_bufaddr = data; 1005 ucmd.uscsi_buflen = nsectors * DEV_BSIZE; 1006 ucmd.uscsi_timeout = vd_scsi_rdwr_timeout; 1007 /* 1008 * Set flags so that the command is isolated from normal 1009 * commands and no error message is printed. 1010 */ 1011 ucmd.uscsi_flags = USCSI_ISOLATE | USCSI_SILENT; 1012 1013 if (operation == VD_OP_BREAD) { 1014 cdb.scc_cmd |= SCMD_READ; 1015 ucmd.uscsi_flags |= USCSI_READ; 1016 } else { 1017 cdb.scc_cmd |= SCMD_WRITE; 1018 } 1019 1020 status = ldi_ioctl(vd->ldi_handle[VD_ENTIRE_DISK_SLICE], 1021 USCSICMD, (intptr_t)&ucmd, (vd->open_flags | FKIOCTL), 1022 kcred, &rval); 1023 1024 if (status == 0) 1025 status = ucmd.uscsi_status; 1026 1027 if (status != 0) 1028 break; 1029 1030 /* 1031 * Check if partial DMA breakup is required. If so, reduce 1032 * the request size by half and retry the last request. 1033 */ 1034 if (ucmd.uscsi_resid == ucmd.uscsi_buflen) { 1035 max_sectors >>= 1; 1036 if (max_sectors <= 0) { 1037 status = EIO; 1038 break; 1039 } 1040 continue; 1041 } 1042 1043 if (ucmd.uscsi_resid != 0) { 1044 status = EIO; 1045 break; 1046 } 1047 1048 blk += nsectors; 1049 nblk -= nsectors; 1050 data += nsectors * DEV_BSIZE; /* SECSIZE */ 1051 } 1052 1053 return (status); 1054 } 1055 1056 /* 1057 * Return Values 1058 * EINPROGRESS - operation was successfully started 1059 * EIO - encountered LDC (aka. task error) 1060 * 0 - operation completed successfully 1061 * 1062 * Side Effect 1063 * sets request->status = <disk operation status> 1064 */ 1065 static int 1066 vd_start_bio(vd_task_t *task) 1067 { 1068 int rv, status = 0; 1069 vd_t *vd = task->vd; 1070 vd_dring_payload_t *request = task->request; 1071 struct buf *buf = &task->buf; 1072 uint8_t mtype; 1073 int slice; 1074 char *bufaddr = 0; 1075 size_t buflen; 1076 1077 ASSERT(vd != NULL); 1078 ASSERT(request != NULL); 1079 1080 slice = request->slice; 1081 1082 ASSERT(slice == VD_SLICE_NONE || slice < vd->nslices); 1083 ASSERT((request->operation == VD_OP_BREAD) || 1084 (request->operation == VD_OP_BWRITE)); 1085 1086 if (request->nbytes == 0) { 1087 /* no service for trivial requests */ 1088 request->status = EINVAL; 1089 return (0); 1090 } 1091 1092 PR1("%s %lu bytes at block %lu", 1093 (request->operation == VD_OP_BREAD) ? "Read" : "Write", 1094 request->nbytes, request->addr); 1095 1096 /* 1097 * We have to check the open flags because the functions processing 1098 * the read/write request will not do it. 1099 */ 1100 if (request->operation == VD_OP_BWRITE && !(vd->open_flags & FWRITE)) { 1101 PR0("write fails because backend is opened read-only"); 1102 request->nbytes = 0; 1103 request->status = EROFS; 1104 return (0); 1105 } 1106 1107 mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP; 1108 1109 /* Map memory exported by client */ 1110 status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies, 1111 mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R, 1112 &bufaddr, NULL); 1113 if (status != 0) { 1114 PR0("ldc_mem_map() returned err %d ", status); 1115 return (EIO); 1116 } 1117 1118 buflen = request->nbytes; 1119 1120 status = ldc_mem_acquire(task->mhdl, 0, buflen); 1121 if (status != 0) { 1122 (void) ldc_mem_unmap(task->mhdl); 1123 PR0("ldc_mem_acquire() returned err %d ", status); 1124 return (EIO); 1125 } 1126 1127 /* Start the block I/O */ 1128 if (vd->file) { 1129 rv = vd_file_rw(vd, slice, request->operation, bufaddr, 1130 request->addr, request->nbytes); 1131 if (rv < 0) { 1132 request->nbytes = 0; 1133 request->status = EIO; 1134 } else { 1135 request->nbytes = rv; 1136 request->status = 0; 1137 } 1138 } else { 1139 if (slice == VD_SLICE_NONE) { 1140 /* 1141 * This is not a disk image so it is a real disk. We 1142 * assume that the underlying device driver supports 1143 * USCSICMD ioctls. This is the case of all SCSI devices 1144 * (sd, ssd...). 1145 * 1146 * In the future if we have non-SCSI disks we would need 1147 * to invoke the appropriate function to do I/O using an 1148 * absolute disk offset (for example using DKIOCTL_RWCMD 1149 * for IDE disks). 1150 */ 1151 rv = vd_scsi_rdwr(vd, request->operation, bufaddr, 1152 request->addr, request->nbytes); 1153 if (rv != 0) { 1154 request->nbytes = 0; 1155 request->status = EIO; 1156 } else { 1157 request->status = 0; 1158 } 1159 } else { 1160 bioinit(buf); 1161 buf->b_flags = B_BUSY; 1162 buf->b_bcount = request->nbytes; 1163 buf->b_lblkno = request->addr; 1164 buf->b_edev = vd->dev[slice]; 1165 buf->b_un.b_addr = bufaddr; 1166 buf->b_flags |= (request->operation == VD_OP_BREAD)? 1167 B_READ : B_WRITE; 1168 1169 request->status = 1170 ldi_strategy(vd->ldi_handle[slice], buf); 1171 1172 /* 1173 * This is to indicate to the caller that the request 1174 * needs to be finished by vd_complete_bio() by calling 1175 * biowait() there and waiting for that to return before 1176 * triggering the notification of the vDisk client. 1177 * 1178 * This is necessary when writing to real disks as 1179 * otherwise calls to ldi_strategy() would be serialized 1180 * behind the calls to biowait() and performance would 1181 * suffer. 1182 */ 1183 if (request->status == 0) 1184 return (EINPROGRESS); 1185 1186 biofini(buf); 1187 } 1188 } 1189 1190 /* Clean up after error */ 1191 rv = ldc_mem_release(task->mhdl, 0, buflen); 1192 if (rv) { 1193 PR0("ldc_mem_release() returned err %d ", rv); 1194 status = EIO; 1195 } 1196 rv = ldc_mem_unmap(task->mhdl); 1197 if (rv) { 1198 PR0("ldc_mem_unmap() returned err %d ", rv); 1199 status = EIO; 1200 } 1201 1202 return (status); 1203 } 1204 1205 /* 1206 * This function should only be called from vd_notify to ensure that requests 1207 * are responded to in the order that they are received. 1208 */ 1209 static int 1210 send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen) 1211 { 1212 int status; 1213 size_t nbytes; 1214 1215 do { 1216 nbytes = msglen; 1217 status = ldc_write(ldc_handle, msg, &nbytes); 1218 if (status != EWOULDBLOCK) 1219 break; 1220 drv_usecwait(vds_ldc_delay); 1221 } while (status == EWOULDBLOCK); 1222 1223 if (status != 0) { 1224 if (status != ECONNRESET) 1225 PR0("ldc_write() returned errno %d", status); 1226 return (status); 1227 } else if (nbytes != msglen) { 1228 PR0("ldc_write() performed only partial write"); 1229 return (EIO); 1230 } 1231 1232 PR1("SENT %lu bytes", msglen); 1233 return (0); 1234 } 1235 1236 static void 1237 vd_need_reset(vd_t *vd, boolean_t reset_ldc) 1238 { 1239 mutex_enter(&vd->lock); 1240 vd->reset_state = B_TRUE; 1241 vd->reset_ldc = reset_ldc; 1242 mutex_exit(&vd->lock); 1243 } 1244 1245 /* 1246 * Reset the state of the connection with a client, if needed; reset the LDC 1247 * transport as well, if needed. This function should only be called from the 1248 * "vd_recv_msg", as it waits for tasks - otherwise a deadlock can occur. 1249 */ 1250 static void 1251 vd_reset_if_needed(vd_t *vd) 1252 { 1253 int status = 0; 1254 1255 mutex_enter(&vd->lock); 1256 if (!vd->reset_state) { 1257 ASSERT(!vd->reset_ldc); 1258 mutex_exit(&vd->lock); 1259 return; 1260 } 1261 mutex_exit(&vd->lock); 1262 1263 PR0("Resetting connection state with %s", VD_CLIENT(vd)); 1264 1265 /* 1266 * Let any asynchronous I/O complete before possibly pulling the rug 1267 * out from under it; defer checking vd->reset_ldc, as one of the 1268 * asynchronous tasks might set it 1269 */ 1270 ddi_taskq_wait(vd->completionq); 1271 1272 if (vd->file) { 1273 status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred); 1274 if (status) { 1275 PR0("VOP_FSYNC returned errno %d", status); 1276 } 1277 } 1278 1279 if ((vd->initialized & VD_DRING) && 1280 ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)) 1281 PR0("ldc_mem_dring_unmap() returned errno %d", status); 1282 1283 vd_free_dring_task(vd); 1284 1285 /* Free the staging buffer for msgs */ 1286 if (vd->vio_msgp != NULL) { 1287 kmem_free(vd->vio_msgp, vd->max_msglen); 1288 vd->vio_msgp = NULL; 1289 } 1290 1291 /* Free the inband message buffer */ 1292 if (vd->inband_task.msg != NULL) { 1293 kmem_free(vd->inband_task.msg, vd->max_msglen); 1294 vd->inband_task.msg = NULL; 1295 } 1296 1297 mutex_enter(&vd->lock); 1298 1299 if (vd->reset_ldc) 1300 PR0("taking down LDC channel"); 1301 if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0)) 1302 PR0("ldc_down() returned errno %d", status); 1303 1304 vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING); 1305 vd->state = VD_STATE_INIT; 1306 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 1307 1308 /* Allocate the staging buffer */ 1309 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 1310 1311 PR0("calling ldc_up\n"); 1312 (void) ldc_up(vd->ldc_handle); 1313 1314 vd->reset_state = B_FALSE; 1315 vd->reset_ldc = B_FALSE; 1316 1317 mutex_exit(&vd->lock); 1318 } 1319 1320 static void vd_recv_msg(void *arg); 1321 1322 static void 1323 vd_mark_in_reset(vd_t *vd) 1324 { 1325 int status; 1326 1327 PR0("vd_mark_in_reset: marking vd in reset\n"); 1328 1329 vd_need_reset(vd, B_FALSE); 1330 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP); 1331 if (status == DDI_FAILURE) { 1332 PR0("cannot schedule task to recv msg\n"); 1333 vd_need_reset(vd, B_TRUE); 1334 return; 1335 } 1336 } 1337 1338 static int 1339 vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes) 1340 { 1341 boolean_t accepted; 1342 int status; 1343 vd_dring_entry_t *elem = VD_DRING_ELEM(idx); 1344 1345 if (vd->reset_state) 1346 return (0); 1347 1348 /* Acquire the element */ 1349 if (!vd->reset_state && 1350 (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { 1351 if (status == ECONNRESET) { 1352 vd_mark_in_reset(vd); 1353 return (0); 1354 } else { 1355 PR0("ldc_mem_dring_acquire() returned errno %d", 1356 status); 1357 return (status); 1358 } 1359 } 1360 1361 /* Set the element's status and mark it done */ 1362 accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED); 1363 if (accepted) { 1364 elem->payload.nbytes = elem_nbytes; 1365 elem->payload.status = elem_status; 1366 elem->hdr.dstate = VIO_DESC_DONE; 1367 } else { 1368 /* Perhaps client timed out waiting for I/O... */ 1369 PR0("element %u no longer \"accepted\"", idx); 1370 VD_DUMP_DRING_ELEM(elem); 1371 } 1372 /* Release the element */ 1373 if (!vd->reset_state && 1374 (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 1375 if (status == ECONNRESET) { 1376 vd_mark_in_reset(vd); 1377 return (0); 1378 } else { 1379 PR0("ldc_mem_dring_release() returned errno %d", 1380 status); 1381 return (status); 1382 } 1383 } 1384 1385 return (accepted ? 0 : EINVAL); 1386 } 1387 1388 /* 1389 * Return Values 1390 * 0 - operation completed successfully 1391 * EIO - encountered LDC / task error 1392 * 1393 * Side Effect 1394 * sets request->status = <disk operation status> 1395 */ 1396 static int 1397 vd_complete_bio(vd_task_t *task) 1398 { 1399 int status = 0; 1400 int rv = 0; 1401 vd_t *vd = task->vd; 1402 vd_dring_payload_t *request = task->request; 1403 struct buf *buf = &task->buf; 1404 1405 1406 ASSERT(vd != NULL); 1407 ASSERT(request != NULL); 1408 ASSERT(task->msg != NULL); 1409 ASSERT(task->msglen >= sizeof (*task->msg)); 1410 ASSERT(!vd->file); 1411 ASSERT(request->slice != VD_SLICE_NONE); 1412 1413 /* Wait for the I/O to complete [ call to ldi_strategy(9f) ] */ 1414 request->status = biowait(buf); 1415 1416 /* return back the number of bytes read/written */ 1417 request->nbytes = buf->b_bcount - buf->b_resid; 1418 1419 /* Release the buffer */ 1420 if (!vd->reset_state) 1421 status = ldc_mem_release(task->mhdl, 0, buf->b_bcount); 1422 if (status) { 1423 PR0("ldc_mem_release() returned errno %d copying to " 1424 "client", status); 1425 if (status == ECONNRESET) { 1426 vd_mark_in_reset(vd); 1427 } 1428 rv = EIO; 1429 } 1430 1431 /* Unmap the memory, even if in reset */ 1432 status = ldc_mem_unmap(task->mhdl); 1433 if (status) { 1434 PR0("ldc_mem_unmap() returned errno %d copying to client", 1435 status); 1436 if (status == ECONNRESET) { 1437 vd_mark_in_reset(vd); 1438 } 1439 rv = EIO; 1440 } 1441 1442 biofini(buf); 1443 1444 return (rv); 1445 } 1446 1447 /* 1448 * Description: 1449 * This function is called by the two functions called by a taskq 1450 * [ vd_complete_notify() and vd_serial_notify()) ] to send the 1451 * message to the client. 1452 * 1453 * Parameters: 1454 * arg - opaque pointer to structure containing task to be completed 1455 * 1456 * Return Values 1457 * None 1458 */ 1459 static void 1460 vd_notify(vd_task_t *task) 1461 { 1462 int status; 1463 1464 ASSERT(task != NULL); 1465 ASSERT(task->vd != NULL); 1466 1467 if (task->vd->reset_state) 1468 return; 1469 1470 /* 1471 * Send the "ack" or "nack" back to the client; if sending the message 1472 * via LDC fails, arrange to reset both the connection state and LDC 1473 * itself 1474 */ 1475 PR2("Sending %s", 1476 (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK"); 1477 1478 status = send_msg(task->vd->ldc_handle, task->msg, task->msglen); 1479 switch (status) { 1480 case 0: 1481 break; 1482 case ECONNRESET: 1483 vd_mark_in_reset(task->vd); 1484 break; 1485 default: 1486 PR0("initiating full reset"); 1487 vd_need_reset(task->vd, B_TRUE); 1488 break; 1489 } 1490 1491 DTRACE_PROBE1(task__end, vd_task_t *, task); 1492 } 1493 1494 /* 1495 * Description: 1496 * Mark the Dring entry as Done and (if necessary) send an ACK/NACK to 1497 * the vDisk client 1498 * 1499 * Parameters: 1500 * task - structure containing the request sent from client 1501 * 1502 * Return Values 1503 * None 1504 */ 1505 static void 1506 vd_complete_notify(vd_task_t *task) 1507 { 1508 int status = 0; 1509 vd_t *vd = task->vd; 1510 vd_dring_payload_t *request = task->request; 1511 1512 /* Update the dring element for a dring client */ 1513 if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE)) { 1514 status = vd_mark_elem_done(vd, task->index, 1515 request->status, request->nbytes); 1516 if (status == ECONNRESET) 1517 vd_mark_in_reset(vd); 1518 } 1519 1520 /* 1521 * If a transport error occurred while marking the element done or 1522 * previously while executing the task, arrange to "nack" the message 1523 * when the final task in the descriptor element range completes 1524 */ 1525 if ((status != 0) || (task->status != 0)) 1526 task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 1527 1528 /* 1529 * Only the final task for a range of elements will respond to and 1530 * free the message 1531 */ 1532 if (task->type == VD_NONFINAL_RANGE_TASK) { 1533 return; 1534 } 1535 1536 vd_notify(task); 1537 } 1538 1539 /* 1540 * Description: 1541 * This is the basic completion function called to handle inband data 1542 * requests and handshake messages. All it needs to do is trigger a 1543 * message to the client that the request is completed. 1544 * 1545 * Parameters: 1546 * arg - opaque pointer to structure containing task to be completed 1547 * 1548 * Return Values 1549 * None 1550 */ 1551 static void 1552 vd_serial_notify(void *arg) 1553 { 1554 vd_task_t *task = (vd_task_t *)arg; 1555 1556 ASSERT(task != NULL); 1557 vd_notify(task); 1558 } 1559 1560 static void 1561 vd_geom2dk_geom(void *vd_buf, void *ioctl_arg) 1562 { 1563 VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg); 1564 } 1565 1566 static void 1567 vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg) 1568 { 1569 VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg); 1570 } 1571 1572 static void 1573 dk_geom2vd_geom(void *ioctl_arg, void *vd_buf) 1574 { 1575 DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf); 1576 } 1577 1578 static void 1579 vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf) 1580 { 1581 VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf); 1582 } 1583 1584 static void 1585 vd_get_efi_in(void *vd_buf, void *ioctl_arg) 1586 { 1587 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 1588 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 1589 1590 dk_efi->dki_lba = vd_efi->lba; 1591 dk_efi->dki_length = vd_efi->length; 1592 dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP); 1593 } 1594 1595 static void 1596 vd_get_efi_out(void *ioctl_arg, void *vd_buf) 1597 { 1598 int len; 1599 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 1600 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 1601 1602 len = vd_efi->length; 1603 DK_EFI2VD_EFI(dk_efi, vd_efi); 1604 kmem_free(dk_efi->dki_data, len); 1605 } 1606 1607 static void 1608 vd_set_efi_in(void *vd_buf, void *ioctl_arg) 1609 { 1610 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 1611 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 1612 1613 dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP); 1614 VD_EFI2DK_EFI(vd_efi, dk_efi); 1615 } 1616 1617 static void 1618 vd_set_efi_out(void *ioctl_arg, void *vd_buf) 1619 { 1620 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 1621 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 1622 1623 kmem_free(dk_efi->dki_data, vd_efi->length); 1624 } 1625 1626 static vd_disk_label_t 1627 vd_read_vtoc(vd_t *vd, struct vtoc *vtoc) 1628 { 1629 int status, rval; 1630 struct dk_gpt *efi; 1631 size_t efi_len; 1632 1633 ASSERT(vd->ldi_handle[0] != NULL); 1634 1635 status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, (intptr_t)vtoc, 1636 (vd->open_flags | FKIOCTL), kcred, &rval); 1637 1638 if (status == 0) { 1639 return (VD_DISK_LABEL_VTOC); 1640 } else if (status != ENOTSUP) { 1641 PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status); 1642 return (VD_DISK_LABEL_UNK); 1643 } 1644 1645 status = vds_efi_alloc_and_read(vd->ldi_handle[0], &efi, &efi_len); 1646 1647 if (status) { 1648 PR0("vds_efi_alloc_and_read returned error %d", status); 1649 return (VD_DISK_LABEL_UNK); 1650 } 1651 1652 vd_efi_to_vtoc(efi, vtoc); 1653 vd_efi_free(efi, efi_len); 1654 1655 return (VD_DISK_LABEL_EFI); 1656 } 1657 1658 static ushort_t 1659 vd_lbl2cksum(struct dk_label *label) 1660 { 1661 int count; 1662 ushort_t sum, *sp; 1663 1664 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 1665 sp = (ushort_t *)label; 1666 sum = 0; 1667 while (count--) { 1668 sum ^= *sp++; 1669 } 1670 1671 return (sum); 1672 } 1673 1674 /* 1675 * Handle ioctls to a disk slice. 1676 * 1677 * Return Values 1678 * 0 - Indicates that there are no errors in disk operations 1679 * ENOTSUP - Unknown disk label type or unsupported DKIO ioctl 1680 * EINVAL - Not enough room to copy the EFI label 1681 * 1682 */ 1683 static int 1684 vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg) 1685 { 1686 dk_efi_t *dk_ioc; 1687 1688 switch (vd->vdisk_label) { 1689 1690 /* ioctls for a slice from a disk with a VTOC label */ 1691 case VD_DISK_LABEL_VTOC: 1692 1693 switch (cmd) { 1694 case DKIOCGGEOM: 1695 ASSERT(ioctl_arg != NULL); 1696 bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom)); 1697 return (0); 1698 case DKIOCGVTOC: 1699 ASSERT(ioctl_arg != NULL); 1700 bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc)); 1701 return (0); 1702 default: 1703 return (ENOTSUP); 1704 } 1705 1706 /* ioctls for a slice from a disk with an EFI label */ 1707 case VD_DISK_LABEL_EFI: 1708 1709 switch (cmd) { 1710 case DKIOCGETEFI: 1711 ASSERT(ioctl_arg != NULL); 1712 dk_ioc = (dk_efi_t *)ioctl_arg; 1713 if (dk_ioc->dki_length < vd->dk_efi.dki_length) 1714 return (EINVAL); 1715 bcopy(vd->dk_efi.dki_data, dk_ioc->dki_data, 1716 vd->dk_efi.dki_length); 1717 return (0); 1718 default: 1719 return (ENOTSUP); 1720 } 1721 1722 default: 1723 /* Unknown disk label type */ 1724 return (ENOTSUP); 1725 } 1726 } 1727 1728 /* 1729 * Function: 1730 * vd_file_validate_geometry 1731 * 1732 * Description: 1733 * Read the label and validate the geometry of a disk image. The driver 1734 * label, vtoc and geometry information are updated according to the 1735 * label read from the disk image. 1736 * 1737 * If no valid label is found, the label is set to unknown and the 1738 * function returns EINVAL, but a default vtoc and geometry are provided 1739 * to the driver. 1740 * 1741 * Parameters: 1742 * vd - disk on which the operation is performed. 1743 * 1744 * Return Code: 1745 * 0 - success. 1746 * EIO - error reading the label from the disk image. 1747 * EINVAL - unknown disk label. 1748 */ 1749 static int 1750 vd_file_validate_geometry(vd_t *vd) 1751 { 1752 struct dk_label label; 1753 struct dk_geom *geom = &vd->dk_geom; 1754 struct vtoc *vtoc = &vd->vtoc; 1755 int i; 1756 int status = 0; 1757 1758 ASSERT(vd->file); 1759 1760 if (vd->vdisk_type == VD_DISK_TYPE_SLICE) { 1761 /* 1762 * For single slice disk we always fake the geometry, and we 1763 * only need to do it once because the geometry will never 1764 * change. 1765 */ 1766 if (vd->vdisk_label == VD_DISK_LABEL_VTOC) 1767 /* geometry was already validated */ 1768 return (0); 1769 1770 ASSERT(vd->vdisk_label == VD_DISK_LABEL_UNK); 1771 vd_file_build_default_label(vd, &label); 1772 vd->vdisk_label = VD_DISK_LABEL_VTOC; 1773 } else { 1774 if (VD_FILE_LABEL_READ(vd, &label) < 0) 1775 return (EIO); 1776 1777 if (label.dkl_magic != DKL_MAGIC || 1778 label.dkl_cksum != vd_lbl2cksum(&label) || 1779 label.dkl_vtoc.v_sanity != VTOC_SANE || 1780 label.dkl_vtoc.v_nparts != V_NUMPAR) { 1781 vd->vdisk_label = VD_DISK_LABEL_UNK; 1782 vd_file_build_default_label(vd, &label); 1783 status = EINVAL; 1784 } else { 1785 vd->vdisk_label = VD_DISK_LABEL_VTOC; 1786 } 1787 } 1788 1789 /* Update the driver geometry */ 1790 bzero(geom, sizeof (struct dk_geom)); 1791 1792 geom->dkg_ncyl = label.dkl_ncyl; 1793 geom->dkg_acyl = label.dkl_acyl; 1794 geom->dkg_nhead = label.dkl_nhead; 1795 geom->dkg_nsect = label.dkl_nsect; 1796 geom->dkg_intrlv = label.dkl_intrlv; 1797 geom->dkg_apc = label.dkl_apc; 1798 geom->dkg_rpm = label.dkl_rpm; 1799 geom->dkg_pcyl = label.dkl_pcyl; 1800 geom->dkg_write_reinstruct = label.dkl_write_reinstruct; 1801 geom->dkg_read_reinstruct = label.dkl_read_reinstruct; 1802 1803 /* Update the driver vtoc */ 1804 bzero(vtoc, sizeof (struct vtoc)); 1805 1806 vtoc->v_sanity = label.dkl_vtoc.v_sanity; 1807 vtoc->v_version = label.dkl_vtoc.v_version; 1808 vtoc->v_sectorsz = DEV_BSIZE; 1809 vtoc->v_nparts = label.dkl_vtoc.v_nparts; 1810 1811 for (i = 0; i < vtoc->v_nparts; i++) { 1812 vtoc->v_part[i].p_tag = 1813 label.dkl_vtoc.v_part[i].p_tag; 1814 vtoc->v_part[i].p_flag = 1815 label.dkl_vtoc.v_part[i].p_flag; 1816 vtoc->v_part[i].p_start = 1817 label.dkl_map[i].dkl_cylno * 1818 (label.dkl_nhead * label.dkl_nsect); 1819 vtoc->v_part[i].p_size = label.dkl_map[i].dkl_nblk; 1820 vtoc->timestamp[i] = 1821 label.dkl_vtoc.v_timestamp[i]; 1822 } 1823 /* 1824 * The bootinfo array can not be copied with bcopy() because 1825 * elements are of type long in vtoc (so 64-bit) and of type 1826 * int in dk_vtoc (so 32-bit). 1827 */ 1828 vtoc->v_bootinfo[0] = label.dkl_vtoc.v_bootinfo[0]; 1829 vtoc->v_bootinfo[1] = label.dkl_vtoc.v_bootinfo[1]; 1830 vtoc->v_bootinfo[2] = label.dkl_vtoc.v_bootinfo[2]; 1831 bcopy(label.dkl_asciilabel, vtoc->v_asciilabel, 1832 LEN_DKL_ASCII); 1833 bcopy(label.dkl_vtoc.v_volume, vtoc->v_volume, 1834 LEN_DKL_VVOL); 1835 1836 return (status); 1837 } 1838 1839 /* 1840 * Handle ioctls to a disk image (file-based). 1841 * 1842 * Return Values 1843 * 0 - Indicates that there are no errors 1844 * != 0 - Disk operation returned an error 1845 */ 1846 static int 1847 vd_do_file_ioctl(vd_t *vd, int cmd, void *ioctl_arg) 1848 { 1849 struct dk_label label; 1850 struct dk_geom *geom; 1851 struct vtoc *vtoc; 1852 int i, rc; 1853 1854 ASSERT(vd->file); 1855 1856 switch (cmd) { 1857 1858 case DKIOCGGEOM: 1859 ASSERT(ioctl_arg != NULL); 1860 geom = (struct dk_geom *)ioctl_arg; 1861 1862 rc = vd_file_validate_geometry(vd); 1863 if (rc != 0 && rc != EINVAL) { 1864 ASSERT(vd->vdisk_type != VD_DISK_TYPE_SLICE); 1865 return (rc); 1866 } 1867 1868 bcopy(&vd->dk_geom, geom, sizeof (struct dk_geom)); 1869 return (0); 1870 1871 case DKIOCGVTOC: 1872 ASSERT(ioctl_arg != NULL); 1873 vtoc = (struct vtoc *)ioctl_arg; 1874 1875 rc = vd_file_validate_geometry(vd); 1876 if (rc != 0 && rc != EINVAL) { 1877 ASSERT(vd->vdisk_type != VD_DISK_TYPE_SLICE); 1878 return (rc); 1879 } 1880 1881 bcopy(&vd->vtoc, vtoc, sizeof (struct vtoc)); 1882 return (0); 1883 1884 case DKIOCSGEOM: 1885 ASSERT(ioctl_arg != NULL); 1886 geom = (struct dk_geom *)ioctl_arg; 1887 1888 /* geometry can only be changed for full disk */ 1889 if (vd->vdisk_type != VD_DISK_TYPE_DISK) 1890 return (ENOTSUP); 1891 1892 if (geom->dkg_nhead == 0 || geom->dkg_nsect == 0) 1893 return (EINVAL); 1894 1895 /* 1896 * The current device geometry is not updated, just the driver 1897 * "notion" of it. The device geometry will be effectively 1898 * updated when a label is written to the device during a next 1899 * DKIOCSVTOC. 1900 */ 1901 bcopy(ioctl_arg, &vd->dk_geom, sizeof (vd->dk_geom)); 1902 return (0); 1903 1904 case DKIOCSVTOC: 1905 ASSERT(ioctl_arg != NULL); 1906 ASSERT(vd->dk_geom.dkg_nhead != 0 && 1907 vd->dk_geom.dkg_nsect != 0); 1908 vtoc = (struct vtoc *)ioctl_arg; 1909 1910 /* vtoc can only be changed for full disk */ 1911 if (vd->vdisk_type != VD_DISK_TYPE_DISK) 1912 return (ENOTSUP); 1913 1914 if (vtoc->v_sanity != VTOC_SANE || 1915 vtoc->v_sectorsz != DEV_BSIZE || 1916 vtoc->v_nparts != V_NUMPAR) 1917 return (EINVAL); 1918 1919 bzero(&label, sizeof (label)); 1920 label.dkl_ncyl = vd->dk_geom.dkg_ncyl; 1921 label.dkl_acyl = vd->dk_geom.dkg_acyl; 1922 label.dkl_pcyl = vd->dk_geom.dkg_pcyl; 1923 label.dkl_nhead = vd->dk_geom.dkg_nhead; 1924 label.dkl_nsect = vd->dk_geom.dkg_nsect; 1925 label.dkl_intrlv = vd->dk_geom.dkg_intrlv; 1926 label.dkl_apc = vd->dk_geom.dkg_apc; 1927 label.dkl_rpm = vd->dk_geom.dkg_rpm; 1928 label.dkl_write_reinstruct = vd->dk_geom.dkg_write_reinstruct; 1929 label.dkl_read_reinstruct = vd->dk_geom.dkg_read_reinstruct; 1930 1931 label.dkl_vtoc.v_nparts = V_NUMPAR; 1932 label.dkl_vtoc.v_sanity = VTOC_SANE; 1933 label.dkl_vtoc.v_version = vtoc->v_version; 1934 for (i = 0; i < V_NUMPAR; i++) { 1935 label.dkl_vtoc.v_timestamp[i] = 1936 vtoc->timestamp[i]; 1937 label.dkl_vtoc.v_part[i].p_tag = 1938 vtoc->v_part[i].p_tag; 1939 label.dkl_vtoc.v_part[i].p_flag = 1940 vtoc->v_part[i].p_flag; 1941 label.dkl_map[i].dkl_cylno = 1942 vtoc->v_part[i].p_start / 1943 (label.dkl_nhead * label.dkl_nsect); 1944 label.dkl_map[i].dkl_nblk = 1945 vtoc->v_part[i].p_size; 1946 } 1947 /* 1948 * The bootinfo array can not be copied with bcopy() because 1949 * elements are of type long in vtoc (so 64-bit) and of type 1950 * int in dk_vtoc (so 32-bit). 1951 */ 1952 label.dkl_vtoc.v_bootinfo[0] = vtoc->v_bootinfo[0]; 1953 label.dkl_vtoc.v_bootinfo[1] = vtoc->v_bootinfo[1]; 1954 label.dkl_vtoc.v_bootinfo[2] = vtoc->v_bootinfo[2]; 1955 bcopy(vtoc->v_asciilabel, label.dkl_asciilabel, 1956 LEN_DKL_ASCII); 1957 bcopy(vtoc->v_volume, label.dkl_vtoc.v_volume, 1958 LEN_DKL_VVOL); 1959 1960 /* re-compute checksum */ 1961 label.dkl_magic = DKL_MAGIC; 1962 label.dkl_cksum = vd_lbl2cksum(&label); 1963 1964 /* write label to the disk image */ 1965 if ((rc = vd_file_set_vtoc(vd, &label)) != 0) 1966 return (rc); 1967 1968 /* check the geometry and update the driver info */ 1969 if ((rc = vd_file_validate_geometry(vd)) != 0) 1970 return (rc); 1971 1972 /* 1973 * The disk geometry may have changed, so we need to write 1974 * the devid (if there is one) so that it is stored at the 1975 * right location. 1976 */ 1977 if (vd->file_devid != NULL && 1978 vd_file_write_devid(vd, vd->file_devid) != 0) { 1979 PR0("Fail to write devid"); 1980 } 1981 1982 return (0); 1983 1984 default: 1985 return (ENOTSUP); 1986 } 1987 } 1988 1989 /* 1990 * Description: 1991 * This is the function that processes the ioctl requests (farming it 1992 * out to functions that handle slices, files or whole disks) 1993 * 1994 * Return Values 1995 * 0 - ioctl operation completed successfully 1996 * != 0 - The LDC error value encountered 1997 * (propagated back up the call stack as a task error) 1998 * 1999 * Side Effect 2000 * sets request->status to the return value of the ioctl function. 2001 */ 2002 static int 2003 vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl) 2004 { 2005 int rval = 0, status = 0; 2006 size_t nbytes = request->nbytes; /* modifiable copy */ 2007 2008 2009 ASSERT(request->slice < vd->nslices); 2010 PR0("Performing %s", ioctl->operation_name); 2011 2012 /* Get data from client and convert, if necessary */ 2013 if (ioctl->copyin != NULL) { 2014 ASSERT(nbytes != 0 && buf != NULL); 2015 PR1("Getting \"arg\" data from client"); 2016 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 2017 request->cookie, request->ncookies, 2018 LDC_COPY_IN)) != 0) { 2019 PR0("ldc_mem_copy() returned errno %d " 2020 "copying from client", status); 2021 return (status); 2022 } 2023 2024 /* Convert client's data, if necessary */ 2025 if (ioctl->copyin == VD_IDENTITY) /* use client buffer */ 2026 ioctl->arg = buf; 2027 else /* convert client vdisk operation data to ioctl data */ 2028 (ioctl->copyin)(buf, (void *)ioctl->arg); 2029 } 2030 2031 /* 2032 * Handle single-slice block devices internally; otherwise, have the 2033 * real driver perform the ioctl() 2034 */ 2035 if (vd->file) { 2036 request->status = 2037 vd_do_file_ioctl(vd, ioctl->cmd, (void *)ioctl->arg); 2038 2039 } else if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) { 2040 request->status = 2041 vd_do_slice_ioctl(vd, ioctl->cmd, (void *)ioctl->arg); 2042 2043 } else { 2044 request->status = ldi_ioctl(vd->ldi_handle[request->slice], 2045 ioctl->cmd, (intptr_t)ioctl->arg, vd->open_flags | FKIOCTL, 2046 kcred, &rval); 2047 2048 #ifdef DEBUG 2049 if (rval != 0) { 2050 PR0("%s set rval = %d, which is not being returned to" 2051 " client", ioctl->cmd_name, rval); 2052 } 2053 #endif /* DEBUG */ 2054 } 2055 2056 if (request->status != 0) { 2057 PR0("ioctl(%s) = errno %d", ioctl->cmd_name, request->status); 2058 return (0); 2059 } 2060 2061 /* Convert data and send to client, if necessary */ 2062 if (ioctl->copyout != NULL) { 2063 ASSERT(nbytes != 0 && buf != NULL); 2064 PR1("Sending \"arg\" data to client"); 2065 2066 /* Convert ioctl data to vdisk operation data, if necessary */ 2067 if (ioctl->copyout != VD_IDENTITY) 2068 (ioctl->copyout)((void *)ioctl->arg, buf); 2069 2070 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 2071 request->cookie, request->ncookies, 2072 LDC_COPY_OUT)) != 0) { 2073 PR0("ldc_mem_copy() returned errno %d " 2074 "copying to client", status); 2075 return (status); 2076 } 2077 } 2078 2079 return (status); 2080 } 2081 2082 #define RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t)) 2083 2084 /* 2085 * Description: 2086 * This generic function is called by the task queue to complete 2087 * the processing of the tasks. The specific completion function 2088 * is passed in as a field in the task pointer. 2089 * 2090 * Parameters: 2091 * arg - opaque pointer to structure containing task to be completed 2092 * 2093 * Return Values 2094 * None 2095 */ 2096 static void 2097 vd_complete(void *arg) 2098 { 2099 vd_task_t *task = (vd_task_t *)arg; 2100 2101 ASSERT(task != NULL); 2102 ASSERT(task->status == EINPROGRESS); 2103 ASSERT(task->completef != NULL); 2104 2105 task->status = task->completef(task); 2106 if (task->status) 2107 PR0("%s: Error %d completing task", __func__, task->status); 2108 2109 /* Now notify the vDisk client */ 2110 vd_complete_notify(task); 2111 } 2112 2113 static int 2114 vd_ioctl(vd_task_t *task) 2115 { 2116 int i, status; 2117 void *buf = NULL; 2118 struct dk_geom dk_geom = {0}; 2119 struct vtoc vtoc = {0}; 2120 struct dk_efi dk_efi = {0}; 2121 vd_t *vd = task->vd; 2122 vd_dring_payload_t *request = task->request; 2123 vd_ioctl_t ioctl[] = { 2124 /* Command (no-copy) operations */ 2125 {VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0, 2126 DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE), 2127 NULL, NULL, NULL, B_TRUE}, 2128 2129 /* "Get" (copy-out) operations */ 2130 {VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int), 2131 DKIOCGETWCE, STRINGIZE(DKIOCGETWCE), 2132 NULL, VD_IDENTITY, VD_IDENTITY, B_FALSE}, 2133 {VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM), 2134 RNDSIZE(vd_geom_t), 2135 DKIOCGGEOM, STRINGIZE(DKIOCGGEOM), 2136 &dk_geom, NULL, dk_geom2vd_geom, B_FALSE}, 2137 {VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t), 2138 DKIOCGVTOC, STRINGIZE(DKIOCGVTOC), 2139 &vtoc, NULL, vtoc2vd_vtoc, B_FALSE}, 2140 {VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t), 2141 DKIOCGETEFI, STRINGIZE(DKIOCGETEFI), 2142 &dk_efi, vd_get_efi_in, vd_get_efi_out, B_FALSE}, 2143 2144 /* "Set" (copy-in) operations */ 2145 {VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int), 2146 DKIOCSETWCE, STRINGIZE(DKIOCSETWCE), 2147 NULL, VD_IDENTITY, VD_IDENTITY, B_TRUE}, 2148 {VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM), 2149 RNDSIZE(vd_geom_t), 2150 DKIOCSGEOM, STRINGIZE(DKIOCSGEOM), 2151 &dk_geom, vd_geom2dk_geom, NULL, B_TRUE}, 2152 {VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t), 2153 DKIOCSVTOC, STRINGIZE(DKIOCSVTOC), 2154 &vtoc, vd_vtoc2vtoc, NULL, B_TRUE}, 2155 {VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t), 2156 DKIOCSETEFI, STRINGIZE(DKIOCSETEFI), 2157 &dk_efi, vd_set_efi_in, vd_set_efi_out, B_TRUE}, 2158 }; 2159 size_t nioctls = (sizeof (ioctl))/(sizeof (ioctl[0])); 2160 2161 2162 ASSERT(vd != NULL); 2163 ASSERT(request != NULL); 2164 ASSERT(request->slice < vd->nslices); 2165 2166 /* 2167 * Determine ioctl corresponding to caller's "operation" and 2168 * validate caller's "nbytes" 2169 */ 2170 for (i = 0; i < nioctls; i++) { 2171 if (request->operation == ioctl[i].operation) { 2172 /* LDC memory operations require 8-byte multiples */ 2173 ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0); 2174 2175 if (request->operation == VD_OP_GET_EFI || 2176 request->operation == VD_OP_SET_EFI) { 2177 if (request->nbytes >= ioctl[i].nbytes) 2178 break; 2179 PR0("%s: Expected at least nbytes = %lu, " 2180 "got %lu", ioctl[i].operation_name, 2181 ioctl[i].nbytes, request->nbytes); 2182 return (EINVAL); 2183 } 2184 2185 if (request->nbytes != ioctl[i].nbytes) { 2186 PR0("%s: Expected nbytes = %lu, got %lu", 2187 ioctl[i].operation_name, ioctl[i].nbytes, 2188 request->nbytes); 2189 return (EINVAL); 2190 } 2191 2192 break; 2193 } 2194 } 2195 ASSERT(i < nioctls); /* because "operation" already validated */ 2196 2197 if (!(vd->open_flags & FWRITE) && ioctl[i].write) { 2198 PR0("%s fails because backend is opened read-only", 2199 ioctl[i].operation_name); 2200 request->status = EROFS; 2201 return (0); 2202 } 2203 2204 if (request->nbytes) 2205 buf = kmem_zalloc(request->nbytes, KM_SLEEP); 2206 status = vd_do_ioctl(vd, request, buf, &ioctl[i]); 2207 if (request->nbytes) 2208 kmem_free(buf, request->nbytes); 2209 2210 return (status); 2211 } 2212 2213 static int 2214 vd_get_devid(vd_task_t *task) 2215 { 2216 vd_t *vd = task->vd; 2217 vd_dring_payload_t *request = task->request; 2218 vd_devid_t *vd_devid; 2219 impl_devid_t *devid; 2220 int status, bufid_len, devid_len, len, sz; 2221 int bufbytes; 2222 2223 PR1("Get Device ID, nbytes=%ld", request->nbytes); 2224 2225 if (vd->file) { 2226 if (vd->file_devid == NULL) { 2227 PR2("No Device ID"); 2228 request->status = ENOENT; 2229 return (0); 2230 } else { 2231 sz = ddi_devid_sizeof(vd->file_devid); 2232 devid = kmem_alloc(sz, KM_SLEEP); 2233 bcopy(vd->file_devid, devid, sz); 2234 } 2235 } else { 2236 if (ddi_lyr_get_devid(vd->dev[request->slice], 2237 (ddi_devid_t *)&devid) != DDI_SUCCESS) { 2238 PR2("No Device ID"); 2239 request->status = ENOENT; 2240 return (0); 2241 } 2242 } 2243 2244 bufid_len = request->nbytes - sizeof (vd_devid_t) + 1; 2245 devid_len = DEVID_GETLEN(devid); 2246 2247 /* 2248 * Save the buffer size here for use in deallocation. 2249 * The actual number of bytes copied is returned in 2250 * the 'nbytes' field of the request structure. 2251 */ 2252 bufbytes = request->nbytes; 2253 2254 vd_devid = kmem_zalloc(bufbytes, KM_SLEEP); 2255 vd_devid->length = devid_len; 2256 vd_devid->type = DEVID_GETTYPE(devid); 2257 2258 len = (devid_len > bufid_len)? bufid_len : devid_len; 2259 2260 bcopy(devid->did_id, vd_devid->id, len); 2261 2262 request->status = 0; 2263 2264 /* LDC memory operations require 8-byte multiples */ 2265 ASSERT(request->nbytes % sizeof (uint64_t) == 0); 2266 2267 if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0, 2268 &request->nbytes, request->cookie, request->ncookies, 2269 LDC_COPY_OUT)) != 0) { 2270 PR0("ldc_mem_copy() returned errno %d copying to client", 2271 status); 2272 } 2273 PR1("post mem_copy: nbytes=%ld", request->nbytes); 2274 2275 kmem_free(vd_devid, bufbytes); 2276 ddi_devid_free((ddi_devid_t)devid); 2277 2278 return (status); 2279 } 2280 2281 /* 2282 * Define the supported operations once the functions for performing them have 2283 * been defined 2284 */ 2285 static const vds_operation_t vds_operation[] = { 2286 #define X(_s) #_s, _s 2287 {X(VD_OP_BREAD), vd_start_bio, vd_complete_bio}, 2288 {X(VD_OP_BWRITE), vd_start_bio, vd_complete_bio}, 2289 {X(VD_OP_FLUSH), vd_ioctl, NULL}, 2290 {X(VD_OP_GET_WCE), vd_ioctl, NULL}, 2291 {X(VD_OP_SET_WCE), vd_ioctl, NULL}, 2292 {X(VD_OP_GET_VTOC), vd_ioctl, NULL}, 2293 {X(VD_OP_SET_VTOC), vd_ioctl, NULL}, 2294 {X(VD_OP_GET_DISKGEOM), vd_ioctl, NULL}, 2295 {X(VD_OP_SET_DISKGEOM), vd_ioctl, NULL}, 2296 {X(VD_OP_GET_EFI), vd_ioctl, NULL}, 2297 {X(VD_OP_SET_EFI), vd_ioctl, NULL}, 2298 {X(VD_OP_GET_DEVID), vd_get_devid, NULL}, 2299 #undef X 2300 }; 2301 2302 static const size_t vds_noperations = 2303 (sizeof (vds_operation))/(sizeof (vds_operation[0])); 2304 2305 /* 2306 * Process a task specifying a client I/O request 2307 * 2308 * Parameters: 2309 * task - structure containing the request sent from client 2310 * 2311 * Return Value 2312 * 0 - success 2313 * ENOTSUP - Unknown/Unsupported VD_OP_XXX operation 2314 * EINVAL - Invalid disk slice 2315 * != 0 - some other non-zero return value from start function 2316 */ 2317 static int 2318 vd_do_process_task(vd_task_t *task) 2319 { 2320 int i; 2321 vd_t *vd = task->vd; 2322 vd_dring_payload_t *request = task->request; 2323 2324 ASSERT(vd != NULL); 2325 ASSERT(request != NULL); 2326 2327 /* Find the requested operation */ 2328 for (i = 0; i < vds_noperations; i++) { 2329 if (request->operation == vds_operation[i].operation) { 2330 /* all operations should have a start func */ 2331 ASSERT(vds_operation[i].start != NULL); 2332 2333 task->completef = vds_operation[i].complete; 2334 break; 2335 } 2336 } 2337 if (i == vds_noperations) { 2338 PR0("Unsupported operation %u", request->operation); 2339 return (ENOTSUP); 2340 } 2341 2342 /* Range-check slice */ 2343 if (request->slice >= vd->nslices && 2344 (vd->vdisk_type != VD_DISK_TYPE_DISK || 2345 request->slice != VD_SLICE_NONE)) { 2346 PR0("Invalid \"slice\" %u (max %u) for virtual disk", 2347 request->slice, (vd->nslices - 1)); 2348 return (EINVAL); 2349 } 2350 2351 /* 2352 * Call the function pointer that starts the operation. 2353 */ 2354 return (vds_operation[i].start(task)); 2355 } 2356 2357 /* 2358 * Description: 2359 * This function is called by both the in-band and descriptor ring 2360 * message processing functions paths to actually execute the task 2361 * requested by the vDisk client. It in turn calls its worker 2362 * function, vd_do_process_task(), to carry our the request. 2363 * 2364 * Any transport errors (e.g. LDC errors, vDisk protocol errors) are 2365 * saved in the 'status' field of the task and are propagated back 2366 * up the call stack to trigger a NACK 2367 * 2368 * Any request errors (e.g. ENOTTY from an ioctl) are saved in 2369 * the 'status' field of the request and result in an ACK being sent 2370 * by the completion handler. 2371 * 2372 * Parameters: 2373 * task - structure containing the request sent from client 2374 * 2375 * Return Value 2376 * 0 - successful synchronous request. 2377 * != 0 - transport error (e.g. LDC errors, vDisk protocol) 2378 * EINPROGRESS - task will be finished in a completion handler 2379 */ 2380 static int 2381 vd_process_task(vd_task_t *task) 2382 { 2383 vd_t *vd = task->vd; 2384 int status; 2385 2386 DTRACE_PROBE1(task__start, vd_task_t *, task); 2387 2388 task->status = vd_do_process_task(task); 2389 2390 /* 2391 * If the task processing function returned EINPROGRESS indicating 2392 * that the task needs completing then schedule a taskq entry to 2393 * finish it now. 2394 * 2395 * Otherwise the task processing function returned either zero 2396 * indicating that the task was finished in the start function (and we 2397 * don't need to wait in a completion function) or the start function 2398 * returned an error - in both cases all that needs to happen is the 2399 * notification to the vDisk client higher up the call stack. 2400 * If the task was using a Descriptor Ring, we need to mark it as done 2401 * at this stage. 2402 */ 2403 if (task->status == EINPROGRESS) { 2404 /* Queue a task to complete the operation */ 2405 (void) ddi_taskq_dispatch(vd->completionq, vd_complete, 2406 task, DDI_SLEEP); 2407 2408 } else if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE)) { 2409 /* Update the dring element if it's a dring client */ 2410 status = vd_mark_elem_done(vd, task->index, 2411 task->request->status, task->request->nbytes); 2412 if (status == ECONNRESET) 2413 vd_mark_in_reset(vd); 2414 } 2415 2416 return (task->status); 2417 } 2418 2419 /* 2420 * Return true if the "type", "subtype", and "env" fields of the "tag" first 2421 * argument match the corresponding remaining arguments; otherwise, return false 2422 */ 2423 boolean_t 2424 vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env) 2425 { 2426 return ((tag->vio_msgtype == type) && 2427 (tag->vio_subtype == subtype) && 2428 (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE; 2429 } 2430 2431 /* 2432 * Check whether the major/minor version specified in "ver_msg" is supported 2433 * by this server. 2434 */ 2435 static boolean_t 2436 vds_supported_version(vio_ver_msg_t *ver_msg) 2437 { 2438 for (int i = 0; i < vds_num_versions; i++) { 2439 ASSERT(vds_version[i].major > 0); 2440 ASSERT((i == 0) || 2441 (vds_version[i].major < vds_version[i-1].major)); 2442 2443 /* 2444 * If the major versions match, adjust the minor version, if 2445 * necessary, down to the highest value supported by this 2446 * server and return true so this message will get "ack"ed; 2447 * the client should also support all minor versions lower 2448 * than the value it sent 2449 */ 2450 if (ver_msg->ver_major == vds_version[i].major) { 2451 if (ver_msg->ver_minor > vds_version[i].minor) { 2452 PR0("Adjusting minor version from %u to %u", 2453 ver_msg->ver_minor, vds_version[i].minor); 2454 ver_msg->ver_minor = vds_version[i].minor; 2455 } 2456 return (B_TRUE); 2457 } 2458 2459 /* 2460 * If the message contains a higher major version number, set 2461 * the message's major/minor versions to the current values 2462 * and return false, so this message will get "nack"ed with 2463 * these values, and the client will potentially try again 2464 * with the same or a lower version 2465 */ 2466 if (ver_msg->ver_major > vds_version[i].major) { 2467 ver_msg->ver_major = vds_version[i].major; 2468 ver_msg->ver_minor = vds_version[i].minor; 2469 return (B_FALSE); 2470 } 2471 2472 /* 2473 * Otherwise, the message's major version is less than the 2474 * current major version, so continue the loop to the next 2475 * (lower) supported version 2476 */ 2477 } 2478 2479 /* 2480 * No common version was found; "ground" the version pair in the 2481 * message to terminate negotiation 2482 */ 2483 ver_msg->ver_major = 0; 2484 ver_msg->ver_minor = 0; 2485 return (B_FALSE); 2486 } 2487 2488 /* 2489 * Process a version message from a client. vds expects to receive version 2490 * messages from clients seeking service, but never issues version messages 2491 * itself; therefore, vds can ACK or NACK client version messages, but does 2492 * not expect to receive version-message ACKs or NACKs (and will treat such 2493 * messages as invalid). 2494 */ 2495 static int 2496 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2497 { 2498 vio_ver_msg_t *ver_msg = (vio_ver_msg_t *)msg; 2499 2500 2501 ASSERT(msglen >= sizeof (msg->tag)); 2502 2503 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2504 VIO_VER_INFO)) { 2505 return (ENOMSG); /* not a version message */ 2506 } 2507 2508 if (msglen != sizeof (*ver_msg)) { 2509 PR0("Expected %lu-byte version message; " 2510 "received %lu bytes", sizeof (*ver_msg), msglen); 2511 return (EBADMSG); 2512 } 2513 2514 if (ver_msg->dev_class != VDEV_DISK) { 2515 PR0("Expected device class %u (disk); received %u", 2516 VDEV_DISK, ver_msg->dev_class); 2517 return (EBADMSG); 2518 } 2519 2520 /* 2521 * We're talking to the expected kind of client; set our device class 2522 * for "ack/nack" back to the client 2523 */ 2524 ver_msg->dev_class = VDEV_DISK_SERVER; 2525 2526 /* 2527 * Check whether the (valid) version message specifies a version 2528 * supported by this server. If the version is not supported, return 2529 * EBADMSG so the message will get "nack"ed; vds_supported_version() 2530 * will have updated the message with a supported version for the 2531 * client to consider 2532 */ 2533 if (!vds_supported_version(ver_msg)) 2534 return (EBADMSG); 2535 2536 2537 /* 2538 * A version has been agreed upon; use the client's SID for 2539 * communication on this channel now 2540 */ 2541 ASSERT(!(vd->initialized & VD_SID)); 2542 vd->sid = ver_msg->tag.vio_sid; 2543 vd->initialized |= VD_SID; 2544 2545 /* 2546 * When multiple versions are supported, this function should store 2547 * the negotiated major and minor version values in the "vd" data 2548 * structure to govern further communication; in particular, note that 2549 * the client might have specified a lower minor version for the 2550 * agreed major version than specifed in the vds_version[] array. The 2551 * following assertions should help remind future maintainers to make 2552 * the appropriate changes to support multiple versions. 2553 */ 2554 ASSERT(vds_num_versions == 1); 2555 ASSERT(ver_msg->ver_major == vds_version[0].major); 2556 ASSERT(ver_msg->ver_minor == vds_version[0].minor); 2557 2558 PR0("Using major version %u, minor version %u", 2559 ver_msg->ver_major, ver_msg->ver_minor); 2560 return (0); 2561 } 2562 2563 static int 2564 vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2565 { 2566 vd_attr_msg_t *attr_msg = (vd_attr_msg_t *)msg; 2567 int status, retry = 0; 2568 2569 2570 ASSERT(msglen >= sizeof (msg->tag)); 2571 2572 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2573 VIO_ATTR_INFO)) { 2574 PR0("Message is not an attribute message"); 2575 return (ENOMSG); 2576 } 2577 2578 if (msglen != sizeof (*attr_msg)) { 2579 PR0("Expected %lu-byte attribute message; " 2580 "received %lu bytes", sizeof (*attr_msg), msglen); 2581 return (EBADMSG); 2582 } 2583 2584 if (attr_msg->max_xfer_sz == 0) { 2585 PR0("Received maximum transfer size of 0 from client"); 2586 return (EBADMSG); 2587 } 2588 2589 if ((attr_msg->xfer_mode != VIO_DESC_MODE) && 2590 (attr_msg->xfer_mode != VIO_DRING_MODE)) { 2591 PR0("Client requested unsupported transfer mode"); 2592 return (EBADMSG); 2593 } 2594 2595 /* 2596 * check if the underlying disk is ready, if not try accessing 2597 * the device again. Open the vdisk device and extract info 2598 * about it, as this is needed to respond to the attr info msg 2599 */ 2600 if ((vd->initialized & VD_DISK_READY) == 0) { 2601 PR0("Retry setting up disk (%s)", vd->device_path); 2602 do { 2603 status = vd_setup_vd(vd); 2604 if (status != EAGAIN || ++retry > vds_dev_retries) 2605 break; 2606 2607 /* incremental delay */ 2608 delay(drv_usectohz(vds_dev_delay)); 2609 2610 /* if vdisk is no longer enabled - return error */ 2611 if (!vd_enabled(vd)) 2612 return (ENXIO); 2613 2614 } while (status == EAGAIN); 2615 2616 if (status) 2617 return (ENXIO); 2618 2619 vd->initialized |= VD_DISK_READY; 2620 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); 2621 PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u", 2622 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), 2623 (vd->pseudo ? "yes" : "no"), 2624 (vd->file ? "yes" : "no"), 2625 vd->nslices); 2626 } 2627 2628 /* Success: valid message and transfer mode */ 2629 vd->xfer_mode = attr_msg->xfer_mode; 2630 2631 if (vd->xfer_mode == VIO_DESC_MODE) { 2632 2633 /* 2634 * The vd_dring_inband_msg_t contains one cookie; need room 2635 * for up to n-1 more cookies, where "n" is the number of full 2636 * pages plus possibly one partial page required to cover 2637 * "max_xfer_sz". Add room for one more cookie if 2638 * "max_xfer_sz" isn't an integral multiple of the page size. 2639 * Must first get the maximum transfer size in bytes. 2640 */ 2641 size_t max_xfer_bytes = attr_msg->vdisk_block_size ? 2642 attr_msg->vdisk_block_size*attr_msg->max_xfer_sz : 2643 attr_msg->max_xfer_sz; 2644 size_t max_inband_msglen = 2645 sizeof (vd_dring_inband_msg_t) + 2646 ((max_xfer_bytes/PAGESIZE + 2647 ((max_xfer_bytes % PAGESIZE) ? 1 : 0))* 2648 (sizeof (ldc_mem_cookie_t))); 2649 2650 /* 2651 * Set the maximum expected message length to 2652 * accommodate in-band-descriptor messages with all 2653 * their cookies 2654 */ 2655 vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen); 2656 2657 /* 2658 * Initialize the data structure for processing in-band I/O 2659 * request descriptors 2660 */ 2661 vd->inband_task.vd = vd; 2662 vd->inband_task.msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 2663 vd->inband_task.index = 0; 2664 vd->inband_task.type = VD_FINAL_RANGE_TASK; /* range == 1 */ 2665 } 2666 2667 /* Return the device's block size and max transfer size to the client */ 2668 attr_msg->vdisk_block_size = DEV_BSIZE; 2669 attr_msg->max_xfer_sz = vd->max_xfer_sz; 2670 2671 attr_msg->vdisk_size = vd->vdisk_size; 2672 attr_msg->vdisk_type = vd->vdisk_type; 2673 attr_msg->operations = vds_operations; 2674 PR0("%s", VD_CLIENT(vd)); 2675 2676 ASSERT(vd->dring_task == NULL); 2677 2678 return (0); 2679 } 2680 2681 static int 2682 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2683 { 2684 int status; 2685 size_t expected; 2686 ldc_mem_info_t dring_minfo; 2687 vio_dring_reg_msg_t *reg_msg = (vio_dring_reg_msg_t *)msg; 2688 2689 2690 ASSERT(msglen >= sizeof (msg->tag)); 2691 2692 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2693 VIO_DRING_REG)) { 2694 PR0("Message is not a register-dring message"); 2695 return (ENOMSG); 2696 } 2697 2698 if (msglen < sizeof (*reg_msg)) { 2699 PR0("Expected at least %lu-byte register-dring message; " 2700 "received %lu bytes", sizeof (*reg_msg), msglen); 2701 return (EBADMSG); 2702 } 2703 2704 expected = sizeof (*reg_msg) + 2705 (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0])); 2706 if (msglen != expected) { 2707 PR0("Expected %lu-byte register-dring message; " 2708 "received %lu bytes", expected, msglen); 2709 return (EBADMSG); 2710 } 2711 2712 if (vd->initialized & VD_DRING) { 2713 PR0("A dring was previously registered; only support one"); 2714 return (EBADMSG); 2715 } 2716 2717 if (reg_msg->num_descriptors > INT32_MAX) { 2718 PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)", 2719 reg_msg->ncookies, INT32_MAX, STRINGIZE(INT32_MAX)); 2720 return (EBADMSG); 2721 } 2722 2723 if (reg_msg->ncookies != 1) { 2724 /* 2725 * In addition to fixing the assertion in the success case 2726 * below, supporting drings which require more than one 2727 * "cookie" requires increasing the value of vd->max_msglen 2728 * somewhere in the code path prior to receiving the message 2729 * which results in calling this function. Note that without 2730 * making this change, the larger message size required to 2731 * accommodate multiple cookies cannot be successfully 2732 * received, so this function will not even get called. 2733 * Gracefully accommodating more dring cookies might 2734 * reasonably demand exchanging an additional attribute or 2735 * making a minor protocol adjustment 2736 */ 2737 PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies); 2738 return (EBADMSG); 2739 } 2740 2741 status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie, 2742 reg_msg->ncookies, reg_msg->num_descriptors, 2743 reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle); 2744 if (status != 0) { 2745 PR0("ldc_mem_dring_map() returned errno %d", status); 2746 return (status); 2747 } 2748 2749 /* 2750 * To remove the need for this assertion, must call 2751 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a 2752 * successful call to ldc_mem_dring_map() 2753 */ 2754 ASSERT(reg_msg->ncookies == 1); 2755 2756 if ((status = 2757 ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) { 2758 PR0("ldc_mem_dring_info() returned errno %d", status); 2759 if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0) 2760 PR0("ldc_mem_dring_unmap() returned errno %d", status); 2761 return (status); 2762 } 2763 2764 if (dring_minfo.vaddr == NULL) { 2765 PR0("Descriptor ring virtual address is NULL"); 2766 return (ENXIO); 2767 } 2768 2769 2770 /* Initialize for valid message and mapped dring */ 2771 PR1("descriptor size = %u, dring length = %u", 2772 vd->descriptor_size, vd->dring_len); 2773 vd->initialized |= VD_DRING; 2774 vd->dring_ident = 1; /* "There Can Be Only One" */ 2775 vd->dring = dring_minfo.vaddr; 2776 vd->descriptor_size = reg_msg->descriptor_size; 2777 vd->dring_len = reg_msg->num_descriptors; 2778 reg_msg->dring_ident = vd->dring_ident; 2779 2780 /* 2781 * Allocate and initialize a "shadow" array of data structures for 2782 * tasks to process I/O requests in dring elements 2783 */ 2784 vd->dring_task = 2785 kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP); 2786 for (int i = 0; i < vd->dring_len; i++) { 2787 vd->dring_task[i].vd = vd; 2788 vd->dring_task[i].index = i; 2789 vd->dring_task[i].request = &VD_DRING_ELEM(i)->payload; 2790 2791 status = ldc_mem_alloc_handle(vd->ldc_handle, 2792 &(vd->dring_task[i].mhdl)); 2793 if (status) { 2794 PR0("ldc_mem_alloc_handle() returned err %d ", status); 2795 return (ENXIO); 2796 } 2797 2798 vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 2799 } 2800 2801 return (0); 2802 } 2803 2804 static int 2805 vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2806 { 2807 vio_dring_unreg_msg_t *unreg_msg = (vio_dring_unreg_msg_t *)msg; 2808 2809 2810 ASSERT(msglen >= sizeof (msg->tag)); 2811 2812 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 2813 VIO_DRING_UNREG)) { 2814 PR0("Message is not an unregister-dring message"); 2815 return (ENOMSG); 2816 } 2817 2818 if (msglen != sizeof (*unreg_msg)) { 2819 PR0("Expected %lu-byte unregister-dring message; " 2820 "received %lu bytes", sizeof (*unreg_msg), msglen); 2821 return (EBADMSG); 2822 } 2823 2824 if (unreg_msg->dring_ident != vd->dring_ident) { 2825 PR0("Expected dring ident %lu; received %lu", 2826 vd->dring_ident, unreg_msg->dring_ident); 2827 return (EBADMSG); 2828 } 2829 2830 return (0); 2831 } 2832 2833 static int 2834 process_rdx_msg(vio_msg_t *msg, size_t msglen) 2835 { 2836 ASSERT(msglen >= sizeof (msg->tag)); 2837 2838 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) { 2839 PR0("Message is not an RDX message"); 2840 return (ENOMSG); 2841 } 2842 2843 if (msglen != sizeof (vio_rdx_msg_t)) { 2844 PR0("Expected %lu-byte RDX message; received %lu bytes", 2845 sizeof (vio_rdx_msg_t), msglen); 2846 return (EBADMSG); 2847 } 2848 2849 PR0("Valid RDX message"); 2850 return (0); 2851 } 2852 2853 static int 2854 vd_check_seq_num(vd_t *vd, uint64_t seq_num) 2855 { 2856 if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) { 2857 PR0("Received seq_num %lu; expected %lu", 2858 seq_num, (vd->seq_num + 1)); 2859 PR0("initiating soft reset"); 2860 vd_need_reset(vd, B_FALSE); 2861 return (1); 2862 } 2863 2864 vd->seq_num = seq_num; 2865 vd->initialized |= VD_SEQ_NUM; /* superfluous after first time... */ 2866 return (0); 2867 } 2868 2869 /* 2870 * Return the expected size of an inband-descriptor message with all the 2871 * cookies it claims to include 2872 */ 2873 static size_t 2874 expected_inband_size(vd_dring_inband_msg_t *msg) 2875 { 2876 return ((sizeof (*msg)) + 2877 (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0]))); 2878 } 2879 2880 /* 2881 * Process an in-band descriptor message: used with clients like OBP, with 2882 * which vds exchanges descriptors within VIO message payloads, rather than 2883 * operating on them within a descriptor ring 2884 */ 2885 static int 2886 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2887 { 2888 size_t expected; 2889 vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg; 2890 2891 2892 ASSERT(msglen >= sizeof (msg->tag)); 2893 2894 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 2895 VIO_DESC_DATA)) { 2896 PR1("Message is not an in-band-descriptor message"); 2897 return (ENOMSG); 2898 } 2899 2900 if (msglen < sizeof (*desc_msg)) { 2901 PR0("Expected at least %lu-byte descriptor message; " 2902 "received %lu bytes", sizeof (*desc_msg), msglen); 2903 return (EBADMSG); 2904 } 2905 2906 if (msglen != (expected = expected_inband_size(desc_msg))) { 2907 PR0("Expected %lu-byte descriptor message; " 2908 "received %lu bytes", expected, msglen); 2909 return (EBADMSG); 2910 } 2911 2912 if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) 2913 return (EBADMSG); 2914 2915 /* 2916 * Valid message: Set up the in-band descriptor task and process the 2917 * request. Arrange to acknowledge the client's message, unless an 2918 * error processing the descriptor task results in setting 2919 * VIO_SUBTYPE_NACK 2920 */ 2921 PR1("Valid in-band-descriptor message"); 2922 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 2923 2924 ASSERT(vd->inband_task.msg != NULL); 2925 2926 bcopy(msg, vd->inband_task.msg, msglen); 2927 vd->inband_task.msglen = msglen; 2928 2929 /* 2930 * The task request is now the payload of the message 2931 * that was just copied into the body of the task. 2932 */ 2933 desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg; 2934 vd->inband_task.request = &desc_msg->payload; 2935 2936 return (vd_process_task(&vd->inband_task)); 2937 } 2938 2939 static int 2940 vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx, 2941 vio_msg_t *msg, size_t msglen) 2942 { 2943 int status; 2944 boolean_t ready; 2945 vd_dring_entry_t *elem = VD_DRING_ELEM(idx); 2946 2947 2948 /* Accept the updated dring element */ 2949 if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { 2950 PR0("ldc_mem_dring_acquire() returned errno %d", status); 2951 return (status); 2952 } 2953 ready = (elem->hdr.dstate == VIO_DESC_READY); 2954 if (ready) { 2955 elem->hdr.dstate = VIO_DESC_ACCEPTED; 2956 } else { 2957 PR0("descriptor %u not ready", idx); 2958 VD_DUMP_DRING_ELEM(elem); 2959 } 2960 if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 2961 PR0("ldc_mem_dring_release() returned errno %d", status); 2962 return (status); 2963 } 2964 if (!ready) 2965 return (EBUSY); 2966 2967 2968 /* Initialize a task and process the accepted element */ 2969 PR1("Processing dring element %u", idx); 2970 vd->dring_task[idx].type = type; 2971 2972 /* duplicate msg buf for cookies etc. */ 2973 bcopy(msg, vd->dring_task[idx].msg, msglen); 2974 2975 vd->dring_task[idx].msglen = msglen; 2976 return (vd_process_task(&vd->dring_task[idx])); 2977 } 2978 2979 static int 2980 vd_process_element_range(vd_t *vd, int start, int end, 2981 vio_msg_t *msg, size_t msglen) 2982 { 2983 int i, n, nelem, status = 0; 2984 boolean_t inprogress = B_FALSE; 2985 vd_task_type_t type; 2986 2987 2988 ASSERT(start >= 0); 2989 ASSERT(end >= 0); 2990 2991 /* 2992 * Arrange to acknowledge the client's message, unless an error 2993 * processing one of the dring elements results in setting 2994 * VIO_SUBTYPE_NACK 2995 */ 2996 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 2997 2998 /* 2999 * Process the dring elements in the range 3000 */ 3001 nelem = ((end < start) ? end + vd->dring_len : end) - start + 1; 3002 for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) { 3003 ((vio_dring_msg_t *)msg)->end_idx = i; 3004 type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK; 3005 status = vd_process_element(vd, type, i, msg, msglen); 3006 if (status == EINPROGRESS) 3007 inprogress = B_TRUE; 3008 else if (status != 0) 3009 break; 3010 } 3011 3012 /* 3013 * If some, but not all, operations of a multi-element range are in 3014 * progress, wait for other operations to complete before returning 3015 * (which will result in "ack" or "nack" of the message). Note that 3016 * all outstanding operations will need to complete, not just the ones 3017 * corresponding to the current range of dring elements; howevever, as 3018 * this situation is an error case, performance is less critical. 3019 */ 3020 if ((nelem > 1) && (status != EINPROGRESS) && inprogress) 3021 ddi_taskq_wait(vd->completionq); 3022 3023 return (status); 3024 } 3025 3026 static int 3027 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3028 { 3029 vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg; 3030 3031 3032 ASSERT(msglen >= sizeof (msg->tag)); 3033 3034 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 3035 VIO_DRING_DATA)) { 3036 PR1("Message is not a dring-data message"); 3037 return (ENOMSG); 3038 } 3039 3040 if (msglen != sizeof (*dring_msg)) { 3041 PR0("Expected %lu-byte dring message; received %lu bytes", 3042 sizeof (*dring_msg), msglen); 3043 return (EBADMSG); 3044 } 3045 3046 if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) 3047 return (EBADMSG); 3048 3049 if (dring_msg->dring_ident != vd->dring_ident) { 3050 PR0("Expected dring ident %lu; received ident %lu", 3051 vd->dring_ident, dring_msg->dring_ident); 3052 return (EBADMSG); 3053 } 3054 3055 if (dring_msg->start_idx >= vd->dring_len) { 3056 PR0("\"start_idx\" = %u; must be less than %u", 3057 dring_msg->start_idx, vd->dring_len); 3058 return (EBADMSG); 3059 } 3060 3061 if ((dring_msg->end_idx < 0) || 3062 (dring_msg->end_idx >= vd->dring_len)) { 3063 PR0("\"end_idx\" = %u; must be >= 0 and less than %u", 3064 dring_msg->end_idx, vd->dring_len); 3065 return (EBADMSG); 3066 } 3067 3068 /* Valid message; process range of updated dring elements */ 3069 PR1("Processing descriptor range, start = %u, end = %u", 3070 dring_msg->start_idx, dring_msg->end_idx); 3071 return (vd_process_element_range(vd, dring_msg->start_idx, 3072 dring_msg->end_idx, msg, msglen)); 3073 } 3074 3075 static int 3076 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes) 3077 { 3078 int retry, status; 3079 size_t size = *nbytes; 3080 3081 3082 for (retry = 0, status = ETIMEDOUT; 3083 retry < vds_ldc_retries && status == ETIMEDOUT; 3084 retry++) { 3085 PR1("ldc_read() attempt %d", (retry + 1)); 3086 *nbytes = size; 3087 status = ldc_read(ldc_handle, msg, nbytes); 3088 } 3089 3090 if (status) { 3091 PR0("ldc_read() returned errno %d", status); 3092 if (status != ECONNRESET) 3093 return (ENOMSG); 3094 return (status); 3095 } else if (*nbytes == 0) { 3096 PR1("ldc_read() returned 0 and no message read"); 3097 return (ENOMSG); 3098 } 3099 3100 PR1("RCVD %lu-byte message", *nbytes); 3101 return (0); 3102 } 3103 3104 static int 3105 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3106 { 3107 int status; 3108 3109 3110 PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype, 3111 msg->tag.vio_subtype, msg->tag.vio_subtype_env); 3112 #ifdef DEBUG 3113 vd_decode_tag(msg); 3114 #endif 3115 3116 /* 3117 * Validate session ID up front, since it applies to all messages 3118 * once set 3119 */ 3120 if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) { 3121 PR0("Expected SID %u, received %u", vd->sid, 3122 msg->tag.vio_sid); 3123 return (EBADMSG); 3124 } 3125 3126 PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state)); 3127 3128 /* 3129 * Process the received message based on connection state 3130 */ 3131 switch (vd->state) { 3132 case VD_STATE_INIT: /* expect version message */ 3133 if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0) 3134 return (status); 3135 3136 /* Version negotiated, move to that state */ 3137 vd->state = VD_STATE_VER; 3138 return (0); 3139 3140 case VD_STATE_VER: /* expect attribute message */ 3141 if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0) 3142 return (status); 3143 3144 /* Attributes exchanged, move to that state */ 3145 vd->state = VD_STATE_ATTR; 3146 return (0); 3147 3148 case VD_STATE_ATTR: 3149 switch (vd->xfer_mode) { 3150 case VIO_DESC_MODE: /* expect RDX message */ 3151 if ((status = process_rdx_msg(msg, msglen)) != 0) 3152 return (status); 3153 3154 /* Ready to receive in-band descriptors */ 3155 vd->state = VD_STATE_DATA; 3156 return (0); 3157 3158 case VIO_DRING_MODE: /* expect register-dring message */ 3159 if ((status = 3160 vd_process_dring_reg_msg(vd, msg, msglen)) != 0) 3161 return (status); 3162 3163 /* One dring negotiated, move to that state */ 3164 vd->state = VD_STATE_DRING; 3165 return (0); 3166 3167 default: 3168 ASSERT("Unsupported transfer mode"); 3169 PR0("Unsupported transfer mode"); 3170 return (ENOTSUP); 3171 } 3172 3173 case VD_STATE_DRING: /* expect RDX, register-dring, or unreg-dring */ 3174 if ((status = process_rdx_msg(msg, msglen)) == 0) { 3175 /* Ready to receive data */ 3176 vd->state = VD_STATE_DATA; 3177 return (0); 3178 } else if (status != ENOMSG) { 3179 return (status); 3180 } 3181 3182 3183 /* 3184 * If another register-dring message is received, stay in 3185 * dring state in case the client sends RDX; although the 3186 * protocol allows multiple drings, this server does not 3187 * support using more than one 3188 */ 3189 if ((status = 3190 vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG) 3191 return (status); 3192 3193 /* 3194 * Acknowledge an unregister-dring message, but reset the 3195 * connection anyway: Although the protocol allows 3196 * unregistering drings, this server cannot serve a vdisk 3197 * without its only dring 3198 */ 3199 status = vd_process_dring_unreg_msg(vd, msg, msglen); 3200 return ((status == 0) ? ENOTSUP : status); 3201 3202 case VD_STATE_DATA: 3203 switch (vd->xfer_mode) { 3204 case VIO_DESC_MODE: /* expect in-band-descriptor message */ 3205 return (vd_process_desc_msg(vd, msg, msglen)); 3206 3207 case VIO_DRING_MODE: /* expect dring-data or unreg-dring */ 3208 /* 3209 * Typically expect dring-data messages, so handle 3210 * them first 3211 */ 3212 if ((status = vd_process_dring_msg(vd, msg, 3213 msglen)) != ENOMSG) 3214 return (status); 3215 3216 /* 3217 * Acknowledge an unregister-dring message, but reset 3218 * the connection anyway: Although the protocol 3219 * allows unregistering drings, this server cannot 3220 * serve a vdisk without its only dring 3221 */ 3222 status = vd_process_dring_unreg_msg(vd, msg, msglen); 3223 return ((status == 0) ? ENOTSUP : status); 3224 3225 default: 3226 ASSERT("Unsupported transfer mode"); 3227 PR0("Unsupported transfer mode"); 3228 return (ENOTSUP); 3229 } 3230 3231 default: 3232 ASSERT("Invalid client connection state"); 3233 PR0("Invalid client connection state"); 3234 return (ENOTSUP); 3235 } 3236 } 3237 3238 static int 3239 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3240 { 3241 int status; 3242 boolean_t reset_ldc = B_FALSE; 3243 vd_task_t task; 3244 3245 /* 3246 * Check that the message is at least big enough for a "tag", so that 3247 * message processing can proceed based on tag-specified message type 3248 */ 3249 if (msglen < sizeof (vio_msg_tag_t)) { 3250 PR0("Received short (%lu-byte) message", msglen); 3251 /* Can't "nack" short message, so drop the big hammer */ 3252 PR0("initiating full reset"); 3253 vd_need_reset(vd, B_TRUE); 3254 return (EBADMSG); 3255 } 3256 3257 /* 3258 * Process the message 3259 */ 3260 switch (status = vd_do_process_msg(vd, msg, msglen)) { 3261 case 0: 3262 /* "ack" valid, successfully-processed messages */ 3263 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 3264 break; 3265 3266 case EINPROGRESS: 3267 /* The completion handler will "ack" or "nack" the message */ 3268 return (EINPROGRESS); 3269 case ENOMSG: 3270 PR0("Received unexpected message"); 3271 _NOTE(FALLTHROUGH); 3272 case EBADMSG: 3273 case ENOTSUP: 3274 /* "transport" error will cause NACK of invalid messages */ 3275 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 3276 break; 3277 3278 default: 3279 /* "transport" error will cause NACK of invalid messages */ 3280 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 3281 /* An LDC error probably occurred, so try resetting it */ 3282 reset_ldc = B_TRUE; 3283 break; 3284 } 3285 3286 PR1("\tResulting in state %d (%s)", vd->state, 3287 vd_decode_state(vd->state)); 3288 3289 /* populate the task so we can dispatch it on the taskq */ 3290 task.vd = vd; 3291 task.msg = msg; 3292 task.msglen = msglen; 3293 3294 /* 3295 * Queue a task to send the notification that the operation completed. 3296 * We need to ensure that requests are responded to in the correct 3297 * order and since the taskq is processed serially this ordering 3298 * is maintained. 3299 */ 3300 (void) ddi_taskq_dispatch(vd->completionq, vd_serial_notify, 3301 &task, DDI_SLEEP); 3302 3303 /* 3304 * To ensure handshake negotiations do not happen out of order, such 3305 * requests that come through this path should not be done in parallel 3306 * so we need to wait here until the response is sent to the client. 3307 */ 3308 ddi_taskq_wait(vd->completionq); 3309 3310 /* Arrange to reset the connection for nack'ed or failed messages */ 3311 if ((status != 0) || reset_ldc) { 3312 PR0("initiating %s reset", 3313 (reset_ldc) ? "full" : "soft"); 3314 vd_need_reset(vd, reset_ldc); 3315 } 3316 3317 return (status); 3318 } 3319 3320 static boolean_t 3321 vd_enabled(vd_t *vd) 3322 { 3323 boolean_t enabled; 3324 3325 mutex_enter(&vd->lock); 3326 enabled = vd->enabled; 3327 mutex_exit(&vd->lock); 3328 return (enabled); 3329 } 3330 3331 static void 3332 vd_recv_msg(void *arg) 3333 { 3334 vd_t *vd = (vd_t *)arg; 3335 int rv = 0, status = 0; 3336 3337 ASSERT(vd != NULL); 3338 3339 PR2("New task to receive incoming message(s)"); 3340 3341 3342 while (vd_enabled(vd) && status == 0) { 3343 size_t msglen, msgsize; 3344 ldc_status_t lstatus; 3345 3346 /* 3347 * Receive and process a message 3348 */ 3349 vd_reset_if_needed(vd); /* can change vd->max_msglen */ 3350 3351 /* 3352 * check if channel is UP - else break out of loop 3353 */ 3354 status = ldc_status(vd->ldc_handle, &lstatus); 3355 if (lstatus != LDC_UP) { 3356 PR0("channel not up (status=%d), exiting recv loop\n", 3357 lstatus); 3358 break; 3359 } 3360 3361 ASSERT(vd->max_msglen != 0); 3362 3363 msgsize = vd->max_msglen; /* stable copy for alloc/free */ 3364 msglen = msgsize; /* actual len after recv_msg() */ 3365 3366 status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen); 3367 switch (status) { 3368 case 0: 3369 rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp, 3370 msglen); 3371 /* check if max_msglen changed */ 3372 if (msgsize != vd->max_msglen) { 3373 PR0("max_msglen changed 0x%lx to 0x%lx bytes\n", 3374 msgsize, vd->max_msglen); 3375 kmem_free(vd->vio_msgp, msgsize); 3376 vd->vio_msgp = 3377 kmem_alloc(vd->max_msglen, KM_SLEEP); 3378 } 3379 if (rv == EINPROGRESS) 3380 continue; 3381 break; 3382 3383 case ENOMSG: 3384 break; 3385 3386 case ECONNRESET: 3387 PR0("initiating soft reset (ECONNRESET)\n"); 3388 vd_need_reset(vd, B_FALSE); 3389 status = 0; 3390 break; 3391 3392 default: 3393 /* Probably an LDC failure; arrange to reset it */ 3394 PR0("initiating full reset (status=0x%x)", status); 3395 vd_need_reset(vd, B_TRUE); 3396 break; 3397 } 3398 } 3399 3400 PR2("Task finished"); 3401 } 3402 3403 static uint_t 3404 vd_handle_ldc_events(uint64_t event, caddr_t arg) 3405 { 3406 vd_t *vd = (vd_t *)(void *)arg; 3407 int status; 3408 3409 ASSERT(vd != NULL); 3410 3411 if (!vd_enabled(vd)) 3412 return (LDC_SUCCESS); 3413 3414 if (event & LDC_EVT_DOWN) { 3415 PR0("LDC_EVT_DOWN: LDC channel went down"); 3416 3417 vd_need_reset(vd, B_TRUE); 3418 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 3419 DDI_SLEEP); 3420 if (status == DDI_FAILURE) { 3421 PR0("cannot schedule task to recv msg\n"); 3422 vd_need_reset(vd, B_TRUE); 3423 } 3424 } 3425 3426 if (event & LDC_EVT_RESET) { 3427 PR0("LDC_EVT_RESET: LDC channel was reset"); 3428 3429 if (vd->state != VD_STATE_INIT) { 3430 PR0("scheduling full reset"); 3431 vd_need_reset(vd, B_FALSE); 3432 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 3433 vd, DDI_SLEEP); 3434 if (status == DDI_FAILURE) { 3435 PR0("cannot schedule task to recv msg\n"); 3436 vd_need_reset(vd, B_TRUE); 3437 } 3438 3439 } else { 3440 PR0("channel already reset, ignoring...\n"); 3441 PR0("doing ldc up...\n"); 3442 (void) ldc_up(vd->ldc_handle); 3443 } 3444 3445 return (LDC_SUCCESS); 3446 } 3447 3448 if (event & LDC_EVT_UP) { 3449 PR0("EVT_UP: LDC is up\nResetting client connection state"); 3450 PR0("initiating soft reset"); 3451 vd_need_reset(vd, B_FALSE); 3452 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 3453 vd, DDI_SLEEP); 3454 if (status == DDI_FAILURE) { 3455 PR0("cannot schedule task to recv msg\n"); 3456 vd_need_reset(vd, B_TRUE); 3457 return (LDC_SUCCESS); 3458 } 3459 } 3460 3461 if (event & LDC_EVT_READ) { 3462 int status; 3463 3464 PR1("New data available"); 3465 /* Queue a task to receive the new data */ 3466 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 3467 DDI_SLEEP); 3468 3469 if (status == DDI_FAILURE) { 3470 PR0("cannot schedule task to recv msg\n"); 3471 vd_need_reset(vd, B_TRUE); 3472 } 3473 } 3474 3475 return (LDC_SUCCESS); 3476 } 3477 3478 static uint_t 3479 vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg) 3480 { 3481 _NOTE(ARGUNUSED(key, val)) 3482 (*((uint_t *)arg))++; 3483 return (MH_WALK_TERMINATE); 3484 } 3485 3486 3487 static int 3488 vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 3489 { 3490 uint_t vd_present = 0; 3491 minor_t instance; 3492 vds_t *vds; 3493 3494 3495 switch (cmd) { 3496 case DDI_DETACH: 3497 /* the real work happens below */ 3498 break; 3499 case DDI_SUSPEND: 3500 PR0("No action required for DDI_SUSPEND"); 3501 return (DDI_SUCCESS); 3502 default: 3503 PR0("Unrecognized \"cmd\""); 3504 return (DDI_FAILURE); 3505 } 3506 3507 ASSERT(cmd == DDI_DETACH); 3508 instance = ddi_get_instance(dip); 3509 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { 3510 PR0("Could not get state for instance %u", instance); 3511 ddi_soft_state_free(vds_state, instance); 3512 return (DDI_FAILURE); 3513 } 3514 3515 /* Do no detach when serving any vdisks */ 3516 mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present); 3517 if (vd_present) { 3518 PR0("Not detaching because serving vdisks"); 3519 return (DDI_FAILURE); 3520 } 3521 3522 PR0("Detaching"); 3523 if (vds->initialized & VDS_MDEG) { 3524 (void) mdeg_unregister(vds->mdeg); 3525 kmem_free(vds->ispecp->specp, sizeof (vds_prop_template)); 3526 kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t)); 3527 vds->ispecp = NULL; 3528 vds->mdeg = NULL; 3529 } 3530 3531 if (vds->initialized & VDS_LDI) 3532 (void) ldi_ident_release(vds->ldi_ident); 3533 mod_hash_destroy_hash(vds->vd_table); 3534 ddi_soft_state_free(vds_state, instance); 3535 return (DDI_SUCCESS); 3536 } 3537 3538 static boolean_t 3539 is_pseudo_device(dev_info_t *dip) 3540 { 3541 dev_info_t *parent, *root = ddi_root_node(); 3542 3543 3544 for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root); 3545 parent = ddi_get_parent(parent)) { 3546 if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0) 3547 return (B_TRUE); 3548 } 3549 3550 return (B_FALSE); 3551 } 3552 3553 static int 3554 vd_setup_full_disk(vd_t *vd) 3555 { 3556 int rval, status; 3557 major_t major = getmajor(vd->dev[0]); 3558 minor_t minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE; 3559 struct dk_minfo dk_minfo; 3560 3561 ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK); 3562 3563 /* 3564 * At this point, vdisk_size is set to the size of partition 2 but 3565 * this does not represent the size of the disk because partition 2 3566 * may not cover the entire disk and its size does not include reserved 3567 * blocks. So we update vdisk_size to be the size of the entire disk. 3568 */ 3569 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO, 3570 (intptr_t)&dk_minfo, (vd->open_flags | FKIOCTL), 3571 kcred, &rval)) != 0) { 3572 PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d", 3573 status); 3574 return (status); 3575 } 3576 vd->vdisk_size = dk_minfo.dki_capacity; 3577 3578 /* Move dev number and LDI handle to entire-disk-slice array elements */ 3579 vd->dev[VD_ENTIRE_DISK_SLICE] = vd->dev[0]; 3580 vd->dev[0] = 0; 3581 vd->ldi_handle[VD_ENTIRE_DISK_SLICE] = vd->ldi_handle[0]; 3582 vd->ldi_handle[0] = NULL; 3583 3584 /* Initialize device numbers for remaining slices and open them */ 3585 for (int slice = 0; slice < vd->nslices; slice++) { 3586 /* 3587 * Skip the entire-disk slice, as it's already open and its 3588 * device known 3589 */ 3590 if (slice == VD_ENTIRE_DISK_SLICE) 3591 continue; 3592 ASSERT(vd->dev[slice] == 0); 3593 ASSERT(vd->ldi_handle[slice] == NULL); 3594 3595 /* 3596 * Construct the device number for the current slice 3597 */ 3598 vd->dev[slice] = makedevice(major, (minor + slice)); 3599 3600 /* 3601 * Open all slices of the disk to serve them to the client. 3602 * Slices are opened exclusively to prevent other threads or 3603 * processes in the service domain from performing I/O to 3604 * slices being accessed by a client. Failure to open a slice 3605 * results in vds not serving this disk, as the client could 3606 * attempt (and should be able) to access any slice immediately. 3607 * Any slices successfully opened before a failure will get 3608 * closed by vds_destroy_vd() as a result of the error returned 3609 * by this function. 3610 * 3611 * We need to do the open with FNDELAY so that opening an empty 3612 * slice does not fail. 3613 */ 3614 PR0("Opening device major %u, minor %u = slice %u", 3615 major, minor, slice); 3616 3617 /* 3618 * Try to open the device. This can fail for example if we are 3619 * opening an empty slice. So in case of a failure, we try the 3620 * open again but this time with the FNDELAY flag. 3621 */ 3622 status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 3623 vd->open_flags, kcred, &vd->ldi_handle[slice], 3624 vd->vds->ldi_ident); 3625 3626 if (status != 0) { 3627 status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 3628 vd->open_flags | FNDELAY, kcred, 3629 &vd->ldi_handle[slice], vd->vds->ldi_ident); 3630 } 3631 3632 if (status != 0) { 3633 PRN("ldi_open_by_dev() returned errno %d " 3634 "for slice %u", status, slice); 3635 /* vds_destroy_vd() will close any open slices */ 3636 vd->ldi_handle[slice] = NULL; 3637 return (status); 3638 } 3639 } 3640 3641 return (0); 3642 } 3643 3644 static int 3645 vd_setup_partition_vtoc(vd_t *vd) 3646 { 3647 int rval, status; 3648 char *device_path = vd->device_path; 3649 3650 status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM, 3651 (intptr_t)&vd->dk_geom, (vd->open_flags | FKIOCTL), kcred, &rval); 3652 3653 if (status != 0) { 3654 PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s", 3655 status, device_path); 3656 return (status); 3657 } 3658 3659 /* Initialize dk_geom structure for single-slice device */ 3660 if (vd->dk_geom.dkg_nsect == 0) { 3661 PRN("%s geometry claims 0 sectors per track", device_path); 3662 return (EIO); 3663 } 3664 if (vd->dk_geom.dkg_nhead == 0) { 3665 PRN("%s geometry claims 0 heads", device_path); 3666 return (EIO); 3667 } 3668 vd->dk_geom.dkg_ncyl = vd->vdisk_size / vd->dk_geom.dkg_nsect / 3669 vd->dk_geom.dkg_nhead; 3670 vd->dk_geom.dkg_acyl = 0; 3671 vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl; 3672 3673 3674 /* Initialize vtoc structure for single-slice device */ 3675 bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume, 3676 MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume))); 3677 bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part)); 3678 vd->vtoc.v_nparts = 1; 3679 vd->vtoc.v_part[0].p_tag = V_UNASSIGNED; 3680 vd->vtoc.v_part[0].p_flag = 0; 3681 vd->vtoc.v_part[0].p_start = 0; 3682 vd->vtoc.v_part[0].p_size = vd->vdisk_size; 3683 bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel, 3684 MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel))); 3685 3686 return (0); 3687 } 3688 3689 static int 3690 vd_setup_partition_efi(vd_t *vd) 3691 { 3692 efi_gpt_t *gpt; 3693 efi_gpe_t *gpe; 3694 struct uuid uuid = EFI_RESERVED; 3695 uint32_t crc; 3696 int length; 3697 3698 length = sizeof (efi_gpt_t) + sizeof (efi_gpe_t); 3699 3700 gpt = kmem_zalloc(length, KM_SLEEP); 3701 gpe = (efi_gpe_t *)(gpt + 1); 3702 3703 gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE); 3704 gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); 3705 gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t)); 3706 gpt->efi_gpt_FirstUsableLBA = LE_64(0ULL); 3707 gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1); 3708 gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1); 3709 gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t)); 3710 3711 UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid); 3712 gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA; 3713 gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA; 3714 3715 CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table); 3716 gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); 3717 3718 CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table); 3719 gpt->efi_gpt_HeaderCRC32 = LE_32(~crc); 3720 3721 vd->dk_efi.dki_lba = 0; 3722 vd->dk_efi.dki_length = length; 3723 vd->dk_efi.dki_data = gpt; 3724 3725 return (0); 3726 } 3727 3728 /* 3729 * Setup for a virtual disk whose backend is a file (exported as a single slice 3730 * or as a full disk) or a pseudo device (for example a ZFS, SVM or VxVM volume) 3731 * exported as a full disk. In these cases, the backend is accessed using the 3732 * vnode interface. 3733 */ 3734 static int 3735 vd_setup_backend_vnode(vd_t *vd) 3736 { 3737 int rval, status; 3738 vattr_t vattr; 3739 dev_t dev; 3740 char *file_path = vd->device_path; 3741 char dev_path[MAXPATHLEN + 1]; 3742 ldi_handle_t lhandle; 3743 struct dk_cinfo dk_cinfo; 3744 3745 if ((status = vn_open(file_path, UIO_SYSSPACE, vd->open_flags | FOFFMAX, 3746 0, &vd->file_vnode, 0, 0)) != 0) { 3747 PRN("vn_open(%s) = errno %d", file_path, status); 3748 return (status); 3749 } 3750 3751 /* 3752 * We set vd->file now so that vds_destroy_vd will take care of 3753 * closing the file and releasing the vnode in case of an error. 3754 */ 3755 vd->file = B_TRUE; 3756 3757 vattr.va_mask = AT_SIZE; 3758 if ((status = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred)) != 0) { 3759 PRN("VOP_GETATTR(%s) = errno %d", file_path, status); 3760 return (EIO); 3761 } 3762 3763 vd->file_size = vattr.va_size; 3764 /* size should be at least sizeof(dk_label) */ 3765 if (vd->file_size < sizeof (struct dk_label)) { 3766 PRN("Size of file has to be at least %ld bytes", 3767 sizeof (struct dk_label)); 3768 return (EIO); 3769 } 3770 3771 if (vd->file_vnode->v_flag & VNOMAP) { 3772 PRN("File %s cannot be mapped", file_path); 3773 return (EIO); 3774 } 3775 3776 /* 3777 * Find and validate the geometry of a disk image. For a single slice 3778 * disk image, this will build a fake geometry and vtoc. 3779 */ 3780 status = vd_file_validate_geometry(vd); 3781 if (status != 0 && status != EINVAL) { 3782 PRN("Fail to read label from %s", file_path); 3783 return (EIO); 3784 } 3785 3786 /* sector size = block size = DEV_BSIZE */ 3787 vd->vdisk_size = vd->file_size / DEV_BSIZE; 3788 vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */ 3789 3790 /* 3791 * Get max_xfer_sz from the device where the file is or from the device 3792 * itself if we have a pseudo device. 3793 */ 3794 dev_path[0] = '\0'; 3795 3796 if (vd->pseudo) { 3797 status = ldi_open_by_name(file_path, FREAD, kcred, &lhandle, 3798 vd->vds->ldi_ident); 3799 } else { 3800 dev = vd->file_vnode->v_vfsp->vfs_dev; 3801 if (ddi_dev_pathname(dev, S_IFBLK, dev_path) == DDI_SUCCESS) { 3802 PR0("underlying device = %s\n", dev_path); 3803 } 3804 3805 status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD, kcred, &lhandle, 3806 vd->vds->ldi_ident); 3807 } 3808 3809 if (status != 0) { 3810 PR0("ldi_open() returned errno %d for device %s", 3811 status, (dev_path[0] == '\0')? file_path : dev_path); 3812 } else { 3813 if ((status = ldi_ioctl(lhandle, DKIOCINFO, 3814 (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred, 3815 &rval)) != 0) { 3816 PR0("ldi_ioctl(DKIOCINFO) returned errno %d for %s", 3817 status, dev_path); 3818 } else { 3819 /* 3820 * Store the device's max transfer size for 3821 * return to the client 3822 */ 3823 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; 3824 } 3825 3826 PR0("close the device %s", dev_path); 3827 (void) ldi_close(lhandle, FREAD, kcred); 3828 } 3829 3830 PR0("using file %s, dev %s, max_xfer = %u blks", 3831 file_path, dev_path, vd->max_xfer_sz); 3832 3833 /* Setup devid for the disk image */ 3834 3835 if (vd->vdisk_type == VD_DISK_TYPE_SLICE) 3836 return (0); 3837 3838 if (vd->vdisk_label != VD_DISK_LABEL_UNK) { 3839 3840 status = vd_file_read_devid(vd, &vd->file_devid); 3841 3842 if (status == 0) { 3843 /* a valid devid was found */ 3844 return (0); 3845 } 3846 3847 if (status != EINVAL) { 3848 /* 3849 * There was an error while trying to read the devid. 3850 * So this disk image may have a devid but we are 3851 * unable to read it. 3852 */ 3853 PR0("can not read devid for %s", file_path); 3854 vd->file_devid = NULL; 3855 return (0); 3856 } 3857 } 3858 3859 /* 3860 * No valid device id was found so we create one. Note that a failure 3861 * to create a device id is not fatal and does not prevent the disk 3862 * image from being attached. 3863 */ 3864 PR1("creating devid for %s", file_path); 3865 3866 if (ddi_devid_init(vd->vds->dip, DEVID_FAB, NULL, 0, 3867 &vd->file_devid) != DDI_SUCCESS) { 3868 PR0("fail to create devid for %s", file_path); 3869 vd->file_devid = NULL; 3870 return (0); 3871 } 3872 3873 /* 3874 * Write devid to the disk image. The devid is stored into the disk 3875 * image if we have a valid label; otherwise the devid will be stored 3876 * when the user writes a valid label. 3877 */ 3878 if (vd->vdisk_label != VD_DISK_LABEL_UNK) { 3879 if (vd_file_write_devid(vd, vd->file_devid) != 0) { 3880 PR0("fail to write devid for %s", file_path); 3881 ddi_devid_free(vd->file_devid); 3882 vd->file_devid = NULL; 3883 } 3884 } 3885 3886 return (0); 3887 } 3888 3889 /* 3890 * Setup for a virtual disk which backend is a device (a physical disk, 3891 * slice or pseudo device) that is directly exported either as a full disk 3892 * for a physical disk or as a slice for a pseudo device or a disk slice. 3893 * In these cases, the backend is accessed using the LDI interface. 3894 */ 3895 static int 3896 vd_setup_backend_ldi(vd_t *vd) 3897 { 3898 int rval, status; 3899 struct dk_cinfo dk_cinfo; 3900 char *device_path = vd->device_path; 3901 3902 /* 3903 * Try to open the device. This can fail for example if we are opening 3904 * an empty slice. So in case of a failure, we try the open again but 3905 * this time with the FNDELAY flag. 3906 */ 3907 status = ldi_open_by_name(device_path, vd->open_flags, kcred, 3908 &vd->ldi_handle[0], vd->vds->ldi_ident); 3909 3910 if (status != 0) 3911 status = ldi_open_by_name(device_path, vd->open_flags | FNDELAY, 3912 kcred, &vd->ldi_handle[0], vd->vds->ldi_ident); 3913 3914 if (status != 0) { 3915 PR0("ldi_open_by_name(%s) = errno %d", device_path, status); 3916 vd->ldi_handle[0] = NULL; 3917 return (status); 3918 } 3919 3920 vd->file = B_FALSE; 3921 3922 /* Get device number of backing device */ 3923 if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) { 3924 PRN("ldi_get_dev() returned errno %d for %s", 3925 status, device_path); 3926 return (status); 3927 } 3928 3929 /* Verify backing device supports dk_cinfo */ 3930 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO, 3931 (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred, 3932 &rval)) != 0) { 3933 PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s", 3934 status, device_path); 3935 return (status); 3936 } 3937 if (dk_cinfo.dki_partition >= V_NUMPAR) { 3938 PRN("slice %u >= maximum slice %u for %s", 3939 dk_cinfo.dki_partition, V_NUMPAR, device_path); 3940 return (EIO); 3941 } 3942 3943 vd->vdisk_label = vd_read_vtoc(vd, &vd->vtoc); 3944 3945 /* Store the device's max transfer size for return to the client */ 3946 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; 3947 3948 /* 3949 * Export a full disk. 3950 * 3951 * When we use the LDI interface, we export a device as a full disk 3952 * if we have an entire disk slice (slice 2) and if this slice is 3953 * exported as a full disk and not as a single slice disk. 3954 * 3955 * Note that pseudo devices are exported as full disks using the vnode 3956 * interface, not the LDI interface. 3957 */ 3958 if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE && 3959 vd->vdisk_type == VD_DISK_TYPE_DISK) { 3960 ASSERT(!vd->pseudo); 3961 return (vd_setup_full_disk(vd)); 3962 } 3963 3964 /* 3965 * Export a single slice disk. 3966 * 3967 * The exported device can be either a pseudo device or a disk slice. If 3968 * it is a disk slice different from slice 2 then it is always exported 3969 * as a single slice disk even if the "slice" option is not specified. 3970 * If it is disk slice 2 or a pseudo device then it is exported as a 3971 * single slice disk only if the "slice" option is specified. 3972 */ 3973 ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE || 3974 dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE); 3975 return (vd_setup_single_slice_disk(vd)); 3976 } 3977 3978 static int 3979 vd_setup_single_slice_disk(vd_t *vd) 3980 { 3981 int status; 3982 char *device_path = vd->device_path; 3983 3984 /* Get size of backing device */ 3985 if (ldi_get_size(vd->ldi_handle[0], &vd->vdisk_size) != DDI_SUCCESS) { 3986 PRN("ldi_get_size() failed for %s", device_path); 3987 return (EIO); 3988 } 3989 vd->vdisk_size = lbtodb(vd->vdisk_size); /* convert to blocks */ 3990 3991 if (vd->pseudo) { 3992 3993 ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE); 3994 3995 /* 3996 * Currently we only support exporting pseudo devices which 3997 * provide a valid disk label. 3998 */ 3999 if (vd->vdisk_label == VD_DISK_LABEL_UNK) { 4000 PRN("%s is a pseudo device with an invalid disk " 4001 "label\n", device_path); 4002 return (EINVAL); 4003 } 4004 return (0); /* ...and we're done */ 4005 } 4006 4007 /* We can only export a slice if the disk has a valid label */ 4008 if (vd->vdisk_label == VD_DISK_LABEL_UNK) { 4009 PRN("%s is a slice from a disk with an unknown disk label\n", 4010 device_path); 4011 return (EINVAL); 4012 } 4013 4014 /* 4015 * We export the slice as a single slice disk even if the "slice" 4016 * option was not specified. 4017 */ 4018 vd->vdisk_type = VD_DISK_TYPE_SLICE; 4019 vd->nslices = 1; 4020 4021 if (vd->vdisk_label == VD_DISK_LABEL_EFI) { 4022 /* Slice from a disk with an EFI label */ 4023 status = vd_setup_partition_efi(vd); 4024 } else { 4025 /* Slice from a disk with a VTOC label */ 4026 ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC); 4027 status = vd_setup_partition_vtoc(vd); 4028 } 4029 4030 return (status); 4031 } 4032 4033 static int 4034 vd_setup_vd(vd_t *vd) 4035 { 4036 int status; 4037 dev_info_t *dip; 4038 vnode_t *vnp; 4039 char *path = vd->device_path; 4040 4041 /* make sure the vdisk backend is valid */ 4042 if ((status = lookupname(path, UIO_SYSSPACE, 4043 FOLLOW, NULLVPP, &vnp)) != 0) { 4044 PR0("Cannot lookup %s errno %d", path, status); 4045 goto done; 4046 } 4047 4048 switch (vnp->v_type) { 4049 case VREG: 4050 /* 4051 * Backend is a file so it is exported as a full disk or as a 4052 * single slice disk using the vnode interface. 4053 */ 4054 VN_RELE(vnp); 4055 vd->pseudo = B_FALSE; 4056 status = vd_setup_backend_vnode(vd); 4057 break; 4058 4059 case VBLK: 4060 case VCHR: 4061 /* 4062 * Backend is a device. The way it is exported depends on the 4063 * type of the device. 4064 * 4065 * - A pseudo device is exported as a full disk using the vnode 4066 * interface or as a single slice disk using the LDI 4067 * interface. 4068 * 4069 * - A disk (represented by the slice 2 of that disk) is 4070 * exported as a full disk using the LDI interface. 4071 * 4072 * - A disk slice (different from slice 2) is always exported 4073 * as a single slice disk using the LDI interface. 4074 * 4075 * - The slice 2 of a disk is exported as a single slice disk 4076 * if the "slice" option is specified, otherwise the entire 4077 * disk will be exported. In any case, the LDI interface is 4078 * used. 4079 */ 4080 4081 /* check if this is a pseudo device */ 4082 if ((dip = ddi_hold_devi_by_instance(getmajor(vnp->v_rdev), 4083 dev_to_instance(vnp->v_rdev), 0)) == NULL) { 4084 PRN("%s is no longer accessible", path); 4085 VN_RELE(vnp); 4086 status = EIO; 4087 break; 4088 } 4089 vd->pseudo = is_pseudo_device(dip); 4090 ddi_release_devi(dip); 4091 VN_RELE(vnp); 4092 4093 /* 4094 * If this is a pseudo device then its usage depends if the 4095 * "slice" option is set or not. If the "slice" option is set 4096 * then the pseudo device will be exported as a single slice, 4097 * otherwise it will be exported as a full disk. 4098 */ 4099 if (vd->pseudo && vd->vdisk_type == VD_DISK_TYPE_DISK) 4100 status = vd_setup_backend_vnode(vd); 4101 else 4102 status = vd_setup_backend_ldi(vd); 4103 break; 4104 4105 default: 4106 PRN("Unsupported vdisk backend %s", path); 4107 VN_RELE(vnp); 4108 status = EBADF; 4109 } 4110 4111 done: 4112 if (status != 0) { 4113 /* 4114 * If the error is retryable print an error message only 4115 * during the first try. 4116 */ 4117 if (status == ENXIO || status == ENODEV || 4118 status == ENOENT || status == EROFS) { 4119 if (!(vd->initialized & VD_SETUP_ERROR)) { 4120 PRN("%s is currently inaccessible (error %d)", 4121 path, status); 4122 } 4123 status = EAGAIN; 4124 } else { 4125 PRN("%s can not be exported as a virtual disk " 4126 "(error %d)", path, status); 4127 } 4128 vd->initialized |= VD_SETUP_ERROR; 4129 4130 } else if (vd->initialized & VD_SETUP_ERROR) { 4131 /* print a message only if we previously had an error */ 4132 PRN("%s is now online", path); 4133 vd->initialized &= ~VD_SETUP_ERROR; 4134 } 4135 4136 return (status); 4137 } 4138 4139 static int 4140 vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t options, 4141 uint64_t ldc_id, vd_t **vdp) 4142 { 4143 char tq_name[TASKQ_NAMELEN]; 4144 int status; 4145 ddi_iblock_cookie_t iblock = NULL; 4146 ldc_attr_t ldc_attr; 4147 vd_t *vd; 4148 4149 4150 ASSERT(vds != NULL); 4151 ASSERT(device_path != NULL); 4152 ASSERT(vdp != NULL); 4153 PR0("Adding vdisk for %s", device_path); 4154 4155 if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) { 4156 PRN("No memory for virtual disk"); 4157 return (EAGAIN); 4158 } 4159 *vdp = vd; /* assign here so vds_destroy_vd() can cleanup later */ 4160 vd->vds = vds; 4161 (void) strncpy(vd->device_path, device_path, MAXPATHLEN); 4162 4163 /* Setup open flags */ 4164 vd->open_flags = FREAD; 4165 4166 if (!(options & VD_OPT_RDONLY)) 4167 vd->open_flags |= FWRITE; 4168 4169 if (options & VD_OPT_EXCLUSIVE) 4170 vd->open_flags |= FEXCL; 4171 4172 /* Setup disk type */ 4173 if (options & VD_OPT_SLICE) { 4174 vd->vdisk_type = VD_DISK_TYPE_SLICE; 4175 vd->nslices = 1; 4176 } else { 4177 vd->vdisk_type = VD_DISK_TYPE_DISK; 4178 vd->nslices = V_NUMPAR; 4179 } 4180 4181 /* default disk label */ 4182 vd->vdisk_label = VD_DISK_LABEL_UNK; 4183 4184 /* Open vdisk and initialize parameters */ 4185 if ((status = vd_setup_vd(vd)) == 0) { 4186 vd->initialized |= VD_DISK_READY; 4187 4188 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); 4189 PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u", 4190 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), 4191 (vd->pseudo ? "yes" : "no"), (vd->file ? "yes" : "no"), 4192 vd->nslices); 4193 } else { 4194 if (status != EAGAIN) 4195 return (status); 4196 } 4197 4198 /* Initialize locking */ 4199 if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED, 4200 &iblock) != DDI_SUCCESS) { 4201 PRN("Could not get iblock cookie."); 4202 return (EIO); 4203 } 4204 4205 mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock); 4206 vd->initialized |= VD_LOCKING; 4207 4208 4209 /* Create start and completion task queues for the vdisk */ 4210 (void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id); 4211 PR1("tq_name = %s", tq_name); 4212 if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1, 4213 TASKQ_DEFAULTPRI, 0)) == NULL) { 4214 PRN("Could not create task queue"); 4215 return (EIO); 4216 } 4217 (void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id); 4218 PR1("tq_name = %s", tq_name); 4219 if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1, 4220 TASKQ_DEFAULTPRI, 0)) == NULL) { 4221 PRN("Could not create task queue"); 4222 return (EIO); 4223 } 4224 vd->enabled = 1; /* before callback can dispatch to startq */ 4225 4226 4227 /* Bring up LDC */ 4228 ldc_attr.devclass = LDC_DEV_BLK_SVC; 4229 ldc_attr.instance = ddi_get_instance(vds->dip); 4230 ldc_attr.mode = LDC_MODE_UNRELIABLE; 4231 ldc_attr.mtu = VD_LDC_MTU; 4232 if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) { 4233 PRN("Could not initialize LDC channel %lu, " 4234 "init failed with error %d", ldc_id, status); 4235 return (status); 4236 } 4237 vd->initialized |= VD_LDC; 4238 4239 if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events, 4240 (caddr_t)vd)) != 0) { 4241 PRN("Could not initialize LDC channel %lu," 4242 "reg_callback failed with error %d", ldc_id, status); 4243 return (status); 4244 } 4245 4246 if ((status = ldc_open(vd->ldc_handle)) != 0) { 4247 PRN("Could not initialize LDC channel %lu," 4248 "open failed with error %d", ldc_id, status); 4249 return (status); 4250 } 4251 4252 if ((status = ldc_up(vd->ldc_handle)) != 0) { 4253 PR0("ldc_up() returned errno %d", status); 4254 } 4255 4256 /* Allocate the inband task memory handle */ 4257 status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl)); 4258 if (status) { 4259 PRN("Could not initialize LDC channel %lu," 4260 "alloc_handle failed with error %d", ldc_id, status); 4261 return (ENXIO); 4262 } 4263 4264 /* Add the successfully-initialized vdisk to the server's table */ 4265 if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) { 4266 PRN("Error adding vdisk ID %lu to table", id); 4267 return (EIO); 4268 } 4269 4270 /* Allocate the staging buffer */ 4271 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 4272 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 4273 4274 /* store initial state */ 4275 vd->state = VD_STATE_INIT; 4276 4277 return (0); 4278 } 4279 4280 static void 4281 vd_free_dring_task(vd_t *vdp) 4282 { 4283 if (vdp->dring_task != NULL) { 4284 ASSERT(vdp->dring_len != 0); 4285 /* Free all dring_task memory handles */ 4286 for (int i = 0; i < vdp->dring_len; i++) { 4287 (void) ldc_mem_free_handle(vdp->dring_task[i].mhdl); 4288 kmem_free(vdp->dring_task[i].msg, vdp->max_msglen); 4289 vdp->dring_task[i].msg = NULL; 4290 } 4291 kmem_free(vdp->dring_task, 4292 (sizeof (*vdp->dring_task)) * vdp->dring_len); 4293 vdp->dring_task = NULL; 4294 } 4295 } 4296 4297 /* 4298 * Destroy the state associated with a virtual disk 4299 */ 4300 static void 4301 vds_destroy_vd(void *arg) 4302 { 4303 vd_t *vd = (vd_t *)arg; 4304 int retry = 0, rv; 4305 4306 if (vd == NULL) 4307 return; 4308 4309 PR0("Destroying vdisk state"); 4310 4311 if (vd->dk_efi.dki_data != NULL) 4312 kmem_free(vd->dk_efi.dki_data, vd->dk_efi.dki_length); 4313 4314 /* Disable queuing requests for the vdisk */ 4315 if (vd->initialized & VD_LOCKING) { 4316 mutex_enter(&vd->lock); 4317 vd->enabled = 0; 4318 mutex_exit(&vd->lock); 4319 } 4320 4321 /* Drain and destroy start queue (*before* destroying completionq) */ 4322 if (vd->startq != NULL) 4323 ddi_taskq_destroy(vd->startq); /* waits for queued tasks */ 4324 4325 /* Drain and destroy completion queue (*before* shutting down LDC) */ 4326 if (vd->completionq != NULL) 4327 ddi_taskq_destroy(vd->completionq); /* waits for tasks */ 4328 4329 vd_free_dring_task(vd); 4330 4331 /* Free the inband task memory handle */ 4332 (void) ldc_mem_free_handle(vd->inband_task.mhdl); 4333 4334 /* Shut down LDC */ 4335 if (vd->initialized & VD_LDC) { 4336 /* unmap the dring */ 4337 if (vd->initialized & VD_DRING) 4338 (void) ldc_mem_dring_unmap(vd->dring_handle); 4339 4340 /* close LDC channel - retry on EAGAIN */ 4341 while ((rv = ldc_close(vd->ldc_handle)) == EAGAIN) { 4342 if (++retry > vds_ldc_retries) { 4343 PR0("Timed out closing channel"); 4344 break; 4345 } 4346 drv_usecwait(vds_ldc_delay); 4347 } 4348 if (rv == 0) { 4349 (void) ldc_unreg_callback(vd->ldc_handle); 4350 (void) ldc_fini(vd->ldc_handle); 4351 } else { 4352 /* 4353 * Closing the LDC channel has failed. Ideally we should 4354 * fail here but there is no Zeus level infrastructure 4355 * to handle this. The MD has already been changed and 4356 * we have to do the close. So we try to do as much 4357 * clean up as we can. 4358 */ 4359 (void) ldc_set_cb_mode(vd->ldc_handle, LDC_CB_DISABLE); 4360 while (ldc_unreg_callback(vd->ldc_handle) == EAGAIN) 4361 drv_usecwait(vds_ldc_delay); 4362 } 4363 } 4364 4365 /* Free the staging buffer for msgs */ 4366 if (vd->vio_msgp != NULL) { 4367 kmem_free(vd->vio_msgp, vd->max_msglen); 4368 vd->vio_msgp = NULL; 4369 } 4370 4371 /* Free the inband message buffer */ 4372 if (vd->inband_task.msg != NULL) { 4373 kmem_free(vd->inband_task.msg, vd->max_msglen); 4374 vd->inband_task.msg = NULL; 4375 } 4376 if (vd->file) { 4377 /* Close file */ 4378 (void) VOP_CLOSE(vd->file_vnode, vd->open_flags, 1, 4379 0, kcred); 4380 VN_RELE(vd->file_vnode); 4381 if (vd->file_devid != NULL) 4382 ddi_devid_free(vd->file_devid); 4383 } else { 4384 /* Close any open backing-device slices */ 4385 for (uint_t slice = 0; slice < vd->nslices; slice++) { 4386 if (vd->ldi_handle[slice] != NULL) { 4387 PR0("Closing slice %u", slice); 4388 (void) ldi_close(vd->ldi_handle[slice], 4389 vd->open_flags, kcred); 4390 } 4391 } 4392 } 4393 4394 /* Free lock */ 4395 if (vd->initialized & VD_LOCKING) 4396 mutex_destroy(&vd->lock); 4397 4398 /* Finally, free the vdisk structure itself */ 4399 kmem_free(vd, sizeof (*vd)); 4400 } 4401 4402 static int 4403 vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t options, 4404 uint64_t ldc_id) 4405 { 4406 int status; 4407 vd_t *vd = NULL; 4408 4409 4410 if ((status = vds_do_init_vd(vds, id, device_path, options, 4411 ldc_id, &vd)) != 0) 4412 vds_destroy_vd(vd); 4413 4414 return (status); 4415 } 4416 4417 static int 4418 vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel, 4419 uint64_t *ldc_id) 4420 { 4421 int num_channels; 4422 4423 4424 /* Look for channel endpoint child(ren) of the vdisk MD node */ 4425 if ((num_channels = md_scan_dag(md, vd_node, 4426 md_find_name(md, VD_CHANNEL_ENDPOINT), 4427 md_find_name(md, "fwd"), channel)) <= 0) { 4428 PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT); 4429 return (-1); 4430 } 4431 4432 /* Get the "id" value for the first channel endpoint node */ 4433 if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) { 4434 PRN("No \"%s\" property found for \"%s\" of vdisk", 4435 VD_ID_PROP, VD_CHANNEL_ENDPOINT); 4436 return (-1); 4437 } 4438 4439 if (num_channels > 1) { 4440 PRN("Using ID of first of multiple channels for this vdisk"); 4441 } 4442 4443 return (0); 4444 } 4445 4446 static int 4447 vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id) 4448 { 4449 int num_nodes, status; 4450 size_t size; 4451 mde_cookie_t *channel; 4452 4453 4454 if ((num_nodes = md_node_count(md)) <= 0) { 4455 PRN("Invalid node count in Machine Description subtree"); 4456 return (-1); 4457 } 4458 size = num_nodes*(sizeof (*channel)); 4459 channel = kmem_zalloc(size, KM_SLEEP); 4460 status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id); 4461 kmem_free(channel, size); 4462 4463 return (status); 4464 } 4465 4466 /* 4467 * Function: 4468 * vds_get_options 4469 * 4470 * Description: 4471 * Parse the options of a vds node. Options are defined as an array 4472 * of strings in the vds-block-device-opts property of the vds node 4473 * in the machine description. Options are returned as a bitmask. The 4474 * mapping between the bitmask options and the options strings from the 4475 * machine description is defined in the vd_bdev_options[] array. 4476 * 4477 * The vds-block-device-opts property is optional. If a vds has no such 4478 * property then no option is defined. 4479 * 4480 * Parameters: 4481 * md - machine description. 4482 * vd_node - vds node in the machine description for which 4483 * options have to be parsed. 4484 * options - the returned options. 4485 * 4486 * Return Code: 4487 * none. 4488 */ 4489 static void 4490 vds_get_options(md_t *md, mde_cookie_t vd_node, uint64_t *options) 4491 { 4492 char *optstr, *opt; 4493 int len, n, i; 4494 4495 *options = 0; 4496 4497 if (md_get_prop_data(md, vd_node, VD_BLOCK_DEVICE_OPTS, 4498 (uint8_t **)&optstr, &len) != 0) { 4499 PR0("No options found"); 4500 return; 4501 } 4502 4503 /* parse options */ 4504 opt = optstr; 4505 n = sizeof (vd_bdev_options) / sizeof (vd_option_t); 4506 4507 while (opt < optstr + len) { 4508 for (i = 0; i < n; i++) { 4509 if (strncmp(vd_bdev_options[i].vdo_name, 4510 opt, VD_OPTION_NLEN) == 0) { 4511 *options |= vd_bdev_options[i].vdo_value; 4512 break; 4513 } 4514 } 4515 4516 if (i < n) { 4517 PR0("option: %s", opt); 4518 } else { 4519 PRN("option %s is unknown or unsupported", opt); 4520 } 4521 4522 opt += strlen(opt) + 1; 4523 } 4524 } 4525 4526 static void 4527 vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) 4528 { 4529 char *device_path = NULL; 4530 uint64_t id = 0, ldc_id = 0, options = 0; 4531 4532 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { 4533 PRN("Error getting vdisk \"%s\"", VD_ID_PROP); 4534 return; 4535 } 4536 PR0("Adding vdisk ID %lu", id); 4537 if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP, 4538 &device_path) != 0) { 4539 PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); 4540 return; 4541 } 4542 4543 vds_get_options(md, vd_node, &options); 4544 4545 if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) { 4546 PRN("Error getting LDC ID for vdisk %lu", id); 4547 return; 4548 } 4549 4550 if (vds_init_vd(vds, id, device_path, options, ldc_id) != 0) { 4551 PRN("Failed to add vdisk ID %lu", id); 4552 return; 4553 } 4554 } 4555 4556 static void 4557 vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) 4558 { 4559 uint64_t id = 0; 4560 4561 4562 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { 4563 PRN("Unable to get \"%s\" property from vdisk's MD node", 4564 VD_ID_PROP); 4565 return; 4566 } 4567 PR0("Removing vdisk ID %lu", id); 4568 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0) 4569 PRN("No vdisk entry found for vdisk ID %lu", id); 4570 } 4571 4572 static void 4573 vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node, 4574 md_t *curr_md, mde_cookie_t curr_vd_node) 4575 { 4576 char *curr_dev, *prev_dev; 4577 uint64_t curr_id = 0, curr_ldc_id = 0, curr_options = 0; 4578 uint64_t prev_id = 0, prev_ldc_id = 0, prev_options = 0; 4579 size_t len; 4580 4581 4582 /* Validate that vdisk ID has not changed */ 4583 if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) { 4584 PRN("Error getting previous vdisk \"%s\" property", 4585 VD_ID_PROP); 4586 return; 4587 } 4588 if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) { 4589 PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP); 4590 return; 4591 } 4592 if (curr_id != prev_id) { 4593 PRN("Not changing vdisk: ID changed from %lu to %lu", 4594 prev_id, curr_id); 4595 return; 4596 } 4597 4598 /* Validate that LDC ID has not changed */ 4599 if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) { 4600 PRN("Error getting LDC ID for vdisk %lu", prev_id); 4601 return; 4602 } 4603 4604 if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) { 4605 PRN("Error getting LDC ID for vdisk %lu", curr_id); 4606 return; 4607 } 4608 if (curr_ldc_id != prev_ldc_id) { 4609 _NOTE(NOTREACHED); /* lint is confused */ 4610 PRN("Not changing vdisk: " 4611 "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id); 4612 return; 4613 } 4614 4615 /* Determine whether device path has changed */ 4616 if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP, 4617 &prev_dev) != 0) { 4618 PRN("Error getting previous vdisk \"%s\"", 4619 VD_BLOCK_DEVICE_PROP); 4620 return; 4621 } 4622 if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP, 4623 &curr_dev) != 0) { 4624 PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); 4625 return; 4626 } 4627 if (((len = strlen(curr_dev)) == strlen(prev_dev)) && 4628 (strncmp(curr_dev, prev_dev, len) == 0)) 4629 return; /* no relevant (supported) change */ 4630 4631 /* Validate that options have not changed */ 4632 vds_get_options(prev_md, prev_vd_node, &prev_options); 4633 vds_get_options(curr_md, curr_vd_node, &curr_options); 4634 if (prev_options != curr_options) { 4635 PRN("Not changing vdisk: options changed from %lx to %lx", 4636 prev_options, curr_options); 4637 return; 4638 } 4639 4640 PR0("Changing vdisk ID %lu", prev_id); 4641 4642 /* Remove old state, which will close vdisk and reset */ 4643 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0) 4644 PRN("No entry found for vdisk ID %lu", prev_id); 4645 4646 /* Re-initialize vdisk with new state */ 4647 if (vds_init_vd(vds, curr_id, curr_dev, curr_options, 4648 curr_ldc_id) != 0) { 4649 PRN("Failed to change vdisk ID %lu", curr_id); 4650 return; 4651 } 4652 } 4653 4654 static int 4655 vds_process_md(void *arg, mdeg_result_t *md) 4656 { 4657 int i; 4658 vds_t *vds = arg; 4659 4660 4661 if (md == NULL) 4662 return (MDEG_FAILURE); 4663 ASSERT(vds != NULL); 4664 4665 for (i = 0; i < md->removed.nelem; i++) 4666 vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]); 4667 for (i = 0; i < md->match_curr.nelem; i++) 4668 vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i], 4669 md->match_curr.mdp, md->match_curr.mdep[i]); 4670 for (i = 0; i < md->added.nelem; i++) 4671 vds_add_vd(vds, md->added.mdp, md->added.mdep[i]); 4672 4673 return (MDEG_SUCCESS); 4674 } 4675 4676 4677 static int 4678 vds_do_attach(dev_info_t *dip) 4679 { 4680 int status, sz; 4681 int cfg_handle; 4682 minor_t instance = ddi_get_instance(dip); 4683 vds_t *vds; 4684 mdeg_prop_spec_t *pspecp; 4685 mdeg_node_spec_t *ispecp; 4686 4687 /* 4688 * The "cfg-handle" property of a vds node in an MD contains the MD's 4689 * notion of "instance", or unique identifier, for that node; OBP 4690 * stores the value of the "cfg-handle" MD property as the value of 4691 * the "reg" property on the node in the device tree it builds from 4692 * the MD and passes to Solaris. Thus, we look up the devinfo node's 4693 * "reg" property value to uniquely identify this device instance when 4694 * registering with the MD event-generation framework. If the "reg" 4695 * property cannot be found, the device tree state is presumably so 4696 * broken that there is no point in continuing. 4697 */ 4698 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 4699 VD_REG_PROP)) { 4700 PRN("vds \"%s\" property does not exist", VD_REG_PROP); 4701 return (DDI_FAILURE); 4702 } 4703 4704 /* Get the MD instance for later MDEG registration */ 4705 cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 4706 VD_REG_PROP, -1); 4707 4708 if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) { 4709 PRN("Could not allocate state for instance %u", instance); 4710 return (DDI_FAILURE); 4711 } 4712 4713 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { 4714 PRN("Could not get state for instance %u", instance); 4715 ddi_soft_state_free(vds_state, instance); 4716 return (DDI_FAILURE); 4717 } 4718 4719 vds->dip = dip; 4720 vds->vd_table = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS, 4721 vds_destroy_vd, sizeof (void *)); 4722 4723 ASSERT(vds->vd_table != NULL); 4724 4725 if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) { 4726 PRN("ldi_ident_from_dip() returned errno %d", status); 4727 return (DDI_FAILURE); 4728 } 4729 vds->initialized |= VDS_LDI; 4730 4731 /* Register for MD updates */ 4732 sz = sizeof (vds_prop_template); 4733 pspecp = kmem_alloc(sz, KM_SLEEP); 4734 bcopy(vds_prop_template, pspecp, sz); 4735 4736 VDS_SET_MDEG_PROP_INST(pspecp, cfg_handle); 4737 4738 /* initialize the complete prop spec structure */ 4739 ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP); 4740 ispecp->namep = "virtual-device"; 4741 ispecp->specp = pspecp; 4742 4743 if (mdeg_register(ispecp, &vd_match, vds_process_md, vds, 4744 &vds->mdeg) != MDEG_SUCCESS) { 4745 PRN("Unable to register for MD updates"); 4746 kmem_free(ispecp, sizeof (mdeg_node_spec_t)); 4747 kmem_free(pspecp, sz); 4748 return (DDI_FAILURE); 4749 } 4750 4751 vds->ispecp = ispecp; 4752 vds->initialized |= VDS_MDEG; 4753 4754 /* Prevent auto-detaching so driver is available whenever MD changes */ 4755 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 4756 DDI_PROP_SUCCESS) { 4757 PRN("failed to set \"%s\" property for instance %u", 4758 DDI_NO_AUTODETACH, instance); 4759 } 4760 4761 ddi_report_dev(dip); 4762 return (DDI_SUCCESS); 4763 } 4764 4765 static int 4766 vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 4767 { 4768 int status; 4769 4770 switch (cmd) { 4771 case DDI_ATTACH: 4772 PR0("Attaching"); 4773 if ((status = vds_do_attach(dip)) != DDI_SUCCESS) 4774 (void) vds_detach(dip, DDI_DETACH); 4775 return (status); 4776 case DDI_RESUME: 4777 PR0("No action required for DDI_RESUME"); 4778 return (DDI_SUCCESS); 4779 default: 4780 return (DDI_FAILURE); 4781 } 4782 } 4783 4784 static struct dev_ops vds_ops = { 4785 DEVO_REV, /* devo_rev */ 4786 0, /* devo_refcnt */ 4787 ddi_no_info, /* devo_getinfo */ 4788 nulldev, /* devo_identify */ 4789 nulldev, /* devo_probe */ 4790 vds_attach, /* devo_attach */ 4791 vds_detach, /* devo_detach */ 4792 nodev, /* devo_reset */ 4793 NULL, /* devo_cb_ops */ 4794 NULL, /* devo_bus_ops */ 4795 nulldev /* devo_power */ 4796 }; 4797 4798 static struct modldrv modldrv = { 4799 &mod_driverops, 4800 "virtual disk server", 4801 &vds_ops, 4802 }; 4803 4804 static struct modlinkage modlinkage = { 4805 MODREV_1, 4806 &modldrv, 4807 NULL 4808 }; 4809 4810 4811 int 4812 _init(void) 4813 { 4814 int i, status; 4815 4816 4817 if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0) 4818 return (status); 4819 if ((status = mod_install(&modlinkage)) != 0) { 4820 ddi_soft_state_fini(&vds_state); 4821 return (status); 4822 } 4823 4824 /* Fill in the bit-mask of server-supported operations */ 4825 for (i = 0; i < vds_noperations; i++) 4826 vds_operations |= 1 << (vds_operation[i].operation - 1); 4827 4828 return (0); 4829 } 4830 4831 int 4832 _info(struct modinfo *modinfop) 4833 { 4834 return (mod_info(&modlinkage, modinfop)); 4835 } 4836 4837 int 4838 _fini(void) 4839 { 4840 int status; 4841 4842 4843 if ((status = mod_remove(&modlinkage)) != 0) 4844 return (status); 4845 ddi_soft_state_fini(&vds_state); 4846 return (0); 4847 } 4848