/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Virtual disk server
 */


#include <sys/types.h>
#include <sys/conf.h>
#include <sys/crc32.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/file.h>
#include <sys/fs/hsfs_isospec.h>
#include <sys/mdeg.h>
#include <sys/mhd.h>
#include <sys/modhash.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/sysmacros.h>
#include <sys/vio_common.h>
#include <sys/vio_util.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdsk_common.h>
#include <sys/vtoc.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/scsi/impl/uscsi.h>
#include <vm/seg_map.h>

/* Virtual disk server initialization flags */
#define	VDS_LDI			0x01
#define	VDS_MDEG		0x02

/* Virtual disk server tunable parameters */
#define	VDS_RETRIES		5
#define	VDS_LDC_DELAY		1000	/* 1 msec */
#define	VDS_DEV_DELAY		10000000 /* 10 secs */
#define	VDS_NCHAINS		32

/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
#define	VDS_NAME		"virtual-disk-server"

#define	VD_NAME			"vd"
#define	VD_VOLUME_NAME		"vdisk"
#define	VD_ASCIILABEL		"Virtual Disk"

#define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
#define	VD_ID_PROP		"id"
#define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
#define	VD_BLOCK_DEVICE_OPTS	"vds-block-device-opts"
#define	VD_REG_PROP		"reg"

/* Virtual disk initialization flags */
#define	VD_DISK_READY		0x01
#define	VD_LOCKING		0x02
#define	VD_LDC			0x04
#define	VD_DRING		0x08
#define	VD_SID			0x10
#define	VD_SEQ_NUM		0x20
#define	VD_SETUP_ERROR		0x40

/* Flags for writing to a vdisk which is a file */
#define	VD_FILE_WRITE_FLAGS	SM_ASYNC

/* Number of backup labels */
#define	VD_FILE_NUM_BACKUP	5

/* Timeout for SCSI I/O */
#define	VD_SCSI_RDWR_TIMEOUT	30	/* 30 secs */

/* Maximum number of logical partitions */
#define	VD_MAXPART	(NDKMAP + 1)

/*
 * By Solaris convention, slice/partition 2 represents the entire disk;
 * unfortunately, this convention does not appear to be codified.
 */
#define	VD_ENTIRE_DISK_SLICE	2

/* Return a cpp token as a string */
#define	STRINGIZE(token)	#token

/*
 * Print a message prefixed with the current function name to the message log
 * (and optionally to the console for verbose boots); these macros use cpp's
 * concatenation of string literals and C99 variable-length-argument-list
 * macros
 */
#define	PRN(...)	_PRN("?%s(): "__VA_ARGS__, "")
#define	_PRN(format, ...)					\
	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)
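
/*
 * For example, PRN("status %d", s) expands to
 * cmn_err(CE_CONT, "?%s(): " "status %d" "%s", __func__, s, "");
 * the trailing empty string satisfies the "%s" appended by _PRN(),
 * and the leading '?' keeps the message off the console on
 * non-verbose boots.
 */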

/* Return a pointer to the "i"th vdisk dring element */
#define	VD_DRING_ELEM(i)	((vd_dring_entry_t *)(void *)	\
	    (vd->dring + (i)*vd->descriptor_size))

/* Return the virtual disk client's type as a string (for use in messages) */
#define	VD_CLIENT(vd)							\
	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
	    (((vd)->xfer_mode == VIO_DRING_MODE_V1_0) ? "dring client" : \
		(((vd)->xfer_mode == 0) ? "null client" :		\
		    "unsupported client")))

/* Read disk label from a disk on file */
#define	VD_FILE_LABEL_READ(vd, labelp) \
	vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/* Write disk label to a disk on file */
#define	VD_FILE_LABEL_WRITE(vd, labelp)	\
	vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/* Message for disk access rights reset failure */
#define	VD_RESET_ACCESS_FAILURE_MSG \
	"Failed to reset disk access rights for disk %s"

/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vds instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t	vds_prop_template[] = {
	{ MDET_PROP_STR,	"name",		VDS_NAME },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};

#define	VDS_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);
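
/*
 * For example, after copying the template a vds instance does
 * VDS_SET_MDEG_PROP_INST(specp, inst), which stores the instance's
 * 'cfg-handle' value in specp[1].ps_val before the spec is handed
 * to the MDEG.
 */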

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t	vd_prop_match[] = {
	{ MDET_PROP_VAL,	VD_ID_PROP },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vd_match = {"virtual-device-port",
				    vd_prop_match};

/*
 * Options for the VD_BLOCK_DEVICE_OPTS property.
 */
#define	VD_OPT_RDONLY		0x1	/* read-only */
#define	VD_OPT_SLICE		0x2	/* single slice */
#define	VD_OPT_EXCLUSIVE	0x4	/* exclusive access */

#define	VD_OPTION_NLEN	128

typedef struct vd_option {
	char vdo_name[VD_OPTION_NLEN];
	uint64_t vdo_value;
} vd_option_t;

vd_option_t vd_bdev_options[] = {
	{ "ro",		VD_OPT_RDONLY },
	{ "slice",	VD_OPT_SLICE },
	{ "excl",	VD_OPT_EXCLUSIVE }
};

/* Debugging macros */
#ifdef DEBUG

static int	vd_msglevel = 0;
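
/*
 * On DEBUG kernels, vd_msglevel can be raised to make the PR1/PR2
 * messages below visible, e.g. by patching the variable with mdb -kw
 * or setting "set vds:vd_msglevel = 2" in /etc/system (assuming the
 * module is named vds).
 */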

#define	PR0 if (vd_msglevel > 0)	PRN
#define	PR1 if (vd_msglevel > 1)	PRN
#define	PR2 if (vd_msglevel > 2)	PRN

#define	VD_DUMP_DRING_ELEM(elem)					\
	PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
	    elem->hdr.dstate,						\
	    elem->payload.operation,					\
	    elem->payload.status,					\
	    elem->payload.nbytes,					\
	    elem->payload.addr,						\
	    elem->payload.ncookies);

char *
vd_decode_state(int state)
{
	char *str;

#define	CASE_STATE(_s)	case _s: str = #_s; break;

	switch (state) {
	CASE_STATE(VD_STATE_INIT)
	CASE_STATE(VD_STATE_VER)
	CASE_STATE(VD_STATE_ATTR)
	CASE_STATE(VD_STATE_DRING)
	CASE_STATE(VD_STATE_RDX)
	CASE_STATE(VD_STATE_DATA)
	default: str = "unknown"; break;
	}

#undef CASE_STATE

	return (str);
}

void
vd_decode_tag(vio_msg_t *msg)
{
	char *tstr, *sstr, *estr;

#define	CASE_TYPE(_s)	case _s: tstr = #_s; break;

	switch (msg->tag.vio_msgtype) {
	CASE_TYPE(VIO_TYPE_CTRL)
	CASE_TYPE(VIO_TYPE_DATA)
	CASE_TYPE(VIO_TYPE_ERR)
	default: tstr = "unknown"; break;
	}

#undef CASE_TYPE

#define	CASE_SUBTYPE(_s) case _s: sstr = #_s; break;

	switch (msg->tag.vio_subtype) {
	CASE_SUBTYPE(VIO_SUBTYPE_INFO)
	CASE_SUBTYPE(VIO_SUBTYPE_ACK)
	CASE_SUBTYPE(VIO_SUBTYPE_NACK)
	default: sstr = "unknown"; break;
	}

#undef CASE_SUBTYPE

#define	CASE_ENV(_s)	case _s: estr = #_s; break;

	switch (msg->tag.vio_subtype_env) {
	CASE_ENV(VIO_VER_INFO)
	CASE_ENV(VIO_ATTR_INFO)
	CASE_ENV(VIO_DRING_REG)
	CASE_ENV(VIO_DRING_UNREG)
	CASE_ENV(VIO_RDX)
	CASE_ENV(VIO_PKT_DATA)
	CASE_ENV(VIO_DESC_DATA)
	CASE_ENV(VIO_DRING_DATA)
	default: estr = "unknown"; break;
	}

#undef CASE_ENV

	PR1("(%x/%x/%x) message : (%s/%s/%s)",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, tstr, sstr, estr);
}

#else	/* !DEBUG */

#define	PR0(...)
#define	PR1(...)
#define	PR2(...)

#define	VD_DUMP_DRING_ELEM(elem)

#define	vd_decode_state(_s)	(NULL)
#define	vd_decode_tag(_s)	(NULL)

#endif	/* DEBUG */


/*
 * Soft state structure for a vds instance
 */
typedef struct vds {
	uint_t		initialized;	/* driver inst initialization flags */
	dev_info_t	*dip;		/* driver inst devinfo pointer */
	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
	mod_hash_t	*vd_table;	/* table of virtual disks served */
	mdeg_node_spec_t *ispecp;	/* mdeg node specification */
	mdeg_handle_t	mdeg;		/* handle for MDEG operations */
} vds_t;

/*
 * Types of descriptor-processing tasks
 */
typedef enum vd_task_type {
	VD_NONFINAL_RANGE_TASK,	/* task for intermediate descriptor in range */
	VD_FINAL_RANGE_TASK,	/* task for last in a range of descriptors */
} vd_task_type_t;

/*
 * Structure describing the task for processing a descriptor
 */
typedef struct vd_task {
	struct vd		*vd;		/* vd instance task is for */
	vd_task_type_t		type;		/* type of descriptor task */
	int			index;		/* dring elem index for task */
	vio_msg_t		*msg;		/* VIO message task is for */
	size_t			msglen;		/* length of message content */
	vd_dring_payload_t	*request;	/* request task will perform */
	struct buf		buf;		/* buf(9s) for I/O request */
	ldc_mem_handle_t	mhdl;		/* task memory handle */
	int			status;		/* status of processing task */
	int	(*completef)(struct vd_task *task); /* completion func ptr */
} vd_task_t;

/*
 * Soft state structure for a virtual disk instance
 */
typedef struct vd {
	uint_t			initialized;	/* vdisk initialization flags */
	uint64_t		operations;	/* bitmask of VD_OPs exported */
	vio_ver_t		version;	/* ver negotiated with client */
	vds_t			*vds;		/* server for this vdisk */
	ddi_taskq_t		*startq;	/* queue for I/O start tasks */
	ddi_taskq_t		*completionq;	/* queue for completion tasks */
	ldi_handle_t		ldi_handle[V_NUMPAR];	/* LDI slice handles */
	char			device_path[MAXPATHLEN + 1]; /* vdisk device */
	dev_t			dev[V_NUMPAR];	/* dev numbers for slices */
	int			open_flags;	/* open flags */
	uint_t			nslices;	/* number of slices */
	size_t			vdisk_size;	/* number of blocks in vdisk */
	size_t			vdisk_block_size; /* size of each vdisk block */
	vd_disk_type_t		vdisk_type;	/* slice or entire disk */
	vd_disk_label_t		vdisk_label;	/* EFI or VTOC label */
	vd_media_t		vdisk_media;	/* media type of backing dev. */
	boolean_t		is_atapi_dev;	/* Is this an IDE CD-ROM dev? */
	ushort_t		max_xfer_sz;	/* max xfer size in DEV_BSIZE */
	size_t			block_size;	/* blk size of actual device */
	boolean_t		pseudo;		/* underlying pseudo dev */
	boolean_t		file;		/* is vDisk backed by a file? */
	boolean_t		scsi;		/* is vDisk backed by scsi? */
	vnode_t			*file_vnode;	/* file vnode */
	size_t			file_size;	/* file size */
	ddi_devid_t		file_devid;	/* devid for disk image */
	efi_gpt_t		efi_gpt;	/* EFI GPT for slice type */
	efi_gpe_t		efi_gpe;	/* EFI GPE for slice type */
	int			efi_reserved;	/* EFI reserved slice */
	struct dk_geom		dk_geom;	/* synthetic for slice type */
	struct vtoc		vtoc;		/* synthetic for slice type */
	vd_slice_t		slices[VD_MAXPART]; /* logical partitions */
	boolean_t		ownership;	/* disk ownership status */
	ldc_status_t		ldc_state;	/* LDC connection state */
	ldc_handle_t		ldc_handle;	/* handle for LDC comm */
	size_t			max_msglen;	/* largest LDC message len */
	vd_state_t		state;		/* client handshake state */
	uint8_t			xfer_mode;	/* transfer mode with client */
	uint32_t		sid;		/* client's session ID */
	uint64_t		seq_num;	/* message sequence number */
	uint64_t		dring_ident;	/* identifier of dring */
	ldc_dring_handle_t	dring_handle;	/* handle for dring ops */
	uint32_t		descriptor_size; /* num bytes in desc */
	uint32_t		dring_len;	/* number of dring elements */
	caddr_t			dring;		/* address of dring */
	caddr_t			vio_msgp;	/* vio msg staging buffer */
	vd_task_t		inband_task;	/* task for inband descriptor */
	vd_task_t		*dring_task;	/* tasks for dring elements */

	kmutex_t		lock;		/* protects variables below */
	boolean_t		enabled;	/* is vdisk enabled? */
	boolean_t		reset_state;	/* reset connection state? */
	boolean_t		reset_ldc;	/* reset LDC channel? */
} vd_t;

typedef struct vds_operation {
	char	*namep;
	uint8_t	operation;
	int	(*start)(vd_task_t *task);
	int	(*complete)(vd_task_t *task);
} vds_operation_t;

typedef struct vd_ioctl {
	uint8_t		operation;		/* vdisk operation */
	const char	*operation_name;	/* vdisk operation name */
	size_t		nbytes;			/* size of operation buffer */
	int		cmd;			/* corresponding ioctl cmd */
	const char	*cmd_name;		/* ioctl cmd name */
	void		*arg;			/* ioctl cmd argument */
	/* convert input vd_buf to output ioctl_arg */
	int		(*copyin)(void *vd_buf, size_t, void *ioctl_arg);
	/* convert input ioctl_arg to output vd_buf */
	void		(*copyout)(void *ioctl_arg, void *vd_buf);
	/* write is true if the operation writes any data to the backend */
	boolean_t	write;
} vd_ioctl_t;

/* Define trivial copyin/copyout conversion function flag */
#define	VD_IDENTITY_IN	((int (*)(void *, size_t, void *))-1)
#define	VD_IDENTITY_OUT	((void (*)(void *, void *))-1)
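
/*
 * VD_IDENTITY_IN and VD_IDENTITY_OUT are sentinels, not callable
 * functions: the ioctl dispatch code (not part of this excerpt)
 * compares a vd_ioctl_t's copyin/copyout pointers against them and,
 * on a match, uses the operation buffer as-is instead of invoking a
 * conversion routine.
 */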
This default behavior can be changed by setting 436 * the vd_reset_access_failure variable to A_REBOOT (= 0x1) and that will 437 * cause the service domain to reboot, or A_DUMP (= 0x5) and that will cause 438 * the service domain to panic. In both cases, the reset of the service domain 439 * should trigger a reset SCSI buses and hopefully clear any SCSI-2 reservation. 440 */ 441 static int vd_reset_access_failure = 0; 442 443 /* 444 * Tunable for backward compatibility. When this variable is set to B_TRUE, 445 * all disk volumes (ZFS, SVM, VxvM volumes) will be exported as single 446 * slice disks whether or not they have the "slice" option set. This is 447 * to provide a simple backward compatibility mechanism when upgrading 448 * the vds driver and using a domain configuration created before the 449 * "slice" option was available. 450 */ 451 static boolean_t vd_volume_force_slice = B_FALSE; 452 453 /* 454 * The label of disk images created with some earlier versions of the virtual 455 * disk software is not entirely correct and have an incorrect v_sanity field 456 * (usually 0) instead of VTOC_SANE. This creates a compatibility problem with 457 * these images because we are now validating that the disk label (and the 458 * sanity) is correct when a disk image is opened. 459 * 460 * This tunable is set to false to not validate the sanity field and ensure 461 * compatibility. If the tunable is set to true, we will do a strict checking 462 * of the sanity but this can create compatibility problems with old disk 463 * images. 464 */ 465 static boolean_t vd_file_validate_sanity = B_FALSE; 466 467 /* 468 * Supported protocol version pairs, from highest (newest) to lowest (oldest) 469 * 470 * Each supported major version should appear only once, paired with (and only 471 * with) its highest supported minor version number (as the protocol requires 472 * supporting all lower minor version numbers as well) 473 */ 474 static const vio_ver_t vds_version[] = {{1, 1}}; 475 static const size_t vds_num_versions = 476 sizeof (vds_version)/sizeof (vds_version[0]); 477 478 static void vd_free_dring_task(vd_t *vdp); 479 static int vd_setup_vd(vd_t *vd); 480 static int vd_setup_single_slice_disk(vd_t *vd); 481 static int vd_setup_mediainfo(vd_t *vd); 482 static boolean_t vd_enabled(vd_t *vd); 483 static ushort_t vd_lbl2cksum(struct dk_label *label); 484 static int vd_file_validate_geometry(vd_t *vd); 485 static boolean_t vd_file_is_iso_image(vd_t *vd); 486 static void vd_set_exported_operations(vd_t *vd); 487 static void vd_reset_access(vd_t *vd); 488 static int vd_backend_ioctl(vd_t *vd, int cmd, caddr_t arg); 489 static int vds_efi_alloc_and_read(vd_t *, efi_gpt_t **, efi_gpe_t **); 490 static void vds_efi_free(vd_t *, efi_gpt_t *, efi_gpe_t *); 491 492 /* 493 * Function: 494 * vd_file_rw 495 * 496 * Description: 497 * Read or write to a disk on file. 498 * 499 * Parameters: 500 * vd - disk on which the operation is performed. 501 * slice - slice on which the operation is performed, 502 * VD_SLICE_NONE indicates that the operation 503 * is done using an absolute disk offset. 504 * operation - operation to execute: read (VD_OP_BREAD) or 505 * write (VD_OP_BWRITE). 506 * data - buffer where data are read to or written from. 507 * blk - starting block for the operation. 508 * len - number of bytes to read or write. 509 * 510 * Return Code: 511 * n >= 0 - success, n indicates the number of bytes read 512 * or written. 513 * -1 - error. 
514 */ 515 static ssize_t 516 vd_file_rw(vd_t *vd, int slice, int operation, caddr_t data, size_t blk, 517 size_t len) 518 { 519 caddr_t maddr; 520 size_t offset, maxlen, moffset, mlen, n; 521 uint_t smflags; 522 enum seg_rw srw; 523 524 ASSERT(vd->file); 525 ASSERT(len > 0); 526 527 /* 528 * If a file is exported as a slice then we don't care about the vtoc. 529 * In that case, the vtoc is a fake mainly to make newfs happy and we 530 * handle any I/O as a raw disk access so that we can have access to the 531 * entire backend. 532 */ 533 if (vd->vdisk_type == VD_DISK_TYPE_SLICE || slice == VD_SLICE_NONE) { 534 /* raw disk access */ 535 offset = blk * DEV_BSIZE; 536 } else { 537 ASSERT(slice >= 0 && slice < V_NUMPAR); 538 539 /* 540 * v1.0 vDisk clients depended on the server not verifying 541 * the label of a unformatted disk. This "feature" is 542 * maintained for backward compatibility but all versions 543 * from v1.1 onwards must do the right thing. 544 */ 545 if (vd->vdisk_label == VD_DISK_LABEL_UNK && 546 vio_ver_is_supported(vd->version, 1, 1)) { 547 (void) vd_file_validate_geometry(vd); 548 if (vd->vdisk_label == VD_DISK_LABEL_UNK) { 549 PR0("Unknown disk label, can't do I/O " 550 "from slice %d", slice); 551 return (-1); 552 } 553 } 554 555 if (vd->vdisk_label == VD_DISK_LABEL_VTOC) { 556 ASSERT(vd->vtoc.v_sectorsz == DEV_BSIZE); 557 } else { 558 ASSERT(vd->vdisk_label == VD_DISK_LABEL_EFI); 559 ASSERT(vd->vdisk_block_size == DEV_BSIZE); 560 } 561 562 if (blk >= vd->slices[slice].nblocks) { 563 /* address past the end of the slice */ 564 PR0("req_addr (0x%lx) > psize (0x%lx)", 565 blk, vd->slices[slice].nblocks); 566 return (0); 567 } 568 569 offset = (vd->slices[slice].start + blk) * DEV_BSIZE; 570 571 /* 572 * If the requested size is greater than the size 573 * of the partition, truncate the read/write. 574 */ 575 maxlen = (vd->slices[slice].nblocks - blk) * DEV_BSIZE; 576 577 if (len > maxlen) { 578 PR0("I/O size truncated to %lu bytes from %lu bytes", 579 maxlen, len); 580 len = maxlen; 581 } 582 } 583 584 /* 585 * We have to ensure that we are reading/writing into the mmap 586 * range. If we have a partial disk image (e.g. an image of 587 * s0 instead s2) the system can try to access slices that 588 * are not included into the disk image. 589 */ 590 if ((offset + len) > vd->file_size) { 591 PR0("offset + nbytes (0x%lx + 0x%lx) > " 592 "file_size (0x%lx)", offset, len, vd->file_size); 593 return (-1); 594 } 595 596 srw = (operation == VD_OP_BREAD)? S_READ : S_WRITE; 597 smflags = (operation == VD_OP_BREAD)? 0 : 598 (SM_WRITE | vd_file_write_flags); 599 n = len; 600 601 do { 602 /* 603 * segmap_getmapflt() returns a MAXBSIZE chunk which is 604 * MAXBSIZE aligned. 605 */ 606 moffset = offset & MAXBOFFSET; 607 mlen = MIN(MAXBSIZE - moffset, n); 608 maddr = segmap_getmapflt(segkmap, vd->file_vnode, offset, 609 mlen, 1, srw); 610 /* 611 * Fault in the pages so we can check for error and ensure 612 * that we can safely used the mapped address. 
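
	/*
	 * Illustration of the chunking below, assuming the usual 8K
	 * MAXBSIZE: a 12K request starting at file offset 0x3400 is
	 * serviced through three segmap windows -- moffset 0x1400 and
	 * mlen 0xc00, then 0x0/0x2000, then 0x0/0x400 -- copying
	 * through each mapping in turn.
	 */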

	do {
		/*
		 * segmap_getmapflt() returns a MAXBSIZE chunk which is
		 * MAXBSIZE aligned.
		 */
		moffset = offset & MAXBOFFSET;
		mlen = MIN(MAXBSIZE - moffset, n);
		maddr = segmap_getmapflt(segkmap, vd->file_vnode, offset,
		    mlen, 1, srw);
		/*
		 * Fault in the pages so we can check for error and ensure
		 * that we can safely use the mapped address.
		 */
		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
		    F_SOFTLOCK, srw) != 0) {
			(void) segmap_release(segkmap, maddr, 0);
			return (-1);
		}

		if (operation == VD_OP_BREAD)
			bcopy(maddr + moffset, data, mlen);
		else
			bcopy(data, maddr + moffset, mlen);

		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
		    F_SOFTUNLOCK, srw) != 0) {
			(void) segmap_release(segkmap, maddr, 0);
			return (-1);
		}
		if (segmap_release(segkmap, maddr, smflags) != 0)
			return (-1);
		n -= mlen;
		offset += mlen;
		data += mlen;

	} while (n > 0);

	return (len);
}

/*
 * Function:
 *	vd_file_build_default_label
 *
 * Description:
 *	Return a default label for the given disk. This is used when the disk
 *	does not have a valid VTOC so that the user can get a valid default
 *	configuration. The default label has all slice sizes set to 0 (except
 *	slice 2 which is the entire disk) to force the user to write a valid
 *	label onto the disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	label		- the returned default label.
 *
 * Return Code:
 *	none.
 */
static void
vd_file_build_default_label(vd_t *vd, struct dk_label *label)
{
	size_t size;
	char prefix;

	ASSERT(vd->file);
	ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK);

	bzero(label, sizeof (struct dk_label));

	/*
	 * We must have a reasonable number of cylinders and sectors so
	 * that newfs can run using default values.
	 *
	 * if (disk_size < 2MB)
	 *	phys_cylinders = disk_size / 100K
	 * else
	 *	phys_cylinders = disk_size / 300K
	 *
	 * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders
	 * alt_cylinders = (phys_cylinders > 2) ? 2 : 0;
	 * data_cylinders = phys_cylinders - alt_cylinders
	 *
	 * sectors = disk_size / (phys_cylinders * blk_size)
	 *
	 * The file size test is an attempt to not have too few cylinders
	 * for a small file, or so many on a big file that you waste space
	 * for backup superblocks or cylinder group structures.
	 */
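	/*
	 * Worked example: a 100 MB disk image gives dkl_pcyl =
	 * 104857600 / (300 * 1024) = 341, dkl_acyl = 2, dkl_ncyl = 339
	 * and dkl_nsect = 104857600 / (512 * 341) = 600, so the label
	 * below advertises 339 * 1 * 600 * 512 bytes (roughly 99.3 MB)
	 * of usable space.
	 */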
	if (vd->file_size < (2 * 1024 * 1024))
		label->dkl_pcyl = vd->file_size / (100 * 1024);
	else
		label->dkl_pcyl = vd->file_size / (300 * 1024);

	if (label->dkl_pcyl == 0)
		label->dkl_pcyl = 1;

	label->dkl_acyl = 0;

	if (label->dkl_pcyl > 2)
		label->dkl_acyl = 2;

	label->dkl_nsect = vd->file_size /
	    (DEV_BSIZE * label->dkl_pcyl);
	label->dkl_ncyl = label->dkl_pcyl - label->dkl_acyl;
	label->dkl_nhead = 1;
	label->dkl_write_reinstruct = 0;
	label->dkl_read_reinstruct = 0;
	label->dkl_rpm = 7200;
	label->dkl_apc = 0;
	label->dkl_intrlv = 0;

	PR0("requested disk size: %ld bytes\n", vd->file_size);
	PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label->dkl_pcyl,
	    label->dkl_nhead, label->dkl_nsect);
	PR0("provided disk size: %ld bytes\n", (uint64_t)
	    (label->dkl_pcyl * label->dkl_nhead *
	    label->dkl_nsect * DEV_BSIZE));

	if (vd->file_size < (1ULL << 20)) {
		size = vd->file_size >> 10;
		prefix = 'K'; /* Kilobyte */
	} else if (vd->file_size < (1ULL << 30)) {
		size = vd->file_size >> 20;
		prefix = 'M'; /* Megabyte */
	} else if (vd->file_size < (1ULL << 40)) {
		size = vd->file_size >> 30;
		prefix = 'G'; /* Gigabyte */
	} else {
		size = vd->file_size >> 40;
		prefix = 'T'; /* Terabyte */
	}

	/*
	 * We must have a correct label name otherwise format(1m) will
	 * not recognize the disk as labeled.
	 */
	(void) snprintf(label->dkl_asciilabel, LEN_DKL_ASCII,
	    "SUN-DiskImage-%ld%cB cyl %d alt %d hd %d sec %d",
	    size, prefix,
	    label->dkl_ncyl, label->dkl_acyl, label->dkl_nhead,
	    label->dkl_nsect);

	/* default VTOC */
	label->dkl_vtoc.v_version = V_VERSION;
	label->dkl_vtoc.v_nparts = V_NUMPAR;
	label->dkl_vtoc.v_sanity = VTOC_SANE;
	label->dkl_vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_tag = V_BACKUP;
	label->dkl_map[VD_ENTIRE_DISK_SLICE].dkl_cylno = 0;
	label->dkl_map[VD_ENTIRE_DISK_SLICE].dkl_nblk = label->dkl_ncyl *
	    label->dkl_nhead * label->dkl_nsect;
	label->dkl_magic = DKL_MAGIC;
	label->dkl_cksum = vd_lbl2cksum(label);
}

/*
 * Function:
 *	vd_file_set_vtoc
 *
 * Description:
 *	Set the vtoc of a disk image by writing the label and backup
 *	labels into the disk image backend.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	label		- the data to be written.
 *
 * Return Code:
 *	0		- success.
 *	n > 0		- error, n indicates the errno code.
 */
static int
vd_file_set_vtoc(vd_t *vd, struct dk_label *label)
{
	int blk, sec, cyl, head, cnt;

	ASSERT(vd->file);

	if (VD_FILE_LABEL_WRITE(vd, label) < 0) {
		PR0("fail to write disk label");
		return (EIO);
	}

	/*
	 * Backup labels are on the last alternate cylinder's
	 * first five odd sectors.
	 */
	if (label->dkl_acyl == 0) {
		PR0("no alternate cylinder, cannot store backup labels");
		return (0);
	}

	cyl = label->dkl_ncyl + label->dkl_acyl - 1;
	head = label->dkl_nhead - 1;

	blk = (cyl * ((label->dkl_nhead * label->dkl_nsect) -
	    label->dkl_apc)) + (head * label->dkl_nsect);
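
	/*
	 * Continuing the example above: with ncyl = 339, acyl = 2,
	 * nhead = 1, nsect = 600 and apc = 0, the backup labels land
	 * on the last alternate cylinder (340), i.e. blk = 340 * 600 =
	 * 204000, at sectors 204001, 204003, 204005, 204007 and 204009.
	 */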

	/*
	 * Write the backup labels. Make sure we don't try to write past
	 * the last cylinder.
	 */
	sec = 1;

	for (cnt = 0; cnt < VD_FILE_NUM_BACKUP; cnt++) {

		if (sec >= label->dkl_nsect) {
			PR0("not enough sectors to store all backup labels");
			return (0);
		}

		if (vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
		    (caddr_t)label, blk + sec,
		    sizeof (struct dk_label)) < 0) {
			PR0("error writing backup label at block %d\n",
			    blk + sec);
			return (EIO);
		}

		PR1("wrote backup label at block %d\n", blk + sec);

		sec += 2;
	}

	return (0);
}

/*
 * Function:
 *	vd_file_get_devid_block
 *
 * Description:
 *	Return the block number where the device id is stored.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	blkp		- pointer to the block number
 *
 * Return Code:
 *	0		- success
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_get_devid_block(vd_t *vd, size_t *blkp)
{
	diskaddr_t spc, head, cyl;

	ASSERT(vd->file);

	if (vd->vdisk_label == VD_DISK_LABEL_UNK) {
		/*
		 * If no label is defined we don't know where to find
		 * a device id.
		 */
		return (ENOSPC);
	}

	if (vd->vdisk_label == VD_DISK_LABEL_EFI) {
		/*
		 * For an EFI disk, the devid is at the beginning of
		 * the reserved slice
		 */
		if (vd->efi_reserved == -1) {
			PR0("EFI disk has no reserved slice");
			return (ENOSPC);
		}

		*blkp = vd->slices[vd->efi_reserved].start;
		return (0);
	}

	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);

	/* this geometry doesn't allow us to have a devid */
	if (vd->dk_geom.dkg_acyl < 2) {
		PR0("not enough alternate cylinders available for devid "
		    "(acyl=%u)", vd->dk_geom.dkg_acyl);
		return (ENOSPC);
	}

	/* the devid is on the track next to the last cylinder */
	cyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl - 2;
	spc = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
	head = vd->dk_geom.dkg_nhead - 1;

	*blkp = (cyl * (spc - vd->dk_geom.dkg_apc)) +
	    (head * vd->dk_geom.dkg_nsect) + 1;

	return (0);
}

/*
 * Return the checksum of a disk block containing an on-disk devid.
 * The checksum is the XOR of all 32-bit words in the block except the
 * last one, which holds the stored checksum itself.
 */
static uint_t
vd_dkdevid2cksum(struct dk_devid *dkdevid)
{
	uint_t chksum, *ip;
	int i;

	chksum = 0;
	ip = (uint_t *)dkdevid;
	for (i = 0; i < ((DEV_BSIZE - sizeof (int)) / sizeof (int)); i++)
		chksum ^= ip[i];

	return (chksum);
}

/*
 * Function:
 *	vd_file_read_devid
 *
 * Description:
 *	Read the device id stored on a disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	devid		- the return address of the device ID.
 *
 * Return Code:
 *	0		- success
 *	EIO		- I/O error while trying to access the disk image
 *	EINVAL		- no valid device id was found
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_read_devid(vd_t *vd, ddi_devid_t *devid)
{
	struct dk_devid *dkdevid;
	size_t blk;
	uint_t chksum;
	int status, sz;

	if ((status = vd_file_get_devid_block(vd, &blk)) != 0)
		return (status);

	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);

	/* get the devid */
	if ((vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)dkdevid, blk,
	    DEV_BSIZE)) < 0) {
		PR0("error reading devid block at %lu", blk);
		status = EIO;
		goto done;
	}

	/* validate the revision */
	if ((dkdevid->dkd_rev_hi != DK_DEVID_REV_MSB) ||
	    (dkdevid->dkd_rev_lo != DK_DEVID_REV_LSB)) {
		PR0("invalid devid found at block %lu (bad revision)", blk);
		status = EINVAL;
		goto done;
	}

	/* compute checksum */
	chksum = vd_dkdevid2cksum(dkdevid);

	/* compare the checksums */
	if (DKD_GETCHKSUM(dkdevid) != chksum) {
		PR0("invalid devid found at block %lu (bad checksum)", blk);
		status = EINVAL;
		goto done;
	}

	/* validate the device id */
	if (ddi_devid_valid((ddi_devid_t)&dkdevid->dkd_devid) != DDI_SUCCESS) {
		PR0("invalid devid found at block %lu", blk);
		status = EINVAL;
		goto done;
	}

	PR1("devid read at block %lu", blk);

	sz = ddi_devid_sizeof((ddi_devid_t)&dkdevid->dkd_devid);
	*devid = kmem_alloc(sz, KM_SLEEP);
	bcopy(&dkdevid->dkd_devid, *devid, sz);

done:
	kmem_free(dkdevid, DEV_BSIZE);
	return (status);

}

/*
 * Function:
 *	vd_file_write_devid
 *
 * Description:
 *	Write a device id into disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	devid		- the device ID to store.
 *
 * Return Code:
 *	0		- success
 *	EIO		- I/O error while trying to access the disk image
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_file_write_devid(vd_t *vd, ddi_devid_t devid)
{
	struct dk_devid *dkdevid;
	uint_t chksum;
	size_t blk;
	int status;

	if (devid == NULL) {
		/* nothing to write */
		return (0);
	}

	if ((status = vd_file_get_devid_block(vd, &blk)) != 0)
		return (status);

	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);

	/* set revision */
	dkdevid->dkd_rev_hi = DK_DEVID_REV_MSB;
	dkdevid->dkd_rev_lo = DK_DEVID_REV_LSB;

	/* copy devid */
	bcopy(devid, &dkdevid->dkd_devid, ddi_devid_sizeof(devid));

	/* compute checksum */
	chksum = vd_dkdevid2cksum(dkdevid);

	/* set checksum */
	DKD_FORMCHKSUM(chksum, dkdevid);

	/* store the devid */
	if ((status = vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
	    (caddr_t)dkdevid, blk, DEV_BSIZE)) < 0) {
		PR0("Error writing devid block at %lu", blk);
		status = EIO;
	} else {
		PR1("devid written at block %lu", blk);
		status = 0;
	}

	kmem_free(dkdevid, DEV_BSIZE);
	return (status);
}

/*
 * Function:
 *	vd_do_scsi_rdwr
 *
 * Description:
 *	Read or write to a SCSI disk using an absolute disk offset.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer where data are read to or written from.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	0		- success
 *	n != 0		- error.
 */
static int
vd_do_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t blk, size_t len)
{
	struct uscsi_cmd ucmd;
	union scsi_cdb cdb;
	int nsectors, nblk;
	int max_sectors;
	int status, rval;

	ASSERT(!vd->file);
	ASSERT(vd->vdisk_block_size > 0);

	max_sectors = vd->max_xfer_sz;
	nblk = (len / vd->vdisk_block_size);

	if (len % vd->vdisk_block_size != 0)
		return (EINVAL);

	/*
	 * Build and execute the uscsi ioctl. We build a group0, group1
	 * or group4 command as necessary, since some targets
	 * do not support group1 commands.
	 */
	while (nblk) {

		bzero(&ucmd, sizeof (ucmd));
		bzero(&cdb, sizeof (cdb));

		nsectors = (max_sectors < nblk) ? max_sectors : nblk;

		/*
		 * Some of the optical drives on sun4v machines are ATAPI
		 * devices which use Group 1 Read/Write commands so we need
		 * to explicitly check a flag which is set when a domain
		 * is bound.
		 */
		if (blk < (2 << 20) && nsectors <= 0xff && !vd->is_atapi_dev) {
			FORMG0ADDR(&cdb, blk);
			FORMG0COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP0;
		} else if (blk > 0xffffffff) {
			FORMG4LONGADDR(&cdb, blk);
			FORMG4COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP4;
			cdb.scc_cmd |= SCMD_GROUP4;
		} else {
			FORMG1ADDR(&cdb, blk);
			FORMG1COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP1;
			cdb.scc_cmd |= SCMD_GROUP1;
		}
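		/*
		 * Note on the CDB selection above: a Group 0 (6-byte)
		 * CDB only carries a 21-bit LBA (hence the
		 * blk < (2 << 20) test) and an 8-bit sector count;
		 * Group 1 carries a 32-bit LBA and a 16-bit count; and
		 * Group 4 (16-byte) carries a 64-bit LBA for blocks
		 * beyond 0xffffffff.
		 */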
		ucmd.uscsi_cdb = (caddr_t)&cdb;
		ucmd.uscsi_bufaddr = data;
		ucmd.uscsi_buflen = nsectors * vd->block_size;
		ucmd.uscsi_timeout = vd_scsi_rdwr_timeout;
		/*
		 * Set flags so that the command is isolated from normal
		 * commands and no error message is printed.
		 */
		ucmd.uscsi_flags = USCSI_ISOLATE | USCSI_SILENT;

		if (operation == VD_OP_BREAD) {
			cdb.scc_cmd |= SCMD_READ;
			ucmd.uscsi_flags |= USCSI_READ;
		} else {
			cdb.scc_cmd |= SCMD_WRITE;
		}

		status = ldi_ioctl(vd->ldi_handle[VD_ENTIRE_DISK_SLICE],
		    USCSICMD, (intptr_t)&ucmd, (vd->open_flags | FKIOCTL),
		    kcred, &rval);

		if (status == 0)
			status = ucmd.uscsi_status;

		if (status != 0)
			break;

		/*
		 * Check if partial DMA breakup is required. If so, reduce
		 * the request size by half and retry the last request.
		 */
		if (ucmd.uscsi_resid == ucmd.uscsi_buflen) {
			max_sectors >>= 1;
			if (max_sectors <= 0) {
				status = EIO;
				break;
			}
			continue;
		}

		if (ucmd.uscsi_resid != 0) {
			status = EIO;
			break;
		}

		blk += nsectors;
		nblk -= nsectors;
		data += nsectors * vd->vdisk_block_size; /* SECSIZE */
	}

	return (status);
}

/*
 * Function:
 *	vd_scsi_rdwr
 *
 * Description:
 *	Wrapper function to read or write to a SCSI disk using an absolute
 *	disk offset. It checks the blocksize of the underlying device and,
 *	if necessary, adjusts the buffers accordingly before calling
 *	vd_do_scsi_rdwr() to do the actual read or write.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer where data are read to or written from.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	0		- success
 *	n != 0		- error.
 */
static int
vd_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t vblk, size_t vlen)
{
	int rv;

	size_t pblk;	/* physical device block number of data on device */
	size_t delta;	/* relative offset between pblk and vblk */
	size_t pnblk;	/* number of physical blocks to be read from device */
	size_t plen;	/* length of data to be read from physical device */
	char *buf;	/* buffer area to fit physical device's block size */

	if (vd->block_size == 0) {
		/*
		 * The block size was not available during the attach,
		 * try to update it now.
		 */
		if (vd_setup_mediainfo(vd) != 0)
			return (EIO);
	}

	/*
	 * If the vdisk block size and the block size of the underlying device
	 * match we can skip straight to vd_do_scsi_rdwr(), otherwise we need
	 * to create a buffer large enough to handle the device's block size
	 * and adjust the block to be read from and the amount of data to
	 * read to correspond with the device's block size.
	 */
	if (vd->vdisk_block_size == vd->block_size)
		return (vd_do_scsi_rdwr(vd, operation, data, vblk, vlen));

	if (vd->vdisk_block_size > vd->block_size)
		return (EINVAL);

	/*
	 * Writing of physical block sizes larger than the virtual block size
	 * is not supported. This would be added if/when support for guests
	 * writing to DVDs is implemented.
	 */
	if (operation == VD_OP_BWRITE)
		return (ENOTSUP);

	/* BEGIN CSTYLED */
	/*
	 * Below is a diagram showing the relationship between the physical
	 * and virtual blocks. If the virtual blocks marked by 'X' below are
	 * requested, then the physical blocks denoted by 'Y' are read.
	 *
	 *           vblk
	 *             |      vlen
	 *             |<--------------->|
	 *             v                 v
	 *  --+--+--+--+--+--+--+--+--+--+--+--+--+--+--+-   virtual disk:
	 *    |  |  |  |XX|XX|XX|XX|XX|XX|  |  |  |  |  |  } block size is
	 *  --+--+--+--+--+--+--+--+--+--+--+--+--+--+--+-   vd->vdisk_block_size
	 *          :  :                 :  :
	 *         >:==:< delta          :  :
	 *          :  :                 :  :
	 *  --+-----+-----+-----+-----+-----+-----+-----+--  physical disk:
	 *    |     |YY:YY|YYYYY|YYYYY|YY:YY|     |     |  } block size is
	 *  --+-----+-----+-----+-----+-----+-----+-----+--  vd->block_size
	 *          ^                        ^
	 *          |<---------------------->|
	 *          |          plen
	 *         pblk
	 */
	/* END CSTYLED */
	pblk = (vblk * vd->vdisk_block_size) / vd->block_size;
	delta = (vblk * vd->vdisk_block_size) - (pblk * vd->block_size);
	pnblk = ((delta + vlen - 1) / vd->block_size) + 1;
	plen = pnblk * vd->block_size;

	PR2("vblk %lx:pblk %lx: vlen %ld:plen %ld", vblk, pblk, vlen, plen);
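
	/*
	 * Worked example: with vdisk_block_size = 512 and a 2048-byte
	 * physical block size (a typical DVD), a request for vblk = 5,
	 * vlen = 1024 yields pblk = 1, delta = 512, pnblk = 1 and
	 * plen = 2048: one physical block is read and the 1024 bytes
	 * starting at offset 512 within it are copied back out.
	 */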

	buf = kmem_zalloc(plen, KM_SLEEP);
	rv = vd_do_scsi_rdwr(vd, operation, (caddr_t)buf, pblk, plen);
	bcopy(buf + delta, data, vlen);

	kmem_free(buf, plen);

	return (rv);
}

/*
 * Return Values
 *	EINPROGRESS	- operation was successfully started
 *	EIO		- encountered LDC or task error
 *	0		- operation completed successfully
 *
 * Side Effect
 *	sets request->status = <disk operation status>
 */
static int
vd_start_bio(vd_task_t *task)
{
	int			rv, status = 0;
	vd_t			*vd = task->vd;
	vd_dring_payload_t	*request = task->request;
	struct buf		*buf = &task->buf;
	uint8_t			mtype;
	int			slice;
	char			*bufaddr = 0;
	size_t			buflen;

	ASSERT(vd != NULL);
	ASSERT(request != NULL);

	slice = request->slice;

	ASSERT(slice == VD_SLICE_NONE || slice < vd->nslices);
	ASSERT((request->operation == VD_OP_BREAD) ||
	    (request->operation == VD_OP_BWRITE));

	if (request->nbytes == 0) {
		/* no service for trivial requests */
		request->status = EINVAL;
		return (0);
	}

	PR1("%s %lu bytes at block %lu",
	    (request->operation == VD_OP_BREAD) ? "Read" : "Write",
	    request->nbytes, request->addr);

	/*
	 * We have to check the open flags because the functions processing
	 * the read/write request will not do it.
	 */
	if (request->operation == VD_OP_BWRITE && !(vd->open_flags & FWRITE)) {
		PR0("write fails because backend is opened read-only");
		request->nbytes = 0;
		request->status = EROFS;
		return (0);
	}

	mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP;

	/* Map memory exported by client */
	status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies,
	    mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R,
	    &bufaddr, NULL);
	if (status != 0) {
		PR0("ldc_mem_map() returned err %d ", status);
		return (EIO);
	}

	buflen = request->nbytes;

	status = ldc_mem_acquire(task->mhdl, 0, buflen);
	if (status != 0) {
		(void) ldc_mem_unmap(task->mhdl);
		PR0("ldc_mem_acquire() returned err %d ", status);
		return (EIO);
	}

	/* Start the block I/O */
	if (vd->file) {
		rv = vd_file_rw(vd, slice, request->operation, bufaddr,
		    request->addr, request->nbytes);
		if (rv < 0) {
			request->nbytes = 0;
			request->status = EIO;
		} else {
			request->nbytes = rv;
			request->status = 0;
		}
	} else {
		if (slice == VD_SLICE_NONE) {
			/*
			 * This is not a disk image so it is a real disk. We
			 * assume that the underlying device driver supports
			 * USCSICMD ioctls. This is the case of all SCSI devices
			 * (sd, ssd...).
			 *
			 * In the future if we have non-SCSI disks we would need
			 * to invoke the appropriate function to do I/O using an
			 * absolute disk offset (for example using DIOCTL_RWCMD
			 * for IDE disks).
			 */
			rv = vd_scsi_rdwr(vd, request->operation, bufaddr,
			    request->addr, request->nbytes);
			if (rv != 0) {
				request->nbytes = 0;
				request->status = EIO;
			} else {
				request->status = 0;
			}
		} else {
			bioinit(buf);
			buf->b_flags = B_BUSY;
			buf->b_bcount = request->nbytes;
			buf->b_lblkno = request->addr;
			buf->b_edev = vd->dev[slice];
			buf->b_un.b_addr = bufaddr;
			buf->b_flags |= (request->operation == VD_OP_BREAD)?
			    B_READ : B_WRITE;

			request->status =
			    ldi_strategy(vd->ldi_handle[slice], buf);

			/*
			 * This is to indicate to the caller that the request
			 * needs to be finished by vd_complete_bio() by calling
			 * biowait() there and waiting for that to return before
			 * triggering the notification of the vDisk client.
			 *
			 * This is necessary when writing to real disks as
			 * otherwise calls to ldi_strategy() would be serialized
			 * behind the calls to biowait() and performance would
			 * suffer.
			 */
			if (request->status == 0)
				return (EINPROGRESS);

			biofini(buf);
		}
	}

	/* Clean up after error */
	rv = ldc_mem_release(task->mhdl, 0, buflen);
	if (rv) {
		PR0("ldc_mem_release() returned err %d ", rv);
		status = EIO;
	}
	rv = ldc_mem_unmap(task->mhdl);
	if (rv) {
		PR0("ldc_mem_unmap() returned err %d ", rv);
		status = EIO;
	}

	return (status);
}

/*
 * This function should only be called from vd_notify() to ensure that
 * requests are responded to in the order that they are received.
 */
static int
send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
{
	int	status;
	size_t	nbytes;

	do {
		nbytes = msglen;
		status = ldc_write(ldc_handle, msg, &nbytes);
		if (status != EWOULDBLOCK)
			break;
		drv_usecwait(vds_ldc_delay);
	} while (status == EWOULDBLOCK);

	if (status != 0) {
		if (status != ECONNRESET)
			PR0("ldc_write() returned errno %d", status);
		return (status);
	} else if (nbytes != msglen) {
		PR0("ldc_write() performed only partial write");
		return (EIO);
	}

	PR1("SENT %lu bytes", msglen);
	return (0);
}

static void
vd_need_reset(vd_t *vd, boolean_t reset_ldc)
{
	mutex_enter(&vd->lock);
	vd->reset_state	= B_TRUE;
	vd->reset_ldc	= reset_ldc;
	mutex_exit(&vd->lock);
}

/*
 * Reset the state of the connection with a client, if needed; reset the LDC
 * transport as well, if needed. This function should only be called from
 * vd_recv_msg(), as it waits for tasks - otherwise a deadlock can occur.
 */
static void
vd_reset_if_needed(vd_t *vd)
{
	int	status = 0;

	mutex_enter(&vd->lock);
	if (!vd->reset_state) {
		ASSERT(!vd->reset_ldc);
		mutex_exit(&vd->lock);
		return;
	}
	mutex_exit(&vd->lock);

	PR0("Resetting connection state with %s", VD_CLIENT(vd));

	/*
	 * Let any asynchronous I/O complete before possibly pulling the rug
	 * out from under it; defer checking vd->reset_ldc, as one of the
	 * asynchronous tasks might set it
	 */
	ddi_taskq_wait(vd->completionq);

	if (vd->file) {
		status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred, NULL);
		if (status) {
			PR0("VOP_FSYNC returned errno %d", status);
		}
	}

	if ((vd->initialized & VD_DRING) &&
	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
		PR0("ldc_mem_dring_unmap() returned errno %d", status);

	vd_free_dring_task(vd);

	/* Free the staging buffer for msgs */
	if (vd->vio_msgp != NULL) {
		kmem_free(vd->vio_msgp, vd->max_msglen);
		vd->vio_msgp = NULL;
	}

	/* Free the inband message buffer */
	if (vd->inband_task.msg != NULL) {
		kmem_free(vd->inband_task.msg, vd->max_msglen);
		vd->inband_task.msg = NULL;
	}

	mutex_enter(&vd->lock);

	if (vd->reset_ldc)
		PR0("taking down LDC channel");
	if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0))
		PR0("ldc_down() returned errno %d", status);

	/* Reset exclusive access rights */
	vd_reset_access(vd);

	vd->initialized	&= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
	vd->state	= VD_STATE_INIT;
	vd->max_msglen	= sizeof (vio_msg_t);	/* baseline vio message size */

	/* Allocate the staging buffer */
	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);

	PR0("calling ldc_up\n");
	(void) ldc_up(vd->ldc_handle);

	vd->reset_state	= B_FALSE;
	vd->reset_ldc	= B_FALSE;

	mutex_exit(&vd->lock);
}

static void vd_recv_msg(void *arg);

static void
vd_mark_in_reset(vd_t *vd)
{
	int status;

	PR0("vd_mark_in_reset: marking vd in reset\n");

	vd_need_reset(vd, B_FALSE);
	status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP);
	if (status == DDI_FAILURE) {
		PR0("cannot schedule task to recv msg\n");
		vd_need_reset(vd, B_TRUE);
		return;
	}
}

static int
vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes)
{
	boolean_t		accepted;
	int			status;
	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);

	if (vd->reset_state)
		return (0);

	/* Acquire the element */
	if (!vd->reset_state &&
	    (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
			return (0);
		} else {
			PR0("ldc_mem_dring_acquire() returned errno %d",
			    status);
			return (status);
		}
	}

	/* Set the element's status and mark it done */
	accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED);
	if (accepted) {
		elem->payload.nbytes	= elem_nbytes;
		elem->payload.status	= elem_status;
		elem->hdr.dstate	= VIO_DESC_DONE;
	} else {
		/* Perhaps client timed out waiting for I/O... */
		PR0("element %u no longer \"accepted\"", idx);
		VD_DUMP_DRING_ELEM(elem);
	}
	/* Release the element */
	if (!vd->reset_state &&
	    (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
			return (0);
		} else {
			PR0("ldc_mem_dring_release() returned errno %d",
			    status);
			return (status);
		}
	}

	return (accepted ? 0 : EINVAL);
}
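
/*
 * Note: the ACCEPTED -> DONE transition above follows the VIO
 * descriptor ring protocol; the server only completes descriptors the
 * client has marked VIO_DESC_ACCEPTED, and the EINVAL return signals
 * that the descriptor had already left that state.
 */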
*/ 1586 PR0("element %u no longer \"accepted\"", idx); 1587 VD_DUMP_DRING_ELEM(elem); 1588 } 1589 /* Release the element */ 1590 if (!vd->reset_state && 1591 (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 1592 if (status == ECONNRESET) { 1593 vd_mark_in_reset(vd); 1594 return (0); 1595 } else { 1596 PR0("ldc_mem_dring_release() returned errno %d", 1597 status); 1598 return (status); 1599 } 1600 } 1601 1602 return (accepted ? 0 : EINVAL); 1603 } 1604 1605 /* 1606 * Return Values 1607 * 0 - operation completed successfully 1608 * EIO - encountered LDC / task error 1609 * 1610 * Side Effect 1611 * sets request->status = <disk operation status> 1612 */ 1613 static int 1614 vd_complete_bio(vd_task_t *task) 1615 { 1616 int status = 0; 1617 int rv = 0; 1618 vd_t *vd = task->vd; 1619 vd_dring_payload_t *request = task->request; 1620 struct buf *buf = &task->buf; 1621 1622 1623 ASSERT(vd != NULL); 1624 ASSERT(request != NULL); 1625 ASSERT(task->msg != NULL); 1626 ASSERT(task->msglen >= sizeof (*task->msg)); 1627 ASSERT(!vd->file); 1628 ASSERT(request->slice != VD_SLICE_NONE); 1629 1630 /* Wait for the I/O to complete [ call to ldi_strategy(9f) ] */ 1631 request->status = biowait(buf); 1632 1633 /* return back the number of bytes read/written */ 1634 request->nbytes = buf->b_bcount - buf->b_resid; 1635 1636 /* Release the buffer */ 1637 if (!vd->reset_state) 1638 status = ldc_mem_release(task->mhdl, 0, buf->b_bcount); 1639 if (status) { 1640 PR0("ldc_mem_release() returned errno %d copying to " 1641 "client", status); 1642 if (status == ECONNRESET) { 1643 vd_mark_in_reset(vd); 1644 } 1645 rv = EIO; 1646 } 1647 1648 /* Unmap the memory, even if in reset */ 1649 status = ldc_mem_unmap(task->mhdl); 1650 if (status) { 1651 PR0("ldc_mem_unmap() returned errno %d copying to client", 1652 status); 1653 if (status == ECONNRESET) { 1654 vd_mark_in_reset(vd); 1655 } 1656 rv = EIO; 1657 } 1658 1659 biofini(buf); 1660 1661 return (rv); 1662 } 1663 1664 /* 1665 * Description: 1666 * This function is called by the two functions called by a taskq 1667 * [ vd_complete_notify() and vd_serial_notify()) ] to send the 1668 * message to the client. 1669 * 1670 * Parameters: 1671 * arg - opaque pointer to structure containing task to be completed 1672 * 1673 * Return Values 1674 * None 1675 */ 1676 static void 1677 vd_notify(vd_task_t *task) 1678 { 1679 int status; 1680 1681 ASSERT(task != NULL); 1682 ASSERT(task->vd != NULL); 1683 1684 if (task->vd->reset_state) 1685 return; 1686 1687 /* 1688 * Send the "ack" or "nack" back to the client; if sending the message 1689 * via LDC fails, arrange to reset both the connection state and LDC 1690 * itself 1691 */ 1692 PR2("Sending %s", 1693 (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? 
"ACK" : "NACK"); 1694 1695 status = send_msg(task->vd->ldc_handle, task->msg, task->msglen); 1696 switch (status) { 1697 case 0: 1698 break; 1699 case ECONNRESET: 1700 vd_mark_in_reset(task->vd); 1701 break; 1702 default: 1703 PR0("initiating full reset"); 1704 vd_need_reset(task->vd, B_TRUE); 1705 break; 1706 } 1707 1708 DTRACE_PROBE1(task__end, vd_task_t *, task); 1709 } 1710 1711 /* 1712 * Description: 1713 * Mark the Dring entry as Done and (if necessary) send an ACK/NACK to 1714 * the vDisk client 1715 * 1716 * Parameters: 1717 * task - structure containing the request sent from client 1718 * 1719 * Return Values 1720 * None 1721 */ 1722 static void 1723 vd_complete_notify(vd_task_t *task) 1724 { 1725 int status = 0; 1726 vd_t *vd = task->vd; 1727 vd_dring_payload_t *request = task->request; 1728 1729 /* Update the dring element for a dring client */ 1730 if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE_V1_0)) { 1731 status = vd_mark_elem_done(vd, task->index, 1732 request->status, request->nbytes); 1733 if (status == ECONNRESET) 1734 vd_mark_in_reset(vd); 1735 } 1736 1737 /* 1738 * If a transport error occurred while marking the element done or 1739 * previously while executing the task, arrange to "nack" the message 1740 * when the final task in the descriptor element range completes 1741 */ 1742 if ((status != 0) || (task->status != 0)) 1743 task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 1744 1745 /* 1746 * Only the final task for a range of elements will respond to and 1747 * free the message 1748 */ 1749 if (task->type == VD_NONFINAL_RANGE_TASK) { 1750 return; 1751 } 1752 1753 vd_notify(task); 1754 } 1755 1756 /* 1757 * Description: 1758 * This is the basic completion function called to handle inband data 1759 * requests and handshake messages. All it needs to do is trigger a 1760 * message to the client that the request is completed. 
1761 * 1762 * Parameters: 1763 * arg - opaque pointer to structure containing task to be completed 1764 * 1765 * Return Values 1766 * None 1767 */ 1768 static void 1769 vd_serial_notify(void *arg) 1770 { 1771 vd_task_t *task = (vd_task_t *)arg; 1772 1773 ASSERT(task != NULL); 1774 vd_notify(task); 1775 } 1776 1777 /* ARGSUSED */ 1778 static int 1779 vd_geom2dk_geom(void *vd_buf, size_t vd_buf_len, void *ioctl_arg) 1780 { 1781 VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg); 1782 return (0); 1783 } 1784 1785 /* ARGSUSED */ 1786 static int 1787 vd_vtoc2vtoc(void *vd_buf, size_t vd_buf_len, void *ioctl_arg) 1788 { 1789 VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg); 1790 return (0); 1791 } 1792 1793 static void 1794 dk_geom2vd_geom(void *ioctl_arg, void *vd_buf) 1795 { 1796 DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf); 1797 } 1798 1799 static void 1800 vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf) 1801 { 1802 VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf); 1803 } 1804 1805 static int 1806 vd_get_efi_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg) 1807 { 1808 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 1809 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 1810 size_t data_len; 1811 1812 data_len = vd_buf_len - (sizeof (vd_efi_t) - sizeof (uint64_t)); 1813 if (vd_efi->length > data_len) 1814 return (EINVAL); 1815 1816 dk_efi->dki_lba = vd_efi->lba; 1817 dk_efi->dki_length = vd_efi->length; 1818 dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP); 1819 return (0); 1820 } 1821 1822 static void 1823 vd_get_efi_out(void *ioctl_arg, void *vd_buf) 1824 { 1825 int len; 1826 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 1827 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 1828 1829 len = vd_efi->length; 1830 DK_EFI2VD_EFI(dk_efi, vd_efi); 1831 kmem_free(dk_efi->dki_data, len); 1832 } 1833 1834 static int 1835 vd_set_efi_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg) 1836 { 1837 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 1838 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 1839 size_t data_len; 1840 1841 data_len = vd_buf_len - (sizeof (vd_efi_t) - sizeof (uint64_t)); 1842 if (vd_efi->length > data_len) 1843 return (EINVAL); 1844 1845 dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP); 1846 VD_EFI2DK_EFI(vd_efi, dk_efi); 1847 return (0); 1848 } 1849 1850 static void 1851 vd_set_efi_out(void *ioctl_arg, void *vd_buf) 1852 { 1853 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 1854 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 1855 1856 kmem_free(dk_efi->dki_data, vd_efi->length); 1857 } 1858 1859 static int 1860 vd_scsicmd_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg) 1861 { 1862 size_t vd_scsi_len; 1863 vd_scsi_t *vd_scsi = (vd_scsi_t *)vd_buf; 1864 struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl_arg; 1865 1866 /* check buffer size */ 1867 vd_scsi_len = VD_SCSI_SIZE; 1868 vd_scsi_len += P2ROUNDUP(vd_scsi->cdb_len, sizeof (uint64_t)); 1869 vd_scsi_len += P2ROUNDUP(vd_scsi->sense_len, sizeof (uint64_t)); 1870 vd_scsi_len += P2ROUNDUP(vd_scsi->datain_len, sizeof (uint64_t)); 1871 vd_scsi_len += P2ROUNDUP(vd_scsi->dataout_len, sizeof (uint64_t)); 1872 1873 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 1874 1875 if (vd_buf_len < vd_scsi_len) 1876 return (EINVAL); 1877 1878 /* set flags */ 1879 uscsi->uscsi_flags = vd_scsi_debug; 1880 1881 if (vd_scsi->options & VD_SCSI_OPT_NORETRY) { 1882 uscsi->uscsi_flags |= USCSI_ISOLATE; 1883 uscsi->uscsi_flags |= USCSI_DIAGNOSE; 1884 } 1885 1886 /* task attribute */ 1887 switch (vd_scsi->task_attribute) { 1888 case 
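
/*
 * The data-in and data-out cases above are mutually exclusive because
 * struct uscsi_cmd has a single data buffer
 * (uscsi_bufaddr/uscsi_buflen), whose transfer direction is selected
 * by the USCSI_READ flag.
 */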

static void
vd_scsicmd_out(void *ioctl_arg, void *vd_buf)
{
	vd_scsi_t *vd_scsi = (vd_scsi_t *)vd_buf;
	struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl_arg;

	/* output fields */
	vd_scsi->cmd_status = uscsi->uscsi_status;

	/* sense data */
	if ((uscsi->uscsi_flags & USCSI_RQENABLE) &&
	    (uscsi->uscsi_status == STATUS_CHECK ||
	    uscsi->uscsi_status == STATUS_TERMINATED)) {
		vd_scsi->sense_status = uscsi->uscsi_rqstatus;
		if (uscsi->uscsi_rqstatus == STATUS_GOOD)
			vd_scsi->sense_len -= uscsi->uscsi_rqresid;
		else
			vd_scsi->sense_len = 0;
	} else {
		vd_scsi->sense_len = 0;
	}

	if (uscsi->uscsi_status != STATUS_GOOD) {
		vd_scsi->dataout_len = 0;
		vd_scsi->datain_len = 0;
		return;
	}

	if (uscsi->uscsi_flags & USCSI_READ) {
		/* request data (read) */
		vd_scsi->datain_len -= uscsi->uscsi_resid;
		vd_scsi->dataout_len = 0;
	} else {
		/* request data (write) */
		vd_scsi->datain_len = 0;
		vd_scsi->dataout_len -= uscsi->uscsi_resid;
	}
}

static ushort_t
vd_lbl2cksum(struct dk_label *label)
{
	int	count;
	ushort_t sum, *sp;

	count =	(sizeof (struct dk_label)) / (sizeof (short)) - 1;
	sp = (ushort_t *)label;
	sum = 0;
	while (count--) {
		sum ^= *sp++;
	}

	return (sum);
}
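
/*
 * vd_lbl2cksum() XORs every 16-bit word of the label except the last
 * one, which is the dkl_cksum field itself; a label is considered
 * consistent when the stored checksum equals this XOR (see the
 * dkl_magic/dkl_cksum test in vd_file_validate_geometry() below).
 */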
1994 * 1995 * Return Values 1996 * 0 - Indicates that there are no errors in disk operations 1997 * ENOTSUP - Unknown disk label type or unsupported DKIO ioctl 1998 * EINVAL - Not enough room to copy the EFI label 1999 * 2000 */ 2001 static int 2002 vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg) 2003 { 2004 dk_efi_t *dk_ioc; 2005 int rval; 2006 2007 ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE); 2008 2009 if (cmd == DKIOCFLUSHWRITECACHE) { 2010 if (vd->file) { 2011 return (VOP_FSYNC(vd->file_vnode, FSYNC, kcred, NULL)); 2012 } else { 2013 return (ldi_ioctl(vd->ldi_handle[0], cmd, 2014 (intptr_t)ioctl_arg, vd->open_flags | FKIOCTL, 2015 kcred, &rval)); 2016 } 2017 } 2018 2019 switch (vd->vdisk_label) { 2020 2021 /* ioctls for a single slice disk with a VTOC label */ 2022 case VD_DISK_LABEL_VTOC: 2023 2024 switch (cmd) { 2025 case DKIOCGGEOM: 2026 ASSERT(ioctl_arg != NULL); 2027 bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom)); 2028 return (0); 2029 case DKIOCGVTOC: 2030 ASSERT(ioctl_arg != NULL); 2031 bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc)); 2032 return (0); 2033 default: 2034 return (ENOTSUP); 2035 } 2036 2037 /* ioctls for a single slice disk with an EFI label */ 2038 case VD_DISK_LABEL_EFI: 2039 2040 switch (cmd) { 2041 case DKIOCGETEFI: 2042 ASSERT(ioctl_arg != NULL); 2043 dk_ioc = (dk_efi_t *)ioctl_arg; 2044 2045 /* 2046 * For a single slice disk with an EFI label, we define 2047 * a fake EFI label with the GPT at LBA 1 and one GPE 2048 * at LBA 2. So we return the GPT or the GPE depending 2049 * on which LBA is requested. 2050 */ 2051 if (dk_ioc->dki_lba == 1) { 2052 2053 /* return the EFI GPT */ 2054 if (dk_ioc->dki_length < sizeof (efi_gpt_t)) 2055 return (EINVAL); 2056 2057 bcopy(&vd->efi_gpt, dk_ioc->dki_data, 2058 sizeof (efi_gpt_t)); 2059 2060 /* also return the GPE if possible */ 2061 if (dk_ioc->dki_length >= sizeof (efi_gpt_t) + 2062 sizeof (efi_gpe_t)) { 2063 bcopy(&vd->efi_gpe, dk_ioc->dki_data + 2064 1, sizeof (efi_gpe_t)); 2065 } 2066 2067 } else if (dk_ioc->dki_lba == 2) { 2068 2069 /* return the EFI GPE */ 2070 if (dk_ioc->dki_length < sizeof (efi_gpe_t)) 2071 return (EINVAL); 2072 2073 bcopy(&vd->efi_gpe, dk_ioc->dki_data, 2074 sizeof (efi_gpe_t)); 2075 2076 } else { 2077 return (EINVAL); 2078 } 2079 2080 return (0); 2081 default: 2082 return (ENOTSUP); 2083 } 2084 2085 default: 2086 /* Unknown disk label type */ 2087 return (ENOTSUP); 2088 } 2089 } 2090 2091 static int 2092 vds_efi_alloc_and_read(vd_t *vd, efi_gpt_t **gpt, efi_gpe_t **gpe) 2093 { 2094 vd_efi_dev_t edev; 2095 int status; 2096 2097 VD_EFI_DEV_SET(edev, vd, (vd_efi_ioctl_func)vd_backend_ioctl); 2098 2099 status = vd_efi_alloc_and_read(&edev, gpt, gpe); 2100 2101 return (status); 2102 } 2103 2104 static void 2105 vds_efi_free(vd_t *vd, efi_gpt_t *gpt, efi_gpe_t *gpe) 2106 { 2107 vd_efi_dev_t edev; 2108 2109 VD_EFI_DEV_SET(edev, vd, (vd_efi_ioctl_func)vd_backend_ioctl); 2110 2111 vd_efi_free(&edev, gpt, gpe); 2112 } 2113 2114 static int 2115 vd_file_validate_efi(vd_t *vd) 2116 { 2117 efi_gpt_t *gpt; 2118 efi_gpe_t *gpe; 2119 int i, nparts, status; 2120 struct uuid efi_reserved = EFI_RESERVED; 2121 2122 if ((status = vds_efi_alloc_and_read(vd, &gpt, &gpe)) != 0) 2123 return (status); 2124 2125 bzero(&vd->vtoc, sizeof (struct vtoc)); 2126 bzero(&vd->dk_geom, sizeof (struct dk_geom)); 2127 bzero(vd->slices, sizeof (vd_slice_t) * VD_MAXPART); 2128 2129 vd->efi_reserved = -1; 2130 2131 nparts = gpt->efi_gpt_NumberOfPartitionEntries; 2132 2133 for (i = 0; i < nparts && i < VD_MAXPART; i++) { 
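/*
 * A GPE whose starting or ending LBA is zero is an unused entry;
 * skip it so that the corresponding slice stays zeroed.
 */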
2134 2135 if (gpe[i].efi_gpe_StartingLBA == 0 || 2136 gpe[i].efi_gpe_EndingLBA == 0) { 2137 continue; 2138 } 2139 2140 vd->slices[i].start = gpe[i].efi_gpe_StartingLBA; 2141 vd->slices[i].nblocks = gpe[i].efi_gpe_EndingLBA - 2142 gpe[i].efi_gpe_StartingLBA + 1; 2143 2144 if (bcmp(&gpe[i].efi_gpe_PartitionTypeGUID, &efi_reserved, 2145 sizeof (struct uuid)) == 0) 2146 vd->efi_reserved = i; 2147 2148 } 2149 2150 ASSERT(vd->vdisk_size != 0); 2151 vd->slices[VD_EFI_WD_SLICE].start = 0; 2152 vd->slices[VD_EFI_WD_SLICE].nblocks = vd->vdisk_size; 2153 2154 vds_efi_free(vd, gpt, gpe); 2155 2156 return (status); 2157 } 2158 2159 /* 2160 * Function: 2161 * vd_file_validate_geometry 2162 * 2163 * Description: 2164 * Read the label and validate the geometry of a disk image. The driver 2165 * label, vtoc and geometry information are updated according to the 2166 * label read from the disk image. 2167 * 2168 * If no valid label is found, the label is set to unknown and the 2169 * function returns EINVAL, but a default vtoc and geometry are provided 2170 * to the driver. If an EFI label is found, ENOTSUP is returned. 2171 * 2172 * Parameters: 2173 * vd - disk on which the operation is performed. 2174 * 2175 * Return Code: 2176 * 0 - success. 2177 * EIO - error reading the label from the disk image. 2178 * EINVAL - unknown disk label. 2179 * ENOTSUP - geometry not applicable (EFI label). 2180 */ 2181 static int 2182 vd_file_validate_geometry(vd_t *vd) 2183 { 2184 struct dk_label label; 2185 struct dk_geom *geom = &vd->dk_geom; 2186 struct vtoc *vtoc = &vd->vtoc; 2187 int i; 2188 int status = 0; 2189 2190 ASSERT(vd->file); 2191 ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK); 2192 2193 if (VD_FILE_LABEL_READ(vd, &label) < 0) 2194 return (EIO); 2195 2196 if (label.dkl_magic != DKL_MAGIC || 2197 label.dkl_cksum != vd_lbl2cksum(&label) || 2198 (vd_file_validate_sanity && label.dkl_vtoc.v_sanity != VTOC_SANE) || 2199 label.dkl_vtoc.v_nparts != V_NUMPAR) { 2200 2201 if (vd_file_validate_efi(vd) == 0) { 2202 vd->vdisk_label = VD_DISK_LABEL_EFI; 2203 return (ENOTSUP); 2204 } 2205 2206 vd->vdisk_label = VD_DISK_LABEL_UNK; 2207 vd_file_build_default_label(vd, &label); 2208 status = EINVAL; 2209 } else { 2210 vd->vdisk_label = VD_DISK_LABEL_VTOC; 2211 } 2212 2213 /* Update the driver geometry */ 2214 bzero(geom, sizeof (struct dk_geom)); 2215 2216 geom->dkg_ncyl = label.dkl_ncyl; 2217 geom->dkg_acyl = label.dkl_acyl; 2218 geom->dkg_nhead = label.dkl_nhead; 2219 geom->dkg_nsect = label.dkl_nsect; 2220 geom->dkg_intrlv = label.dkl_intrlv; 2221 geom->dkg_apc = label.dkl_apc; 2222 geom->dkg_rpm = label.dkl_rpm; 2223 geom->dkg_pcyl = label.dkl_pcyl; 2224 geom->dkg_write_reinstruct = label.dkl_write_reinstruct; 2225 geom->dkg_read_reinstruct = label.dkl_read_reinstruct; 2226 2227 /* Update the driver vtoc */ 2228 bzero(vtoc, sizeof (struct vtoc)); 2229 2230 vtoc->v_sanity = label.dkl_vtoc.v_sanity; 2231 vtoc->v_version = label.dkl_vtoc.v_version; 2232 vtoc->v_sectorsz = DEV_BSIZE; 2233 vtoc->v_nparts = label.dkl_vtoc.v_nparts; 2234 2235 for (i = 0; i < vtoc->v_nparts; i++) { 2236 vtoc->v_part[i].p_tag = 2237 label.dkl_vtoc.v_part[i].p_tag; 2238 vtoc->v_part[i].p_flag = 2239 label.dkl_vtoc.v_part[i].p_flag; 2240 vtoc->v_part[i].p_start = 2241 label.dkl_map[i].dkl_cylno * 2242 (label.dkl_nhead * label.dkl_nsect); 2243 vtoc->v_part[i].p_size = label.dkl_map[i].dkl_nblk; 2244 vtoc->timestamp[i] = 2245 label.dkl_vtoc.v_timestamp[i]; 2246 } 2247 /* 2248 * The bootinfo array can not be copied with bcopy() because 2249 * elements are 
of type long in vtoc (so 64-bit) and of type 2250 * int in dk_vtoc (so 32-bit). 2251 */ 2252 vtoc->v_bootinfo[0] = label.dkl_vtoc.v_bootinfo[0]; 2253 vtoc->v_bootinfo[1] = label.dkl_vtoc.v_bootinfo[1]; 2254 vtoc->v_bootinfo[2] = label.dkl_vtoc.v_bootinfo[2]; 2255 bcopy(label.dkl_asciilabel, vtoc->v_asciilabel, 2256 LEN_DKL_ASCII); 2257 bcopy(label.dkl_vtoc.v_volume, vtoc->v_volume, 2258 LEN_DKL_VVOL); 2259 2260 /* Update logical partitions */ 2261 bzero(vd->slices, sizeof (vd_slice_t) * VD_MAXPART); 2262 if (vd->vdisk_label != VD_DISK_LABEL_UNK) { 2263 for (i = 0; i < vtoc->v_nparts; i++) { 2264 vd->slices[i].start = vtoc->v_part[i].p_start; 2265 vd->slices[i].nblocks = vtoc->v_part[i].p_size; 2266 } 2267 } 2268 2269 return (status); 2270 } 2271 2272 /* 2273 * Handle ioctls to a disk image (file-based). 2274 * 2275 * Return Values 2276 * 0 - Indicates that there are no errors 2277 * != 0 - Disk operation returned an error 2278 */ 2279 static int 2280 vd_do_file_ioctl(vd_t *vd, int cmd, void *ioctl_arg) 2281 { 2282 struct dk_label label; 2283 struct dk_geom *geom; 2284 struct vtoc *vtoc; 2285 dk_efi_t *efi; 2286 int i, rc; 2287 2288 ASSERT(vd->file); 2289 ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK); 2290 2291 switch (cmd) { 2292 2293 case DKIOCGGEOM: 2294 ASSERT(ioctl_arg != NULL); 2295 geom = (struct dk_geom *)ioctl_arg; 2296 2297 rc = vd_file_validate_geometry(vd); 2298 if (rc != 0 && rc != EINVAL) 2299 return (rc); 2300 bcopy(&vd->dk_geom, geom, sizeof (struct dk_geom)); 2301 return (0); 2302 2303 case DKIOCGVTOC: 2304 ASSERT(ioctl_arg != NULL); 2305 vtoc = (struct vtoc *)ioctl_arg; 2306 2307 rc = vd_file_validate_geometry(vd); 2308 if (rc != 0 && rc != EINVAL) 2309 return (rc); 2310 bcopy(&vd->vtoc, vtoc, sizeof (struct vtoc)); 2311 return (0); 2312 2313 case DKIOCSGEOM: 2314 ASSERT(ioctl_arg != NULL); 2315 geom = (struct dk_geom *)ioctl_arg; 2316 2317 if (geom->dkg_nhead == 0 || geom->dkg_nsect == 0) 2318 return (EINVAL); 2319 2320 /* 2321 * The current device geometry is not updated, just the driver 2322 * "notion" of it. The device geometry will be effectively 2323 * updated when a label is written to the device by a 2324 * subsequent DKIOCSVTOC.
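 * The conversions use the geometry saved here: for example (with
 * hypothetical values), if dkg_nhead = 16 and dkg_nsect = 63, a
 * partition with p_start = 100800 is stored in the label as
 * dkl_cylno = 100800 / (16 * 63) = 100, and converted back to
 * p_start = 100 * (16 * 63) = 100800 when the label is read.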
2325 */ 2326 bcopy(ioctl_arg, &vd->dk_geom, sizeof (vd->dk_geom)); 2327 return (0); 2328 2329 case DKIOCSVTOC: 2330 ASSERT(ioctl_arg != NULL); 2331 ASSERT(vd->dk_geom.dkg_nhead != 0 && 2332 vd->dk_geom.dkg_nsect != 0); 2333 vtoc = (struct vtoc *)ioctl_arg; 2334 2335 if (vtoc->v_sanity != VTOC_SANE || 2336 vtoc->v_sectorsz != DEV_BSIZE || 2337 vtoc->v_nparts != V_NUMPAR) 2338 return (EINVAL); 2339 2340 bzero(&label, sizeof (label)); 2341 label.dkl_ncyl = vd->dk_geom.dkg_ncyl; 2342 label.dkl_acyl = vd->dk_geom.dkg_acyl; 2343 label.dkl_pcyl = vd->dk_geom.dkg_pcyl; 2344 label.dkl_nhead = vd->dk_geom.dkg_nhead; 2345 label.dkl_nsect = vd->dk_geom.dkg_nsect; 2346 label.dkl_intrlv = vd->dk_geom.dkg_intrlv; 2347 label.dkl_apc = vd->dk_geom.dkg_apc; 2348 label.dkl_rpm = vd->dk_geom.dkg_rpm; 2349 label.dkl_write_reinstruct = vd->dk_geom.dkg_write_reinstruct; 2350 label.dkl_read_reinstruct = vd->dk_geom.dkg_read_reinstruct; 2351 2352 label.dkl_vtoc.v_nparts = V_NUMPAR; 2353 label.dkl_vtoc.v_sanity = VTOC_SANE; 2354 label.dkl_vtoc.v_version = vtoc->v_version; 2355 for (i = 0; i < V_NUMPAR; i++) { 2356 label.dkl_vtoc.v_timestamp[i] = 2357 vtoc->timestamp[i]; 2358 label.dkl_vtoc.v_part[i].p_tag = 2359 vtoc->v_part[i].p_tag; 2360 label.dkl_vtoc.v_part[i].p_flag = 2361 vtoc->v_part[i].p_flag; 2362 label.dkl_map[i].dkl_cylno = 2363 vtoc->v_part[i].p_start / 2364 (label.dkl_nhead * label.dkl_nsect); 2365 label.dkl_map[i].dkl_nblk = 2366 vtoc->v_part[i].p_size; 2367 } 2368 /* 2369 * The bootinfo array can not be copied with bcopy() because 2370 * elements are of type long in vtoc (so 64-bit) and of type 2371 * int in dk_vtoc (so 32-bit). 2372 */ 2373 label.dkl_vtoc.v_bootinfo[0] = vtoc->v_bootinfo[0]; 2374 label.dkl_vtoc.v_bootinfo[1] = vtoc->v_bootinfo[1]; 2375 label.dkl_vtoc.v_bootinfo[2] = vtoc->v_bootinfo[2]; 2376 bcopy(vtoc->v_asciilabel, label.dkl_asciilabel, 2377 LEN_DKL_ASCII); 2378 bcopy(vtoc->v_volume, label.dkl_vtoc.v_volume, 2379 LEN_DKL_VVOL); 2380 2381 /* re-compute checksum */ 2382 label.dkl_magic = DKL_MAGIC; 2383 label.dkl_cksum = vd_lbl2cksum(&label); 2384 2385 /* write label to the disk image */ 2386 if ((rc = vd_file_set_vtoc(vd, &label)) != 0) 2387 return (rc); 2388 2389 break; 2390 2391 case DKIOCFLUSHWRITECACHE: 2392 return (VOP_FSYNC(vd->file_vnode, FSYNC, kcred, NULL)); 2393 2394 case DKIOCGETEFI: 2395 ASSERT(ioctl_arg != NULL); 2396 efi = (dk_efi_t *)ioctl_arg; 2397 2398 if (vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, 2399 (caddr_t)efi->dki_data, efi->dki_lba, efi->dki_length) < 0) 2400 return (EIO); 2401 2402 return (0); 2403 2404 case DKIOCSETEFI: 2405 ASSERT(ioctl_arg != NULL); 2406 efi = (dk_efi_t *)ioctl_arg; 2407 2408 if (vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, 2409 (caddr_t)efi->dki_data, efi->dki_lba, efi->dki_length) < 0) 2410 return (EIO); 2411 2412 break; 2413 2414 2415 default: 2416 return (ENOTSUP); 2417 } 2418 2419 ASSERT(cmd == DKIOCSVTOC || cmd == DKIOCSETEFI); 2420 2421 /* label has changed, revalidate the geometry */ 2422 (void) vd_file_validate_geometry(vd); 2423 2424 /* 2425 * The disk geometry may have changed, so we need to write 2426 * the devid (if there is one) so that it is stored at the 2427 * right location. 
2428 */ 2429 if (vd_file_write_devid(vd, vd->file_devid) != 0) { 2430 PR0("Fail to write devid"); 2431 } 2432 2433 return (0); 2434 } 2435 2436 static int 2437 vd_backend_ioctl(vd_t *vd, int cmd, caddr_t arg) 2438 { 2439 int rval = 0, status; 2440 2441 /* 2442 * Call the appropriate function to execute the ioctl depending 2443 * on the type of vdisk. 2444 */ 2445 if (vd->vdisk_type == VD_DISK_TYPE_SLICE) { 2446 2447 /* slice, file or volume exported as a single slice disk */ 2448 status = vd_do_slice_ioctl(vd, cmd, arg); 2449 2450 } else if (vd->file) { 2451 2452 /* file or volume exported as a full disk */ 2453 status = vd_do_file_ioctl(vd, cmd, arg); 2454 2455 } else { 2456 2457 /* disk device exported as a full disk */ 2458 status = ldi_ioctl(vd->ldi_handle[0], cmd, (intptr_t)arg, 2459 vd->open_flags | FKIOCTL, kcred, &rval); 2460 } 2461 2462 #ifdef DEBUG 2463 if (rval != 0) { 2464 PR0("ioctl %x set rval = %d, which is not being returned" 2465 " to caller", cmd, rval); 2466 } 2467 #endif /* DEBUG */ 2468 2469 return (status); 2470 } 2471 2472 /* 2473 * Description: 2474 * This is the function that processes the ioctl requests (farming it 2475 * out to functions that handle slices, files or whole disks) 2476 * 2477 * Return Values 2478 * 0 - ioctl operation completed successfully 2479 * != 0 - The LDC error value encountered 2480 * (propagated back up the call stack as a task error) 2481 * 2482 * Side Effect 2483 * sets request->status to the return value of the ioctl function. 2484 */ 2485 static int 2486 vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl) 2487 { 2488 int status = 0; 2489 size_t nbytes = request->nbytes; /* modifiable copy */ 2490 2491 2492 ASSERT(request->slice < vd->nslices); 2493 PR0("Performing %s", ioctl->operation_name); 2494 2495 /* Get data from client and convert, if necessary */ 2496 if (ioctl->copyin != NULL) { 2497 ASSERT(nbytes != 0 && buf != NULL); 2498 PR1("Getting \"arg\" data from client"); 2499 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 2500 request->cookie, request->ncookies, 2501 LDC_COPY_IN)) != 0) { 2502 PR0("ldc_mem_copy() returned errno %d " 2503 "copying from client", status); 2504 return (status); 2505 } 2506 2507 /* Convert client's data, if necessary */ 2508 if (ioctl->copyin == VD_IDENTITY_IN) { 2509 /* use client buffer */ 2510 ioctl->arg = buf; 2511 } else { 2512 /* convert client vdisk operation data to ioctl data */ 2513 status = (ioctl->copyin)(buf, nbytes, 2514 (void *)ioctl->arg); 2515 if (status != 0) { 2516 request->status = status; 2517 return (0); 2518 } 2519 } 2520 } 2521 2522 if (ioctl->operation == VD_OP_SCSICMD) { 2523 struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl->arg; 2524 2525 /* check write permission */ 2526 if (!(vd->open_flags & FWRITE) && 2527 !(uscsi->uscsi_flags & USCSI_READ)) { 2528 PR0("uscsi fails because backend is opened read-only"); 2529 request->status = EROFS; 2530 return (0); 2531 } 2532 } 2533 2534 /* 2535 * Send the ioctl to the disk backend. 2536 */ 2537 request->status = vd_backend_ioctl(vd, ioctl->cmd, ioctl->arg); 2538 2539 if (request->status != 0) { 2540 PR0("ioctl(%s) = errno %d", ioctl->cmd_name, request->status); 2541 if (ioctl->operation == VD_OP_SCSICMD && 2542 ((struct uscsi_cmd *)ioctl->arg)->uscsi_status != 0) 2543 /* 2544 * USCSICMD has reported an error and the uscsi_status 2545 * field is not zero. This means that the SCSI command 2546 * has completed but it has an error. 
So we should 2547 * mark the VD operation as successfully completed 2548 * so that clients can check the SCSI status field for 2549 * SCSI errors. 2550 */ 2551 request->status = 0; 2552 else 2553 return (0); 2554 } 2555 2556 /* Convert data and send to client, if necessary */ 2557 if (ioctl->copyout != NULL) { 2558 ASSERT(nbytes != 0 && buf != NULL); 2559 PR1("Sending \"arg\" data to client"); 2560 2561 /* Convert ioctl data to vdisk operation data, if necessary */ 2562 if (ioctl->copyout != VD_IDENTITY_OUT) 2563 (ioctl->copyout)((void *)ioctl->arg, buf); 2564 2565 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 2566 request->cookie, request->ncookies, 2567 LDC_COPY_OUT)) != 0) { 2568 PR0("ldc_mem_copy() returned errno %d " 2569 "copying to client", status); 2570 return (status); 2571 } 2572 } 2573 2574 return (status); 2575 } 2576 2577 #define RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t)) 2578 2579 /* 2580 * Description: 2581 * This generic function is called by the task queue to complete 2582 * the processing of the tasks. The specific completion function 2583 * is passed in as a field in the task structure. 2584 * 2585 * Parameters: 2586 * arg - opaque pointer to structure containing task to be completed 2587 * 2588 * Return Values 2589 * None 2590 */ 2591 static void 2592 vd_complete(void *arg) 2593 { 2594 vd_task_t *task = (vd_task_t *)arg; 2595 2596 ASSERT(task != NULL); 2597 ASSERT(task->status == EINPROGRESS); 2598 ASSERT(task->completef != NULL); 2599 2600 task->status = task->completef(task); 2601 if (task->status) 2602 PR0("%s: Error %d completing task", __func__, task->status); 2603 2604 /* Now notify the vDisk client */ 2605 vd_complete_notify(task); 2606 } 2607 2608 static int 2609 vd_ioctl(vd_task_t *task) 2610 { 2611 int i, status; 2612 void *buf = NULL; 2613 struct dk_geom dk_geom = {0}; 2614 struct vtoc vtoc = {0}; 2615 struct dk_efi dk_efi = {0}; 2616 struct uscsi_cmd uscsi = {0}; 2617 vd_t *vd = task->vd; 2618 vd_dring_payload_t *request = task->request; 2619 vd_ioctl_t ioctl[] = { 2620 /* Command (no-copy) operations */ 2621 {VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0, 2622 DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE), 2623 NULL, NULL, NULL, B_TRUE}, 2624 2625 /* "Get" (copy-out) operations */ 2626 {VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int), 2627 DKIOCGETWCE, STRINGIZE(DKIOCGETWCE), 2628 NULL, VD_IDENTITY_IN, VD_IDENTITY_OUT, B_FALSE}, 2629 {VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM), 2630 RNDSIZE(vd_geom_t), 2631 DKIOCGGEOM, STRINGIZE(DKIOCGGEOM), 2632 &dk_geom, NULL, dk_geom2vd_geom, B_FALSE}, 2633 {VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t), 2634 DKIOCGVTOC, STRINGIZE(DKIOCGVTOC), 2635 &vtoc, NULL, vtoc2vd_vtoc, B_FALSE}, 2636 {VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t), 2637 DKIOCGETEFI, STRINGIZE(DKIOCGETEFI), 2638 &dk_efi, vd_get_efi_in, vd_get_efi_out, B_FALSE}, 2639 2640 /* "Set" (copy-in) operations */ 2641 {VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int), 2642 DKIOCSETWCE, STRINGIZE(DKIOCSETWCE), 2643 NULL, VD_IDENTITY_IN, VD_IDENTITY_OUT, B_TRUE}, 2644 {VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM), 2645 RNDSIZE(vd_geom_t), 2646 DKIOCSGEOM, STRINGIZE(DKIOCSGEOM), 2647 &dk_geom, vd_geom2dk_geom, NULL, B_TRUE}, 2648 {VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t), 2649 DKIOCSVTOC, STRINGIZE(DKIOCSVTOC), 2650 &vtoc, vd_vtoc2vtoc, NULL, B_TRUE}, 2651 {VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t), 2652 DKIOCSETEFI, STRINGIZE(DKIOCSETEFI), 2653
&dk_efi, vd_set_efi_in, vd_set_efi_out, B_TRUE}, 2654 2655 {VD_OP_SCSICMD, STRINGIZE(VD_OP_SCSICMD), RNDSIZE(vd_scsi_t), 2656 USCSICMD, STRINGIZE(USCSICMD), 2657 &uscsi, vd_scsicmd_in, vd_scsicmd_out, B_FALSE}, 2658 }; 2659 size_t nioctls = (sizeof (ioctl))/(sizeof (ioctl[0])); 2660 2661 2662 ASSERT(vd != NULL); 2663 ASSERT(request != NULL); 2664 ASSERT(request->slice < vd->nslices); 2665 2666 /* 2667 * Determine ioctl corresponding to caller's "operation" and 2668 * validate caller's "nbytes" 2669 */ 2670 for (i = 0; i < nioctls; i++) { 2671 if (request->operation == ioctl[i].operation) { 2672 /* LDC memory operations require 8-byte multiples */ 2673 ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0); 2674 2675 if (request->operation == VD_OP_GET_EFI || 2676 request->operation == VD_OP_SET_EFI || 2677 request->operation == VD_OP_SCSICMD) { 2678 if (request->nbytes >= ioctl[i].nbytes) 2679 break; 2680 PR0("%s: Expected at least nbytes = %lu, " 2681 "got %lu", ioctl[i].operation_name, 2682 ioctl[i].nbytes, request->nbytes); 2683 return (EINVAL); 2684 } 2685 2686 if (request->nbytes != ioctl[i].nbytes) { 2687 PR0("%s: Expected nbytes = %lu, got %lu", 2688 ioctl[i].operation_name, ioctl[i].nbytes, 2689 request->nbytes); 2690 return (EINVAL); 2691 } 2692 2693 break; 2694 } 2695 } 2696 ASSERT(i < nioctls); /* because "operation" already validated */ 2697 2698 if (!(vd->open_flags & FWRITE) && ioctl[i].write) { 2699 PR0("%s fails because backend is opened read-only", 2700 ioctl[i].operation_name); 2701 request->status = EROFS; 2702 return (0); 2703 } 2704 2705 if (request->nbytes) 2706 buf = kmem_zalloc(request->nbytes, KM_SLEEP); 2707 status = vd_do_ioctl(vd, request, buf, &ioctl[i]); 2708 if (request->nbytes) 2709 kmem_free(buf, request->nbytes); 2710 2711 return (status); 2712 } 2713 2714 static int 2715 vd_get_devid(vd_task_t *task) 2716 { 2717 vd_t *vd = task->vd; 2718 vd_dring_payload_t *request = task->request; 2719 vd_devid_t *vd_devid; 2720 impl_devid_t *devid; 2721 int status, bufid_len, devid_len, len, sz; 2722 int bufbytes; 2723 2724 PR1("Get Device ID, nbytes=%ld", request->nbytes); 2725 2726 if (vd->file) { 2727 if (vd->file_devid == NULL) { 2728 PR2("No Device ID"); 2729 request->status = ENOENT; 2730 return (0); 2731 } else { 2732 sz = ddi_devid_sizeof(vd->file_devid); 2733 devid = kmem_alloc(sz, KM_SLEEP); 2734 bcopy(vd->file_devid, devid, sz); 2735 } 2736 } else { 2737 if (ddi_lyr_get_devid(vd->dev[request->slice], 2738 (ddi_devid_t *)&devid) != DDI_SUCCESS) { 2739 PR2("No Device ID"); 2740 request->status = ENOENT; 2741 return (0); 2742 } 2743 } 2744 2745 bufid_len = request->nbytes - sizeof (vd_devid_t) + 1; 2746 devid_len = DEVID_GETLEN(devid); 2747 2748 /* 2749 * Save the buffer size here for use in deallocation. 2750 * The actual number of bytes copied is returned in 2751 * the 'nbytes' field of the request structure. 2752 */ 2753 bufbytes = request->nbytes; 2754 2755 vd_devid = kmem_zalloc(bufbytes, KM_SLEEP); 2756 vd_devid->length = devid_len; 2757 vd_devid->type = DEVID_GETTYPE(devid); 2758 2759 len = (devid_len > bufid_len)? 
bufid_len : devid_len; 2760 2761 bcopy(devid->did_id, vd_devid->id, len); 2762 2763 request->status = 0; 2764 2765 /* LDC memory operations require 8-byte multiples */ 2766 ASSERT(request->nbytes % sizeof (uint64_t) == 0); 2767 2768 if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0, 2769 &request->nbytes, request->cookie, request->ncookies, 2770 LDC_COPY_OUT)) != 0) { 2771 PR0("ldc_mem_copy() returned errno %d copying to client", 2772 status); 2773 } 2774 PR1("post mem_copy: nbytes=%ld", request->nbytes); 2775 2776 kmem_free(vd_devid, bufbytes); 2777 ddi_devid_free((ddi_devid_t)devid); 2778 2779 return (status); 2780 } 2781 2782 static int 2783 vd_scsi_reset(vd_t *vd) 2784 { 2785 int rval, status; 2786 struct uscsi_cmd uscsi = { 0 }; 2787 2788 uscsi.uscsi_flags = vd_scsi_debug | USCSI_RESET; 2789 uscsi.uscsi_timeout = vd_scsi_rdwr_timeout; 2790 2791 status = ldi_ioctl(vd->ldi_handle[0], USCSICMD, (intptr_t)&uscsi, 2792 (vd->open_flags | FKIOCTL), kcred, &rval); 2793 2794 return (status); 2795 } 2796 2797 static int 2798 vd_reset(vd_task_t *task) 2799 { 2800 vd_t *vd = task->vd; 2801 vd_dring_payload_t *request = task->request; 2802 2803 ASSERT(request->operation == VD_OP_RESET); 2804 ASSERT(vd->scsi); 2805 2806 PR0("Performing VD_OP_RESET"); 2807 2808 if (request->nbytes != 0) { 2809 PR0("VD_OP_RESET: Expected nbytes = 0, got %lu", 2810 request->nbytes); 2811 return (EINVAL); 2812 } 2813 2814 request->status = vd_scsi_reset(vd); 2815 2816 return (0); 2817 } 2818 2819 static int 2820 vd_get_capacity(vd_task_t *task) 2821 { 2822 int rv; 2823 size_t nbytes; 2824 vd_t *vd = task->vd; 2825 vd_dring_payload_t *request = task->request; 2826 vd_capacity_t vd_cap = { 0 }; 2827 2828 ASSERT(request->operation == VD_OP_GET_CAPACITY); 2829 ASSERT(vd->scsi); 2830 2831 PR0("Performing VD_OP_GET_CAPACITY"); 2832 2833 nbytes = request->nbytes; 2834 2835 if (nbytes != RNDSIZE(vd_capacity_t)) { 2836 PR0("VD_OP_GET_CAPACITY: Expected nbytes = %lu, got %lu", 2837 RNDSIZE(vd_capacity_t), nbytes); 2838 return (EINVAL); 2839 } 2840 2841 if (vd->vdisk_size == VD_SIZE_UNKNOWN) { 2842 if (vd_setup_mediainfo(vd) != 0) 2843 ASSERT(vd->vdisk_size == VD_SIZE_UNKNOWN); 2844 } 2845 2846 ASSERT(vd->vdisk_size != 0); 2847 2848 request->status = 0; 2849 2850 vd_cap.vdisk_block_size = vd->vdisk_block_size; 2851 vd_cap.vdisk_size = vd->vdisk_size; 2852 2853 if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&vd_cap, 0, &nbytes, 2854 request->cookie, request->ncookies, LDC_COPY_OUT)) != 0) { 2855 PR0("ldc_mem_copy() returned errno %d copying to client", rv); 2856 return (rv); 2857 } 2858 2859 return (0); 2860 } 2861 2862 static int 2863 vd_get_access(vd_task_t *task) 2864 { 2865 uint64_t access; 2866 int rv, rval = 0; 2867 size_t nbytes; 2868 vd_t *vd = task->vd; 2869 vd_dring_payload_t *request = task->request; 2870 2871 ASSERT(request->operation == VD_OP_GET_ACCESS); 2872 ASSERT(vd->scsi); 2873 2874 PR0("Performing VD_OP_GET_ACCESS"); 2875 2876 nbytes = request->nbytes; 2877 2878 if (nbytes != sizeof (uint64_t)) { 2879 PR0("VD_OP_GET_ACCESS: Expected nbytes = %lu, got %lu", 2880 sizeof (uint64_t), nbytes); 2881 return (EINVAL); 2882 } 2883 2884 request->status = ldi_ioctl(vd->ldi_handle[request->slice], MHIOCSTATUS, 2885 NULL, (vd->open_flags | FKIOCTL), kcred, &rval); 2886 2887 if (request->status != 0) 2888 return (0); 2889 2890 access = (rval == 0)? 
VD_ACCESS_ALLOWED : VD_ACCESS_DENIED; 2891 2892 if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&access, 0, &nbytes, 2893 request->cookie, request->ncookies, LDC_COPY_OUT)) != 0) { 2894 PR0("ldc_mem_copy() returned errno %d copying to client", rv); 2895 return (rv); 2896 } 2897 2898 return (0); 2899 } 2900 2901 static int 2902 vd_set_access(vd_task_t *task) 2903 { 2904 uint64_t flags; 2905 int rv, rval; 2906 size_t nbytes; 2907 vd_t *vd = task->vd; 2908 vd_dring_payload_t *request = task->request; 2909 2910 ASSERT(request->operation == VD_OP_SET_ACCESS); 2911 ASSERT(vd->scsi); 2912 2913 nbytes = request->nbytes; 2914 2915 if (nbytes != sizeof (uint64_t)) { 2916 PR0("VD_OP_SET_ACCESS: Expected nbytes = %lu, got %lu", 2917 sizeof (uint64_t), nbytes); 2918 return (EINVAL); 2919 } 2920 2921 if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&flags, 0, &nbytes, 2922 request->cookie, request->ncookies, LDC_COPY_IN)) != 0) { 2923 PR0("ldc_mem_copy() returned errno %d copying from client", rv); 2924 return (rv); 2925 } 2926 2927 if (flags == VD_ACCESS_SET_CLEAR) { 2928 PR0("Performing VD_OP_SET_ACCESS (CLEAR)"); 2929 request->status = ldi_ioctl(vd->ldi_handle[request->slice], 2930 MHIOCRELEASE, NULL, (vd->open_flags | FKIOCTL), kcred, 2931 &rval); 2932 if (request->status == 0) 2933 vd->ownership = B_FALSE; 2934 return (0); 2935 } 2936 2937 /* 2938 * As per the VIO spec, the PREEMPT and PRESERVE flags are only valid 2939 * when the EXCLUSIVE flag is set. 2940 */ 2941 if (!(flags & VD_ACCESS_SET_EXCLUSIVE)) { 2942 PR0("Invalid VD_OP_SET_ACCESS flags: 0x%lx", flags); 2943 request->status = EINVAL; 2944 return (0); 2945 } 2946 2947 switch (flags & (VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE)) { 2948 2949 case VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE: 2950 /* 2951 * Flags EXCLUSIVE and PREEMPT and PRESERVE. We have to 2952 * acquire exclusive access rights, preserve them and we 2953 * can use preemption. So we can use the MHIOCTKOWN ioctl. 2954 */ 2955 PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PREEMPT|PRESERVE)"); 2956 request->status = ldi_ioctl(vd->ldi_handle[request->slice], 2957 MHIOCTKOWN, NULL, (vd->open_flags | FKIOCTL), kcred, &rval); 2958 break; 2959 2960 case VD_ACCESS_SET_PRESERVE: 2961 /* 2962 * Flags EXCLUSIVE and PRESERVE. We have to acquire exclusive 2963 * access rights and preserve them, but not preempt any other 2964 * host. So we need to use the MHIOCTKOWN ioctl to enable the 2965 * "preserve" feature, but we cannot call it directly 2966 * because it uses preemption. So before that, we use the 2967 * MHIOCQRESERVE ioctl to ensure we can get exclusive rights 2968 * without preempting anyone. 2969 */ 2970 PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PRESERVE)"); 2971 request->status = ldi_ioctl(vd->ldi_handle[request->slice], 2972 MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred, 2973 &rval); 2974 if (request->status != 0) 2975 break; 2976 request->status = ldi_ioctl(vd->ldi_handle[request->slice], 2977 MHIOCTKOWN, NULL, (vd->open_flags | FKIOCTL), kcred, &rval); 2978 break; 2979 2980 case VD_ACCESS_SET_PREEMPT: 2981 /* 2982 * Flags EXCLUSIVE and PREEMPT. We have to acquire exclusive 2983 * access rights and we can use preemption. So we try to do 2984 * a SCSI reservation; if it fails, we reset the disk to clear 2985 * any reservation and try to reserve again.
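 * The resulting sequence is: MHIOCQRESERVE; on failure, a SCSI reset
 * via vd_scsi_reset(); then MHIOCQRESERVE once more, whether or not
 * the reset succeeded.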
2986 */ 2987 PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PREEMPT)"); 2988 request->status = ldi_ioctl(vd->ldi_handle[request->slice], 2989 MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred, 2990 &rval); 2991 if (request->status == 0) 2992 break; 2993 2994 /* reset the disk */ 2995 (void) vd_scsi_reset(vd); 2996 2997 /* try again even if the reset has failed */ 2998 request->status = ldi_ioctl(vd->ldi_handle[request->slice], 2999 MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred, 3000 &rval); 3001 break; 3002 3003 case 0: 3004 /* Flag EXCLUSIVE only. Just issue a SCSI reservation */ 3005 PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE)"); 3006 request->status = ldi_ioctl(vd->ldi_handle[request->slice], 3007 MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred, 3008 &rval); 3009 break; 3010 } 3011 3012 if (request->status == 0) 3013 vd->ownership = B_TRUE; 3014 else 3015 PR0("VD_OP_SET_ACCESS: error %d", request->status); 3016 3017 return (0); 3018 } 3019 3020 static void 3021 vd_reset_access(vd_t *vd) 3022 { 3023 int status, rval; 3024 3025 if (vd->file || !vd->ownership) 3026 return; 3027 3028 PR0("Releasing disk ownership"); 3029 status = ldi_ioctl(vd->ldi_handle[0], MHIOCRELEASE, NULL, 3030 (vd->open_flags | FKIOCTL), kcred, &rval); 3031 3032 /* 3033 * An EACCES failure means that there is a reservation conflict, 3034 * so we are not the owner of the disk anymore. 3035 */ 3036 if (status == 0 || status == EACCES) { 3037 vd->ownership = B_FALSE; 3038 return; 3039 } 3040 3041 PR0("Fail to release ownership, error %d", status); 3042 3043 /* 3044 * We have failed to release the ownership, try to reset the disk 3045 * to release reservations. 3046 */ 3047 PR0("Resetting disk"); 3048 status = vd_scsi_reset(vd); 3049 3050 if (status != 0) 3051 PR0("Fail to reset disk, error %d", status); 3052 3053 /* whatever the result of the reset is, we try the release again */ 3054 status = ldi_ioctl(vd->ldi_handle[0], MHIOCRELEASE, NULL, 3055 (vd->open_flags | FKIOCTL), kcred, &rval); 3056 3057 if (status == 0 || status == EACCES) { 3058 vd->ownership = B_FALSE; 3059 return; 3060 } 3061 3062 PR0("Fail to release ownership, error %d", status); 3063 3064 /* 3065 * At this point we have done our best to try to reset the 3066 * access rights to the disk and we don't know if we still 3067 * own a reservation and if any mechanism to preserve the 3068 * ownership is still in place. The ultimate solution would 3069 * be to reset the system but this is usually not what we 3070 * want to happen. 
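 * What happens next depends on the vd_reset_access_failure tunable:
 * reboot the system, panic to force a crash dump, or just log a
 * warning and continue.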
3071 */ 3072 3073 if (vd_reset_access_failure == A_REBOOT) { 3074 cmn_err(CE_WARN, VD_RESET_ACCESS_FAILURE_MSG 3075 ", rebooting the system", vd->device_path); 3076 (void) uadmin(A_SHUTDOWN, AD_BOOT, NULL); 3077 } else if (vd_reset_access_failure == A_DUMP) { 3078 panic(VD_RESET_ACCESS_FAILURE_MSG, vd->device_path); 3079 } 3080 3081 cmn_err(CE_WARN, VD_RESET_ACCESS_FAILURE_MSG, vd->device_path); 3082 } 3083 3084 /* 3085 * Define the supported operations once the functions for performing them have 3086 * been defined 3087 */ 3088 static const vds_operation_t vds_operation[] = { 3089 #define X(_s) #_s, _s 3090 {X(VD_OP_BREAD), vd_start_bio, vd_complete_bio}, 3091 {X(VD_OP_BWRITE), vd_start_bio, vd_complete_bio}, 3092 {X(VD_OP_FLUSH), vd_ioctl, NULL}, 3093 {X(VD_OP_GET_WCE), vd_ioctl, NULL}, 3094 {X(VD_OP_SET_WCE), vd_ioctl, NULL}, 3095 {X(VD_OP_GET_VTOC), vd_ioctl, NULL}, 3096 {X(VD_OP_SET_VTOC), vd_ioctl, NULL}, 3097 {X(VD_OP_GET_DISKGEOM), vd_ioctl, NULL}, 3098 {X(VD_OP_SET_DISKGEOM), vd_ioctl, NULL}, 3099 {X(VD_OP_GET_EFI), vd_ioctl, NULL}, 3100 {X(VD_OP_SET_EFI), vd_ioctl, NULL}, 3101 {X(VD_OP_GET_DEVID), vd_get_devid, NULL}, 3102 {X(VD_OP_SCSICMD), vd_ioctl, NULL}, 3103 {X(VD_OP_RESET), vd_reset, NULL}, 3104 {X(VD_OP_GET_CAPACITY), vd_get_capacity, NULL}, 3105 {X(VD_OP_SET_ACCESS), vd_set_access, NULL}, 3106 {X(VD_OP_GET_ACCESS), vd_get_access, NULL}, 3107 #undef X 3108 }; 3109 3110 static const size_t vds_noperations = 3111 (sizeof (vds_operation))/(sizeof (vds_operation[0])); 3112 3113 /* 3114 * Process a task specifying a client I/O request 3115 * 3116 * Parameters: 3117 * task - structure containing the request sent from client 3118 * 3119 * Return Value 3120 * 0 - success 3121 * ENOTSUP - Unknown/Unsupported VD_OP_XXX operation 3122 * EINVAL - Invalid disk slice 3123 * != 0 - some other non-zero return value from start function 3124 */ 3125 static int 3126 vd_do_process_task(vd_task_t *task) 3127 { 3128 int i; 3129 vd_t *vd = task->vd; 3130 vd_dring_payload_t *request = task->request; 3131 3132 ASSERT(vd != NULL); 3133 ASSERT(request != NULL); 3134 3135 /* Find the requested operation */ 3136 for (i = 0; i < vds_noperations; i++) { 3137 if (request->operation == vds_operation[i].operation) { 3138 /* all operations should have a start func */ 3139 ASSERT(vds_operation[i].start != NULL); 3140 3141 task->completef = vds_operation[i].complete; 3142 break; 3143 } 3144 } 3145 3146 /* 3147 * We need to check that the requested operation is permitted 3148 * for the particular client that sent it or that the loop above 3149 * did not complete without finding the operation type (indicating 3150 * that the requested operation is unknown/unimplemented) 3151 */ 3152 if ((VD_OP_SUPPORTED(vd->operations, request->operation) == B_FALSE) || 3153 (i == vds_noperations)) { 3154 PR0("Unsupported operation %u", request->operation); 3155 request->status = ENOTSUP; 3156 return (0); 3157 } 3158 3159 /* Range-check slice */ 3160 if (request->slice >= vd->nslices && 3161 (vd->vdisk_type != VD_DISK_TYPE_DISK || 3162 request->slice != VD_SLICE_NONE)) { 3163 PR0("Invalid \"slice\" %u (max %u) for virtual disk", 3164 request->slice, (vd->nslices - 1)); 3165 return (EINVAL); 3166 } 3167 3168 /* 3169 * Call the function pointer that starts the operation. 
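 * A start function either completes the request synchronously,
 * returning zero or an error, or returns EINPROGRESS, in which case
 * the "complete" function saved in task->completef finishes the
 * request later on the completion taskq.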
3170 */ 3171 return (vds_operation[i].start(task)); 3172 } 3173 3174 /* 3175 * Description: 3176 * This function is called by both the in-band and descriptor ring 3177 * message processing paths to actually execute the task 3178 * requested by the vDisk client. It in turn calls its worker 3179 * function, vd_do_process_task(), to carry out the request. 3180 * 3181 * Any transport errors (e.g. LDC errors, vDisk protocol errors) are 3182 * saved in the 'status' field of the task and are propagated back 3183 * up the call stack to trigger a NACK 3184 * 3185 * Any request errors (e.g. ENOTTY from an ioctl) are saved in 3186 * the 'status' field of the request and result in an ACK being sent 3187 * by the completion handler. 3188 * 3189 * Parameters: 3190 * task - structure containing the request sent from client 3191 * 3192 * Return Value 3193 * 0 - successful synchronous request. 3194 * != 0 - transport error (e.g. LDC errors, vDisk protocol) 3195 * EINPROGRESS - task will be finished in a completion handler 3196 */ 3197 static int 3198 vd_process_task(vd_task_t *task) 3199 { 3200 vd_t *vd = task->vd; 3201 int status; 3202 3203 DTRACE_PROBE1(task__start, vd_task_t *, task); 3204 3205 task->status = vd_do_process_task(task); 3206 3207 /* 3208 * If the task processing function returned EINPROGRESS indicating 3209 * that the task needs completing, then schedule a taskq entry to 3210 * finish it now. 3211 * 3212 * Otherwise the task processing function returned either zero 3213 * indicating that the task was finished in the start function (and we 3214 * don't need to wait in a completion function) or the start function 3215 * returned an error - in both cases all that needs to happen is the 3216 * notification to the vDisk client higher up the call stack. 3217 * If the task was using a Descriptor Ring, we need to mark it as done 3218 * at this stage. 3219 */ 3220 if (task->status == EINPROGRESS) { 3221 /* Queue a task to complete the operation */ 3222 (void) ddi_taskq_dispatch(vd->completionq, vd_complete, 3223 task, DDI_SLEEP); 3224 3225 } else if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE_V1_0)) { 3226 /* Update the dring element if it's a dring client */ 3227 status = vd_mark_elem_done(vd, task->index, 3228 task->request->status, task->request->nbytes); 3229 if (status == ECONNRESET) 3230 vd_mark_in_reset(vd); 3231 } 3232 3233 return (task->status); 3234 } 3235 3236 /* 3237 * Return true if the "type", "subtype", and "env" fields of the "tag" first 3238 * argument match the corresponding remaining arguments; otherwise, return false 3239 */ 3240 boolean_t 3241 vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env) 3242 { 3243 return ((tag->vio_msgtype == type) && 3244 (tag->vio_subtype == subtype) && 3245 (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE; 3246 } 3247 3248 /* 3249 * Check whether the major/minor version specified in "ver_msg" is supported 3250 * by this server.
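 * The entries of vds_version[] are sorted in strictly decreasing
 * major order. A matching major version gets its minor adjusted down
 * to what the server supports (and is "ack"ed); a higher major
 * version is answered with the closest supported pair (and "nack"ed);
 * when the list is exhausted, the version is grounded to 0.0 to end
 * the negotiation.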
3251 */ 3252 static boolean_t 3253 vds_supported_version(vio_ver_msg_t *ver_msg) 3254 { 3255 for (int i = 0; i < vds_num_versions; i++) { 3256 ASSERT(vds_version[i].major > 0); 3257 ASSERT((i == 0) || 3258 (vds_version[i].major < vds_version[i-1].major)); 3259 3260 /* 3261 * If the major versions match, adjust the minor version, if 3262 * necessary, down to the highest value supported by this 3263 * server and return true so this message will get "ack"ed; 3264 * the client should also support all minor versions lower 3265 * than the value it sent 3266 */ 3267 if (ver_msg->ver_major == vds_version[i].major) { 3268 if (ver_msg->ver_minor > vds_version[i].minor) { 3269 PR0("Adjusting minor version from %u to %u", 3270 ver_msg->ver_minor, vds_version[i].minor); 3271 ver_msg->ver_minor = vds_version[i].minor; 3272 } 3273 return (B_TRUE); 3274 } 3275 3276 /* 3277 * If the message contains a higher major version number, set 3278 * the message's major/minor versions to the current values 3279 * and return false, so this message will get "nack"ed with 3280 * these values, and the client will potentially try again 3281 * with the same or a lower version 3282 */ 3283 if (ver_msg->ver_major > vds_version[i].major) { 3284 ver_msg->ver_major = vds_version[i].major; 3285 ver_msg->ver_minor = vds_version[i].minor; 3286 return (B_FALSE); 3287 } 3288 3289 /* 3290 * Otherwise, the message's major version is less than the 3291 * current major version, so continue the loop to the next 3292 * (lower) supported version 3293 */ 3294 } 3295 3296 /* 3297 * No common version was found; "ground" the version pair in the 3298 * message to terminate negotiation 3299 */ 3300 ver_msg->ver_major = 0; 3301 ver_msg->ver_minor = 0; 3302 return (B_FALSE); 3303 } 3304 3305 /* 3306 * Process a version message from a client. vds expects to receive version 3307 * messages from clients seeking service, but never issues version messages 3308 * itself; therefore, vds can ACK or NACK client version messages, but does 3309 * not expect to receive version-message ACKs or NACKs (and will treat such 3310 * messages as invalid). 3311 */ 3312 static int 3313 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3314 { 3315 vio_ver_msg_t *ver_msg = (vio_ver_msg_t *)msg; 3316 3317 3318 ASSERT(msglen >= sizeof (msg->tag)); 3319 3320 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 3321 VIO_VER_INFO)) { 3322 return (ENOMSG); /* not a version message */ 3323 } 3324 3325 if (msglen != sizeof (*ver_msg)) { 3326 PR0("Expected %lu-byte version message; " 3327 "received %lu bytes", sizeof (*ver_msg), msglen); 3328 return (EBADMSG); 3329 } 3330 3331 if (ver_msg->dev_class != VDEV_DISK) { 3332 PR0("Expected device class %u (disk); received %u", 3333 VDEV_DISK, ver_msg->dev_class); 3334 return (EBADMSG); 3335 } 3336 3337 /* 3338 * We're talking to the expected kind of client; set our device class 3339 * for "ack/nack" back to the client 3340 */ 3341 ver_msg->dev_class = VDEV_DISK_SERVER; 3342 3343 /* 3344 * Check whether the (valid) version message specifies a version 3345 * supported by this server. 
If the version is not supported, return 3346 * EBADMSG so the message will get "nack"ed; vds_supported_version() 3347 * will have updated the message with a supported version for the 3348 * client to consider 3349 */ 3350 if (!vds_supported_version(ver_msg)) 3351 return (EBADMSG); 3352 3353 3354 /* 3355 * A version has been agreed upon; use the client's SID for 3356 * communication on this channel now 3357 */ 3358 ASSERT(!(vd->initialized & VD_SID)); 3359 vd->sid = ver_msg->tag.vio_sid; 3360 vd->initialized |= VD_SID; 3361 3362 /* 3363 * Store the negotiated major and minor version values in the "vd" data 3364 * structure so that we can check if certain operations are supported 3365 * by the client. 3366 */ 3367 vd->version.major = ver_msg->ver_major; 3368 vd->version.minor = ver_msg->ver_minor; 3369 3370 PR0("Using major version %u, minor version %u", 3371 ver_msg->ver_major, ver_msg->ver_minor); 3372 return (0); 3373 } 3374 3375 static void 3376 vd_set_exported_operations(vd_t *vd) 3377 { 3378 vd->operations = 0; /* clear field */ 3379 3380 /* 3381 * We need to check from the highest version supported to the 3382 * lowest because versions with a higher minor number implicitly 3383 * support versions with a lower minor number. 3384 */ 3385 if (vio_ver_is_supported(vd->version, 1, 1)) { 3386 ASSERT(vd->open_flags & FREAD); 3387 vd->operations |= VD_OP_MASK_READ; 3388 3389 if (vd->open_flags & FWRITE) 3390 vd->operations |= VD_OP_MASK_WRITE; 3391 3392 if (vd->scsi) 3393 vd->operations |= VD_OP_MASK_SCSI; 3394 3395 if (vd->file && vd_file_is_iso_image(vd)) { 3396 /* 3397 * can't write to ISO images, so make sure that write 3398 * support is not set in case the administrator did not 3399 * use "options=ro" when doing an ldm add-vdsdev 3400 */ 3401 vd->operations &= ~VD_OP_MASK_WRITE; 3402 } 3403 } else if (vio_ver_is_supported(vd->version, 1, 0)) { 3404 vd->operations = VD_OP_MASK_READ | VD_OP_MASK_WRITE; 3405 } 3406 3407 /* we should have already agreed on a version */ 3408 ASSERT(vd->operations != 0); 3409 } 3410 3411 static int 3412 vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3413 { 3414 vd_attr_msg_t *attr_msg = (vd_attr_msg_t *)msg; 3415 int status, retry = 0; 3416 3417 3418 ASSERT(msglen >= sizeof (msg->tag)); 3419 3420 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 3421 VIO_ATTR_INFO)) { 3422 PR0("Message is not an attribute message"); 3423 return (ENOMSG); 3424 } 3425 3426 if (msglen != sizeof (*attr_msg)) { 3427 PR0("Expected %lu-byte attribute message; " 3428 "received %lu bytes", sizeof (*attr_msg), msglen); 3429 return (EBADMSG); 3430 } 3431 3432 if (attr_msg->max_xfer_sz == 0) { 3433 PR0("Received maximum transfer size of 0 from client"); 3434 return (EBADMSG); 3435 } 3436 3437 if ((attr_msg->xfer_mode != VIO_DESC_MODE) && 3438 (attr_msg->xfer_mode != VIO_DRING_MODE_V1_0)) { 3439 PR0("Client requested unsupported transfer mode"); 3440 return (EBADMSG); 3441 } 3442 3443 /* 3444 * check if the underlying disk is ready; if not, try accessing 3445 * the device again.
Open the vdisk device and extract info 3446 * about it, as this is needed to respond to the attr info msg 3447 */ 3448 if ((vd->initialized & VD_DISK_READY) == 0) { 3449 PR0("Retry setting up disk (%s)", vd->device_path); 3450 do { 3451 status = vd_setup_vd(vd); 3452 if (status != EAGAIN || ++retry > vds_dev_retries) 3453 break; 3454 3455 /* incremental delay */ 3456 delay(drv_usectohz(vds_dev_delay)); 3457 3458 /* if vdisk is no longer enabled - return error */ 3459 if (!vd_enabled(vd)) 3460 return (ENXIO); 3461 3462 } while (status == EAGAIN); 3463 3464 if (status) 3465 return (ENXIO); 3466 3467 vd->initialized |= VD_DISK_READY; 3468 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); 3469 PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u", 3470 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), 3471 (vd->pseudo ? "yes" : "no"), 3472 (vd->file ? "yes" : "no"), 3473 vd->nslices); 3474 } 3475 3476 /* Success: valid message and transfer mode */ 3477 vd->xfer_mode = attr_msg->xfer_mode; 3478 3479 if (vd->xfer_mode == VIO_DESC_MODE) { 3480 3481 /* 3482 * The vd_dring_inband_msg_t contains one cookie; need room 3483 * for up to n-1 more cookies, where "n" is the number of full 3484 * pages plus possibly one partial page required to cover 3485 * "max_xfer_sz". Add room for one more cookie if 3486 * "max_xfer_sz" isn't an integral multiple of the page size. 3487 * Must first get the maximum transfer size in bytes. 3488 */ 3489 size_t max_xfer_bytes = attr_msg->vdisk_block_size ? 3490 attr_msg->vdisk_block_size*attr_msg->max_xfer_sz : 3491 attr_msg->max_xfer_sz; 3492 size_t max_inband_msglen = 3493 sizeof (vd_dring_inband_msg_t) + 3494 ((max_xfer_bytes/PAGESIZE + 3495 ((max_xfer_bytes % PAGESIZE) ? 1 : 0))* 3496 (sizeof (ldc_mem_cookie_t))); 3497 3498 /* 3499 * Set the maximum expected message length to 3500 * accommodate in-band-descriptor messages with all 3501 * their cookies 3502 */ 3503 vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen); 3504 3505 /* 3506 * Initialize the data structure for processing in-band I/O 3507 * request descriptors 3508 */ 3509 vd->inband_task.vd = vd; 3510 vd->inband_task.msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 3511 vd->inband_task.index = 0; 3512 vd->inband_task.type = VD_FINAL_RANGE_TASK; /* range == 1 */ 3513 } 3514 3515 /* Return the device's block size and max transfer size to the client */ 3516 attr_msg->vdisk_block_size = vd->vdisk_block_size; 3517 attr_msg->max_xfer_sz = vd->max_xfer_sz; 3518 3519 attr_msg->vdisk_size = vd->vdisk_size; 3520 attr_msg->vdisk_type = vd->vdisk_type; 3521 attr_msg->vdisk_media = vd->vdisk_media; 3522 3523 /* Discover and save the list of supported VD_OP_XXX operations */ 3524 vd_set_exported_operations(vd); 3525 attr_msg->operations = vd->operations; 3526 3527 PR0("%s", VD_CLIENT(vd)); 3528 3529 ASSERT(vd->dring_task == NULL); 3530 3531 return (0); 3532 } 3533 3534 static int 3535 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3536 { 3537 int status; 3538 size_t expected; 3539 ldc_mem_info_t dring_minfo; 3540 vio_dring_reg_msg_t *reg_msg = (vio_dring_reg_msg_t *)msg; 3541 3542 3543 ASSERT(msglen >= sizeof (msg->tag)); 3544 3545 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 3546 VIO_DRING_REG)) { 3547 PR0("Message is not a register-dring message"); 3548 return (ENOMSG); 3549 } 3550 3551 if (msglen < sizeof (*reg_msg)) { 3552 PR0("Expected at least %lu-byte register-dring message; " 3553 "received %lu bytes", sizeof (*reg_msg), msglen); 3554 return (EBADMSG); 3555 } 3556 
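	/*
	 * The register-dring message is defined with room for a single
	 * LDC cookie; clients append any additional cookies to the
	 * message, so account for ncookies - 1 extra entries here.
	 */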
3557 expected = sizeof (*reg_msg) + 3558 (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0])); 3559 if (msglen != expected) { 3560 PR0("Expected %lu-byte register-dring message; " 3561 "received %lu bytes", expected, msglen); 3562 return (EBADMSG); 3563 } 3564 3565 if (vd->initialized & VD_DRING) { 3566 PR0("A dring was previously registered; only support one"); 3567 return (EBADMSG); 3568 } 3569 3570 if (reg_msg->num_descriptors > INT32_MAX) { 3571 PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)", 3572 reg_msg->num_descriptors, INT32_MAX, STRINGIZE(INT32_MAX)); 3573 return (EBADMSG); 3574 } 3575 3576 if (reg_msg->ncookies != 1) { 3577 /* 3578 * In addition to fixing the assertion in the success case 3579 * below, supporting drings which require more than one 3580 * "cookie" requires increasing the value of vd->max_msglen 3581 * somewhere in the code path prior to receiving the message 3582 * which results in calling this function. Note that without 3583 * making this change, the larger message size required to 3584 * accommodate multiple cookies cannot be successfully 3585 * received, so this function will not even get called. 3586 * Gracefully accommodating more dring cookies might 3587 * reasonably demand exchanging an additional attribute or 3588 * making a minor protocol adjustment 3589 */ 3590 PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies); 3591 return (EBADMSG); 3592 } 3593 3594 status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie, 3595 reg_msg->ncookies, reg_msg->num_descriptors, 3596 reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle); 3597 if (status != 0) { 3598 PR0("ldc_mem_dring_map() returned errno %d", status); 3599 return (status); 3600 } 3601 3602 /* 3603 * To remove the need for this assertion, must call 3604 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a 3605 * successful call to ldc_mem_dring_map() 3606 */ 3607 ASSERT(reg_msg->ncookies == 1); 3608 3609 if ((status = 3610 ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) { 3611 PR0("ldc_mem_dring_info() returned errno %d", status); 3612 if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0) 3613 PR0("ldc_mem_dring_unmap() returned errno %d", status); 3614 return (status); 3615 } 3616 3617 if (dring_minfo.vaddr == NULL) { 3618 PR0("Descriptor ring virtual address is NULL"); 3619 return (ENXIO); 3620 } 3621 3622 3623 /* Initialize for valid message and mapped dring */ 3624 PR1("descriptor size = %u, dring length = %u", 3625 reg_msg->descriptor_size, reg_msg->num_descriptors); 3626 vd->initialized |= VD_DRING; 3627 vd->dring_ident = 1; /* "There Can Be Only One" */ 3628 vd->dring = dring_minfo.vaddr; 3629 vd->descriptor_size = reg_msg->descriptor_size; 3630 vd->dring_len = reg_msg->num_descriptors; 3631 reg_msg->dring_ident = vd->dring_ident; 3632 3633 /* 3634 * Allocate and initialize a "shadow" array of data structures for 3635 * tasks to process I/O requests in dring elements 3636 */ 3637 vd->dring_task = 3638 kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP); 3639 for (int i = 0; i < vd->dring_len; i++) { 3640 vd->dring_task[i].vd = vd; 3641 vd->dring_task[i].index = i; 3642 vd->dring_task[i].request = &VD_DRING_ELEM(i)->payload; 3643 3644 status = ldc_mem_alloc_handle(vd->ldc_handle, 3645 &(vd->dring_task[i].mhdl)); 3646 if (status) { 3647 PR0("ldc_mem_alloc_handle() returned err %d ", status); 3648 return (ENXIO); 3649 } 3650 3651 vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 3652 } 3653 3654 return (0); 3655 } 3656 3657 static int 3658
vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3659 { 3660 vio_dring_unreg_msg_t *unreg_msg = (vio_dring_unreg_msg_t *)msg; 3661 3662 3663 ASSERT(msglen >= sizeof (msg->tag)); 3664 3665 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 3666 VIO_DRING_UNREG)) { 3667 PR0("Message is not an unregister-dring message"); 3668 return (ENOMSG); 3669 } 3670 3671 if (msglen != sizeof (*unreg_msg)) { 3672 PR0("Expected %lu-byte unregister-dring message; " 3673 "received %lu bytes", sizeof (*unreg_msg), msglen); 3674 return (EBADMSG); 3675 } 3676 3677 if (unreg_msg->dring_ident != vd->dring_ident) { 3678 PR0("Expected dring ident %lu; received %lu", 3679 vd->dring_ident, unreg_msg->dring_ident); 3680 return (EBADMSG); 3681 } 3682 3683 return (0); 3684 } 3685 3686 static int 3687 process_rdx_msg(vio_msg_t *msg, size_t msglen) 3688 { 3689 ASSERT(msglen >= sizeof (msg->tag)); 3690 3691 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) { 3692 PR0("Message is not an RDX message"); 3693 return (ENOMSG); 3694 } 3695 3696 if (msglen != sizeof (vio_rdx_msg_t)) { 3697 PR0("Expected %lu-byte RDX message; received %lu bytes", 3698 sizeof (vio_rdx_msg_t), msglen); 3699 return (EBADMSG); 3700 } 3701 3702 PR0("Valid RDX message"); 3703 return (0); 3704 } 3705 3706 static int 3707 vd_check_seq_num(vd_t *vd, uint64_t seq_num) 3708 { 3709 if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) { 3710 PR0("Received seq_num %lu; expected %lu", 3711 seq_num, (vd->seq_num + 1)); 3712 PR0("initiating soft reset"); 3713 vd_need_reset(vd, B_FALSE); 3714 return (1); 3715 } 3716 3717 vd->seq_num = seq_num; 3718 vd->initialized |= VD_SEQ_NUM; /* superfluous after first time... */ 3719 return (0); 3720 } 3721 3722 /* 3723 * Return the expected size of an inband-descriptor message with all the 3724 * cookies it claims to include 3725 */ 3726 static size_t 3727 expected_inband_size(vd_dring_inband_msg_t *msg) 3728 { 3729 return ((sizeof (*msg)) + 3730 (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0]))); 3731 } 3732 3733 /* 3734 * Process an in-band descriptor message: used with clients like OBP, with 3735 * which vds exchanges descriptors within VIO message payloads, rather than 3736 * operating on them within a descriptor ring 3737 */ 3738 static int 3739 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3740 { 3741 size_t expected; 3742 vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg; 3743 3744 3745 ASSERT(msglen >= sizeof (msg->tag)); 3746 3747 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 3748 VIO_DESC_DATA)) { 3749 PR1("Message is not an in-band-descriptor message"); 3750 return (ENOMSG); 3751 } 3752 3753 if (msglen < sizeof (*desc_msg)) { 3754 PR0("Expected at least %lu-byte descriptor message; " 3755 "received %lu bytes", sizeof (*desc_msg), msglen); 3756 return (EBADMSG); 3757 } 3758 3759 if (msglen != (expected = expected_inband_size(desc_msg))) { 3760 PR0("Expected %lu-byte descriptor message; " 3761 "received %lu bytes", expected, msglen); 3762 return (EBADMSG); 3763 } 3764 3765 if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) 3766 return (EBADMSG); 3767 3768 /* 3769 * Valid message: Set up the in-band descriptor task and process the 3770 * request. 
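 * The message is copied into the single, pre-allocated in-band task,
 * so only one in-band request is outstanding at a time.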
Arrange to acknowledge the client's message, unless an 3771 * error processing the descriptor task results in setting 3772 * VIO_SUBTYPE_NACK 3773 */ 3774 PR1("Valid in-band-descriptor message"); 3775 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 3776 3777 ASSERT(vd->inband_task.msg != NULL); 3778 3779 bcopy(msg, vd->inband_task.msg, msglen); 3780 vd->inband_task.msglen = msglen; 3781 3782 /* 3783 * The task request is now the payload of the message 3784 * that was just copied into the body of the task. 3785 */ 3786 desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg; 3787 vd->inband_task.request = &desc_msg->payload; 3788 3789 return (vd_process_task(&vd->inband_task)); 3790 } 3791 3792 static int 3793 vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx, 3794 vio_msg_t *msg, size_t msglen) 3795 { 3796 int status; 3797 boolean_t ready; 3798 vd_dring_entry_t *elem = VD_DRING_ELEM(idx); 3799 3800 3801 /* Accept the updated dring element */ 3802 if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { 3803 PR0("ldc_mem_dring_acquire() returned errno %d", status); 3804 return (status); 3805 } 3806 ready = (elem->hdr.dstate == VIO_DESC_READY); 3807 if (ready) { 3808 elem->hdr.dstate = VIO_DESC_ACCEPTED; 3809 } else { 3810 PR0("descriptor %u not ready", idx); 3811 VD_DUMP_DRING_ELEM(elem); 3812 } 3813 if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 3814 PR0("ldc_mem_dring_release() returned errno %d", status); 3815 return (status); 3816 } 3817 if (!ready) 3818 return (EBUSY); 3819 3820 3821 /* Initialize a task and process the accepted element */ 3822 PR1("Processing dring element %u", idx); 3823 vd->dring_task[idx].type = type; 3824 3825 /* duplicate msg buf for cookies etc. */ 3826 bcopy(msg, vd->dring_task[idx].msg, msglen); 3827 3828 vd->dring_task[idx].msglen = msglen; 3829 return (vd_process_task(&vd->dring_task[idx])); 3830 } 3831 3832 static int 3833 vd_process_element_range(vd_t *vd, int start, int end, 3834 vio_msg_t *msg, size_t msglen) 3835 { 3836 int i, n, nelem, status = 0; 3837 boolean_t inprogress = B_FALSE; 3838 vd_task_type_t type; 3839 3840 3841 ASSERT(start >= 0); 3842 ASSERT(end >= 0); 3843 3844 /* 3845 * Arrange to acknowledge the client's message, unless an error 3846 * processing one of the dring elements results in setting 3847 * VIO_SUBTYPE_NACK 3848 */ 3849 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 3850 3851 /* 3852 * Process the dring elements in the range 3853 */ 3854 nelem = ((end < start) ? end + vd->dring_len : end) - start + 1; 3855 for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) { 3856 ((vio_dring_msg_t *)msg)->end_idx = i; 3857 type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK; 3858 status = vd_process_element(vd, type, i, msg, msglen); 3859 if (status == EINPROGRESS) 3860 inprogress = B_TRUE; 3861 else if (status != 0) 3862 break; 3863 } 3864 3865 /* 3866 * If some, but not all, operations of a multi-element range are in 3867 * progress, wait for other operations to complete before returning 3868 * (which will result in "ack" or "nack" of the message). Note that 3869 * all outstanding operations will need to complete, not just the ones 3870 * corresponding to the current range of dring elements; however, as 3871 * this situation is an error case, performance is less critical.
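 *
 * Note that a range may legitimately wrap around the ring: for
 * example (illustrative values), with dring_len = 8, start = 6 and
 * end = 1, the loop above processes nelem = (1 + 8) - 6 + 1 = 4
 * elements, namely indices 6, 7, 0 and 1.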
3872 */ 3873 if ((nelem > 1) && (status != EINPROGRESS) && inprogress) 3874 ddi_taskq_wait(vd->completionq); 3875 3876 return (status); 3877 } 3878 3879 static int 3880 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3881 { 3882 vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg; 3883 3884 3885 ASSERT(msglen >= sizeof (msg->tag)); 3886 3887 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 3888 VIO_DRING_DATA)) { 3889 PR1("Message is not a dring-data message"); 3890 return (ENOMSG); 3891 } 3892 3893 if (msglen != sizeof (*dring_msg)) { 3894 PR0("Expected %lu-byte dring message; received %lu bytes", 3895 sizeof (*dring_msg), msglen); 3896 return (EBADMSG); 3897 } 3898 3899 if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) 3900 return (EBADMSG); 3901 3902 if (dring_msg->dring_ident != vd->dring_ident) { 3903 PR0("Expected dring ident %lu; received ident %lu", 3904 vd->dring_ident, dring_msg->dring_ident); 3905 return (EBADMSG); 3906 } 3907 3908 if (dring_msg->start_idx >= vd->dring_len) { 3909 PR0("\"start_idx\" = %u; must be less than %u", 3910 dring_msg->start_idx, vd->dring_len); 3911 return (EBADMSG); 3912 } 3913 3914 if ((dring_msg->end_idx < 0) || 3915 (dring_msg->end_idx >= vd->dring_len)) { 3916 PR0("\"end_idx\" = %u; must be >= 0 and less than %u", 3917 dring_msg->end_idx, vd->dring_len); 3918 return (EBADMSG); 3919 } 3920 3921 /* Valid message; process range of updated dring elements */ 3922 PR1("Processing descriptor range, start = %u, end = %u", 3923 dring_msg->start_idx, dring_msg->end_idx); 3924 return (vd_process_element_range(vd, dring_msg->start_idx, 3925 dring_msg->end_idx, msg, msglen)); 3926 } 3927 3928 static int 3929 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes) 3930 { 3931 int retry, status; 3932 size_t size = *nbytes; 3933 3934 3935 for (retry = 0, status = ETIMEDOUT; 3936 retry < vds_ldc_retries && status == ETIMEDOUT; 3937 retry++) { 3938 PR1("ldc_read() attempt %d", (retry + 1)); 3939 *nbytes = size; 3940 status = ldc_read(ldc_handle, msg, nbytes); 3941 } 3942 3943 if (status) { 3944 PR0("ldc_read() returned errno %d", status); 3945 if (status != ECONNRESET) 3946 return (ENOMSG); 3947 return (status); 3948 } else if (*nbytes == 0) { 3949 PR1("ldc_read() returned 0 and no message read"); 3950 return (ENOMSG); 3951 } 3952 3953 PR1("RCVD %lu-byte message", *nbytes); 3954 return (0); 3955 } 3956 3957 static int 3958 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 3959 { 3960 int status; 3961 3962 3963 PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype, 3964 msg->tag.vio_subtype, msg->tag.vio_subtype_env); 3965 #ifdef DEBUG 3966 vd_decode_tag(msg); 3967 #endif 3968 3969 /* 3970 * Validate session ID up front, since it applies to all messages 3971 * once set 3972 */ 3973 if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) { 3974 PR0("Expected SID %u, received %u", vd->sid, 3975 msg->tag.vio_sid); 3976 return (EBADMSG); 3977 } 3978 3979 PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state)); 3980 3981 /* 3982 * Process the received message based on connection state 3983 */ 3984 switch (vd->state) { 3985 case VD_STATE_INIT: /* expect version message */ 3986 if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0) 3987 return (status); 3988 3989 /* Version negotiated, move to that state */ 3990 vd->state = VD_STATE_VER; 3991 return (0); 3992 3993 case VD_STATE_VER: /* expect attribute message */ 3994 if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0) 3995 return (status); 3996 
3997 /* Attributes exchanged, move to that state */ 3998 vd->state = VD_STATE_ATTR; 3999 return (0); 4000 4001 case VD_STATE_ATTR: 4002 switch (vd->xfer_mode) { 4003 case VIO_DESC_MODE: /* expect RDX message */ 4004 if ((status = process_rdx_msg(msg, msglen)) != 0) 4005 return (status); 4006 4007 /* Ready to receive in-band descriptors */ 4008 vd->state = VD_STATE_DATA; 4009 return (0); 4010 4011 case VIO_DRING_MODE_V1_0: /* expect register-dring message */ 4012 if ((status = 4013 vd_process_dring_reg_msg(vd, msg, msglen)) != 0) 4014 return (status); 4015 4016 /* One dring negotiated, move to that state */ 4017 vd->state = VD_STATE_DRING; 4018 return (0); 4019 4020 default: 4021 ASSERT("Unsupported transfer mode"); 4022 PR0("Unsupported transfer mode"); 4023 return (ENOTSUP); 4024 } 4025 4026 case VD_STATE_DRING: /* expect RDX, register-dring, or unreg-dring */ 4027 if ((status = process_rdx_msg(msg, msglen)) == 0) { 4028 /* Ready to receive data */ 4029 vd->state = VD_STATE_DATA; 4030 return (0); 4031 } else if (status != ENOMSG) { 4032 return (status); 4033 } 4034 4035 4036 /* 4037 * If another register-dring message is received, stay in 4038 * dring state in case the client sends RDX; although the 4039 * protocol allows multiple drings, this server does not 4040 * support using more than one 4041 */ 4042 if ((status = 4043 vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG) 4044 return (status); 4045 4046 /* 4047 * Acknowledge an unregister-dring message, but reset the 4048 * connection anyway: Although the protocol allows 4049 * unregistering drings, this server cannot serve a vdisk 4050 * without its only dring 4051 */ 4052 status = vd_process_dring_unreg_msg(vd, msg, msglen); 4053 return ((status == 0) ? ENOTSUP : status); 4054 4055 case VD_STATE_DATA: 4056 switch (vd->xfer_mode) { 4057 case VIO_DESC_MODE: /* expect in-band-descriptor message */ 4058 return (vd_process_desc_msg(vd, msg, msglen)); 4059 4060 case VIO_DRING_MODE_V1_0: /* expect dring-data or unreg-dring */ 4061 /* 4062 * Typically expect dring-data messages, so handle 4063 * them first 4064 */ 4065 if ((status = vd_process_dring_msg(vd, msg, 4066 msglen)) != ENOMSG) 4067 return (status); 4068 4069 /* 4070 * Acknowledge an unregister-dring message, but reset 4071 * the connection anyway: Although the protocol 4072 * allows unregistering drings, this server cannot 4073 * serve a vdisk without its only dring 4074 */ 4075 status = vd_process_dring_unreg_msg(vd, msg, msglen); 4076 return ((status == 0) ? 
ENOTSUP : status); 4077 4078 default: 4079 ASSERT("Unsupported transfer mode"); 4080 PR0("Unsupported transfer mode"); 4081 return (ENOTSUP); 4082 } 4083 4084 default: 4085 ASSERT("Invalid client connection state"); 4086 PR0("Invalid client connection state"); 4087 return (ENOTSUP); 4088 } 4089 } 4090 4091 static int 4092 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 4093 { 4094 int status; 4095 boolean_t reset_ldc = B_FALSE; 4096 vd_task_t task; 4097 4098 /* 4099 * Check that the message is at least big enough for a "tag", so that 4100 * message processing can proceed based on tag-specified message type 4101 */ 4102 if (msglen < sizeof (vio_msg_tag_t)) { 4103 PR0("Received short (%lu-byte) message", msglen); 4104 /* Can't "nack" short message, so drop the big hammer */ 4105 PR0("initiating full reset"); 4106 vd_need_reset(vd, B_TRUE); 4107 return (EBADMSG); 4108 } 4109 4110 /* 4111 * Process the message 4112 */ 4113 switch (status = vd_do_process_msg(vd, msg, msglen)) { 4114 case 0: 4115 /* "ack" valid, successfully-processed messages */ 4116 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 4117 break; 4118 4119 case EINPROGRESS: 4120 /* The completion handler will "ack" or "nack" the message */ 4121 return (EINPROGRESS); 4122 case ENOMSG: 4123 PR0("Received unexpected message"); 4124 _NOTE(FALLTHROUGH); 4125 case EBADMSG: 4126 case ENOTSUP: 4127 /* "transport" error will cause NACK of invalid messages */ 4128 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 4129 break; 4130 4131 default: 4132 /* "transport" error will cause NACK of invalid messages */ 4133 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 4134 /* An LDC error probably occurred, so try resetting it */ 4135 reset_ldc = B_TRUE; 4136 break; 4137 } 4138 4139 PR1("\tResulting in state %d (%s)", vd->state, 4140 vd_decode_state(vd->state)); 4141 4142 /* populate the task so we can dispatch it on the taskq */ 4143 task.vd = vd; 4144 task.msg = msg; 4145 task.msglen = msglen; 4146 4147 /* 4148 * Queue a task to send the notification that the operation completed. 4149 * We need to ensure that requests are responded to in the correct 4150 * order and since the taskq is processed serially this ordering 4151 * is maintained. 4152 */ 4153 (void) ddi_taskq_dispatch(vd->completionq, vd_serial_notify, 4154 &task, DDI_SLEEP); 4155 4156 /* 4157 * To ensure handshake negotiations do not happen out of order, such 4158 * requests that come through this path should not be done in parallel 4159 * so we need to wait here until the response is sent to the client. 4160 */ 4161 ddi_taskq_wait(vd->completionq); 4162 4163 /* Arrange to reset the connection for nack'ed or failed messages */ 4164 if ((status != 0) || reset_ldc) { 4165 PR0("initiating %s reset", 4166 (reset_ldc) ? 
"full" : "soft"); 4167 vd_need_reset(vd, reset_ldc); 4168 } 4169 4170 return (status); 4171 } 4172 4173 static boolean_t 4174 vd_enabled(vd_t *vd) 4175 { 4176 boolean_t enabled; 4177 4178 mutex_enter(&vd->lock); 4179 enabled = vd->enabled; 4180 mutex_exit(&vd->lock); 4181 return (enabled); 4182 } 4183 4184 static void 4185 vd_recv_msg(void *arg) 4186 { 4187 vd_t *vd = (vd_t *)arg; 4188 int rv = 0, status = 0; 4189 4190 ASSERT(vd != NULL); 4191 4192 PR2("New task to receive incoming message(s)"); 4193 4194 4195 while (vd_enabled(vd) && status == 0) { 4196 size_t msglen, msgsize; 4197 ldc_status_t lstatus; 4198 4199 /* 4200 * Receive and process a message 4201 */ 4202 vd_reset_if_needed(vd); /* can change vd->max_msglen */ 4203 4204 /* 4205 * check if channel is UP - else break out of loop 4206 */ 4207 status = ldc_status(vd->ldc_handle, &lstatus); 4208 if (lstatus != LDC_UP) { 4209 PR0("channel not up (status=%d), exiting recv loop\n", 4210 lstatus); 4211 break; 4212 } 4213 4214 ASSERT(vd->max_msglen != 0); 4215 4216 msgsize = vd->max_msglen; /* stable copy for alloc/free */ 4217 msglen = msgsize; /* actual len after recv_msg() */ 4218 4219 status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen); 4220 switch (status) { 4221 case 0: 4222 rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp, 4223 msglen); 4224 /* check if max_msglen changed */ 4225 if (msgsize != vd->max_msglen) { 4226 PR0("max_msglen changed 0x%lx to 0x%lx bytes\n", 4227 msgsize, vd->max_msglen); 4228 kmem_free(vd->vio_msgp, msgsize); 4229 vd->vio_msgp = 4230 kmem_alloc(vd->max_msglen, KM_SLEEP); 4231 } 4232 if (rv == EINPROGRESS) 4233 continue; 4234 break; 4235 4236 case ENOMSG: 4237 break; 4238 4239 case ECONNRESET: 4240 PR0("initiating soft reset (ECONNRESET)\n"); 4241 vd_need_reset(vd, B_FALSE); 4242 status = 0; 4243 break; 4244 4245 default: 4246 /* Probably an LDC failure; arrange to reset it */ 4247 PR0("initiating full reset (status=0x%x)", status); 4248 vd_need_reset(vd, B_TRUE); 4249 break; 4250 } 4251 } 4252 4253 PR2("Task finished"); 4254 } 4255 4256 static uint_t 4257 vd_handle_ldc_events(uint64_t event, caddr_t arg) 4258 { 4259 vd_t *vd = (vd_t *)(void *)arg; 4260 int status; 4261 4262 ASSERT(vd != NULL); 4263 4264 if (!vd_enabled(vd)) 4265 return (LDC_SUCCESS); 4266 4267 if (event & LDC_EVT_DOWN) { 4268 PR0("LDC_EVT_DOWN: LDC channel went down"); 4269 4270 vd_need_reset(vd, B_TRUE); 4271 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 4272 DDI_SLEEP); 4273 if (status == DDI_FAILURE) { 4274 PR0("cannot schedule task to recv msg\n"); 4275 vd_need_reset(vd, B_TRUE); 4276 } 4277 } 4278 4279 if (event & LDC_EVT_RESET) { 4280 PR0("LDC_EVT_RESET: LDC channel was reset"); 4281 4282 if (vd->state != VD_STATE_INIT) { 4283 PR0("scheduling full reset"); 4284 vd_need_reset(vd, B_FALSE); 4285 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 4286 vd, DDI_SLEEP); 4287 if (status == DDI_FAILURE) { 4288 PR0("cannot schedule task to recv msg\n"); 4289 vd_need_reset(vd, B_TRUE); 4290 } 4291 4292 } else { 4293 PR0("channel already reset, ignoring...\n"); 4294 PR0("doing ldc up...\n"); 4295 (void) ldc_up(vd->ldc_handle); 4296 } 4297 4298 return (LDC_SUCCESS); 4299 } 4300 4301 if (event & LDC_EVT_UP) { 4302 PR0("EVT_UP: LDC is up\nResetting client connection state"); 4303 PR0("initiating soft reset"); 4304 vd_need_reset(vd, B_FALSE); 4305 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 4306 vd, DDI_SLEEP); 4307 if (status == DDI_FAILURE) { 4308 PR0("cannot schedule task to recv msg\n"); 4309 vd_need_reset(vd, 
B_TRUE); 4310 return (LDC_SUCCESS); 4311 } 4312 } 4313 4314 if (event & LDC_EVT_READ) { 4315 int status; 4316 4317 PR1("New data available"); 4318 /* Queue a task to receive the new data */ 4319 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 4320 DDI_SLEEP); 4321 4322 if (status == DDI_FAILURE) { 4323 PR0("cannot schedule task to recv msg\n"); 4324 vd_need_reset(vd, B_TRUE); 4325 } 4326 } 4327 4328 return (LDC_SUCCESS); 4329 } 4330 4331 static uint_t 4332 vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg) 4333 { 4334 _NOTE(ARGUNUSED(key, val)) 4335 (*((uint_t *)arg))++; 4336 return (MH_WALK_TERMINATE); 4337 } 4338 4339 4340 static int 4341 vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 4342 { 4343 uint_t vd_present = 0; 4344 minor_t instance; 4345 vds_t *vds; 4346 4347 4348 switch (cmd) { 4349 case DDI_DETACH: 4350 /* the real work happens below */ 4351 break; 4352 case DDI_SUSPEND: 4353 PR0("No action required for DDI_SUSPEND"); 4354 return (DDI_SUCCESS); 4355 default: 4356 PR0("Unrecognized \"cmd\""); 4357 return (DDI_FAILURE); 4358 } 4359 4360 ASSERT(cmd == DDI_DETACH); 4361 instance = ddi_get_instance(dip); 4362 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { 4363 PR0("Could not get state for instance %u", instance); 4364 ddi_soft_state_free(vds_state, instance); 4365 return (DDI_FAILURE); 4366 } 4367 4368 /* Do not detach while serving any vdisks */ 4369 mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present); 4370 if (vd_present) { 4371 PR0("Not detaching because serving vdisks"); 4372 return (DDI_FAILURE); 4373 } 4374 4375 PR0("Detaching"); 4376 if (vds->initialized & VDS_MDEG) { 4377 (void) mdeg_unregister(vds->mdeg); 4378 kmem_free(vds->ispecp->specp, sizeof (vds_prop_template)); 4379 kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t)); 4380 vds->ispecp = NULL; 4381 vds->mdeg = NULL; 4382 } 4383 4384 if (vds->initialized & VDS_LDI) 4385 (void) ldi_ident_release(vds->ldi_ident); 4386 mod_hash_destroy_hash(vds->vd_table); 4387 ddi_soft_state_free(vds_state, instance); 4388 return (DDI_SUCCESS); 4389 } 4390 4391 static boolean_t 4392 is_pseudo_device(dev_info_t *dip) 4393 { 4394 dev_info_t *parent, *root = ddi_root_node(); 4395 4396 4397 for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root); 4398 parent = ddi_get_parent(parent)) { 4399 if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0) 4400 return (B_TRUE); 4401 } 4402 4403 return (B_FALSE); 4404 } 4405 4406 /* 4407 * Description: 4408 * This function checks to see if the file being used as a 4409 * virtual disk is an ISO image. An ISO image is a special 4410 * case that can be booted or installed from, like a CD/DVD 4411 * 4412 * Parameters: 4413 * vd - disk on which the operation is performed. 4414 * 4415 * Return Code: 4416 * B_TRUE - The file is an ISO 9660 compliant image 4417 * B_FALSE - just a regular disk image file 4418 */ 4419 static boolean_t 4420 vd_file_is_iso_image(vd_t *vd) 4421 { 4422 char iso_buf[ISO_SECTOR_SIZE]; 4423 int i, rv; 4424 uint_t sec; 4425 4426 ASSERT(vd->file); 4427 4428 /* 4429 * If we have already discovered and saved this info we can 4430 * short-circuit the check and avoid reading the file. 4431 */ 4432 if (vd->vdisk_media == VD_MEDIA_DVD || vd->vdisk_media == VD_MEDIA_CD) 4433 return (B_TRUE); 4434 4435 /* 4436 * We wish to read the sector that should contain the 2nd ISO volume 4437 * descriptor.
The second field in this descriptor is called the 4438 * Standard Identifier and is set to CD001 for a CD-ROM compliant 4439 * with the ISO 9660 standard. 4440 */ 4441 sec = (ISO_VOLDESC_SEC * ISO_SECTOR_SIZE) / vd->vdisk_block_size; 4442 rv = vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)iso_buf, 4443 sec, ISO_SECTOR_SIZE); 4444 4445 if (rv < 0) 4446 return (B_FALSE); 4447 4448 for (i = 0; i < ISO_ID_STRLEN; i++) { 4449 if (ISO_STD_ID(iso_buf)[i] != ISO_ID_STRING[i]) 4450 return (B_FALSE); 4451 } 4452 4453 return (B_TRUE); 4454 } 4455 4456 /* 4457 * Description: 4458 * This function checks to see if the virtual device is an ATAPI 4459 * device. ATAPI devices use Group 1 Read/Write commands, so 4460 * any USCSI calls vds makes need to take this into account. 4461 * 4462 * Parameters: 4463 * vd - disk on which the operation is performed. 4464 * 4465 * Return Code: 4466 * B_TRUE - The virtual disk is backed by an ATAPI device 4467 * B_FALSE - not an ATAPI device (presumably SCSI) 4468 */ 4469 static boolean_t 4470 vd_is_atapi_device(vd_t *vd) 4471 { 4472 boolean_t is_atapi = B_FALSE; 4473 char *variantp; 4474 int rv; 4475 4476 ASSERT(vd->ldi_handle[0] != NULL); 4477 ASSERT(!vd->file); 4478 4479 rv = ldi_prop_lookup_string(vd->ldi_handle[0], 4480 (LDI_DEV_T_ANY | DDI_PROP_DONTPASS), "variant", &variantp); 4481 if (rv == DDI_PROP_SUCCESS) { 4482 PR0("'variant' property exists for %s", vd->device_path); 4483 if (strcmp(variantp, "atapi") == 0) 4484 is_atapi = B_TRUE; 4485 ddi_prop_free(variantp); 4486 } 4487 4488 rv = ldi_prop_exists(vd->ldi_handle[0], LDI_DEV_T_ANY, "atapi"); 4489 if (rv) { 4490 PR0("'atapi' property exists for %s", vd->device_path); 4491 is_atapi = B_TRUE; 4492 } 4493 4494 return (is_atapi); 4495 } 4496 4497 static int 4498 vd_setup_mediainfo(vd_t *vd) 4499 { 4500 int status, rval; 4501 struct dk_minfo dk_minfo; 4502 4503 ASSERT(vd->ldi_handle[0] != NULL); 4504 ASSERT(vd->vdisk_block_size != 0); 4505 4506 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO, 4507 (intptr_t)&dk_minfo, (vd->open_flags | FKIOCTL), 4508 kcred, &rval)) != 0) 4509 return (status); 4510 4511 ASSERT(dk_minfo.dki_lbsize % vd->vdisk_block_size == 0); 4512 4513 vd->block_size = dk_minfo.dki_lbsize; 4514 vd->vdisk_size = (dk_minfo.dki_capacity * dk_minfo.dki_lbsize) / 4515 vd->vdisk_block_size; 4516 vd->vdisk_media = DK_MEDIATYPE2VD_MEDIATYPE(dk_minfo.dki_media_type); 4517 return (0); 4518 } 4519 4520 static int 4521 vd_setup_full_disk(vd_t *vd) 4522 { 4523 int status; 4524 major_t major = getmajor(vd->dev[0]); 4525 minor_t minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE; 4526 4527 ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK); 4528 4529 vd->vdisk_block_size = DEV_BSIZE; 4530 4531 /* 4532 * At this point, vdisk_size is set to the size of partition 2 but 4533 * this does not represent the size of the disk because partition 2 4534 * may not cover the entire disk and its size does not include reserved 4535 * blocks. So we call vd_setup_mediainfo() to update this information 4536 * and set the block size and the media type of the disk. 4537 */ 4538 status = vd_setup_mediainfo(vd); 4539 4540 if (status != 0) { 4541 if (!vd->scsi) { 4542 /* unexpected failure */ 4543 PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d", 4544 status); 4545 return (status); 4546 } 4547 4548 /* 4549 * The function can fail for SCSI disks which are present but 4550 * reserved by another system. In that case, we don't know the 4551 * size of the disk and the block size.
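 *
 * (The placeholder values set below, VD_SIZE_UNKNOWN and a block
 * size of 0, advertise that situation to the client.)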
4552 */ 4553 vd->vdisk_size = VD_SIZE_UNKNOWN; 4554 vd->block_size = 0; 4555 vd->vdisk_media = VD_MEDIA_FIXED; 4556 } 4557 4558 /* Move dev number and LDI handle to entire-disk-slice array elements */ 4559 vd->dev[VD_ENTIRE_DISK_SLICE] = vd->dev[0]; 4560 vd->dev[0] = 0; 4561 vd->ldi_handle[VD_ENTIRE_DISK_SLICE] = vd->ldi_handle[0]; 4562 vd->ldi_handle[0] = NULL; 4563 4564 /* Initialize device numbers for remaining slices and open them */ 4565 for (int slice = 0; slice < vd->nslices; slice++) { 4566 /* 4567 * Skip the entire-disk slice, as it's already open and its 4568 * device known 4569 */ 4570 if (slice == VD_ENTIRE_DISK_SLICE) 4571 continue; 4572 ASSERT(vd->dev[slice] == 0); 4573 ASSERT(vd->ldi_handle[slice] == NULL); 4574 4575 /* 4576 * Construct the device number for the current slice 4577 */ 4578 vd->dev[slice] = makedevice(major, (minor + slice)); 4579 4580 /* 4581 * Open all slices of the disk to serve them to the client. 4582 * Slices are opened exclusively to prevent other threads or 4583 * processes in the service domain from performing I/O to 4584 * slices being accessed by a client. Failure to open a slice 4585 * results in vds not serving this disk, as the client could 4586 * attempt (and should be able) to access any slice immediately. 4587 * Any slices successfully opened before a failure will get 4588 * closed by vds_destroy_vd() as a result of the error returned 4589 * by this function. 4590 * 4591 * We need to do the open with FNDELAY so that opening an empty 4592 * slice does not fail. 4593 */ 4594 PR0("Opening device major %u, minor %u = slice %u", 4595 major, minor, slice); 4596 4597 /* 4598 * Try to open the device. This can fail for example if we are 4599 * opening an empty slice. So in case of a failure, we try the 4600 * open again but this time with the FNDELAY flag. 4601 */ 4602 status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 4603 vd->open_flags, kcred, &vd->ldi_handle[slice], 4604 vd->vds->ldi_ident); 4605 4606 if (status != 0) { 4607 status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 4608 vd->open_flags | FNDELAY, kcred, 4609 &vd->ldi_handle[slice], vd->vds->ldi_ident); 4610 } 4611 4612 if (status != 0) { 4613 PRN("ldi_open_by_dev() returned errno %d " 4614 "for slice %u", status, slice); 4615 /* vds_destroy_vd() will close any open slices */ 4616 vd->ldi_handle[slice] = NULL; 4617 return (status); 4618 } 4619 } 4620 4621 return (0); 4622 } 4623 4624 /* 4625 * When a slice or a volume is exported as a single-slice disk, we want 4626 * the disk backend (i.e. the slice or volume) to be entirely mapped as 4627 * a slice without the addition of any metadata. 4628 * 4629 * So when exporting the disk as a VTOC disk, we fake a disk with the following 4630 * layout: 4631 * 4632 * 0 1 N+1 4633 * +-+--------------------------+ 4634 * virtual disk: |L| slice 0 | 4635 * +-+--------------------------+ 4636 * ^: : 4637 * |: : 4638 * VTOC LABEL--+: : 4639 * +--------------------------+ 4640 * disk backend: | slice/volume | 4641 * +--------------------------+ 4642 * 0 N 4643 * 4644 * N is the number of blocks in the slice/volume. 4645 * 4646 * We simulate a disk with N+1 blocks. The first block (block 0) is faked and 4647 * cannot be changed. The remaining blocks (1 to N+1) define slice 0 and are 4648 * mapped to the exported slice or volume: 4649 * 4650 * - block 0 (L) can return a fake VTOC label if raw read was implemented. 4651 * - blocks 1 to N+1 are mapped to the exported slice or volume.
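 *
 * For example (illustrative only): with this layout, a read of
 * virtual block v, for v >= 1, is satisfied from backend block
 * v - 1, so a backend of N blocks appears to the client as a disk
 * of N + 1 blocks in which only the fake label block is synthetic.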
4652 * 4653 */ 4654 static int 4655 vd_setup_partition_vtoc(vd_t *vd) 4656 { 4657 int rval, status; 4658 char *device_path = vd->device_path; 4659 4660 status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM, 4661 (intptr_t)&vd->dk_geom, (vd->open_flags | FKIOCTL), kcred, &rval); 4662 4663 if (status != 0) { 4664 PRN("ldi_ioctl(DKIOCGGEOM) returned errno %d for %s", 4665 status, device_path); 4666 return (status); 4667 } 4668 4669 /* Initialize dk_geom structure for single-slice device */ 4670 if (vd->dk_geom.dkg_nsect == 0) { 4671 PRN("%s geometry claims 0 sectors per track", device_path); 4672 return (EIO); 4673 } 4674 if (vd->dk_geom.dkg_nhead == 0) { 4675 PRN("%s geometry claims 0 heads", device_path); 4676 return (EIO); 4677 } 4678 vd->dk_geom.dkg_ncyl = (vd->vdisk_size + 1) / vd->dk_geom.dkg_nsect / 4679 vd->dk_geom.dkg_nhead; 4680 vd->dk_geom.dkg_acyl = 0; 4681 vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl; 4682 4683 4684 /* Initialize vtoc structure for single-slice device */ 4685 bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume, 4686 MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume))); 4687 bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part)); 4688 vd->vtoc.v_nparts = 1; 4689 vd->vtoc.v_part[0].p_tag = V_UNASSIGNED; 4690 vd->vtoc.v_part[0].p_flag = 0; 4691 vd->vtoc.v_part[0].p_start = 1; 4692 vd->vtoc.v_part[0].p_size = vd->vdisk_size; 4693 bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel, 4694 MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel))); 4695 4696 /* adjust the vdisk_size, we emulate the first block */ 4697 vd->vdisk_size += 1; 4698 4699 return (0); 4700 } 4701 4702 /* 4703 * When a slice, volume or file is exported as a single-slice disk, we want 4704 * the disk backend (i.e. the slice, volume or file) to be entirely mapped 4705 * as a slice without the addition of any metadata. 4706 * 4707 * So when exporting the disk as an EFI disk, we fake a disk with the following 4708 * layout: 4709 * 4710 * 0 1 2 3 34 34+N 4711 * +-+-+-+-------+--------------------------+ 4712 * virtual disk: |X|T|E|XXXXXXX| slice 0 | 4713 * +-+-+-+-------+--------------------------+ 4714 * ^ ^ : : 4715 * | | : : 4716 * GPT-+ +-GPE : : 4717 * +--------------------------+ 4718 * disk backend: | slice/volume/file | 4719 * +--------------------------+ 4720 * 0 N 4721 * 4722 * N is the number of blocks in the slice/volume/file. 4723 * 4724 * We simulate a disk with 34+N blocks. The first 34 blocks (0 to 33) are 4725 * emulated and cannot be changed. The remaining blocks (34 to 34+N) define 4726 * slice 0 and are mapped to the exported slice, volume or file: 4727 * 4728 * - block 0 (X) is unused and can return 0 if raw read was implemented. 4729 * - block 1 (T) returns a fake EFI GPT (via DKIOCGETEFI) 4730 * - block 2 (E) returns a fake EFI GPE (via DKIOCGETEFI) 4731 * - blocks 3 to 33 (X) are unused and return 0 if raw read is implemented. 4732 * - blocks 34 to 34+N are mapped to the exported slice, volume or file.
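 *
 * For example (illustrative only): a read of virtual block v, for
 * v >= 34, is satisfied from backend block v - 34, while blocks 1
 * and 2 are synthesized from the efi_gpt and efi_gpe structures
 * built by vd_setup_partition_efi() below.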
4733 * 4734 */ 4735 static int 4736 vd_setup_partition_efi(vd_t *vd) 4737 { 4738 efi_gpt_t *gpt; 4739 efi_gpe_t *gpe; 4740 struct uuid uuid = EFI_USR; 4741 uint32_t crc; 4742 4743 gpt = &vd->efi_gpt; 4744 gpe = &vd->efi_gpe; 4745 4746 bzero(gpt, sizeof (efi_gpt_t)); 4747 bzero(gpe, sizeof (efi_gpe_t)); 4748 4749 /* adjust the vdisk_size, we emulate the first 34 blocks */ 4750 vd->vdisk_size += 34; 4751 4752 gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE); 4753 gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); 4754 gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t)); 4755 gpt->efi_gpt_FirstUsableLBA = LE_64(34ULL); 4756 gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1); 4757 gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1); 4758 gpt->efi_gpt_PartitionEntryLBA = LE_64(2ULL); 4759 gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t)); 4760 4761 UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid); 4762 gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA; 4763 gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA; 4764 4765 CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table); 4766 gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); 4767 4768 CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table); 4769 gpt->efi_gpt_HeaderCRC32 = LE_32(~crc); 4770 4771 return (0); 4772 } 4773 4774 /* 4775 * Setup for a virtual disk whose backend is a file (exported as a single slice 4776 * or as a full disk) or a pseudo device (for example a ZFS, SVM or VxVM volume) 4777 * exported as a full disk. In these cases, the backend is accessed using the 4778 * vnode interface. 4779 */ 4780 static int 4781 vd_setup_backend_vnode(vd_t *vd) 4782 { 4783 int rval, status; 4784 vattr_t vattr; 4785 dev_t dev; 4786 char *file_path = vd->device_path; 4787 char dev_path[MAXPATHLEN + 1]; 4788 ldi_handle_t lhandle; 4789 struct dk_cinfo dk_cinfo; 4790 4791 if ((status = vn_open(file_path, UIO_SYSSPACE, vd->open_flags | FOFFMAX, 4792 0, &vd->file_vnode, 0, 0)) != 0) { 4793 PRN("vn_open(%s) = errno %d", file_path, status); 4794 return (status); 4795 } 4796 4797 /* 4798 * We set vd->file now so that vds_destroy_vd will take care of 4799 * closing the file and releasing the vnode in case of an error. 4800 */ 4801 vd->file = B_TRUE; 4802 4803 vattr.va_mask = AT_SIZE; 4804 if ((status = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred, NULL)) 4805 != 0) { 4806 PRN("VOP_GETATTR(%s) = errno %d", file_path, status); 4807 return (EIO); 4808 } 4809 4810 vd->file_size = vattr.va_size; 4811 /* size should be at least sizeof(dk_label) */ 4812 if (vd->file_size < sizeof (struct dk_label)) { 4813 PRN("Size of file has to be at least %ld bytes", 4814 sizeof (struct dk_label)); 4815 return (EIO); 4816 } 4817 4818 if (vd->file_vnode->v_flag & VNOMAP) { 4819 PRN("File %s cannot be mapped", file_path); 4820 return (EIO); 4821 } 4822 4823 /* sector size = block size = DEV_BSIZE */ 4824 vd->block_size = DEV_BSIZE; 4825 vd->vdisk_block_size = DEV_BSIZE; 4826 vd->vdisk_size = vd->file_size / DEV_BSIZE; 4827 vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */ 4828 4829 /* 4830 * Get max_xfer_sz from the device where the file is or from the device 4831 * itself if we have a pseudo device. 
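 *
 * As a worked example (illustrative figures only): with a maxphys of
 * 128 KB and DEV_BSIZE of 512 bytes, the default set above is
 * 131072 / 512 = 256 blocks; it is then overridden by the device's
 * dki_maxtransfer value if the DKIOCINFO ioctl below succeeds.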
4832 */ 4833 dev_path[0] = '\0'; 4834 4835 if (vd->pseudo) { 4836 status = ldi_open_by_name(file_path, FREAD, kcred, &lhandle, 4837 vd->vds->ldi_ident); 4838 } else { 4839 dev = vd->file_vnode->v_vfsp->vfs_dev; 4840 if (ddi_dev_pathname(dev, S_IFBLK, dev_path) == DDI_SUCCESS) { 4841 PR0("underlying device = %s\n", dev_path); 4842 } 4843 4844 status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD, kcred, &lhandle, 4845 vd->vds->ldi_ident); 4846 } 4847 4848 if (status != 0) { 4849 PR0("ldi_open() returned errno %d for device %s", 4850 status, (dev_path[0] == '\0')? file_path : dev_path); 4851 } else { 4852 if ((status = ldi_ioctl(lhandle, DKIOCINFO, 4853 (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred, 4854 &rval)) != 0) { 4855 PR0("ldi_ioctl(DKIOCINFO) returned errno %d for %s", 4856 status, dev_path); 4857 } else { 4858 /* 4859 * Store the device's max transfer size for 4860 * return to the client 4861 */ 4862 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; 4863 } 4864 4865 PR0("close the device %s", dev_path); 4866 (void) ldi_close(lhandle, FREAD, kcred); 4867 } 4868 4869 PR0("using file %s, dev %s, max_xfer = %u blks", 4870 file_path, dev_path, vd->max_xfer_sz); 4871 4872 if (vd->vdisk_type == VD_DISK_TYPE_SLICE) { 4873 ASSERT(!vd->pseudo); 4874 vd->vdisk_label = VD_DISK_LABEL_EFI; 4875 status = vd_setup_partition_efi(vd); 4876 return (status); 4877 } 4878 4879 /* 4880 * Find and validate the geometry of a disk image. 4881 */ 4882 status = vd_file_validate_geometry(vd); 4883 if (status != 0 && status != EINVAL && status != ENOTSUP) { 4884 PRN("Failed to read label from %s", file_path); 4885 return (EIO); 4886 } 4887 4888 if (vd_file_is_iso_image(vd)) { 4889 /* 4890 * Indicate whether to call this a CD or DVD from the size 4891 * of the ISO image (images for both drive types are stored 4892 * in the ISO-9660 format). CDs can store up to just under 1 GB 4893 */ 4894 if ((vd->vdisk_size * vd->vdisk_block_size) > 4895 (1024 * 1024 * 1024)) 4896 vd->vdisk_media = VD_MEDIA_DVD; 4897 else 4898 vd->vdisk_media = VD_MEDIA_CD; 4899 } else { 4900 vd->vdisk_media = VD_MEDIA_FIXED; 4901 } 4902 4903 /* Setup devid for the disk image */ 4904 4905 if (vd->vdisk_label != VD_DISK_LABEL_UNK) { 4906 4907 status = vd_file_read_devid(vd, &vd->file_devid); 4908 4909 if (status == 0) { 4910 /* a valid devid was found */ 4911 return (0); 4912 } 4913 4914 if (status != EINVAL) { 4915 /* 4916 * There was an error while trying to read the devid. 4917 * So this disk image may have a devid but we are 4918 * unable to read it. 4919 */ 4920 PR0("cannot read devid for %s", file_path); 4921 vd->file_devid = NULL; 4922 return (0); 4923 } 4924 } 4925 4926 /* 4927 * No valid device id was found so we create one. Note that a failure 4928 * to create a device id is not fatal and does not prevent the disk 4929 * image from being attached. 4930 */ 4931 PR1("creating devid for %s", file_path); 4932 4933 if (ddi_devid_init(vd->vds->dip, DEVID_FAB, NULL, 0, 4934 &vd->file_devid) != DDI_SUCCESS) { 4935 PR0("failed to create devid for %s", file_path); 4936 vd->file_devid = NULL; 4937 return (0); 4938 } 4939 4940 /* 4941 * Write devid to the disk image. The devid is stored into the disk 4942 * image if we have a valid label; otherwise the devid will be stored 4943 * when the user writes a valid label.
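 *
 * (The fabricated devid created above with ddi_devid_init(9F) and
 * DEVID_FAB gives the disk image a persistent identity; storing it
 * in the image itself is what lets that identity survive domain
 * reboots.)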
4944 */ 4945 if (vd->vdisk_label != VD_DISK_LABEL_UNK) { 4946 if (vd_file_write_devid(vd, vd->file_devid) != 0) { 4947 PR0("failed to write devid for %s", file_path); 4948 ddi_devid_free(vd->file_devid); 4949 vd->file_devid = NULL; 4950 } 4951 } 4952 4953 return (0); 4954 } 4955 4956 4957 /* 4958 * Description: 4959 * Open a device using its device path (supplied by ldm(1m)) 4960 * 4961 * Parameters: 4962 * vd - pointer to structure containing the vDisk info 4963 * 4964 * Return Value 4965 * 0 - success 4966 * EIO - Invalid number of partitions 4967 * != 0 - some other non-zero return value from ldi(9F) functions 4968 */ 4969 static int 4970 vd_open_using_ldi_by_name(vd_t *vd) 4971 { 4972 int rval, status, open_flags; 4973 struct dk_cinfo dk_cinfo; 4974 char *device_path = vd->device_path; 4975 4976 /* 4977 * Try to open the device. If the flags indicate that the device should 4978 * be opened write-enabled, we first try to open it "read-only" 4979 * to see if we have an optical device such as a CD-ROM which, for 4980 * now, we do not permit writes to and thus should not export write 4981 * operations to the client. 4982 * 4983 * Future: if/when we implement support for guest domains writing to 4984 * optical devices we will need to do further checking of the media type 4985 * to distinguish between read-only and writable discs. 4986 */ 4987 if (vd->open_flags & FWRITE) { 4988 open_flags = vd->open_flags & ~FWRITE; 4989 status = ldi_open_by_name(device_path, open_flags, kcred, 4990 &vd->ldi_handle[0], vd->vds->ldi_ident); 4991 4992 if (status == 0) { 4993 /* Verify backing device supports dk_cinfo */ 4994 status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO, 4995 (intptr_t)&dk_cinfo, (open_flags | FKIOCTL), 4996 kcred, &rval); 4997 if (status != 0) { 4998 PRN("ldi_ioctl(DKIOCINFO) returned errno %d for" 4999 " %s opened as RO", status, device_path); 5000 return (status); 5001 } 5002 5003 if (dk_cinfo.dki_partition >= V_NUMPAR) { 5004 PRN("slice %u >= maximum slice %u for %s", 5005 dk_cinfo.dki_partition, V_NUMPAR, 5006 device_path); 5007 return (EIO); 5008 } 5009 5010 /* 5011 * If this is an optical device then we disable 5012 * write access and return, otherwise we close 5013 * the device and try again with writes enabled. 5014 */ 5015 if (dk_cinfo.dki_ctype == DKC_CDROM) { 5016 vd->open_flags = open_flags; 5017 return (0); 5018 } else { 5019 (void) ldi_close(vd->ldi_handle[0], 5020 open_flags, kcred); 5021 } 5022 } 5023 } 5024 5025 /* Attempt to (re)open device with the original flags */ 5026 status = ldi_open_by_name(device_path, vd->open_flags, kcred, 5027 &vd->ldi_handle[0], vd->vds->ldi_ident); 5028 5029 /* 5030 * The open can fail for example if we are opening an empty slice. 5031 * In case of a failure, we try the open again but this time with 5032 * the FNDELAY flag.
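 *
 * (For instance, opening a slice whose partition size is zero
 * typically fails with ENXIO; FNDELAY lets such an open succeed so
 * that the backend can still be exported and probed later.)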
5033 */ 5034 if (status != 0) 5035 status = ldi_open_by_name(device_path, vd->open_flags | FNDELAY, 5036 kcred, &vd->ldi_handle[0], vd->vds->ldi_ident); 5037 5038 if (status != 0) { 5039 PR0("ldi_open_by_name(%s) = errno %d", device_path, status); 5040 vd->ldi_handle[0] = NULL; 5041 return (status); 5042 } 5043 5044 /* Verify backing device supports dk_cinfo */ 5045 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO, 5046 (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred, 5047 &rval)) != 0) { 5048 PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s", 5049 status, device_path); 5050 return (status); 5051 } 5052 if (dk_cinfo.dki_partition >= V_NUMPAR) { 5053 PRN("slice %u >= maximum slice %u for %s", 5054 dk_cinfo.dki_partition, V_NUMPAR, device_path); 5055 return (EIO); 5056 } 5057 5058 return (0); 5059 } 5060 5061 5062 /* 5063 * Setup for a virtual disk whose backend is a device (a physical disk, 5064 * slice or pseudo device) that is directly exported either as a full disk 5065 * for a physical disk or as a slice for a pseudo device or a disk slice. 5066 * In these cases, the backend is accessed using the LDI interface. 5067 */ 5068 static int 5069 vd_setup_backend_ldi(vd_t *vd) 5070 { 5071 int rval, status; 5072 struct dk_cinfo dk_cinfo; 5073 char *device_path = vd->device_path; 5074 5075 status = vd_open_using_ldi_by_name(vd); 5076 if (status != 0) { 5077 PR0("Failed to open (%s) = errno %d", device_path, status); 5078 return (status); 5079 } 5080 5081 vd->file = B_FALSE; 5082 5083 /* Get device number of backing device */ 5084 if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) { 5085 PRN("ldi_get_dev() returned errno %d for %s", 5086 status, device_path); 5087 return (status); 5088 } 5089 5090 /* Verify backing device supports dk_cinfo */ 5091 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO, 5092 (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred, 5093 &rval)) != 0) { 5094 PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s", 5095 status, device_path); 5096 return (status); 5097 } 5098 if (dk_cinfo.dki_partition >= V_NUMPAR) { 5099 PRN("slice %u >= maximum slice %u for %s", 5100 dk_cinfo.dki_partition, V_NUMPAR, device_path); 5101 return (EIO); 5102 } 5103 5104 /* Store the device's max transfer size for return to the client */ 5105 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; 5106 5107 /* 5108 * We need to work out if it's an ATAPI (IDE CD-ROM) or SCSI device so 5109 * that we can use the correct CDB group when sending USCSI commands. 5110 */ 5111 vd->is_atapi_dev = vd_is_atapi_device(vd); 5112 5113 /* 5114 * Export a full disk. 5115 * 5116 * When we use the LDI interface, we export a device as a full disk 5117 * if we have an entire disk slice (slice 2) and if this slice is 5118 * exported as a full disk and not as a single slice disk. 5119 * Similarly, we want to use LDI if we are accessing a CD or DVD 5120 * device (even if it isn't s2) 5121 * 5122 * Note that pseudo devices are exported as full disks using the vnode 5123 * interface, not the LDI interface. 5124 */ 5125 if ((dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE && 5126 vd->vdisk_type == VD_DISK_TYPE_DISK) || 5127 dk_cinfo.dki_ctype == DKC_CDROM) { 5128 ASSERT(!vd->pseudo); 5129 if (dk_cinfo.dki_ctype == DKC_SCSI_CCS) 5130 vd->scsi = B_TRUE; 5131 return (vd_setup_full_disk(vd)); 5132 } 5133 5134 /* 5135 * Export a single slice disk. 5136 * 5137 * The exported device can be either a pseudo device or a disk slice.
If 5138 * it is a disk slice different from slice 2 then it is always exported 5139 * as a single slice disk even if the "slice" option is not specified. 5140 * If it is disk slice 2 or a pseudo device then it is exported as a 5141 * single slice disk only if the "slice" option is specified. 5142 */ 5143 return (vd_setup_single_slice_disk(vd)); 5144 } 5145 5146 static int 5147 vd_setup_single_slice_disk(vd_t *vd) 5148 { 5149 int status, rval; 5150 char *device_path = vd->device_path; 5151 5152 /* Get size of backing device */ 5153 if (ldi_get_size(vd->ldi_handle[0], &vd->vdisk_size) != DDI_SUCCESS) { 5154 PRN("ldi_get_size() failed for %s", device_path); 5155 return (EIO); 5156 } 5157 vd->vdisk_size = lbtodb(vd->vdisk_size); /* convert to blocks */ 5158 vd->block_size = DEV_BSIZE; 5159 vd->vdisk_block_size = DEV_BSIZE; 5160 vd->vdisk_media = VD_MEDIA_FIXED; 5161 5162 if (vd->pseudo) { 5163 ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE); 5164 } 5165 5166 /* 5167 * We export the slice as a single slice disk even if the "slice" 5168 * option was not specified. 5169 */ 5170 vd->vdisk_type = VD_DISK_TYPE_SLICE; 5171 vd->nslices = 1; 5172 5173 /* 5174 * When exporting a slice or a device as a single slice disk, we don't 5175 * care about any partitioning exposed by the backend. The goal is just 5176 * to export the backend as flat storage. We provide a fake partition 5177 * table (either a VTOC or EFI), which presents only one slice, to 5178 * accommodate tools expecting a disk label. 5179 * 5180 * We check the label of the backend to export the device as a slice 5181 * using the same type of label (VTOC or EFI). If there is no label 5182 * then we create a fake EFI label. 5183 * 5184 * Note that the partition table we are creating could also be faked 5185 * by the client based on the size of the backend device. 5186 */ 5187 status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, (intptr_t)&vd->vtoc, 5188 (vd->open_flags | FKIOCTL), kcred, &rval); 5189 5190 if (status == 0) { 5191 /* export with a fake VTOC label */ 5192 vd->vdisk_label = VD_DISK_LABEL_VTOC; 5193 status = vd_setup_partition_vtoc(vd); 5194 } else { 5195 /* export with a fake EFI label */ 5196 vd->vdisk_label = VD_DISK_LABEL_EFI; 5197 status = vd_setup_partition_efi(vd); 5198 } 5199 5200 return (status); 5201 } 5202 5203 static int 5204 vd_setup_vd(vd_t *vd) 5205 { 5206 int status; 5207 dev_info_t *dip; 5208 vnode_t *vnp; 5209 char *path = vd->device_path; 5210 5211 /* make sure the vdisk backend is valid */ 5212 if ((status = lookupname(path, UIO_SYSSPACE, 5213 FOLLOW, NULLVPP, &vnp)) != 0) { 5214 PR0("Cannot lookup %s errno %d", path, status); 5215 goto done; 5216 } 5217 5218 switch (vnp->v_type) { 5219 case VREG: 5220 /* 5221 * Backend is a file so it is exported as a full disk or as a 5222 * single slice disk using the vnode interface. 5223 */ 5224 VN_RELE(vnp); 5225 vd->pseudo = B_FALSE; 5226 status = vd_setup_backend_vnode(vd); 5227 break; 5228 5229 case VBLK: 5230 case VCHR: 5231 /* 5232 * Backend is a device. The way it is exported depends on the 5233 * type of the device. 5234 * 5235 * - A pseudo device is exported as a full disk using the vnode 5236 * interface or as a single slice disk using the LDI 5237 * interface. 5238 * 5239 * - A disk (represented by the slice 2 of that disk) is 5240 * exported as a full disk using the LDI interface. 5241 * 5242 * - A disk slice (different from slice 2) is always exported 5243 * as a single slice disk using the LDI interface.
5244 * 5245 * - Slice 2 of a disk is exported as a single slice disk 5246 * if the "slice" option is specified, otherwise the entire 5247 * disk will be exported. In any case, the LDI interface is 5248 * used. 5249 */ 5250 5251 /* check if this is a pseudo device */ 5252 if ((dip = ddi_hold_devi_by_instance(getmajor(vnp->v_rdev), 5253 dev_to_instance(vnp->v_rdev), 0)) == NULL) { 5254 PRN("%s is no longer accessible", path); 5255 VN_RELE(vnp); 5256 status = EIO; 5257 break; 5258 } 5259 vd->pseudo = is_pseudo_device(dip); 5260 ddi_release_devi(dip); 5261 VN_RELE(vnp); 5262 5263 if (!vd->pseudo) { 5264 status = vd_setup_backend_ldi(vd); 5265 break; 5266 } 5267 5268 /* 5269 * If this is a pseudo device then its usage depends on whether 5270 * the "slice" option is set. If the "slice" option is set 5271 * then the pseudo device will be exported as a single slice, 5272 * otherwise it will be exported as a full disk. 5273 * 5274 * For backward compatibility, if vd_volume_force_slice is set 5275 * then we always export pseudo devices as slices. 5276 */ 5277 if (vd_volume_force_slice) { 5278 vd->vdisk_type = VD_DISK_TYPE_SLICE; 5279 vd->nslices = 1; 5280 } 5281 5282 if (vd->vdisk_type == VD_DISK_TYPE_DISK) 5283 status = vd_setup_backend_vnode(vd); 5284 else 5285 status = vd_setup_backend_ldi(vd); 5286 break; 5287 5288 default: 5289 PRN("Unsupported vdisk backend %s", path); 5290 VN_RELE(vnp); 5291 status = EBADF; 5292 } 5293 5294 done: 5295 if (status != 0) { 5296 /* 5297 * If the error is retryable print an error message only 5298 * during the first try. 5299 */ 5300 if (status == ENXIO || status == ENODEV || 5301 status == ENOENT || status == EROFS) { 5302 if (!(vd->initialized & VD_SETUP_ERROR)) { 5303 PRN("%s is currently inaccessible (error %d)", 5304 path, status); 5305 } 5306 status = EAGAIN; 5307 } else { 5308 PRN("%s cannot be exported as a virtual disk " 5309 "(error %d)", path, status); 5310 } 5311 vd->initialized |= VD_SETUP_ERROR; 5312 5313 } else if (vd->initialized & VD_SETUP_ERROR) { 5314 /* print a message only if we previously had an error */ 5315 PRN("%s is now online", path); 5316 vd->initialized &= ~VD_SETUP_ERROR; 5317 } 5318 5319 return (status); 5320 } 5321 5322 static int 5323 vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t options, 5324 uint64_t ldc_id, vd_t **vdp) 5325 { 5326 char tq_name[TASKQ_NAMELEN]; 5327 int status; 5328 ddi_iblock_cookie_t iblock = NULL; 5329 ldc_attr_t ldc_attr; 5330 vd_t *vd; 5331 5332 5333 ASSERT(vds != NULL); 5334 ASSERT(device_path != NULL); 5335 ASSERT(vdp != NULL); 5336 PR0("Adding vdisk for %s", device_path); 5337 5338 if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) { 5339 PRN("No memory for virtual disk"); 5340 return (EAGAIN); 5341 } 5342 *vdp = vd; /* assign here so vds_destroy_vd() can cleanup later */ 5343 vd->vds = vds; 5344 (void) strncpy(vd->device_path, device_path, MAXPATHLEN); 5345 5346 /* Setup open flags */ 5347 vd->open_flags = FREAD; 5348 5349 if (!(options & VD_OPT_RDONLY)) 5350 vd->open_flags |= FWRITE; 5351 5352 if (options & VD_OPT_EXCLUSIVE) 5353 vd->open_flags |= FEXCL; 5354 5355 /* Setup disk type */ 5356 if (options & VD_OPT_SLICE) { 5357 vd->vdisk_type = VD_DISK_TYPE_SLICE; 5358 vd->nslices = 1; 5359 } else { 5360 vd->vdisk_type = VD_DISK_TYPE_DISK; 5361 vd->nslices = V_NUMPAR; 5362 } 5363 5364 /* default disk label */ 5365 vd->vdisk_label = VD_DISK_LABEL_UNK; 5366 5367 /* Open vdisk and initialize parameters */ 5368 if ((status = vd_setup_vd(vd)) == 0) { 5369 vd->initialized |=
VD_DISK_READY; 5370 5371 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); 5372 PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u", 5373 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), 5374 (vd->pseudo ? "yes" : "no"), (vd->file ? "yes" : "no"), 5375 vd->nslices); 5376 } else { 5377 if (status != EAGAIN) 5378 return (status); 5379 } 5380 5381 /* Initialize locking */ 5382 if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED, 5383 &iblock) != DDI_SUCCESS) { 5384 PRN("Could not get iblock cookie."); 5385 return (EIO); 5386 } 5387 5388 mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock); 5389 vd->initialized |= VD_LOCKING; 5390 5391 5392 /* Create start and completion task queues for the vdisk */ 5393 (void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id); 5394 PR1("tq_name = %s", tq_name); 5395 if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1, 5396 TASKQ_DEFAULTPRI, 0)) == NULL) { 5397 PRN("Could not create task queue"); 5398 return (EIO); 5399 } 5400 (void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id); 5401 PR1("tq_name = %s", tq_name); 5402 if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1, 5403 TASKQ_DEFAULTPRI, 0)) == NULL) { 5404 PRN("Could not create task queue"); 5405 return (EIO); 5406 } 5407 vd->enabled = 1; /* before callback can dispatch to startq */ 5408 5409 5410 /* Bring up LDC */ 5411 ldc_attr.devclass = LDC_DEV_BLK_SVC; 5412 ldc_attr.instance = ddi_get_instance(vds->dip); 5413 ldc_attr.mode = LDC_MODE_UNRELIABLE; 5414 ldc_attr.mtu = VD_LDC_MTU; 5415 if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) { 5416 PRN("Could not initialize LDC channel %lx, " 5417 "init failed with error %d", ldc_id, status); 5418 return (status); 5419 } 5420 vd->initialized |= VD_LDC; 5421 5422 if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events, 5423 (caddr_t)vd)) != 0) { 5424 PRN("Could not initialize LDC channel %lu, " 5425 "reg_callback failed with error %d", ldc_id, status); 5426 return (status); 5427 } 5428 5429 if ((status = ldc_open(vd->ldc_handle)) != 0) { 5430 PRN("Could not initialize LDC channel %lu, " 5431 "open failed with error %d", ldc_id, status); 5432 return (status); 5433 } 5434 5435 if ((status = ldc_up(vd->ldc_handle)) != 0) { 5436 PR0("ldc_up() returned errno %d", status); 5437 } 5438 5439 /* Allocate the inband task memory handle */ 5440 status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl)); 5441 if (status) { 5442 PRN("Could not initialize LDC channel %lu, " 5443 "alloc_handle failed with error %d", ldc_id, status); 5444 return (ENXIO); 5445 } 5446 5447 /* Add the successfully-initialized vdisk to the server's table */ 5448 if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) { 5449 PRN("Error adding vdisk ID %lu to table", id); 5450 return (EIO); 5451 } 5452 5453 /* Allocate the staging buffer */ 5454 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 5455 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 5456 5457 /* store initial state */ 5458 vd->state = VD_STATE_INIT; 5459 5460 return (0); 5461 } 5462 5463 static void 5464 vd_free_dring_task(vd_t *vdp) 5465 { 5466 if (vdp->dring_task != NULL) { 5467 ASSERT(vdp->dring_len != 0); 5468 /* Free all dring_task memory handles */ 5469 for (int i = 0; i < vdp->dring_len; i++) { 5470 (void) ldc_mem_free_handle(vdp->dring_task[i].mhdl); 5471 kmem_free(vdp->dring_task[i].msg, vdp->max_msglen); 5472 vdp->dring_task[i].msg = NULL; 5473 } 5474 kmem_free(vdp->dring_task, 5475 (sizeof
(*vdp->dring_task)) * vdp->dring_len); 5476 vdp->dring_task = NULL; 5477 } 5478 } 5479 5480 /* 5481 * Destroy the state associated with a virtual disk 5482 */ 5483 static void 5484 vds_destroy_vd(void *arg) 5485 { 5486 vd_t *vd = (vd_t *)arg; 5487 int retry = 0, rv; 5488 5489 if (vd == NULL) 5490 return; 5491 5492 PR0("Destroying vdisk state"); 5493 5494 /* Disable queuing requests for the vdisk */ 5495 if (vd->initialized & VD_LOCKING) { 5496 mutex_enter(&vd->lock); 5497 vd->enabled = 0; 5498 mutex_exit(&vd->lock); 5499 } 5500 5501 /* Drain and destroy start queue (*before* destroying completionq) */ 5502 if (vd->startq != NULL) 5503 ddi_taskq_destroy(vd->startq); /* waits for queued tasks */ 5504 5505 /* Drain and destroy completion queue (*before* shutting down LDC) */ 5506 if (vd->completionq != NULL) 5507 ddi_taskq_destroy(vd->completionq); /* waits for tasks */ 5508 5509 vd_free_dring_task(vd); 5510 5511 /* Free the inband task memory handle */ 5512 (void) ldc_mem_free_handle(vd->inband_task.mhdl); 5513 5514 /* Shut down LDC */ 5515 if (vd->initialized & VD_LDC) { 5516 /* unmap the dring */ 5517 if (vd->initialized & VD_DRING) 5518 (void) ldc_mem_dring_unmap(vd->dring_handle); 5519 5520 /* close LDC channel - retry on EAGAIN */ 5521 while ((rv = ldc_close(vd->ldc_handle)) == EAGAIN) { 5522 if (++retry > vds_ldc_retries) { 5523 PR0("Timed out closing channel"); 5524 break; 5525 } 5526 drv_usecwait(vds_ldc_delay); 5527 } 5528 if (rv == 0) { 5529 (void) ldc_unreg_callback(vd->ldc_handle); 5530 (void) ldc_fini(vd->ldc_handle); 5531 } else { 5532 /* 5533 * Closing the LDC channel has failed. Ideally we should 5534 * fail here but there is no Zeus level infrastructure 5535 * to handle this. The MD has already been changed and 5536 * we have to do the close. So we try to do as much 5537 * clean up as we can. 
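 *
 * (Disabling the callback with LDC_CB_DISABLE first, as done below,
 * keeps new channel events from being delivered into a handler that
 * is being torn down while the unregister is retried.)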
5538 */ 5539 (void) ldc_set_cb_mode(vd->ldc_handle, LDC_CB_DISABLE); 5540 while (ldc_unreg_callback(vd->ldc_handle) == EAGAIN) 5541 drv_usecwait(vds_ldc_delay); 5542 } 5543 } 5544 5545 /* Free the staging buffer for msgs */ 5546 if (vd->vio_msgp != NULL) { 5547 kmem_free(vd->vio_msgp, vd->max_msglen); 5548 vd->vio_msgp = NULL; 5549 } 5550 5551 /* Free the inband message buffer */ 5552 if (vd->inband_task.msg != NULL) { 5553 kmem_free(vd->inband_task.msg, vd->max_msglen); 5554 vd->inband_task.msg = NULL; 5555 } 5556 5557 if (vd->file) { 5558 /* Close file */ 5559 (void) VOP_CLOSE(vd->file_vnode, vd->open_flags, 1, 5560 0, kcred, NULL); 5561 VN_RELE(vd->file_vnode); 5562 if (vd->file_devid != NULL) 5563 ddi_devid_free(vd->file_devid); 5564 } else { 5565 /* Close any open backing-device slices */ 5566 for (uint_t slice = 0; slice < vd->nslices; slice++) { 5567 if (vd->ldi_handle[slice] != NULL) { 5568 PR0("Closing slice %u", slice); 5569 (void) ldi_close(vd->ldi_handle[slice], 5570 vd->open_flags, kcred); 5571 } 5572 } 5573 } 5574 5575 /* Free lock */ 5576 if (vd->initialized & VD_LOCKING) 5577 mutex_destroy(&vd->lock); 5578 5579 /* Finally, free the vdisk structure itself */ 5580 kmem_free(vd, sizeof (*vd)); 5581 } 5582 5583 static int 5584 vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t options, 5585 uint64_t ldc_id) 5586 { 5587 int status; 5588 vd_t *vd = NULL; 5589 5590 5591 if ((status = vds_do_init_vd(vds, id, device_path, options, 5592 ldc_id, &vd)) != 0) 5593 vds_destroy_vd(vd); 5594 5595 return (status); 5596 } 5597 5598 static int 5599 vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel, 5600 uint64_t *ldc_id) 5601 { 5602 int num_channels; 5603 5604 5605 /* Look for channel endpoint child(ren) of the vdisk MD node */ 5606 if ((num_channels = md_scan_dag(md, vd_node, 5607 md_find_name(md, VD_CHANNEL_ENDPOINT), 5608 md_find_name(md, "fwd"), channel)) <= 0) { 5609 PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT); 5610 return (-1); 5611 } 5612 5613 /* Get the "id" value for the first channel endpoint node */ 5614 if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) { 5615 PRN("No \"%s\" property found for \"%s\" of vdisk", 5616 VD_ID_PROP, VD_CHANNEL_ENDPOINT); 5617 return (-1); 5618 } 5619 5620 if (num_channels > 1) { 5621 PRN("Using ID of first of multiple channels for this vdisk"); 5622 } 5623 5624 return (0); 5625 } 5626 5627 static int 5628 vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id) 5629 { 5630 int num_nodes, status; 5631 size_t size; 5632 mde_cookie_t *channel; 5633 5634 5635 if ((num_nodes = md_node_count(md)) <= 0) { 5636 PRN("Invalid node count in Machine Description subtree"); 5637 return (-1); 5638 } 5639 size = num_nodes*(sizeof (*channel)); 5640 channel = kmem_zalloc(size, KM_SLEEP); 5641 status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id); 5642 kmem_free(channel, size); 5643 5644 return (status); 5645 } 5646 5647 /* 5648 * Function: 5649 * vds_get_options 5650 * 5651 * Description: 5652 * Parse the options of a vds node. Options are defined as an array 5653 * of strings in the vds-block-device-opts property of the vds node 5654 * in the machine description. Options are returned as a bitmask. The 5655 * mapping between the bitmask options and the options strings from the 5656 * machine description is defined in the vd_bdev_options[] array. 5657 * 5658 * The vds-block-device-opts property is optional. If a vds has no such 5659 * property then no option is defined. 
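 *
 *	For example (illustrative values; the strings actually
 *	recognized are those listed in vd_bdev_options[]): a property
 *	holding the strings "ro" and "excl" would be returned as the
 *	bitmask VD_OPT_RDONLY | VD_OPT_EXCLUSIVE.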
5660 * 5661 * Parameters: 5662 * md - machine description. 5663 * vd_node - vds node in the machine description for which 5664 * options have to be parsed. 5665 * options - the returned options. 5666 * 5667 * Return Code: 5668 * none. 5669 */ 5670 static void 5671 vds_get_options(md_t *md, mde_cookie_t vd_node, uint64_t *options) 5672 { 5673 char *optstr, *opt; 5674 int len, n, i; 5675 5676 *options = 0; 5677 5678 if (md_get_prop_data(md, vd_node, VD_BLOCK_DEVICE_OPTS, 5679 (uint8_t **)&optstr, &len) != 0) { 5680 PR0("No options found"); 5681 return; 5682 } 5683 5684 /* parse options */ 5685 opt = optstr; 5686 n = sizeof (vd_bdev_options) / sizeof (vd_option_t); 5687 5688 while (opt < optstr + len) { 5689 for (i = 0; i < n; i++) { 5690 if (strncmp(vd_bdev_options[i].vdo_name, 5691 opt, VD_OPTION_NLEN) == 0) { 5692 *options |= vd_bdev_options[i].vdo_value; 5693 break; 5694 } 5695 } 5696 5697 if (i < n) { 5698 PR0("option: %s", opt); 5699 } else { 5700 PRN("option %s is unknown or unsupported", opt); 5701 } 5702 5703 opt += strlen(opt) + 1; 5704 } 5705 } 5706 5707 static void 5708 vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) 5709 { 5710 char *device_path = NULL; 5711 uint64_t id = 0, ldc_id = 0, options = 0; 5712 5713 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { 5714 PRN("Error getting vdisk \"%s\"", VD_ID_PROP); 5715 return; 5716 } 5717 PR0("Adding vdisk ID %lu", id); 5718 if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP, 5719 &device_path) != 0) { 5720 PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); 5721 return; 5722 } 5723 5724 vds_get_options(md, vd_node, &options); 5725 5726 if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) { 5727 PRN("Error getting LDC ID for vdisk %lu", id); 5728 return; 5729 } 5730 5731 if (vds_init_vd(vds, id, device_path, options, ldc_id) != 0) { 5732 PRN("Failed to add vdisk ID %lu", id); 5733 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0) 5734 PRN("No vDisk entry found for vdisk ID %lu", id); 5735 return; 5736 } 5737 } 5738 5739 static void 5740 vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) 5741 { 5742 uint64_t id = 0; 5743 5744 5745 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { 5746 PRN("Unable to get \"%s\" property from vdisk's MD node", 5747 VD_ID_PROP); 5748 return; 5749 } 5750 PR0("Removing vdisk ID %lu", id); 5751 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0) 5752 PRN("No vdisk entry found for vdisk ID %lu", id); 5753 } 5754 5755 static void 5756 vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node, 5757 md_t *curr_md, mde_cookie_t curr_vd_node) 5758 { 5759 char *curr_dev, *prev_dev; 5760 uint64_t curr_id = 0, curr_ldc_id = 0, curr_options = 0; 5761 uint64_t prev_id = 0, prev_ldc_id = 0, prev_options = 0; 5762 size_t len; 5763 5764 5765 /* Validate that vdisk ID has not changed */ 5766 if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) { 5767 PRN("Error getting previous vdisk \"%s\" property", 5768 VD_ID_PROP); 5769 return; 5770 } 5771 if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) { 5772 PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP); 5773 return; 5774 } 5775 if (curr_id != prev_id) { 5776 PRN("Not changing vdisk: ID changed from %lu to %lu", 5777 prev_id, curr_id); 5778 return; 5779 } 5780 5781 /* Validate that LDC ID has not changed */ 5782 if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) { 5783 PRN("Error getting LDC ID for vdisk %lu", prev_id); 5784 return; 5785 } 5786 5787 if 
static void
vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
{
	char		*device_path = NULL;
	uint64_t	id = 0, ldc_id = 0, options = 0;

	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
		PRN("Error getting vdisk \"%s\"", VD_ID_PROP);
		return;
	}
	PR0("Adding vdisk ID %lu", id);
	if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP,
	    &device_path) != 0) {
		PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
		return;
	}

	vds_get_options(md, vd_node, &options);

	if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) {
		PRN("Error getting LDC ID for vdisk %lu", id);
		return;
	}

	if (vds_init_vd(vds, id, device_path, options, ldc_id) != 0) {
		PRN("Failed to add vdisk ID %lu", id);
		if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
			PRN("No vdisk entry found for vdisk ID %lu", id);
		return;
	}
}

static void
vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
{
	uint64_t	id = 0;

	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
		PRN("Unable to get \"%s\" property from vdisk's MD node",
		    VD_ID_PROP);
		return;
	}
	PR0("Removing vdisk ID %lu", id);
	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
		PRN("No vdisk entry found for vdisk ID %lu", id);
}

static void
vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node,
    md_t *curr_md, mde_cookie_t curr_vd_node)
{
	char		*curr_dev, *prev_dev;
	uint64_t	curr_id = 0, curr_ldc_id = 0, curr_options = 0;
	uint64_t	prev_id = 0, prev_ldc_id = 0, prev_options = 0;
	size_t		len;

	/* Validate that vdisk ID has not changed */
	if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP,
	    &prev_id) != 0) {
		PRN("Error getting previous vdisk \"%s\" property",
		    VD_ID_PROP);
		return;
	}
	if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP,
	    &curr_id) != 0) {
		PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP);
		return;
	}
	if (curr_id != prev_id) {
		PRN("Not changing vdisk: ID changed from %lu to %lu",
		    prev_id, curr_id);
		return;
	}

	/* Validate that LDC ID has not changed */
	if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) {
		PRN("Error getting LDC ID for vdisk %lu", prev_id);
		return;
	}
	if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) {
		PRN("Error getting LDC ID for vdisk %lu", curr_id);
		return;
	}
	if (curr_ldc_id != prev_ldc_id) {
		_NOTE(NOTREACHED);	/* lint is confused */
		PRN("Not changing vdisk: "
		    "LDC ID changed from %lu to %lu", prev_ldc_id,
		    curr_ldc_id);
		return;
	}

	/* Determine whether device path has changed */
	if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP,
	    &prev_dev) != 0) {
		PRN("Error getting previous vdisk \"%s\"",
		    VD_BLOCK_DEVICE_PROP);
		return;
	}
	if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP,
	    &curr_dev) != 0) {
		PRN("Error getting current vdisk \"%s\"",
		    VD_BLOCK_DEVICE_PROP);
		return;
	}
	if (((len = strlen(curr_dev)) == strlen(prev_dev)) &&
	    (strncmp(curr_dev, prev_dev, len) == 0))
		return;	/* no relevant (supported) change */

	/* Validate that options have not changed */
	vds_get_options(prev_md, prev_vd_node, &prev_options);
	vds_get_options(curr_md, curr_vd_node, &curr_options);
	if (prev_options != curr_options) {
		PRN("Not changing vdisk: options changed from %lx to %lx",
		    prev_options, curr_options);
		return;
	}

	PR0("Changing vdisk ID %lu", prev_id);

	/* Remove old state, which will close vdisk and reset */
	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0)
		PRN("No vdisk entry found for vdisk ID %lu", prev_id);

	/* Re-initialize vdisk with new state */
	if (vds_init_vd(vds, curr_id, curr_dev, curr_options,
	    curr_ldc_id) != 0) {
		PRN("Failed to change vdisk ID %lu", curr_id);
		return;
	}
}

static int
vds_process_md(void *arg, mdeg_result_t *md)
{
	int	i;
	vds_t	*vds = arg;

	if (md == NULL)
		return (MDEG_FAILURE);
	ASSERT(vds != NULL);

	for (i = 0; i < md->removed.nelem; i++)
		vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]);
	for (i = 0; i < md->match_curr.nelem; i++)
		vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i],
		    md->match_curr.mdp, md->match_curr.mdep[i]);
	for (i = 0; i < md->added.nelem; i++)
		vds_add_vd(vds, md->added.mdp, md->added.mdep[i]);

	return (MDEG_SUCCESS);
}
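
/*
 * Note on vdisk teardown (editorial sketch; the creation call shown is
 * the one made in vds_do_attach() below): vd_table is a ptrhash created
 * with vds_destroy_vd() as its value destructor, so each
 * mod_hash_destroy() above both unlinks a vdisk from the table and
 * tears down its state in one step:
 *
 *	vds->vd_table = mod_hash_create_ptrhash("vds_vd_table",
 *	    VDS_NCHAINS, vds_destroy_vd, sizeof (void *));
 *	...
 *	(void) mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id);
 *		-> removes the entry and invokes vds_destroy_vd(vd)
 *		   on the stored vd_t pointer
 */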
static int
vds_do_attach(dev_info_t *dip)
{
	int			status, sz;
	int			cfg_handle;
	minor_t			instance = ddi_get_instance(dip);
	vds_t			*vds;
	mdeg_prop_spec_t	*pspecp;
	mdeg_node_spec_t	*ispecp;

	/*
	 * The "cfg-handle" property of a vds node in an MD contains the MD's
	 * notion of "instance", or unique identifier, for that node; OBP
	 * stores the value of the "cfg-handle" MD property as the value of
	 * the "reg" property on the node in the device tree it builds from
	 * the MD and passes to Solaris.  Thus, we look up the devinfo node's
	 * "reg" property value to uniquely identify this device instance when
	 * registering with the MD event-generation framework.  If the "reg"
	 * property cannot be found, the device tree state is presumably so
	 * broken that there is no point in continuing.
	 */
	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    VD_REG_PROP)) {
		PRN("vds \"%s\" property does not exist", VD_REG_PROP);
		return (DDI_FAILURE);
	}

	/* Get the MD instance for later MDEG registration */
	cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    VD_REG_PROP, -1);

	if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) {
		PRN("Could not allocate state for instance %u", instance);
		return (DDI_FAILURE);
	}

	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
		PRN("Could not get state for instance %u", instance);
		ddi_soft_state_free(vds_state, instance);
		return (DDI_FAILURE);
	}

	vds->dip = dip;
	vds->vd_table = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS,
	    vds_destroy_vd, sizeof (void *));

	ASSERT(vds->vd_table != NULL);

	if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
		PRN("ldi_ident_from_dip() returned errno %d", status);
		return (DDI_FAILURE);
	}
	vds->initialized |= VDS_LDI;

	/* Register for MD updates */
	sz = sizeof (vds_prop_template);
	pspecp = kmem_alloc(sz, KM_SLEEP);
	bcopy(vds_prop_template, pspecp, sz);

	VDS_SET_MDEG_PROP_INST(pspecp, cfg_handle);

	/* initialize the complete prop spec structure */
	ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
	ispecp->namep = "virtual-device";
	ispecp->specp = pspecp;

	if (mdeg_register(ispecp, &vd_match, vds_process_md, vds,
	    &vds->mdeg) != MDEG_SUCCESS) {
		PRN("Unable to register for MD updates");
		kmem_free(ispecp, sizeof (mdeg_node_spec_t));
		kmem_free(pspecp, sz);
		return (DDI_FAILURE);
	}

	vds->ispecp = ispecp;
	vds->initialized |= VDS_MDEG;

	/* Prevent auto-detaching so driver is available whenever MD changes */
	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
	    DDI_PROP_SUCCESS) {
		PRN("failed to set \"%s\" property for instance %u",
		    DDI_NO_AUTODETACH, instance);
	}

	ddi_report_dev(dip);
	return (DDI_SUCCESS);
}

static int
vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	status;

	switch (cmd) {
	case DDI_ATTACH:
		PR0("Attaching");
		if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
			(void) vds_detach(dip, DDI_DETACH);
		return (status);
	case DDI_RESUME:
		PR0("No action required for DDI_RESUME");
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}

static struct dev_ops vds_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	ddi_no_info,	/* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	vds_attach,	/* devo_attach */
	vds_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	NULL,		/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	nulldev		/* devo_power */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"virtual disk server",
	&vds_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};
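
/*
 * Loadable-module entry points.  A note on ordering (assuming standard
 * DDI soft-state semantics): _init() initializes the soft-state
 * framework before mod_install() so that vds_attach() can allocate
 * per-instance state as soon as the module is installed; _fini()
 * reverses the order, tearing down the soft state only after
 * mod_remove() has succeeded, and leaving it intact otherwise.
 */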
int
_init(void)
{
	int	status;

	if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
		return (status);

	if ((status = mod_install(&modlinkage)) != 0) {
		ddi_soft_state_fini(&vds_state);
		return (status);
	}

	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int	status;

	if ((status = mod_remove(&modlinkage)) != 0)
		return (status);
	ddi_soft_state_fini(&vds_state);
	return (0);
}