1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Virtual disk server 31 */ 32 33 34 #include <sys/types.h> 35 #include <sys/conf.h> 36 #include <sys/crc32.h> 37 #include <sys/ddi.h> 38 #include <sys/dkio.h> 39 #include <sys/file.h> 40 #include <sys/mdeg.h> 41 #include <sys/modhash.h> 42 #include <sys/note.h> 43 #include <sys/pathname.h> 44 #include <sys/sunddi.h> 45 #include <sys/sunldi.h> 46 #include <sys/sysmacros.h> 47 #include <sys/vio_common.h> 48 #include <sys/vdsk_mailbox.h> 49 #include <sys/vdsk_common.h> 50 #include <sys/vtoc.h> 51 #include <sys/vfs.h> 52 #include <sys/stat.h> 53 #include <vm/seg_map.h> 54 55 /* Virtual disk server initialization flags */ 56 #define VDS_LDI 0x01 57 #define VDS_MDEG 0x02 58 59 /* Virtual disk server tunable parameters */ 60 #define VDS_RETRIES 5 61 #define VDS_LDC_DELAY 1000 /* 1 msecs */ 62 #define VDS_DEV_DELAY 10000000 /* 10 secs */ 63 #define VDS_NCHAINS 32 64 65 /* Identification parameters for MD, synthetic dkio(7i) structures, etc. */ 66 #define VDS_NAME "virtual-disk-server" 67 68 #define VD_NAME "vd" 69 #define VD_VOLUME_NAME "vdisk" 70 #define VD_ASCIILABEL "Virtual Disk" 71 72 #define VD_CHANNEL_ENDPOINT "channel-endpoint" 73 #define VD_ID_PROP "id" 74 #define VD_BLOCK_DEVICE_PROP "vds-block-device" 75 #define VD_REG_PROP "reg" 76 77 /* Virtual disk initialization flags */ 78 #define VD_DISK_READY 0x01 79 #define VD_LOCKING 0x02 80 #define VD_LDC 0x04 81 #define VD_DRING 0x08 82 #define VD_SID 0x10 83 #define VD_SEQ_NUM 0x20 84 85 /* Flags for opening/closing backing devices via LDI */ 86 #define VD_OPEN_FLAGS (FEXCL | FREAD | FWRITE) 87 88 /* 89 * By Solaris convention, slice/partition 2 represents the entire disk; 90 * unfortunately, this convention does not appear to be codified. 91 */ 92 #define VD_ENTIRE_DISK_SLICE 2 93 94 /* Return a cpp token as a string */ 95 #define STRINGIZE(token) #token 96 97 /* 98 * Print a message prefixed with the current function name to the message log 99 * (and optionally to the console for verbose boots); these macros use cpp's 100 * concatenation of string literals and C99 variable-length-argument-list 101 * macros 102 */ 103 #define PRN(...) _PRN("?%s(): "__VA_ARGS__, "") 104 #define _PRN(format, ...) \ 105 cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__) 106 107 /* Return a pointer to the "i"th vdisk dring element */ 108 #define VD_DRING_ELEM(i) ((vd_dring_entry_t *)(void *) \ 109 (vd->dring + (i)*vd->descriptor_size)) 110 111 /* Return the virtual disk client's type as a string (for use in messages) */ 112 #define VD_CLIENT(vd) \ 113 (((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" : \ 114 (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" : \ 115 (((vd)->xfer_mode == 0) ? "null client" : \ 116 "unsupported client"))) 117 118 /* For IO to raw disk on file */ 119 #define VD_FILE_SLICE_NONE -1 120 121 /* Read disk label from a disk on file */ 122 #define VD_FILE_LABEL_READ(vd, labelp) \ 123 vd_file_rw(vd, VD_FILE_SLICE_NONE, VD_OP_BREAD, (caddr_t)labelp, \ 124 0, sizeof (struct dk_label)) 125 126 /* Write disk label to a disk on file */ 127 #define VD_FILE_LABEL_WRITE(vd, labelp) \ 128 vd_file_rw(vd, VD_FILE_SLICE_NONE, VD_OP_BWRITE, (caddr_t)labelp, \ 129 0, sizeof (struct dk_label)) 130 131 /* 132 * Specification of an MD node passed to the MDEG to filter any 133 * 'vport' nodes that do not belong to the specified node. This 134 * template is copied for each vds instance and filled in with 135 * the appropriate 'cfg-handle' value before being passed to the MDEG. 136 */ 137 static mdeg_prop_spec_t vds_prop_template[] = { 138 { MDET_PROP_STR, "name", VDS_NAME }, 139 { MDET_PROP_VAL, "cfg-handle", NULL }, 140 { MDET_LIST_END, NULL, NULL } 141 }; 142 143 #define VDS_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val); 144 145 /* 146 * Matching criteria passed to the MDEG to register interest 147 * in changes to 'virtual-device-port' nodes identified by their 148 * 'id' property. 149 */ 150 static md_prop_match_t vd_prop_match[] = { 151 { MDET_PROP_VAL, VD_ID_PROP }, 152 { MDET_LIST_END, NULL } 153 }; 154 155 static mdeg_node_match_t vd_match = {"virtual-device-port", 156 vd_prop_match}; 157 158 /* Debugging macros */ 159 #ifdef DEBUG 160 161 static int vd_msglevel = 0; 162 163 #define PR0 if (vd_msglevel > 0) PRN 164 #define PR1 if (vd_msglevel > 1) PRN 165 #define PR2 if (vd_msglevel > 2) PRN 166 167 #define VD_DUMP_DRING_ELEM(elem) \ 168 PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n", \ 169 elem->hdr.dstate, \ 170 elem->payload.operation, \ 171 elem->payload.status, \ 172 elem->payload.nbytes, \ 173 elem->payload.addr, \ 174 elem->payload.ncookies); 175 176 char * 177 vd_decode_state(int state) 178 { 179 char *str; 180 181 #define CASE_STATE(_s) case _s: str = #_s; break; 182 183 switch (state) { 184 CASE_STATE(VD_STATE_INIT) 185 CASE_STATE(VD_STATE_VER) 186 CASE_STATE(VD_STATE_ATTR) 187 CASE_STATE(VD_STATE_DRING) 188 CASE_STATE(VD_STATE_RDX) 189 CASE_STATE(VD_STATE_DATA) 190 default: str = "unknown"; break; 191 } 192 193 #undef CASE_STATE 194 195 return (str); 196 } 197 198 void 199 vd_decode_tag(vio_msg_t *msg) 200 { 201 char *tstr, *sstr, *estr; 202 203 #define CASE_TYPE(_s) case _s: tstr = #_s; break; 204 205 switch (msg->tag.vio_msgtype) { 206 CASE_TYPE(VIO_TYPE_CTRL) 207 CASE_TYPE(VIO_TYPE_DATA) 208 CASE_TYPE(VIO_TYPE_ERR) 209 default: tstr = "unknown"; break; 210 } 211 212 #undef CASE_TYPE 213 214 #define CASE_SUBTYPE(_s) case _s: sstr = #_s; break; 215 216 switch (msg->tag.vio_subtype) { 217 CASE_SUBTYPE(VIO_SUBTYPE_INFO) 218 CASE_SUBTYPE(VIO_SUBTYPE_ACK) 219 CASE_SUBTYPE(VIO_SUBTYPE_NACK) 220 default: sstr = "unknown"; break; 221 } 222 223 #undef CASE_SUBTYPE 224 225 #define CASE_ENV(_s) case _s: estr = #_s; break; 226 227 switch (msg->tag.vio_subtype_env) { 228 CASE_ENV(VIO_VER_INFO) 229 CASE_ENV(VIO_ATTR_INFO) 230 CASE_ENV(VIO_DRING_REG) 231 CASE_ENV(VIO_DRING_UNREG) 232 CASE_ENV(VIO_RDX) 233 CASE_ENV(VIO_PKT_DATA) 234 CASE_ENV(VIO_DESC_DATA) 235 CASE_ENV(VIO_DRING_DATA) 236 default: estr = "unknown"; break; 237 } 238 239 #undef CASE_ENV 240 241 PR1("(%x/%x/%x) message : (%s/%s/%s)", 242 msg->tag.vio_msgtype, msg->tag.vio_subtype, 243 msg->tag.vio_subtype_env, tstr, sstr, estr); 244 } 245 246 #else /* !DEBUG */ 247 248 #define PR0(...) 249 #define PR1(...) 250 #define PR2(...) 251 252 #define VD_DUMP_DRING_ELEM(elem) 253 254 #define vd_decode_state(_s) (NULL) 255 #define vd_decode_tag(_s) (NULL) 256 257 #endif /* DEBUG */ 258 259 260 /* 261 * Soft state structure for a vds instance 262 */ 263 typedef struct vds { 264 uint_t initialized; /* driver inst initialization flags */ 265 dev_info_t *dip; /* driver inst devinfo pointer */ 266 ldi_ident_t ldi_ident; /* driver's identifier for LDI */ 267 mod_hash_t *vd_table; /* table of virtual disks served */ 268 mdeg_node_spec_t *ispecp; /* mdeg node specification */ 269 mdeg_handle_t mdeg; /* handle for MDEG operations */ 270 } vds_t; 271 272 /* 273 * Types of descriptor-processing tasks 274 */ 275 typedef enum vd_task_type { 276 VD_NONFINAL_RANGE_TASK, /* task for intermediate descriptor in range */ 277 VD_FINAL_RANGE_TASK, /* task for last in a range of descriptors */ 278 } vd_task_type_t; 279 280 /* 281 * Structure describing the task for processing a descriptor 282 */ 283 typedef struct vd_task { 284 struct vd *vd; /* vd instance task is for */ 285 vd_task_type_t type; /* type of descriptor task */ 286 int index; /* dring elem index for task */ 287 vio_msg_t *msg; /* VIO message task is for */ 288 size_t msglen; /* length of message content */ 289 vd_dring_payload_t *request; /* request task will perform */ 290 struct buf buf; /* buf(9s) for I/O request */ 291 ldc_mem_handle_t mhdl; /* task memory handle */ 292 } vd_task_t; 293 294 /* 295 * Soft state structure for a virtual disk instance 296 */ 297 typedef struct vd { 298 uint_t initialized; /* vdisk initialization flags */ 299 vds_t *vds; /* server for this vdisk */ 300 ddi_taskq_t *startq; /* queue for I/O start tasks */ 301 ddi_taskq_t *completionq; /* queue for completion tasks */ 302 ldi_handle_t ldi_handle[V_NUMPAR]; /* LDI slice handles */ 303 char device_path[MAXPATHLEN + 1]; /* vdisk device */ 304 dev_t dev[V_NUMPAR]; /* dev numbers for slices */ 305 uint_t nslices; /* number of slices */ 306 size_t vdisk_size; /* number of blocks in vdisk */ 307 vd_disk_type_t vdisk_type; /* slice or entire disk */ 308 vd_disk_label_t vdisk_label; /* EFI or VTOC label */ 309 ushort_t max_xfer_sz; /* max xfer size in DEV_BSIZE */ 310 boolean_t pseudo; /* underlying pseudo dev */ 311 boolean_t file; /* underlying file */ 312 vnode_t *file_vnode; /* file vnode */ 313 size_t file_size; /* file size */ 314 struct dk_efi dk_efi; /* synthetic for slice type */ 315 struct dk_geom dk_geom; /* synthetic for slice type */ 316 struct vtoc vtoc; /* synthetic for slice type */ 317 ldc_status_t ldc_state; /* LDC connection state */ 318 ldc_handle_t ldc_handle; /* handle for LDC comm */ 319 size_t max_msglen; /* largest LDC message len */ 320 vd_state_t state; /* client handshake state */ 321 uint8_t xfer_mode; /* transfer mode with client */ 322 uint32_t sid; /* client's session ID */ 323 uint64_t seq_num; /* message sequence number */ 324 uint64_t dring_ident; /* identifier of dring */ 325 ldc_dring_handle_t dring_handle; /* handle for dring ops */ 326 uint32_t descriptor_size; /* num bytes in desc */ 327 uint32_t dring_len; /* number of dring elements */ 328 caddr_t dring; /* address of dring */ 329 caddr_t vio_msgp; /* vio msg staging buffer */ 330 vd_task_t inband_task; /* task for inband descriptor */ 331 vd_task_t *dring_task; /* tasks dring elements */ 332 333 kmutex_t lock; /* protects variables below */ 334 boolean_t enabled; /* is vdisk enabled? */ 335 boolean_t reset_state; /* reset connection state? */ 336 boolean_t reset_ldc; /* reset LDC channel? */ 337 } vd_t; 338 339 typedef struct vds_operation { 340 char *namep; 341 uint8_t operation; 342 int (*start)(vd_task_t *task); 343 void (*complete)(void *arg); 344 } vds_operation_t; 345 346 typedef struct vd_ioctl { 347 uint8_t operation; /* vdisk operation */ 348 const char *operation_name; /* vdisk operation name */ 349 size_t nbytes; /* size of operation buffer */ 350 int cmd; /* corresponding ioctl cmd */ 351 const char *cmd_name; /* ioctl cmd name */ 352 void *arg; /* ioctl cmd argument */ 353 /* convert input vd_buf to output ioctl_arg */ 354 void (*copyin)(void *vd_buf, void *ioctl_arg); 355 /* convert input ioctl_arg to output vd_buf */ 356 void (*copyout)(void *ioctl_arg, void *vd_buf); 357 } vd_ioctl_t; 358 359 /* Define trivial copyin/copyout conversion function flag */ 360 #define VD_IDENTITY ((void (*)(void *, void *))-1) 361 362 363 static int vds_ldc_retries = VDS_RETRIES; 364 static int vds_ldc_delay = VDS_LDC_DELAY; 365 static int vds_dev_retries = VDS_RETRIES; 366 static int vds_dev_delay = VDS_DEV_DELAY; 367 static void *vds_state; 368 static uint64_t vds_operations; /* see vds_operation[] definition below */ 369 370 static int vd_open_flags = VD_OPEN_FLAGS; 371 372 /* 373 * Supported protocol version pairs, from highest (newest) to lowest (oldest) 374 * 375 * Each supported major version should appear only once, paired with (and only 376 * with) its highest supported minor version number (as the protocol requires 377 * supporting all lower minor version numbers as well) 378 */ 379 static const vio_ver_t vds_version[] = {{1, 0}}; 380 static const size_t vds_num_versions = 381 sizeof (vds_version)/sizeof (vds_version[0]); 382 383 static void vd_free_dring_task(vd_t *vdp); 384 static int vd_setup_vd(vd_t *vd); 385 static boolean_t vd_enabled(vd_t *vd); 386 387 /* 388 * Function: 389 * vd_file_rw 390 * 391 * Description: 392 * Read or write to a disk on file. 393 * 394 * Parameters: 395 * vd - disk on which the operation is performed. 396 * slice - slice on which the operation is performed, 397 * VD_FILE_SLICE_NONE indicates that the operation 398 * is done on the raw disk. 399 * operation - operation to execute: read (VD_OP_BREAD) or 400 * write (VD_OP_BWRITE). 401 * data - buffer where data are read to or written from. 402 * blk - starting block for the operation. 403 * len - number of bytes to read or write. 404 * 405 * Return Code: 406 * n >= 0 - success, n indicates the number of bytes read 407 * or written. 408 * -1 - error. 409 */ 410 static ssize_t 411 vd_file_rw(vd_t *vd, int slice, int operation, caddr_t data, size_t blk, 412 size_t len) 413 { 414 caddr_t maddr; 415 size_t offset, maxlen, moffset, mlen, n; 416 uint_t smflags; 417 enum seg_rw srw; 418 419 ASSERT(vd->file); 420 ASSERT(len > 0); 421 422 if (slice == VD_FILE_SLICE_NONE) { 423 /* raw disk access */ 424 offset = blk * DEV_BSIZE; 425 } else { 426 ASSERT(slice >= 0 && slice < V_NUMPAR); 427 if (blk >= vd->vtoc.v_part[slice].p_size) { 428 /* address past the end of the slice */ 429 PR0("req_addr (0x%lx) > psize (0x%lx)", 430 blk, vd->vtoc.v_part[slice].p_size); 431 return (0); 432 } 433 434 offset = (vd->vtoc.v_part[slice].p_start + blk) * DEV_BSIZE; 435 436 /* 437 * If the requested size is greater than the size 438 * of the partition, truncate the read/write. 439 */ 440 maxlen = (vd->vtoc.v_part[slice].p_size - blk) * DEV_BSIZE; 441 442 if (len > maxlen) { 443 PR0("I/O size truncated to %lu bytes from %lu bytes", 444 maxlen, len); 445 len = maxlen; 446 } 447 } 448 449 /* 450 * We have to ensure that we are reading/writing into the mmap 451 * range. If we have a partial disk image (e.g. an image of 452 * s0 instead s2) the system can try to access slices that 453 * are not included into the disk image. 454 */ 455 if ((offset + len) >= vd->file_size) { 456 PR0("offset + nbytes (0x%lx + 0x%lx) >= " 457 "file_size (0x%lx)", offset, len, vd->file_size); 458 return (-1); 459 } 460 461 srw = (operation == VD_OP_BREAD)? S_READ : S_WRITE; 462 smflags = (operation == VD_OP_BREAD)? 0 : SM_WRITE; 463 n = len; 464 465 do { 466 /* 467 * segmap_getmapflt() returns a MAXBSIZE chunk which is 468 * MAXBSIZE aligned. 469 */ 470 moffset = offset & MAXBOFFSET; 471 mlen = MIN(MAXBSIZE - moffset, n); 472 maddr = segmap_getmapflt(segkmap, vd->file_vnode, offset, 473 mlen, 1, srw); 474 /* 475 * Fault in the pages so we can check for error and ensure 476 * that we can safely used the mapped address. 477 */ 478 if (segmap_fault(kas.a_hat, segkmap, maddr, mlen, 479 F_SOFTLOCK, srw) != 0) { 480 (void) segmap_release(segkmap, maddr, 0); 481 return (-1); 482 } 483 484 if (operation == VD_OP_BREAD) 485 bcopy(maddr + moffset, data, mlen); 486 else 487 bcopy(data, maddr + moffset, mlen); 488 489 if (segmap_fault(kas.a_hat, segkmap, maddr, mlen, 490 F_SOFTUNLOCK, srw) != 0) { 491 (void) segmap_release(segkmap, maddr, 0); 492 return (-1); 493 } 494 if (segmap_release(segkmap, maddr, smflags) != 0) 495 return (-1); 496 n -= mlen; 497 offset += mlen; 498 data += mlen; 499 500 } while (n > 0); 501 502 return (len); 503 } 504 505 static int 506 vd_start_bio(vd_task_t *task) 507 { 508 int rv, status = 0; 509 vd_t *vd = task->vd; 510 vd_dring_payload_t *request = task->request; 511 struct buf *buf = &task->buf; 512 uint8_t mtype; 513 int slice; 514 515 ASSERT(vd != NULL); 516 ASSERT(request != NULL); 517 518 slice = request->slice; 519 520 ASSERT(slice < vd->nslices); 521 ASSERT((request->operation == VD_OP_BREAD) || 522 (request->operation == VD_OP_BWRITE)); 523 524 if (request->nbytes == 0) 525 return (EINVAL); /* no service for trivial requests */ 526 527 PR1("%s %lu bytes at block %lu", 528 (request->operation == VD_OP_BREAD) ? "Read" : "Write", 529 request->nbytes, request->addr); 530 531 bioinit(buf); 532 buf->b_flags = B_BUSY; 533 buf->b_bcount = request->nbytes; 534 buf->b_lblkno = request->addr; 535 buf->b_edev = vd->dev[slice]; 536 537 mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP; 538 539 /* Map memory exported by client */ 540 status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies, 541 mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R, 542 &(buf->b_un.b_addr), NULL); 543 if (status != 0) { 544 PR0("ldc_mem_map() returned err %d ", status); 545 biofini(buf); 546 return (status); 547 } 548 549 status = ldc_mem_acquire(task->mhdl, 0, buf->b_bcount); 550 if (status != 0) { 551 (void) ldc_mem_unmap(task->mhdl); 552 PR0("ldc_mem_acquire() returned err %d ", status); 553 biofini(buf); 554 return (status); 555 } 556 557 buf->b_flags |= (request->operation == VD_OP_BREAD) ? B_READ : B_WRITE; 558 559 /* Start the block I/O */ 560 if (vd->file) { 561 rv = vd_file_rw(vd, slice, request->operation, buf->b_un.b_addr, 562 request->addr, request->nbytes); 563 if (rv < 0) { 564 request->nbytes = 0; 565 status = EIO; 566 } else { 567 request->nbytes = rv; 568 status = 0; 569 } 570 } else { 571 status = ldi_strategy(vd->ldi_handle[slice], buf); 572 if (status == 0) 573 return (EINPROGRESS); /* will complete on completionq */ 574 } 575 576 /* Clean up after error */ 577 rv = ldc_mem_release(task->mhdl, 0, buf->b_bcount); 578 if (rv) { 579 PR0("ldc_mem_release() returned err %d ", rv); 580 } 581 rv = ldc_mem_unmap(task->mhdl); 582 if (rv) { 583 PR0("ldc_mem_unmap() returned err %d ", status); 584 } 585 586 biofini(buf); 587 return (status); 588 } 589 590 static int 591 send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen) 592 { 593 int status; 594 size_t nbytes; 595 596 do { 597 nbytes = msglen; 598 status = ldc_write(ldc_handle, msg, &nbytes); 599 if (status != EWOULDBLOCK) 600 break; 601 drv_usecwait(vds_ldc_delay); 602 } while (status == EWOULDBLOCK); 603 604 if (status != 0) { 605 if (status != ECONNRESET) 606 PR0("ldc_write() returned errno %d", status); 607 return (status); 608 } else if (nbytes != msglen) { 609 PR0("ldc_write() performed only partial write"); 610 return (EIO); 611 } 612 613 PR1("SENT %lu bytes", msglen); 614 return (0); 615 } 616 617 static void 618 vd_need_reset(vd_t *vd, boolean_t reset_ldc) 619 { 620 mutex_enter(&vd->lock); 621 vd->reset_state = B_TRUE; 622 vd->reset_ldc = reset_ldc; 623 mutex_exit(&vd->lock); 624 } 625 626 /* 627 * Reset the state of the connection with a client, if needed; reset the LDC 628 * transport as well, if needed. This function should only be called from the 629 * "vd_recv_msg", as it waits for tasks - otherwise a deadlock can occur. 630 */ 631 static void 632 vd_reset_if_needed(vd_t *vd) 633 { 634 int status = 0; 635 636 mutex_enter(&vd->lock); 637 if (!vd->reset_state) { 638 ASSERT(!vd->reset_ldc); 639 mutex_exit(&vd->lock); 640 return; 641 } 642 mutex_exit(&vd->lock); 643 644 PR0("Resetting connection state with %s", VD_CLIENT(vd)); 645 646 /* 647 * Let any asynchronous I/O complete before possibly pulling the rug 648 * out from under it; defer checking vd->reset_ldc, as one of the 649 * asynchronous tasks might set it 650 */ 651 ddi_taskq_wait(vd->completionq); 652 653 if (vd->file) { 654 status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred); 655 if (status) { 656 PR0("VOP_FSYNC returned errno %d", status); 657 } 658 } 659 660 if ((vd->initialized & VD_DRING) && 661 ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)) 662 PR0("ldc_mem_dring_unmap() returned errno %d", status); 663 664 vd_free_dring_task(vd); 665 666 /* Free the staging buffer for msgs */ 667 if (vd->vio_msgp != NULL) { 668 kmem_free(vd->vio_msgp, vd->max_msglen); 669 vd->vio_msgp = NULL; 670 } 671 672 /* Free the inband message buffer */ 673 if (vd->inband_task.msg != NULL) { 674 kmem_free(vd->inband_task.msg, vd->max_msglen); 675 vd->inband_task.msg = NULL; 676 } 677 678 mutex_enter(&vd->lock); 679 680 if (vd->reset_ldc) 681 PR0("taking down LDC channel"); 682 if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0)) 683 PR0("ldc_down() returned errno %d", status); 684 685 vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING); 686 vd->state = VD_STATE_INIT; 687 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 688 689 /* Allocate the staging buffer */ 690 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 691 692 PR0("calling ldc_up\n"); 693 (void) ldc_up(vd->ldc_handle); 694 695 vd->reset_state = B_FALSE; 696 vd->reset_ldc = B_FALSE; 697 698 mutex_exit(&vd->lock); 699 } 700 701 static void vd_recv_msg(void *arg); 702 703 static void 704 vd_mark_in_reset(vd_t *vd) 705 { 706 int status; 707 708 PR0("vd_mark_in_reset: marking vd in reset\n"); 709 710 vd_need_reset(vd, B_FALSE); 711 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP); 712 if (status == DDI_FAILURE) { 713 PR0("cannot schedule task to recv msg\n"); 714 vd_need_reset(vd, B_TRUE); 715 return; 716 } 717 } 718 719 static int 720 vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes) 721 { 722 boolean_t accepted; 723 int status; 724 vd_dring_entry_t *elem = VD_DRING_ELEM(idx); 725 726 if (vd->reset_state) 727 return (0); 728 729 /* Acquire the element */ 730 if (!vd->reset_state && 731 (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { 732 if (status == ECONNRESET) { 733 vd_mark_in_reset(vd); 734 return (0); 735 } else { 736 PR0("ldc_mem_dring_acquire() returned errno %d", 737 status); 738 return (status); 739 } 740 } 741 742 /* Set the element's status and mark it done */ 743 accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED); 744 if (accepted) { 745 elem->payload.nbytes = elem_nbytes; 746 elem->payload.status = elem_status; 747 elem->hdr.dstate = VIO_DESC_DONE; 748 } else { 749 /* Perhaps client timed out waiting for I/O... */ 750 PR0("element %u no longer \"accepted\"", idx); 751 VD_DUMP_DRING_ELEM(elem); 752 } 753 /* Release the element */ 754 if (!vd->reset_state && 755 (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 756 if (status == ECONNRESET) { 757 vd_mark_in_reset(vd); 758 return (0); 759 } else { 760 PR0("ldc_mem_dring_release() returned errno %d", 761 status); 762 return (status); 763 } 764 } 765 766 return (accepted ? 0 : EINVAL); 767 } 768 769 static void 770 vd_complete_bio(void *arg) 771 { 772 int status = 0; 773 vd_task_t *task = (vd_task_t *)arg; 774 vd_t *vd = task->vd; 775 vd_dring_payload_t *request = task->request; 776 struct buf *buf = &task->buf; 777 778 779 ASSERT(vd != NULL); 780 ASSERT(request != NULL); 781 ASSERT(task->msg != NULL); 782 ASSERT(task->msglen >= sizeof (*task->msg)); 783 ASSERT(!vd->file); 784 785 /* Wait for the I/O to complete */ 786 request->status = biowait(buf); 787 788 /* return back the number of bytes read/written */ 789 request->nbytes = buf->b_bcount - buf->b_resid; 790 791 /* Release the buffer */ 792 if (!vd->reset_state) 793 status = ldc_mem_release(task->mhdl, 0, buf->b_bcount); 794 if (status) { 795 PR0("ldc_mem_release() returned errno %d copying to " 796 "client", status); 797 if (status == ECONNRESET) { 798 vd_mark_in_reset(vd); 799 } 800 } 801 802 /* Unmap the memory, even if in reset */ 803 status = ldc_mem_unmap(task->mhdl); 804 if (status) { 805 PR0("ldc_mem_unmap() returned errno %d copying to client", 806 status); 807 if (status == ECONNRESET) { 808 vd_mark_in_reset(vd); 809 } 810 } 811 812 biofini(buf); 813 814 /* Update the dring element for a dring client */ 815 if (!vd->reset_state && (status == 0) && 816 (vd->xfer_mode == VIO_DRING_MODE)) { 817 status = vd_mark_elem_done(vd, task->index, 818 request->status, request->nbytes); 819 if (status == ECONNRESET) 820 vd_mark_in_reset(vd); 821 } 822 823 /* 824 * If a transport error occurred, arrange to "nack" the message when 825 * the final task in the descriptor element range completes 826 */ 827 if (status != 0) 828 task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 829 830 /* 831 * Only the final task for a range of elements will respond to and 832 * free the message 833 */ 834 if (task->type == VD_NONFINAL_RANGE_TASK) { 835 return; 836 } 837 838 /* 839 * Send the "ack" or "nack" back to the client; if sending the message 840 * via LDC fails, arrange to reset both the connection state and LDC 841 * itself 842 */ 843 PR1("Sending %s", 844 (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK"); 845 if (!vd->reset_state) { 846 status = send_msg(vd->ldc_handle, task->msg, task->msglen); 847 switch (status) { 848 case 0: 849 break; 850 case ECONNRESET: 851 vd_mark_in_reset(vd); 852 break; 853 default: 854 PR0("initiating full reset"); 855 vd_need_reset(vd, B_TRUE); 856 break; 857 } 858 } 859 } 860 861 static void 862 vd_geom2dk_geom(void *vd_buf, void *ioctl_arg) 863 { 864 VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg); 865 } 866 867 static void 868 vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg) 869 { 870 VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg); 871 } 872 873 static void 874 dk_geom2vd_geom(void *ioctl_arg, void *vd_buf) 875 { 876 DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf); 877 } 878 879 static void 880 vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf) 881 { 882 VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf); 883 } 884 885 static void 886 vd_get_efi_in(void *vd_buf, void *ioctl_arg) 887 { 888 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 889 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 890 891 dk_efi->dki_lba = vd_efi->lba; 892 dk_efi->dki_length = vd_efi->length; 893 dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP); 894 } 895 896 static void 897 vd_get_efi_out(void *ioctl_arg, void *vd_buf) 898 { 899 int len; 900 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 901 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 902 903 len = vd_efi->length; 904 DK_EFI2VD_EFI(dk_efi, vd_efi); 905 kmem_free(dk_efi->dki_data, len); 906 } 907 908 static void 909 vd_set_efi_in(void *vd_buf, void *ioctl_arg) 910 { 911 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 912 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 913 914 dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP); 915 VD_EFI2DK_EFI(vd_efi, dk_efi); 916 } 917 918 static void 919 vd_set_efi_out(void *ioctl_arg, void *vd_buf) 920 { 921 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 922 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 923 924 kmem_free(dk_efi->dki_data, vd_efi->length); 925 } 926 927 static int 928 vd_read_vtoc(ldi_handle_t handle, struct vtoc *vtoc, vd_disk_label_t *label) 929 { 930 int status, rval; 931 struct dk_gpt *efi; 932 size_t efi_len; 933 934 *label = VD_DISK_LABEL_UNK; 935 936 status = ldi_ioctl(handle, DKIOCGVTOC, (intptr_t)vtoc, 937 (vd_open_flags | FKIOCTL), kcred, &rval); 938 939 if (status == 0) { 940 *label = VD_DISK_LABEL_VTOC; 941 return (0); 942 } else if (status != ENOTSUP) { 943 PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status); 944 return (status); 945 } 946 947 status = vds_efi_alloc_and_read(handle, &efi, &efi_len); 948 949 if (status) { 950 PR0("vds_efi_alloc_and_read returned error %d", status); 951 return (status); 952 } 953 954 *label = VD_DISK_LABEL_EFI; 955 vd_efi_to_vtoc(efi, vtoc); 956 vd_efi_free(efi, efi_len); 957 958 return (0); 959 } 960 961 static ushort_t 962 vd_lbl2cksum(struct dk_label *label) 963 { 964 int count; 965 ushort_t sum, *sp; 966 967 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 968 sp = (ushort_t *)label; 969 sum = 0; 970 while (count--) { 971 sum ^= *sp++; 972 } 973 974 return (sum); 975 } 976 977 static int 978 vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg) 979 { 980 dk_efi_t *dk_ioc; 981 struct dk_label label; 982 struct vtoc *vtoc; 983 int i; 984 985 switch (vd->vdisk_label) { 986 987 case VD_DISK_LABEL_VTOC: 988 989 switch (cmd) { 990 case DKIOCGGEOM: 991 ASSERT(ioctl_arg != NULL); 992 bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom)); 993 return (0); 994 case DKIOCGVTOC: 995 ASSERT(ioctl_arg != NULL); 996 bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc)); 997 return (0); 998 case DKIOCSVTOC: 999 if (!vd->file) 1000 return (ENOTSUP); 1001 ASSERT(ioctl_arg != NULL); 1002 vtoc = (struct vtoc *)ioctl_arg; 1003 1004 if (vtoc->v_sanity != VTOC_SANE || 1005 vtoc->v_sectorsz != DEV_BSIZE || 1006 vtoc->v_nparts != V_NUMPAR) 1007 return (EINVAL); 1008 1009 bzero(&label, sizeof (label)); 1010 label.dkl_ncyl = vd->dk_geom.dkg_ncyl; 1011 label.dkl_acyl = vd->dk_geom.dkg_acyl; 1012 label.dkl_pcyl = vd->dk_geom.dkg_pcyl; 1013 label.dkl_nhead = vd->dk_geom.dkg_nhead; 1014 label.dkl_nsect = vd->dk_geom.dkg_nsect; 1015 label.dkl_intrlv = vd->dk_geom.dkg_intrlv; 1016 label.dkl_apc = vd->dk_geom.dkg_apc; 1017 label.dkl_rpm = vd->dk_geom.dkg_rpm; 1018 label.dkl_write_reinstruct = 1019 vd->dk_geom.dkg_write_reinstruct; 1020 label.dkl_read_reinstruct = 1021 vd->dk_geom.dkg_read_reinstruct; 1022 1023 label.dkl_vtoc.v_nparts = vtoc->v_nparts; 1024 label.dkl_vtoc.v_sanity = vtoc->v_sanity; 1025 label.dkl_vtoc.v_version = vtoc->v_version; 1026 for (i = 0; i < vtoc->v_nparts; i++) { 1027 label.dkl_vtoc.v_timestamp[i] = 1028 vtoc->timestamp[i]; 1029 label.dkl_vtoc.v_part[i].p_tag = 1030 vtoc->v_part[i].p_tag; 1031 label.dkl_vtoc.v_part[i].p_flag = 1032 vtoc->v_part[i].p_flag; 1033 label.dkl_map[i].dkl_cylno = 1034 vtoc->v_part[i].p_start / 1035 (label.dkl_nhead * label.dkl_nsect); 1036 label.dkl_map[i].dkl_nblk = 1037 vtoc->v_part[i].p_size; 1038 } 1039 bcopy(vtoc->v_asciilabel, label.dkl_asciilabel, 1040 LEN_DKL_ASCII); 1041 bcopy(vtoc->v_volume, label.dkl_vtoc.v_volume, 1042 LEN_DKL_VVOL); 1043 bcopy(vtoc->v_bootinfo, label.dkl_vtoc.v_bootinfo, 1044 sizeof (vtoc->v_bootinfo)); 1045 1046 /* re-compute checksum */ 1047 label.dkl_magic = DKL_MAGIC; 1048 label.dkl_cksum = vd_lbl2cksum(&label); 1049 1050 /* write label to file */ 1051 if (VD_FILE_LABEL_WRITE(vd, &label) < 0) 1052 return (EIO); 1053 1054 /* update the cached vdisk VTOC */ 1055 bcopy(vtoc, &vd->vtoc, sizeof (vd->vtoc)); 1056 1057 return (0); 1058 default: 1059 return (ENOTSUP); 1060 } 1061 1062 case VD_DISK_LABEL_EFI: 1063 1064 switch (cmd) { 1065 case DKIOCGETEFI: 1066 ASSERT(ioctl_arg != NULL); 1067 dk_ioc = (dk_efi_t *)ioctl_arg; 1068 if (dk_ioc->dki_length < vd->dk_efi.dki_length) 1069 return (EINVAL); 1070 bcopy(vd->dk_efi.dki_data, dk_ioc->dki_data, 1071 vd->dk_efi.dki_length); 1072 return (0); 1073 default: 1074 return (ENOTSUP); 1075 } 1076 1077 default: 1078 return (ENOTSUP); 1079 } 1080 } 1081 1082 static int 1083 vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl) 1084 { 1085 int rval = 0, status; 1086 size_t nbytes = request->nbytes; /* modifiable copy */ 1087 1088 1089 ASSERT(request->slice < vd->nslices); 1090 PR0("Performing %s", ioctl->operation_name); 1091 1092 /* Get data from client and convert, if necessary */ 1093 if (ioctl->copyin != NULL) { 1094 ASSERT(nbytes != 0 && buf != NULL); 1095 PR1("Getting \"arg\" data from client"); 1096 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 1097 request->cookie, request->ncookies, 1098 LDC_COPY_IN)) != 0) { 1099 PR0("ldc_mem_copy() returned errno %d " 1100 "copying from client", status); 1101 return (status); 1102 } 1103 1104 /* Convert client's data, if necessary */ 1105 if (ioctl->copyin == VD_IDENTITY) /* use client buffer */ 1106 ioctl->arg = buf; 1107 else /* convert client vdisk operation data to ioctl data */ 1108 (ioctl->copyin)(buf, (void *)ioctl->arg); 1109 } 1110 1111 /* 1112 * Handle single-slice block devices internally; otherwise, have the 1113 * real driver perform the ioctl() 1114 */ 1115 if (vd->file || (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo)) { 1116 if ((status = vd_do_slice_ioctl(vd, ioctl->cmd, 1117 (void *)ioctl->arg)) != 0) 1118 return (status); 1119 } else if ((status = ldi_ioctl(vd->ldi_handle[request->slice], 1120 ioctl->cmd, (intptr_t)ioctl->arg, (vd_open_flags | FKIOCTL), 1121 kcred, &rval)) != 0) { 1122 PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status); 1123 return (status); 1124 } 1125 #ifdef DEBUG 1126 if (rval != 0) { 1127 PR0("%s set rval = %d, which is not being returned to client", 1128 ioctl->cmd_name, rval); 1129 } 1130 #endif /* DEBUG */ 1131 1132 /* Convert data and send to client, if necessary */ 1133 if (ioctl->copyout != NULL) { 1134 ASSERT(nbytes != 0 && buf != NULL); 1135 PR1("Sending \"arg\" data to client"); 1136 1137 /* Convert ioctl data to vdisk operation data, if necessary */ 1138 if (ioctl->copyout != VD_IDENTITY) 1139 (ioctl->copyout)((void *)ioctl->arg, buf); 1140 1141 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 1142 request->cookie, request->ncookies, 1143 LDC_COPY_OUT)) != 0) { 1144 PR0("ldc_mem_copy() returned errno %d " 1145 "copying to client", status); 1146 return (status); 1147 } 1148 } 1149 1150 return (status); 1151 } 1152 1153 #define RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t)) 1154 static int 1155 vd_ioctl(vd_task_t *task) 1156 { 1157 int i, status, rc; 1158 void *buf = NULL; 1159 struct dk_geom dk_geom = {0}; 1160 struct vtoc vtoc = {0}; 1161 struct dk_efi dk_efi = {0}; 1162 vd_t *vd = task->vd; 1163 vd_dring_payload_t *request = task->request; 1164 vd_ioctl_t ioctl[] = { 1165 /* Command (no-copy) operations */ 1166 {VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0, 1167 DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE), 1168 NULL, NULL, NULL}, 1169 1170 /* "Get" (copy-out) operations */ 1171 {VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int), 1172 DKIOCGETWCE, STRINGIZE(DKIOCGETWCE), 1173 NULL, VD_IDENTITY, VD_IDENTITY}, 1174 {VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM), 1175 RNDSIZE(vd_geom_t), 1176 DKIOCGGEOM, STRINGIZE(DKIOCGGEOM), 1177 &dk_geom, NULL, dk_geom2vd_geom}, 1178 {VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t), 1179 DKIOCGVTOC, STRINGIZE(DKIOCGVTOC), 1180 &vtoc, NULL, vtoc2vd_vtoc}, 1181 {VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t), 1182 DKIOCGETEFI, STRINGIZE(DKIOCGETEFI), 1183 &dk_efi, vd_get_efi_in, vd_get_efi_out}, 1184 1185 /* "Set" (copy-in) operations */ 1186 {VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int), 1187 DKIOCSETWCE, STRINGIZE(DKIOCSETWCE), 1188 NULL, VD_IDENTITY, VD_IDENTITY}, 1189 {VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM), 1190 RNDSIZE(vd_geom_t), 1191 DKIOCSGEOM, STRINGIZE(DKIOCSGEOM), 1192 &dk_geom, vd_geom2dk_geom, NULL}, 1193 {VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t), 1194 DKIOCSVTOC, STRINGIZE(DKIOCSVTOC), 1195 &vtoc, vd_vtoc2vtoc, NULL}, 1196 {VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t), 1197 DKIOCSETEFI, STRINGIZE(DKIOCSETEFI), 1198 &dk_efi, vd_set_efi_in, vd_set_efi_out}, 1199 }; 1200 size_t nioctls = (sizeof (ioctl))/(sizeof (ioctl[0])); 1201 1202 1203 ASSERT(vd != NULL); 1204 ASSERT(request != NULL); 1205 ASSERT(request->slice < vd->nslices); 1206 1207 /* 1208 * Determine ioctl corresponding to caller's "operation" and 1209 * validate caller's "nbytes" 1210 */ 1211 for (i = 0; i < nioctls; i++) { 1212 if (request->operation == ioctl[i].operation) { 1213 /* LDC memory operations require 8-byte multiples */ 1214 ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0); 1215 1216 if (request->operation == VD_OP_GET_EFI || 1217 request->operation == VD_OP_SET_EFI) { 1218 if (request->nbytes >= ioctl[i].nbytes) 1219 break; 1220 PR0("%s: Expected at least nbytes = %lu, " 1221 "got %lu", ioctl[i].operation_name, 1222 ioctl[i].nbytes, request->nbytes); 1223 return (EINVAL); 1224 } 1225 1226 if (request->nbytes != ioctl[i].nbytes) { 1227 PR0("%s: Expected nbytes = %lu, got %lu", 1228 ioctl[i].operation_name, ioctl[i].nbytes, 1229 request->nbytes); 1230 return (EINVAL); 1231 } 1232 1233 break; 1234 } 1235 } 1236 ASSERT(i < nioctls); /* because "operation" already validated */ 1237 1238 if (request->nbytes) 1239 buf = kmem_zalloc(request->nbytes, KM_SLEEP); 1240 status = vd_do_ioctl(vd, request, buf, &ioctl[i]); 1241 if (request->nbytes) 1242 kmem_free(buf, request->nbytes); 1243 if (!vd->file && vd->vdisk_type == VD_DISK_TYPE_DISK && 1244 (request->operation == VD_OP_SET_VTOC || 1245 request->operation == VD_OP_SET_EFI)) { 1246 /* update disk information */ 1247 rc = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc, 1248 &vd->vdisk_label); 1249 if (rc != 0) 1250 PR0("vd_read_vtoc return error %d", rc); 1251 } 1252 PR0("Returning %d", status); 1253 return (status); 1254 } 1255 1256 static int 1257 vd_get_devid(vd_task_t *task) 1258 { 1259 vd_t *vd = task->vd; 1260 vd_dring_payload_t *request = task->request; 1261 vd_devid_t *vd_devid; 1262 impl_devid_t *devid; 1263 int status, bufid_len, devid_len, len; 1264 int bufbytes; 1265 1266 PR1("Get Device ID, nbytes=%ld", request->nbytes); 1267 1268 if (vd->file) { 1269 /* no devid for disk on file */ 1270 return (ENOENT); 1271 } 1272 1273 if (ddi_lyr_get_devid(vd->dev[request->slice], 1274 (ddi_devid_t *)&devid) != DDI_SUCCESS) { 1275 /* the most common failure is that no devid is available */ 1276 PR2("No Device ID"); 1277 return (ENOENT); 1278 } 1279 1280 bufid_len = request->nbytes - sizeof (vd_devid_t) + 1; 1281 devid_len = DEVID_GETLEN(devid); 1282 1283 /* 1284 * Save the buffer size here for use in deallocation. 1285 * The actual number of bytes copied is returned in 1286 * the 'nbytes' field of the request structure. 1287 */ 1288 bufbytes = request->nbytes; 1289 1290 vd_devid = kmem_zalloc(bufbytes, KM_SLEEP); 1291 vd_devid->length = devid_len; 1292 vd_devid->type = DEVID_GETTYPE(devid); 1293 1294 len = (devid_len > bufid_len)? bufid_len : devid_len; 1295 1296 bcopy(devid->did_id, vd_devid->id, len); 1297 1298 /* LDC memory operations require 8-byte multiples */ 1299 ASSERT(request->nbytes % sizeof (uint64_t) == 0); 1300 1301 if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0, 1302 &request->nbytes, request->cookie, request->ncookies, 1303 LDC_COPY_OUT)) != 0) { 1304 PR0("ldc_mem_copy() returned errno %d copying to client", 1305 status); 1306 } 1307 PR1("post mem_copy: nbytes=%ld", request->nbytes); 1308 1309 kmem_free(vd_devid, bufbytes); 1310 ddi_devid_free((ddi_devid_t)devid); 1311 1312 return (status); 1313 } 1314 1315 /* 1316 * Define the supported operations once the functions for performing them have 1317 * been defined 1318 */ 1319 static const vds_operation_t vds_operation[] = { 1320 #define X(_s) #_s, _s 1321 {X(VD_OP_BREAD), vd_start_bio, vd_complete_bio}, 1322 {X(VD_OP_BWRITE), vd_start_bio, vd_complete_bio}, 1323 {X(VD_OP_FLUSH), vd_ioctl, NULL}, 1324 {X(VD_OP_GET_WCE), vd_ioctl, NULL}, 1325 {X(VD_OP_SET_WCE), vd_ioctl, NULL}, 1326 {X(VD_OP_GET_VTOC), vd_ioctl, NULL}, 1327 {X(VD_OP_SET_VTOC), vd_ioctl, NULL}, 1328 {X(VD_OP_GET_DISKGEOM), vd_ioctl, NULL}, 1329 {X(VD_OP_SET_DISKGEOM), vd_ioctl, NULL}, 1330 {X(VD_OP_GET_EFI), vd_ioctl, NULL}, 1331 {X(VD_OP_SET_EFI), vd_ioctl, NULL}, 1332 {X(VD_OP_GET_DEVID), vd_get_devid, NULL}, 1333 #undef X 1334 }; 1335 1336 static const size_t vds_noperations = 1337 (sizeof (vds_operation))/(sizeof (vds_operation[0])); 1338 1339 /* 1340 * Process a task specifying a client I/O request 1341 */ 1342 static int 1343 vd_process_task(vd_task_t *task) 1344 { 1345 int i, status; 1346 vd_t *vd = task->vd; 1347 vd_dring_payload_t *request = task->request; 1348 1349 1350 ASSERT(vd != NULL); 1351 ASSERT(request != NULL); 1352 1353 /* Find the requested operation */ 1354 for (i = 0; i < vds_noperations; i++) 1355 if (request->operation == vds_operation[i].operation) 1356 break; 1357 if (i == vds_noperations) { 1358 PR0("Unsupported operation %u", request->operation); 1359 return (ENOTSUP); 1360 } 1361 1362 /* Handle client using absolute disk offsets */ 1363 if ((vd->vdisk_type == VD_DISK_TYPE_DISK) && 1364 (request->slice == UINT8_MAX)) 1365 request->slice = VD_ENTIRE_DISK_SLICE; 1366 1367 /* Range-check slice */ 1368 if (request->slice >= vd->nslices) { 1369 PR0("Invalid \"slice\" %u (max %u) for virtual disk", 1370 request->slice, (vd->nslices - 1)); 1371 return (EINVAL); 1372 } 1373 1374 PR1("operation : %s", vds_operation[i].namep); 1375 1376 /* Start the operation */ 1377 if ((status = vds_operation[i].start(task)) != EINPROGRESS) { 1378 PR0("operation : %s returned status %d", 1379 vds_operation[i].namep, status); 1380 request->status = status; /* op succeeded or failed */ 1381 return (0); /* but request completed */ 1382 } 1383 1384 ASSERT(vds_operation[i].complete != NULL); /* debug case */ 1385 if (vds_operation[i].complete == NULL) { /* non-debug case */ 1386 PR0("Unexpected return of EINPROGRESS " 1387 "with no I/O completion handler"); 1388 request->status = EIO; /* operation failed */ 1389 return (0); /* but request completed */ 1390 } 1391 1392 PR1("operation : kick off taskq entry for %s", vds_operation[i].namep); 1393 1394 /* Queue a task to complete the operation */ 1395 status = ddi_taskq_dispatch(vd->completionq, vds_operation[i].complete, 1396 task, DDI_SLEEP); 1397 /* ddi_taskq_dispatch(9f) guarantees success with DDI_SLEEP */ 1398 ASSERT(status == DDI_SUCCESS); 1399 1400 PR1("Operation in progress"); 1401 return (EINPROGRESS); /* completion handler will finish request */ 1402 } 1403 1404 /* 1405 * Return true if the "type", "subtype", and "env" fields of the "tag" first 1406 * argument match the corresponding remaining arguments; otherwise, return false 1407 */ 1408 boolean_t 1409 vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env) 1410 { 1411 return ((tag->vio_msgtype == type) && 1412 (tag->vio_subtype == subtype) && 1413 (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE; 1414 } 1415 1416 /* 1417 * Check whether the major/minor version specified in "ver_msg" is supported 1418 * by this server. 1419 */ 1420 static boolean_t 1421 vds_supported_version(vio_ver_msg_t *ver_msg) 1422 { 1423 for (int i = 0; i < vds_num_versions; i++) { 1424 ASSERT(vds_version[i].major > 0); 1425 ASSERT((i == 0) || 1426 (vds_version[i].major < vds_version[i-1].major)); 1427 1428 /* 1429 * If the major versions match, adjust the minor version, if 1430 * necessary, down to the highest value supported by this 1431 * server and return true so this message will get "ack"ed; 1432 * the client should also support all minor versions lower 1433 * than the value it sent 1434 */ 1435 if (ver_msg->ver_major == vds_version[i].major) { 1436 if (ver_msg->ver_minor > vds_version[i].minor) { 1437 PR0("Adjusting minor version from %u to %u", 1438 ver_msg->ver_minor, vds_version[i].minor); 1439 ver_msg->ver_minor = vds_version[i].minor; 1440 } 1441 return (B_TRUE); 1442 } 1443 1444 /* 1445 * If the message contains a higher major version number, set 1446 * the message's major/minor versions to the current values 1447 * and return false, so this message will get "nack"ed with 1448 * these values, and the client will potentially try again 1449 * with the same or a lower version 1450 */ 1451 if (ver_msg->ver_major > vds_version[i].major) { 1452 ver_msg->ver_major = vds_version[i].major; 1453 ver_msg->ver_minor = vds_version[i].minor; 1454 return (B_FALSE); 1455 } 1456 1457 /* 1458 * Otherwise, the message's major version is less than the 1459 * current major version, so continue the loop to the next 1460 * (lower) supported version 1461 */ 1462 } 1463 1464 /* 1465 * No common version was found; "ground" the version pair in the 1466 * message to terminate negotiation 1467 */ 1468 ver_msg->ver_major = 0; 1469 ver_msg->ver_minor = 0; 1470 return (B_FALSE); 1471 } 1472 1473 /* 1474 * Process a version message from a client. vds expects to receive version 1475 * messages from clients seeking service, but never issues version messages 1476 * itself; therefore, vds can ACK or NACK client version messages, but does 1477 * not expect to receive version-message ACKs or NACKs (and will treat such 1478 * messages as invalid). 1479 */ 1480 static int 1481 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1482 { 1483 vio_ver_msg_t *ver_msg = (vio_ver_msg_t *)msg; 1484 1485 1486 ASSERT(msglen >= sizeof (msg->tag)); 1487 1488 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1489 VIO_VER_INFO)) { 1490 return (ENOMSG); /* not a version message */ 1491 } 1492 1493 if (msglen != sizeof (*ver_msg)) { 1494 PR0("Expected %lu-byte version message; " 1495 "received %lu bytes", sizeof (*ver_msg), msglen); 1496 return (EBADMSG); 1497 } 1498 1499 if (ver_msg->dev_class != VDEV_DISK) { 1500 PR0("Expected device class %u (disk); received %u", 1501 VDEV_DISK, ver_msg->dev_class); 1502 return (EBADMSG); 1503 } 1504 1505 /* 1506 * We're talking to the expected kind of client; set our device class 1507 * for "ack/nack" back to the client 1508 */ 1509 ver_msg->dev_class = VDEV_DISK_SERVER; 1510 1511 /* 1512 * Check whether the (valid) version message specifies a version 1513 * supported by this server. If the version is not supported, return 1514 * EBADMSG so the message will get "nack"ed; vds_supported_version() 1515 * will have updated the message with a supported version for the 1516 * client to consider 1517 */ 1518 if (!vds_supported_version(ver_msg)) 1519 return (EBADMSG); 1520 1521 1522 /* 1523 * A version has been agreed upon; use the client's SID for 1524 * communication on this channel now 1525 */ 1526 ASSERT(!(vd->initialized & VD_SID)); 1527 vd->sid = ver_msg->tag.vio_sid; 1528 vd->initialized |= VD_SID; 1529 1530 /* 1531 * When multiple versions are supported, this function should store 1532 * the negotiated major and minor version values in the "vd" data 1533 * structure to govern further communication; in particular, note that 1534 * the client might have specified a lower minor version for the 1535 * agreed major version than specifed in the vds_version[] array. The 1536 * following assertions should help remind future maintainers to make 1537 * the appropriate changes to support multiple versions. 1538 */ 1539 ASSERT(vds_num_versions == 1); 1540 ASSERT(ver_msg->ver_major == vds_version[0].major); 1541 ASSERT(ver_msg->ver_minor == vds_version[0].minor); 1542 1543 PR0("Using major version %u, minor version %u", 1544 ver_msg->ver_major, ver_msg->ver_minor); 1545 return (0); 1546 } 1547 1548 static int 1549 vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1550 { 1551 vd_attr_msg_t *attr_msg = (vd_attr_msg_t *)msg; 1552 int status, retry = 0; 1553 1554 1555 ASSERT(msglen >= sizeof (msg->tag)); 1556 1557 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1558 VIO_ATTR_INFO)) { 1559 PR0("Message is not an attribute message"); 1560 return (ENOMSG); 1561 } 1562 1563 if (msglen != sizeof (*attr_msg)) { 1564 PR0("Expected %lu-byte attribute message; " 1565 "received %lu bytes", sizeof (*attr_msg), msglen); 1566 return (EBADMSG); 1567 } 1568 1569 if (attr_msg->max_xfer_sz == 0) { 1570 PR0("Received maximum transfer size of 0 from client"); 1571 return (EBADMSG); 1572 } 1573 1574 if ((attr_msg->xfer_mode != VIO_DESC_MODE) && 1575 (attr_msg->xfer_mode != VIO_DRING_MODE)) { 1576 PR0("Client requested unsupported transfer mode"); 1577 return (EBADMSG); 1578 } 1579 1580 /* 1581 * check if the underlying disk is ready, if not try accessing 1582 * the device again. Open the vdisk device and extract info 1583 * about it, as this is needed to respond to the attr info msg 1584 */ 1585 if ((vd->initialized & VD_DISK_READY) == 0) { 1586 PR0("Retry setting up disk (%s)", vd->device_path); 1587 do { 1588 status = vd_setup_vd(vd); 1589 if (status != EAGAIN || ++retry > vds_dev_retries) 1590 break; 1591 1592 /* incremental delay */ 1593 delay(drv_usectohz(vds_dev_delay)); 1594 1595 /* if vdisk is no longer enabled - return error */ 1596 if (!vd_enabled(vd)) 1597 return (ENXIO); 1598 1599 } while (status == EAGAIN); 1600 1601 if (status) 1602 return (ENXIO); 1603 1604 vd->initialized |= VD_DISK_READY; 1605 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); 1606 PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u", 1607 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), 1608 (vd->pseudo ? "yes" : "no"), 1609 (vd->file ? "yes" : "no"), 1610 vd->nslices); 1611 } 1612 1613 /* Success: valid message and transfer mode */ 1614 vd->xfer_mode = attr_msg->xfer_mode; 1615 1616 if (vd->xfer_mode == VIO_DESC_MODE) { 1617 1618 /* 1619 * The vd_dring_inband_msg_t contains one cookie; need room 1620 * for up to n-1 more cookies, where "n" is the number of full 1621 * pages plus possibly one partial page required to cover 1622 * "max_xfer_sz". Add room for one more cookie if 1623 * "max_xfer_sz" isn't an integral multiple of the page size. 1624 * Must first get the maximum transfer size in bytes. 1625 */ 1626 size_t max_xfer_bytes = attr_msg->vdisk_block_size ? 1627 attr_msg->vdisk_block_size*attr_msg->max_xfer_sz : 1628 attr_msg->max_xfer_sz; 1629 size_t max_inband_msglen = 1630 sizeof (vd_dring_inband_msg_t) + 1631 ((max_xfer_bytes/PAGESIZE + 1632 ((max_xfer_bytes % PAGESIZE) ? 1 : 0))* 1633 (sizeof (ldc_mem_cookie_t))); 1634 1635 /* 1636 * Set the maximum expected message length to 1637 * accommodate in-band-descriptor messages with all 1638 * their cookies 1639 */ 1640 vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen); 1641 1642 /* 1643 * Initialize the data structure for processing in-band I/O 1644 * request descriptors 1645 */ 1646 vd->inband_task.vd = vd; 1647 vd->inband_task.msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 1648 vd->inband_task.index = 0; 1649 vd->inband_task.type = VD_FINAL_RANGE_TASK; /* range == 1 */ 1650 } 1651 1652 /* Return the device's block size and max transfer size to the client */ 1653 attr_msg->vdisk_block_size = DEV_BSIZE; 1654 attr_msg->max_xfer_sz = vd->max_xfer_sz; 1655 1656 attr_msg->vdisk_size = vd->vdisk_size; 1657 attr_msg->vdisk_type = vd->vdisk_type; 1658 attr_msg->operations = vds_operations; 1659 PR0("%s", VD_CLIENT(vd)); 1660 1661 ASSERT(vd->dring_task == NULL); 1662 1663 return (0); 1664 } 1665 1666 static int 1667 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1668 { 1669 int status; 1670 size_t expected; 1671 ldc_mem_info_t dring_minfo; 1672 vio_dring_reg_msg_t *reg_msg = (vio_dring_reg_msg_t *)msg; 1673 1674 1675 ASSERT(msglen >= sizeof (msg->tag)); 1676 1677 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1678 VIO_DRING_REG)) { 1679 PR0("Message is not a register-dring message"); 1680 return (ENOMSG); 1681 } 1682 1683 if (msglen < sizeof (*reg_msg)) { 1684 PR0("Expected at least %lu-byte register-dring message; " 1685 "received %lu bytes", sizeof (*reg_msg), msglen); 1686 return (EBADMSG); 1687 } 1688 1689 expected = sizeof (*reg_msg) + 1690 (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0])); 1691 if (msglen != expected) { 1692 PR0("Expected %lu-byte register-dring message; " 1693 "received %lu bytes", expected, msglen); 1694 return (EBADMSG); 1695 } 1696 1697 if (vd->initialized & VD_DRING) { 1698 PR0("A dring was previously registered; only support one"); 1699 return (EBADMSG); 1700 } 1701 1702 if (reg_msg->num_descriptors > INT32_MAX) { 1703 PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)", 1704 reg_msg->ncookies, INT32_MAX, STRINGIZE(INT32_MAX)); 1705 return (EBADMSG); 1706 } 1707 1708 if (reg_msg->ncookies != 1) { 1709 /* 1710 * In addition to fixing the assertion in the success case 1711 * below, supporting drings which require more than one 1712 * "cookie" requires increasing the value of vd->max_msglen 1713 * somewhere in the code path prior to receiving the message 1714 * which results in calling this function. Note that without 1715 * making this change, the larger message size required to 1716 * accommodate multiple cookies cannot be successfully 1717 * received, so this function will not even get called. 1718 * Gracefully accommodating more dring cookies might 1719 * reasonably demand exchanging an additional attribute or 1720 * making a minor protocol adjustment 1721 */ 1722 PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies); 1723 return (EBADMSG); 1724 } 1725 1726 status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie, 1727 reg_msg->ncookies, reg_msg->num_descriptors, 1728 reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle); 1729 if (status != 0) { 1730 PR0("ldc_mem_dring_map() returned errno %d", status); 1731 return (status); 1732 } 1733 1734 /* 1735 * To remove the need for this assertion, must call 1736 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a 1737 * successful call to ldc_mem_dring_map() 1738 */ 1739 ASSERT(reg_msg->ncookies == 1); 1740 1741 if ((status = 1742 ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) { 1743 PR0("ldc_mem_dring_info() returned errno %d", status); 1744 if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0) 1745 PR0("ldc_mem_dring_unmap() returned errno %d", status); 1746 return (status); 1747 } 1748 1749 if (dring_minfo.vaddr == NULL) { 1750 PR0("Descriptor ring virtual address is NULL"); 1751 return (ENXIO); 1752 } 1753 1754 1755 /* Initialize for valid message and mapped dring */ 1756 PR1("descriptor size = %u, dring length = %u", 1757 vd->descriptor_size, vd->dring_len); 1758 vd->initialized |= VD_DRING; 1759 vd->dring_ident = 1; /* "There Can Be Only One" */ 1760 vd->dring = dring_minfo.vaddr; 1761 vd->descriptor_size = reg_msg->descriptor_size; 1762 vd->dring_len = reg_msg->num_descriptors; 1763 reg_msg->dring_ident = vd->dring_ident; 1764 1765 /* 1766 * Allocate and initialize a "shadow" array of data structures for 1767 * tasks to process I/O requests in dring elements 1768 */ 1769 vd->dring_task = 1770 kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP); 1771 for (int i = 0; i < vd->dring_len; i++) { 1772 vd->dring_task[i].vd = vd; 1773 vd->dring_task[i].index = i; 1774 vd->dring_task[i].request = &VD_DRING_ELEM(i)->payload; 1775 1776 status = ldc_mem_alloc_handle(vd->ldc_handle, 1777 &(vd->dring_task[i].mhdl)); 1778 if (status) { 1779 PR0("ldc_mem_alloc_handle() returned err %d ", status); 1780 return (ENXIO); 1781 } 1782 1783 vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 1784 } 1785 1786 return (0); 1787 } 1788 1789 static int 1790 vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1791 { 1792 vio_dring_unreg_msg_t *unreg_msg = (vio_dring_unreg_msg_t *)msg; 1793 1794 1795 ASSERT(msglen >= sizeof (msg->tag)); 1796 1797 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1798 VIO_DRING_UNREG)) { 1799 PR0("Message is not an unregister-dring message"); 1800 return (ENOMSG); 1801 } 1802 1803 if (msglen != sizeof (*unreg_msg)) { 1804 PR0("Expected %lu-byte unregister-dring message; " 1805 "received %lu bytes", sizeof (*unreg_msg), msglen); 1806 return (EBADMSG); 1807 } 1808 1809 if (unreg_msg->dring_ident != vd->dring_ident) { 1810 PR0("Expected dring ident %lu; received %lu", 1811 vd->dring_ident, unreg_msg->dring_ident); 1812 return (EBADMSG); 1813 } 1814 1815 return (0); 1816 } 1817 1818 static int 1819 process_rdx_msg(vio_msg_t *msg, size_t msglen) 1820 { 1821 ASSERT(msglen >= sizeof (msg->tag)); 1822 1823 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) { 1824 PR0("Message is not an RDX message"); 1825 return (ENOMSG); 1826 } 1827 1828 if (msglen != sizeof (vio_rdx_msg_t)) { 1829 PR0("Expected %lu-byte RDX message; received %lu bytes", 1830 sizeof (vio_rdx_msg_t), msglen); 1831 return (EBADMSG); 1832 } 1833 1834 PR0("Valid RDX message"); 1835 return (0); 1836 } 1837 1838 static int 1839 vd_check_seq_num(vd_t *vd, uint64_t seq_num) 1840 { 1841 if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) { 1842 PR0("Received seq_num %lu; expected %lu", 1843 seq_num, (vd->seq_num + 1)); 1844 PR0("initiating soft reset"); 1845 vd_need_reset(vd, B_FALSE); 1846 return (1); 1847 } 1848 1849 vd->seq_num = seq_num; 1850 vd->initialized |= VD_SEQ_NUM; /* superfluous after first time... */ 1851 return (0); 1852 } 1853 1854 /* 1855 * Return the expected size of an inband-descriptor message with all the 1856 * cookies it claims to include 1857 */ 1858 static size_t 1859 expected_inband_size(vd_dring_inband_msg_t *msg) 1860 { 1861 return ((sizeof (*msg)) + 1862 (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0]))); 1863 } 1864 1865 /* 1866 * Process an in-band descriptor message: used with clients like OBP, with 1867 * which vds exchanges descriptors within VIO message payloads, rather than 1868 * operating on them within a descriptor ring 1869 */ 1870 static int 1871 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1872 { 1873 size_t expected; 1874 vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg; 1875 1876 1877 ASSERT(msglen >= sizeof (msg->tag)); 1878 1879 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 1880 VIO_DESC_DATA)) { 1881 PR1("Message is not an in-band-descriptor message"); 1882 return (ENOMSG); 1883 } 1884 1885 if (msglen < sizeof (*desc_msg)) { 1886 PR0("Expected at least %lu-byte descriptor message; " 1887 "received %lu bytes", sizeof (*desc_msg), msglen); 1888 return (EBADMSG); 1889 } 1890 1891 if (msglen != (expected = expected_inband_size(desc_msg))) { 1892 PR0("Expected %lu-byte descriptor message; " 1893 "received %lu bytes", expected, msglen); 1894 return (EBADMSG); 1895 } 1896 1897 if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) 1898 return (EBADMSG); 1899 1900 /* 1901 * Valid message: Set up the in-band descriptor task and process the 1902 * request. Arrange to acknowledge the client's message, unless an 1903 * error processing the descriptor task results in setting 1904 * VIO_SUBTYPE_NACK 1905 */ 1906 PR1("Valid in-band-descriptor message"); 1907 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 1908 1909 ASSERT(vd->inband_task.msg != NULL); 1910 1911 bcopy(msg, vd->inband_task.msg, msglen); 1912 vd->inband_task.msglen = msglen; 1913 1914 /* 1915 * The task request is now the payload of the message 1916 * that was just copied into the body of the task. 1917 */ 1918 desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg; 1919 vd->inband_task.request = &desc_msg->payload; 1920 1921 return (vd_process_task(&vd->inband_task)); 1922 } 1923 1924 static int 1925 vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx, 1926 vio_msg_t *msg, size_t msglen) 1927 { 1928 int status; 1929 boolean_t ready; 1930 vd_dring_entry_t *elem = VD_DRING_ELEM(idx); 1931 1932 1933 /* Accept the updated dring element */ 1934 if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { 1935 PR0("ldc_mem_dring_acquire() returned errno %d", status); 1936 return (status); 1937 } 1938 ready = (elem->hdr.dstate == VIO_DESC_READY); 1939 if (ready) { 1940 elem->hdr.dstate = VIO_DESC_ACCEPTED; 1941 } else { 1942 PR0("descriptor %u not ready", idx); 1943 VD_DUMP_DRING_ELEM(elem); 1944 } 1945 if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 1946 PR0("ldc_mem_dring_release() returned errno %d", status); 1947 return (status); 1948 } 1949 if (!ready) 1950 return (EBUSY); 1951 1952 1953 /* Initialize a task and process the accepted element */ 1954 PR1("Processing dring element %u", idx); 1955 vd->dring_task[idx].type = type; 1956 1957 /* duplicate msg buf for cookies etc. */ 1958 bcopy(msg, vd->dring_task[idx].msg, msglen); 1959 1960 vd->dring_task[idx].msglen = msglen; 1961 if ((status = vd_process_task(&vd->dring_task[idx])) != EINPROGRESS) 1962 status = vd_mark_elem_done(vd, idx, 1963 vd->dring_task[idx].request->status, 1964 vd->dring_task[idx].request->nbytes); 1965 1966 return (status); 1967 } 1968 1969 static int 1970 vd_process_element_range(vd_t *vd, int start, int end, 1971 vio_msg_t *msg, size_t msglen) 1972 { 1973 int i, n, nelem, status = 0; 1974 boolean_t inprogress = B_FALSE; 1975 vd_task_type_t type; 1976 1977 1978 ASSERT(start >= 0); 1979 ASSERT(end >= 0); 1980 1981 /* 1982 * Arrange to acknowledge the client's message, unless an error 1983 * processing one of the dring elements results in setting 1984 * VIO_SUBTYPE_NACK 1985 */ 1986 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 1987 1988 /* 1989 * Process the dring elements in the range 1990 */ 1991 nelem = ((end < start) ? end + vd->dring_len : end) - start + 1; 1992 for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) { 1993 ((vio_dring_msg_t *)msg)->end_idx = i; 1994 type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK; 1995 status = vd_process_element(vd, type, i, msg, msglen); 1996 if (status == EINPROGRESS) 1997 inprogress = B_TRUE; 1998 else if (status != 0) 1999 break; 2000 } 2001 2002 /* 2003 * If some, but not all, operations of a multi-element range are in 2004 * progress, wait for other operations to complete before returning 2005 * (which will result in "ack" or "nack" of the message). Note that 2006 * all outstanding operations will need to complete, not just the ones 2007 * corresponding to the current range of dring elements; howevever, as 2008 * this situation is an error case, performance is less critical. 2009 */ 2010 if ((nelem > 1) && (status != EINPROGRESS) && inprogress) 2011 ddi_taskq_wait(vd->completionq); 2012 2013 return (status); 2014 } 2015 2016 static int 2017 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2018 { 2019 vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg; 2020 2021 2022 ASSERT(msglen >= sizeof (msg->tag)); 2023 2024 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 2025 VIO_DRING_DATA)) { 2026 PR1("Message is not a dring-data message"); 2027 return (ENOMSG); 2028 } 2029 2030 if (msglen != sizeof (*dring_msg)) { 2031 PR0("Expected %lu-byte dring message; received %lu bytes", 2032 sizeof (*dring_msg), msglen); 2033 return (EBADMSG); 2034 } 2035 2036 if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) 2037 return (EBADMSG); 2038 2039 if (dring_msg->dring_ident != vd->dring_ident) { 2040 PR0("Expected dring ident %lu; received ident %lu", 2041 vd->dring_ident, dring_msg->dring_ident); 2042 return (EBADMSG); 2043 } 2044 2045 if (dring_msg->start_idx >= vd->dring_len) { 2046 PR0("\"start_idx\" = %u; must be less than %u", 2047 dring_msg->start_idx, vd->dring_len); 2048 return (EBADMSG); 2049 } 2050 2051 if ((dring_msg->end_idx < 0) || 2052 (dring_msg->end_idx >= vd->dring_len)) { 2053 PR0("\"end_idx\" = %u; must be >= 0 and less than %u", 2054 dring_msg->end_idx, vd->dring_len); 2055 return (EBADMSG); 2056 } 2057 2058 /* Valid message; process range of updated dring elements */ 2059 PR1("Processing descriptor range, start = %u, end = %u", 2060 dring_msg->start_idx, dring_msg->end_idx); 2061 return (vd_process_element_range(vd, dring_msg->start_idx, 2062 dring_msg->end_idx, msg, msglen)); 2063 } 2064 2065 static int 2066 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes) 2067 { 2068 int retry, status; 2069 size_t size = *nbytes; 2070 2071 2072 for (retry = 0, status = ETIMEDOUT; 2073 retry < vds_ldc_retries && status == ETIMEDOUT; 2074 retry++) { 2075 PR1("ldc_read() attempt %d", (retry + 1)); 2076 *nbytes = size; 2077 status = ldc_read(ldc_handle, msg, nbytes); 2078 } 2079 2080 if (status) { 2081 PR0("ldc_read() returned errno %d", status); 2082 if (status != ECONNRESET) 2083 return (ENOMSG); 2084 return (status); 2085 } else if (*nbytes == 0) { 2086 PR1("ldc_read() returned 0 and no message read"); 2087 return (ENOMSG); 2088 } 2089 2090 PR1("RCVD %lu-byte message", *nbytes); 2091 return (0); 2092 } 2093 2094 static int 2095 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2096 { 2097 int status; 2098 2099 2100 PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype, 2101 msg->tag.vio_subtype, msg->tag.vio_subtype_env); 2102 #ifdef DEBUG 2103 vd_decode_tag(msg); 2104 #endif 2105 2106 /* 2107 * Validate session ID up front, since it applies to all messages 2108 * once set 2109 */ 2110 if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) { 2111 PR0("Expected SID %u, received %u", vd->sid, 2112 msg->tag.vio_sid); 2113 return (EBADMSG); 2114 } 2115 2116 PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state)); 2117 2118 /* 2119 * Process the received message based on connection state 2120 */ 2121 switch (vd->state) { 2122 case VD_STATE_INIT: /* expect version message */ 2123 if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0) 2124 return (status); 2125 2126 /* Version negotiated, move to that state */ 2127 vd->state = VD_STATE_VER; 2128 return (0); 2129 2130 case VD_STATE_VER: /* expect attribute message */ 2131 if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0) 2132 return (status); 2133 2134 /* Attributes exchanged, move to that state */ 2135 vd->state = VD_STATE_ATTR; 2136 return (0); 2137 2138 case VD_STATE_ATTR: 2139 switch (vd->xfer_mode) { 2140 case VIO_DESC_MODE: /* expect RDX message */ 2141 if ((status = process_rdx_msg(msg, msglen)) != 0) 2142 return (status); 2143 2144 /* Ready to receive in-band descriptors */ 2145 vd->state = VD_STATE_DATA; 2146 return (0); 2147 2148 case VIO_DRING_MODE: /* expect register-dring message */ 2149 if ((status = 2150 vd_process_dring_reg_msg(vd, msg, msglen)) != 0) 2151 return (status); 2152 2153 /* One dring negotiated, move to that state */ 2154 vd->state = VD_STATE_DRING; 2155 return (0); 2156 2157 default: 2158 ASSERT("Unsupported transfer mode"); 2159 PR0("Unsupported transfer mode"); 2160 return (ENOTSUP); 2161 } 2162 2163 case VD_STATE_DRING: /* expect RDX, register-dring, or unreg-dring */ 2164 if ((status = process_rdx_msg(msg, msglen)) == 0) { 2165 /* Ready to receive data */ 2166 vd->state = VD_STATE_DATA; 2167 return (0); 2168 } else if (status != ENOMSG) { 2169 return (status); 2170 } 2171 2172 2173 /* 2174 * If another register-dring message is received, stay in 2175 * dring state in case the client sends RDX; although the 2176 * protocol allows multiple drings, this server does not 2177 * support using more than one 2178 */ 2179 if ((status = 2180 vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG) 2181 return (status); 2182 2183 /* 2184 * Acknowledge an unregister-dring message, but reset the 2185 * connection anyway: Although the protocol allows 2186 * unregistering drings, this server cannot serve a vdisk 2187 * without its only dring 2188 */ 2189 status = vd_process_dring_unreg_msg(vd, msg, msglen); 2190 return ((status == 0) ? ENOTSUP : status); 2191 2192 case VD_STATE_DATA: 2193 switch (vd->xfer_mode) { 2194 case VIO_DESC_MODE: /* expect in-band-descriptor message */ 2195 return (vd_process_desc_msg(vd, msg, msglen)); 2196 2197 case VIO_DRING_MODE: /* expect dring-data or unreg-dring */ 2198 /* 2199 * Typically expect dring-data messages, so handle 2200 * them first 2201 */ 2202 if ((status = vd_process_dring_msg(vd, msg, 2203 msglen)) != ENOMSG) 2204 return (status); 2205 2206 /* 2207 * Acknowledge an unregister-dring message, but reset 2208 * the connection anyway: Although the protocol 2209 * allows unregistering drings, this server cannot 2210 * serve a vdisk without its only dring 2211 */ 2212 status = vd_process_dring_unreg_msg(vd, msg, msglen); 2213 return ((status == 0) ? ENOTSUP : status); 2214 2215 default: 2216 ASSERT("Unsupported transfer mode"); 2217 PR0("Unsupported transfer mode"); 2218 return (ENOTSUP); 2219 } 2220 2221 default: 2222 ASSERT("Invalid client connection state"); 2223 PR0("Invalid client connection state"); 2224 return (ENOTSUP); 2225 } 2226 } 2227 2228 static int 2229 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2230 { 2231 int status; 2232 boolean_t reset_ldc = B_FALSE; 2233 2234 2235 /* 2236 * Check that the message is at least big enough for a "tag", so that 2237 * message processing can proceed based on tag-specified message type 2238 */ 2239 if (msglen < sizeof (vio_msg_tag_t)) { 2240 PR0("Received short (%lu-byte) message", msglen); 2241 /* Can't "nack" short message, so drop the big hammer */ 2242 PR0("initiating full reset"); 2243 vd_need_reset(vd, B_TRUE); 2244 return (EBADMSG); 2245 } 2246 2247 /* 2248 * Process the message 2249 */ 2250 switch (status = vd_do_process_msg(vd, msg, msglen)) { 2251 case 0: 2252 /* "ack" valid, successfully-processed messages */ 2253 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 2254 break; 2255 2256 case EINPROGRESS: 2257 /* The completion handler will "ack" or "nack" the message */ 2258 return (EINPROGRESS); 2259 case ENOMSG: 2260 PR0("Received unexpected message"); 2261 _NOTE(FALLTHROUGH); 2262 case EBADMSG: 2263 case ENOTSUP: 2264 /* "nack" invalid messages */ 2265 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 2266 break; 2267 2268 default: 2269 /* "nack" failed messages */ 2270 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 2271 /* An LDC error probably occurred, so try resetting it */ 2272 reset_ldc = B_TRUE; 2273 break; 2274 } 2275 2276 PR1("\tResulting in state %d (%s)", vd->state, 2277 vd_decode_state(vd->state)); 2278 2279 /* Send the "ack" or "nack" to the client */ 2280 PR1("Sending %s", 2281 (msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK"); 2282 if (send_msg(vd->ldc_handle, msg, msglen) != 0) 2283 reset_ldc = B_TRUE; 2284 2285 /* Arrange to reset the connection for nack'ed or failed messages */ 2286 if ((status != 0) || reset_ldc) { 2287 PR0("initiating %s reset", 2288 (reset_ldc) ? "full" : "soft"); 2289 vd_need_reset(vd, reset_ldc); 2290 } 2291 2292 return (status); 2293 } 2294 2295 static boolean_t 2296 vd_enabled(vd_t *vd) 2297 { 2298 boolean_t enabled; 2299 2300 2301 mutex_enter(&vd->lock); 2302 enabled = vd->enabled; 2303 mutex_exit(&vd->lock); 2304 return (enabled); 2305 } 2306 2307 static void 2308 vd_recv_msg(void *arg) 2309 { 2310 vd_t *vd = (vd_t *)arg; 2311 int rv = 0, status = 0; 2312 2313 ASSERT(vd != NULL); 2314 2315 PR2("New task to receive incoming message(s)"); 2316 2317 2318 while (vd_enabled(vd) && status == 0) { 2319 size_t msglen, msgsize; 2320 ldc_status_t lstatus; 2321 2322 /* 2323 * Receive and process a message 2324 */ 2325 vd_reset_if_needed(vd); /* can change vd->max_msglen */ 2326 2327 /* 2328 * check if channel is UP - else break out of loop 2329 */ 2330 status = ldc_status(vd->ldc_handle, &lstatus); 2331 if (lstatus != LDC_UP) { 2332 PR0("channel not up (status=%d), exiting recv loop\n", 2333 lstatus); 2334 break; 2335 } 2336 2337 ASSERT(vd->max_msglen != 0); 2338 2339 msgsize = vd->max_msglen; /* stable copy for alloc/free */ 2340 msglen = msgsize; /* actual len after recv_msg() */ 2341 2342 status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen); 2343 switch (status) { 2344 case 0: 2345 rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp, 2346 msglen); 2347 /* check if max_msglen changed */ 2348 if (msgsize != vd->max_msglen) { 2349 PR0("max_msglen changed 0x%lx to 0x%lx bytes\n", 2350 msgsize, vd->max_msglen); 2351 kmem_free(vd->vio_msgp, msgsize); 2352 vd->vio_msgp = 2353 kmem_alloc(vd->max_msglen, KM_SLEEP); 2354 } 2355 if (rv == EINPROGRESS) 2356 continue; 2357 break; 2358 2359 case ENOMSG: 2360 break; 2361 2362 case ECONNRESET: 2363 PR0("initiating soft reset (ECONNRESET)\n"); 2364 vd_need_reset(vd, B_FALSE); 2365 status = 0; 2366 break; 2367 2368 default: 2369 /* Probably an LDC failure; arrange to reset it */ 2370 PR0("initiating full reset (status=0x%x)", status); 2371 vd_need_reset(vd, B_TRUE); 2372 break; 2373 } 2374 } 2375 2376 PR2("Task finished"); 2377 } 2378 2379 static uint_t 2380 vd_handle_ldc_events(uint64_t event, caddr_t arg) 2381 { 2382 vd_t *vd = (vd_t *)(void *)arg; 2383 int status; 2384 2385 ASSERT(vd != NULL); 2386 2387 if (!vd_enabled(vd)) 2388 return (LDC_SUCCESS); 2389 2390 if (event & LDC_EVT_DOWN) { 2391 PR0("LDC_EVT_DOWN: LDC channel went down"); 2392 2393 vd_need_reset(vd, B_TRUE); 2394 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 2395 DDI_SLEEP); 2396 if (status == DDI_FAILURE) { 2397 PR0("cannot schedule task to recv msg\n"); 2398 vd_need_reset(vd, B_TRUE); 2399 } 2400 } 2401 2402 if (event & LDC_EVT_RESET) { 2403 PR0("LDC_EVT_RESET: LDC channel was reset"); 2404 2405 if (vd->state != VD_STATE_INIT) { 2406 PR0("scheduling full reset"); 2407 vd_need_reset(vd, B_FALSE); 2408 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 2409 vd, DDI_SLEEP); 2410 if (status == DDI_FAILURE) { 2411 PR0("cannot schedule task to recv msg\n"); 2412 vd_need_reset(vd, B_TRUE); 2413 } 2414 2415 } else { 2416 PR0("channel already reset, ignoring...\n"); 2417 PR0("doing ldc up...\n"); 2418 (void) ldc_up(vd->ldc_handle); 2419 } 2420 2421 return (LDC_SUCCESS); 2422 } 2423 2424 if (event & LDC_EVT_UP) { 2425 PR0("EVT_UP: LDC is up\nResetting client connection state"); 2426 PR0("initiating soft reset"); 2427 vd_need_reset(vd, B_FALSE); 2428 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 2429 vd, DDI_SLEEP); 2430 if (status == DDI_FAILURE) { 2431 PR0("cannot schedule task to recv msg\n"); 2432 vd_need_reset(vd, B_TRUE); 2433 return (LDC_SUCCESS); 2434 } 2435 } 2436 2437 if (event & LDC_EVT_READ) { 2438 int status; 2439 2440 PR1("New data available"); 2441 /* Queue a task to receive the new data */ 2442 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 2443 DDI_SLEEP); 2444 2445 if (status == DDI_FAILURE) { 2446 PR0("cannot schedule task to recv msg\n"); 2447 vd_need_reset(vd, B_TRUE); 2448 } 2449 } 2450 2451 return (LDC_SUCCESS); 2452 } 2453 2454 static uint_t 2455 vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg) 2456 { 2457 _NOTE(ARGUNUSED(key, val)) 2458 (*((uint_t *)arg))++; 2459 return (MH_WALK_TERMINATE); 2460 } 2461 2462 2463 static int 2464 vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2465 { 2466 uint_t vd_present = 0; 2467 minor_t instance; 2468 vds_t *vds; 2469 2470 2471 switch (cmd) { 2472 case DDI_DETACH: 2473 /* the real work happens below */ 2474 break; 2475 case DDI_SUSPEND: 2476 PR0("No action required for DDI_SUSPEND"); 2477 return (DDI_SUCCESS); 2478 default: 2479 PR0("Unrecognized \"cmd\""); 2480 return (DDI_FAILURE); 2481 } 2482 2483 ASSERT(cmd == DDI_DETACH); 2484 instance = ddi_get_instance(dip); 2485 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { 2486 PR0("Could not get state for instance %u", instance); 2487 ddi_soft_state_free(vds_state, instance); 2488 return (DDI_FAILURE); 2489 } 2490 2491 /* Do no detach when serving any vdisks */ 2492 mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present); 2493 if (vd_present) { 2494 PR0("Not detaching because serving vdisks"); 2495 return (DDI_FAILURE); 2496 } 2497 2498 PR0("Detaching"); 2499 if (vds->initialized & VDS_MDEG) { 2500 (void) mdeg_unregister(vds->mdeg); 2501 kmem_free(vds->ispecp->specp, sizeof (vds_prop_template)); 2502 kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t)); 2503 vds->ispecp = NULL; 2504 vds->mdeg = NULL; 2505 } 2506 2507 if (vds->initialized & VDS_LDI) 2508 (void) ldi_ident_release(vds->ldi_ident); 2509 mod_hash_destroy_hash(vds->vd_table); 2510 ddi_soft_state_free(vds_state, instance); 2511 return (DDI_SUCCESS); 2512 } 2513 2514 static boolean_t 2515 is_pseudo_device(dev_info_t *dip) 2516 { 2517 dev_info_t *parent, *root = ddi_root_node(); 2518 2519 2520 for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root); 2521 parent = ddi_get_parent(parent)) { 2522 if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0) 2523 return (B_TRUE); 2524 } 2525 2526 return (B_FALSE); 2527 } 2528 2529 static int 2530 vd_setup_full_disk(vd_t *vd) 2531 { 2532 int rval, status; 2533 major_t major = getmajor(vd->dev[0]); 2534 minor_t minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE; 2535 struct dk_minfo dk_minfo; 2536 2537 /* 2538 * At this point, vdisk_size is set to the size of partition 2 but 2539 * this does not represent the size of the disk because partition 2 2540 * may not cover the entire disk and its size does not include reserved 2541 * blocks. So we update vdisk_size to be the size of the entire disk. 2542 */ 2543 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO, 2544 (intptr_t)&dk_minfo, (vd_open_flags | FKIOCTL), 2545 kcred, &rval)) != 0) { 2546 PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d", 2547 status); 2548 return (status); 2549 } 2550 vd->vdisk_size = dk_minfo.dki_capacity; 2551 2552 /* Set full-disk parameters */ 2553 vd->vdisk_type = VD_DISK_TYPE_DISK; 2554 vd->nslices = (sizeof (vd->dev))/(sizeof (vd->dev[0])); 2555 2556 /* Move dev number and LDI handle to entire-disk-slice array elements */ 2557 vd->dev[VD_ENTIRE_DISK_SLICE] = vd->dev[0]; 2558 vd->dev[0] = 0; 2559 vd->ldi_handle[VD_ENTIRE_DISK_SLICE] = vd->ldi_handle[0]; 2560 vd->ldi_handle[0] = NULL; 2561 2562 /* Initialize device numbers for remaining slices and open them */ 2563 for (int slice = 0; slice < vd->nslices; slice++) { 2564 /* 2565 * Skip the entire-disk slice, as it's already open and its 2566 * device known 2567 */ 2568 if (slice == VD_ENTIRE_DISK_SLICE) 2569 continue; 2570 ASSERT(vd->dev[slice] == 0); 2571 ASSERT(vd->ldi_handle[slice] == NULL); 2572 2573 /* 2574 * Construct the device number for the current slice 2575 */ 2576 vd->dev[slice] = makedevice(major, (minor + slice)); 2577 2578 /* 2579 * Open all slices of the disk to serve them to the client. 2580 * Slices are opened exclusively to prevent other threads or 2581 * processes in the service domain from performing I/O to 2582 * slices being accessed by a client. Failure to open a slice 2583 * results in vds not serving this disk, as the client could 2584 * attempt (and should be able) to access any slice immediately. 2585 * Any slices successfully opened before a failure will get 2586 * closed by vds_destroy_vd() as a result of the error returned 2587 * by this function. 2588 * 2589 * We need to do the open with FNDELAY so that opening an empty 2590 * slice does not fail. 2591 */ 2592 PR0("Opening device major %u, minor %u = slice %u", 2593 major, minor, slice); 2594 if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 2595 vd_open_flags | FNDELAY, kcred, &vd->ldi_handle[slice], 2596 vd->vds->ldi_ident)) != 0) { 2597 PRN("ldi_open_by_dev() returned errno %d " 2598 "for slice %u", status, slice); 2599 /* vds_destroy_vd() will close any open slices */ 2600 vd->ldi_handle[slice] = NULL; 2601 return (status); 2602 } 2603 } 2604 2605 return (0); 2606 } 2607 2608 static int 2609 vd_setup_partition_efi(vd_t *vd) 2610 { 2611 efi_gpt_t *gpt; 2612 efi_gpe_t *gpe; 2613 struct uuid uuid = EFI_RESERVED; 2614 uint32_t crc; 2615 int length; 2616 2617 length = sizeof (efi_gpt_t) + sizeof (efi_gpe_t); 2618 2619 gpt = kmem_zalloc(length, KM_SLEEP); 2620 gpe = (efi_gpe_t *)(gpt + 1); 2621 2622 gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE); 2623 gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); 2624 gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t)); 2625 gpt->efi_gpt_FirstUsableLBA = LE_64(0ULL); 2626 gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1); 2627 gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1); 2628 gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t)); 2629 2630 UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid); 2631 gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA; 2632 gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA; 2633 2634 CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table); 2635 gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); 2636 2637 CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table); 2638 gpt->efi_gpt_HeaderCRC32 = LE_32(~crc); 2639 2640 vd->dk_efi.dki_lba = 0; 2641 vd->dk_efi.dki_length = length; 2642 vd->dk_efi.dki_data = gpt; 2643 2644 return (0); 2645 } 2646 2647 static int 2648 vd_setup_file(vd_t *vd) 2649 { 2650 int i, rval, status; 2651 ushort_t sum; 2652 vattr_t vattr; 2653 dev_t dev; 2654 char *file_path = vd->device_path; 2655 char dev_path[MAXPATHLEN + 1]; 2656 ldi_handle_t lhandle; 2657 struct dk_cinfo dk_cinfo; 2658 struct dk_label label; 2659 2660 /* make sure the file is valid */ 2661 if ((status = lookupname(file_path, UIO_SYSSPACE, FOLLOW, 2662 NULLVPP, &vd->file_vnode)) != 0) { 2663 PRN("Cannot lookup file(%s) errno %d", file_path, status); 2664 return (status); 2665 } 2666 2667 if (vd->file_vnode->v_type != VREG) { 2668 PRN("Invalid file type (%s)\n", file_path); 2669 VN_RELE(vd->file_vnode); 2670 return (EBADF); 2671 } 2672 VN_RELE(vd->file_vnode); 2673 2674 if ((status = vn_open(file_path, UIO_SYSSPACE, vd_open_flags | FOFFMAX, 2675 0, &vd->file_vnode, 0, 0)) != 0) { 2676 PRN("vn_open(%s) = errno %d", file_path, status); 2677 return (status); 2678 } 2679 2680 /* 2681 * We set vd->file now so that vds_destroy_vd will take care of 2682 * closing the file and releasing the vnode in case of an error. 2683 */ 2684 vd->file = B_TRUE; 2685 vd->pseudo = B_FALSE; 2686 2687 vattr.va_mask = AT_SIZE; 2688 if ((status = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred)) != 0) { 2689 PRN("VOP_GETATTR(%s) = errno %d", file_path, status); 2690 return (EIO); 2691 } 2692 2693 vd->file_size = vattr.va_size; 2694 /* size should be at least sizeof(dk_label) */ 2695 if (vd->file_size < sizeof (struct dk_label)) { 2696 PRN("Size of file has to be at least %ld bytes", 2697 sizeof (struct dk_label)); 2698 return (EIO); 2699 } 2700 2701 if (vd->file_vnode->v_flag & VNOMAP) { 2702 PRN("File %s cannot be mapped", file_path); 2703 return (EIO); 2704 } 2705 2706 /* read label from file */ 2707 if (VD_FILE_LABEL_READ(vd, &label) < 0) { 2708 PRN("Can't read label from %s", file_path); 2709 return (EIO); 2710 } 2711 2712 /* label checksum */ 2713 sum = vd_lbl2cksum(&label); 2714 2715 if (label.dkl_magic != DKL_MAGIC || label.dkl_cksum != sum) { 2716 PR0("%s has an invalid disk label " 2717 "(magic=%x cksum=%x (expect %x))", 2718 file_path, label.dkl_magic, label.dkl_cksum, sum); 2719 2720 /* default label */ 2721 bzero(&label, sizeof (struct dk_label)); 2722 2723 /* 2724 * We must have a resonable number of cylinders and sectors so 2725 * that newfs can run using default values. 2726 * 2727 * if (disk_size < 2MB) 2728 * phys_cylinders = disk_size / 100K 2729 * else 2730 * phys_cylinders = disk_size / 300K 2731 * 2732 * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders 2733 * alt_cylinders = (phys_cylinders > 2) ? 2 : 0; 2734 * data_cylinders = phys_cylinders - alt_cylinders 2735 * 2736 * sectors = disk_size / (phys_cylinders * blk_size) 2737 */ 2738 if (vd->file_size < (2 * 1024 * 1024)) 2739 label.dkl_pcyl = vd->file_size / (100 * 1024); 2740 else 2741 label.dkl_pcyl = vd->file_size / (300 * 1024); 2742 2743 if (label.dkl_pcyl == 0) 2744 label.dkl_pcyl = 1; 2745 2746 if (label.dkl_pcyl > 2) 2747 label.dkl_acyl = 2; 2748 else 2749 label.dkl_acyl = 0; 2750 2751 label.dkl_nsect = vd->file_size / 2752 (DEV_BSIZE * label.dkl_pcyl); 2753 label.dkl_ncyl = label.dkl_pcyl - label.dkl_acyl; 2754 label.dkl_nhead = 1; 2755 label.dkl_write_reinstruct = 0; 2756 label.dkl_read_reinstruct = 0; 2757 label.dkl_rpm = 7200; 2758 label.dkl_apc = 0; 2759 label.dkl_intrlv = 0; 2760 label.dkl_magic = DKL_MAGIC; 2761 2762 PR0("requested disk size: %ld bytes\n", vd->file_size); 2763 PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label.dkl_pcyl, 2764 label.dkl_nhead, label.dkl_nsect); 2765 PR0("provided disk size: %ld bytes\n", (uint64_t) 2766 (label.dkl_pcyl * 2767 label.dkl_nhead * label.dkl_nsect * DEV_BSIZE)); 2768 2769 /* 2770 * We must have a correct label name otherwise format(1m) will 2771 * not recognized the disk as labeled. 2772 */ 2773 (void) snprintf(label.dkl_asciilabel, LEN_DKL_ASCII, 2774 "SUNVDSK cyl %d alt %d hd %d sec %d", 2775 label.dkl_ncyl, label.dkl_acyl, label.dkl_nhead, 2776 label.dkl_nsect); 2777 2778 /* default VTOC */ 2779 label.dkl_vtoc.v_version = V_VERSION; 2780 label.dkl_vtoc.v_nparts = V_NUMPAR; 2781 label.dkl_vtoc.v_sanity = VTOC_SANE; 2782 label.dkl_vtoc.v_part[2].p_tag = V_BACKUP; 2783 label.dkl_map[2].dkl_cylno = 0; 2784 label.dkl_map[2].dkl_nblk = label.dkl_ncyl * 2785 label.dkl_nhead * label.dkl_nsect; 2786 label.dkl_map[0] = label.dkl_map[2]; 2787 label.dkl_map[0] = label.dkl_map[2]; 2788 label.dkl_cksum = vd_lbl2cksum(&label); 2789 2790 /* write default label to file */ 2791 if (VD_FILE_LABEL_WRITE(vd, &label) < 0) { 2792 PRN("Can't write label to %s", file_path); 2793 return (EIO); 2794 } 2795 } 2796 2797 vd->nslices = label.dkl_vtoc.v_nparts; 2798 2799 /* sector size = block size = DEV_BSIZE */ 2800 vd->vdisk_size = (label.dkl_pcyl * 2801 label.dkl_nhead * label.dkl_nsect) / DEV_BSIZE; 2802 vd->vdisk_type = VD_DISK_TYPE_DISK; 2803 vd->vdisk_label = VD_DISK_LABEL_VTOC; 2804 vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */ 2805 2806 /* Get max_xfer_sz from the device where the file is */ 2807 dev = vd->file_vnode->v_vfsp->vfs_dev; 2808 dev_path[0] = NULL; 2809 if (ddi_dev_pathname(dev, S_IFBLK, dev_path) == DDI_SUCCESS) { 2810 PR0("underlying device = %s\n", dev_path); 2811 } 2812 2813 if ((status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD, 2814 kcred, &lhandle, vd->vds->ldi_ident)) != 0) { 2815 PR0("ldi_open_by_dev() returned errno %d for device %s", 2816 status, dev_path); 2817 } else { 2818 if ((status = ldi_ioctl(lhandle, DKIOCINFO, 2819 (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred, 2820 &rval)) != 0) { 2821 PR0("ldi_ioctl(DKIOCINFO) returned errno %d for %s", 2822 status, dev_path); 2823 } else { 2824 /* 2825 * Store the device's max transfer size for 2826 * return to the client 2827 */ 2828 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; 2829 } 2830 2831 PR0("close the device %s", dev_path); 2832 (void) ldi_close(lhandle, FREAD, kcred); 2833 } 2834 2835 PR0("using for file %s, dev %s, max_xfer = %u blks", 2836 file_path, dev_path, vd->max_xfer_sz); 2837 2838 vd->dk_geom.dkg_ncyl = label.dkl_ncyl; 2839 vd->dk_geom.dkg_acyl = label.dkl_acyl; 2840 vd->dk_geom.dkg_pcyl = label.dkl_pcyl; 2841 vd->dk_geom.dkg_nhead = label.dkl_nhead; 2842 vd->dk_geom.dkg_nsect = label.dkl_nsect; 2843 vd->dk_geom.dkg_intrlv = label.dkl_intrlv; 2844 vd->dk_geom.dkg_apc = label.dkl_apc; 2845 vd->dk_geom.dkg_rpm = label.dkl_rpm; 2846 vd->dk_geom.dkg_write_reinstruct = label.dkl_write_reinstruct; 2847 vd->dk_geom.dkg_read_reinstruct = label.dkl_read_reinstruct; 2848 2849 vd->vtoc.v_sanity = label.dkl_vtoc.v_sanity; 2850 vd->vtoc.v_version = label.dkl_vtoc.v_version; 2851 vd->vtoc.v_sectorsz = DEV_BSIZE; 2852 vd->vtoc.v_nparts = label.dkl_vtoc.v_nparts; 2853 2854 bcopy(label.dkl_vtoc.v_volume, vd->vtoc.v_volume, 2855 LEN_DKL_VVOL); 2856 bcopy(label.dkl_asciilabel, vd->vtoc.v_asciilabel, 2857 LEN_DKL_ASCII); 2858 2859 for (i = 0; i < vd->nslices; i++) { 2860 vd->vtoc.timestamp[i] = label.dkl_vtoc.v_timestamp[i]; 2861 vd->vtoc.v_part[i].p_tag = label.dkl_vtoc.v_part[i].p_tag; 2862 vd->vtoc.v_part[i].p_flag = label.dkl_vtoc.v_part[i].p_flag; 2863 vd->vtoc.v_part[i].p_start = label.dkl_map[i].dkl_cylno * 2864 label.dkl_nhead * label.dkl_nsect; 2865 vd->vtoc.v_part[i].p_size = label.dkl_map[i].dkl_nblk; 2866 vd->ldi_handle[i] = NULL; 2867 vd->dev[i] = NULL; 2868 } 2869 2870 return (0); 2871 } 2872 2873 static int 2874 vd_setup_vd(vd_t *vd) 2875 { 2876 int rval, status; 2877 dev_info_t *dip; 2878 struct dk_cinfo dk_cinfo; 2879 char *device_path = vd->device_path; 2880 2881 /* 2882 * We need to open with FNDELAY so that opening an empty partition 2883 * does not fail. 2884 */ 2885 if ((status = ldi_open_by_name(device_path, vd_open_flags | FNDELAY, 2886 kcred, &vd->ldi_handle[0], vd->vds->ldi_ident)) != 0) { 2887 PR0("ldi_open_by_name(%s) = errno %d", device_path, status); 2888 vd->ldi_handle[0] = NULL; 2889 2890 /* this may not be a device try opening as a file */ 2891 if (status == ENXIO || status == ENODEV) 2892 status = vd_setup_file(vd); 2893 if (status) { 2894 PRN("Cannot use device/file (%s), errno=%d\n", 2895 device_path, status); 2896 if (status == ENXIO || status == ENODEV || 2897 status == ENOENT) { 2898 return (EAGAIN); 2899 } 2900 } 2901 return (status); 2902 } 2903 2904 /* 2905 * nslices must be updated now so that vds_destroy_vd() will close 2906 * the slice we have just opened in case of an error. 2907 */ 2908 vd->nslices = 1; 2909 vd->file = B_FALSE; 2910 2911 /* Get device number and size of backing device */ 2912 if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) { 2913 PRN("ldi_get_dev() returned errno %d for %s", 2914 status, device_path); 2915 return (status); 2916 } 2917 if (ldi_get_size(vd->ldi_handle[0], &vd->vdisk_size) != DDI_SUCCESS) { 2918 PRN("ldi_get_size() failed for %s", device_path); 2919 return (EIO); 2920 } 2921 vd->vdisk_size = lbtodb(vd->vdisk_size); /* convert to blocks */ 2922 2923 /* Verify backing device supports dk_cinfo, dk_geom, and vtoc */ 2924 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO, 2925 (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred, 2926 &rval)) != 0) { 2927 PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s", 2928 status, device_path); 2929 return (status); 2930 } 2931 if (dk_cinfo.dki_partition >= V_NUMPAR) { 2932 PRN("slice %u >= maximum slice %u for %s", 2933 dk_cinfo.dki_partition, V_NUMPAR, device_path); 2934 return (EIO); 2935 } 2936 2937 status = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc, &vd->vdisk_label); 2938 2939 if (status != 0) { 2940 PRN("vd_read_vtoc returned errno %d for %s", 2941 status, device_path); 2942 return (status); 2943 } 2944 2945 if (vd->vdisk_label == VD_DISK_LABEL_VTOC && 2946 (status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM, 2947 (intptr_t)&vd->dk_geom, (vd_open_flags | FKIOCTL), 2948 kcred, &rval)) != 0) { 2949 PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s", 2950 status, device_path); 2951 return (status); 2952 } 2953 2954 /* Store the device's max transfer size for return to the client */ 2955 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; 2956 2957 /* Determine if backing device is a pseudo device */ 2958 if ((dip = ddi_hold_devi_by_instance(getmajor(vd->dev[0]), 2959 dev_to_instance(vd->dev[0]), 0)) == NULL) { 2960 PRN("%s is no longer accessible", device_path); 2961 return (EIO); 2962 } 2963 vd->pseudo = is_pseudo_device(dip); 2964 ddi_release_devi(dip); 2965 if (vd->pseudo) { 2966 vd->vdisk_type = VD_DISK_TYPE_SLICE; 2967 vd->nslices = 1; 2968 return (0); /* ...and we're done */ 2969 } 2970 2971 /* If slice is entire-disk slice, initialize for full disk */ 2972 if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE) 2973 return (vd_setup_full_disk(vd)); 2974 2975 2976 /* Otherwise, we have a non-entire slice of a device */ 2977 vd->vdisk_type = VD_DISK_TYPE_SLICE; 2978 vd->nslices = 1; 2979 2980 if (vd->vdisk_label == VD_DISK_LABEL_EFI) { 2981 status = vd_setup_partition_efi(vd); 2982 return (status); 2983 } 2984 2985 /* Initialize dk_geom structure for single-slice device */ 2986 if (vd->dk_geom.dkg_nsect == 0) { 2987 PRN("%s geometry claims 0 sectors per track", device_path); 2988 return (EIO); 2989 } 2990 if (vd->dk_geom.dkg_nhead == 0) { 2991 PRN("%s geometry claims 0 heads", device_path); 2992 return (EIO); 2993 } 2994 vd->dk_geom.dkg_ncyl = 2995 vd->vdisk_size/vd->dk_geom.dkg_nsect/vd->dk_geom.dkg_nhead; 2996 vd->dk_geom.dkg_acyl = 0; 2997 vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl; 2998 2999 3000 /* Initialize vtoc structure for single-slice device */ 3001 bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume, 3002 MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume))); 3003 bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part)); 3004 vd->vtoc.v_nparts = 1; 3005 vd->vtoc.v_part[0].p_tag = V_UNASSIGNED; 3006 vd->vtoc.v_part[0].p_flag = 0; 3007 vd->vtoc.v_part[0].p_start = 0; 3008 vd->vtoc.v_part[0].p_size = vd->vdisk_size; 3009 bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel, 3010 MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel))); 3011 3012 3013 return (0); 3014 } 3015 3016 static int 3017 vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id, 3018 vd_t **vdp) 3019 { 3020 char tq_name[TASKQ_NAMELEN]; 3021 int status; 3022 ddi_iblock_cookie_t iblock = NULL; 3023 ldc_attr_t ldc_attr; 3024 vd_t *vd; 3025 3026 3027 ASSERT(vds != NULL); 3028 ASSERT(device_path != NULL); 3029 ASSERT(vdp != NULL); 3030 PR0("Adding vdisk for %s", device_path); 3031 3032 if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) { 3033 PRN("No memory for virtual disk"); 3034 return (EAGAIN); 3035 } 3036 *vdp = vd; /* assign here so vds_destroy_vd() can cleanup later */ 3037 vd->vds = vds; 3038 (void) strncpy(vd->device_path, device_path, MAXPATHLEN); 3039 3040 /* Open vdisk and initialize parameters */ 3041 if ((status = vd_setup_vd(vd)) == 0) { 3042 vd->initialized |= VD_DISK_READY; 3043 3044 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); 3045 PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u", 3046 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), 3047 (vd->pseudo ? "yes" : "no"), (vd->file ? "yes" : "no"), 3048 vd->nslices); 3049 } else { 3050 if (status != EAGAIN) 3051 return (status); 3052 } 3053 3054 /* Initialize locking */ 3055 if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED, 3056 &iblock) != DDI_SUCCESS) { 3057 PRN("Could not get iblock cookie."); 3058 return (EIO); 3059 } 3060 3061 mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock); 3062 vd->initialized |= VD_LOCKING; 3063 3064 3065 /* Create start and completion task queues for the vdisk */ 3066 (void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id); 3067 PR1("tq_name = %s", tq_name); 3068 if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1, 3069 TASKQ_DEFAULTPRI, 0)) == NULL) { 3070 PRN("Could not create task queue"); 3071 return (EIO); 3072 } 3073 (void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id); 3074 PR1("tq_name = %s", tq_name); 3075 if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1, 3076 TASKQ_DEFAULTPRI, 0)) == NULL) { 3077 PRN("Could not create task queue"); 3078 return (EIO); 3079 } 3080 vd->enabled = 1; /* before callback can dispatch to startq */ 3081 3082 3083 /* Bring up LDC */ 3084 ldc_attr.devclass = LDC_DEV_BLK_SVC; 3085 ldc_attr.instance = ddi_get_instance(vds->dip); 3086 ldc_attr.mode = LDC_MODE_UNRELIABLE; 3087 ldc_attr.mtu = VD_LDC_MTU; 3088 if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) { 3089 PRN("Could not initialize LDC channel %lu, " 3090 "init failed with error %d", ldc_id, status); 3091 return (status); 3092 } 3093 vd->initialized |= VD_LDC; 3094 3095 if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events, 3096 (caddr_t)vd)) != 0) { 3097 PRN("Could not initialize LDC channel %lu," 3098 "reg_callback failed with error %d", ldc_id, status); 3099 return (status); 3100 } 3101 3102 if ((status = ldc_open(vd->ldc_handle)) != 0) { 3103 PRN("Could not initialize LDC channel %lu," 3104 "open failed with error %d", ldc_id, status); 3105 return (status); 3106 } 3107 3108 if ((status = ldc_up(vd->ldc_handle)) != 0) { 3109 PR0("ldc_up() returned errno %d", status); 3110 } 3111 3112 /* Allocate the inband task memory handle */ 3113 status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl)); 3114 if (status) { 3115 PRN("Could not initialize LDC channel %lu," 3116 "alloc_handle failed with error %d", ldc_id, status); 3117 return (ENXIO); 3118 } 3119 3120 /* Add the successfully-initialized vdisk to the server's table */ 3121 if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) { 3122 PRN("Error adding vdisk ID %lu to table", id); 3123 return (EIO); 3124 } 3125 3126 /* Allocate the staging buffer */ 3127 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 3128 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 3129 3130 /* store initial state */ 3131 vd->state = VD_STATE_INIT; 3132 3133 return (0); 3134 } 3135 3136 static void 3137 vd_free_dring_task(vd_t *vdp) 3138 { 3139 if (vdp->dring_task != NULL) { 3140 ASSERT(vdp->dring_len != 0); 3141 /* Free all dring_task memory handles */ 3142 for (int i = 0; i < vdp->dring_len; i++) { 3143 (void) ldc_mem_free_handle(vdp->dring_task[i].mhdl); 3144 kmem_free(vdp->dring_task[i].msg, vdp->max_msglen); 3145 vdp->dring_task[i].msg = NULL; 3146 } 3147 kmem_free(vdp->dring_task, 3148 (sizeof (*vdp->dring_task)) * vdp->dring_len); 3149 vdp->dring_task = NULL; 3150 } 3151 } 3152 3153 /* 3154 * Destroy the state associated with a virtual disk 3155 */ 3156 static void 3157 vds_destroy_vd(void *arg) 3158 { 3159 vd_t *vd = (vd_t *)arg; 3160 int retry = 0, rv; 3161 3162 if (vd == NULL) 3163 return; 3164 3165 PR0("Destroying vdisk state"); 3166 3167 if (vd->dk_efi.dki_data != NULL) 3168 kmem_free(vd->dk_efi.dki_data, vd->dk_efi.dki_length); 3169 3170 /* Disable queuing requests for the vdisk */ 3171 if (vd->initialized & VD_LOCKING) { 3172 mutex_enter(&vd->lock); 3173 vd->enabled = 0; 3174 mutex_exit(&vd->lock); 3175 } 3176 3177 /* Drain and destroy start queue (*before* destroying completionq) */ 3178 if (vd->startq != NULL) 3179 ddi_taskq_destroy(vd->startq); /* waits for queued tasks */ 3180 3181 /* Drain and destroy completion queue (*before* shutting down LDC) */ 3182 if (vd->completionq != NULL) 3183 ddi_taskq_destroy(vd->completionq); /* waits for tasks */ 3184 3185 vd_free_dring_task(vd); 3186 3187 /* Free the inband task memory handle */ 3188 (void) ldc_mem_free_handle(vd->inband_task.mhdl); 3189 3190 /* Shut down LDC */ 3191 if (vd->initialized & VD_LDC) { 3192 /* unmap the dring */ 3193 if (vd->initialized & VD_DRING) 3194 (void) ldc_mem_dring_unmap(vd->dring_handle); 3195 3196 /* close LDC channel - retry on EAGAIN */ 3197 while ((rv = ldc_close(vd->ldc_handle)) == EAGAIN) { 3198 if (++retry > vds_ldc_retries) { 3199 PR0("Timed out closing channel"); 3200 break; 3201 } 3202 drv_usecwait(vds_ldc_delay); 3203 } 3204 if (rv == 0) { 3205 (void) ldc_unreg_callback(vd->ldc_handle); 3206 (void) ldc_fini(vd->ldc_handle); 3207 } else { 3208 /* 3209 * Closing the LDC channel has failed. Ideally we should 3210 * fail here but there is no Zeus level infrastructure 3211 * to handle this. The MD has already been changed and 3212 * we have to do the close. So we try to do as much 3213 * clean up as we can. 3214 */ 3215 (void) ldc_set_cb_mode(vd->ldc_handle, LDC_CB_DISABLE); 3216 while (ldc_unreg_callback(vd->ldc_handle) == EAGAIN) 3217 drv_usecwait(vds_ldc_delay); 3218 } 3219 } 3220 3221 /* Free the staging buffer for msgs */ 3222 if (vd->vio_msgp != NULL) { 3223 kmem_free(vd->vio_msgp, vd->max_msglen); 3224 vd->vio_msgp = NULL; 3225 } 3226 3227 /* Free the inband message buffer */ 3228 if (vd->inband_task.msg != NULL) { 3229 kmem_free(vd->inband_task.msg, vd->max_msglen); 3230 vd->inband_task.msg = NULL; 3231 } 3232 if (vd->file) { 3233 /* Close file */ 3234 (void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1, 3235 0, kcred); 3236 VN_RELE(vd->file_vnode); 3237 } else { 3238 /* Close any open backing-device slices */ 3239 for (uint_t slice = 0; slice < vd->nslices; slice++) { 3240 if (vd->ldi_handle[slice] != NULL) { 3241 PR0("Closing slice %u", slice); 3242 (void) ldi_close(vd->ldi_handle[slice], 3243 vd_open_flags | FNDELAY, kcred); 3244 } 3245 } 3246 } 3247 3248 /* Free lock */ 3249 if (vd->initialized & VD_LOCKING) 3250 mutex_destroy(&vd->lock); 3251 3252 /* Finally, free the vdisk structure itself */ 3253 kmem_free(vd, sizeof (*vd)); 3254 } 3255 3256 static int 3257 vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id) 3258 { 3259 int status; 3260 vd_t *vd = NULL; 3261 3262 3263 if ((status = vds_do_init_vd(vds, id, device_path, ldc_id, &vd)) != 0) 3264 vds_destroy_vd(vd); 3265 3266 return (status); 3267 } 3268 3269 static int 3270 vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel, 3271 uint64_t *ldc_id) 3272 { 3273 int num_channels; 3274 3275 3276 /* Look for channel endpoint child(ren) of the vdisk MD node */ 3277 if ((num_channels = md_scan_dag(md, vd_node, 3278 md_find_name(md, VD_CHANNEL_ENDPOINT), 3279 md_find_name(md, "fwd"), channel)) <= 0) { 3280 PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT); 3281 return (-1); 3282 } 3283 3284 /* Get the "id" value for the first channel endpoint node */ 3285 if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) { 3286 PRN("No \"%s\" property found for \"%s\" of vdisk", 3287 VD_ID_PROP, VD_CHANNEL_ENDPOINT); 3288 return (-1); 3289 } 3290 3291 if (num_channels > 1) { 3292 PRN("Using ID of first of multiple channels for this vdisk"); 3293 } 3294 3295 return (0); 3296 } 3297 3298 static int 3299 vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id) 3300 { 3301 int num_nodes, status; 3302 size_t size; 3303 mde_cookie_t *channel; 3304 3305 3306 if ((num_nodes = md_node_count(md)) <= 0) { 3307 PRN("Invalid node count in Machine Description subtree"); 3308 return (-1); 3309 } 3310 size = num_nodes*(sizeof (*channel)); 3311 channel = kmem_zalloc(size, KM_SLEEP); 3312 status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id); 3313 kmem_free(channel, size); 3314 3315 return (status); 3316 } 3317 3318 static void 3319 vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) 3320 { 3321 char *device_path = NULL; 3322 uint64_t id = 0, ldc_id = 0; 3323 3324 3325 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { 3326 PRN("Error getting vdisk \"%s\"", VD_ID_PROP); 3327 return; 3328 } 3329 PR0("Adding vdisk ID %lu", id); 3330 if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP, 3331 &device_path) != 0) { 3332 PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); 3333 return; 3334 } 3335 3336 if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) { 3337 PRN("Error getting LDC ID for vdisk %lu", id); 3338 return; 3339 } 3340 3341 if (vds_init_vd(vds, id, device_path, ldc_id) != 0) { 3342 PRN("Failed to add vdisk ID %lu", id); 3343 return; 3344 } 3345 } 3346 3347 static void 3348 vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) 3349 { 3350 uint64_t id = 0; 3351 3352 3353 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { 3354 PRN("Unable to get \"%s\" property from vdisk's MD node", 3355 VD_ID_PROP); 3356 return; 3357 } 3358 PR0("Removing vdisk ID %lu", id); 3359 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0) 3360 PRN("No vdisk entry found for vdisk ID %lu", id); 3361 } 3362 3363 static void 3364 vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node, 3365 md_t *curr_md, mde_cookie_t curr_vd_node) 3366 { 3367 char *curr_dev, *prev_dev; 3368 uint64_t curr_id = 0, curr_ldc_id = 0; 3369 uint64_t prev_id = 0, prev_ldc_id = 0; 3370 size_t len; 3371 3372 3373 /* Validate that vdisk ID has not changed */ 3374 if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) { 3375 PRN("Error getting previous vdisk \"%s\" property", 3376 VD_ID_PROP); 3377 return; 3378 } 3379 if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) { 3380 PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP); 3381 return; 3382 } 3383 if (curr_id != prev_id) { 3384 PRN("Not changing vdisk: ID changed from %lu to %lu", 3385 prev_id, curr_id); 3386 return; 3387 } 3388 3389 /* Validate that LDC ID has not changed */ 3390 if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) { 3391 PRN("Error getting LDC ID for vdisk %lu", prev_id); 3392 return; 3393 } 3394 3395 if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) { 3396 PRN("Error getting LDC ID for vdisk %lu", curr_id); 3397 return; 3398 } 3399 if (curr_ldc_id != prev_ldc_id) { 3400 _NOTE(NOTREACHED); /* lint is confused */ 3401 PRN("Not changing vdisk: " 3402 "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id); 3403 return; 3404 } 3405 3406 /* Determine whether device path has changed */ 3407 if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP, 3408 &prev_dev) != 0) { 3409 PRN("Error getting previous vdisk \"%s\"", 3410 VD_BLOCK_DEVICE_PROP); 3411 return; 3412 } 3413 if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP, 3414 &curr_dev) != 0) { 3415 PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); 3416 return; 3417 } 3418 if (((len = strlen(curr_dev)) == strlen(prev_dev)) && 3419 (strncmp(curr_dev, prev_dev, len) == 0)) 3420 return; /* no relevant (supported) change */ 3421 3422 PR0("Changing vdisk ID %lu", prev_id); 3423 3424 /* Remove old state, which will close vdisk and reset */ 3425 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0) 3426 PRN("No entry found for vdisk ID %lu", prev_id); 3427 3428 /* Re-initialize vdisk with new state */ 3429 if (vds_init_vd(vds, curr_id, curr_dev, curr_ldc_id) != 0) { 3430 PRN("Failed to change vdisk ID %lu", curr_id); 3431 return; 3432 } 3433 } 3434 3435 static int 3436 vds_process_md(void *arg, mdeg_result_t *md) 3437 { 3438 int i; 3439 vds_t *vds = arg; 3440 3441 3442 if (md == NULL) 3443 return (MDEG_FAILURE); 3444 ASSERT(vds != NULL); 3445 3446 for (i = 0; i < md->removed.nelem; i++) 3447 vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]); 3448 for (i = 0; i < md->match_curr.nelem; i++) 3449 vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i], 3450 md->match_curr.mdp, md->match_curr.mdep[i]); 3451 for (i = 0; i < md->added.nelem; i++) 3452 vds_add_vd(vds, md->added.mdp, md->added.mdep[i]); 3453 3454 return (MDEG_SUCCESS); 3455 } 3456 3457 3458 static int 3459 vds_do_attach(dev_info_t *dip) 3460 { 3461 int status, sz; 3462 int cfg_handle; 3463 minor_t instance = ddi_get_instance(dip); 3464 vds_t *vds; 3465 mdeg_prop_spec_t *pspecp; 3466 mdeg_node_spec_t *ispecp; 3467 3468 /* 3469 * The "cfg-handle" property of a vds node in an MD contains the MD's 3470 * notion of "instance", or unique identifier, for that node; OBP 3471 * stores the value of the "cfg-handle" MD property as the value of 3472 * the "reg" property on the node in the device tree it builds from 3473 * the MD and passes to Solaris. Thus, we look up the devinfo node's 3474 * "reg" property value to uniquely identify this device instance when 3475 * registering with the MD event-generation framework. If the "reg" 3476 * property cannot be found, the device tree state is presumably so 3477 * broken that there is no point in continuing. 3478 */ 3479 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 3480 VD_REG_PROP)) { 3481 PRN("vds \"%s\" property does not exist", VD_REG_PROP); 3482 return (DDI_FAILURE); 3483 } 3484 3485 /* Get the MD instance for later MDEG registration */ 3486 cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 3487 VD_REG_PROP, -1); 3488 3489 if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) { 3490 PRN("Could not allocate state for instance %u", instance); 3491 return (DDI_FAILURE); 3492 } 3493 3494 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { 3495 PRN("Could not get state for instance %u", instance); 3496 ddi_soft_state_free(vds_state, instance); 3497 return (DDI_FAILURE); 3498 } 3499 3500 3501 vds->dip = dip; 3502 vds->vd_table = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS, 3503 vds_destroy_vd, 3504 sizeof (void *)); 3505 ASSERT(vds->vd_table != NULL); 3506 3507 if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) { 3508 PRN("ldi_ident_from_dip() returned errno %d", status); 3509 return (DDI_FAILURE); 3510 } 3511 vds->initialized |= VDS_LDI; 3512 3513 /* Register for MD updates */ 3514 sz = sizeof (vds_prop_template); 3515 pspecp = kmem_alloc(sz, KM_SLEEP); 3516 bcopy(vds_prop_template, pspecp, sz); 3517 3518 VDS_SET_MDEG_PROP_INST(pspecp, cfg_handle); 3519 3520 /* initialize the complete prop spec structure */ 3521 ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP); 3522 ispecp->namep = "virtual-device"; 3523 ispecp->specp = pspecp; 3524 3525 if (mdeg_register(ispecp, &vd_match, vds_process_md, vds, 3526 &vds->mdeg) != MDEG_SUCCESS) { 3527 PRN("Unable to register for MD updates"); 3528 kmem_free(ispecp, sizeof (mdeg_node_spec_t)); 3529 kmem_free(pspecp, sz); 3530 return (DDI_FAILURE); 3531 } 3532 3533 vds->ispecp = ispecp; 3534 vds->initialized |= VDS_MDEG; 3535 3536 /* Prevent auto-detaching so driver is available whenever MD changes */ 3537 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 3538 DDI_PROP_SUCCESS) { 3539 PRN("failed to set \"%s\" property for instance %u", 3540 DDI_NO_AUTODETACH, instance); 3541 } 3542 3543 ddi_report_dev(dip); 3544 return (DDI_SUCCESS); 3545 } 3546 3547 static int 3548 vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3549 { 3550 int status; 3551 3552 switch (cmd) { 3553 case DDI_ATTACH: 3554 PR0("Attaching"); 3555 if ((status = vds_do_attach(dip)) != DDI_SUCCESS) 3556 (void) vds_detach(dip, DDI_DETACH); 3557 return (status); 3558 case DDI_RESUME: 3559 PR0("No action required for DDI_RESUME"); 3560 return (DDI_SUCCESS); 3561 default: 3562 return (DDI_FAILURE); 3563 } 3564 } 3565 3566 static struct dev_ops vds_ops = { 3567 DEVO_REV, /* devo_rev */ 3568 0, /* devo_refcnt */ 3569 ddi_no_info, /* devo_getinfo */ 3570 nulldev, /* devo_identify */ 3571 nulldev, /* devo_probe */ 3572 vds_attach, /* devo_attach */ 3573 vds_detach, /* devo_detach */ 3574 nodev, /* devo_reset */ 3575 NULL, /* devo_cb_ops */ 3576 NULL, /* devo_bus_ops */ 3577 nulldev /* devo_power */ 3578 }; 3579 3580 static struct modldrv modldrv = { 3581 &mod_driverops, 3582 "virtual disk server v%I%", 3583 &vds_ops, 3584 }; 3585 3586 static struct modlinkage modlinkage = { 3587 MODREV_1, 3588 &modldrv, 3589 NULL 3590 }; 3591 3592 3593 int 3594 _init(void) 3595 { 3596 int i, status; 3597 3598 3599 if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0) 3600 return (status); 3601 if ((status = mod_install(&modlinkage)) != 0) { 3602 ddi_soft_state_fini(&vds_state); 3603 return (status); 3604 } 3605 3606 /* Fill in the bit-mask of server-supported operations */ 3607 for (i = 0; i < vds_noperations; i++) 3608 vds_operations |= 1 << (vds_operation[i].operation - 1); 3609 3610 return (0); 3611 } 3612 3613 int 3614 _info(struct modinfo *modinfop) 3615 { 3616 return (mod_info(&modlinkage, modinfop)); 3617 } 3618 3619 int 3620 _fini(void) 3621 { 3622 int status; 3623 3624 3625 if ((status = mod_remove(&modlinkage)) != 0) 3626 return (status); 3627 ddi_soft_state_fini(&vds_state); 3628 return (0); 3629 } 3630