/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Virtual disk server
 */


#include <sys/types.h>
#include <sys/conf.h>
#include <sys/crc32.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/file.h>
#include <sys/mdeg.h>
#include <sys/modhash.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/sysmacros.h>
#include <sys/vio_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdsk_common.h>
#include <sys/vtoc.h>
#include <sys/vfs.h>
#include <sys/stat.h>

/* Virtual disk server initialization flags */
#define	VDS_LDI			0x01
#define	VDS_MDEG		0x02

/* Virtual disk server tunable parameters */
#define	VDS_RETRIES		5
#define	VDS_LDC_DELAY		1000		/* 1 msecs */
#define	VDS_DEV_DELAY		10000000	/* 10 secs */
#define	VDS_NCHAINS		32

/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
#define	VDS_NAME		"virtual-disk-server"

#define	VD_NAME			"vd"
#define	VD_VOLUME_NAME		"vdisk"
#define	VD_ASCIILABEL		"Virtual Disk"

#define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
#define	VD_ID_PROP		"id"
#define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
#define	VD_REG_PROP		"reg"

/* Virtual disk initialization flags */
#define	VD_DISK_READY		0x01
#define	VD_LOCKING		0x02
#define	VD_LDC			0x04
#define	VD_DRING		0x08
#define	VD_SID			0x10
#define	VD_SEQ_NUM		0x20

/* Flags for opening/closing backing devices via LDI */
#define	VD_OPEN_FLAGS		(FEXCL | FREAD | FWRITE)

/*
 * By Solaris convention, slice/partition 2 represents the entire disk;
 * unfortunately, this convention does not appear to be codified.
 */
#define	VD_ENTIRE_DISK_SLICE	2

/* Return a cpp token as a string */
#define	STRINGIZE(token)	#token

/*
 * Print a message prefixed with the current function name to the message log
 * (and optionally to the console for verbose boots); these macros use cpp's
 * concatenation of string literals and C99 variable-length-argument-list
 * macros
 */
#define	PRN(...)	_PRN("?%s(): "__VA_ARGS__, "")
#define	_PRN(format, ...)					\
	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)
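
/*
 * For illustration: a call such as
 *	PRN("err %d", rv);
 * expands, roughly, to
 *	cmn_err(CE_CONT, "?%s(): " "err %d" "%s", __func__, rv, "");
 * The extra "" argument supplied by PRN() keeps _PRN()'s variable argument
 * list non-empty, so the trailing "%s" appended to the format always has a
 * matching argument even when the caller passes no arguments of its own.
 */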

/* Return a pointer to the "i"th vdisk dring element */
#define	VD_DRING_ELEM(i)	((vd_dring_entry_t *)(void *)	\
	    (vd->dring + (i)*vd->descriptor_size))

/* Return the virtual disk client's type as a string (for use in messages) */
#define	VD_CLIENT(vd)							\
	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
	    (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" :	\
	    (((vd)->xfer_mode == 0) ? "null client" :			\
	    "unsupported client")))

/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vds instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t	vds_prop_template[] = {
	{ MDET_PROP_STR,	"name",		VDS_NAME },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};

#define	VDS_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t	vd_prop_match[] = {
	{ MDET_PROP_VAL,	VD_ID_PROP },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vd_match = {"virtual-device-port",
				    vd_prop_match};

/* Debugging macros */
#ifdef DEBUG

static int	vd_msglevel = 0;

#define	PR0 if (vd_msglevel > 0)	PRN
#define	PR1 if (vd_msglevel > 1)	PRN
#define	PR2 if (vd_msglevel > 2)	PRN

#define	VD_DUMP_DRING_ELEM(elem)					\
	PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
	    elem->hdr.dstate,						\
	    elem->payload.operation,					\
	    elem->payload.status,					\
	    elem->payload.nbytes,					\
	    elem->payload.addr,						\
	    elem->payload.ncookies);

char *
vd_decode_state(int state)
{
	char *str;

#define	CASE_STATE(_s)	case _s: str = #_s; break;

	switch (state) {
	CASE_STATE(VD_STATE_INIT)
	CASE_STATE(VD_STATE_VER)
	CASE_STATE(VD_STATE_ATTR)
	CASE_STATE(VD_STATE_DRING)
	CASE_STATE(VD_STATE_RDX)
	CASE_STATE(VD_STATE_DATA)
	default: str = "unknown"; break;
	}

#undef CASE_STATE

	return (str);
}

void
vd_decode_tag(vio_msg_t *msg)
{
	char *tstr, *sstr, *estr;

#define	CASE_TYPE(_s)	case _s: tstr = #_s; break;

	switch (msg->tag.vio_msgtype) {
	CASE_TYPE(VIO_TYPE_CTRL)
	CASE_TYPE(VIO_TYPE_DATA)
	CASE_TYPE(VIO_TYPE_ERR)
	default: tstr = "unknown"; break;
	}

#undef CASE_TYPE

#define	CASE_SUBTYPE(_s) case _s: sstr = #_s; break;

	switch (msg->tag.vio_subtype) {
	CASE_SUBTYPE(VIO_SUBTYPE_INFO)
	CASE_SUBTYPE(VIO_SUBTYPE_ACK)
	CASE_SUBTYPE(VIO_SUBTYPE_NACK)
	default: sstr = "unknown"; break;
	}

#undef CASE_SUBTYPE

#define	CASE_ENV(_s)	case _s: estr = #_s; break;

	switch (msg->tag.vio_subtype_env) {
	CASE_ENV(VIO_VER_INFO)
	CASE_ENV(VIO_ATTR_INFO)
	CASE_ENV(VIO_DRING_REG)
	CASE_ENV(VIO_DRING_UNREG)
	CASE_ENV(VIO_RDX)
	CASE_ENV(VIO_PKT_DATA)
	CASE_ENV(VIO_DESC_DATA)
	CASE_ENV(VIO_DRING_DATA)
	default: estr = "unknown"; break;
	}

#undef CASE_ENV

	PR1("(%x/%x/%x) message : (%s/%s/%s)",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, tstr, sstr, estr);
}

#else	/* !DEBUG */

#define	PR0(...)
#define	PR1(...)
#define	PR2(...)

#define	VD_DUMP_DRING_ELEM(elem)

#define	vd_decode_state(_s)	(NULL)
#define	vd_decode_tag(_s)	(NULL)

#endif	/* DEBUG */


/*
 * Soft state structure for a vds instance
 */
typedef struct vds {
	uint_t		initialized;	/* driver inst initialization flags */
	dev_info_t	*dip;		/* driver inst devinfo pointer */
	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
	mod_hash_t	*vd_table;	/* table of virtual disks served */
	mdeg_node_spec_t *ispecp;	/* mdeg node specification */
	mdeg_handle_t	mdeg;		/* handle for MDEG operations */
} vds_t;

/*
 * Types of descriptor-processing tasks
 */
typedef enum vd_task_type {
	VD_NONFINAL_RANGE_TASK,	/* task for intermediate descriptor in range */
	VD_FINAL_RANGE_TASK,	/* task for last in a range of descriptors */
} vd_task_type_t;

/*
 * Structure describing the task for processing a descriptor
 */
typedef struct vd_task {
	struct vd		*vd;		/* vd instance task is for */
	vd_task_type_t		type;		/* type of descriptor task */
	int			index;		/* dring elem index for task */
	vio_msg_t		*msg;		/* VIO message task is for */
	size_t			msglen;		/* length of message content */
	vd_dring_payload_t	*request;	/* request task will perform */
	struct buf		buf;		/* buf(9s) for I/O request */
	ldc_mem_handle_t	mhdl;		/* task memory handle */
} vd_task_t;

/*
 * Soft state structure for a virtual disk instance
 */
typedef struct vd {
	uint_t			initialized;	/* vdisk initialization flags */
	vds_t			*vds;		/* server for this vdisk */
	ddi_taskq_t		*startq;	/* queue for I/O start tasks */
	ddi_taskq_t		*completionq;	/* queue for completion tasks */
	ldi_handle_t		ldi_handle[V_NUMPAR];	/* LDI slice handles */
	char			device_path[MAXPATHLEN + 1]; /* vdisk device */
	dev_t			dev[V_NUMPAR];	/* dev numbers for slices */
	uint_t			nslices;	/* number of slices */
	size_t			vdisk_size;	/* number of blocks in vdisk */
	vd_disk_type_t		vdisk_type;	/* slice or entire disk */
	vd_disk_label_t		vdisk_label;	/* EFI or VTOC label */
	ushort_t		max_xfer_sz;	/* max xfer size in DEV_BSIZE */
	boolean_t		pseudo;		/* underlying pseudo dev */
	boolean_t		file;		/* underlying file */
	char			*file_maddr;	/* file mapping address */
	vnode_t			*file_vnode;	/* file vnode */
	size_t			file_size;	/* file size */
	struct dk_efi		dk_efi;		/* synthetic for slice type */
	struct dk_geom		dk_geom;	/* synthetic for slice type */
	struct vtoc		vtoc;		/* synthetic for slice type */
	ldc_status_t		ldc_state;	/* LDC connection state */
	ldc_handle_t		ldc_handle;	/* handle for LDC comm */
	size_t			max_msglen;	/* largest LDC message len */
	vd_state_t		state;		/* client handshake state */
	uint8_t			xfer_mode;	/* transfer mode with client */
	uint32_t		sid;		/* client's session ID */
	uint64_t		seq_num;	/* message sequence number */
	uint64_t		dring_ident;	/* identifier of dring */
	ldc_dring_handle_t	dring_handle;	/* handle for dring ops */
	uint32_t		descriptor_size;	/* num bytes in desc */
	uint32_t		dring_len;	/* number of dring elements */
	caddr_t			dring;		/* address of dring */
	caddr_t			vio_msgp;	/* vio msg staging buffer */
	vd_task_t		inband_task;	/* task for inband descriptor */
	vd_task_t		*dring_task;	/* tasks for dring elements */

	kmutex_t		lock;		/* protects variables below */
	boolean_t		enabled;	/* is vdisk enabled? */
	boolean_t		reset_state;	/* reset connection state? */
	boolean_t		reset_ldc;	/* reset LDC channel? */
} vd_t;
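
/*
 * Note: "inband_task" and "dring_task" above act as shadow task structures
 * for client descriptors.  dring_task holds one pre-allocated vd_task_t per
 * descriptor-ring element; it is set up when the client registers its dring
 * (vd_process_dring_reg_msg()) and torn down via vd_free_dring_task() when
 * the connection is reset.  inband_task is reused for clients that send
 * descriptors in-band within VIO messages (e.g. OBP) rather than via a dring.
 */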

typedef struct vds_operation {
	char	*namep;
	uint8_t	operation;
	int	(*start)(vd_task_t *task);
	void	(*complete)(void *arg);
} vds_operation_t;

typedef struct vd_ioctl {
	uint8_t		operation;		/* vdisk operation */
	const char	*operation_name;	/* vdisk operation name */
	size_t		nbytes;			/* size of operation buffer */
	int		cmd;			/* corresponding ioctl cmd */
	const char	*cmd_name;		/* ioctl cmd name */
	void		*arg;			/* ioctl cmd argument */
	/* convert input vd_buf to output ioctl_arg */
	void		(*copyin)(void *vd_buf, void *ioctl_arg);
	/* convert input ioctl_arg to output vd_buf */
	void		(*copyout)(void *ioctl_arg, void *vd_buf);
} vd_ioctl_t;

/* Define trivial copyin/copyout conversion function flag */
#define	VD_IDENTITY	((void (*)(void *, void *))-1)


static int	vds_ldc_retries = VDS_RETRIES;
static int	vds_ldc_delay = VDS_LDC_DELAY;
static int	vds_dev_retries = VDS_RETRIES;
static int	vds_dev_delay = VDS_DEV_DELAY;
static void	*vds_state;
static uint64_t	vds_operations;	/* see vds_operation[] definition below */

static int	vd_open_flags = VD_OPEN_FLAGS;

/*
 * Supported protocol version pairs, from highest (newest) to lowest (oldest)
 *
 * Each supported major version should appear only once, paired with (and only
 * with) its highest supported minor version number (as the protocol requires
 * supporting all lower minor version numbers as well)
 */
static const vio_ver_t	vds_version[] = {{1, 0}};
static const size_t	vds_num_versions =
    sizeof (vds_version)/sizeof (vds_version[0]);
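
/*
 * For illustration only: if a hypothetical minor revision 1.1 of the protocol
 * were later supported, the table would still contain a single entry for
 * major version 1, but with the highest supported minor:
 *
 *	static const vio_ver_t	vds_version[] = {{1, 1}};
 *
 * vds_supported_version() would then accept 1.0 and 1.1 clients as-is and
 * adjust a client requesting a higher minor (e.g. 1.2) down to 1.1.
 */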

static void vd_free_dring_task(vd_t *vdp);
static int vd_setup_vd(vd_t *vd);
static boolean_t vd_enabled(vd_t *vd);

static int
vd_start_bio(vd_task_t *task)
{
	int			rv, status = 0;
	vd_t			*vd = task->vd;
	vd_dring_payload_t	*request = task->request;
	struct buf		*buf = &task->buf;
	uint8_t			mtype;
	caddr_t			addr;
	size_t			offset, maxlen;
	int			slice;

	ASSERT(vd != NULL);
	ASSERT(request != NULL);

	slice = request->slice;

	ASSERT(slice < vd->nslices);
	ASSERT((request->operation == VD_OP_BREAD) ||
	    (request->operation == VD_OP_BWRITE));

	if (request->nbytes == 0)
		return (EINVAL);	/* no service for trivial requests */

	PR1("%s %lu bytes at block %lu",
	    (request->operation == VD_OP_BREAD) ? "Read" : "Write",
	    request->nbytes, request->addr);

	bioinit(buf);
	buf->b_flags	= B_BUSY;
	buf->b_bcount	= request->nbytes;
	buf->b_lblkno	= request->addr;
	buf->b_edev	= vd->dev[slice];

	mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP;

	/* Map memory exported by client */
	status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies,
	    mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R,
	    &(buf->b_un.b_addr), NULL);
	if (status != 0) {
		PR0("ldc_mem_map() returned err %d ", status);
		biofini(buf);
		return (status);
	}

	status = ldc_mem_acquire(task->mhdl, 0, buf->b_bcount);
	if (status != 0) {
		(void) ldc_mem_unmap(task->mhdl);
		PR0("ldc_mem_acquire() returned err %d ", status);
		biofini(buf);
		return (status);
	}

	buf->b_flags |= (request->operation == VD_OP_BREAD) ? B_READ : B_WRITE;

	/* Start the block I/O */
	if (vd->file) {

		if (request->addr >= vd->vtoc.v_part[slice].p_size) {
			/* address past the end of the slice */
			PR0("req_addr (0x%lx) > psize (0x%lx)",
			    request->addr, vd->vtoc.v_part[slice].p_size);
			request->nbytes = 0;
			status = 0;
			goto cleanup;
		}

		offset = (vd->vtoc.v_part[slice].p_start +
		    request->addr) * DEV_BSIZE;

		/*
		 * If the requested size is greater than the size
		 * of the partition, truncate the read/write.
		 */
		maxlen = (vd->vtoc.v_part[slice].p_size -
		    request->addr) * DEV_BSIZE;

		if (request->nbytes > maxlen) {
			PR0("I/O size truncated to %lu bytes from %lu bytes",
			    maxlen, request->nbytes);
			request->nbytes = maxlen;
		}

		/*
		 * We have to ensure that we are reading/writing into the mmap
		 * range. If we have a partial disk image (e.g. an image of
		 * s0 instead of s2) the system can try to access slices that
		 * are not included in the disk image.
		 */
		if ((offset + request->nbytes) >= vd->file_size) {
			PR0("offset + nbytes (0x%lx + 0x%lx) >= "
			    "file_size (0x%lx)", offset, request->nbytes,
			    vd->file_size);
			request->nbytes = 0;
			status = EIO;
			goto cleanup;
		}

		addr = vd->file_maddr + offset;

		if (request->operation == VD_OP_BREAD)
			bcopy(addr, buf->b_un.b_addr, request->nbytes);
		else
			bcopy(buf->b_un.b_addr, addr, request->nbytes);

	} else {
		status = ldi_strategy(vd->ldi_handle[slice], buf);
		if (status == 0)
			return (EINPROGRESS); /* will complete on completionq */
	}

cleanup:
	/* Clean up after error */
	rv = ldc_mem_release(task->mhdl, 0, buf->b_bcount);
	if (rv) {
		PR0("ldc_mem_release() returned err %d ", rv);
	}
	rv = ldc_mem_unmap(task->mhdl);
	if (rv) {
		PR0("ldc_mem_unmap() returned err %d ", rv);
	}

	biofini(buf);
	return (status);
}

static int
send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
{
	int	status;
	size_t	nbytes;

	do {
		nbytes = msglen;
		status = ldc_write(ldc_handle, msg, &nbytes);
		if (status != EWOULDBLOCK)
			break;
		drv_usecwait(vds_ldc_delay);
	} while (status == EWOULDBLOCK);

	if (status != 0) {
		if (status != ECONNRESET)
			PR0("ldc_write() returned errno %d", status);
		return (status);
	} else if (nbytes != msglen) {
		PR0("ldc_write() performed only partial write");
		return (EIO);
	}

	PR1("SENT %lu bytes", msglen);
	return (0);
}

static void
vd_need_reset(vd_t *vd, boolean_t reset_ldc)
{
	mutex_enter(&vd->lock);
	vd->reset_state	= B_TRUE;
	vd->reset_ldc	= reset_ldc;
	mutex_exit(&vd->lock);
}

/*
 * Reset the state of the connection with a client, if needed; reset the LDC
 * transport as well, if needed.  This function should only be called from
 * vd_recv_msg(), as it waits for tasks to complete - otherwise a deadlock
 * can occur.
541 */ 542 static void 543 vd_reset_if_needed(vd_t *vd) 544 { 545 int status = 0; 546 547 mutex_enter(&vd->lock); 548 if (!vd->reset_state) { 549 ASSERT(!vd->reset_ldc); 550 mutex_exit(&vd->lock); 551 return; 552 } 553 mutex_exit(&vd->lock); 554 555 PR0("Resetting connection state with %s", VD_CLIENT(vd)); 556 557 /* 558 * Let any asynchronous I/O complete before possibly pulling the rug 559 * out from under it; defer checking vd->reset_ldc, as one of the 560 * asynchronous tasks might set it 561 */ 562 ddi_taskq_wait(vd->completionq); 563 564 if (vd->file) { 565 status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred); 566 if (status) { 567 PR0("VOP_FSYNC returned errno %d", status); 568 } 569 } 570 571 if ((vd->initialized & VD_DRING) && 572 ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)) 573 PR0("ldc_mem_dring_unmap() returned errno %d", status); 574 575 vd_free_dring_task(vd); 576 577 /* Free the staging buffer for msgs */ 578 if (vd->vio_msgp != NULL) { 579 kmem_free(vd->vio_msgp, vd->max_msglen); 580 vd->vio_msgp = NULL; 581 } 582 583 /* Free the inband message buffer */ 584 if (vd->inband_task.msg != NULL) { 585 kmem_free(vd->inband_task.msg, vd->max_msglen); 586 vd->inband_task.msg = NULL; 587 } 588 589 mutex_enter(&vd->lock); 590 591 if (vd->reset_ldc) 592 PR0("taking down LDC channel"); 593 if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0)) 594 PR0("ldc_down() returned errno %d", status); 595 596 vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING); 597 vd->state = VD_STATE_INIT; 598 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 599 600 /* Allocate the staging buffer */ 601 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 602 603 PR0("calling ldc_up\n"); 604 (void) ldc_up(vd->ldc_handle); 605 606 vd->reset_state = B_FALSE; 607 vd->reset_ldc = B_FALSE; 608 609 mutex_exit(&vd->lock); 610 } 611 612 static void vd_recv_msg(void *arg); 613 614 static void 615 vd_mark_in_reset(vd_t *vd) 616 { 617 int status; 618 619 PR0("vd_mark_in_reset: marking vd in reset\n"); 620 621 vd_need_reset(vd, B_FALSE); 622 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP); 623 if (status == DDI_FAILURE) { 624 PR0("cannot schedule task to recv msg\n"); 625 vd_need_reset(vd, B_TRUE); 626 return; 627 } 628 } 629 630 static int 631 vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes) 632 { 633 boolean_t accepted; 634 int status; 635 vd_dring_entry_t *elem = VD_DRING_ELEM(idx); 636 637 if (vd->reset_state) 638 return (0); 639 640 /* Acquire the element */ 641 if (!vd->reset_state && 642 (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { 643 if (status == ECONNRESET) { 644 vd_mark_in_reset(vd); 645 return (0); 646 } else { 647 PR0("ldc_mem_dring_acquire() returned errno %d", 648 status); 649 return (status); 650 } 651 } 652 653 /* Set the element's status and mark it done */ 654 accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED); 655 if (accepted) { 656 elem->payload.nbytes = elem_nbytes; 657 elem->payload.status = elem_status; 658 elem->hdr.dstate = VIO_DESC_DONE; 659 } else { 660 /* Perhaps client timed out waiting for I/O... 
*/ 661 PR0("element %u no longer \"accepted\"", idx); 662 VD_DUMP_DRING_ELEM(elem); 663 } 664 /* Release the element */ 665 if (!vd->reset_state && 666 (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 667 if (status == ECONNRESET) { 668 vd_mark_in_reset(vd); 669 return (0); 670 } else { 671 PR0("ldc_mem_dring_release() returned errno %d", 672 status); 673 return (status); 674 } 675 } 676 677 return (accepted ? 0 : EINVAL); 678 } 679 680 static void 681 vd_complete_bio(void *arg) 682 { 683 int status = 0; 684 vd_task_t *task = (vd_task_t *)arg; 685 vd_t *vd = task->vd; 686 vd_dring_payload_t *request = task->request; 687 struct buf *buf = &task->buf; 688 689 690 ASSERT(vd != NULL); 691 ASSERT(request != NULL); 692 ASSERT(task->msg != NULL); 693 ASSERT(task->msglen >= sizeof (*task->msg)); 694 ASSERT(!vd->file); 695 696 /* Wait for the I/O to complete */ 697 request->status = biowait(buf); 698 699 /* return back the number of bytes read/written */ 700 request->nbytes = buf->b_bcount - buf->b_resid; 701 702 /* Release the buffer */ 703 if (!vd->reset_state) 704 status = ldc_mem_release(task->mhdl, 0, buf->b_bcount); 705 if (status) { 706 PR0("ldc_mem_release() returned errno %d copying to " 707 "client", status); 708 if (status == ECONNRESET) { 709 vd_mark_in_reset(vd); 710 } 711 } 712 713 /* Unmap the memory, even if in reset */ 714 status = ldc_mem_unmap(task->mhdl); 715 if (status) { 716 PR0("ldc_mem_unmap() returned errno %d copying to client", 717 status); 718 if (status == ECONNRESET) { 719 vd_mark_in_reset(vd); 720 } 721 } 722 723 biofini(buf); 724 725 /* Update the dring element for a dring client */ 726 if (!vd->reset_state && (status == 0) && 727 (vd->xfer_mode == VIO_DRING_MODE)) { 728 status = vd_mark_elem_done(vd, task->index, 729 request->status, request->nbytes); 730 if (status == ECONNRESET) 731 vd_mark_in_reset(vd); 732 } 733 734 /* 735 * If a transport error occurred, arrange to "nack" the message when 736 * the final task in the descriptor element range completes 737 */ 738 if (status != 0) 739 task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 740 741 /* 742 * Only the final task for a range of elements will respond to and 743 * free the message 744 */ 745 if (task->type == VD_NONFINAL_RANGE_TASK) { 746 return; 747 } 748 749 /* 750 * Send the "ack" or "nack" back to the client; if sending the message 751 * via LDC fails, arrange to reset both the connection state and LDC 752 * itself 753 */ 754 PR1("Sending %s", 755 (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? 
"ACK" : "NACK"); 756 if (!vd->reset_state) { 757 status = send_msg(vd->ldc_handle, task->msg, task->msglen); 758 switch (status) { 759 case 0: 760 break; 761 case ECONNRESET: 762 vd_mark_in_reset(vd); 763 break; 764 default: 765 PR0("initiating full reset"); 766 vd_need_reset(vd, B_TRUE); 767 break; 768 } 769 } 770 } 771 772 static void 773 vd_geom2dk_geom(void *vd_buf, void *ioctl_arg) 774 { 775 VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg); 776 } 777 778 static void 779 vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg) 780 { 781 VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg); 782 } 783 784 static void 785 dk_geom2vd_geom(void *ioctl_arg, void *vd_buf) 786 { 787 DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf); 788 } 789 790 static void 791 vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf) 792 { 793 VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf); 794 } 795 796 static void 797 vd_get_efi_in(void *vd_buf, void *ioctl_arg) 798 { 799 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 800 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 801 802 dk_efi->dki_lba = vd_efi->lba; 803 dk_efi->dki_length = vd_efi->length; 804 dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP); 805 } 806 807 static void 808 vd_get_efi_out(void *ioctl_arg, void *vd_buf) 809 { 810 int len; 811 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 812 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 813 814 len = vd_efi->length; 815 DK_EFI2VD_EFI(dk_efi, vd_efi); 816 kmem_free(dk_efi->dki_data, len); 817 } 818 819 static void 820 vd_set_efi_in(void *vd_buf, void *ioctl_arg) 821 { 822 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 823 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 824 825 dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP); 826 VD_EFI2DK_EFI(vd_efi, dk_efi); 827 } 828 829 static void 830 vd_set_efi_out(void *ioctl_arg, void *vd_buf) 831 { 832 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 833 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 834 835 kmem_free(dk_efi->dki_data, vd_efi->length); 836 } 837 838 static int 839 vd_read_vtoc(ldi_handle_t handle, struct vtoc *vtoc, vd_disk_label_t *label) 840 { 841 int status, rval; 842 struct dk_gpt *efi; 843 size_t efi_len; 844 845 *label = VD_DISK_LABEL_UNK; 846 847 status = ldi_ioctl(handle, DKIOCGVTOC, (intptr_t)vtoc, 848 (vd_open_flags | FKIOCTL), kcred, &rval); 849 850 if (status == 0) { 851 *label = VD_DISK_LABEL_VTOC; 852 return (0); 853 } else if (status != ENOTSUP) { 854 PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status); 855 return (status); 856 } 857 858 status = vds_efi_alloc_and_read(handle, &efi, &efi_len); 859 860 if (status) { 861 PR0("vds_efi_alloc_and_read returned error %d", status); 862 return (status); 863 } 864 865 *label = VD_DISK_LABEL_EFI; 866 vd_efi_to_vtoc(efi, vtoc); 867 vd_efi_free(efi, efi_len); 868 869 return (0); 870 } 871 872 static short 873 vd_lbl2cksum(struct dk_label *label) 874 { 875 int count; 876 short sum, *sp; 877 878 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 879 sp = (short *)label; 880 sum = 0; 881 while (count--) { 882 sum ^= *sp++; 883 } 884 885 return (sum); 886 } 887 888 static int 889 vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg) 890 { 891 dk_efi_t *dk_ioc; 892 struct dk_label *label; 893 int i; 894 895 switch (vd->vdisk_label) { 896 897 case VD_DISK_LABEL_VTOC: 898 899 switch (cmd) { 900 case DKIOCGGEOM: 901 ASSERT(ioctl_arg != NULL); 902 bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom)); 903 return (0); 904 case DKIOCGVTOC: 905 ASSERT(ioctl_arg != NULL); 906 bcopy(&vd->vtoc, ioctl_arg, 
sizeof (vd->vtoc)); 907 return (0); 908 case DKIOCSVTOC: 909 if (!vd->file) 910 return (ENOTSUP); 911 ASSERT(ioctl_arg != NULL); 912 bcopy(ioctl_arg, &vd->vtoc, sizeof (vd->vtoc)); 913 /* write new VTOC to file */ 914 label = (struct dk_label *)vd->file_maddr; 915 label->dkl_vtoc.v_nparts = vd->vtoc.v_nparts; 916 label->dkl_vtoc.v_sanity = vd->vtoc.v_sanity; 917 label->dkl_vtoc.v_version = vd->vtoc.v_version; 918 bcopy(vd->vtoc.v_volume, label->dkl_vtoc.v_volume, 919 LEN_DKL_VVOL); 920 for (i = 0; i < vd->vtoc.v_nparts; i++) { 921 label->dkl_vtoc.v_timestamp[i] = 922 vd->vtoc.timestamp[i]; 923 label->dkl_vtoc.v_part[i].p_tag = 924 vd->vtoc.v_part[i].p_tag; 925 label->dkl_vtoc.v_part[i].p_flag = 926 vd->vtoc.v_part[i].p_flag; 927 label->dkl_map[i].dkl_cylno = 928 vd->vtoc.v_part[i].p_start / 929 (label->dkl_nhead * label->dkl_nsect); 930 label->dkl_map[i].dkl_nblk = 931 vd->vtoc.v_part[i].p_size; 932 } 933 934 /* re-compute checksum */ 935 label->dkl_cksum = vd_lbl2cksum(label); 936 937 return (0); 938 default: 939 return (ENOTSUP); 940 } 941 942 case VD_DISK_LABEL_EFI: 943 944 switch (cmd) { 945 case DKIOCGETEFI: 946 ASSERT(ioctl_arg != NULL); 947 dk_ioc = (dk_efi_t *)ioctl_arg; 948 if (dk_ioc->dki_length < vd->dk_efi.dki_length) 949 return (EINVAL); 950 bcopy(vd->dk_efi.dki_data, dk_ioc->dki_data, 951 vd->dk_efi.dki_length); 952 return (0); 953 default: 954 return (ENOTSUP); 955 } 956 957 default: 958 return (ENOTSUP); 959 } 960 } 961 962 static int 963 vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl) 964 { 965 int rval = 0, status; 966 size_t nbytes = request->nbytes; /* modifiable copy */ 967 968 969 ASSERT(request->slice < vd->nslices); 970 PR0("Performing %s", ioctl->operation_name); 971 972 /* Get data from client and convert, if necessary */ 973 if (ioctl->copyin != NULL) { 974 ASSERT(nbytes != 0 && buf != NULL); 975 PR1("Getting \"arg\" data from client"); 976 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 977 request->cookie, request->ncookies, 978 LDC_COPY_IN)) != 0) { 979 PR0("ldc_mem_copy() returned errno %d " 980 "copying from client", status); 981 return (status); 982 } 983 984 /* Convert client's data, if necessary */ 985 if (ioctl->copyin == VD_IDENTITY) /* use client buffer */ 986 ioctl->arg = buf; 987 else /* convert client vdisk operation data to ioctl data */ 988 (ioctl->copyin)(buf, (void *)ioctl->arg); 989 } 990 991 /* 992 * Handle single-slice block devices internally; otherwise, have the 993 * real driver perform the ioctl() 994 */ 995 if (vd->file || (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo)) { 996 if ((status = vd_do_slice_ioctl(vd, ioctl->cmd, 997 (void *)ioctl->arg)) != 0) 998 return (status); 999 } else if ((status = ldi_ioctl(vd->ldi_handle[request->slice], 1000 ioctl->cmd, (intptr_t)ioctl->arg, (vd_open_flags | FKIOCTL), 1001 kcred, &rval)) != 0) { 1002 PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status); 1003 return (status); 1004 } 1005 #ifdef DEBUG 1006 if (rval != 0) { 1007 PR0("%s set rval = %d, which is not being returned to client", 1008 ioctl->cmd_name, rval); 1009 } 1010 #endif /* DEBUG */ 1011 1012 /* Convert data and send to client, if necessary */ 1013 if (ioctl->copyout != NULL) { 1014 ASSERT(nbytes != 0 && buf != NULL); 1015 PR1("Sending \"arg\" data to client"); 1016 1017 /* Convert ioctl data to vdisk operation data, if necessary */ 1018 if (ioctl->copyout != VD_IDENTITY) 1019 (ioctl->copyout)((void *)ioctl->arg, buf); 1020 1021 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, 
&nbytes, 1022 request->cookie, request->ncookies, 1023 LDC_COPY_OUT)) != 0) { 1024 PR0("ldc_mem_copy() returned errno %d " 1025 "copying to client", status); 1026 return (status); 1027 } 1028 } 1029 1030 return (status); 1031 } 1032 1033 #define RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t)) 1034 static int 1035 vd_ioctl(vd_task_t *task) 1036 { 1037 int i, status, rc; 1038 void *buf = NULL; 1039 struct dk_geom dk_geom = {0}; 1040 struct vtoc vtoc = {0}; 1041 struct dk_efi dk_efi = {0}; 1042 vd_t *vd = task->vd; 1043 vd_dring_payload_t *request = task->request; 1044 vd_ioctl_t ioctl[] = { 1045 /* Command (no-copy) operations */ 1046 {VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0, 1047 DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE), 1048 NULL, NULL, NULL}, 1049 1050 /* "Get" (copy-out) operations */ 1051 {VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int), 1052 DKIOCGETWCE, STRINGIZE(DKIOCGETWCE), 1053 NULL, VD_IDENTITY, VD_IDENTITY}, 1054 {VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM), 1055 RNDSIZE(vd_geom_t), 1056 DKIOCGGEOM, STRINGIZE(DKIOCGGEOM), 1057 &dk_geom, NULL, dk_geom2vd_geom}, 1058 {VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t), 1059 DKIOCGVTOC, STRINGIZE(DKIOCGVTOC), 1060 &vtoc, NULL, vtoc2vd_vtoc}, 1061 {VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t), 1062 DKIOCGETEFI, STRINGIZE(DKIOCGETEFI), 1063 &dk_efi, vd_get_efi_in, vd_get_efi_out}, 1064 1065 /* "Set" (copy-in) operations */ 1066 {VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int), 1067 DKIOCSETWCE, STRINGIZE(DKIOCSETWCE), 1068 NULL, VD_IDENTITY, VD_IDENTITY}, 1069 {VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM), 1070 RNDSIZE(vd_geom_t), 1071 DKIOCSGEOM, STRINGIZE(DKIOCSGEOM), 1072 &dk_geom, vd_geom2dk_geom, NULL}, 1073 {VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t), 1074 DKIOCSVTOC, STRINGIZE(DKIOCSVTOC), 1075 &vtoc, vd_vtoc2vtoc, NULL}, 1076 {VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t), 1077 DKIOCSETEFI, STRINGIZE(DKIOCSETEFI), 1078 &dk_efi, vd_set_efi_in, vd_set_efi_out}, 1079 }; 1080 size_t nioctls = (sizeof (ioctl))/(sizeof (ioctl[0])); 1081 1082 1083 ASSERT(vd != NULL); 1084 ASSERT(request != NULL); 1085 ASSERT(request->slice < vd->nslices); 1086 1087 /* 1088 * Determine ioctl corresponding to caller's "operation" and 1089 * validate caller's "nbytes" 1090 */ 1091 for (i = 0; i < nioctls; i++) { 1092 if (request->operation == ioctl[i].operation) { 1093 /* LDC memory operations require 8-byte multiples */ 1094 ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0); 1095 1096 if (request->operation == VD_OP_GET_EFI || 1097 request->operation == VD_OP_SET_EFI) { 1098 if (request->nbytes >= ioctl[i].nbytes) 1099 break; 1100 PR0("%s: Expected at least nbytes = %lu, " 1101 "got %lu", ioctl[i].operation_name, 1102 ioctl[i].nbytes, request->nbytes); 1103 return (EINVAL); 1104 } 1105 1106 if (request->nbytes != ioctl[i].nbytes) { 1107 PR0("%s: Expected nbytes = %lu, got %lu", 1108 ioctl[i].operation_name, ioctl[i].nbytes, 1109 request->nbytes); 1110 return (EINVAL); 1111 } 1112 1113 break; 1114 } 1115 } 1116 ASSERT(i < nioctls); /* because "operation" already validated */ 1117 1118 if (request->nbytes) 1119 buf = kmem_zalloc(request->nbytes, KM_SLEEP); 1120 status = vd_do_ioctl(vd, request, buf, &ioctl[i]); 1121 if (request->nbytes) 1122 kmem_free(buf, request->nbytes); 1123 if (!vd->file && vd->vdisk_type == VD_DISK_TYPE_DISK && 1124 (request->operation == VD_OP_SET_VTOC || 1125 request->operation == VD_OP_SET_EFI)) { 1126 /* update disk information 
*/ 1127 rc = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc, 1128 &vd->vdisk_label); 1129 if (rc != 0) 1130 PR0("vd_read_vtoc return error %d", rc); 1131 } 1132 PR0("Returning %d", status); 1133 return (status); 1134 } 1135 1136 static int 1137 vd_get_devid(vd_task_t *task) 1138 { 1139 vd_t *vd = task->vd; 1140 vd_dring_payload_t *request = task->request; 1141 vd_devid_t *vd_devid; 1142 impl_devid_t *devid; 1143 int status, bufid_len, devid_len, len; 1144 int bufbytes; 1145 1146 PR1("Get Device ID, nbytes=%ld", request->nbytes); 1147 1148 if (vd->file) { 1149 /* no devid for disk on file */ 1150 return (ENOENT); 1151 } 1152 1153 if (ddi_lyr_get_devid(vd->dev[request->slice], 1154 (ddi_devid_t *)&devid) != DDI_SUCCESS) { 1155 /* the most common failure is that no devid is available */ 1156 PR2("No Device ID"); 1157 return (ENOENT); 1158 } 1159 1160 bufid_len = request->nbytes - sizeof (vd_devid_t) + 1; 1161 devid_len = DEVID_GETLEN(devid); 1162 1163 /* 1164 * Save the buffer size here for use in deallocation. 1165 * The actual number of bytes copied is returned in 1166 * the 'nbytes' field of the request structure. 1167 */ 1168 bufbytes = request->nbytes; 1169 1170 vd_devid = kmem_zalloc(bufbytes, KM_SLEEP); 1171 vd_devid->length = devid_len; 1172 vd_devid->type = DEVID_GETTYPE(devid); 1173 1174 len = (devid_len > bufid_len)? bufid_len : devid_len; 1175 1176 bcopy(devid->did_id, vd_devid->id, len); 1177 1178 /* LDC memory operations require 8-byte multiples */ 1179 ASSERT(request->nbytes % sizeof (uint64_t) == 0); 1180 1181 if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0, 1182 &request->nbytes, request->cookie, request->ncookies, 1183 LDC_COPY_OUT)) != 0) { 1184 PR0("ldc_mem_copy() returned errno %d copying to client", 1185 status); 1186 } 1187 PR1("post mem_copy: nbytes=%ld", request->nbytes); 1188 1189 kmem_free(vd_devid, bufbytes); 1190 ddi_devid_free((ddi_devid_t)devid); 1191 1192 return (status); 1193 } 1194 1195 /* 1196 * Define the supported operations once the functions for performing them have 1197 * been defined 1198 */ 1199 static const vds_operation_t vds_operation[] = { 1200 #define X(_s) #_s, _s 1201 {X(VD_OP_BREAD), vd_start_bio, vd_complete_bio}, 1202 {X(VD_OP_BWRITE), vd_start_bio, vd_complete_bio}, 1203 {X(VD_OP_FLUSH), vd_ioctl, NULL}, 1204 {X(VD_OP_GET_WCE), vd_ioctl, NULL}, 1205 {X(VD_OP_SET_WCE), vd_ioctl, NULL}, 1206 {X(VD_OP_GET_VTOC), vd_ioctl, NULL}, 1207 {X(VD_OP_SET_VTOC), vd_ioctl, NULL}, 1208 {X(VD_OP_GET_DISKGEOM), vd_ioctl, NULL}, 1209 {X(VD_OP_SET_DISKGEOM), vd_ioctl, NULL}, 1210 {X(VD_OP_GET_EFI), vd_ioctl, NULL}, 1211 {X(VD_OP_SET_EFI), vd_ioctl, NULL}, 1212 {X(VD_OP_GET_DEVID), vd_get_devid, NULL}, 1213 #undef X 1214 }; 1215 1216 static const size_t vds_noperations = 1217 (sizeof (vds_operation))/(sizeof (vds_operation[0])); 1218 1219 /* 1220 * Process a task specifying a client I/O request 1221 */ 1222 static int 1223 vd_process_task(vd_task_t *task) 1224 { 1225 int i, status; 1226 vd_t *vd = task->vd; 1227 vd_dring_payload_t *request = task->request; 1228 1229 1230 ASSERT(vd != NULL); 1231 ASSERT(request != NULL); 1232 1233 /* Find the requested operation */ 1234 for (i = 0; i < vds_noperations; i++) 1235 if (request->operation == vds_operation[i].operation) 1236 break; 1237 if (i == vds_noperations) { 1238 PR0("Unsupported operation %u", request->operation); 1239 return (ENOTSUP); 1240 } 1241 1242 /* Handle client using absolute disk offsets */ 1243 if ((vd->vdisk_type == VD_DISK_TYPE_DISK) && 1244 (request->slice == UINT8_MAX)) 1245 
request->slice = VD_ENTIRE_DISK_SLICE; 1246 1247 /* Range-check slice */ 1248 if (request->slice >= vd->nslices) { 1249 PR0("Invalid \"slice\" %u (max %u) for virtual disk", 1250 request->slice, (vd->nslices - 1)); 1251 return (EINVAL); 1252 } 1253 1254 PR1("operation : %s", vds_operation[i].namep); 1255 1256 /* Start the operation */ 1257 if ((status = vds_operation[i].start(task)) != EINPROGRESS) { 1258 PR0("operation : %s returned status %d", 1259 vds_operation[i].namep, status); 1260 request->status = status; /* op succeeded or failed */ 1261 return (0); /* but request completed */ 1262 } 1263 1264 ASSERT(vds_operation[i].complete != NULL); /* debug case */ 1265 if (vds_operation[i].complete == NULL) { /* non-debug case */ 1266 PR0("Unexpected return of EINPROGRESS " 1267 "with no I/O completion handler"); 1268 request->status = EIO; /* operation failed */ 1269 return (0); /* but request completed */ 1270 } 1271 1272 PR1("operation : kick off taskq entry for %s", vds_operation[i].namep); 1273 1274 /* Queue a task to complete the operation */ 1275 status = ddi_taskq_dispatch(vd->completionq, vds_operation[i].complete, 1276 task, DDI_SLEEP); 1277 /* ddi_taskq_dispatch(9f) guarantees success with DDI_SLEEP */ 1278 ASSERT(status == DDI_SUCCESS); 1279 1280 PR1("Operation in progress"); 1281 return (EINPROGRESS); /* completion handler will finish request */ 1282 } 1283 1284 /* 1285 * Return true if the "type", "subtype", and "env" fields of the "tag" first 1286 * argument match the corresponding remaining arguments; otherwise, return false 1287 */ 1288 boolean_t 1289 vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env) 1290 { 1291 return ((tag->vio_msgtype == type) && 1292 (tag->vio_subtype == subtype) && 1293 (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE; 1294 } 1295 1296 /* 1297 * Check whether the major/minor version specified in "ver_msg" is supported 1298 * by this server. 
1299 */ 1300 static boolean_t 1301 vds_supported_version(vio_ver_msg_t *ver_msg) 1302 { 1303 for (int i = 0; i < vds_num_versions; i++) { 1304 ASSERT(vds_version[i].major > 0); 1305 ASSERT((i == 0) || 1306 (vds_version[i].major < vds_version[i-1].major)); 1307 1308 /* 1309 * If the major versions match, adjust the minor version, if 1310 * necessary, down to the highest value supported by this 1311 * server and return true so this message will get "ack"ed; 1312 * the client should also support all minor versions lower 1313 * than the value it sent 1314 */ 1315 if (ver_msg->ver_major == vds_version[i].major) { 1316 if (ver_msg->ver_minor > vds_version[i].minor) { 1317 PR0("Adjusting minor version from %u to %u", 1318 ver_msg->ver_minor, vds_version[i].minor); 1319 ver_msg->ver_minor = vds_version[i].minor; 1320 } 1321 return (B_TRUE); 1322 } 1323 1324 /* 1325 * If the message contains a higher major version number, set 1326 * the message's major/minor versions to the current values 1327 * and return false, so this message will get "nack"ed with 1328 * these values, and the client will potentially try again 1329 * with the same or a lower version 1330 */ 1331 if (ver_msg->ver_major > vds_version[i].major) { 1332 ver_msg->ver_major = vds_version[i].major; 1333 ver_msg->ver_minor = vds_version[i].minor; 1334 return (B_FALSE); 1335 } 1336 1337 /* 1338 * Otherwise, the message's major version is less than the 1339 * current major version, so continue the loop to the next 1340 * (lower) supported version 1341 */ 1342 } 1343 1344 /* 1345 * No common version was found; "ground" the version pair in the 1346 * message to terminate negotiation 1347 */ 1348 ver_msg->ver_major = 0; 1349 ver_msg->ver_minor = 0; 1350 return (B_FALSE); 1351 } 1352 1353 /* 1354 * Process a version message from a client. vds expects to receive version 1355 * messages from clients seeking service, but never issues version messages 1356 * itself; therefore, vds can ACK or NACK client version messages, but does 1357 * not expect to receive version-message ACKs or NACKs (and will treat such 1358 * messages as invalid). 1359 */ 1360 static int 1361 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1362 { 1363 vio_ver_msg_t *ver_msg = (vio_ver_msg_t *)msg; 1364 1365 1366 ASSERT(msglen >= sizeof (msg->tag)); 1367 1368 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1369 VIO_VER_INFO)) { 1370 return (ENOMSG); /* not a version message */ 1371 } 1372 1373 if (msglen != sizeof (*ver_msg)) { 1374 PR0("Expected %lu-byte version message; " 1375 "received %lu bytes", sizeof (*ver_msg), msglen); 1376 return (EBADMSG); 1377 } 1378 1379 if (ver_msg->dev_class != VDEV_DISK) { 1380 PR0("Expected device class %u (disk); received %u", 1381 VDEV_DISK, ver_msg->dev_class); 1382 return (EBADMSG); 1383 } 1384 1385 /* 1386 * We're talking to the expected kind of client; set our device class 1387 * for "ack/nack" back to the client 1388 */ 1389 ver_msg->dev_class = VDEV_DISK_SERVER; 1390 1391 /* 1392 * Check whether the (valid) version message specifies a version 1393 * supported by this server. 
If the version is not supported, return 1394 * EBADMSG so the message will get "nack"ed; vds_supported_version() 1395 * will have updated the message with a supported version for the 1396 * client to consider 1397 */ 1398 if (!vds_supported_version(ver_msg)) 1399 return (EBADMSG); 1400 1401 1402 /* 1403 * A version has been agreed upon; use the client's SID for 1404 * communication on this channel now 1405 */ 1406 ASSERT(!(vd->initialized & VD_SID)); 1407 vd->sid = ver_msg->tag.vio_sid; 1408 vd->initialized |= VD_SID; 1409 1410 /* 1411 * When multiple versions are supported, this function should store 1412 * the negotiated major and minor version values in the "vd" data 1413 * structure to govern further communication; in particular, note that 1414 * the client might have specified a lower minor version for the 1415 * agreed major version than specifed in the vds_version[] array. The 1416 * following assertions should help remind future maintainers to make 1417 * the appropriate changes to support multiple versions. 1418 */ 1419 ASSERT(vds_num_versions == 1); 1420 ASSERT(ver_msg->ver_major == vds_version[0].major); 1421 ASSERT(ver_msg->ver_minor == vds_version[0].minor); 1422 1423 PR0("Using major version %u, minor version %u", 1424 ver_msg->ver_major, ver_msg->ver_minor); 1425 return (0); 1426 } 1427 1428 static int 1429 vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1430 { 1431 vd_attr_msg_t *attr_msg = (vd_attr_msg_t *)msg; 1432 int status, retry = 0; 1433 1434 1435 ASSERT(msglen >= sizeof (msg->tag)); 1436 1437 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1438 VIO_ATTR_INFO)) { 1439 PR0("Message is not an attribute message"); 1440 return (ENOMSG); 1441 } 1442 1443 if (msglen != sizeof (*attr_msg)) { 1444 PR0("Expected %lu-byte attribute message; " 1445 "received %lu bytes", sizeof (*attr_msg), msglen); 1446 return (EBADMSG); 1447 } 1448 1449 if (attr_msg->max_xfer_sz == 0) { 1450 PR0("Received maximum transfer size of 0 from client"); 1451 return (EBADMSG); 1452 } 1453 1454 if ((attr_msg->xfer_mode != VIO_DESC_MODE) && 1455 (attr_msg->xfer_mode != VIO_DRING_MODE)) { 1456 PR0("Client requested unsupported transfer mode"); 1457 return (EBADMSG); 1458 } 1459 1460 /* 1461 * check if the underlying disk is ready, if not try accessing 1462 * the device again. Open the vdisk device and extract info 1463 * about it, as this is needed to respond to the attr info msg 1464 */ 1465 if ((vd->initialized & VD_DISK_READY) == 0) { 1466 PR0("Retry setting up disk (%s)", vd->device_path); 1467 do { 1468 status = vd_setup_vd(vd); 1469 if (status != EAGAIN || ++retry > vds_dev_retries) 1470 break; 1471 1472 /* incremental delay */ 1473 delay(drv_usectohz(vds_dev_delay)); 1474 1475 /* if vdisk is no longer enabled - return error */ 1476 if (!vd_enabled(vd)) 1477 return (ENXIO); 1478 1479 } while (status == EAGAIN); 1480 1481 if (status) 1482 return (ENXIO); 1483 1484 vd->initialized |= VD_DISK_READY; 1485 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); 1486 PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u", 1487 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), 1488 (vd->pseudo ? "yes" : "no"), 1489 (vd->file ? 
"yes" : "no"), 1490 vd->nslices); 1491 } 1492 1493 /* Success: valid message and transfer mode */ 1494 vd->xfer_mode = attr_msg->xfer_mode; 1495 1496 if (vd->xfer_mode == VIO_DESC_MODE) { 1497 1498 /* 1499 * The vd_dring_inband_msg_t contains one cookie; need room 1500 * for up to n-1 more cookies, where "n" is the number of full 1501 * pages plus possibly one partial page required to cover 1502 * "max_xfer_sz". Add room for one more cookie if 1503 * "max_xfer_sz" isn't an integral multiple of the page size. 1504 * Must first get the maximum transfer size in bytes. 1505 */ 1506 size_t max_xfer_bytes = attr_msg->vdisk_block_size ? 1507 attr_msg->vdisk_block_size*attr_msg->max_xfer_sz : 1508 attr_msg->max_xfer_sz; 1509 size_t max_inband_msglen = 1510 sizeof (vd_dring_inband_msg_t) + 1511 ((max_xfer_bytes/PAGESIZE + 1512 ((max_xfer_bytes % PAGESIZE) ? 1 : 0))* 1513 (sizeof (ldc_mem_cookie_t))); 1514 1515 /* 1516 * Set the maximum expected message length to 1517 * accommodate in-band-descriptor messages with all 1518 * their cookies 1519 */ 1520 vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen); 1521 1522 /* 1523 * Initialize the data structure for processing in-band I/O 1524 * request descriptors 1525 */ 1526 vd->inband_task.vd = vd; 1527 vd->inband_task.msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 1528 vd->inband_task.index = 0; 1529 vd->inband_task.type = VD_FINAL_RANGE_TASK; /* range == 1 */ 1530 } 1531 1532 /* Return the device's block size and max transfer size to the client */ 1533 attr_msg->vdisk_block_size = DEV_BSIZE; 1534 attr_msg->max_xfer_sz = vd->max_xfer_sz; 1535 1536 attr_msg->vdisk_size = vd->vdisk_size; 1537 attr_msg->vdisk_type = vd->vdisk_type; 1538 attr_msg->operations = vds_operations; 1539 PR0("%s", VD_CLIENT(vd)); 1540 1541 ASSERT(vd->dring_task == NULL); 1542 1543 return (0); 1544 } 1545 1546 static int 1547 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1548 { 1549 int status; 1550 size_t expected; 1551 ldc_mem_info_t dring_minfo; 1552 vio_dring_reg_msg_t *reg_msg = (vio_dring_reg_msg_t *)msg; 1553 1554 1555 ASSERT(msglen >= sizeof (msg->tag)); 1556 1557 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1558 VIO_DRING_REG)) { 1559 PR0("Message is not a register-dring message"); 1560 return (ENOMSG); 1561 } 1562 1563 if (msglen < sizeof (*reg_msg)) { 1564 PR0("Expected at least %lu-byte register-dring message; " 1565 "received %lu bytes", sizeof (*reg_msg), msglen); 1566 return (EBADMSG); 1567 } 1568 1569 expected = sizeof (*reg_msg) + 1570 (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0])); 1571 if (msglen != expected) { 1572 PR0("Expected %lu-byte register-dring message; " 1573 "received %lu bytes", expected, msglen); 1574 return (EBADMSG); 1575 } 1576 1577 if (vd->initialized & VD_DRING) { 1578 PR0("A dring was previously registered; only support one"); 1579 return (EBADMSG); 1580 } 1581 1582 if (reg_msg->num_descriptors > INT32_MAX) { 1583 PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)", 1584 reg_msg->ncookies, INT32_MAX, STRINGIZE(INT32_MAX)); 1585 return (EBADMSG); 1586 } 1587 1588 if (reg_msg->ncookies != 1) { 1589 /* 1590 * In addition to fixing the assertion in the success case 1591 * below, supporting drings which require more than one 1592 * "cookie" requires increasing the value of vd->max_msglen 1593 * somewhere in the code path prior to receiving the message 1594 * which results in calling this function. 
Note that without 1595 * making this change, the larger message size required to 1596 * accommodate multiple cookies cannot be successfully 1597 * received, so this function will not even get called. 1598 * Gracefully accommodating more dring cookies might 1599 * reasonably demand exchanging an additional attribute or 1600 * making a minor protocol adjustment 1601 */ 1602 PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies); 1603 return (EBADMSG); 1604 } 1605 1606 status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie, 1607 reg_msg->ncookies, reg_msg->num_descriptors, 1608 reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle); 1609 if (status != 0) { 1610 PR0("ldc_mem_dring_map() returned errno %d", status); 1611 return (status); 1612 } 1613 1614 /* 1615 * To remove the need for this assertion, must call 1616 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a 1617 * successful call to ldc_mem_dring_map() 1618 */ 1619 ASSERT(reg_msg->ncookies == 1); 1620 1621 if ((status = 1622 ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) { 1623 PR0("ldc_mem_dring_info() returned errno %d", status); 1624 if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0) 1625 PR0("ldc_mem_dring_unmap() returned errno %d", status); 1626 return (status); 1627 } 1628 1629 if (dring_minfo.vaddr == NULL) { 1630 PR0("Descriptor ring virtual address is NULL"); 1631 return (ENXIO); 1632 } 1633 1634 1635 /* Initialize for valid message and mapped dring */ 1636 PR1("descriptor size = %u, dring length = %u", 1637 vd->descriptor_size, vd->dring_len); 1638 vd->initialized |= VD_DRING; 1639 vd->dring_ident = 1; /* "There Can Be Only One" */ 1640 vd->dring = dring_minfo.vaddr; 1641 vd->descriptor_size = reg_msg->descriptor_size; 1642 vd->dring_len = reg_msg->num_descriptors; 1643 reg_msg->dring_ident = vd->dring_ident; 1644 1645 /* 1646 * Allocate and initialize a "shadow" array of data structures for 1647 * tasks to process I/O requests in dring elements 1648 */ 1649 vd->dring_task = 1650 kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP); 1651 for (int i = 0; i < vd->dring_len; i++) { 1652 vd->dring_task[i].vd = vd; 1653 vd->dring_task[i].index = i; 1654 vd->dring_task[i].request = &VD_DRING_ELEM(i)->payload; 1655 1656 status = ldc_mem_alloc_handle(vd->ldc_handle, 1657 &(vd->dring_task[i].mhdl)); 1658 if (status) { 1659 PR0("ldc_mem_alloc_handle() returned err %d ", status); 1660 return (ENXIO); 1661 } 1662 1663 vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 1664 } 1665 1666 return (0); 1667 } 1668 1669 static int 1670 vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1671 { 1672 vio_dring_unreg_msg_t *unreg_msg = (vio_dring_unreg_msg_t *)msg; 1673 1674 1675 ASSERT(msglen >= sizeof (msg->tag)); 1676 1677 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1678 VIO_DRING_UNREG)) { 1679 PR0("Message is not an unregister-dring message"); 1680 return (ENOMSG); 1681 } 1682 1683 if (msglen != sizeof (*unreg_msg)) { 1684 PR0("Expected %lu-byte unregister-dring message; " 1685 "received %lu bytes", sizeof (*unreg_msg), msglen); 1686 return (EBADMSG); 1687 } 1688 1689 if (unreg_msg->dring_ident != vd->dring_ident) { 1690 PR0("Expected dring ident %lu; received %lu", 1691 vd->dring_ident, unreg_msg->dring_ident); 1692 return (EBADMSG); 1693 } 1694 1695 return (0); 1696 } 1697 1698 static int 1699 process_rdx_msg(vio_msg_t *msg, size_t msglen) 1700 { 1701 ASSERT(msglen >= sizeof (msg->tag)); 1702 1703 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, 
VIO_SUBTYPE_INFO, VIO_RDX)) { 1704 PR0("Message is not an RDX message"); 1705 return (ENOMSG); 1706 } 1707 1708 if (msglen != sizeof (vio_rdx_msg_t)) { 1709 PR0("Expected %lu-byte RDX message; received %lu bytes", 1710 sizeof (vio_rdx_msg_t), msglen); 1711 return (EBADMSG); 1712 } 1713 1714 PR0("Valid RDX message"); 1715 return (0); 1716 } 1717 1718 static int 1719 vd_check_seq_num(vd_t *vd, uint64_t seq_num) 1720 { 1721 if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) { 1722 PR0("Received seq_num %lu; expected %lu", 1723 seq_num, (vd->seq_num + 1)); 1724 PR0("initiating soft reset"); 1725 vd_need_reset(vd, B_FALSE); 1726 return (1); 1727 } 1728 1729 vd->seq_num = seq_num; 1730 vd->initialized |= VD_SEQ_NUM; /* superfluous after first time... */ 1731 return (0); 1732 } 1733 1734 /* 1735 * Return the expected size of an inband-descriptor message with all the 1736 * cookies it claims to include 1737 */ 1738 static size_t 1739 expected_inband_size(vd_dring_inband_msg_t *msg) 1740 { 1741 return ((sizeof (*msg)) + 1742 (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0]))); 1743 } 1744 1745 /* 1746 * Process an in-band descriptor message: used with clients like OBP, with 1747 * which vds exchanges descriptors within VIO message payloads, rather than 1748 * operating on them within a descriptor ring 1749 */ 1750 static int 1751 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1752 { 1753 size_t expected; 1754 vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg; 1755 1756 1757 ASSERT(msglen >= sizeof (msg->tag)); 1758 1759 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 1760 VIO_DESC_DATA)) { 1761 PR1("Message is not an in-band-descriptor message"); 1762 return (ENOMSG); 1763 } 1764 1765 if (msglen < sizeof (*desc_msg)) { 1766 PR0("Expected at least %lu-byte descriptor message; " 1767 "received %lu bytes", sizeof (*desc_msg), msglen); 1768 return (EBADMSG); 1769 } 1770 1771 if (msglen != (expected = expected_inband_size(desc_msg))) { 1772 PR0("Expected %lu-byte descriptor message; " 1773 "received %lu bytes", expected, msglen); 1774 return (EBADMSG); 1775 } 1776 1777 if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) 1778 return (EBADMSG); 1779 1780 /* 1781 * Valid message: Set up the in-band descriptor task and process the 1782 * request. Arrange to acknowledge the client's message, unless an 1783 * error processing the descriptor task results in setting 1784 * VIO_SUBTYPE_NACK 1785 */ 1786 PR1("Valid in-band-descriptor message"); 1787 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 1788 1789 ASSERT(vd->inband_task.msg != NULL); 1790 1791 bcopy(msg, vd->inband_task.msg, msglen); 1792 vd->inband_task.msglen = msglen; 1793 1794 /* 1795 * The task request is now the payload of the message 1796 * that was just copied into the body of the task. 
1797 */ 1798 desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg; 1799 vd->inband_task.request = &desc_msg->payload; 1800 1801 return (vd_process_task(&vd->inband_task)); 1802 } 1803 1804 static int 1805 vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx, 1806 vio_msg_t *msg, size_t msglen) 1807 { 1808 int status; 1809 boolean_t ready; 1810 vd_dring_entry_t *elem = VD_DRING_ELEM(idx); 1811 1812 1813 /* Accept the updated dring element */ 1814 if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { 1815 PR0("ldc_mem_dring_acquire() returned errno %d", status); 1816 return (status); 1817 } 1818 ready = (elem->hdr.dstate == VIO_DESC_READY); 1819 if (ready) { 1820 elem->hdr.dstate = VIO_DESC_ACCEPTED; 1821 } else { 1822 PR0("descriptor %u not ready", idx); 1823 VD_DUMP_DRING_ELEM(elem); 1824 } 1825 if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 1826 PR0("ldc_mem_dring_release() returned errno %d", status); 1827 return (status); 1828 } 1829 if (!ready) 1830 return (EBUSY); 1831 1832 1833 /* Initialize a task and process the accepted element */ 1834 PR1("Processing dring element %u", idx); 1835 vd->dring_task[idx].type = type; 1836 1837 /* duplicate msg buf for cookies etc. */ 1838 bcopy(msg, vd->dring_task[idx].msg, msglen); 1839 1840 vd->dring_task[idx].msglen = msglen; 1841 if ((status = vd_process_task(&vd->dring_task[idx])) != EINPROGRESS) 1842 status = vd_mark_elem_done(vd, idx, 1843 vd->dring_task[idx].request->status, 1844 vd->dring_task[idx].request->nbytes); 1845 1846 return (status); 1847 } 1848 1849 static int 1850 vd_process_element_range(vd_t *vd, int start, int end, 1851 vio_msg_t *msg, size_t msglen) 1852 { 1853 int i, n, nelem, status = 0; 1854 boolean_t inprogress = B_FALSE; 1855 vd_task_type_t type; 1856 1857 1858 ASSERT(start >= 0); 1859 ASSERT(end >= 0); 1860 1861 /* 1862 * Arrange to acknowledge the client's message, unless an error 1863 * processing one of the dring elements results in setting 1864 * VIO_SUBTYPE_NACK 1865 */ 1866 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 1867 1868 /* 1869 * Process the dring elements in the range 1870 */ 1871 nelem = ((end < start) ? end + vd->dring_len : end) - start + 1; 1872 for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) { 1873 ((vio_dring_msg_t *)msg)->end_idx = i; 1874 type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK; 1875 status = vd_process_element(vd, type, i, msg, msglen); 1876 if (status == EINPROGRESS) 1877 inprogress = B_TRUE; 1878 else if (status != 0) 1879 break; 1880 } 1881 1882 /* 1883 * If some, but not all, operations of a multi-element range are in 1884 * progress, wait for other operations to complete before returning 1885 * (which will result in "ack" or "nack" of the message). Note that 1886 * all outstanding operations will need to complete, not just the ones 1887 * corresponding to the current range of dring elements; howevever, as 1888 * this situation is an error case, performance is less critical. 
1889 */ 1890 if ((nelem > 1) && (status != EINPROGRESS) && inprogress) 1891 ddi_taskq_wait(vd->completionq); 1892 1893 return (status); 1894 } 1895 1896 static int 1897 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1898 { 1899 vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg; 1900 1901 1902 ASSERT(msglen >= sizeof (msg->tag)); 1903 1904 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 1905 VIO_DRING_DATA)) { 1906 PR1("Message is not a dring-data message"); 1907 return (ENOMSG); 1908 } 1909 1910 if (msglen != sizeof (*dring_msg)) { 1911 PR0("Expected %lu-byte dring message; received %lu bytes", 1912 sizeof (*dring_msg), msglen); 1913 return (EBADMSG); 1914 } 1915 1916 if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) 1917 return (EBADMSG); 1918 1919 if (dring_msg->dring_ident != vd->dring_ident) { 1920 PR0("Expected dring ident %lu; received ident %lu", 1921 vd->dring_ident, dring_msg->dring_ident); 1922 return (EBADMSG); 1923 } 1924 1925 if (dring_msg->start_idx >= vd->dring_len) { 1926 PR0("\"start_idx\" = %u; must be less than %u", 1927 dring_msg->start_idx, vd->dring_len); 1928 return (EBADMSG); 1929 } 1930 1931 if ((dring_msg->end_idx < 0) || 1932 (dring_msg->end_idx >= vd->dring_len)) { 1933 PR0("\"end_idx\" = %u; must be >= 0 and less than %u", 1934 dring_msg->end_idx, vd->dring_len); 1935 return (EBADMSG); 1936 } 1937 1938 /* Valid message; process range of updated dring elements */ 1939 PR1("Processing descriptor range, start = %u, end = %u", 1940 dring_msg->start_idx, dring_msg->end_idx); 1941 return (vd_process_element_range(vd, dring_msg->start_idx, 1942 dring_msg->end_idx, msg, msglen)); 1943 } 1944 1945 static int 1946 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes) 1947 { 1948 int retry, status; 1949 size_t size = *nbytes; 1950 1951 1952 for (retry = 0, status = ETIMEDOUT; 1953 retry < vds_ldc_retries && status == ETIMEDOUT; 1954 retry++) { 1955 PR1("ldc_read() attempt %d", (retry + 1)); 1956 *nbytes = size; 1957 status = ldc_read(ldc_handle, msg, nbytes); 1958 } 1959 1960 if (status) { 1961 PR0("ldc_read() returned errno %d", status); 1962 if (status != ECONNRESET) 1963 return (ENOMSG); 1964 return (status); 1965 } else if (*nbytes == 0) { 1966 PR1("ldc_read() returned 0 and no message read"); 1967 return (ENOMSG); 1968 } 1969 1970 PR1("RCVD %lu-byte message", *nbytes); 1971 return (0); 1972 } 1973 1974 static int 1975 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1976 { 1977 int status; 1978 1979 1980 PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype, 1981 msg->tag.vio_subtype, msg->tag.vio_subtype_env); 1982 #ifdef DEBUG 1983 vd_decode_tag(msg); 1984 #endif 1985 1986 /* 1987 * Validate session ID up front, since it applies to all messages 1988 * once set 1989 */ 1990 if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) { 1991 PR0("Expected SID %u, received %u", vd->sid, 1992 msg->tag.vio_sid); 1993 return (EBADMSG); 1994 } 1995 1996 PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state)); 1997 1998 /* 1999 * Process the received message based on connection state 2000 */ 2001 switch (vd->state) { 2002 case VD_STATE_INIT: /* expect version message */ 2003 if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0) 2004 return (status); 2005 2006 /* Version negotiated, move to that state */ 2007 vd->state = VD_STATE_VER; 2008 return (0); 2009 2010 case VD_STATE_VER: /* expect attribute message */ 2011 if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0) 2012 return (status); 2013 
2014 /* Attributes exchanged, move to that state */ 2015 vd->state = VD_STATE_ATTR; 2016 return (0); 2017 2018 case VD_STATE_ATTR: 2019 switch (vd->xfer_mode) { 2020 case VIO_DESC_MODE: /* expect RDX message */ 2021 if ((status = process_rdx_msg(msg, msglen)) != 0) 2022 return (status); 2023 2024 /* Ready to receive in-band descriptors */ 2025 vd->state = VD_STATE_DATA; 2026 return (0); 2027 2028 case VIO_DRING_MODE: /* expect register-dring message */ 2029 if ((status = 2030 vd_process_dring_reg_msg(vd, msg, msglen)) != 0) 2031 return (status); 2032 2033 /* One dring negotiated, move to that state */ 2034 vd->state = VD_STATE_DRING; 2035 return (0); 2036 2037 default: 2038 ASSERT("Unsupported transfer mode"); 2039 PR0("Unsupported transfer mode"); 2040 return (ENOTSUP); 2041 } 2042 2043 case VD_STATE_DRING: /* expect RDX, register-dring, or unreg-dring */ 2044 if ((status = process_rdx_msg(msg, msglen)) == 0) { 2045 /* Ready to receive data */ 2046 vd->state = VD_STATE_DATA; 2047 return (0); 2048 } else if (status != ENOMSG) { 2049 return (status); 2050 } 2051 2052 2053 /* 2054 * If another register-dring message is received, stay in 2055 * dring state in case the client sends RDX; although the 2056 * protocol allows multiple drings, this server does not 2057 * support using more than one 2058 */ 2059 if ((status = 2060 vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG) 2061 return (status); 2062 2063 /* 2064 * Acknowledge an unregister-dring message, but reset the 2065 * connection anyway: Although the protocol allows 2066 * unregistering drings, this server cannot serve a vdisk 2067 * without its only dring 2068 */ 2069 status = vd_process_dring_unreg_msg(vd, msg, msglen); 2070 return ((status == 0) ? ENOTSUP : status); 2071 2072 case VD_STATE_DATA: 2073 switch (vd->xfer_mode) { 2074 case VIO_DESC_MODE: /* expect in-band-descriptor message */ 2075 return (vd_process_desc_msg(vd, msg, msglen)); 2076 2077 case VIO_DRING_MODE: /* expect dring-data or unreg-dring */ 2078 /* 2079 * Typically expect dring-data messages, so handle 2080 * them first 2081 */ 2082 if ((status = vd_process_dring_msg(vd, msg, 2083 msglen)) != ENOMSG) 2084 return (status); 2085 2086 /* 2087 * Acknowledge an unregister-dring message, but reset 2088 * the connection anyway: Although the protocol 2089 * allows unregistering drings, this server cannot 2090 * serve a vdisk without its only dring 2091 */ 2092 status = vd_process_dring_unreg_msg(vd, msg, msglen); 2093 return ((status == 0) ? 
ENOTSUP : status); 2094 2095 default: 2096 ASSERT("Unsupported transfer mode"); 2097 PR0("Unsupported transfer mode"); 2098 return (ENOTSUP); 2099 } 2100 2101 default: 2102 ASSERT("Invalid client connection state"); 2103 PR0("Invalid client connection state"); 2104 return (ENOTSUP); 2105 } 2106 } 2107 2108 static int 2109 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 2110 { 2111 int status; 2112 boolean_t reset_ldc = B_FALSE; 2113 2114 2115 /* 2116 * Check that the message is at least big enough for a "tag", so that 2117 * message processing can proceed based on tag-specified message type 2118 */ 2119 if (msglen < sizeof (vio_msg_tag_t)) { 2120 PR0("Received short (%lu-byte) message", msglen); 2121 /* Can't "nack" short message, so drop the big hammer */ 2122 PR0("initiating full reset"); 2123 vd_need_reset(vd, B_TRUE); 2124 return (EBADMSG); 2125 } 2126 2127 /* 2128 * Process the message 2129 */ 2130 switch (status = vd_do_process_msg(vd, msg, msglen)) { 2131 case 0: 2132 /* "ack" valid, successfully-processed messages */ 2133 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 2134 break; 2135 2136 case EINPROGRESS: 2137 /* The completion handler will "ack" or "nack" the message */ 2138 return (EINPROGRESS); 2139 case ENOMSG: 2140 PR0("Received unexpected message"); 2141 _NOTE(FALLTHROUGH); 2142 case EBADMSG: 2143 case ENOTSUP: 2144 /* "nack" invalid messages */ 2145 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 2146 break; 2147 2148 default: 2149 /* "nack" failed messages */ 2150 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 2151 /* An LDC error probably occurred, so try resetting it */ 2152 reset_ldc = B_TRUE; 2153 break; 2154 } 2155 2156 PR1("\tResulting in state %d (%s)", vd->state, 2157 vd_decode_state(vd->state)); 2158 2159 /* Send the "ack" or "nack" to the client */ 2160 PR1("Sending %s", 2161 (msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK"); 2162 if (send_msg(vd->ldc_handle, msg, msglen) != 0) 2163 reset_ldc = B_TRUE; 2164 2165 /* Arrange to reset the connection for nack'ed or failed messages */ 2166 if ((status != 0) || reset_ldc) { 2167 PR0("initiating %s reset", 2168 (reset_ldc) ? 
"full" : "soft"); 2169 vd_need_reset(vd, reset_ldc); 2170 } 2171 2172 return (status); 2173 } 2174 2175 static boolean_t 2176 vd_enabled(vd_t *vd) 2177 { 2178 boolean_t enabled; 2179 2180 2181 mutex_enter(&vd->lock); 2182 enabled = vd->enabled; 2183 mutex_exit(&vd->lock); 2184 return (enabled); 2185 } 2186 2187 static void 2188 vd_recv_msg(void *arg) 2189 { 2190 vd_t *vd = (vd_t *)arg; 2191 int rv = 0, status = 0; 2192 2193 ASSERT(vd != NULL); 2194 2195 PR2("New task to receive incoming message(s)"); 2196 2197 2198 while (vd_enabled(vd) && status == 0) { 2199 size_t msglen, msgsize; 2200 ldc_status_t lstatus; 2201 2202 /* 2203 * Receive and process a message 2204 */ 2205 vd_reset_if_needed(vd); /* can change vd->max_msglen */ 2206 2207 /* 2208 * check if channel is UP - else break out of loop 2209 */ 2210 status = ldc_status(vd->ldc_handle, &lstatus); 2211 if (lstatus != LDC_UP) { 2212 PR0("channel not up (status=%d), exiting recv loop\n", 2213 lstatus); 2214 break; 2215 } 2216 2217 ASSERT(vd->max_msglen != 0); 2218 2219 msgsize = vd->max_msglen; /* stable copy for alloc/free */ 2220 msglen = msgsize; /* actual len after recv_msg() */ 2221 2222 status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen); 2223 switch (status) { 2224 case 0: 2225 rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp, 2226 msglen); 2227 /* check if max_msglen changed */ 2228 if (msgsize != vd->max_msglen) { 2229 PR0("max_msglen changed 0x%lx to 0x%lx bytes\n", 2230 msgsize, vd->max_msglen); 2231 kmem_free(vd->vio_msgp, msgsize); 2232 vd->vio_msgp = 2233 kmem_alloc(vd->max_msglen, KM_SLEEP); 2234 } 2235 if (rv == EINPROGRESS) 2236 continue; 2237 break; 2238 2239 case ENOMSG: 2240 break; 2241 2242 case ECONNRESET: 2243 PR0("initiating soft reset (ECONNRESET)\n"); 2244 vd_need_reset(vd, B_FALSE); 2245 status = 0; 2246 break; 2247 2248 default: 2249 /* Probably an LDC failure; arrange to reset it */ 2250 PR0("initiating full reset (status=0x%x)", status); 2251 vd_need_reset(vd, B_TRUE); 2252 break; 2253 } 2254 } 2255 2256 PR2("Task finished"); 2257 } 2258 2259 static uint_t 2260 vd_handle_ldc_events(uint64_t event, caddr_t arg) 2261 { 2262 vd_t *vd = (vd_t *)(void *)arg; 2263 int status; 2264 2265 ASSERT(vd != NULL); 2266 2267 if (!vd_enabled(vd)) 2268 return (LDC_SUCCESS); 2269 2270 if (event & LDC_EVT_DOWN) { 2271 PR0("LDC_EVT_DOWN: LDC channel went down"); 2272 2273 vd_need_reset(vd, B_TRUE); 2274 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 2275 DDI_SLEEP); 2276 if (status == DDI_FAILURE) { 2277 PR0("cannot schedule task to recv msg\n"); 2278 vd_need_reset(vd, B_TRUE); 2279 } 2280 } 2281 2282 if (event & LDC_EVT_RESET) { 2283 PR0("LDC_EVT_RESET: LDC channel was reset"); 2284 2285 if (vd->state != VD_STATE_INIT) { 2286 PR0("scheduling full reset"); 2287 vd_need_reset(vd, B_FALSE); 2288 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 2289 vd, DDI_SLEEP); 2290 if (status == DDI_FAILURE) { 2291 PR0("cannot schedule task to recv msg\n"); 2292 vd_need_reset(vd, B_TRUE); 2293 } 2294 2295 } else { 2296 PR0("channel already reset, ignoring...\n"); 2297 PR0("doing ldc up...\n"); 2298 (void) ldc_up(vd->ldc_handle); 2299 } 2300 2301 return (LDC_SUCCESS); 2302 } 2303 2304 if (event & LDC_EVT_UP) { 2305 PR0("EVT_UP: LDC is up\nResetting client connection state"); 2306 PR0("initiating soft reset"); 2307 vd_need_reset(vd, B_FALSE); 2308 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 2309 vd, DDI_SLEEP); 2310 if (status == DDI_FAILURE) { 2311 PR0("cannot schedule task to recv msg\n"); 2312 vd_need_reset(vd, 
B_TRUE); 2313 return (LDC_SUCCESS); 2314 } 2315 } 2316 2317 if (event & LDC_EVT_READ) { 2318 int status; 2319 2320 PR1("New data available"); 2321 /* Queue a task to receive the new data */ 2322 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 2323 DDI_SLEEP); 2324 2325 if (status == DDI_FAILURE) { 2326 PR0("cannot schedule task to recv msg\n"); 2327 vd_need_reset(vd, B_TRUE); 2328 } 2329 } 2330 2331 return (LDC_SUCCESS); 2332 } 2333 2334 static uint_t 2335 vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg) 2336 { 2337 _NOTE(ARGUNUSED(key, val)) 2338 (*((uint_t *)arg))++; 2339 return (MH_WALK_TERMINATE); 2340 } 2341 2342 2343 static int 2344 vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2345 { 2346 uint_t vd_present = 0; 2347 minor_t instance; 2348 vds_t *vds; 2349 2350 2351 switch (cmd) { 2352 case DDI_DETACH: 2353 /* the real work happens below */ 2354 break; 2355 case DDI_SUSPEND: 2356 PR0("No action required for DDI_SUSPEND"); 2357 return (DDI_SUCCESS); 2358 default: 2359 PR0("Unrecognized \"cmd\""); 2360 return (DDI_FAILURE); 2361 } 2362 2363 ASSERT(cmd == DDI_DETACH); 2364 instance = ddi_get_instance(dip); 2365 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { 2366 PR0("Could not get state for instance %u", instance); 2367 ddi_soft_state_free(vds_state, instance); 2368 return (DDI_FAILURE); 2369 } 2370 2371 /* Do no detach when serving any vdisks */ 2372 mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present); 2373 if (vd_present) { 2374 PR0("Not detaching because serving vdisks"); 2375 return (DDI_FAILURE); 2376 } 2377 2378 PR0("Detaching"); 2379 if (vds->initialized & VDS_MDEG) { 2380 (void) mdeg_unregister(vds->mdeg); 2381 kmem_free(vds->ispecp->specp, sizeof (vds_prop_template)); 2382 kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t)); 2383 vds->ispecp = NULL; 2384 vds->mdeg = NULL; 2385 } 2386 2387 if (vds->initialized & VDS_LDI) 2388 (void) ldi_ident_release(vds->ldi_ident); 2389 mod_hash_destroy_hash(vds->vd_table); 2390 ddi_soft_state_free(vds_state, instance); 2391 return (DDI_SUCCESS); 2392 } 2393 2394 static boolean_t 2395 is_pseudo_device(dev_info_t *dip) 2396 { 2397 dev_info_t *parent, *root = ddi_root_node(); 2398 2399 2400 for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root); 2401 parent = ddi_get_parent(parent)) { 2402 if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0) 2403 return (B_TRUE); 2404 } 2405 2406 return (B_FALSE); 2407 } 2408 2409 static int 2410 vd_setup_full_disk(vd_t *vd) 2411 { 2412 int rval, status; 2413 major_t major = getmajor(vd->dev[0]); 2414 minor_t minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE; 2415 struct dk_minfo dk_minfo; 2416 2417 /* 2418 * At this point, vdisk_size is set to the size of partition 2 but 2419 * this does not represent the size of the disk because partition 2 2420 * may not cover the entire disk and its size does not include reserved 2421 * blocks. So we update vdisk_size to be the size of the entire disk. 
2422 */ 2423 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO, 2424 (intptr_t)&dk_minfo, (vd_open_flags | FKIOCTL), 2425 kcred, &rval)) != 0) { 2426 PR0("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d", 2427 status); 2428 return (status); 2429 } 2430 vd->vdisk_size = dk_minfo.dki_capacity; 2431 2432 /* Set full-disk parameters */ 2433 vd->vdisk_type = VD_DISK_TYPE_DISK; 2434 vd->nslices = (sizeof (vd->dev))/(sizeof (vd->dev[0])); 2435 2436 /* Move dev number and LDI handle to entire-disk-slice array elements */ 2437 vd->dev[VD_ENTIRE_DISK_SLICE] = vd->dev[0]; 2438 vd->dev[0] = 0; 2439 vd->ldi_handle[VD_ENTIRE_DISK_SLICE] = vd->ldi_handle[0]; 2440 vd->ldi_handle[0] = NULL; 2441 2442 /* Initialize device numbers for remaining slices and open them */ 2443 for (int slice = 0; slice < vd->nslices; slice++) { 2444 /* 2445 * Skip the entire-disk slice, as it's already open and its 2446 * device known 2447 */ 2448 if (slice == VD_ENTIRE_DISK_SLICE) 2449 continue; 2450 ASSERT(vd->dev[slice] == 0); 2451 ASSERT(vd->ldi_handle[slice] == NULL); 2452 2453 /* 2454 * Construct the device number for the current slice 2455 */ 2456 vd->dev[slice] = makedevice(major, (minor + slice)); 2457 2458 /* 2459 * Open all slices of the disk to serve them to the client. 2460 * Slices are opened exclusively to prevent other threads or 2461 * processes in the service domain from performing I/O to 2462 * slices being accessed by a client. Failure to open a slice 2463 * results in vds not serving this disk, as the client could 2464 * attempt (and should be able) to access any slice immediately. 2465 * Any slices successfully opened before a failure will get 2466 * closed by vds_destroy_vd() as a result of the error returned 2467 * by this function. 2468 * 2469 * We need to do the open with FNDELAY so that opening an empty 2470 * slice does not fail. 
2471 */ 2472 PR0("Opening device major %u, minor %u = slice %u", 2473 major, minor, slice); 2474 if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 2475 vd_open_flags | FNDELAY, kcred, &vd->ldi_handle[slice], 2476 vd->vds->ldi_ident)) != 0) { 2477 PR0("ldi_open_by_dev() returned errno %d " 2478 "for slice %u", status, slice); 2479 /* vds_destroy_vd() will close any open slices */ 2480 return (status); 2481 } 2482 } 2483 2484 return (0); 2485 } 2486 2487 static int 2488 vd_setup_partition_efi(vd_t *vd) 2489 { 2490 efi_gpt_t *gpt; 2491 efi_gpe_t *gpe; 2492 struct uuid uuid = EFI_RESERVED; 2493 uint32_t crc; 2494 int length; 2495 2496 length = sizeof (efi_gpt_t) + sizeof (efi_gpe_t); 2497 2498 gpt = kmem_zalloc(length, KM_SLEEP); 2499 gpe = (efi_gpe_t *)(gpt + 1); 2500 2501 gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE); 2502 gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); 2503 gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t)); 2504 gpt->efi_gpt_FirstUsableLBA = LE_64(0ULL); 2505 gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1); 2506 gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1); 2507 gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t)); 2508 2509 UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid); 2510 gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA; 2511 gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA; 2512 2513 CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table); 2514 gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); 2515 2516 CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table); 2517 gpt->efi_gpt_HeaderCRC32 = LE_32(~crc); 2518 2519 vd->dk_efi.dki_lba = 0; 2520 vd->dk_efi.dki_length = length; 2521 vd->dk_efi.dki_data = gpt; 2522 2523 return (0); 2524 } 2525 2526 static int 2527 vd_setup_file(vd_t *vd) 2528 { 2529 int i, rval, status; 2530 short sum; 2531 vattr_t vattr; 2532 dev_t dev; 2533 char *file_path = vd->device_path; 2534 char dev_path[MAXPATHLEN + 1]; 2535 ldi_handle_t lhandle; 2536 struct dk_cinfo dk_cinfo; 2537 struct dk_label *label; 2538 2539 /* make sure the file is valid */ 2540 if ((status = lookupname(file_path, UIO_SYSSPACE, FOLLOW, 2541 NULLVPP, &vd->file_vnode)) != 0) { 2542 PR0("Cannot lookup file(%s) errno %d", file_path, status); 2543 return (status); 2544 } 2545 2546 if (vd->file_vnode->v_type != VREG) { 2547 PR0("Invalid file type (%s)\n", file_path); 2548 VN_RELE(vd->file_vnode); 2549 return (EBADF); 2550 } 2551 VN_RELE(vd->file_vnode); 2552 2553 if ((status = vn_open(file_path, UIO_SYSSPACE, vd_open_flags | FOFFMAX, 2554 0, &vd->file_vnode, 0, 0)) != 0) { 2555 PR0("vn_open(%s) = errno %d", file_path, status); 2556 return (status); 2557 } 2558 2559 vattr.va_mask = AT_SIZE; 2560 if ((status = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred)) != 0) { 2561 PR0("VOP_GETATTR(%s) = errno %d", file_path, status); 2562 (void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1, 0, kcred); 2563 VN_RELE(vd->file_vnode); 2564 return (EIO); 2565 } 2566 2567 vd->file_size = vattr.va_size; 2568 /* size should be at least sizeof(dk_label) */ 2569 if (vd->file_size < sizeof (struct dk_label)) { 2570 PRN("Size of file has to be at least %ld bytes", 2571 sizeof (struct dk_label)); 2572 (void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1, 0, kcred); 2573 VN_RELE(vd->file_vnode); 2574 return (EIO); 2575 } 2576 2577 if ((status = VOP_MAP(vd->file_vnode, 0, &kas, &vd->file_maddr, 2578 vd->file_size, PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE, 2579 MAP_SHARED, kcred)) != 0) { 2580 PR0("VOP_MAP(%s) = errno %d", file_path, status); 2581 (void) 
VOP_CLOSE(vd->file_vnode, vd_open_flags, 1, 0, kcred); 2582 VN_RELE(vd->file_vnode); 2583 return (EIO); 2584 } 2585 2586 label = (struct dk_label *)vd->file_maddr; 2587 2588 /* label checksum */ 2589 sum = vd_lbl2cksum(label); 2590 2591 if (label->dkl_magic != DKL_MAGIC || label->dkl_cksum != sum) { 2592 PR0("%s has an invalid disk label " 2593 "(magic=%x cksum=%x (expect %x))", 2594 file_path, label->dkl_magic, label->dkl_cksum, sum); 2595 2596 /* default label */ 2597 bzero(label, sizeof (struct dk_label)); 2598 2599 /* 2600 * We must have a reasonable number of cylinders and sectors so 2601 * that newfs can run using default values. 2602 * 2603 * if (disk_size < 2MB) 2604 * phys_cylinders = disk_size / 100K 2605 * else 2606 * phys_cylinders = disk_size / 300K 2607 * 2608 * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders 2609 * alt_cylinders = (phys_cylinders > 2) ? 2 : 0; 2610 * data_cylinders = phys_cylinders - alt_cylinders 2611 * 2612 * sectors = disk_size / (phys_cylinders * blk_size) 2613 */ 2614 if (vd->file_size < (2 * 1024 * 1024)) 2615 label->dkl_pcyl = vd->file_size / (100 * 1024); 2616 else 2617 label->dkl_pcyl = vd->file_size / (300 * 1024); 2618 2619 if (label->dkl_pcyl == 0) 2620 label->dkl_pcyl = 1; 2621 2622 if (label->dkl_pcyl > 2) 2623 label->dkl_acyl = 2; 2624 else 2625 label->dkl_acyl = 0; 2626 2627 label->dkl_nsect = vd->file_size / 2628 (DEV_BSIZE * label->dkl_pcyl); 2629 label->dkl_ncyl = label->dkl_pcyl - label->dkl_acyl; 2630 label->dkl_nhead = 1; 2631 label->dkl_write_reinstruct = 0; 2632 label->dkl_read_reinstruct = 0; 2633 label->dkl_rpm = 7200; 2634 label->dkl_apc = 0; 2635 label->dkl_intrlv = 0; 2636 label->dkl_magic = DKL_MAGIC; 2637 2638 PR0("requested disk size: %ld bytes\n", vd->file_size); 2639 PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label->dkl_pcyl, 2640 label->dkl_nhead, label->dkl_nsect); 2641 PR0("provided disk size: %ld bytes\n", (uint64_t) 2642 (label->dkl_pcyl * 2643 label->dkl_nhead * label->dkl_nsect * DEV_BSIZE)); 2644 2645 /* 2646 * We must have a correct label name, otherwise format(1m) will 2647 * not recognize the disk as labeled.
2648 */ 2649 (void) snprintf(label->dkl_asciilabel, LEN_DKL_ASCII, 2650 "SUNVDSK cyl %d alt %d hd %d sec %d", 2651 label->dkl_ncyl, label->dkl_acyl, label->dkl_nhead, 2652 label->dkl_nsect); 2653 2654 /* default VTOC */ 2655 label->dkl_vtoc.v_version = V_VERSION; 2656 label->dkl_vtoc.v_nparts = 8; 2657 label->dkl_vtoc.v_sanity = VTOC_SANE; 2658 label->dkl_vtoc.v_part[2].p_tag = V_BACKUP; 2659 label->dkl_map[2].dkl_cylno = 0; 2660 label->dkl_map[2].dkl_nblk = label->dkl_ncyl * 2661 label->dkl_nhead * label->dkl_nsect; 2662 label->dkl_map[0] = label->dkl_map[2]; 2663 label->dkl_map[0] = label->dkl_map[2]; 2664 label->dkl_cksum = vd_lbl2cksum(label); 2665 } 2666 2667 vd->nslices = label->dkl_vtoc.v_nparts; 2668 2669 /* sector size = block size = DEV_BSIZE */ 2670 vd->vdisk_size = (label->dkl_pcyl * 2671 label->dkl_nhead * label->dkl_nsect) / DEV_BSIZE; 2672 vd->vdisk_type = VD_DISK_TYPE_DISK; 2673 vd->vdisk_label = VD_DISK_LABEL_VTOC; 2674 vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */ 2675 2676 /* Get max_xfer_sz from the device where the file is */ 2677 dev = vd->file_vnode->v_vfsp->vfs_dev; 2678 dev_path[0] = NULL; 2679 if (ddi_dev_pathname(dev, S_IFBLK, dev_path) == DDI_SUCCESS) { 2680 PR0("underlying device = %s\n", dev_path); 2681 } 2682 2683 if ((status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD, 2684 kcred, &lhandle, vd->vds->ldi_ident)) != 0) { 2685 PR0("ldi_open_by_dev() returned errno %d for device %s", 2686 status, dev_path); 2687 } else { 2688 if ((status = ldi_ioctl(lhandle, DKIOCINFO, 2689 (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred, 2690 &rval)) != 0) { 2691 PR0("ldi_ioctl(DKIOCINFO) returned errno %d for %s", 2692 status, dev_path); 2693 } else { 2694 /* 2695 * Store the device's max transfer size for 2696 * return to the client 2697 */ 2698 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; 2699 } 2700 2701 PR0("close the device %s", dev_path); 2702 (void) ldi_close(lhandle, FREAD, kcred); 2703 } 2704 2705 PR0("using for file %s, dev %s, max_xfer = %u blks", 2706 file_path, dev_path, vd->max_xfer_sz); 2707 2708 vd->pseudo = B_FALSE; 2709 vd->file = B_TRUE; 2710 2711 vd->dk_geom.dkg_ncyl = label->dkl_ncyl; 2712 vd->dk_geom.dkg_acyl = label->dkl_acyl; 2713 vd->dk_geom.dkg_pcyl = label->dkl_pcyl; 2714 vd->dk_geom.dkg_nhead = label->dkl_nhead; 2715 vd->dk_geom.dkg_nsect = label->dkl_nsect; 2716 vd->dk_geom.dkg_intrlv = label->dkl_intrlv; 2717 vd->dk_geom.dkg_apc = label->dkl_apc; 2718 vd->dk_geom.dkg_rpm = label->dkl_rpm; 2719 vd->dk_geom.dkg_write_reinstruct = label->dkl_write_reinstruct; 2720 vd->dk_geom.dkg_read_reinstruct = label->dkl_read_reinstruct; 2721 2722 vd->vtoc.v_sanity = label->dkl_vtoc.v_sanity; 2723 vd->vtoc.v_version = label->dkl_vtoc.v_version; 2724 vd->vtoc.v_sectorsz = DEV_BSIZE; 2725 vd->vtoc.v_nparts = label->dkl_vtoc.v_nparts; 2726 2727 bcopy(label->dkl_vtoc.v_volume, vd->vtoc.v_volume, 2728 LEN_DKL_VVOL); 2729 bcopy(label->dkl_asciilabel, vd->vtoc.v_asciilabel, 2730 LEN_DKL_ASCII); 2731 2732 for (i = 0; i < vd->nslices; i++) { 2733 vd->vtoc.timestamp[i] = label->dkl_vtoc.v_timestamp[i]; 2734 vd->vtoc.v_part[i].p_tag = label->dkl_vtoc.v_part[i].p_tag; 2735 vd->vtoc.v_part[i].p_flag = label->dkl_vtoc.v_part[i].p_flag; 2736 vd->vtoc.v_part[i].p_start = label->dkl_map[i].dkl_cylno * 2737 label->dkl_nhead * label->dkl_nsect; 2738 vd->vtoc.v_part[i].p_size = label->dkl_map[i].dkl_nblk; 2739 vd->ldi_handle[i] = NULL; 2740 vd->dev[i] = NULL; 2741 } 2742 2743 return (0); 2744 } 2745 2746 static int 2747 vd_setup_vd(vd_t *vd) 2748 { 2749 int 
rval, status; 2750 dev_info_t *dip; 2751 struct dk_cinfo dk_cinfo; 2752 char *device_path = vd->device_path; 2753 2754 /* 2755 * We need to open with FNDELAY so that opening an empty partition 2756 * does not fail. 2757 */ 2758 if ((status = ldi_open_by_name(device_path, vd_open_flags | FNDELAY, 2759 kcred, &vd->ldi_handle[0], vd->vds->ldi_ident)) != 0) { 2760 PR0("ldi_open_by_name(%s) = errno %d", device_path, status); 2761 2762 /* this may not be a device try opening as a file */ 2763 if (status == ENXIO || status == ENODEV) 2764 status = vd_setup_file(vd); 2765 if (status) { 2766 PR0("Cannot use device/file (%s), errno=%d\n", 2767 device_path, status); 2768 if (status == ENXIO || status == ENODEV || 2769 status == ENOENT) { 2770 return (EAGAIN); 2771 } 2772 } 2773 return (status); 2774 } 2775 2776 /* 2777 * nslices must be updated now so that vds_destroy_vd() will close 2778 * the slice we have just opened in case of an error. 2779 */ 2780 vd->nslices = 1; 2781 vd->file = B_FALSE; 2782 2783 /* Get device number and size of backing device */ 2784 if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) { 2785 PRN("ldi_get_dev() returned errno %d for %s", 2786 status, device_path); 2787 return (status); 2788 } 2789 if (ldi_get_size(vd->ldi_handle[0], &vd->vdisk_size) != DDI_SUCCESS) { 2790 PRN("ldi_get_size() failed for %s", device_path); 2791 return (EIO); 2792 } 2793 vd->vdisk_size = lbtodb(vd->vdisk_size); /* convert to blocks */ 2794 2795 /* Verify backing device supports dk_cinfo, dk_geom, and vtoc */ 2796 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO, 2797 (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred, 2798 &rval)) != 0) { 2799 PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s", 2800 status, device_path); 2801 return (status); 2802 } 2803 if (dk_cinfo.dki_partition >= V_NUMPAR) { 2804 PRN("slice %u >= maximum slice %u for %s", 2805 dk_cinfo.dki_partition, V_NUMPAR, device_path); 2806 return (EIO); 2807 } 2808 2809 status = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc, &vd->vdisk_label); 2810 2811 if (status != 0) { 2812 PRN("vd_read_vtoc returned errno %d for %s", 2813 status, device_path); 2814 return (status); 2815 } 2816 2817 if (vd->vdisk_label == VD_DISK_LABEL_VTOC && 2818 (status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM, 2819 (intptr_t)&vd->dk_geom, (vd_open_flags | FKIOCTL), 2820 kcred, &rval)) != 0) { 2821 PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s", 2822 status, device_path); 2823 return (status); 2824 } 2825 2826 /* Store the device's max transfer size for return to the client */ 2827 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; 2828 2829 /* Determine if backing device is a pseudo device */ 2830 if ((dip = ddi_hold_devi_by_instance(getmajor(vd->dev[0]), 2831 dev_to_instance(vd->dev[0]), 0)) == NULL) { 2832 PRN("%s is no longer accessible", device_path); 2833 return (EIO); 2834 } 2835 vd->pseudo = is_pseudo_device(dip); 2836 ddi_release_devi(dip); 2837 if (vd->pseudo) { 2838 vd->vdisk_type = VD_DISK_TYPE_SLICE; 2839 vd->nslices = 1; 2840 return (0); /* ...and we're done */ 2841 } 2842 2843 /* If slice is entire-disk slice, initialize for full disk */ 2844 if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE) 2845 return (vd_setup_full_disk(vd)); 2846 2847 2848 /* Otherwise, we have a non-entire slice of a device */ 2849 vd->vdisk_type = VD_DISK_TYPE_SLICE; 2850 vd->nslices = 1; 2851 2852 if (vd->vdisk_label == VD_DISK_LABEL_EFI) { 2853 status = vd_setup_partition_efi(vd); 2854 return (status); 2855 } 2856 2857 /* Initialize dk_geom structure for 
single-slice device */ 2858 if (vd->dk_geom.dkg_nsect == 0) { 2859 PR0("%s geometry claims 0 sectors per track", device_path); 2860 return (EIO); 2861 } 2862 if (vd->dk_geom.dkg_nhead == 0) { 2863 PR0("%s geometry claims 0 heads", device_path); 2864 return (EIO); 2865 } 2866 vd->dk_geom.dkg_ncyl = 2867 vd->vdisk_size/vd->dk_geom.dkg_nsect/vd->dk_geom.dkg_nhead; 2868 vd->dk_geom.dkg_acyl = 0; 2869 vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl; 2870 2871 2872 /* Initialize vtoc structure for single-slice device */ 2873 bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume, 2874 MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume))); 2875 bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part)); 2876 vd->vtoc.v_nparts = 1; 2877 vd->vtoc.v_part[0].p_tag = V_UNASSIGNED; 2878 vd->vtoc.v_part[0].p_flag = 0; 2879 vd->vtoc.v_part[0].p_start = 0; 2880 vd->vtoc.v_part[0].p_size = vd->vdisk_size; 2881 bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel, 2882 MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel))); 2883 2884 2885 return (0); 2886 } 2887 2888 static int 2889 vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id, 2890 vd_t **vdp) 2891 { 2892 char tq_name[TASKQ_NAMELEN]; 2893 int status; 2894 ddi_iblock_cookie_t iblock = NULL; 2895 ldc_attr_t ldc_attr; 2896 vd_t *vd; 2897 2898 2899 ASSERT(vds != NULL); 2900 ASSERT(device_path != NULL); 2901 ASSERT(vdp != NULL); 2902 PR0("Adding vdisk for %s", device_path); 2903 2904 if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) { 2905 PRN("No memory for virtual disk"); 2906 return (EAGAIN); 2907 } 2908 *vdp = vd; /* assign here so vds_destroy_vd() can cleanup later */ 2909 vd->vds = vds; 2910 (void) strncpy(vd->device_path, device_path, MAXPATHLEN); 2911 2912 /* Open vdisk and initialize parameters */ 2913 if ((status = vd_setup_vd(vd)) == 0) { 2914 vd->initialized |= VD_DISK_READY; 2915 2916 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); 2917 PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u", 2918 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), 2919 (vd->pseudo ? "yes" : "no"), (vd->file ? 
"yes" : "no"), 2920 vd->nslices); 2921 } else { 2922 if (status != EAGAIN) 2923 return (status); 2924 } 2925 2926 /* Initialize locking */ 2927 if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED, 2928 &iblock) != DDI_SUCCESS) { 2929 PRN("Could not get iblock cookie."); 2930 return (EIO); 2931 } 2932 2933 mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock); 2934 vd->initialized |= VD_LOCKING; 2935 2936 2937 /* Create start and completion task queues for the vdisk */ 2938 (void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id); 2939 PR1("tq_name = %s", tq_name); 2940 if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1, 2941 TASKQ_DEFAULTPRI, 0)) == NULL) { 2942 PRN("Could not create task queue"); 2943 return (EIO); 2944 } 2945 (void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id); 2946 PR1("tq_name = %s", tq_name); 2947 if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1, 2948 TASKQ_DEFAULTPRI, 0)) == NULL) { 2949 PRN("Could not create task queue"); 2950 return (EIO); 2951 } 2952 vd->enabled = 1; /* before callback can dispatch to startq */ 2953 2954 2955 /* Bring up LDC */ 2956 ldc_attr.devclass = LDC_DEV_BLK_SVC; 2957 ldc_attr.instance = ddi_get_instance(vds->dip); 2958 ldc_attr.mode = LDC_MODE_UNRELIABLE; 2959 ldc_attr.mtu = VD_LDC_MTU; 2960 if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) { 2961 PR0("ldc_init(%lu) = errno %d", ldc_id, status); 2962 return (status); 2963 } 2964 vd->initialized |= VD_LDC; 2965 2966 if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events, 2967 (caddr_t)vd)) != 0) { 2968 PR0("ldc_reg_callback() returned errno %d", status); 2969 return (status); 2970 } 2971 2972 if ((status = ldc_open(vd->ldc_handle)) != 0) { 2973 PR0("ldc_open() returned errno %d", status); 2974 return (status); 2975 } 2976 2977 if ((status = ldc_up(vd->ldc_handle)) != 0) { 2978 PR0("ldc_up() returned errno %d", status); 2979 } 2980 2981 /* Allocate the inband task memory handle */ 2982 status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl)); 2983 if (status) { 2984 PR0("ldc_mem_alloc_handle() returned err %d ", status); 2985 return (ENXIO); 2986 } 2987 2988 /* Add the successfully-initialized vdisk to the server's table */ 2989 if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) { 2990 PRN("Error adding vdisk ID %lu to table", id); 2991 return (EIO); 2992 } 2993 2994 /* Allocate the staging buffer */ 2995 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 2996 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 2997 2998 /* store initial state */ 2999 vd->state = VD_STATE_INIT; 3000 3001 return (0); 3002 } 3003 3004 static void 3005 vd_free_dring_task(vd_t *vdp) 3006 { 3007 if (vdp->dring_task != NULL) { 3008 ASSERT(vdp->dring_len != 0); 3009 /* Free all dring_task memory handles */ 3010 for (int i = 0; i < vdp->dring_len; i++) { 3011 (void) ldc_mem_free_handle(vdp->dring_task[i].mhdl); 3012 kmem_free(vdp->dring_task[i].msg, vdp->max_msglen); 3013 vdp->dring_task[i].msg = NULL; 3014 } 3015 kmem_free(vdp->dring_task, 3016 (sizeof (*vdp->dring_task)) * vdp->dring_len); 3017 vdp->dring_task = NULL; 3018 } 3019 } 3020 3021 /* 3022 * Destroy the state associated with a virtual disk 3023 */ 3024 static void 3025 vds_destroy_vd(void *arg) 3026 { 3027 vd_t *vd = (vd_t *)arg; 3028 int retry = 0, rv; 3029 3030 if (vd == NULL) 3031 return; 3032 3033 PR0("Destroying vdisk state"); 3034 3035 if (vd->dk_efi.dki_data != NULL) 3036 kmem_free(vd->dk_efi.dki_data, vd->dk_efi.dki_length); 3037 
3038 /* Disable queuing requests for the vdisk */ 3039 if (vd->initialized & VD_LOCKING) { 3040 mutex_enter(&vd->lock); 3041 vd->enabled = 0; 3042 mutex_exit(&vd->lock); 3043 } 3044 3045 /* Drain and destroy start queue (*before* destroying completionq) */ 3046 if (vd->startq != NULL) 3047 ddi_taskq_destroy(vd->startq); /* waits for queued tasks */ 3048 3049 /* Drain and destroy completion queue (*before* shutting down LDC) */ 3050 if (vd->completionq != NULL) 3051 ddi_taskq_destroy(vd->completionq); /* waits for tasks */ 3052 3053 vd_free_dring_task(vd); 3054 3055 /* Free the inband task memory handle */ 3056 (void) ldc_mem_free_handle(vd->inband_task.mhdl); 3057 3058 /* Shut down LDC */ 3059 if (vd->initialized & VD_LDC) { 3060 /* unmap the dring */ 3061 if (vd->initialized & VD_DRING) 3062 (void) ldc_mem_dring_unmap(vd->dring_handle); 3063 3064 /* close LDC channel - retry on EAGAIN */ 3065 while ((rv = ldc_close(vd->ldc_handle)) == EAGAIN) { 3066 if (++retry > vds_ldc_retries) { 3067 PR0("Timed out closing channel"); 3068 break; 3069 } 3070 drv_usecwait(vds_ldc_delay); 3071 } 3072 if (rv == 0) { 3073 (void) ldc_unreg_callback(vd->ldc_handle); 3074 (void) ldc_fini(vd->ldc_handle); 3075 } else { 3076 /* 3077 * Closing the LDC channel has failed. Ideally we should 3078 * fail here but there is no Zeus level infrastructure 3079 * to handle this. The MD has already been changed and 3080 * we have to do the close. So we try to do as much 3081 * clean up as we can. 3082 */ 3083 (void) ldc_set_cb_mode(vd->ldc_handle, LDC_CB_DISABLE); 3084 while (ldc_unreg_callback(vd->ldc_handle) == EAGAIN) 3085 drv_usecwait(vds_ldc_delay); 3086 } 3087 } 3088 3089 /* Free the staging buffer for msgs */ 3090 if (vd->vio_msgp != NULL) { 3091 kmem_free(vd->vio_msgp, vd->max_msglen); 3092 vd->vio_msgp = NULL; 3093 } 3094 3095 /* Free the inband message buffer */ 3096 if (vd->inband_task.msg != NULL) { 3097 kmem_free(vd->inband_task.msg, vd->max_msglen); 3098 vd->inband_task.msg = NULL; 3099 } 3100 if (vd->initialized & VD_DISK_READY) { 3101 if (vd->file) { 3102 /* Unmap and close file */ 3103 (void) as_unmap(&kas, vd->file_maddr, vd->file_size); 3104 (void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1, 3105 0, kcred); 3106 VN_RELE(vd->file_vnode); 3107 } else { 3108 /* Close any open backing-device slices */ 3109 for (uint_t slice = 0; slice < vd->nslices; slice++) { 3110 if (vd->ldi_handle[slice] != NULL) { 3111 PR0("Closing slice %u", slice); 3112 (void) ldi_close(vd->ldi_handle[slice], 3113 vd_open_flags | FNDELAY, kcred); 3114 } 3115 } 3116 } 3117 } 3118 3119 /* Free lock */ 3120 if (vd->initialized & VD_LOCKING) 3121 mutex_destroy(&vd->lock); 3122 3123 /* Finally, free the vdisk structure itself */ 3124 kmem_free(vd, sizeof (*vd)); 3125 } 3126 3127 static int 3128 vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id) 3129 { 3130 int status; 3131 vd_t *vd = NULL; 3132 3133 3134 if ((status = vds_do_init_vd(vds, id, device_path, ldc_id, &vd)) != 0) 3135 vds_destroy_vd(vd); 3136 3137 return (status); 3138 } 3139 3140 static int 3141 vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel, 3142 uint64_t *ldc_id) 3143 { 3144 int num_channels; 3145 3146 3147 /* Look for channel endpoint child(ren) of the vdisk MD node */ 3148 if ((num_channels = md_scan_dag(md, vd_node, 3149 md_find_name(md, VD_CHANNEL_ENDPOINT), 3150 md_find_name(md, "fwd"), channel)) <= 0) { 3151 PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT); 3152 return (-1); 3153 } 3154 3155 /* Get the "id" value 
for the first channel endpoint node */ 3156 if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) { 3157 PRN("No \"%s\" property found for \"%s\" of vdisk", 3158 VD_ID_PROP, VD_CHANNEL_ENDPOINT); 3159 return (-1); 3160 } 3161 3162 if (num_channels > 1) { 3163 PRN("Using ID of first of multiple channels for this vdisk"); 3164 } 3165 3166 return (0); 3167 } 3168 3169 static int 3170 vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id) 3171 { 3172 int num_nodes, status; 3173 size_t size; 3174 mde_cookie_t *channel; 3175 3176 3177 if ((num_nodes = md_node_count(md)) <= 0) { 3178 PRN("Invalid node count in Machine Description subtree"); 3179 return (-1); 3180 } 3181 size = num_nodes*(sizeof (*channel)); 3182 channel = kmem_zalloc(size, KM_SLEEP); 3183 status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id); 3184 kmem_free(channel, size); 3185 3186 return (status); 3187 } 3188 3189 static void 3190 vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) 3191 { 3192 char *device_path = NULL; 3193 uint64_t id = 0, ldc_id = 0; 3194 3195 3196 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { 3197 PRN("Error getting vdisk \"%s\"", VD_ID_PROP); 3198 return; 3199 } 3200 PR0("Adding vdisk ID %lu", id); 3201 if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP, 3202 &device_path) != 0) { 3203 PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); 3204 return; 3205 } 3206 3207 if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) { 3208 PRN("Error getting LDC ID for vdisk %lu", id); 3209 return; 3210 } 3211 3212 if (vds_init_vd(vds, id, device_path, ldc_id) != 0) { 3213 PRN("Failed to add vdisk ID %lu", id); 3214 return; 3215 } 3216 } 3217 3218 static void 3219 vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) 3220 { 3221 uint64_t id = 0; 3222 3223 3224 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { 3225 PRN("Unable to get \"%s\" property from vdisk's MD node", 3226 VD_ID_PROP); 3227 return; 3228 } 3229 PR0("Removing vdisk ID %lu", id); 3230 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0) 3231 PRN("No vdisk entry found for vdisk ID %lu", id); 3232 } 3233 3234 static void 3235 vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node, 3236 md_t *curr_md, mde_cookie_t curr_vd_node) 3237 { 3238 char *curr_dev, *prev_dev; 3239 uint64_t curr_id = 0, curr_ldc_id = 0; 3240 uint64_t prev_id = 0, prev_ldc_id = 0; 3241 size_t len; 3242 3243 3244 /* Validate that vdisk ID has not changed */ 3245 if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) { 3246 PRN("Error getting previous vdisk \"%s\" property", 3247 VD_ID_PROP); 3248 return; 3249 } 3250 if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) { 3251 PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP); 3252 return; 3253 } 3254 if (curr_id != prev_id) { 3255 PRN("Not changing vdisk: ID changed from %lu to %lu", 3256 prev_id, curr_id); 3257 return; 3258 } 3259 3260 /* Validate that LDC ID has not changed */ 3261 if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) { 3262 PRN("Error getting LDC ID for vdisk %lu", prev_id); 3263 return; 3264 } 3265 3266 if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) { 3267 PRN("Error getting LDC ID for vdisk %lu", curr_id); 3268 return; 3269 } 3270 if (curr_ldc_id != prev_ldc_id) { 3271 _NOTE(NOTREACHED); /* lint is confused */ 3272 PRN("Not changing vdisk: " 3273 "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id); 3274 return; 3275 } 3276 3277 /* Determine whether device path has 
changed */ 3278 if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP, 3279 &prev_dev) != 0) { 3280 PRN("Error getting previous vdisk \"%s\"", 3281 VD_BLOCK_DEVICE_PROP); 3282 return; 3283 } 3284 if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP, 3285 &curr_dev) != 0) { 3286 PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); 3287 return; 3288 } 3289 if (((len = strlen(curr_dev)) == strlen(prev_dev)) && 3290 (strncmp(curr_dev, prev_dev, len) == 0)) 3291 return; /* no relevant (supported) change */ 3292 3293 PR0("Changing vdisk ID %lu", prev_id); 3294 3295 /* Remove old state, which will close vdisk and reset */ 3296 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0) 3297 PRN("No entry found for vdisk ID %lu", prev_id); 3298 3299 /* Re-initialize vdisk with new state */ 3300 if (vds_init_vd(vds, curr_id, curr_dev, curr_ldc_id) != 0) { 3301 PRN("Failed to change vdisk ID %lu", curr_id); 3302 return; 3303 } 3304 } 3305 3306 static int 3307 vds_process_md(void *arg, mdeg_result_t *md) 3308 { 3309 int i; 3310 vds_t *vds = arg; 3311 3312 3313 if (md == NULL) 3314 return (MDEG_FAILURE); 3315 ASSERT(vds != NULL); 3316 3317 for (i = 0; i < md->removed.nelem; i++) 3318 vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]); 3319 for (i = 0; i < md->match_curr.nelem; i++) 3320 vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i], 3321 md->match_curr.mdp, md->match_curr.mdep[i]); 3322 for (i = 0; i < md->added.nelem; i++) 3323 vds_add_vd(vds, md->added.mdp, md->added.mdep[i]); 3324 3325 return (MDEG_SUCCESS); 3326 } 3327 3328 3329 static int 3330 vds_do_attach(dev_info_t *dip) 3331 { 3332 int status, sz; 3333 int cfg_handle; 3334 minor_t instance = ddi_get_instance(dip); 3335 vds_t *vds; 3336 mdeg_prop_spec_t *pspecp; 3337 mdeg_node_spec_t *ispecp; 3338 3339 /* 3340 * The "cfg-handle" property of a vds node in an MD contains the MD's 3341 * notion of "instance", or unique identifier, for that node; OBP 3342 * stores the value of the "cfg-handle" MD property as the value of 3343 * the "reg" property on the node in the device tree it builds from 3344 * the MD and passes to Solaris. Thus, we look up the devinfo node's 3345 * "reg" property value to uniquely identify this device instance when 3346 * registering with the MD event-generation framework. If the "reg" 3347 * property cannot be found, the device tree state is presumably so 3348 * broken that there is no point in continuing. 
3349 */ 3350 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 3351 VD_REG_PROP)) { 3352 PRN("vds \"%s\" property does not exist", VD_REG_PROP); 3353 return (DDI_FAILURE); 3354 } 3355 3356 /* Get the MD instance for later MDEG registration */ 3357 cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 3358 VD_REG_PROP, -1); 3359 3360 if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) { 3361 PRN("Could not allocate state for instance %u", instance); 3362 return (DDI_FAILURE); 3363 } 3364 3365 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { 3366 PRN("Could not get state for instance %u", instance); 3367 ddi_soft_state_free(vds_state, instance); 3368 return (DDI_FAILURE); 3369 } 3370 3371 3372 vds->dip = dip; 3373 vds->vd_table = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS, 3374 vds_destroy_vd, 3375 sizeof (void *)); 3376 ASSERT(vds->vd_table != NULL); 3377 3378 if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) { 3379 PRN("ldi_ident_from_dip() returned errno %d", status); 3380 return (DDI_FAILURE); 3381 } 3382 vds->initialized |= VDS_LDI; 3383 3384 /* Register for MD updates */ 3385 sz = sizeof (vds_prop_template); 3386 pspecp = kmem_alloc(sz, KM_SLEEP); 3387 bcopy(vds_prop_template, pspecp, sz); 3388 3389 VDS_SET_MDEG_PROP_INST(pspecp, cfg_handle); 3390 3391 /* initialize the complete prop spec structure */ 3392 ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP); 3393 ispecp->namep = "virtual-device"; 3394 ispecp->specp = pspecp; 3395 3396 if (mdeg_register(ispecp, &vd_match, vds_process_md, vds, 3397 &vds->mdeg) != MDEG_SUCCESS) { 3398 PRN("Unable to register for MD updates"); 3399 kmem_free(ispecp, sizeof (mdeg_node_spec_t)); 3400 kmem_free(pspecp, sz); 3401 return (DDI_FAILURE); 3402 } 3403 3404 vds->ispecp = ispecp; 3405 vds->initialized |= VDS_MDEG; 3406 3407 /* Prevent auto-detaching so driver is available whenever MD changes */ 3408 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 3409 DDI_PROP_SUCCESS) { 3410 PRN("failed to set \"%s\" property for instance %u", 3411 DDI_NO_AUTODETACH, instance); 3412 } 3413 3414 ddi_report_dev(dip); 3415 return (DDI_SUCCESS); 3416 } 3417 3418 static int 3419 vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3420 { 3421 int status; 3422 3423 switch (cmd) { 3424 case DDI_ATTACH: 3425 PR0("Attaching"); 3426 if ((status = vds_do_attach(dip)) != DDI_SUCCESS) 3427 (void) vds_detach(dip, DDI_DETACH); 3428 return (status); 3429 case DDI_RESUME: 3430 PR0("No action required for DDI_RESUME"); 3431 return (DDI_SUCCESS); 3432 default: 3433 return (DDI_FAILURE); 3434 } 3435 } 3436 3437 static struct dev_ops vds_ops = { 3438 DEVO_REV, /* devo_rev */ 3439 0, /* devo_refcnt */ 3440 ddi_no_info, /* devo_getinfo */ 3441 nulldev, /* devo_identify */ 3442 nulldev, /* devo_probe */ 3443 vds_attach, /* devo_attach */ 3444 vds_detach, /* devo_detach */ 3445 nodev, /* devo_reset */ 3446 NULL, /* devo_cb_ops */ 3447 NULL, /* devo_bus_ops */ 3448 nulldev /* devo_power */ 3449 }; 3450 3451 static struct modldrv modldrv = { 3452 &mod_driverops, 3453 "virtual disk server v%I%", 3454 &vds_ops, 3455 }; 3456 3457 static struct modlinkage modlinkage = { 3458 MODREV_1, 3459 &modldrv, 3460 NULL 3461 }; 3462 3463 3464 int 3465 _init(void) 3466 { 3467 int i, status; 3468 3469 3470 if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0) 3471 return (status); 3472 if ((status = mod_install(&modlinkage)) != 0) { 3473 ddi_soft_state_fini(&vds_state); 3474 return (status); 3475 
} 3476 3477 /* Fill in the bit-mask of server-supported operations */ 3478 for (i = 0; i < vds_noperations; i++) 3479 vds_operations |= 1 << (vds_operation[i].operation - 1); 3480 3481 return (0); 3482 } 3483 3484 int 3485 _info(struct modinfo *modinfop) 3486 { 3487 return (mod_info(&modlinkage, modinfop)); 3488 } 3489 3490 int 3491 _fini(void) 3492 { 3493 int status; 3494 3495 3496 if ((status = mod_remove(&modlinkage)) != 0) 3497 return (status); 3498 ddi_soft_state_fini(&vds_state); 3499 return (0); 3500 } 3501
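/*
 * For reference, a rough sketch of the machine-description subtree this
 * server consumes (an illustration based on the property names used
 * above; the actual MD is supplied by the platform and may carry
 * additional properties):
 *
 *	virtual-device		name = "virtual-disk-server", cfg-handle
 *	    virtual-device-port
 *		id		  vdisk ID (VD_ID_PROP)
 *		vds-block-device  path to the backing device or file
 *		channel-endpoint  (reached via a "fwd" arc)
 *		    id		  LDC channel ID used for the connection
 *
 * vds_add_vd() is invoked for each added "virtual-device-port" node and
 * uses these properties to open the backing store and bring up the LDC
 * channel; if a vdisk node has multiple channel endpoints, only the
 * first one is used.
 */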