1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Virtual disk server 31 */ 32 33 34 #include <sys/types.h> 35 #include <sys/conf.h> 36 #include <sys/crc32.h> 37 #include <sys/ddi.h> 38 #include <sys/dkio.h> 39 #include <sys/file.h> 40 #include <sys/mdeg.h> 41 #include <sys/modhash.h> 42 #include <sys/note.h> 43 #include <sys/pathname.h> 44 #include <sys/sunddi.h> 45 #include <sys/sunldi.h> 46 #include <sys/sysmacros.h> 47 #include <sys/vio_common.h> 48 #include <sys/vdsk_mailbox.h> 49 #include <sys/vdsk_common.h> 50 #include <sys/vtoc.h> 51 52 53 /* Virtual disk server initialization flags */ 54 #define VDS_LDI 0x01 55 #define VDS_MDEG 0x02 56 57 /* Virtual disk server tunable parameters */ 58 #define VDS_LDC_RETRIES 3 59 #define VDS_LDC_DELAY 1000 /* usec */ 60 #define VDS_NCHAINS 32 61 62 /* Identification parameters for MD, synthetic dkio(7i) structures, etc. */ 63 #define VDS_NAME "virtual-disk-server" 64 65 #define VD_NAME "vd" 66 #define VD_VOLUME_NAME "vdisk" 67 #define VD_ASCIILABEL "Virtual Disk" 68 69 #define VD_CHANNEL_ENDPOINT "channel-endpoint" 70 #define VD_ID_PROP "id" 71 #define VD_BLOCK_DEVICE_PROP "vds-block-device" 72 73 /* Virtual disk initialization flags */ 74 #define VD_LOCKING 0x01 75 #define VD_LDC 0x02 76 #define VD_DRING 0x04 77 #define VD_SID 0x08 78 #define VD_SEQ_NUM 0x10 79 80 /* Flags for opening/closing backing devices via LDI */ 81 #define VD_OPEN_FLAGS (FEXCL | FREAD | FWRITE) 82 83 /* 84 * By Solaris convention, slice/partition 2 represents the entire disk; 85 * unfortunately, this convention does not appear to be codified. 86 */ 87 #define VD_ENTIRE_DISK_SLICE 2 88 89 /* Return a cpp token as a string */ 90 #define STRINGIZE(token) #token 91 92 /* 93 * Print a message prefixed with the current function name to the message log 94 * (and optionally to the console for verbose boots); these macros use cpp's 95 * concatenation of string literals and C99 variable-length-argument-list 96 * macros 97 */ 98 #define PRN(...) _PRN("?%s(): "__VA_ARGS__, "") 99 #define _PRN(format, ...) \ 100 cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__) 101 102 /* Return a pointer to the "i"th vdisk dring element */ 103 #define VD_DRING_ELEM(i) ((vd_dring_entry_t *)(void *) \ 104 (vd->dring + (i)*vd->descriptor_size)) 105 106 /* Return the virtual disk client's type as a string (for use in messages) */ 107 #define VD_CLIENT(vd) \ 108 (((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" : \ 109 (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" : \ 110 (((vd)->xfer_mode == 0) ? "null client" : \ 111 "unsupported client"))) 112 113 /* Debugging macros */ 114 #ifdef DEBUG 115 116 static int vd_msglevel = 0; 117 118 119 #define PR0 if (vd_msglevel > 0) PRN 120 #define PR1 if (vd_msglevel > 1) PRN 121 #define PR2 if (vd_msglevel > 2) PRN 122 123 #define VD_DUMP_DRING_ELEM(elem) \ 124 PRN("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n", \ 125 elem->hdr.dstate, \ 126 elem->payload.operation, \ 127 elem->payload.status, \ 128 elem->payload.nbytes, \ 129 elem->payload.addr, \ 130 elem->payload.ncookies); 131 132 char * 133 vd_decode_state(int state) 134 { 135 char *str; 136 137 #define CASE_STATE(_s) case _s: str = #_s; break; 138 139 switch (state) { 140 CASE_STATE(VD_STATE_INIT) 141 CASE_STATE(VD_STATE_VER) 142 CASE_STATE(VD_STATE_ATTR) 143 CASE_STATE(VD_STATE_DRING) 144 CASE_STATE(VD_STATE_RDX) 145 CASE_STATE(VD_STATE_DATA) 146 default: str = "unknown"; break; 147 } 148 149 #undef CASE_STATE 150 151 return (str); 152 } 153 154 void 155 vd_decode_tag(vio_msg_t *msg) 156 { 157 char *tstr, *sstr, *estr; 158 159 #define CASE_TYPE(_s) case _s: tstr = #_s; break; 160 161 switch (msg->tag.vio_msgtype) { 162 CASE_TYPE(VIO_TYPE_CTRL) 163 CASE_TYPE(VIO_TYPE_DATA) 164 CASE_TYPE(VIO_TYPE_ERR) 165 default: tstr = "unknown"; break; 166 } 167 168 #undef CASE_TYPE 169 170 #define CASE_SUBTYPE(_s) case _s: sstr = #_s; break; 171 172 switch (msg->tag.vio_subtype) { 173 CASE_SUBTYPE(VIO_SUBTYPE_INFO) 174 CASE_SUBTYPE(VIO_SUBTYPE_ACK) 175 CASE_SUBTYPE(VIO_SUBTYPE_NACK) 176 default: sstr = "unknown"; break; 177 } 178 179 #undef CASE_SUBTYPE 180 181 #define CASE_ENV(_s) case _s: estr = #_s; break; 182 183 switch (msg->tag.vio_subtype_env) { 184 CASE_ENV(VIO_VER_INFO) 185 CASE_ENV(VIO_ATTR_INFO) 186 CASE_ENV(VIO_DRING_REG) 187 CASE_ENV(VIO_DRING_UNREG) 188 CASE_ENV(VIO_RDX) 189 CASE_ENV(VIO_PKT_DATA) 190 CASE_ENV(VIO_DESC_DATA) 191 CASE_ENV(VIO_DRING_DATA) 192 default: estr = "unknown"; break; 193 } 194 195 #undef CASE_ENV 196 197 PR1("(%x/%x/%x) message : (%s/%s/%s)", 198 msg->tag.vio_msgtype, msg->tag.vio_subtype, 199 msg->tag.vio_subtype_env, tstr, sstr, estr); 200 } 201 202 #else /* !DEBUG */ 203 204 #define PR0(...) 205 #define PR1(...) 206 #define PR2(...) 207 208 #define VD_DUMP_DRING_ELEM(elem) 209 210 #define vd_decode_state(_s) (NULL) 211 #define vd_decode_tag(_s) (NULL) 212 213 #endif /* DEBUG */ 214 215 216 /* 217 * Soft state structure for a vds instance 218 */ 219 typedef struct vds { 220 uint_t initialized; /* driver inst initialization flags */ 221 dev_info_t *dip; /* driver inst devinfo pointer */ 222 ldi_ident_t ldi_ident; /* driver's identifier for LDI */ 223 mod_hash_t *vd_table; /* table of virtual disks served */ 224 mdeg_handle_t mdeg; /* handle for MDEG operations */ 225 } vds_t; 226 227 /* 228 * Types of descriptor-processing tasks 229 */ 230 typedef enum vd_task_type { 231 VD_NONFINAL_RANGE_TASK, /* task for intermediate descriptor in range */ 232 VD_FINAL_RANGE_TASK, /* task for last in a range of descriptors */ 233 } vd_task_type_t; 234 235 /* 236 * Structure describing the task for processing a descriptor 237 */ 238 typedef struct vd_task { 239 struct vd *vd; /* vd instance task is for */ 240 vd_task_type_t type; /* type of descriptor task */ 241 int index; /* dring elem index for task */ 242 vio_msg_t *msg; /* VIO message task is for */ 243 size_t msglen; /* length of message content */ 244 vd_dring_payload_t *request; /* request task will perform */ 245 struct buf buf; /* buf(9s) for I/O request */ 246 ldc_mem_handle_t mhdl; /* task memory handle */ 247 } vd_task_t; 248 249 /* 250 * Soft state structure for a virtual disk instance 251 */ 252 typedef struct vd { 253 uint_t initialized; /* vdisk initialization flags */ 254 vds_t *vds; /* server for this vdisk */ 255 ddi_taskq_t *startq; /* queue for I/O start tasks */ 256 ddi_taskq_t *completionq; /* queue for completion tasks */ 257 ldi_handle_t ldi_handle[V_NUMPAR]; /* LDI slice handles */ 258 dev_t dev[V_NUMPAR]; /* dev numbers for slices */ 259 uint_t nslices; /* number of slices */ 260 size_t vdisk_size; /* number of blocks in vdisk */ 261 vd_disk_type_t vdisk_type; /* slice or entire disk */ 262 vd_disk_label_t vdisk_label; /* EFI or VTOC label */ 263 ushort_t max_xfer_sz; /* max xfer size in DEV_BSIZE */ 264 boolean_t pseudo; /* underlying pseudo dev */ 265 struct dk_efi dk_efi; /* synthetic for slice type */ 266 struct dk_geom dk_geom; /* synthetic for slice type */ 267 struct vtoc vtoc; /* synthetic for slice type */ 268 ldc_status_t ldc_state; /* LDC connection state */ 269 ldc_handle_t ldc_handle; /* handle for LDC comm */ 270 size_t max_msglen; /* largest LDC message len */ 271 vd_state_t state; /* client handshake state */ 272 uint8_t xfer_mode; /* transfer mode with client */ 273 uint32_t sid; /* client's session ID */ 274 uint64_t seq_num; /* message sequence number */ 275 uint64_t dring_ident; /* identifier of dring */ 276 ldc_dring_handle_t dring_handle; /* handle for dring ops */ 277 uint32_t descriptor_size; /* num bytes in desc */ 278 uint32_t dring_len; /* number of dring elements */ 279 caddr_t dring; /* address of dring */ 280 caddr_t vio_msgp; /* vio msg staging buffer */ 281 vd_task_t inband_task; /* task for inband descriptor */ 282 vd_task_t *dring_task; /* tasks dring elements */ 283 284 kmutex_t lock; /* protects variables below */ 285 boolean_t enabled; /* is vdisk enabled? */ 286 boolean_t reset_state; /* reset connection state? */ 287 boolean_t reset_ldc; /* reset LDC channel? */ 288 } vd_t; 289 290 typedef struct vds_operation { 291 char *namep; 292 uint8_t operation; 293 int (*start)(vd_task_t *task); 294 void (*complete)(void *arg); 295 } vds_operation_t; 296 297 typedef struct vd_ioctl { 298 uint8_t operation; /* vdisk operation */ 299 const char *operation_name; /* vdisk operation name */ 300 size_t nbytes; /* size of operation buffer */ 301 int cmd; /* corresponding ioctl cmd */ 302 const char *cmd_name; /* ioctl cmd name */ 303 void *arg; /* ioctl cmd argument */ 304 /* convert input vd_buf to output ioctl_arg */ 305 void (*copyin)(void *vd_buf, void *ioctl_arg); 306 /* convert input ioctl_arg to output vd_buf */ 307 void (*copyout)(void *ioctl_arg, void *vd_buf); 308 } vd_ioctl_t; 309 310 /* Define trivial copyin/copyout conversion function flag */ 311 #define VD_IDENTITY ((void (*)(void *, void *))-1) 312 313 314 static int vds_ldc_retries = VDS_LDC_RETRIES; 315 static int vds_ldc_delay = VDS_LDC_DELAY; 316 static void *vds_state; 317 static uint64_t vds_operations; /* see vds_operation[] definition below */ 318 319 static int vd_open_flags = VD_OPEN_FLAGS; 320 321 /* 322 * Supported protocol version pairs, from highest (newest) to lowest (oldest) 323 * 324 * Each supported major version should appear only once, paired with (and only 325 * with) its highest supported minor version number (as the protocol requires 326 * supporting all lower minor version numbers as well) 327 */ 328 static const vio_ver_t vds_version[] = {{1, 0}}; 329 static const size_t vds_num_versions = 330 sizeof (vds_version)/sizeof (vds_version[0]); 331 332 static void vd_free_dring_task(vd_t *vdp); 333 334 static int 335 vd_start_bio(vd_task_t *task) 336 { 337 int rv, status = 0; 338 vd_t *vd = task->vd; 339 vd_dring_payload_t *request = task->request; 340 struct buf *buf = &task->buf; 341 uint8_t mtype; 342 343 344 ASSERT(vd != NULL); 345 ASSERT(request != NULL); 346 ASSERT(request->slice < vd->nslices); 347 ASSERT((request->operation == VD_OP_BREAD) || 348 (request->operation == VD_OP_BWRITE)); 349 350 if (request->nbytes == 0) 351 return (EINVAL); /* no service for trivial requests */ 352 353 PR1("%s %lu bytes at block %lu", 354 (request->operation == VD_OP_BREAD) ? "Read" : "Write", 355 request->nbytes, request->addr); 356 357 bioinit(buf); 358 buf->b_flags = B_BUSY; 359 buf->b_bcount = request->nbytes; 360 buf->b_lblkno = request->addr; 361 buf->b_edev = vd->dev[request->slice]; 362 363 mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP; 364 365 /* Map memory exported by client */ 366 status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies, 367 mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R, 368 &(buf->b_un.b_addr), NULL); 369 if (status != 0) { 370 PR0("ldc_mem_map() returned err %d ", status); 371 biofini(buf); 372 return (status); 373 } 374 375 status = ldc_mem_acquire(task->mhdl, 0, buf->b_bcount); 376 if (status != 0) { 377 (void) ldc_mem_unmap(task->mhdl); 378 PR0("ldc_mem_acquire() returned err %d ", status); 379 biofini(buf); 380 return (status); 381 } 382 383 buf->b_flags |= (request->operation == VD_OP_BREAD) ? B_READ : B_WRITE; 384 385 /* Start the block I/O */ 386 if ((status = ldi_strategy(vd->ldi_handle[request->slice], buf)) == 0) 387 return (EINPROGRESS); /* will complete on completionq */ 388 389 /* Clean up after error */ 390 rv = ldc_mem_release(task->mhdl, 0, buf->b_bcount); 391 if (rv) { 392 PR0("ldc_mem_release() returned err %d ", rv); 393 } 394 rv = ldc_mem_unmap(task->mhdl); 395 if (rv) { 396 PR0("ldc_mem_unmap() returned err %d ", status); 397 } 398 399 biofini(buf); 400 return (status); 401 } 402 403 static int 404 send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen) 405 { 406 int status; 407 size_t nbytes; 408 409 do { 410 nbytes = msglen; 411 status = ldc_write(ldc_handle, msg, &nbytes); 412 if (status != EWOULDBLOCK) 413 break; 414 drv_usecwait(vds_ldc_delay); 415 } while (status == EWOULDBLOCK); 416 417 if (status != 0) { 418 if (status != ECONNRESET) 419 PR0("ldc_write() returned errno %d", status); 420 return (status); 421 } else if (nbytes != msglen) { 422 PR0("ldc_write() performed only partial write"); 423 return (EIO); 424 } 425 426 PR1("SENT %lu bytes", msglen); 427 return (0); 428 } 429 430 static void 431 vd_need_reset(vd_t *vd, boolean_t reset_ldc) 432 { 433 mutex_enter(&vd->lock); 434 vd->reset_state = B_TRUE; 435 vd->reset_ldc = reset_ldc; 436 mutex_exit(&vd->lock); 437 } 438 439 /* 440 * Reset the state of the connection with a client, if needed; reset the LDC 441 * transport as well, if needed. This function should only be called from the 442 * "vd_recv_msg", as it waits for tasks - otherwise a deadlock can occur. 443 */ 444 static void 445 vd_reset_if_needed(vd_t *vd) 446 { 447 int status = 0; 448 449 mutex_enter(&vd->lock); 450 if (!vd->reset_state) { 451 ASSERT(!vd->reset_ldc); 452 mutex_exit(&vd->lock); 453 return; 454 } 455 mutex_exit(&vd->lock); 456 457 PR0("Resetting connection state with %s", VD_CLIENT(vd)); 458 459 /* 460 * Let any asynchronous I/O complete before possibly pulling the rug 461 * out from under it; defer checking vd->reset_ldc, as one of the 462 * asynchronous tasks might set it 463 */ 464 ddi_taskq_wait(vd->completionq); 465 466 if ((vd->initialized & VD_DRING) && 467 ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)) 468 PR0("ldc_mem_dring_unmap() returned errno %d", status); 469 470 vd_free_dring_task(vd); 471 472 /* Free the staging buffer for msgs */ 473 if (vd->vio_msgp != NULL) { 474 kmem_free(vd->vio_msgp, vd->max_msglen); 475 vd->vio_msgp = NULL; 476 } 477 478 /* Free the inband message buffer */ 479 if (vd->inband_task.msg != NULL) { 480 kmem_free(vd->inband_task.msg, vd->max_msglen); 481 vd->inband_task.msg = NULL; 482 } 483 484 mutex_enter(&vd->lock); 485 486 if (vd->reset_ldc) 487 PR0("taking down LDC channel"); 488 if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0)) 489 PR0("ldc_down() returned errno %d", status); 490 491 vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING); 492 vd->state = VD_STATE_INIT; 493 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 494 495 /* Allocate the staging buffer */ 496 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 497 498 status = ldc_status(vd->ldc_handle, &vd->ldc_state); 499 if (vd->reset_ldc && vd->ldc_state != LDC_UP) { 500 PR0("calling ldc_up\n"); 501 (void) ldc_up(vd->ldc_handle); 502 } 503 504 vd->reset_state = B_FALSE; 505 vd->reset_ldc = B_FALSE; 506 507 mutex_exit(&vd->lock); 508 } 509 510 static void vd_recv_msg(void *arg); 511 512 static void 513 vd_mark_in_reset(vd_t *vd) 514 { 515 int status; 516 517 PR0("vd_mark_in_reset: marking vd in reset\n"); 518 519 vd_need_reset(vd, B_FALSE); 520 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP); 521 if (status == DDI_FAILURE) { 522 PR0("cannot schedule task to recv msg\n"); 523 vd_need_reset(vd, B_TRUE); 524 return; 525 } 526 } 527 528 static int 529 vd_mark_elem_done(vd_t *vd, int idx, int elem_status) 530 { 531 boolean_t accepted; 532 int status; 533 vd_dring_entry_t *elem = VD_DRING_ELEM(idx); 534 535 if (vd->reset_state) 536 return (0); 537 538 /* Acquire the element */ 539 if (!vd->reset_state && 540 (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { 541 if (status == ECONNRESET) { 542 vd_mark_in_reset(vd); 543 return (0); 544 } else { 545 PR0("ldc_mem_dring_acquire() returned errno %d", 546 status); 547 return (status); 548 } 549 } 550 551 /* Set the element's status and mark it done */ 552 accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED); 553 if (accepted) { 554 elem->payload.status = elem_status; 555 elem->hdr.dstate = VIO_DESC_DONE; 556 } else { 557 /* Perhaps client timed out waiting for I/O... */ 558 PR0("element %u no longer \"accepted\"", idx); 559 VD_DUMP_DRING_ELEM(elem); 560 } 561 /* Release the element */ 562 if (!vd->reset_state && 563 (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 564 if (status == ECONNRESET) { 565 vd_mark_in_reset(vd); 566 return (0); 567 } else { 568 PR0("ldc_mem_dring_release() returned errno %d", 569 status); 570 return (status); 571 } 572 } 573 574 return (accepted ? 0 : EINVAL); 575 } 576 577 static void 578 vd_complete_bio(void *arg) 579 { 580 int status = 0; 581 vd_task_t *task = (vd_task_t *)arg; 582 vd_t *vd = task->vd; 583 vd_dring_payload_t *request = task->request; 584 struct buf *buf = &task->buf; 585 586 587 ASSERT(vd != NULL); 588 ASSERT(request != NULL); 589 ASSERT(task->msg != NULL); 590 ASSERT(task->msglen >= sizeof (*task->msg)); 591 592 /* Wait for the I/O to complete */ 593 request->status = biowait(buf); 594 595 /* Release the buffer */ 596 if (!vd->reset_state) 597 status = ldc_mem_release(task->mhdl, 0, buf->b_bcount); 598 if (status) { 599 PR0("ldc_mem_release() returned errno %d copying to " 600 "client", status); 601 if (status == ECONNRESET) { 602 vd_mark_in_reset(vd); 603 } 604 } 605 606 /* Unmap the memory, even if in reset */ 607 status = ldc_mem_unmap(task->mhdl); 608 if (status) { 609 PR0("ldc_mem_unmap() returned errno %d copying to client", 610 status); 611 if (status == ECONNRESET) { 612 vd_mark_in_reset(vd); 613 } 614 } 615 616 biofini(buf); 617 618 /* Update the dring element for a dring client */ 619 if (!vd->reset_state && (status == 0) && 620 (vd->xfer_mode == VIO_DRING_MODE)) { 621 status = vd_mark_elem_done(vd, task->index, request->status); 622 if (status == ECONNRESET) 623 vd_mark_in_reset(vd); 624 } 625 626 /* 627 * If a transport error occurred, arrange to "nack" the message when 628 * the final task in the descriptor element range completes 629 */ 630 if (status != 0) 631 task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 632 633 /* 634 * Only the final task for a range of elements will respond to and 635 * free the message 636 */ 637 if (task->type == VD_NONFINAL_RANGE_TASK) { 638 return; 639 } 640 641 /* 642 * Send the "ack" or "nack" back to the client; if sending the message 643 * via LDC fails, arrange to reset both the connection state and LDC 644 * itself 645 */ 646 PR1("Sending %s", 647 (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK"); 648 if (!vd->reset_state) { 649 status = send_msg(vd->ldc_handle, task->msg, task->msglen); 650 switch (status) { 651 case 0: 652 break; 653 case ECONNRESET: 654 vd_mark_in_reset(vd); 655 break; 656 default: 657 PR0("initiating full reset"); 658 vd_need_reset(vd, B_TRUE); 659 break; 660 } 661 } 662 } 663 664 static void 665 vd_geom2dk_geom(void *vd_buf, void *ioctl_arg) 666 { 667 VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg); 668 } 669 670 static void 671 vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg) 672 { 673 VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg); 674 } 675 676 static void 677 dk_geom2vd_geom(void *ioctl_arg, void *vd_buf) 678 { 679 DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf); 680 } 681 682 static void 683 vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf) 684 { 685 VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf); 686 } 687 688 static void 689 vd_get_efi_in(void *vd_buf, void *ioctl_arg) 690 { 691 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 692 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 693 694 dk_efi->dki_lba = vd_efi->lba; 695 dk_efi->dki_length = vd_efi->length; 696 dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP); 697 } 698 699 static void 700 vd_get_efi_out(void *ioctl_arg, void *vd_buf) 701 { 702 int len; 703 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 704 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 705 706 len = vd_efi->length; 707 DK_EFI2VD_EFI(dk_efi, vd_efi); 708 kmem_free(dk_efi->dki_data, len); 709 } 710 711 static void 712 vd_set_efi_in(void *vd_buf, void *ioctl_arg) 713 { 714 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 715 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 716 717 dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP); 718 VD_EFI2DK_EFI(vd_efi, dk_efi); 719 } 720 721 static void 722 vd_set_efi_out(void *ioctl_arg, void *vd_buf) 723 { 724 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 725 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 726 727 kmem_free(dk_efi->dki_data, vd_efi->length); 728 } 729 730 static int 731 vd_read_vtoc(ldi_handle_t handle, struct vtoc *vtoc, vd_disk_label_t *label) 732 { 733 int status, rval; 734 struct dk_gpt *efi; 735 size_t efi_len; 736 737 *label = VD_DISK_LABEL_UNK; 738 739 status = ldi_ioctl(handle, DKIOCGVTOC, (intptr_t)vtoc, 740 (vd_open_flags | FKIOCTL), kcred, &rval); 741 742 if (status == 0) { 743 *label = VD_DISK_LABEL_VTOC; 744 return (0); 745 } else if (status != ENOTSUP) { 746 PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status); 747 return (status); 748 } 749 750 status = vds_efi_alloc_and_read(handle, &efi, &efi_len); 751 752 if (status) { 753 PR0("vds_efi_alloc_and_read returned error %d", status); 754 return (status); 755 } 756 757 *label = VD_DISK_LABEL_EFI; 758 vd_efi_to_vtoc(efi, vtoc); 759 vd_efi_free(efi, efi_len); 760 761 return (0); 762 } 763 764 static int 765 vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg) 766 { 767 dk_efi_t *dk_ioc; 768 769 switch (vd->vdisk_label) { 770 771 case VD_DISK_LABEL_VTOC: 772 773 switch (cmd) { 774 case DKIOCGGEOM: 775 ASSERT(ioctl_arg != NULL); 776 bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom)); 777 return (0); 778 case DKIOCGVTOC: 779 ASSERT(ioctl_arg != NULL); 780 bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc)); 781 return (0); 782 default: 783 return (ENOTSUP); 784 } 785 786 case VD_DISK_LABEL_EFI: 787 788 switch (cmd) { 789 case DKIOCGETEFI: 790 ASSERT(ioctl_arg != NULL); 791 dk_ioc = (dk_efi_t *)ioctl_arg; 792 if (dk_ioc->dki_length < vd->dk_efi.dki_length) 793 return (EINVAL); 794 bcopy(vd->dk_efi.dki_data, dk_ioc->dki_data, 795 vd->dk_efi.dki_length); 796 return (0); 797 default: 798 return (ENOTSUP); 799 } 800 801 default: 802 return (ENOTSUP); 803 } 804 } 805 806 static int 807 vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl) 808 { 809 int rval = 0, status; 810 size_t nbytes = request->nbytes; /* modifiable copy */ 811 812 813 ASSERT(request->slice < vd->nslices); 814 PR0("Performing %s", ioctl->operation_name); 815 816 /* Get data from client and convert, if necessary */ 817 if (ioctl->copyin != NULL) { 818 ASSERT(nbytes != 0 && buf != NULL); 819 PR1("Getting \"arg\" data from client"); 820 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 821 request->cookie, request->ncookies, 822 LDC_COPY_IN)) != 0) { 823 PR0("ldc_mem_copy() returned errno %d " 824 "copying from client", status); 825 return (status); 826 } 827 828 /* Convert client's data, if necessary */ 829 if (ioctl->copyin == VD_IDENTITY) /* use client buffer */ 830 ioctl->arg = buf; 831 else /* convert client vdisk operation data to ioctl data */ 832 (ioctl->copyin)(buf, (void *)ioctl->arg); 833 } 834 835 /* 836 * Handle single-slice block devices internally; otherwise, have the 837 * real driver perform the ioctl() 838 */ 839 if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) { 840 if ((status = vd_do_slice_ioctl(vd, ioctl->cmd, 841 (void *)ioctl->arg)) != 0) 842 return (status); 843 } else if ((status = ldi_ioctl(vd->ldi_handle[request->slice], 844 ioctl->cmd, (intptr_t)ioctl->arg, (vd_open_flags | FKIOCTL), 845 kcred, &rval)) != 0) { 846 PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status); 847 return (status); 848 } 849 #ifdef DEBUG 850 if (rval != 0) { 851 PR0("%s set rval = %d, which is not being returned to client", 852 ioctl->cmd_name, rval); 853 } 854 #endif /* DEBUG */ 855 856 /* Convert data and send to client, if necessary */ 857 if (ioctl->copyout != NULL) { 858 ASSERT(nbytes != 0 && buf != NULL); 859 PR1("Sending \"arg\" data to client"); 860 861 /* Convert ioctl data to vdisk operation data, if necessary */ 862 if (ioctl->copyout != VD_IDENTITY) 863 (ioctl->copyout)((void *)ioctl->arg, buf); 864 865 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 866 request->cookie, request->ncookies, 867 LDC_COPY_OUT)) != 0) { 868 PR0("ldc_mem_copy() returned errno %d " 869 "copying to client", status); 870 return (status); 871 } 872 } 873 874 return (status); 875 } 876 877 /* 878 * Open any slices which have become non-empty as a result of performing a 879 * set-VTOC operation for the client. 880 * 881 * When serving a full disk, vds attempts to exclusively open all of the 882 * disk's slices to prevent another thread or process in the service domain 883 * from "stealing" a slice or from performing I/O to a slice while a vds 884 * client is accessing it. Unfortunately, underlying drivers, such as sd(7d) 885 * and cmdk(7d), return an error when attempting to open the device file for a 886 * slice which is currently empty according to the VTOC. This driver behavior 887 * means that vds must skip opening empty slices when initializing a vdisk for 888 * full-disk service and try to open slices that become non-empty (via a 889 * set-VTOC operation) during use of the full disk in order to begin serving 890 * such slices to the client. This approach has an inherent (and therefore 891 * unavoidable) race condition; it also means that failure to open a 892 * newly-non-empty slice has different semantics than failure to open an 893 * initially-non-empty slice: Due to driver bahavior, opening a 894 * newly-non-empty slice is a necessary side effect of vds performing a 895 * (successful) set-VTOC operation for a client on an in-service (and in-use) 896 * disk in order to begin serving the slice; failure of this side-effect 897 * operation does not mean that the client's set-VTOC operation failed or that 898 * operations on other slices must fail. Therefore, this function prints an 899 * error message on failure to open a slice, but does not return an error to 900 * its caller--unlike failure to open a slice initially, which results in an 901 * error that prevents serving the vdisk (and thereby requires an 902 * administrator to resolve the problem). Note that, apart from another 903 * thread or process opening a new slice during the race-condition window, 904 * failure to open a slice in this function will likely indicate an underlying 905 * drive problem, which will also likely become evident in errors returned by 906 * operations on other slices, and which will require administrative 907 * intervention and possibly servicing the drive. 908 */ 909 static void 910 vd_open_new_slices(vd_t *vd) 911 { 912 int status; 913 struct vtoc vtoc; 914 915 /* Get the (new) partitions for updated slice sizes */ 916 if ((status = vd_read_vtoc(vd->ldi_handle[0], &vtoc, 917 &vd->vdisk_label)) != 0) { 918 PR0("vd_read_vtoc returned error %d", status); 919 return; 920 } 921 922 /* Open any newly-non-empty slices */ 923 for (int slice = 0; slice < vd->nslices; slice++) { 924 /* Skip zero-length slices */ 925 if (vtoc.v_part[slice].p_size == 0) { 926 if (vd->ldi_handle[slice] != NULL) 927 PR0("Open slice %u now has zero length", slice); 928 continue; 929 } 930 931 /* Skip already-open slices */ 932 if (vd->ldi_handle[slice] != NULL) 933 continue; 934 935 PR0("Opening newly-non-empty slice %u", slice); 936 if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 937 vd_open_flags, kcred, &vd->ldi_handle[slice], 938 vd->vds->ldi_ident)) != 0) { 939 PR0("ldi_open_by_dev() returned errno %d " 940 "for slice %u", status, slice); 941 } 942 } 943 } 944 945 #define RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t)) 946 static int 947 vd_ioctl(vd_task_t *task) 948 { 949 int i, status; 950 void *buf = NULL; 951 struct dk_geom dk_geom = {0}; 952 struct vtoc vtoc = {0}; 953 struct dk_efi dk_efi = {0}; 954 vd_t *vd = task->vd; 955 vd_dring_payload_t *request = task->request; 956 vd_ioctl_t ioctl[] = { 957 /* Command (no-copy) operations */ 958 {VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0, 959 DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE), 960 NULL, NULL, NULL}, 961 962 /* "Get" (copy-out) operations */ 963 {VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int), 964 DKIOCGETWCE, STRINGIZE(DKIOCGETWCE), 965 NULL, VD_IDENTITY, VD_IDENTITY}, 966 {VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM), 967 RNDSIZE(vd_geom_t), 968 DKIOCGGEOM, STRINGIZE(DKIOCGGEOM), 969 &dk_geom, NULL, dk_geom2vd_geom}, 970 {VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t), 971 DKIOCGVTOC, STRINGIZE(DKIOCGVTOC), 972 &vtoc, NULL, vtoc2vd_vtoc}, 973 {VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t), 974 DKIOCGETEFI, STRINGIZE(DKIOCGETEFI), 975 &dk_efi, vd_get_efi_in, vd_get_efi_out}, 976 977 /* "Set" (copy-in) operations */ 978 {VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int), 979 DKIOCSETWCE, STRINGIZE(DKIOCSETWCE), 980 NULL, VD_IDENTITY, VD_IDENTITY}, 981 {VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM), 982 RNDSIZE(vd_geom_t), 983 DKIOCSGEOM, STRINGIZE(DKIOCSGEOM), 984 &dk_geom, vd_geom2dk_geom, NULL}, 985 {VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t), 986 DKIOCSVTOC, STRINGIZE(DKIOCSVTOC), 987 &vtoc, vd_vtoc2vtoc, NULL}, 988 {VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t), 989 DKIOCSETEFI, STRINGIZE(DKIOCSETEFI), 990 &dk_efi, vd_set_efi_in, vd_set_efi_out}, 991 }; 992 size_t nioctls = (sizeof (ioctl))/(sizeof (ioctl[0])); 993 994 995 ASSERT(vd != NULL); 996 ASSERT(request != NULL); 997 ASSERT(request->slice < vd->nslices); 998 999 /* 1000 * Determine ioctl corresponding to caller's "operation" and 1001 * validate caller's "nbytes" 1002 */ 1003 for (i = 0; i < nioctls; i++) { 1004 if (request->operation == ioctl[i].operation) { 1005 /* LDC memory operations require 8-byte multiples */ 1006 ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0); 1007 1008 if (request->operation == VD_OP_GET_EFI || 1009 request->operation == VD_OP_SET_EFI) { 1010 if (request->nbytes >= ioctl[i].nbytes) 1011 break; 1012 PR0("%s: Expected at least nbytes = %lu, " 1013 "got %lu", ioctl[i].operation_name, 1014 ioctl[i].nbytes, request->nbytes); 1015 return (EINVAL); 1016 } 1017 1018 if (request->nbytes != ioctl[i].nbytes) { 1019 PR0("%s: Expected nbytes = %lu, got %lu", 1020 ioctl[i].operation_name, ioctl[i].nbytes, 1021 request->nbytes); 1022 return (EINVAL); 1023 } 1024 1025 break; 1026 } 1027 } 1028 ASSERT(i < nioctls); /* because "operation" already validated */ 1029 1030 if (request->nbytes) 1031 buf = kmem_zalloc(request->nbytes, KM_SLEEP); 1032 status = vd_do_ioctl(vd, request, buf, &ioctl[i]); 1033 if (request->nbytes) 1034 kmem_free(buf, request->nbytes); 1035 if (vd->vdisk_type == VD_DISK_TYPE_DISK && 1036 (request->operation == VD_OP_SET_VTOC || 1037 request->operation == VD_OP_SET_EFI)) 1038 vd_open_new_slices(vd); 1039 PR0("Returning %d", status); 1040 return (status); 1041 } 1042 1043 static int 1044 vd_get_devid(vd_task_t *task) 1045 { 1046 vd_t *vd = task->vd; 1047 vd_dring_payload_t *request = task->request; 1048 vd_devid_t *vd_devid; 1049 impl_devid_t *devid; 1050 int status, bufid_len, devid_len, len; 1051 int bufbytes; 1052 1053 PR1("Get Device ID, nbytes=%ld", request->nbytes); 1054 1055 if (ddi_lyr_get_devid(vd->dev[request->slice], 1056 (ddi_devid_t *)&devid) != DDI_SUCCESS) { 1057 /* the most common failure is that no devid is available */ 1058 PR2("No Device ID"); 1059 return (ENOENT); 1060 } 1061 1062 bufid_len = request->nbytes - sizeof (vd_devid_t) + 1; 1063 devid_len = DEVID_GETLEN(devid); 1064 1065 /* 1066 * Save the buffer size here for use in deallocation. 1067 * The actual number of bytes copied is returned in 1068 * the 'nbytes' field of the request structure. 1069 */ 1070 bufbytes = request->nbytes; 1071 1072 vd_devid = kmem_zalloc(bufbytes, KM_SLEEP); 1073 vd_devid->length = devid_len; 1074 vd_devid->type = DEVID_GETTYPE(devid); 1075 1076 len = (devid_len > bufid_len)? bufid_len : devid_len; 1077 1078 bcopy(devid->did_id, vd_devid->id, len); 1079 1080 /* LDC memory operations require 8-byte multiples */ 1081 ASSERT(request->nbytes % sizeof (uint64_t) == 0); 1082 1083 if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0, 1084 &request->nbytes, request->cookie, request->ncookies, 1085 LDC_COPY_OUT)) != 0) { 1086 PR0("ldc_mem_copy() returned errno %d copying to client", 1087 status); 1088 } 1089 PR1("post mem_copy: nbytes=%ld", request->nbytes); 1090 1091 kmem_free(vd_devid, bufbytes); 1092 ddi_devid_free((ddi_devid_t)devid); 1093 1094 return (status); 1095 } 1096 1097 /* 1098 * Define the supported operations once the functions for performing them have 1099 * been defined 1100 */ 1101 static const vds_operation_t vds_operation[] = { 1102 #define X(_s) #_s, _s 1103 {X(VD_OP_BREAD), vd_start_bio, vd_complete_bio}, 1104 {X(VD_OP_BWRITE), vd_start_bio, vd_complete_bio}, 1105 {X(VD_OP_FLUSH), vd_ioctl, NULL}, 1106 {X(VD_OP_GET_WCE), vd_ioctl, NULL}, 1107 {X(VD_OP_SET_WCE), vd_ioctl, NULL}, 1108 {X(VD_OP_GET_VTOC), vd_ioctl, NULL}, 1109 {X(VD_OP_SET_VTOC), vd_ioctl, NULL}, 1110 {X(VD_OP_GET_DISKGEOM), vd_ioctl, NULL}, 1111 {X(VD_OP_SET_DISKGEOM), vd_ioctl, NULL}, 1112 {X(VD_OP_GET_EFI), vd_ioctl, NULL}, 1113 {X(VD_OP_SET_EFI), vd_ioctl, NULL}, 1114 {X(VD_OP_GET_DEVID), vd_get_devid, NULL}, 1115 #undef X 1116 }; 1117 1118 static const size_t vds_noperations = 1119 (sizeof (vds_operation))/(sizeof (vds_operation[0])); 1120 1121 /* 1122 * Process a task specifying a client I/O request 1123 */ 1124 static int 1125 vd_process_task(vd_task_t *task) 1126 { 1127 int i, status; 1128 vd_t *vd = task->vd; 1129 vd_dring_payload_t *request = task->request; 1130 1131 1132 ASSERT(vd != NULL); 1133 ASSERT(request != NULL); 1134 1135 /* Find the requested operation */ 1136 for (i = 0; i < vds_noperations; i++) 1137 if (request->operation == vds_operation[i].operation) 1138 break; 1139 if (i == vds_noperations) { 1140 PR0("Unsupported operation %u", request->operation); 1141 return (ENOTSUP); 1142 } 1143 1144 /* Handle client using absolute disk offsets */ 1145 if ((vd->vdisk_type == VD_DISK_TYPE_DISK) && 1146 (request->slice == UINT8_MAX)) 1147 request->slice = VD_ENTIRE_DISK_SLICE; 1148 1149 /* Range-check slice */ 1150 if (request->slice >= vd->nslices) { 1151 PR0("Invalid \"slice\" %u (max %u) for virtual disk", 1152 request->slice, (vd->nslices - 1)); 1153 return (EINVAL); 1154 } 1155 1156 PR1("operation : %s", vds_operation[i].namep); 1157 1158 /* Start the operation */ 1159 if ((status = vds_operation[i].start(task)) != EINPROGRESS) { 1160 PR0("operation : %s returned status %d", 1161 vds_operation[i].namep, status); 1162 request->status = status; /* op succeeded or failed */ 1163 return (0); /* but request completed */ 1164 } 1165 1166 ASSERT(vds_operation[i].complete != NULL); /* debug case */ 1167 if (vds_operation[i].complete == NULL) { /* non-debug case */ 1168 PR0("Unexpected return of EINPROGRESS " 1169 "with no I/O completion handler"); 1170 request->status = EIO; /* operation failed */ 1171 return (0); /* but request completed */ 1172 } 1173 1174 PR1("operation : kick off taskq entry for %s", vds_operation[i].namep); 1175 1176 /* Queue a task to complete the operation */ 1177 status = ddi_taskq_dispatch(vd->completionq, vds_operation[i].complete, 1178 task, DDI_SLEEP); 1179 /* ddi_taskq_dispatch(9f) guarantees success with DDI_SLEEP */ 1180 ASSERT(status == DDI_SUCCESS); 1181 1182 PR1("Operation in progress"); 1183 return (EINPROGRESS); /* completion handler will finish request */ 1184 } 1185 1186 /* 1187 * Return true if the "type", "subtype", and "env" fields of the "tag" first 1188 * argument match the corresponding remaining arguments; otherwise, return false 1189 */ 1190 boolean_t 1191 vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env) 1192 { 1193 return ((tag->vio_msgtype == type) && 1194 (tag->vio_subtype == subtype) && 1195 (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE; 1196 } 1197 1198 /* 1199 * Check whether the major/minor version specified in "ver_msg" is supported 1200 * by this server. 1201 */ 1202 static boolean_t 1203 vds_supported_version(vio_ver_msg_t *ver_msg) 1204 { 1205 for (int i = 0; i < vds_num_versions; i++) { 1206 ASSERT(vds_version[i].major > 0); 1207 ASSERT((i == 0) || 1208 (vds_version[i].major < vds_version[i-1].major)); 1209 1210 /* 1211 * If the major versions match, adjust the minor version, if 1212 * necessary, down to the highest value supported by this 1213 * server and return true so this message will get "ack"ed; 1214 * the client should also support all minor versions lower 1215 * than the value it sent 1216 */ 1217 if (ver_msg->ver_major == vds_version[i].major) { 1218 if (ver_msg->ver_minor > vds_version[i].minor) { 1219 PR0("Adjusting minor version from %u to %u", 1220 ver_msg->ver_minor, vds_version[i].minor); 1221 ver_msg->ver_minor = vds_version[i].minor; 1222 } 1223 return (B_TRUE); 1224 } 1225 1226 /* 1227 * If the message contains a higher major version number, set 1228 * the message's major/minor versions to the current values 1229 * and return false, so this message will get "nack"ed with 1230 * these values, and the client will potentially try again 1231 * with the same or a lower version 1232 */ 1233 if (ver_msg->ver_major > vds_version[i].major) { 1234 ver_msg->ver_major = vds_version[i].major; 1235 ver_msg->ver_minor = vds_version[i].minor; 1236 return (B_FALSE); 1237 } 1238 1239 /* 1240 * Otherwise, the message's major version is less than the 1241 * current major version, so continue the loop to the next 1242 * (lower) supported version 1243 */ 1244 } 1245 1246 /* 1247 * No common version was found; "ground" the version pair in the 1248 * message to terminate negotiation 1249 */ 1250 ver_msg->ver_major = 0; 1251 ver_msg->ver_minor = 0; 1252 return (B_FALSE); 1253 } 1254 1255 /* 1256 * Process a version message from a client. vds expects to receive version 1257 * messages from clients seeking service, but never issues version messages 1258 * itself; therefore, vds can ACK or NACK client version messages, but does 1259 * not expect to receive version-message ACKs or NACKs (and will treat such 1260 * messages as invalid). 1261 */ 1262 static int 1263 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1264 { 1265 vio_ver_msg_t *ver_msg = (vio_ver_msg_t *)msg; 1266 1267 1268 ASSERT(msglen >= sizeof (msg->tag)); 1269 1270 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1271 VIO_VER_INFO)) { 1272 return (ENOMSG); /* not a version message */ 1273 } 1274 1275 if (msglen != sizeof (*ver_msg)) { 1276 PR0("Expected %lu-byte version message; " 1277 "received %lu bytes", sizeof (*ver_msg), msglen); 1278 return (EBADMSG); 1279 } 1280 1281 if (ver_msg->dev_class != VDEV_DISK) { 1282 PR0("Expected device class %u (disk); received %u", 1283 VDEV_DISK, ver_msg->dev_class); 1284 return (EBADMSG); 1285 } 1286 1287 /* 1288 * We're talking to the expected kind of client; set our device class 1289 * for "ack/nack" back to the client 1290 */ 1291 ver_msg->dev_class = VDEV_DISK_SERVER; 1292 1293 /* 1294 * Check whether the (valid) version message specifies a version 1295 * supported by this server. If the version is not supported, return 1296 * EBADMSG so the message will get "nack"ed; vds_supported_version() 1297 * will have updated the message with a supported version for the 1298 * client to consider 1299 */ 1300 if (!vds_supported_version(ver_msg)) 1301 return (EBADMSG); 1302 1303 1304 /* 1305 * A version has been agreed upon; use the client's SID for 1306 * communication on this channel now 1307 */ 1308 ASSERT(!(vd->initialized & VD_SID)); 1309 vd->sid = ver_msg->tag.vio_sid; 1310 vd->initialized |= VD_SID; 1311 1312 /* 1313 * When multiple versions are supported, this function should store 1314 * the negotiated major and minor version values in the "vd" data 1315 * structure to govern further communication; in particular, note that 1316 * the client might have specified a lower minor version for the 1317 * agreed major version than specifed in the vds_version[] array. The 1318 * following assertions should help remind future maintainers to make 1319 * the appropriate changes to support multiple versions. 1320 */ 1321 ASSERT(vds_num_versions == 1); 1322 ASSERT(ver_msg->ver_major == vds_version[0].major); 1323 ASSERT(ver_msg->ver_minor == vds_version[0].minor); 1324 1325 PR0("Using major version %u, minor version %u", 1326 ver_msg->ver_major, ver_msg->ver_minor); 1327 return (0); 1328 } 1329 1330 static int 1331 vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1332 { 1333 vd_attr_msg_t *attr_msg = (vd_attr_msg_t *)msg; 1334 1335 1336 ASSERT(msglen >= sizeof (msg->tag)); 1337 1338 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1339 VIO_ATTR_INFO)) { 1340 PR0("Message is not an attribute message"); 1341 return (ENOMSG); 1342 } 1343 1344 if (msglen != sizeof (*attr_msg)) { 1345 PR0("Expected %lu-byte attribute message; " 1346 "received %lu bytes", sizeof (*attr_msg), msglen); 1347 return (EBADMSG); 1348 } 1349 1350 if (attr_msg->max_xfer_sz == 0) { 1351 PR0("Received maximum transfer size of 0 from client"); 1352 return (EBADMSG); 1353 } 1354 1355 if ((attr_msg->xfer_mode != VIO_DESC_MODE) && 1356 (attr_msg->xfer_mode != VIO_DRING_MODE)) { 1357 PR0("Client requested unsupported transfer mode"); 1358 return (EBADMSG); 1359 } 1360 1361 /* Success: valid message and transfer mode */ 1362 vd->xfer_mode = attr_msg->xfer_mode; 1363 1364 if (vd->xfer_mode == VIO_DESC_MODE) { 1365 1366 /* 1367 * The vd_dring_inband_msg_t contains one cookie; need room 1368 * for up to n-1 more cookies, where "n" is the number of full 1369 * pages plus possibly one partial page required to cover 1370 * "max_xfer_sz". Add room for one more cookie if 1371 * "max_xfer_sz" isn't an integral multiple of the page size. 1372 * Must first get the maximum transfer size in bytes. 1373 */ 1374 size_t max_xfer_bytes = attr_msg->vdisk_block_size ? 1375 attr_msg->vdisk_block_size*attr_msg->max_xfer_sz : 1376 attr_msg->max_xfer_sz; 1377 size_t max_inband_msglen = 1378 sizeof (vd_dring_inband_msg_t) + 1379 ((max_xfer_bytes/PAGESIZE + 1380 ((max_xfer_bytes % PAGESIZE) ? 1 : 0))* 1381 (sizeof (ldc_mem_cookie_t))); 1382 1383 /* 1384 * Set the maximum expected message length to 1385 * accommodate in-band-descriptor messages with all 1386 * their cookies 1387 */ 1388 vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen); 1389 1390 /* 1391 * Initialize the data structure for processing in-band I/O 1392 * request descriptors 1393 */ 1394 vd->inband_task.vd = vd; 1395 vd->inband_task.msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 1396 vd->inband_task.index = 0; 1397 vd->inband_task.type = VD_FINAL_RANGE_TASK; /* range == 1 */ 1398 } 1399 1400 /* Return the device's block size and max transfer size to the client */ 1401 attr_msg->vdisk_block_size = DEV_BSIZE; 1402 attr_msg->max_xfer_sz = vd->max_xfer_sz; 1403 1404 attr_msg->vdisk_size = vd->vdisk_size; 1405 attr_msg->vdisk_type = vd->vdisk_type; 1406 attr_msg->operations = vds_operations; 1407 PR0("%s", VD_CLIENT(vd)); 1408 1409 ASSERT(vd->dring_task == NULL); 1410 1411 return (0); 1412 } 1413 1414 static int 1415 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1416 { 1417 int status; 1418 size_t expected; 1419 ldc_mem_info_t dring_minfo; 1420 vio_dring_reg_msg_t *reg_msg = (vio_dring_reg_msg_t *)msg; 1421 1422 1423 ASSERT(msglen >= sizeof (msg->tag)); 1424 1425 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1426 VIO_DRING_REG)) { 1427 PR0("Message is not a register-dring message"); 1428 return (ENOMSG); 1429 } 1430 1431 if (msglen < sizeof (*reg_msg)) { 1432 PR0("Expected at least %lu-byte register-dring message; " 1433 "received %lu bytes", sizeof (*reg_msg), msglen); 1434 return (EBADMSG); 1435 } 1436 1437 expected = sizeof (*reg_msg) + 1438 (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0])); 1439 if (msglen != expected) { 1440 PR0("Expected %lu-byte register-dring message; " 1441 "received %lu bytes", expected, msglen); 1442 return (EBADMSG); 1443 } 1444 1445 if (vd->initialized & VD_DRING) { 1446 PR0("A dring was previously registered; only support one"); 1447 return (EBADMSG); 1448 } 1449 1450 if (reg_msg->num_descriptors > INT32_MAX) { 1451 PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)", 1452 reg_msg->ncookies, INT32_MAX, STRINGIZE(INT32_MAX)); 1453 return (EBADMSG); 1454 } 1455 1456 if (reg_msg->ncookies != 1) { 1457 /* 1458 * In addition to fixing the assertion in the success case 1459 * below, supporting drings which require more than one 1460 * "cookie" requires increasing the value of vd->max_msglen 1461 * somewhere in the code path prior to receiving the message 1462 * which results in calling this function. Note that without 1463 * making this change, the larger message size required to 1464 * accommodate multiple cookies cannot be successfully 1465 * received, so this function will not even get called. 1466 * Gracefully accommodating more dring cookies might 1467 * reasonably demand exchanging an additional attribute or 1468 * making a minor protocol adjustment 1469 */ 1470 PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies); 1471 return (EBADMSG); 1472 } 1473 1474 status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie, 1475 reg_msg->ncookies, reg_msg->num_descriptors, 1476 reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle); 1477 if (status != 0) { 1478 PR0("ldc_mem_dring_map() returned errno %d", status); 1479 return (status); 1480 } 1481 1482 /* 1483 * To remove the need for this assertion, must call 1484 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a 1485 * successful call to ldc_mem_dring_map() 1486 */ 1487 ASSERT(reg_msg->ncookies == 1); 1488 1489 if ((status = 1490 ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) { 1491 PR0("ldc_mem_dring_info() returned errno %d", status); 1492 if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0) 1493 PR0("ldc_mem_dring_unmap() returned errno %d", status); 1494 return (status); 1495 } 1496 1497 if (dring_minfo.vaddr == NULL) { 1498 PR0("Descriptor ring virtual address is NULL"); 1499 return (ENXIO); 1500 } 1501 1502 1503 /* Initialize for valid message and mapped dring */ 1504 PR1("descriptor size = %u, dring length = %u", 1505 vd->descriptor_size, vd->dring_len); 1506 vd->initialized |= VD_DRING; 1507 vd->dring_ident = 1; /* "There Can Be Only One" */ 1508 vd->dring = dring_minfo.vaddr; 1509 vd->descriptor_size = reg_msg->descriptor_size; 1510 vd->dring_len = reg_msg->num_descriptors; 1511 reg_msg->dring_ident = vd->dring_ident; 1512 1513 /* 1514 * Allocate and initialize a "shadow" array of data structures for 1515 * tasks to process I/O requests in dring elements 1516 */ 1517 vd->dring_task = 1518 kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP); 1519 for (int i = 0; i < vd->dring_len; i++) { 1520 vd->dring_task[i].vd = vd; 1521 vd->dring_task[i].index = i; 1522 vd->dring_task[i].request = &VD_DRING_ELEM(i)->payload; 1523 1524 status = ldc_mem_alloc_handle(vd->ldc_handle, 1525 &(vd->dring_task[i].mhdl)); 1526 if (status) { 1527 PR0("ldc_mem_alloc_handle() returned err %d ", status); 1528 return (ENXIO); 1529 } 1530 1531 vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 1532 } 1533 1534 return (0); 1535 } 1536 1537 static int 1538 vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1539 { 1540 vio_dring_unreg_msg_t *unreg_msg = (vio_dring_unreg_msg_t *)msg; 1541 1542 1543 ASSERT(msglen >= sizeof (msg->tag)); 1544 1545 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1546 VIO_DRING_UNREG)) { 1547 PR0("Message is not an unregister-dring message"); 1548 return (ENOMSG); 1549 } 1550 1551 if (msglen != sizeof (*unreg_msg)) { 1552 PR0("Expected %lu-byte unregister-dring message; " 1553 "received %lu bytes", sizeof (*unreg_msg), msglen); 1554 return (EBADMSG); 1555 } 1556 1557 if (unreg_msg->dring_ident != vd->dring_ident) { 1558 PR0("Expected dring ident %lu; received %lu", 1559 vd->dring_ident, unreg_msg->dring_ident); 1560 return (EBADMSG); 1561 } 1562 1563 return (0); 1564 } 1565 1566 static int 1567 process_rdx_msg(vio_msg_t *msg, size_t msglen) 1568 { 1569 ASSERT(msglen >= sizeof (msg->tag)); 1570 1571 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) { 1572 PR0("Message is not an RDX message"); 1573 return (ENOMSG); 1574 } 1575 1576 if (msglen != sizeof (vio_rdx_msg_t)) { 1577 PR0("Expected %lu-byte RDX message; received %lu bytes", 1578 sizeof (vio_rdx_msg_t), msglen); 1579 return (EBADMSG); 1580 } 1581 1582 PR0("Valid RDX message"); 1583 return (0); 1584 } 1585 1586 static int 1587 vd_check_seq_num(vd_t *vd, uint64_t seq_num) 1588 { 1589 if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) { 1590 PR0("Received seq_num %lu; expected %lu", 1591 seq_num, (vd->seq_num + 1)); 1592 PR0("initiating soft reset"); 1593 vd_need_reset(vd, B_FALSE); 1594 return (1); 1595 } 1596 1597 vd->seq_num = seq_num; 1598 vd->initialized |= VD_SEQ_NUM; /* superfluous after first time... */ 1599 return (0); 1600 } 1601 1602 /* 1603 * Return the expected size of an inband-descriptor message with all the 1604 * cookies it claims to include 1605 */ 1606 static size_t 1607 expected_inband_size(vd_dring_inband_msg_t *msg) 1608 { 1609 return ((sizeof (*msg)) + 1610 (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0]))); 1611 } 1612 1613 /* 1614 * Process an in-band descriptor message: used with clients like OBP, with 1615 * which vds exchanges descriptors within VIO message payloads, rather than 1616 * operating on them within a descriptor ring 1617 */ 1618 static int 1619 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1620 { 1621 size_t expected; 1622 vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg; 1623 1624 1625 ASSERT(msglen >= sizeof (msg->tag)); 1626 1627 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 1628 VIO_DESC_DATA)) { 1629 PR1("Message is not an in-band-descriptor message"); 1630 return (ENOMSG); 1631 } 1632 1633 if (msglen < sizeof (*desc_msg)) { 1634 PR0("Expected at least %lu-byte descriptor message; " 1635 "received %lu bytes", sizeof (*desc_msg), msglen); 1636 return (EBADMSG); 1637 } 1638 1639 if (msglen != (expected = expected_inband_size(desc_msg))) { 1640 PR0("Expected %lu-byte descriptor message; " 1641 "received %lu bytes", expected, msglen); 1642 return (EBADMSG); 1643 } 1644 1645 if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) 1646 return (EBADMSG); 1647 1648 /* 1649 * Valid message: Set up the in-band descriptor task and process the 1650 * request. Arrange to acknowledge the client's message, unless an 1651 * error processing the descriptor task results in setting 1652 * VIO_SUBTYPE_NACK 1653 */ 1654 PR1("Valid in-band-descriptor message"); 1655 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 1656 1657 ASSERT(vd->inband_task.msg != NULL); 1658 1659 bcopy(msg, vd->inband_task.msg, msglen); 1660 vd->inband_task.msglen = msglen; 1661 1662 /* 1663 * The task request is now the payload of the message 1664 * that was just copied into the body of the task. 1665 */ 1666 desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg; 1667 vd->inband_task.request = &desc_msg->payload; 1668 1669 return (vd_process_task(&vd->inband_task)); 1670 } 1671 1672 static int 1673 vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx, 1674 vio_msg_t *msg, size_t msglen) 1675 { 1676 int status; 1677 boolean_t ready; 1678 vd_dring_entry_t *elem = VD_DRING_ELEM(idx); 1679 1680 1681 /* Accept the updated dring element */ 1682 if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { 1683 PR0("ldc_mem_dring_acquire() returned errno %d", status); 1684 return (status); 1685 } 1686 ready = (elem->hdr.dstate == VIO_DESC_READY); 1687 if (ready) { 1688 elem->hdr.dstate = VIO_DESC_ACCEPTED; 1689 } else { 1690 PR0("descriptor %u not ready", idx); 1691 VD_DUMP_DRING_ELEM(elem); 1692 } 1693 if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 1694 PR0("ldc_mem_dring_release() returned errno %d", status); 1695 return (status); 1696 } 1697 if (!ready) 1698 return (EBUSY); 1699 1700 1701 /* Initialize a task and process the accepted element */ 1702 PR1("Processing dring element %u", idx); 1703 vd->dring_task[idx].type = type; 1704 1705 /* duplicate msg buf for cookies etc. */ 1706 bcopy(msg, vd->dring_task[idx].msg, msglen); 1707 1708 vd->dring_task[idx].msglen = msglen; 1709 if ((status = vd_process_task(&vd->dring_task[idx])) != EINPROGRESS) 1710 status = vd_mark_elem_done(vd, idx, elem->payload.status); 1711 1712 return (status); 1713 } 1714 1715 static int 1716 vd_process_element_range(vd_t *vd, int start, int end, 1717 vio_msg_t *msg, size_t msglen) 1718 { 1719 int i, n, nelem, status = 0; 1720 boolean_t inprogress = B_FALSE; 1721 vd_task_type_t type; 1722 1723 1724 ASSERT(start >= 0); 1725 ASSERT(end >= 0); 1726 1727 /* 1728 * Arrange to acknowledge the client's message, unless an error 1729 * processing one of the dring elements results in setting 1730 * VIO_SUBTYPE_NACK 1731 */ 1732 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 1733 1734 /* 1735 * Process the dring elements in the range 1736 */ 1737 nelem = ((end < start) ? end + vd->dring_len : end) - start + 1; 1738 for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) { 1739 ((vio_dring_msg_t *)msg)->end_idx = i; 1740 type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK; 1741 status = vd_process_element(vd, type, i, msg, msglen); 1742 if (status == EINPROGRESS) 1743 inprogress = B_TRUE; 1744 else if (status != 0) 1745 break; 1746 } 1747 1748 /* 1749 * If some, but not all, operations of a multi-element range are in 1750 * progress, wait for other operations to complete before returning 1751 * (which will result in "ack" or "nack" of the message). Note that 1752 * all outstanding operations will need to complete, not just the ones 1753 * corresponding to the current range of dring elements; howevever, as 1754 * this situation is an error case, performance is less critical. 1755 */ 1756 if ((nelem > 1) && (status != EINPROGRESS) && inprogress) 1757 ddi_taskq_wait(vd->completionq); 1758 1759 return (status); 1760 } 1761 1762 static int 1763 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1764 { 1765 vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg; 1766 1767 1768 ASSERT(msglen >= sizeof (msg->tag)); 1769 1770 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 1771 VIO_DRING_DATA)) { 1772 PR1("Message is not a dring-data message"); 1773 return (ENOMSG); 1774 } 1775 1776 if (msglen != sizeof (*dring_msg)) { 1777 PR0("Expected %lu-byte dring message; received %lu bytes", 1778 sizeof (*dring_msg), msglen); 1779 return (EBADMSG); 1780 } 1781 1782 if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) 1783 return (EBADMSG); 1784 1785 if (dring_msg->dring_ident != vd->dring_ident) { 1786 PR0("Expected dring ident %lu; received ident %lu", 1787 vd->dring_ident, dring_msg->dring_ident); 1788 return (EBADMSG); 1789 } 1790 1791 if (dring_msg->start_idx >= vd->dring_len) { 1792 PR0("\"start_idx\" = %u; must be less than %u", 1793 dring_msg->start_idx, vd->dring_len); 1794 return (EBADMSG); 1795 } 1796 1797 if ((dring_msg->end_idx < 0) || 1798 (dring_msg->end_idx >= vd->dring_len)) { 1799 PR0("\"end_idx\" = %u; must be >= 0 and less than %u", 1800 dring_msg->end_idx, vd->dring_len); 1801 return (EBADMSG); 1802 } 1803 1804 /* Valid message; process range of updated dring elements */ 1805 PR1("Processing descriptor range, start = %u, end = %u", 1806 dring_msg->start_idx, dring_msg->end_idx); 1807 return (vd_process_element_range(vd, dring_msg->start_idx, 1808 dring_msg->end_idx, msg, msglen)); 1809 } 1810 1811 static int 1812 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes) 1813 { 1814 int retry, status; 1815 size_t size = *nbytes; 1816 1817 1818 for (retry = 0, status = ETIMEDOUT; 1819 retry < vds_ldc_retries && status == ETIMEDOUT; 1820 retry++) { 1821 PR1("ldc_read() attempt %d", (retry + 1)); 1822 *nbytes = size; 1823 status = ldc_read(ldc_handle, msg, nbytes); 1824 } 1825 1826 if (status) { 1827 PR0("ldc_read() returned errno %d", status); 1828 if (status != ECONNRESET) 1829 return (ENOMSG); 1830 return (status); 1831 } else if (*nbytes == 0) { 1832 PR1("ldc_read() returned 0 and no message read"); 1833 return (ENOMSG); 1834 } 1835 1836 PR1("RCVD %lu-byte message", *nbytes); 1837 return (0); 1838 } 1839 1840 static int 1841 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1842 { 1843 int status; 1844 1845 1846 PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype, 1847 msg->tag.vio_subtype, msg->tag.vio_subtype_env); 1848 #ifdef DEBUG 1849 vd_decode_tag(msg); 1850 #endif 1851 1852 /* 1853 * Validate session ID up front, since it applies to all messages 1854 * once set 1855 */ 1856 if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) { 1857 PR0("Expected SID %u, received %u", vd->sid, 1858 msg->tag.vio_sid); 1859 return (EBADMSG); 1860 } 1861 1862 PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state)); 1863 1864 /* 1865 * Process the received message based on connection state 1866 */ 1867 switch (vd->state) { 1868 case VD_STATE_INIT: /* expect version message */ 1869 if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0) 1870 return (status); 1871 1872 /* Version negotiated, move to that state */ 1873 vd->state = VD_STATE_VER; 1874 return (0); 1875 1876 case VD_STATE_VER: /* expect attribute message */ 1877 if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0) 1878 return (status); 1879 1880 /* Attributes exchanged, move to that state */ 1881 vd->state = VD_STATE_ATTR; 1882 return (0); 1883 1884 case VD_STATE_ATTR: 1885 switch (vd->xfer_mode) { 1886 case VIO_DESC_MODE: /* expect RDX message */ 1887 if ((status = process_rdx_msg(msg, msglen)) != 0) 1888 return (status); 1889 1890 /* Ready to receive in-band descriptors */ 1891 vd->state = VD_STATE_DATA; 1892 return (0); 1893 1894 case VIO_DRING_MODE: /* expect register-dring message */ 1895 if ((status = 1896 vd_process_dring_reg_msg(vd, msg, msglen)) != 0) 1897 return (status); 1898 1899 /* One dring negotiated, move to that state */ 1900 vd->state = VD_STATE_DRING; 1901 return (0); 1902 1903 default: 1904 ASSERT("Unsupported transfer mode"); 1905 PR0("Unsupported transfer mode"); 1906 return (ENOTSUP); 1907 } 1908 1909 case VD_STATE_DRING: /* expect RDX, register-dring, or unreg-dring */ 1910 if ((status = process_rdx_msg(msg, msglen)) == 0) { 1911 /* Ready to receive data */ 1912 vd->state = VD_STATE_DATA; 1913 return (0); 1914 } else if (status != ENOMSG) { 1915 return (status); 1916 } 1917 1918 1919 /* 1920 * If another register-dring message is received, stay in 1921 * dring state in case the client sends RDX; although the 1922 * protocol allows multiple drings, this server does not 1923 * support using more than one 1924 */ 1925 if ((status = 1926 vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG) 1927 return (status); 1928 1929 /* 1930 * Acknowledge an unregister-dring message, but reset the 1931 * connection anyway: Although the protocol allows 1932 * unregistering drings, this server cannot serve a vdisk 1933 * without its only dring 1934 */ 1935 status = vd_process_dring_unreg_msg(vd, msg, msglen); 1936 return ((status == 0) ? ENOTSUP : status); 1937 1938 case VD_STATE_DATA: 1939 switch (vd->xfer_mode) { 1940 case VIO_DESC_MODE: /* expect in-band-descriptor message */ 1941 return (vd_process_desc_msg(vd, msg, msglen)); 1942 1943 case VIO_DRING_MODE: /* expect dring-data or unreg-dring */ 1944 /* 1945 * Typically expect dring-data messages, so handle 1946 * them first 1947 */ 1948 if ((status = vd_process_dring_msg(vd, msg, 1949 msglen)) != ENOMSG) 1950 return (status); 1951 1952 /* 1953 * Acknowledge an unregister-dring message, but reset 1954 * the connection anyway: Although the protocol 1955 * allows unregistering drings, this server cannot 1956 * serve a vdisk without its only dring 1957 */ 1958 status = vd_process_dring_unreg_msg(vd, msg, msglen); 1959 return ((status == 0) ? ENOTSUP : status); 1960 1961 default: 1962 ASSERT("Unsupported transfer mode"); 1963 PR0("Unsupported transfer mode"); 1964 return (ENOTSUP); 1965 } 1966 1967 default: 1968 ASSERT("Invalid client connection state"); 1969 PR0("Invalid client connection state"); 1970 return (ENOTSUP); 1971 } 1972 } 1973 1974 static int 1975 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1976 { 1977 int status; 1978 boolean_t reset_ldc = B_FALSE; 1979 1980 1981 /* 1982 * Check that the message is at least big enough for a "tag", so that 1983 * message processing can proceed based on tag-specified message type 1984 */ 1985 if (msglen < sizeof (vio_msg_tag_t)) { 1986 PR0("Received short (%lu-byte) message", msglen); 1987 /* Can't "nack" short message, so drop the big hammer */ 1988 PR0("initiating full reset"); 1989 vd_need_reset(vd, B_TRUE); 1990 return (EBADMSG); 1991 } 1992 1993 /* 1994 * Process the message 1995 */ 1996 switch (status = vd_do_process_msg(vd, msg, msglen)) { 1997 case 0: 1998 /* "ack" valid, successfully-processed messages */ 1999 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 2000 break; 2001 2002 case EINPROGRESS: 2003 /* The completion handler will "ack" or "nack" the message */ 2004 return (EINPROGRESS); 2005 case ENOMSG: 2006 PR0("Received unexpected message"); 2007 _NOTE(FALLTHROUGH); 2008 case EBADMSG: 2009 case ENOTSUP: 2010 /* "nack" invalid messages */ 2011 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 2012 break; 2013 2014 default: 2015 /* "nack" failed messages */ 2016 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 2017 /* An LDC error probably occurred, so try resetting it */ 2018 reset_ldc = B_TRUE; 2019 break; 2020 } 2021 2022 PR1("\tResulting in state %d (%s)", vd->state, 2023 vd_decode_state(vd->state)); 2024 2025 /* Send the "ack" or "nack" to the client */ 2026 PR1("Sending %s", 2027 (msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK"); 2028 if (send_msg(vd->ldc_handle, msg, msglen) != 0) 2029 reset_ldc = B_TRUE; 2030 2031 /* Arrange to reset the connection for nack'ed or failed messages */ 2032 if ((status != 0) || reset_ldc) { 2033 PR0("initiating %s reset", 2034 (reset_ldc) ? "full" : "soft"); 2035 vd_need_reset(vd, reset_ldc); 2036 } 2037 2038 return (status); 2039 } 2040 2041 static boolean_t 2042 vd_enabled(vd_t *vd) 2043 { 2044 boolean_t enabled; 2045 2046 2047 mutex_enter(&vd->lock); 2048 enabled = vd->enabled; 2049 mutex_exit(&vd->lock); 2050 return (enabled); 2051 } 2052 2053 static void 2054 vd_recv_msg(void *arg) 2055 { 2056 vd_t *vd = (vd_t *)arg; 2057 int rv = 0, status = 0; 2058 2059 ASSERT(vd != NULL); 2060 2061 PR2("New task to receive incoming message(s)"); 2062 2063 2064 while (vd_enabled(vd) && status == 0) { 2065 size_t msglen, msgsize; 2066 ldc_status_t lstatus; 2067 2068 /* 2069 * Receive and process a message 2070 */ 2071 vd_reset_if_needed(vd); /* can change vd->max_msglen */ 2072 2073 /* 2074 * check if channel is UP - else break out of loop 2075 */ 2076 status = ldc_status(vd->ldc_handle, &lstatus); 2077 if (lstatus != LDC_UP) { 2078 PR0("channel not up (status=%d), exiting recv loop\n", 2079 lstatus); 2080 break; 2081 } 2082 2083 ASSERT(vd->max_msglen != 0); 2084 2085 msgsize = vd->max_msglen; /* stable copy for alloc/free */ 2086 msglen = msgsize; /* actual len after recv_msg() */ 2087 2088 status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen); 2089 switch (status) { 2090 case 0: 2091 rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp, 2092 msglen); 2093 /* check if max_msglen changed */ 2094 if (msgsize != vd->max_msglen) { 2095 PR0("max_msglen changed 0x%lx to 0x%lx bytes\n", 2096 msgsize, vd->max_msglen); 2097 kmem_free(vd->vio_msgp, msgsize); 2098 vd->vio_msgp = 2099 kmem_alloc(vd->max_msglen, KM_SLEEP); 2100 } 2101 if (rv == EINPROGRESS) 2102 continue; 2103 break; 2104 2105 case ENOMSG: 2106 break; 2107 2108 case ECONNRESET: 2109 PR0("initiating soft reset (ECONNRESET)\n"); 2110 vd_need_reset(vd, B_FALSE); 2111 status = 0; 2112 break; 2113 2114 default: 2115 /* Probably an LDC failure; arrange to reset it */ 2116 PR0("initiating full reset (status=0x%x)", status); 2117 vd_need_reset(vd, B_TRUE); 2118 break; 2119 } 2120 } 2121 2122 PR2("Task finished"); 2123 } 2124 2125 static uint_t 2126 vd_handle_ldc_events(uint64_t event, caddr_t arg) 2127 { 2128 vd_t *vd = (vd_t *)(void *)arg; 2129 int status; 2130 2131 2132 ASSERT(vd != NULL); 2133 2134 if (!vd_enabled(vd)) 2135 return (LDC_SUCCESS); 2136 2137 if (event & LDC_EVT_DOWN) { 2138 PRN("LDC_EVT_DOWN: LDC channel went down"); 2139 2140 vd_need_reset(vd, B_TRUE); 2141 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 2142 DDI_SLEEP); 2143 if (status == DDI_FAILURE) { 2144 PR0("cannot schedule task to recv msg\n"); 2145 vd_need_reset(vd, B_TRUE); 2146 } 2147 } 2148 2149 if (event & LDC_EVT_RESET) { 2150 PR0("LDC_EVT_RESET: LDC channel was reset"); 2151 2152 if (vd->state != VD_STATE_INIT) { 2153 PR0("scheduling full reset"); 2154 vd_need_reset(vd, B_FALSE); 2155 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 2156 vd, DDI_SLEEP); 2157 if (status == DDI_FAILURE) { 2158 PR0("cannot schedule task to recv msg\n"); 2159 vd_need_reset(vd, B_TRUE); 2160 } 2161 2162 } else { 2163 PR0("channel already reset, ignoring...\n"); 2164 PR0("doing ldc up...\n"); 2165 (void) ldc_up(vd->ldc_handle); 2166 } 2167 2168 return (LDC_SUCCESS); 2169 } 2170 2171 if (event & LDC_EVT_UP) { 2172 PR0("EVT_UP: LDC is up\nResetting client connection state"); 2173 PR0("initiating soft reset"); 2174 vd_need_reset(vd, B_FALSE); 2175 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 2176 vd, DDI_SLEEP); 2177 if (status == DDI_FAILURE) { 2178 PR0("cannot schedule task to recv msg\n"); 2179 vd_need_reset(vd, B_TRUE); 2180 return (LDC_SUCCESS); 2181 } 2182 } 2183 2184 if (event & LDC_EVT_READ) { 2185 int status; 2186 2187 PR1("New data available"); 2188 /* Queue a task to receive the new data */ 2189 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 2190 DDI_SLEEP); 2191 2192 if (status == DDI_FAILURE) { 2193 PR0("cannot schedule task to recv msg\n"); 2194 vd_need_reset(vd, B_TRUE); 2195 } 2196 } 2197 2198 return (LDC_SUCCESS); 2199 } 2200 2201 static uint_t 2202 vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg) 2203 { 2204 _NOTE(ARGUNUSED(key, val)) 2205 (*((uint_t *)arg))++; 2206 return (MH_WALK_TERMINATE); 2207 } 2208 2209 2210 static int 2211 vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2212 { 2213 uint_t vd_present = 0; 2214 minor_t instance; 2215 vds_t *vds; 2216 2217 2218 switch (cmd) { 2219 case DDI_DETACH: 2220 /* the real work happens below */ 2221 break; 2222 case DDI_SUSPEND: 2223 PR0("No action required for DDI_SUSPEND"); 2224 return (DDI_SUCCESS); 2225 default: 2226 PR0("Unrecognized \"cmd\""); 2227 return (DDI_FAILURE); 2228 } 2229 2230 ASSERT(cmd == DDI_DETACH); 2231 instance = ddi_get_instance(dip); 2232 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { 2233 PR0("Could not get state for instance %u", instance); 2234 ddi_soft_state_free(vds_state, instance); 2235 return (DDI_FAILURE); 2236 } 2237 2238 /* Do no detach when serving any vdisks */ 2239 mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present); 2240 if (vd_present) { 2241 PR0("Not detaching because serving vdisks"); 2242 return (DDI_FAILURE); 2243 } 2244 2245 PR0("Detaching"); 2246 if (vds->initialized & VDS_MDEG) 2247 (void) mdeg_unregister(vds->mdeg); 2248 if (vds->initialized & VDS_LDI) 2249 (void) ldi_ident_release(vds->ldi_ident); 2250 mod_hash_destroy_hash(vds->vd_table); 2251 ddi_soft_state_free(vds_state, instance); 2252 return (DDI_SUCCESS); 2253 } 2254 2255 static boolean_t 2256 is_pseudo_device(dev_info_t *dip) 2257 { 2258 dev_info_t *parent, *root = ddi_root_node(); 2259 2260 2261 for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root); 2262 parent = ddi_get_parent(parent)) { 2263 if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0) 2264 return (B_TRUE); 2265 } 2266 2267 return (B_FALSE); 2268 } 2269 2270 static int 2271 vd_setup_full_disk(vd_t *vd) 2272 { 2273 int rval, status; 2274 major_t major = getmajor(vd->dev[0]); 2275 minor_t minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE; 2276 struct dk_minfo dk_minfo; 2277 2278 /* 2279 * At this point, vdisk_size is set to the size of partition 2 but 2280 * this does not represent the size of the disk because partition 2 2281 * may not cover the entire disk and its size does not include reserved 2282 * blocks. So we update vdisk_size to be the size of the entire disk. 2283 */ 2284 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO, 2285 (intptr_t)&dk_minfo, (vd_open_flags | FKIOCTL), 2286 kcred, &rval)) != 0) { 2287 PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d", 2288 status); 2289 return (status); 2290 } 2291 vd->vdisk_size = dk_minfo.dki_capacity; 2292 2293 /* Set full-disk parameters */ 2294 vd->vdisk_type = VD_DISK_TYPE_DISK; 2295 vd->nslices = (sizeof (vd->dev))/(sizeof (vd->dev[0])); 2296 2297 /* Move dev number and LDI handle to entire-disk-slice array elements */ 2298 vd->dev[VD_ENTIRE_DISK_SLICE] = vd->dev[0]; 2299 vd->dev[0] = 0; 2300 vd->ldi_handle[VD_ENTIRE_DISK_SLICE] = vd->ldi_handle[0]; 2301 vd->ldi_handle[0] = NULL; 2302 2303 /* Initialize device numbers for remaining slices and open them */ 2304 for (int slice = 0; slice < vd->nslices; slice++) { 2305 /* 2306 * Skip the entire-disk slice, as it's already open and its 2307 * device known 2308 */ 2309 if (slice == VD_ENTIRE_DISK_SLICE) 2310 continue; 2311 ASSERT(vd->dev[slice] == 0); 2312 ASSERT(vd->ldi_handle[slice] == NULL); 2313 2314 /* 2315 * Construct the device number for the current slice 2316 */ 2317 vd->dev[slice] = makedevice(major, (minor + slice)); 2318 2319 /* 2320 * At least some underlying drivers refuse to open 2321 * devices for (currently) zero-length slices, so skip 2322 * them for now 2323 */ 2324 if (vd->vtoc.v_part[slice].p_size == 0) { 2325 PR0("Skipping zero-length slice %u", slice); 2326 continue; 2327 } 2328 2329 /* 2330 * Open all non-empty slices of the disk to serve them to the 2331 * client. Slices are opened exclusively to prevent other 2332 * threads or processes in the service domain from performing 2333 * I/O to slices being accessed by a client. Failure to open 2334 * a slice results in vds not serving this disk, as the client 2335 * could attempt (and should be able) to access any non-empty 2336 * slice immediately. Any slices successfully opened before a 2337 * failure will get closed by vds_destroy_vd() as a result of 2338 * the error returned by this function. 2339 */ 2340 PR0("Opening device major %u, minor %u = slice %u", 2341 major, minor, slice); 2342 if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 2343 vd_open_flags, kcred, &vd->ldi_handle[slice], 2344 vd->vds->ldi_ident)) != 0) { 2345 PRN("ldi_open_by_dev() returned errno %d " 2346 "for slice %u", status, slice); 2347 /* vds_destroy_vd() will close any open slices */ 2348 return (status); 2349 } 2350 } 2351 2352 return (0); 2353 } 2354 2355 static int 2356 vd_setup_partition_efi(vd_t *vd) 2357 { 2358 efi_gpt_t *gpt; 2359 efi_gpe_t *gpe; 2360 struct uuid uuid = EFI_RESERVED; 2361 uint32_t crc; 2362 int length; 2363 2364 length = sizeof (efi_gpt_t) + sizeof (efi_gpe_t); 2365 2366 gpt = kmem_zalloc(length, KM_SLEEP); 2367 gpe = (efi_gpe_t *)(gpt + 1); 2368 2369 gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE); 2370 gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); 2371 gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t)); 2372 gpt->efi_gpt_FirstUsableLBA = LE_64(0ULL); 2373 gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1); 2374 gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1); 2375 gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t)); 2376 2377 UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid); 2378 gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA; 2379 gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA; 2380 2381 CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table); 2382 gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); 2383 2384 CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table); 2385 gpt->efi_gpt_HeaderCRC32 = LE_32(~crc); 2386 2387 vd->dk_efi.dki_lba = 0; 2388 vd->dk_efi.dki_length = length; 2389 vd->dk_efi.dki_data = gpt; 2390 2391 return (0); 2392 } 2393 2394 static int 2395 vd_setup_vd(char *device_path, vd_t *vd) 2396 { 2397 int rval, status; 2398 dev_info_t *dip; 2399 struct dk_cinfo dk_cinfo; 2400 2401 /* 2402 * We need to open with FNDELAY so that opening an empty partition 2403 * does not fail. 2404 */ 2405 if ((status = ldi_open_by_name(device_path, vd_open_flags | FNDELAY, 2406 kcred, &vd->ldi_handle[0], vd->vds->ldi_ident)) != 0) { 2407 PRN("ldi_open_by_name(%s) = errno %d", device_path, status); 2408 return (status); 2409 } 2410 2411 /* 2412 * nslices must be updated now so that vds_destroy_vd() will close 2413 * the slice we have just opened in case of an error. 2414 */ 2415 vd->nslices = 1; 2416 2417 /* Get device number and size of backing device */ 2418 if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) { 2419 PRN("ldi_get_dev() returned errno %d for %s", 2420 status, device_path); 2421 return (status); 2422 } 2423 if (ldi_get_size(vd->ldi_handle[0], &vd->vdisk_size) != DDI_SUCCESS) { 2424 PRN("ldi_get_size() failed for %s", device_path); 2425 return (EIO); 2426 } 2427 vd->vdisk_size = lbtodb(vd->vdisk_size); /* convert to blocks */ 2428 2429 /* Verify backing device supports dk_cinfo, dk_geom, and vtoc */ 2430 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO, 2431 (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred, 2432 &rval)) != 0) { 2433 PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s", 2434 status, device_path); 2435 return (status); 2436 } 2437 if (dk_cinfo.dki_partition >= V_NUMPAR) { 2438 PRN("slice %u >= maximum slice %u for %s", 2439 dk_cinfo.dki_partition, V_NUMPAR, device_path); 2440 return (EIO); 2441 } 2442 2443 status = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc, &vd->vdisk_label); 2444 2445 if (status != 0) { 2446 PRN("vd_read_vtoc returned errno %d for %s", 2447 status, device_path); 2448 return (status); 2449 } 2450 2451 if (vd->vdisk_label == VD_DISK_LABEL_VTOC && 2452 (status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM, 2453 (intptr_t)&vd->dk_geom, (vd_open_flags | FKIOCTL), 2454 kcred, &rval)) != 0) { 2455 PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s", 2456 status, device_path); 2457 return (status); 2458 } 2459 2460 /* Store the device's max transfer size for return to the client */ 2461 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; 2462 2463 2464 /* Determine if backing device is a pseudo device */ 2465 if ((dip = ddi_hold_devi_by_instance(getmajor(vd->dev[0]), 2466 dev_to_instance(vd->dev[0]), 0)) == NULL) { 2467 PRN("%s is no longer accessible", device_path); 2468 return (EIO); 2469 } 2470 vd->pseudo = is_pseudo_device(dip); 2471 ddi_release_devi(dip); 2472 if (vd->pseudo) { 2473 vd->vdisk_type = VD_DISK_TYPE_SLICE; 2474 vd->nslices = 1; 2475 return (0); /* ...and we're done */ 2476 } 2477 2478 2479 /* If slice is entire-disk slice, initialize for full disk */ 2480 if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE) 2481 return (vd_setup_full_disk(vd)); 2482 2483 2484 /* Otherwise, we have a non-entire slice of a device */ 2485 vd->vdisk_type = VD_DISK_TYPE_SLICE; 2486 vd->nslices = 1; 2487 2488 if (vd->vdisk_label == VD_DISK_LABEL_EFI) { 2489 status = vd_setup_partition_efi(vd); 2490 return (status); 2491 } 2492 2493 /* Initialize dk_geom structure for single-slice device */ 2494 if (vd->dk_geom.dkg_nsect == 0) { 2495 PR0("%s geometry claims 0 sectors per track", device_path); 2496 return (EIO); 2497 } 2498 if (vd->dk_geom.dkg_nhead == 0) { 2499 PR0("%s geometry claims 0 heads", device_path); 2500 return (EIO); 2501 } 2502 vd->dk_geom.dkg_ncyl = 2503 vd->vdisk_size/vd->dk_geom.dkg_nsect/vd->dk_geom.dkg_nhead; 2504 vd->dk_geom.dkg_acyl = 0; 2505 vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl; 2506 2507 2508 /* Initialize vtoc structure for single-slice device */ 2509 bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume, 2510 MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume))); 2511 bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part)); 2512 vd->vtoc.v_nparts = 1; 2513 vd->vtoc.v_part[0].p_tag = V_UNASSIGNED; 2514 vd->vtoc.v_part[0].p_flag = 0; 2515 vd->vtoc.v_part[0].p_start = 0; 2516 vd->vtoc.v_part[0].p_size = vd->vdisk_size; 2517 bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel, 2518 MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel))); 2519 2520 2521 return (0); 2522 } 2523 2524 static int 2525 vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id, 2526 vd_t **vdp) 2527 { 2528 char tq_name[TASKQ_NAMELEN]; 2529 int status; 2530 ddi_iblock_cookie_t iblock = NULL; 2531 ldc_attr_t ldc_attr; 2532 vd_t *vd; 2533 2534 2535 ASSERT(vds != NULL); 2536 ASSERT(device_path != NULL); 2537 ASSERT(vdp != NULL); 2538 PR0("Adding vdisk for %s", device_path); 2539 2540 if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) { 2541 PRN("No memory for virtual disk"); 2542 return (EAGAIN); 2543 } 2544 *vdp = vd; /* assign here so vds_destroy_vd() can cleanup later */ 2545 vd->vds = vds; 2546 2547 2548 /* Open vdisk and initialize parameters */ 2549 if ((status = vd_setup_vd(device_path, vd)) != 0) 2550 return (status); 2551 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); 2552 PR0("vdisk_type = %s, pseudo = %s, nslices = %u", 2553 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), 2554 (vd->pseudo ? "yes" : "no"), vd->nslices); 2555 2556 2557 /* Initialize locking */ 2558 if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED, 2559 &iblock) != DDI_SUCCESS) { 2560 PRN("Could not get iblock cookie."); 2561 return (EIO); 2562 } 2563 2564 mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock); 2565 vd->initialized |= VD_LOCKING; 2566 2567 2568 /* Create start and completion task queues for the vdisk */ 2569 (void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id); 2570 PR1("tq_name = %s", tq_name); 2571 if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1, 2572 TASKQ_DEFAULTPRI, 0)) == NULL) { 2573 PRN("Could not create task queue"); 2574 return (EIO); 2575 } 2576 (void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id); 2577 PR1("tq_name = %s", tq_name); 2578 if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1, 2579 TASKQ_DEFAULTPRI, 0)) == NULL) { 2580 PRN("Could not create task queue"); 2581 return (EIO); 2582 } 2583 vd->enabled = 1; /* before callback can dispatch to startq */ 2584 2585 2586 /* Bring up LDC */ 2587 ldc_attr.devclass = LDC_DEV_BLK_SVC; 2588 ldc_attr.instance = ddi_get_instance(vds->dip); 2589 ldc_attr.mode = LDC_MODE_UNRELIABLE; 2590 ldc_attr.mtu = VD_LDC_MTU; 2591 if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) { 2592 PR0("ldc_init(%lu) = errno %d", ldc_id, status); 2593 return (status); 2594 } 2595 vd->initialized |= VD_LDC; 2596 2597 if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events, 2598 (caddr_t)vd)) != 0) { 2599 PR0("ldc_reg_callback() returned errno %d", status); 2600 return (status); 2601 } 2602 2603 if ((status = ldc_open(vd->ldc_handle)) != 0) { 2604 PR0("ldc_open() returned errno %d", status); 2605 return (status); 2606 } 2607 2608 if ((status = ldc_up(vd->ldc_handle)) != 0) { 2609 PRN("ldc_up() returned errno %d", status); 2610 } 2611 2612 /* Allocate the inband task memory handle */ 2613 status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl)); 2614 if (status) { 2615 PRN("ldc_mem_alloc_handle() returned err %d ", status); 2616 return (ENXIO); 2617 } 2618 2619 /* Add the successfully-initialized vdisk to the server's table */ 2620 if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) { 2621 PRN("Error adding vdisk ID %lu to table", id); 2622 return (EIO); 2623 } 2624 2625 /* Allocate the staging buffer */ 2626 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 2627 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 2628 2629 /* store initial state */ 2630 vd->state = VD_STATE_INIT; 2631 2632 return (0); 2633 } 2634 2635 static void 2636 vd_free_dring_task(vd_t *vdp) 2637 { 2638 if (vdp->dring_task != NULL) { 2639 ASSERT(vdp->dring_len != 0); 2640 /* Free all dring_task memory handles */ 2641 for (int i = 0; i < vdp->dring_len; i++) { 2642 (void) ldc_mem_free_handle(vdp->dring_task[i].mhdl); 2643 kmem_free(vdp->dring_task[i].msg, vdp->max_msglen); 2644 vdp->dring_task[i].msg = NULL; 2645 } 2646 kmem_free(vdp->dring_task, 2647 (sizeof (*vdp->dring_task)) * vdp->dring_len); 2648 vdp->dring_task = NULL; 2649 } 2650 } 2651 2652 /* 2653 * Destroy the state associated with a virtual disk 2654 */ 2655 static void 2656 vds_destroy_vd(void *arg) 2657 { 2658 vd_t *vd = (vd_t *)arg; 2659 2660 2661 if (vd == NULL) 2662 return; 2663 2664 PR0("Destroying vdisk state"); 2665 2666 if (vd->dk_efi.dki_data != NULL) 2667 kmem_free(vd->dk_efi.dki_data, vd->dk_efi.dki_length); 2668 2669 /* Disable queuing requests for the vdisk */ 2670 if (vd->initialized & VD_LOCKING) { 2671 mutex_enter(&vd->lock); 2672 vd->enabled = 0; 2673 mutex_exit(&vd->lock); 2674 } 2675 2676 /* Drain and destroy start queue (*before* destroying completionq) */ 2677 if (vd->startq != NULL) 2678 ddi_taskq_destroy(vd->startq); /* waits for queued tasks */ 2679 2680 /* Drain and destroy completion queue (*before* shutting down LDC) */ 2681 if (vd->completionq != NULL) 2682 ddi_taskq_destroy(vd->completionq); /* waits for tasks */ 2683 2684 vd_free_dring_task(vd); 2685 2686 /* Free the staging buffer for msgs */ 2687 if (vd->vio_msgp != NULL) { 2688 kmem_free(vd->vio_msgp, vd->max_msglen); 2689 vd->vio_msgp = NULL; 2690 } 2691 2692 /* Free the inband message buffer */ 2693 if (vd->inband_task.msg != NULL) { 2694 kmem_free(vd->inband_task.msg, vd->max_msglen); 2695 vd->inband_task.msg = NULL; 2696 } 2697 2698 /* Free the inband task memory handle */ 2699 (void) ldc_mem_free_handle(vd->inband_task.mhdl); 2700 2701 /* Shut down LDC */ 2702 if (vd->initialized & VD_LDC) { 2703 if (vd->initialized & VD_DRING) 2704 (void) ldc_mem_dring_unmap(vd->dring_handle); 2705 (void) ldc_unreg_callback(vd->ldc_handle); 2706 (void) ldc_close(vd->ldc_handle); 2707 (void) ldc_fini(vd->ldc_handle); 2708 } 2709 2710 /* Close any open backing-device slices */ 2711 for (uint_t slice = 0; slice < vd->nslices; slice++) { 2712 if (vd->ldi_handle[slice] != NULL) { 2713 PR0("Closing slice %u", slice); 2714 (void) ldi_close(vd->ldi_handle[slice], 2715 vd_open_flags | FNDELAY, kcred); 2716 } 2717 } 2718 2719 /* Free lock */ 2720 if (vd->initialized & VD_LOCKING) 2721 mutex_destroy(&vd->lock); 2722 2723 /* Finally, free the vdisk structure itself */ 2724 kmem_free(vd, sizeof (*vd)); 2725 } 2726 2727 static int 2728 vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id) 2729 { 2730 int status; 2731 vd_t *vd = NULL; 2732 2733 2734 if ((status = vds_do_init_vd(vds, id, device_path, ldc_id, &vd)) != 0) 2735 vds_destroy_vd(vd); 2736 2737 return (status); 2738 } 2739 2740 static int 2741 vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel, 2742 uint64_t *ldc_id) 2743 { 2744 int num_channels; 2745 2746 2747 /* Look for channel endpoint child(ren) of the vdisk MD node */ 2748 if ((num_channels = md_scan_dag(md, vd_node, 2749 md_find_name(md, VD_CHANNEL_ENDPOINT), 2750 md_find_name(md, "fwd"), channel)) <= 0) { 2751 PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT); 2752 return (-1); 2753 } 2754 2755 /* Get the "id" value for the first channel endpoint node */ 2756 if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) { 2757 PRN("No \"%s\" property found for \"%s\" of vdisk", 2758 VD_ID_PROP, VD_CHANNEL_ENDPOINT); 2759 return (-1); 2760 } 2761 2762 if (num_channels > 1) { 2763 PRN("Using ID of first of multiple channels for this vdisk"); 2764 } 2765 2766 return (0); 2767 } 2768 2769 static int 2770 vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id) 2771 { 2772 int num_nodes, status; 2773 size_t size; 2774 mde_cookie_t *channel; 2775 2776 2777 if ((num_nodes = md_node_count(md)) <= 0) { 2778 PRN("Invalid node count in Machine Description subtree"); 2779 return (-1); 2780 } 2781 size = num_nodes*(sizeof (*channel)); 2782 channel = kmem_zalloc(size, KM_SLEEP); 2783 status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id); 2784 kmem_free(channel, size); 2785 2786 return (status); 2787 } 2788 2789 static void 2790 vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) 2791 { 2792 char *device_path = NULL; 2793 uint64_t id = 0, ldc_id = 0; 2794 2795 2796 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { 2797 PRN("Error getting vdisk \"%s\"", VD_ID_PROP); 2798 return; 2799 } 2800 PR0("Adding vdisk ID %lu", id); 2801 if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP, 2802 &device_path) != 0) { 2803 PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); 2804 return; 2805 } 2806 2807 if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) { 2808 PRN("Error getting LDC ID for vdisk %lu", id); 2809 return; 2810 } 2811 2812 if (vds_init_vd(vds, id, device_path, ldc_id) != 0) { 2813 PRN("Failed to add vdisk ID %lu", id); 2814 return; 2815 } 2816 } 2817 2818 static void 2819 vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) 2820 { 2821 uint64_t id = 0; 2822 2823 2824 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { 2825 PRN("Unable to get \"%s\" property from vdisk's MD node", 2826 VD_ID_PROP); 2827 return; 2828 } 2829 PR0("Removing vdisk ID %lu", id); 2830 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0) 2831 PRN("No vdisk entry found for vdisk ID %lu", id); 2832 } 2833 2834 static void 2835 vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node, 2836 md_t *curr_md, mde_cookie_t curr_vd_node) 2837 { 2838 char *curr_dev, *prev_dev; 2839 uint64_t curr_id = 0, curr_ldc_id = 0; 2840 uint64_t prev_id = 0, prev_ldc_id = 0; 2841 size_t len; 2842 2843 2844 /* Validate that vdisk ID has not changed */ 2845 if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) { 2846 PRN("Error getting previous vdisk \"%s\" property", 2847 VD_ID_PROP); 2848 return; 2849 } 2850 if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) { 2851 PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP); 2852 return; 2853 } 2854 if (curr_id != prev_id) { 2855 PRN("Not changing vdisk: ID changed from %lu to %lu", 2856 prev_id, curr_id); 2857 return; 2858 } 2859 2860 /* Validate that LDC ID has not changed */ 2861 if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) { 2862 PRN("Error getting LDC ID for vdisk %lu", prev_id); 2863 return; 2864 } 2865 2866 if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) { 2867 PRN("Error getting LDC ID for vdisk %lu", curr_id); 2868 return; 2869 } 2870 if (curr_ldc_id != prev_ldc_id) { 2871 _NOTE(NOTREACHED); /* lint is confused */ 2872 PRN("Not changing vdisk: " 2873 "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id); 2874 return; 2875 } 2876 2877 /* Determine whether device path has changed */ 2878 if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP, 2879 &prev_dev) != 0) { 2880 PRN("Error getting previous vdisk \"%s\"", 2881 VD_BLOCK_DEVICE_PROP); 2882 return; 2883 } 2884 if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP, 2885 &curr_dev) != 0) { 2886 PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); 2887 return; 2888 } 2889 if (((len = strlen(curr_dev)) == strlen(prev_dev)) && 2890 (strncmp(curr_dev, prev_dev, len) == 0)) 2891 return; /* no relevant (supported) change */ 2892 2893 PR0("Changing vdisk ID %lu", prev_id); 2894 2895 /* Remove old state, which will close vdisk and reset */ 2896 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0) 2897 PRN("No entry found for vdisk ID %lu", prev_id); 2898 2899 /* Re-initialize vdisk with new state */ 2900 if (vds_init_vd(vds, curr_id, curr_dev, curr_ldc_id) != 0) { 2901 PRN("Failed to change vdisk ID %lu", curr_id); 2902 return; 2903 } 2904 } 2905 2906 static int 2907 vds_process_md(void *arg, mdeg_result_t *md) 2908 { 2909 int i; 2910 vds_t *vds = arg; 2911 2912 2913 if (md == NULL) 2914 return (MDEG_FAILURE); 2915 ASSERT(vds != NULL); 2916 2917 for (i = 0; i < md->removed.nelem; i++) 2918 vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]); 2919 for (i = 0; i < md->match_curr.nelem; i++) 2920 vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i], 2921 md->match_curr.mdp, md->match_curr.mdep[i]); 2922 for (i = 0; i < md->added.nelem; i++) 2923 vds_add_vd(vds, md->added.mdp, md->added.mdep[i]); 2924 2925 return (MDEG_SUCCESS); 2926 } 2927 2928 static int 2929 vds_do_attach(dev_info_t *dip) 2930 { 2931 static char reg_prop[] = "reg"; /* devinfo ID prop */ 2932 2933 /* MDEG specification for a (particular) vds node */ 2934 static mdeg_prop_spec_t vds_prop_spec[] = { 2935 {MDET_PROP_STR, "name", {VDS_NAME}}, 2936 {MDET_PROP_VAL, "cfg-handle", {0}}, 2937 {MDET_LIST_END, NULL, {0}}}; 2938 static mdeg_node_spec_t vds_spec = {"virtual-device", vds_prop_spec}; 2939 2940 /* MDEG specification for matching a vd node */ 2941 static md_prop_match_t vd_prop_spec[] = { 2942 {MDET_PROP_VAL, VD_ID_PROP}, 2943 {MDET_LIST_END, NULL}}; 2944 static mdeg_node_match_t vd_spec = {"virtual-device-port", 2945 vd_prop_spec}; 2946 2947 int status; 2948 uint64_t cfg_handle; 2949 minor_t instance = ddi_get_instance(dip); 2950 vds_t *vds; 2951 2952 2953 /* 2954 * The "cfg-handle" property of a vds node in an MD contains the MD's 2955 * notion of "instance", or unique identifier, for that node; OBP 2956 * stores the value of the "cfg-handle" MD property as the value of 2957 * the "reg" property on the node in the device tree it builds from 2958 * the MD and passes to Solaris. Thus, we look up the devinfo node's 2959 * "reg" property value to uniquely identify this device instance when 2960 * registering with the MD event-generation framework. If the "reg" 2961 * property cannot be found, the device tree state is presumably so 2962 * broken that there is no point in continuing. 2963 */ 2964 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, reg_prop)) { 2965 PRN("vds \"%s\" property does not exist", reg_prop); 2966 return (DDI_FAILURE); 2967 } 2968 2969 /* Get the MD instance for later MDEG registration */ 2970 cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2971 reg_prop, -1); 2972 2973 if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) { 2974 PRN("Could not allocate state for instance %u", instance); 2975 return (DDI_FAILURE); 2976 } 2977 2978 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { 2979 PRN("Could not get state for instance %u", instance); 2980 ddi_soft_state_free(vds_state, instance); 2981 return (DDI_FAILURE); 2982 } 2983 2984 2985 vds->dip = dip; 2986 vds->vd_table = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS, 2987 vds_destroy_vd, 2988 sizeof (void *)); 2989 ASSERT(vds->vd_table != NULL); 2990 2991 if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) { 2992 PRN("ldi_ident_from_dip() returned errno %d", status); 2993 return (DDI_FAILURE); 2994 } 2995 vds->initialized |= VDS_LDI; 2996 2997 /* Register for MD updates */ 2998 vds_prop_spec[1].ps_val = cfg_handle; 2999 if (mdeg_register(&vds_spec, &vd_spec, vds_process_md, vds, 3000 &vds->mdeg) != MDEG_SUCCESS) { 3001 PRN("Unable to register for MD updates"); 3002 return (DDI_FAILURE); 3003 } 3004 vds->initialized |= VDS_MDEG; 3005 3006 /* Prevent auto-detaching so driver is available whenever MD changes */ 3007 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 3008 DDI_PROP_SUCCESS) { 3009 PRN("failed to set \"%s\" property for instance %u", 3010 DDI_NO_AUTODETACH, instance); 3011 } 3012 3013 ddi_report_dev(dip); 3014 return (DDI_SUCCESS); 3015 } 3016 3017 static int 3018 vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3019 { 3020 int status; 3021 3022 switch (cmd) { 3023 case DDI_ATTACH: 3024 PR0("Attaching"); 3025 if ((status = vds_do_attach(dip)) != DDI_SUCCESS) 3026 (void) vds_detach(dip, DDI_DETACH); 3027 return (status); 3028 case DDI_RESUME: 3029 PR0("No action required for DDI_RESUME"); 3030 return (DDI_SUCCESS); 3031 default: 3032 return (DDI_FAILURE); 3033 } 3034 } 3035 3036 static struct dev_ops vds_ops = { 3037 DEVO_REV, /* devo_rev */ 3038 0, /* devo_refcnt */ 3039 ddi_no_info, /* devo_getinfo */ 3040 nulldev, /* devo_identify */ 3041 nulldev, /* devo_probe */ 3042 vds_attach, /* devo_attach */ 3043 vds_detach, /* devo_detach */ 3044 nodev, /* devo_reset */ 3045 NULL, /* devo_cb_ops */ 3046 NULL, /* devo_bus_ops */ 3047 nulldev /* devo_power */ 3048 }; 3049 3050 static struct modldrv modldrv = { 3051 &mod_driverops, 3052 "virtual disk server v%I%", 3053 &vds_ops, 3054 }; 3055 3056 static struct modlinkage modlinkage = { 3057 MODREV_1, 3058 &modldrv, 3059 NULL 3060 }; 3061 3062 3063 int 3064 _init(void) 3065 { 3066 int i, status; 3067 3068 3069 if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0) 3070 return (status); 3071 if ((status = mod_install(&modlinkage)) != 0) { 3072 ddi_soft_state_fini(&vds_state); 3073 return (status); 3074 } 3075 3076 /* Fill in the bit-mask of server-supported operations */ 3077 for (i = 0; i < vds_noperations; i++) 3078 vds_operations |= 1 << (vds_operation[i].operation - 1); 3079 3080 return (0); 3081 } 3082 3083 int 3084 _info(struct modinfo *modinfop) 3085 { 3086 return (mod_info(&modlinkage, modinfop)); 3087 } 3088 3089 int 3090 _fini(void) 3091 { 3092 int status; 3093 3094 3095 if ((status = mod_remove(&modlinkage)) != 0) 3096 return (status); 3097 ddi_soft_state_fini(&vds_state); 3098 return (0); 3099 } 3100