1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Virtual disk server 31 */ 32 33 34 #include <sys/types.h> 35 #include <sys/conf.h> 36 #include <sys/crc32.h> 37 #include <sys/ddi.h> 38 #include <sys/dkio.h> 39 #include <sys/file.h> 40 #include <sys/mdeg.h> 41 #include <sys/modhash.h> 42 #include <sys/note.h> 43 #include <sys/pathname.h> 44 #include <sys/sunddi.h> 45 #include <sys/sunldi.h> 46 #include <sys/sysmacros.h> 47 #include <sys/vio_common.h> 48 #include <sys/vdsk_mailbox.h> 49 #include <sys/vdsk_common.h> 50 #include <sys/vtoc.h> 51 52 53 /* Virtual disk server initialization flags */ 54 #define VDS_LDI 0x01 55 #define VDS_MDEG 0x02 56 57 /* Virtual disk server tunable parameters */ 58 #define VDS_LDC_RETRIES 3 59 #define VDS_LDC_DELAY 1000 /* usec */ 60 #define VDS_NCHAINS 32 61 62 /* Identification parameters for MD, synthetic dkio(7i) structures, etc. */ 63 #define VDS_NAME "virtual-disk-server" 64 65 #define VD_NAME "vd" 66 #define VD_VOLUME_NAME "vdisk" 67 #define VD_ASCIILABEL "Virtual Disk" 68 69 #define VD_CHANNEL_ENDPOINT "channel-endpoint" 70 #define VD_ID_PROP "id" 71 #define VD_BLOCK_DEVICE_PROP "vds-block-device" 72 73 /* Virtual disk initialization flags */ 74 #define VD_LOCKING 0x01 75 #define VD_LDC 0x02 76 #define VD_DRING 0x04 77 #define VD_SID 0x08 78 #define VD_SEQ_NUM 0x10 79 80 /* Flags for opening/closing backing devices via LDI */ 81 #define VD_OPEN_FLAGS (FEXCL | FREAD | FWRITE) 82 83 /* 84 * By Solaris convention, slice/partition 2 represents the entire disk; 85 * unfortunately, this convention does not appear to be codified. 86 */ 87 #define VD_ENTIRE_DISK_SLICE 2 88 89 /* Return a cpp token as a string */ 90 #define STRINGIZE(token) #token 91 92 /* 93 * Print a message prefixed with the current function name to the message log 94 * (and optionally to the console for verbose boots); these macros use cpp's 95 * concatenation of string literals and C99 variable-length-argument-list 96 * macros 97 */ 98 #define PRN(...) _PRN("?%s(): "__VA_ARGS__, "") 99 #define _PRN(format, ...) \ 100 cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__) 101 102 /* Return a pointer to the "i"th vdisk dring element */ 103 #define VD_DRING_ELEM(i) ((vd_dring_entry_t *)(void *) \ 104 (vd->dring + (i)*vd->descriptor_size)) 105 106 /* Return the virtual disk client's type as a string (for use in messages) */ 107 #define VD_CLIENT(vd) \ 108 (((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" : \ 109 (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" : \ 110 (((vd)->xfer_mode == 0) ? "null client" : \ 111 "unsupported client"))) 112 113 /* Debugging macros */ 114 #ifdef DEBUG 115 116 static int vd_msglevel = 0; 117 118 119 #define PR0 if (vd_msglevel > 0) PRN 120 #define PR1 if (vd_msglevel > 1) PRN 121 #define PR2 if (vd_msglevel > 2) PRN 122 123 #define VD_DUMP_DRING_ELEM(elem) \ 124 PRN("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n", \ 125 elem->hdr.dstate, \ 126 elem->payload.operation, \ 127 elem->payload.status, \ 128 elem->payload.nbytes, \ 129 elem->payload.addr, \ 130 elem->payload.ncookies); 131 132 char * 133 vd_decode_state(int state) 134 { 135 char *str; 136 137 #define CASE_STATE(_s) case _s: str = #_s; break; 138 139 switch (state) { 140 CASE_STATE(VD_STATE_INIT) 141 CASE_STATE(VD_STATE_VER) 142 CASE_STATE(VD_STATE_ATTR) 143 CASE_STATE(VD_STATE_DRING) 144 CASE_STATE(VD_STATE_RDX) 145 CASE_STATE(VD_STATE_DATA) 146 default: str = "unknown"; break; 147 } 148 149 #undef CASE_STATE 150 151 return (str); 152 } 153 154 void 155 vd_decode_tag(vio_msg_t *msg) 156 { 157 char *tstr, *sstr, *estr; 158 159 #define CASE_TYPE(_s) case _s: tstr = #_s; break; 160 161 switch (msg->tag.vio_msgtype) { 162 CASE_TYPE(VIO_TYPE_CTRL) 163 CASE_TYPE(VIO_TYPE_DATA) 164 CASE_TYPE(VIO_TYPE_ERR) 165 default: tstr = "unknown"; break; 166 } 167 168 #undef CASE_TYPE 169 170 #define CASE_SUBTYPE(_s) case _s: sstr = #_s; break; 171 172 switch (msg->tag.vio_subtype) { 173 CASE_SUBTYPE(VIO_SUBTYPE_INFO) 174 CASE_SUBTYPE(VIO_SUBTYPE_ACK) 175 CASE_SUBTYPE(VIO_SUBTYPE_NACK) 176 default: sstr = "unknown"; break; 177 } 178 179 #undef CASE_SUBTYPE 180 181 #define CASE_ENV(_s) case _s: estr = #_s; break; 182 183 switch (msg->tag.vio_subtype_env) { 184 CASE_ENV(VIO_VER_INFO) 185 CASE_ENV(VIO_ATTR_INFO) 186 CASE_ENV(VIO_DRING_REG) 187 CASE_ENV(VIO_DRING_UNREG) 188 CASE_ENV(VIO_RDX) 189 CASE_ENV(VIO_PKT_DATA) 190 CASE_ENV(VIO_DESC_DATA) 191 CASE_ENV(VIO_DRING_DATA) 192 default: estr = "unknown"; break; 193 } 194 195 #undef CASE_ENV 196 197 PR1("(%x/%x/%x) message : (%s/%s/%s)", 198 msg->tag.vio_msgtype, msg->tag.vio_subtype, 199 msg->tag.vio_subtype_env, tstr, sstr, estr); 200 } 201 202 #else /* !DEBUG */ 203 204 #define PR0(...) 205 #define PR1(...) 206 #define PR2(...) 207 208 #define VD_DUMP_DRING_ELEM(elem) 209 210 #define vd_decode_state(_s) (NULL) 211 #define vd_decode_tag(_s) (NULL) 212 213 #endif /* DEBUG */ 214 215 216 /* 217 * Soft state structure for a vds instance 218 */ 219 typedef struct vds { 220 uint_t initialized; /* driver inst initialization flags */ 221 dev_info_t *dip; /* driver inst devinfo pointer */ 222 ldi_ident_t ldi_ident; /* driver's identifier for LDI */ 223 mod_hash_t *vd_table; /* table of virtual disks served */ 224 mdeg_handle_t mdeg; /* handle for MDEG operations */ 225 } vds_t; 226 227 /* 228 * Types of descriptor-processing tasks 229 */ 230 typedef enum vd_task_type { 231 VD_NONFINAL_RANGE_TASK, /* task for intermediate descriptor in range */ 232 VD_FINAL_RANGE_TASK, /* task for last in a range of descriptors */ 233 } vd_task_type_t; 234 235 /* 236 * Structure describing the task for processing a descriptor 237 */ 238 typedef struct vd_task { 239 struct vd *vd; /* vd instance task is for */ 240 vd_task_type_t type; /* type of descriptor task */ 241 int index; /* dring elem index for task */ 242 vio_msg_t *msg; /* VIO message task is for */ 243 size_t msglen; /* length of message content */ 244 vd_dring_payload_t *request; /* request task will perform */ 245 struct buf buf; /* buf(9s) for I/O request */ 246 ldc_mem_handle_t mhdl; /* task memory handle */ 247 } vd_task_t; 248 249 /* 250 * Soft state structure for a virtual disk instance 251 */ 252 typedef struct vd { 253 uint_t initialized; /* vdisk initialization flags */ 254 vds_t *vds; /* server for this vdisk */ 255 ddi_taskq_t *startq; /* queue for I/O start tasks */ 256 ddi_taskq_t *completionq; /* queue for completion tasks */ 257 ldi_handle_t ldi_handle[V_NUMPAR]; /* LDI slice handles */ 258 dev_t dev[V_NUMPAR]; /* dev numbers for slices */ 259 uint_t nslices; /* number of slices */ 260 size_t vdisk_size; /* number of blocks in vdisk */ 261 vd_disk_type_t vdisk_type; /* slice or entire disk */ 262 vd_disk_label_t vdisk_label; /* EFI or VTOC label */ 263 ushort_t max_xfer_sz; /* max xfer size in DEV_BSIZE */ 264 boolean_t pseudo; /* underlying pseudo dev */ 265 struct dk_efi dk_efi; /* synthetic for slice type */ 266 struct dk_geom dk_geom; /* synthetic for slice type */ 267 struct vtoc vtoc; /* synthetic for slice type */ 268 ldc_status_t ldc_state; /* LDC connection state */ 269 ldc_handle_t ldc_handle; /* handle for LDC comm */ 270 size_t max_msglen; /* largest LDC message len */ 271 vd_state_t state; /* client handshake state */ 272 uint8_t xfer_mode; /* transfer mode with client */ 273 uint32_t sid; /* client's session ID */ 274 uint64_t seq_num; /* message sequence number */ 275 uint64_t dring_ident; /* identifier of dring */ 276 ldc_dring_handle_t dring_handle; /* handle for dring ops */ 277 uint32_t descriptor_size; /* num bytes in desc */ 278 uint32_t dring_len; /* number of dring elements */ 279 caddr_t dring; /* address of dring */ 280 caddr_t vio_msgp; /* vio msg staging buffer */ 281 vd_task_t inband_task; /* task for inband descriptor */ 282 vd_task_t *dring_task; /* tasks dring elements */ 283 284 kmutex_t lock; /* protects variables below */ 285 boolean_t enabled; /* is vdisk enabled? */ 286 boolean_t reset_state; /* reset connection state? */ 287 boolean_t reset_ldc; /* reset LDC channel? */ 288 } vd_t; 289 290 typedef struct vds_operation { 291 char *namep; 292 uint8_t operation; 293 int (*start)(vd_task_t *task); 294 void (*complete)(void *arg); 295 } vds_operation_t; 296 297 typedef struct vd_ioctl { 298 uint8_t operation; /* vdisk operation */ 299 const char *operation_name; /* vdisk operation name */ 300 size_t nbytes; /* size of operation buffer */ 301 int cmd; /* corresponding ioctl cmd */ 302 const char *cmd_name; /* ioctl cmd name */ 303 void *arg; /* ioctl cmd argument */ 304 /* convert input vd_buf to output ioctl_arg */ 305 void (*copyin)(void *vd_buf, void *ioctl_arg); 306 /* convert input ioctl_arg to output vd_buf */ 307 void (*copyout)(void *ioctl_arg, void *vd_buf); 308 } vd_ioctl_t; 309 310 /* Define trivial copyin/copyout conversion function flag */ 311 #define VD_IDENTITY ((void (*)(void *, void *))-1) 312 313 314 static int vds_ldc_retries = VDS_LDC_RETRIES; 315 static int vds_ldc_delay = VDS_LDC_DELAY; 316 static void *vds_state; 317 static uint64_t vds_operations; /* see vds_operation[] definition below */ 318 319 static int vd_open_flags = VD_OPEN_FLAGS; 320 321 /* 322 * Supported protocol version pairs, from highest (newest) to lowest (oldest) 323 * 324 * Each supported major version should appear only once, paired with (and only 325 * with) its highest supported minor version number (as the protocol requires 326 * supporting all lower minor version numbers as well) 327 */ 328 static const vio_ver_t vds_version[] = {{1, 0}}; 329 static const size_t vds_num_versions = 330 sizeof (vds_version)/sizeof (vds_version[0]); 331 332 static void vd_free_dring_task(vd_t *vdp); 333 334 static int 335 vd_start_bio(vd_task_t *task) 336 { 337 int rv, status = 0; 338 vd_t *vd = task->vd; 339 vd_dring_payload_t *request = task->request; 340 struct buf *buf = &task->buf; 341 uint8_t mtype; 342 343 344 ASSERT(vd != NULL); 345 ASSERT(request != NULL); 346 ASSERT(request->slice < vd->nslices); 347 ASSERT((request->operation == VD_OP_BREAD) || 348 (request->operation == VD_OP_BWRITE)); 349 350 if (request->nbytes == 0) 351 return (EINVAL); /* no service for trivial requests */ 352 353 PR1("%s %lu bytes at block %lu", 354 (request->operation == VD_OP_BREAD) ? "Read" : "Write", 355 request->nbytes, request->addr); 356 357 bioinit(buf); 358 buf->b_flags = B_BUSY; 359 buf->b_bcount = request->nbytes; 360 buf->b_lblkno = request->addr; 361 buf->b_edev = vd->dev[request->slice]; 362 363 mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP; 364 365 /* Map memory exported by client */ 366 status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies, 367 mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R, 368 &(buf->b_un.b_addr), NULL); 369 if (status != 0) { 370 PR0("ldc_mem_map() returned err %d ", status); 371 biofini(buf); 372 return (status); 373 } 374 375 status = ldc_mem_acquire(task->mhdl, 0, buf->b_bcount); 376 if (status != 0) { 377 (void) ldc_mem_unmap(task->mhdl); 378 PR0("ldc_mem_acquire() returned err %d ", status); 379 biofini(buf); 380 return (status); 381 } 382 383 buf->b_flags |= (request->operation == VD_OP_BREAD) ? B_READ : B_WRITE; 384 385 /* Start the block I/O */ 386 if ((status = ldi_strategy(vd->ldi_handle[request->slice], buf)) == 0) 387 return (EINPROGRESS); /* will complete on completionq */ 388 389 /* Clean up after error */ 390 rv = ldc_mem_release(task->mhdl, 0, buf->b_bcount); 391 if (rv) { 392 PR0("ldc_mem_release() returned err %d ", rv); 393 } 394 rv = ldc_mem_unmap(task->mhdl); 395 if (rv) { 396 PR0("ldc_mem_unmap() returned err %d ", status); 397 } 398 399 biofini(buf); 400 return (status); 401 } 402 403 static int 404 send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen) 405 { 406 int status; 407 size_t nbytes; 408 409 do { 410 nbytes = msglen; 411 status = ldc_write(ldc_handle, msg, &nbytes); 412 if (status != EWOULDBLOCK) 413 break; 414 drv_usecwait(vds_ldc_delay); 415 } while (status == EWOULDBLOCK); 416 417 if (status != 0) { 418 if (status != ECONNRESET) 419 PR0("ldc_write() returned errno %d", status); 420 return (status); 421 } else if (nbytes != msglen) { 422 PR0("ldc_write() performed only partial write"); 423 return (EIO); 424 } 425 426 PR1("SENT %lu bytes", msglen); 427 return (0); 428 } 429 430 static void 431 vd_need_reset(vd_t *vd, boolean_t reset_ldc) 432 { 433 mutex_enter(&vd->lock); 434 vd->reset_state = B_TRUE; 435 vd->reset_ldc = reset_ldc; 436 mutex_exit(&vd->lock); 437 } 438 439 /* 440 * Reset the state of the connection with a client, if needed; reset the LDC 441 * transport as well, if needed. This function should only be called from the 442 * "vd_recv_msg", as it waits for tasks - otherwise a deadlock can occur. 443 */ 444 static void 445 vd_reset_if_needed(vd_t *vd) 446 { 447 int status = 0; 448 449 mutex_enter(&vd->lock); 450 if (!vd->reset_state) { 451 ASSERT(!vd->reset_ldc); 452 mutex_exit(&vd->lock); 453 return; 454 } 455 mutex_exit(&vd->lock); 456 457 PR0("Resetting connection state with %s", VD_CLIENT(vd)); 458 459 /* 460 * Let any asynchronous I/O complete before possibly pulling the rug 461 * out from under it; defer checking vd->reset_ldc, as one of the 462 * asynchronous tasks might set it 463 */ 464 ddi_taskq_wait(vd->completionq); 465 466 if ((vd->initialized & VD_DRING) && 467 ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)) 468 PR0("ldc_mem_dring_unmap() returned errno %d", status); 469 470 vd_free_dring_task(vd); 471 472 /* Free the staging buffer for msgs */ 473 if (vd->vio_msgp != NULL) { 474 kmem_free(vd->vio_msgp, vd->max_msglen); 475 vd->vio_msgp = NULL; 476 } 477 478 /* Free the inband message buffer */ 479 if (vd->inband_task.msg != NULL) { 480 kmem_free(vd->inband_task.msg, vd->max_msglen); 481 vd->inband_task.msg = NULL; 482 } 483 484 mutex_enter(&vd->lock); 485 486 if (vd->reset_ldc) 487 PR0("taking down LDC channel"); 488 if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0)) 489 PR0("ldc_down() returned errno %d", status); 490 491 vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING); 492 vd->state = VD_STATE_INIT; 493 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 494 495 /* Allocate the staging buffer */ 496 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 497 498 PR0("calling ldc_up\n"); 499 (void) ldc_up(vd->ldc_handle); 500 501 vd->reset_state = B_FALSE; 502 vd->reset_ldc = B_FALSE; 503 504 mutex_exit(&vd->lock); 505 } 506 507 static void vd_recv_msg(void *arg); 508 509 static void 510 vd_mark_in_reset(vd_t *vd) 511 { 512 int status; 513 514 PR0("vd_mark_in_reset: marking vd in reset\n"); 515 516 vd_need_reset(vd, B_FALSE); 517 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP); 518 if (status == DDI_FAILURE) { 519 PR0("cannot schedule task to recv msg\n"); 520 vd_need_reset(vd, B_TRUE); 521 return; 522 } 523 } 524 525 static int 526 vd_mark_elem_done(vd_t *vd, int idx, int elem_status) 527 { 528 boolean_t accepted; 529 int status; 530 vd_dring_entry_t *elem = VD_DRING_ELEM(idx); 531 532 if (vd->reset_state) 533 return (0); 534 535 /* Acquire the element */ 536 if (!vd->reset_state && 537 (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { 538 if (status == ECONNRESET) { 539 vd_mark_in_reset(vd); 540 return (0); 541 } else { 542 PR0("ldc_mem_dring_acquire() returned errno %d", 543 status); 544 return (status); 545 } 546 } 547 548 /* Set the element's status and mark it done */ 549 accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED); 550 if (accepted) { 551 elem->payload.status = elem_status; 552 elem->hdr.dstate = VIO_DESC_DONE; 553 } else { 554 /* Perhaps client timed out waiting for I/O... */ 555 PR0("element %u no longer \"accepted\"", idx); 556 VD_DUMP_DRING_ELEM(elem); 557 } 558 /* Release the element */ 559 if (!vd->reset_state && 560 (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 561 if (status == ECONNRESET) { 562 vd_mark_in_reset(vd); 563 return (0); 564 } else { 565 PR0("ldc_mem_dring_release() returned errno %d", 566 status); 567 return (status); 568 } 569 } 570 571 return (accepted ? 0 : EINVAL); 572 } 573 574 static void 575 vd_complete_bio(void *arg) 576 { 577 int status = 0; 578 vd_task_t *task = (vd_task_t *)arg; 579 vd_t *vd = task->vd; 580 vd_dring_payload_t *request = task->request; 581 struct buf *buf = &task->buf; 582 583 584 ASSERT(vd != NULL); 585 ASSERT(request != NULL); 586 ASSERT(task->msg != NULL); 587 ASSERT(task->msglen >= sizeof (*task->msg)); 588 589 /* Wait for the I/O to complete */ 590 request->status = biowait(buf); 591 592 /* Release the buffer */ 593 if (!vd->reset_state) 594 status = ldc_mem_release(task->mhdl, 0, buf->b_bcount); 595 if (status) { 596 PR0("ldc_mem_release() returned errno %d copying to " 597 "client", status); 598 if (status == ECONNRESET) { 599 vd_mark_in_reset(vd); 600 } 601 } 602 603 /* Unmap the memory, even if in reset */ 604 status = ldc_mem_unmap(task->mhdl); 605 if (status) { 606 PR0("ldc_mem_unmap() returned errno %d copying to client", 607 status); 608 if (status == ECONNRESET) { 609 vd_mark_in_reset(vd); 610 } 611 } 612 613 biofini(buf); 614 615 /* Update the dring element for a dring client */ 616 if (!vd->reset_state && (status == 0) && 617 (vd->xfer_mode == VIO_DRING_MODE)) { 618 status = vd_mark_elem_done(vd, task->index, request->status); 619 if (status == ECONNRESET) 620 vd_mark_in_reset(vd); 621 } 622 623 /* 624 * If a transport error occurred, arrange to "nack" the message when 625 * the final task in the descriptor element range completes 626 */ 627 if (status != 0) 628 task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 629 630 /* 631 * Only the final task for a range of elements will respond to and 632 * free the message 633 */ 634 if (task->type == VD_NONFINAL_RANGE_TASK) { 635 return; 636 } 637 638 /* 639 * Send the "ack" or "nack" back to the client; if sending the message 640 * via LDC fails, arrange to reset both the connection state and LDC 641 * itself 642 */ 643 PR1("Sending %s", 644 (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK"); 645 if (!vd->reset_state) { 646 status = send_msg(vd->ldc_handle, task->msg, task->msglen); 647 switch (status) { 648 case 0: 649 break; 650 case ECONNRESET: 651 vd_mark_in_reset(vd); 652 break; 653 default: 654 PR0("initiating full reset"); 655 vd_need_reset(vd, B_TRUE); 656 break; 657 } 658 } 659 } 660 661 static void 662 vd_geom2dk_geom(void *vd_buf, void *ioctl_arg) 663 { 664 VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg); 665 } 666 667 static void 668 vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg) 669 { 670 VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg); 671 } 672 673 static void 674 dk_geom2vd_geom(void *ioctl_arg, void *vd_buf) 675 { 676 DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf); 677 } 678 679 static void 680 vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf) 681 { 682 VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf); 683 } 684 685 static void 686 vd_get_efi_in(void *vd_buf, void *ioctl_arg) 687 { 688 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 689 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 690 691 dk_efi->dki_lba = vd_efi->lba; 692 dk_efi->dki_length = vd_efi->length; 693 dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP); 694 } 695 696 static void 697 vd_get_efi_out(void *ioctl_arg, void *vd_buf) 698 { 699 int len; 700 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 701 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 702 703 len = vd_efi->length; 704 DK_EFI2VD_EFI(dk_efi, vd_efi); 705 kmem_free(dk_efi->dki_data, len); 706 } 707 708 static void 709 vd_set_efi_in(void *vd_buf, void *ioctl_arg) 710 { 711 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 712 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 713 714 dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP); 715 VD_EFI2DK_EFI(vd_efi, dk_efi); 716 } 717 718 static void 719 vd_set_efi_out(void *ioctl_arg, void *vd_buf) 720 { 721 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 722 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 723 724 kmem_free(dk_efi->dki_data, vd_efi->length); 725 } 726 727 static int 728 vd_read_vtoc(ldi_handle_t handle, struct vtoc *vtoc, vd_disk_label_t *label) 729 { 730 int status, rval; 731 struct dk_gpt *efi; 732 size_t efi_len; 733 734 *label = VD_DISK_LABEL_UNK; 735 736 status = ldi_ioctl(handle, DKIOCGVTOC, (intptr_t)vtoc, 737 (vd_open_flags | FKIOCTL), kcred, &rval); 738 739 if (status == 0) { 740 *label = VD_DISK_LABEL_VTOC; 741 return (0); 742 } else if (status != ENOTSUP) { 743 PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status); 744 return (status); 745 } 746 747 status = vds_efi_alloc_and_read(handle, &efi, &efi_len); 748 749 if (status) { 750 PR0("vds_efi_alloc_and_read returned error %d", status); 751 return (status); 752 } 753 754 *label = VD_DISK_LABEL_EFI; 755 vd_efi_to_vtoc(efi, vtoc); 756 vd_efi_free(efi, efi_len); 757 758 return (0); 759 } 760 761 static int 762 vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg) 763 { 764 dk_efi_t *dk_ioc; 765 766 switch (vd->vdisk_label) { 767 768 case VD_DISK_LABEL_VTOC: 769 770 switch (cmd) { 771 case DKIOCGGEOM: 772 ASSERT(ioctl_arg != NULL); 773 bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom)); 774 return (0); 775 case DKIOCGVTOC: 776 ASSERT(ioctl_arg != NULL); 777 bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc)); 778 return (0); 779 default: 780 return (ENOTSUP); 781 } 782 783 case VD_DISK_LABEL_EFI: 784 785 switch (cmd) { 786 case DKIOCGETEFI: 787 ASSERT(ioctl_arg != NULL); 788 dk_ioc = (dk_efi_t *)ioctl_arg; 789 if (dk_ioc->dki_length < vd->dk_efi.dki_length) 790 return (EINVAL); 791 bcopy(vd->dk_efi.dki_data, dk_ioc->dki_data, 792 vd->dk_efi.dki_length); 793 return (0); 794 default: 795 return (ENOTSUP); 796 } 797 798 default: 799 return (ENOTSUP); 800 } 801 } 802 803 static int 804 vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl) 805 { 806 int rval = 0, status; 807 size_t nbytes = request->nbytes; /* modifiable copy */ 808 809 810 ASSERT(request->slice < vd->nslices); 811 PR0("Performing %s", ioctl->operation_name); 812 813 /* Get data from client and convert, if necessary */ 814 if (ioctl->copyin != NULL) { 815 ASSERT(nbytes != 0 && buf != NULL); 816 PR1("Getting \"arg\" data from client"); 817 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 818 request->cookie, request->ncookies, 819 LDC_COPY_IN)) != 0) { 820 PR0("ldc_mem_copy() returned errno %d " 821 "copying from client", status); 822 return (status); 823 } 824 825 /* Convert client's data, if necessary */ 826 if (ioctl->copyin == VD_IDENTITY) /* use client buffer */ 827 ioctl->arg = buf; 828 else /* convert client vdisk operation data to ioctl data */ 829 (ioctl->copyin)(buf, (void *)ioctl->arg); 830 } 831 832 /* 833 * Handle single-slice block devices internally; otherwise, have the 834 * real driver perform the ioctl() 835 */ 836 if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) { 837 if ((status = vd_do_slice_ioctl(vd, ioctl->cmd, 838 (void *)ioctl->arg)) != 0) 839 return (status); 840 } else if ((status = ldi_ioctl(vd->ldi_handle[request->slice], 841 ioctl->cmd, (intptr_t)ioctl->arg, (vd_open_flags | FKIOCTL), 842 kcred, &rval)) != 0) { 843 PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status); 844 return (status); 845 } 846 #ifdef DEBUG 847 if (rval != 0) { 848 PR0("%s set rval = %d, which is not being returned to client", 849 ioctl->cmd_name, rval); 850 } 851 #endif /* DEBUG */ 852 853 /* Convert data and send to client, if necessary */ 854 if (ioctl->copyout != NULL) { 855 ASSERT(nbytes != 0 && buf != NULL); 856 PR1("Sending \"arg\" data to client"); 857 858 /* Convert ioctl data to vdisk operation data, if necessary */ 859 if (ioctl->copyout != VD_IDENTITY) 860 (ioctl->copyout)((void *)ioctl->arg, buf); 861 862 if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, 863 request->cookie, request->ncookies, 864 LDC_COPY_OUT)) != 0) { 865 PR0("ldc_mem_copy() returned errno %d " 866 "copying to client", status); 867 return (status); 868 } 869 } 870 871 return (status); 872 } 873 874 /* 875 * Open any slices which have become non-empty as a result of performing a 876 * set-VTOC operation for the client. 877 * 878 * When serving a full disk, vds attempts to exclusively open all of the 879 * disk's slices to prevent another thread or process in the service domain 880 * from "stealing" a slice or from performing I/O to a slice while a vds 881 * client is accessing it. Unfortunately, underlying drivers, such as sd(7d) 882 * and cmdk(7d), return an error when attempting to open the device file for a 883 * slice which is currently empty according to the VTOC. This driver behavior 884 * means that vds must skip opening empty slices when initializing a vdisk for 885 * full-disk service and try to open slices that become non-empty (via a 886 * set-VTOC operation) during use of the full disk in order to begin serving 887 * such slices to the client. This approach has an inherent (and therefore 888 * unavoidable) race condition; it also means that failure to open a 889 * newly-non-empty slice has different semantics than failure to open an 890 * initially-non-empty slice: Due to driver bahavior, opening a 891 * newly-non-empty slice is a necessary side effect of vds performing a 892 * (successful) set-VTOC operation for a client on an in-service (and in-use) 893 * disk in order to begin serving the slice; failure of this side-effect 894 * operation does not mean that the client's set-VTOC operation failed or that 895 * operations on other slices must fail. Therefore, this function prints an 896 * error message on failure to open a slice, but does not return an error to 897 * its caller--unlike failure to open a slice initially, which results in an 898 * error that prevents serving the vdisk (and thereby requires an 899 * administrator to resolve the problem). Note that, apart from another 900 * thread or process opening a new slice during the race-condition window, 901 * failure to open a slice in this function will likely indicate an underlying 902 * drive problem, which will also likely become evident in errors returned by 903 * operations on other slices, and which will require administrative 904 * intervention and possibly servicing the drive. 905 */ 906 static void 907 vd_open_new_slices(vd_t *vd) 908 { 909 int status; 910 struct vtoc vtoc; 911 912 /* Get the (new) partitions for updated slice sizes */ 913 if ((status = vd_read_vtoc(vd->ldi_handle[0], &vtoc, 914 &vd->vdisk_label)) != 0) { 915 PR0("vd_read_vtoc returned error %d", status); 916 return; 917 } 918 919 /* Open any newly-non-empty slices */ 920 for (int slice = 0; slice < vd->nslices; slice++) { 921 /* Skip zero-length slices */ 922 if (vtoc.v_part[slice].p_size == 0) { 923 if (vd->ldi_handle[slice] != NULL) 924 PR0("Open slice %u now has zero length", slice); 925 continue; 926 } 927 928 /* Skip already-open slices */ 929 if (vd->ldi_handle[slice] != NULL) 930 continue; 931 932 PR0("Opening newly-non-empty slice %u", slice); 933 if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 934 vd_open_flags, kcred, &vd->ldi_handle[slice], 935 vd->vds->ldi_ident)) != 0) { 936 PR0("ldi_open_by_dev() returned errno %d " 937 "for slice %u", status, slice); 938 } 939 } 940 } 941 942 #define RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t)) 943 static int 944 vd_ioctl(vd_task_t *task) 945 { 946 int i, status; 947 void *buf = NULL; 948 struct dk_geom dk_geom = {0}; 949 struct vtoc vtoc = {0}; 950 struct dk_efi dk_efi = {0}; 951 vd_t *vd = task->vd; 952 vd_dring_payload_t *request = task->request; 953 vd_ioctl_t ioctl[] = { 954 /* Command (no-copy) operations */ 955 {VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0, 956 DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE), 957 NULL, NULL, NULL}, 958 959 /* "Get" (copy-out) operations */ 960 {VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int), 961 DKIOCGETWCE, STRINGIZE(DKIOCGETWCE), 962 NULL, VD_IDENTITY, VD_IDENTITY}, 963 {VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM), 964 RNDSIZE(vd_geom_t), 965 DKIOCGGEOM, STRINGIZE(DKIOCGGEOM), 966 &dk_geom, NULL, dk_geom2vd_geom}, 967 {VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t), 968 DKIOCGVTOC, STRINGIZE(DKIOCGVTOC), 969 &vtoc, NULL, vtoc2vd_vtoc}, 970 {VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t), 971 DKIOCGETEFI, STRINGIZE(DKIOCGETEFI), 972 &dk_efi, vd_get_efi_in, vd_get_efi_out}, 973 974 /* "Set" (copy-in) operations */ 975 {VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int), 976 DKIOCSETWCE, STRINGIZE(DKIOCSETWCE), 977 NULL, VD_IDENTITY, VD_IDENTITY}, 978 {VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM), 979 RNDSIZE(vd_geom_t), 980 DKIOCSGEOM, STRINGIZE(DKIOCSGEOM), 981 &dk_geom, vd_geom2dk_geom, NULL}, 982 {VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t), 983 DKIOCSVTOC, STRINGIZE(DKIOCSVTOC), 984 &vtoc, vd_vtoc2vtoc, NULL}, 985 {VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t), 986 DKIOCSETEFI, STRINGIZE(DKIOCSETEFI), 987 &dk_efi, vd_set_efi_in, vd_set_efi_out}, 988 }; 989 size_t nioctls = (sizeof (ioctl))/(sizeof (ioctl[0])); 990 991 992 ASSERT(vd != NULL); 993 ASSERT(request != NULL); 994 ASSERT(request->slice < vd->nslices); 995 996 /* 997 * Determine ioctl corresponding to caller's "operation" and 998 * validate caller's "nbytes" 999 */ 1000 for (i = 0; i < nioctls; i++) { 1001 if (request->operation == ioctl[i].operation) { 1002 /* LDC memory operations require 8-byte multiples */ 1003 ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0); 1004 1005 if (request->operation == VD_OP_GET_EFI || 1006 request->operation == VD_OP_SET_EFI) { 1007 if (request->nbytes >= ioctl[i].nbytes) 1008 break; 1009 PR0("%s: Expected at least nbytes = %lu, " 1010 "got %lu", ioctl[i].operation_name, 1011 ioctl[i].nbytes, request->nbytes); 1012 return (EINVAL); 1013 } 1014 1015 if (request->nbytes != ioctl[i].nbytes) { 1016 PR0("%s: Expected nbytes = %lu, got %lu", 1017 ioctl[i].operation_name, ioctl[i].nbytes, 1018 request->nbytes); 1019 return (EINVAL); 1020 } 1021 1022 break; 1023 } 1024 } 1025 ASSERT(i < nioctls); /* because "operation" already validated */ 1026 1027 if (request->nbytes) 1028 buf = kmem_zalloc(request->nbytes, KM_SLEEP); 1029 status = vd_do_ioctl(vd, request, buf, &ioctl[i]); 1030 if (request->nbytes) 1031 kmem_free(buf, request->nbytes); 1032 if (vd->vdisk_type == VD_DISK_TYPE_DISK && 1033 (request->operation == VD_OP_SET_VTOC || 1034 request->operation == VD_OP_SET_EFI)) 1035 vd_open_new_slices(vd); 1036 PR0("Returning %d", status); 1037 return (status); 1038 } 1039 1040 static int 1041 vd_get_devid(vd_task_t *task) 1042 { 1043 vd_t *vd = task->vd; 1044 vd_dring_payload_t *request = task->request; 1045 vd_devid_t *vd_devid; 1046 impl_devid_t *devid; 1047 int status, bufid_len, devid_len, len; 1048 int bufbytes; 1049 1050 PR1("Get Device ID, nbytes=%ld", request->nbytes); 1051 1052 if (ddi_lyr_get_devid(vd->dev[request->slice], 1053 (ddi_devid_t *)&devid) != DDI_SUCCESS) { 1054 /* the most common failure is that no devid is available */ 1055 PR2("No Device ID"); 1056 return (ENOENT); 1057 } 1058 1059 bufid_len = request->nbytes - sizeof (vd_devid_t) + 1; 1060 devid_len = DEVID_GETLEN(devid); 1061 1062 /* 1063 * Save the buffer size here for use in deallocation. 1064 * The actual number of bytes copied is returned in 1065 * the 'nbytes' field of the request structure. 1066 */ 1067 bufbytes = request->nbytes; 1068 1069 vd_devid = kmem_zalloc(bufbytes, KM_SLEEP); 1070 vd_devid->length = devid_len; 1071 vd_devid->type = DEVID_GETTYPE(devid); 1072 1073 len = (devid_len > bufid_len)? bufid_len : devid_len; 1074 1075 bcopy(devid->did_id, vd_devid->id, len); 1076 1077 /* LDC memory operations require 8-byte multiples */ 1078 ASSERT(request->nbytes % sizeof (uint64_t) == 0); 1079 1080 if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0, 1081 &request->nbytes, request->cookie, request->ncookies, 1082 LDC_COPY_OUT)) != 0) { 1083 PR0("ldc_mem_copy() returned errno %d copying to client", 1084 status); 1085 } 1086 PR1("post mem_copy: nbytes=%ld", request->nbytes); 1087 1088 kmem_free(vd_devid, bufbytes); 1089 ddi_devid_free((ddi_devid_t)devid); 1090 1091 return (status); 1092 } 1093 1094 /* 1095 * Define the supported operations once the functions for performing them have 1096 * been defined 1097 */ 1098 static const vds_operation_t vds_operation[] = { 1099 #define X(_s) #_s, _s 1100 {X(VD_OP_BREAD), vd_start_bio, vd_complete_bio}, 1101 {X(VD_OP_BWRITE), vd_start_bio, vd_complete_bio}, 1102 {X(VD_OP_FLUSH), vd_ioctl, NULL}, 1103 {X(VD_OP_GET_WCE), vd_ioctl, NULL}, 1104 {X(VD_OP_SET_WCE), vd_ioctl, NULL}, 1105 {X(VD_OP_GET_VTOC), vd_ioctl, NULL}, 1106 {X(VD_OP_SET_VTOC), vd_ioctl, NULL}, 1107 {X(VD_OP_GET_DISKGEOM), vd_ioctl, NULL}, 1108 {X(VD_OP_SET_DISKGEOM), vd_ioctl, NULL}, 1109 {X(VD_OP_GET_EFI), vd_ioctl, NULL}, 1110 {X(VD_OP_SET_EFI), vd_ioctl, NULL}, 1111 {X(VD_OP_GET_DEVID), vd_get_devid, NULL}, 1112 #undef X 1113 }; 1114 1115 static const size_t vds_noperations = 1116 (sizeof (vds_operation))/(sizeof (vds_operation[0])); 1117 1118 /* 1119 * Process a task specifying a client I/O request 1120 */ 1121 static int 1122 vd_process_task(vd_task_t *task) 1123 { 1124 int i, status; 1125 vd_t *vd = task->vd; 1126 vd_dring_payload_t *request = task->request; 1127 1128 1129 ASSERT(vd != NULL); 1130 ASSERT(request != NULL); 1131 1132 /* Find the requested operation */ 1133 for (i = 0; i < vds_noperations; i++) 1134 if (request->operation == vds_operation[i].operation) 1135 break; 1136 if (i == vds_noperations) { 1137 PR0("Unsupported operation %u", request->operation); 1138 return (ENOTSUP); 1139 } 1140 1141 /* Handle client using absolute disk offsets */ 1142 if ((vd->vdisk_type == VD_DISK_TYPE_DISK) && 1143 (request->slice == UINT8_MAX)) 1144 request->slice = VD_ENTIRE_DISK_SLICE; 1145 1146 /* Range-check slice */ 1147 if (request->slice >= vd->nslices) { 1148 PR0("Invalid \"slice\" %u (max %u) for virtual disk", 1149 request->slice, (vd->nslices - 1)); 1150 return (EINVAL); 1151 } 1152 1153 PR1("operation : %s", vds_operation[i].namep); 1154 1155 /* Start the operation */ 1156 if ((status = vds_operation[i].start(task)) != EINPROGRESS) { 1157 PR0("operation : %s returned status %d", 1158 vds_operation[i].namep, status); 1159 request->status = status; /* op succeeded or failed */ 1160 return (0); /* but request completed */ 1161 } 1162 1163 ASSERT(vds_operation[i].complete != NULL); /* debug case */ 1164 if (vds_operation[i].complete == NULL) { /* non-debug case */ 1165 PR0("Unexpected return of EINPROGRESS " 1166 "with no I/O completion handler"); 1167 request->status = EIO; /* operation failed */ 1168 return (0); /* but request completed */ 1169 } 1170 1171 PR1("operation : kick off taskq entry for %s", vds_operation[i].namep); 1172 1173 /* Queue a task to complete the operation */ 1174 status = ddi_taskq_dispatch(vd->completionq, vds_operation[i].complete, 1175 task, DDI_SLEEP); 1176 /* ddi_taskq_dispatch(9f) guarantees success with DDI_SLEEP */ 1177 ASSERT(status == DDI_SUCCESS); 1178 1179 PR1("Operation in progress"); 1180 return (EINPROGRESS); /* completion handler will finish request */ 1181 } 1182 1183 /* 1184 * Return true if the "type", "subtype", and "env" fields of the "tag" first 1185 * argument match the corresponding remaining arguments; otherwise, return false 1186 */ 1187 boolean_t 1188 vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env) 1189 { 1190 return ((tag->vio_msgtype == type) && 1191 (tag->vio_subtype == subtype) && 1192 (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE; 1193 } 1194 1195 /* 1196 * Check whether the major/minor version specified in "ver_msg" is supported 1197 * by this server. 1198 */ 1199 static boolean_t 1200 vds_supported_version(vio_ver_msg_t *ver_msg) 1201 { 1202 for (int i = 0; i < vds_num_versions; i++) { 1203 ASSERT(vds_version[i].major > 0); 1204 ASSERT((i == 0) || 1205 (vds_version[i].major < vds_version[i-1].major)); 1206 1207 /* 1208 * If the major versions match, adjust the minor version, if 1209 * necessary, down to the highest value supported by this 1210 * server and return true so this message will get "ack"ed; 1211 * the client should also support all minor versions lower 1212 * than the value it sent 1213 */ 1214 if (ver_msg->ver_major == vds_version[i].major) { 1215 if (ver_msg->ver_minor > vds_version[i].minor) { 1216 PR0("Adjusting minor version from %u to %u", 1217 ver_msg->ver_minor, vds_version[i].minor); 1218 ver_msg->ver_minor = vds_version[i].minor; 1219 } 1220 return (B_TRUE); 1221 } 1222 1223 /* 1224 * If the message contains a higher major version number, set 1225 * the message's major/minor versions to the current values 1226 * and return false, so this message will get "nack"ed with 1227 * these values, and the client will potentially try again 1228 * with the same or a lower version 1229 */ 1230 if (ver_msg->ver_major > vds_version[i].major) { 1231 ver_msg->ver_major = vds_version[i].major; 1232 ver_msg->ver_minor = vds_version[i].minor; 1233 return (B_FALSE); 1234 } 1235 1236 /* 1237 * Otherwise, the message's major version is less than the 1238 * current major version, so continue the loop to the next 1239 * (lower) supported version 1240 */ 1241 } 1242 1243 /* 1244 * No common version was found; "ground" the version pair in the 1245 * message to terminate negotiation 1246 */ 1247 ver_msg->ver_major = 0; 1248 ver_msg->ver_minor = 0; 1249 return (B_FALSE); 1250 } 1251 1252 /* 1253 * Process a version message from a client. vds expects to receive version 1254 * messages from clients seeking service, but never issues version messages 1255 * itself; therefore, vds can ACK or NACK client version messages, but does 1256 * not expect to receive version-message ACKs or NACKs (and will treat such 1257 * messages as invalid). 1258 */ 1259 static int 1260 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1261 { 1262 vio_ver_msg_t *ver_msg = (vio_ver_msg_t *)msg; 1263 1264 1265 ASSERT(msglen >= sizeof (msg->tag)); 1266 1267 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1268 VIO_VER_INFO)) { 1269 return (ENOMSG); /* not a version message */ 1270 } 1271 1272 if (msglen != sizeof (*ver_msg)) { 1273 PR0("Expected %lu-byte version message; " 1274 "received %lu bytes", sizeof (*ver_msg), msglen); 1275 return (EBADMSG); 1276 } 1277 1278 if (ver_msg->dev_class != VDEV_DISK) { 1279 PR0("Expected device class %u (disk); received %u", 1280 VDEV_DISK, ver_msg->dev_class); 1281 return (EBADMSG); 1282 } 1283 1284 /* 1285 * We're talking to the expected kind of client; set our device class 1286 * for "ack/nack" back to the client 1287 */ 1288 ver_msg->dev_class = VDEV_DISK_SERVER; 1289 1290 /* 1291 * Check whether the (valid) version message specifies a version 1292 * supported by this server. If the version is not supported, return 1293 * EBADMSG so the message will get "nack"ed; vds_supported_version() 1294 * will have updated the message with a supported version for the 1295 * client to consider 1296 */ 1297 if (!vds_supported_version(ver_msg)) 1298 return (EBADMSG); 1299 1300 1301 /* 1302 * A version has been agreed upon; use the client's SID for 1303 * communication on this channel now 1304 */ 1305 ASSERT(!(vd->initialized & VD_SID)); 1306 vd->sid = ver_msg->tag.vio_sid; 1307 vd->initialized |= VD_SID; 1308 1309 /* 1310 * When multiple versions are supported, this function should store 1311 * the negotiated major and minor version values in the "vd" data 1312 * structure to govern further communication; in particular, note that 1313 * the client might have specified a lower minor version for the 1314 * agreed major version than specifed in the vds_version[] array. The 1315 * following assertions should help remind future maintainers to make 1316 * the appropriate changes to support multiple versions. 1317 */ 1318 ASSERT(vds_num_versions == 1); 1319 ASSERT(ver_msg->ver_major == vds_version[0].major); 1320 ASSERT(ver_msg->ver_minor == vds_version[0].minor); 1321 1322 PR0("Using major version %u, minor version %u", 1323 ver_msg->ver_major, ver_msg->ver_minor); 1324 return (0); 1325 } 1326 1327 static int 1328 vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1329 { 1330 vd_attr_msg_t *attr_msg = (vd_attr_msg_t *)msg; 1331 1332 1333 ASSERT(msglen >= sizeof (msg->tag)); 1334 1335 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1336 VIO_ATTR_INFO)) { 1337 PR0("Message is not an attribute message"); 1338 return (ENOMSG); 1339 } 1340 1341 if (msglen != sizeof (*attr_msg)) { 1342 PR0("Expected %lu-byte attribute message; " 1343 "received %lu bytes", sizeof (*attr_msg), msglen); 1344 return (EBADMSG); 1345 } 1346 1347 if (attr_msg->max_xfer_sz == 0) { 1348 PR0("Received maximum transfer size of 0 from client"); 1349 return (EBADMSG); 1350 } 1351 1352 if ((attr_msg->xfer_mode != VIO_DESC_MODE) && 1353 (attr_msg->xfer_mode != VIO_DRING_MODE)) { 1354 PR0("Client requested unsupported transfer mode"); 1355 return (EBADMSG); 1356 } 1357 1358 /* Success: valid message and transfer mode */ 1359 vd->xfer_mode = attr_msg->xfer_mode; 1360 1361 if (vd->xfer_mode == VIO_DESC_MODE) { 1362 1363 /* 1364 * The vd_dring_inband_msg_t contains one cookie; need room 1365 * for up to n-1 more cookies, where "n" is the number of full 1366 * pages plus possibly one partial page required to cover 1367 * "max_xfer_sz". Add room for one more cookie if 1368 * "max_xfer_sz" isn't an integral multiple of the page size. 1369 * Must first get the maximum transfer size in bytes. 1370 */ 1371 size_t max_xfer_bytes = attr_msg->vdisk_block_size ? 1372 attr_msg->vdisk_block_size*attr_msg->max_xfer_sz : 1373 attr_msg->max_xfer_sz; 1374 size_t max_inband_msglen = 1375 sizeof (vd_dring_inband_msg_t) + 1376 ((max_xfer_bytes/PAGESIZE + 1377 ((max_xfer_bytes % PAGESIZE) ? 1 : 0))* 1378 (sizeof (ldc_mem_cookie_t))); 1379 1380 /* 1381 * Set the maximum expected message length to 1382 * accommodate in-band-descriptor messages with all 1383 * their cookies 1384 */ 1385 vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen); 1386 1387 /* 1388 * Initialize the data structure for processing in-band I/O 1389 * request descriptors 1390 */ 1391 vd->inband_task.vd = vd; 1392 vd->inband_task.msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 1393 vd->inband_task.index = 0; 1394 vd->inband_task.type = VD_FINAL_RANGE_TASK; /* range == 1 */ 1395 } 1396 1397 /* Return the device's block size and max transfer size to the client */ 1398 attr_msg->vdisk_block_size = DEV_BSIZE; 1399 attr_msg->max_xfer_sz = vd->max_xfer_sz; 1400 1401 attr_msg->vdisk_size = vd->vdisk_size; 1402 attr_msg->vdisk_type = vd->vdisk_type; 1403 attr_msg->operations = vds_operations; 1404 PR0("%s", VD_CLIENT(vd)); 1405 1406 ASSERT(vd->dring_task == NULL); 1407 1408 return (0); 1409 } 1410 1411 static int 1412 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1413 { 1414 int status; 1415 size_t expected; 1416 ldc_mem_info_t dring_minfo; 1417 vio_dring_reg_msg_t *reg_msg = (vio_dring_reg_msg_t *)msg; 1418 1419 1420 ASSERT(msglen >= sizeof (msg->tag)); 1421 1422 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1423 VIO_DRING_REG)) { 1424 PR0("Message is not a register-dring message"); 1425 return (ENOMSG); 1426 } 1427 1428 if (msglen < sizeof (*reg_msg)) { 1429 PR0("Expected at least %lu-byte register-dring message; " 1430 "received %lu bytes", sizeof (*reg_msg), msglen); 1431 return (EBADMSG); 1432 } 1433 1434 expected = sizeof (*reg_msg) + 1435 (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0])); 1436 if (msglen != expected) { 1437 PR0("Expected %lu-byte register-dring message; " 1438 "received %lu bytes", expected, msglen); 1439 return (EBADMSG); 1440 } 1441 1442 if (vd->initialized & VD_DRING) { 1443 PR0("A dring was previously registered; only support one"); 1444 return (EBADMSG); 1445 } 1446 1447 if (reg_msg->num_descriptors > INT32_MAX) { 1448 PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)", 1449 reg_msg->ncookies, INT32_MAX, STRINGIZE(INT32_MAX)); 1450 return (EBADMSG); 1451 } 1452 1453 if (reg_msg->ncookies != 1) { 1454 /* 1455 * In addition to fixing the assertion in the success case 1456 * below, supporting drings which require more than one 1457 * "cookie" requires increasing the value of vd->max_msglen 1458 * somewhere in the code path prior to receiving the message 1459 * which results in calling this function. Note that without 1460 * making this change, the larger message size required to 1461 * accommodate multiple cookies cannot be successfully 1462 * received, so this function will not even get called. 1463 * Gracefully accommodating more dring cookies might 1464 * reasonably demand exchanging an additional attribute or 1465 * making a minor protocol adjustment 1466 */ 1467 PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies); 1468 return (EBADMSG); 1469 } 1470 1471 status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie, 1472 reg_msg->ncookies, reg_msg->num_descriptors, 1473 reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle); 1474 if (status != 0) { 1475 PR0("ldc_mem_dring_map() returned errno %d", status); 1476 return (status); 1477 } 1478 1479 /* 1480 * To remove the need for this assertion, must call 1481 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a 1482 * successful call to ldc_mem_dring_map() 1483 */ 1484 ASSERT(reg_msg->ncookies == 1); 1485 1486 if ((status = 1487 ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) { 1488 PR0("ldc_mem_dring_info() returned errno %d", status); 1489 if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0) 1490 PR0("ldc_mem_dring_unmap() returned errno %d", status); 1491 return (status); 1492 } 1493 1494 if (dring_minfo.vaddr == NULL) { 1495 PR0("Descriptor ring virtual address is NULL"); 1496 return (ENXIO); 1497 } 1498 1499 1500 /* Initialize for valid message and mapped dring */ 1501 PR1("descriptor size = %u, dring length = %u", 1502 vd->descriptor_size, vd->dring_len); 1503 vd->initialized |= VD_DRING; 1504 vd->dring_ident = 1; /* "There Can Be Only One" */ 1505 vd->dring = dring_minfo.vaddr; 1506 vd->descriptor_size = reg_msg->descriptor_size; 1507 vd->dring_len = reg_msg->num_descriptors; 1508 reg_msg->dring_ident = vd->dring_ident; 1509 1510 /* 1511 * Allocate and initialize a "shadow" array of data structures for 1512 * tasks to process I/O requests in dring elements 1513 */ 1514 vd->dring_task = 1515 kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP); 1516 for (int i = 0; i < vd->dring_len; i++) { 1517 vd->dring_task[i].vd = vd; 1518 vd->dring_task[i].index = i; 1519 vd->dring_task[i].request = &VD_DRING_ELEM(i)->payload; 1520 1521 status = ldc_mem_alloc_handle(vd->ldc_handle, 1522 &(vd->dring_task[i].mhdl)); 1523 if (status) { 1524 PR0("ldc_mem_alloc_handle() returned err %d ", status); 1525 return (ENXIO); 1526 } 1527 1528 vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP); 1529 } 1530 1531 return (0); 1532 } 1533 1534 static int 1535 vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1536 { 1537 vio_dring_unreg_msg_t *unreg_msg = (vio_dring_unreg_msg_t *)msg; 1538 1539 1540 ASSERT(msglen >= sizeof (msg->tag)); 1541 1542 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, 1543 VIO_DRING_UNREG)) { 1544 PR0("Message is not an unregister-dring message"); 1545 return (ENOMSG); 1546 } 1547 1548 if (msglen != sizeof (*unreg_msg)) { 1549 PR0("Expected %lu-byte unregister-dring message; " 1550 "received %lu bytes", sizeof (*unreg_msg), msglen); 1551 return (EBADMSG); 1552 } 1553 1554 if (unreg_msg->dring_ident != vd->dring_ident) { 1555 PR0("Expected dring ident %lu; received %lu", 1556 vd->dring_ident, unreg_msg->dring_ident); 1557 return (EBADMSG); 1558 } 1559 1560 return (0); 1561 } 1562 1563 static int 1564 process_rdx_msg(vio_msg_t *msg, size_t msglen) 1565 { 1566 ASSERT(msglen >= sizeof (msg->tag)); 1567 1568 if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) { 1569 PR0("Message is not an RDX message"); 1570 return (ENOMSG); 1571 } 1572 1573 if (msglen != sizeof (vio_rdx_msg_t)) { 1574 PR0("Expected %lu-byte RDX message; received %lu bytes", 1575 sizeof (vio_rdx_msg_t), msglen); 1576 return (EBADMSG); 1577 } 1578 1579 PR0("Valid RDX message"); 1580 return (0); 1581 } 1582 1583 static int 1584 vd_check_seq_num(vd_t *vd, uint64_t seq_num) 1585 { 1586 if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) { 1587 PR0("Received seq_num %lu; expected %lu", 1588 seq_num, (vd->seq_num + 1)); 1589 PR0("initiating soft reset"); 1590 vd_need_reset(vd, B_FALSE); 1591 return (1); 1592 } 1593 1594 vd->seq_num = seq_num; 1595 vd->initialized |= VD_SEQ_NUM; /* superfluous after first time... */ 1596 return (0); 1597 } 1598 1599 /* 1600 * Return the expected size of an inband-descriptor message with all the 1601 * cookies it claims to include 1602 */ 1603 static size_t 1604 expected_inband_size(vd_dring_inband_msg_t *msg) 1605 { 1606 return ((sizeof (*msg)) + 1607 (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0]))); 1608 } 1609 1610 /* 1611 * Process an in-band descriptor message: used with clients like OBP, with 1612 * which vds exchanges descriptors within VIO message payloads, rather than 1613 * operating on them within a descriptor ring 1614 */ 1615 static int 1616 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1617 { 1618 size_t expected; 1619 vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg; 1620 1621 1622 ASSERT(msglen >= sizeof (msg->tag)); 1623 1624 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 1625 VIO_DESC_DATA)) { 1626 PR1("Message is not an in-band-descriptor message"); 1627 return (ENOMSG); 1628 } 1629 1630 if (msglen < sizeof (*desc_msg)) { 1631 PR0("Expected at least %lu-byte descriptor message; " 1632 "received %lu bytes", sizeof (*desc_msg), msglen); 1633 return (EBADMSG); 1634 } 1635 1636 if (msglen != (expected = expected_inband_size(desc_msg))) { 1637 PR0("Expected %lu-byte descriptor message; " 1638 "received %lu bytes", expected, msglen); 1639 return (EBADMSG); 1640 } 1641 1642 if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) 1643 return (EBADMSG); 1644 1645 /* 1646 * Valid message: Set up the in-band descriptor task and process the 1647 * request. Arrange to acknowledge the client's message, unless an 1648 * error processing the descriptor task results in setting 1649 * VIO_SUBTYPE_NACK 1650 */ 1651 PR1("Valid in-band-descriptor message"); 1652 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 1653 1654 ASSERT(vd->inband_task.msg != NULL); 1655 1656 bcopy(msg, vd->inband_task.msg, msglen); 1657 vd->inband_task.msglen = msglen; 1658 1659 /* 1660 * The task request is now the payload of the message 1661 * that was just copied into the body of the task. 1662 */ 1663 desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg; 1664 vd->inband_task.request = &desc_msg->payload; 1665 1666 return (vd_process_task(&vd->inband_task)); 1667 } 1668 1669 static int 1670 vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx, 1671 vio_msg_t *msg, size_t msglen) 1672 { 1673 int status; 1674 boolean_t ready; 1675 vd_dring_entry_t *elem = VD_DRING_ELEM(idx); 1676 1677 1678 /* Accept the updated dring element */ 1679 if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { 1680 PR0("ldc_mem_dring_acquire() returned errno %d", status); 1681 return (status); 1682 } 1683 ready = (elem->hdr.dstate == VIO_DESC_READY); 1684 if (ready) { 1685 elem->hdr.dstate = VIO_DESC_ACCEPTED; 1686 } else { 1687 PR0("descriptor %u not ready", idx); 1688 VD_DUMP_DRING_ELEM(elem); 1689 } 1690 if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { 1691 PR0("ldc_mem_dring_release() returned errno %d", status); 1692 return (status); 1693 } 1694 if (!ready) 1695 return (EBUSY); 1696 1697 1698 /* Initialize a task and process the accepted element */ 1699 PR1("Processing dring element %u", idx); 1700 vd->dring_task[idx].type = type; 1701 1702 /* duplicate msg buf for cookies etc. */ 1703 bcopy(msg, vd->dring_task[idx].msg, msglen); 1704 1705 vd->dring_task[idx].msglen = msglen; 1706 if ((status = vd_process_task(&vd->dring_task[idx])) != EINPROGRESS) 1707 status = vd_mark_elem_done(vd, idx, elem->payload.status); 1708 1709 return (status); 1710 } 1711 1712 static int 1713 vd_process_element_range(vd_t *vd, int start, int end, 1714 vio_msg_t *msg, size_t msglen) 1715 { 1716 int i, n, nelem, status = 0; 1717 boolean_t inprogress = B_FALSE; 1718 vd_task_type_t type; 1719 1720 1721 ASSERT(start >= 0); 1722 ASSERT(end >= 0); 1723 1724 /* 1725 * Arrange to acknowledge the client's message, unless an error 1726 * processing one of the dring elements results in setting 1727 * VIO_SUBTYPE_NACK 1728 */ 1729 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 1730 1731 /* 1732 * Process the dring elements in the range 1733 */ 1734 nelem = ((end < start) ? end + vd->dring_len : end) - start + 1; 1735 for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) { 1736 ((vio_dring_msg_t *)msg)->end_idx = i; 1737 type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK; 1738 status = vd_process_element(vd, type, i, msg, msglen); 1739 if (status == EINPROGRESS) 1740 inprogress = B_TRUE; 1741 else if (status != 0) 1742 break; 1743 } 1744 1745 /* 1746 * If some, but not all, operations of a multi-element range are in 1747 * progress, wait for other operations to complete before returning 1748 * (which will result in "ack" or "nack" of the message). Note that 1749 * all outstanding operations will need to complete, not just the ones 1750 * corresponding to the current range of dring elements; howevever, as 1751 * this situation is an error case, performance is less critical. 1752 */ 1753 if ((nelem > 1) && (status != EINPROGRESS) && inprogress) 1754 ddi_taskq_wait(vd->completionq); 1755 1756 return (status); 1757 } 1758 1759 static int 1760 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1761 { 1762 vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg; 1763 1764 1765 ASSERT(msglen >= sizeof (msg->tag)); 1766 1767 if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, 1768 VIO_DRING_DATA)) { 1769 PR1("Message is not a dring-data message"); 1770 return (ENOMSG); 1771 } 1772 1773 if (msglen != sizeof (*dring_msg)) { 1774 PR0("Expected %lu-byte dring message; received %lu bytes", 1775 sizeof (*dring_msg), msglen); 1776 return (EBADMSG); 1777 } 1778 1779 if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) 1780 return (EBADMSG); 1781 1782 if (dring_msg->dring_ident != vd->dring_ident) { 1783 PR0("Expected dring ident %lu; received ident %lu", 1784 vd->dring_ident, dring_msg->dring_ident); 1785 return (EBADMSG); 1786 } 1787 1788 if (dring_msg->start_idx >= vd->dring_len) { 1789 PR0("\"start_idx\" = %u; must be less than %u", 1790 dring_msg->start_idx, vd->dring_len); 1791 return (EBADMSG); 1792 } 1793 1794 if ((dring_msg->end_idx < 0) || 1795 (dring_msg->end_idx >= vd->dring_len)) { 1796 PR0("\"end_idx\" = %u; must be >= 0 and less than %u", 1797 dring_msg->end_idx, vd->dring_len); 1798 return (EBADMSG); 1799 } 1800 1801 /* Valid message; process range of updated dring elements */ 1802 PR1("Processing descriptor range, start = %u, end = %u", 1803 dring_msg->start_idx, dring_msg->end_idx); 1804 return (vd_process_element_range(vd, dring_msg->start_idx, 1805 dring_msg->end_idx, msg, msglen)); 1806 } 1807 1808 static int 1809 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes) 1810 { 1811 int retry, status; 1812 size_t size = *nbytes; 1813 1814 1815 for (retry = 0, status = ETIMEDOUT; 1816 retry < vds_ldc_retries && status == ETIMEDOUT; 1817 retry++) { 1818 PR1("ldc_read() attempt %d", (retry + 1)); 1819 *nbytes = size; 1820 status = ldc_read(ldc_handle, msg, nbytes); 1821 } 1822 1823 if (status) { 1824 PR0("ldc_read() returned errno %d", status); 1825 if (status != ECONNRESET) 1826 return (ENOMSG); 1827 return (status); 1828 } else if (*nbytes == 0) { 1829 PR1("ldc_read() returned 0 and no message read"); 1830 return (ENOMSG); 1831 } 1832 1833 PR1("RCVD %lu-byte message", *nbytes); 1834 return (0); 1835 } 1836 1837 static int 1838 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1839 { 1840 int status; 1841 1842 1843 PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype, 1844 msg->tag.vio_subtype, msg->tag.vio_subtype_env); 1845 #ifdef DEBUG 1846 vd_decode_tag(msg); 1847 #endif 1848 1849 /* 1850 * Validate session ID up front, since it applies to all messages 1851 * once set 1852 */ 1853 if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) { 1854 PR0("Expected SID %u, received %u", vd->sid, 1855 msg->tag.vio_sid); 1856 return (EBADMSG); 1857 } 1858 1859 PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state)); 1860 1861 /* 1862 * Process the received message based on connection state 1863 */ 1864 switch (vd->state) { 1865 case VD_STATE_INIT: /* expect version message */ 1866 if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0) 1867 return (status); 1868 1869 /* Version negotiated, move to that state */ 1870 vd->state = VD_STATE_VER; 1871 return (0); 1872 1873 case VD_STATE_VER: /* expect attribute message */ 1874 if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0) 1875 return (status); 1876 1877 /* Attributes exchanged, move to that state */ 1878 vd->state = VD_STATE_ATTR; 1879 return (0); 1880 1881 case VD_STATE_ATTR: 1882 switch (vd->xfer_mode) { 1883 case VIO_DESC_MODE: /* expect RDX message */ 1884 if ((status = process_rdx_msg(msg, msglen)) != 0) 1885 return (status); 1886 1887 /* Ready to receive in-band descriptors */ 1888 vd->state = VD_STATE_DATA; 1889 return (0); 1890 1891 case VIO_DRING_MODE: /* expect register-dring message */ 1892 if ((status = 1893 vd_process_dring_reg_msg(vd, msg, msglen)) != 0) 1894 return (status); 1895 1896 /* One dring negotiated, move to that state */ 1897 vd->state = VD_STATE_DRING; 1898 return (0); 1899 1900 default: 1901 ASSERT("Unsupported transfer mode"); 1902 PR0("Unsupported transfer mode"); 1903 return (ENOTSUP); 1904 } 1905 1906 case VD_STATE_DRING: /* expect RDX, register-dring, or unreg-dring */ 1907 if ((status = process_rdx_msg(msg, msglen)) == 0) { 1908 /* Ready to receive data */ 1909 vd->state = VD_STATE_DATA; 1910 return (0); 1911 } else if (status != ENOMSG) { 1912 return (status); 1913 } 1914 1915 1916 /* 1917 * If another register-dring message is received, stay in 1918 * dring state in case the client sends RDX; although the 1919 * protocol allows multiple drings, this server does not 1920 * support using more than one 1921 */ 1922 if ((status = 1923 vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG) 1924 return (status); 1925 1926 /* 1927 * Acknowledge an unregister-dring message, but reset the 1928 * connection anyway: Although the protocol allows 1929 * unregistering drings, this server cannot serve a vdisk 1930 * without its only dring 1931 */ 1932 status = vd_process_dring_unreg_msg(vd, msg, msglen); 1933 return ((status == 0) ? ENOTSUP : status); 1934 1935 case VD_STATE_DATA: 1936 switch (vd->xfer_mode) { 1937 case VIO_DESC_MODE: /* expect in-band-descriptor message */ 1938 return (vd_process_desc_msg(vd, msg, msglen)); 1939 1940 case VIO_DRING_MODE: /* expect dring-data or unreg-dring */ 1941 /* 1942 * Typically expect dring-data messages, so handle 1943 * them first 1944 */ 1945 if ((status = vd_process_dring_msg(vd, msg, 1946 msglen)) != ENOMSG) 1947 return (status); 1948 1949 /* 1950 * Acknowledge an unregister-dring message, but reset 1951 * the connection anyway: Although the protocol 1952 * allows unregistering drings, this server cannot 1953 * serve a vdisk without its only dring 1954 */ 1955 status = vd_process_dring_unreg_msg(vd, msg, msglen); 1956 return ((status == 0) ? ENOTSUP : status); 1957 1958 default: 1959 ASSERT("Unsupported transfer mode"); 1960 PR0("Unsupported transfer mode"); 1961 return (ENOTSUP); 1962 } 1963 1964 default: 1965 ASSERT("Invalid client connection state"); 1966 PR0("Invalid client connection state"); 1967 return (ENOTSUP); 1968 } 1969 } 1970 1971 static int 1972 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) 1973 { 1974 int status; 1975 boolean_t reset_ldc = B_FALSE; 1976 1977 1978 /* 1979 * Check that the message is at least big enough for a "tag", so that 1980 * message processing can proceed based on tag-specified message type 1981 */ 1982 if (msglen < sizeof (vio_msg_tag_t)) { 1983 PR0("Received short (%lu-byte) message", msglen); 1984 /* Can't "nack" short message, so drop the big hammer */ 1985 PR0("initiating full reset"); 1986 vd_need_reset(vd, B_TRUE); 1987 return (EBADMSG); 1988 } 1989 1990 /* 1991 * Process the message 1992 */ 1993 switch (status = vd_do_process_msg(vd, msg, msglen)) { 1994 case 0: 1995 /* "ack" valid, successfully-processed messages */ 1996 msg->tag.vio_subtype = VIO_SUBTYPE_ACK; 1997 break; 1998 1999 case EINPROGRESS: 2000 /* The completion handler will "ack" or "nack" the message */ 2001 return (EINPROGRESS); 2002 case ENOMSG: 2003 PR0("Received unexpected message"); 2004 _NOTE(FALLTHROUGH); 2005 case EBADMSG: 2006 case ENOTSUP: 2007 /* "nack" invalid messages */ 2008 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 2009 break; 2010 2011 default: 2012 /* "nack" failed messages */ 2013 msg->tag.vio_subtype = VIO_SUBTYPE_NACK; 2014 /* An LDC error probably occurred, so try resetting it */ 2015 reset_ldc = B_TRUE; 2016 break; 2017 } 2018 2019 PR1("\tResulting in state %d (%s)", vd->state, 2020 vd_decode_state(vd->state)); 2021 2022 /* Send the "ack" or "nack" to the client */ 2023 PR1("Sending %s", 2024 (msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK"); 2025 if (send_msg(vd->ldc_handle, msg, msglen) != 0) 2026 reset_ldc = B_TRUE; 2027 2028 /* Arrange to reset the connection for nack'ed or failed messages */ 2029 if ((status != 0) || reset_ldc) { 2030 PR0("initiating %s reset", 2031 (reset_ldc) ? "full" : "soft"); 2032 vd_need_reset(vd, reset_ldc); 2033 } 2034 2035 return (status); 2036 } 2037 2038 static boolean_t 2039 vd_enabled(vd_t *vd) 2040 { 2041 boolean_t enabled; 2042 2043 2044 mutex_enter(&vd->lock); 2045 enabled = vd->enabled; 2046 mutex_exit(&vd->lock); 2047 return (enabled); 2048 } 2049 2050 static void 2051 vd_recv_msg(void *arg) 2052 { 2053 vd_t *vd = (vd_t *)arg; 2054 int rv = 0, status = 0; 2055 2056 ASSERT(vd != NULL); 2057 2058 PR2("New task to receive incoming message(s)"); 2059 2060 2061 while (vd_enabled(vd) && status == 0) { 2062 size_t msglen, msgsize; 2063 ldc_status_t lstatus; 2064 2065 /* 2066 * Receive and process a message 2067 */ 2068 vd_reset_if_needed(vd); /* can change vd->max_msglen */ 2069 2070 /* 2071 * check if channel is UP - else break out of loop 2072 */ 2073 status = ldc_status(vd->ldc_handle, &lstatus); 2074 if (lstatus != LDC_UP) { 2075 PR0("channel not up (status=%d), exiting recv loop\n", 2076 lstatus); 2077 break; 2078 } 2079 2080 ASSERT(vd->max_msglen != 0); 2081 2082 msgsize = vd->max_msglen; /* stable copy for alloc/free */ 2083 msglen = msgsize; /* actual len after recv_msg() */ 2084 2085 status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen); 2086 switch (status) { 2087 case 0: 2088 rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp, 2089 msglen); 2090 /* check if max_msglen changed */ 2091 if (msgsize != vd->max_msglen) { 2092 PR0("max_msglen changed 0x%lx to 0x%lx bytes\n", 2093 msgsize, vd->max_msglen); 2094 kmem_free(vd->vio_msgp, msgsize); 2095 vd->vio_msgp = 2096 kmem_alloc(vd->max_msglen, KM_SLEEP); 2097 } 2098 if (rv == EINPROGRESS) 2099 continue; 2100 break; 2101 2102 case ENOMSG: 2103 break; 2104 2105 case ECONNRESET: 2106 PR0("initiating soft reset (ECONNRESET)\n"); 2107 vd_need_reset(vd, B_FALSE); 2108 status = 0; 2109 break; 2110 2111 default: 2112 /* Probably an LDC failure; arrange to reset it */ 2113 PR0("initiating full reset (status=0x%x)", status); 2114 vd_need_reset(vd, B_TRUE); 2115 break; 2116 } 2117 } 2118 2119 PR2("Task finished"); 2120 } 2121 2122 static uint_t 2123 vd_handle_ldc_events(uint64_t event, caddr_t arg) 2124 { 2125 vd_t *vd = (vd_t *)(void *)arg; 2126 int status; 2127 2128 2129 ASSERT(vd != NULL); 2130 2131 if (!vd_enabled(vd)) 2132 return (LDC_SUCCESS); 2133 2134 if (event & LDC_EVT_DOWN) { 2135 PRN("LDC_EVT_DOWN: LDC channel went down"); 2136 2137 vd_need_reset(vd, B_TRUE); 2138 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 2139 DDI_SLEEP); 2140 if (status == DDI_FAILURE) { 2141 PR0("cannot schedule task to recv msg\n"); 2142 vd_need_reset(vd, B_TRUE); 2143 } 2144 } 2145 2146 if (event & LDC_EVT_RESET) { 2147 PR0("LDC_EVT_RESET: LDC channel was reset"); 2148 2149 if (vd->state != VD_STATE_INIT) { 2150 PR0("scheduling full reset"); 2151 vd_need_reset(vd, B_FALSE); 2152 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 2153 vd, DDI_SLEEP); 2154 if (status == DDI_FAILURE) { 2155 PR0("cannot schedule task to recv msg\n"); 2156 vd_need_reset(vd, B_TRUE); 2157 } 2158 2159 } else { 2160 PR0("channel already reset, ignoring...\n"); 2161 PR0("doing ldc up...\n"); 2162 (void) ldc_up(vd->ldc_handle); 2163 } 2164 2165 return (LDC_SUCCESS); 2166 } 2167 2168 if (event & LDC_EVT_UP) { 2169 PR0("EVT_UP: LDC is up\nResetting client connection state"); 2170 PR0("initiating soft reset"); 2171 vd_need_reset(vd, B_FALSE); 2172 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 2173 vd, DDI_SLEEP); 2174 if (status == DDI_FAILURE) { 2175 PR0("cannot schedule task to recv msg\n"); 2176 vd_need_reset(vd, B_TRUE); 2177 return (LDC_SUCCESS); 2178 } 2179 } 2180 2181 if (event & LDC_EVT_READ) { 2182 int status; 2183 2184 PR1("New data available"); 2185 /* Queue a task to receive the new data */ 2186 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 2187 DDI_SLEEP); 2188 2189 if (status == DDI_FAILURE) { 2190 PR0("cannot schedule task to recv msg\n"); 2191 vd_need_reset(vd, B_TRUE); 2192 } 2193 } 2194 2195 return (LDC_SUCCESS); 2196 } 2197 2198 static uint_t 2199 vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg) 2200 { 2201 _NOTE(ARGUNUSED(key, val)) 2202 (*((uint_t *)arg))++; 2203 return (MH_WALK_TERMINATE); 2204 } 2205 2206 2207 static int 2208 vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2209 { 2210 uint_t vd_present = 0; 2211 minor_t instance; 2212 vds_t *vds; 2213 2214 2215 switch (cmd) { 2216 case DDI_DETACH: 2217 /* the real work happens below */ 2218 break; 2219 case DDI_SUSPEND: 2220 PR0("No action required for DDI_SUSPEND"); 2221 return (DDI_SUCCESS); 2222 default: 2223 PR0("Unrecognized \"cmd\""); 2224 return (DDI_FAILURE); 2225 } 2226 2227 ASSERT(cmd == DDI_DETACH); 2228 instance = ddi_get_instance(dip); 2229 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { 2230 PR0("Could not get state for instance %u", instance); 2231 ddi_soft_state_free(vds_state, instance); 2232 return (DDI_FAILURE); 2233 } 2234 2235 /* Do no detach when serving any vdisks */ 2236 mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present); 2237 if (vd_present) { 2238 PR0("Not detaching because serving vdisks"); 2239 return (DDI_FAILURE); 2240 } 2241 2242 PR0("Detaching"); 2243 if (vds->initialized & VDS_MDEG) 2244 (void) mdeg_unregister(vds->mdeg); 2245 if (vds->initialized & VDS_LDI) 2246 (void) ldi_ident_release(vds->ldi_ident); 2247 mod_hash_destroy_hash(vds->vd_table); 2248 ddi_soft_state_free(vds_state, instance); 2249 return (DDI_SUCCESS); 2250 } 2251 2252 static boolean_t 2253 is_pseudo_device(dev_info_t *dip) 2254 { 2255 dev_info_t *parent, *root = ddi_root_node(); 2256 2257 2258 for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root); 2259 parent = ddi_get_parent(parent)) { 2260 if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0) 2261 return (B_TRUE); 2262 } 2263 2264 return (B_FALSE); 2265 } 2266 2267 static int 2268 vd_setup_full_disk(vd_t *vd) 2269 { 2270 int rval, status; 2271 major_t major = getmajor(vd->dev[0]); 2272 minor_t minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE; 2273 struct dk_minfo dk_minfo; 2274 2275 /* 2276 * At this point, vdisk_size is set to the size of partition 2 but 2277 * this does not represent the size of the disk because partition 2 2278 * may not cover the entire disk and its size does not include reserved 2279 * blocks. So we update vdisk_size to be the size of the entire disk. 2280 */ 2281 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO, 2282 (intptr_t)&dk_minfo, (vd_open_flags | FKIOCTL), 2283 kcred, &rval)) != 0) { 2284 PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d", 2285 status); 2286 return (status); 2287 } 2288 vd->vdisk_size = dk_minfo.dki_capacity; 2289 2290 /* Set full-disk parameters */ 2291 vd->vdisk_type = VD_DISK_TYPE_DISK; 2292 vd->nslices = (sizeof (vd->dev))/(sizeof (vd->dev[0])); 2293 2294 /* Move dev number and LDI handle to entire-disk-slice array elements */ 2295 vd->dev[VD_ENTIRE_DISK_SLICE] = vd->dev[0]; 2296 vd->dev[0] = 0; 2297 vd->ldi_handle[VD_ENTIRE_DISK_SLICE] = vd->ldi_handle[0]; 2298 vd->ldi_handle[0] = NULL; 2299 2300 /* Initialize device numbers for remaining slices and open them */ 2301 for (int slice = 0; slice < vd->nslices; slice++) { 2302 /* 2303 * Skip the entire-disk slice, as it's already open and its 2304 * device known 2305 */ 2306 if (slice == VD_ENTIRE_DISK_SLICE) 2307 continue; 2308 ASSERT(vd->dev[slice] == 0); 2309 ASSERT(vd->ldi_handle[slice] == NULL); 2310 2311 /* 2312 * Construct the device number for the current slice 2313 */ 2314 vd->dev[slice] = makedevice(major, (minor + slice)); 2315 2316 /* 2317 * At least some underlying drivers refuse to open 2318 * devices for (currently) zero-length slices, so skip 2319 * them for now 2320 */ 2321 if (vd->vtoc.v_part[slice].p_size == 0) { 2322 PR0("Skipping zero-length slice %u", slice); 2323 continue; 2324 } 2325 2326 /* 2327 * Open all non-empty slices of the disk to serve them to the 2328 * client. Slices are opened exclusively to prevent other 2329 * threads or processes in the service domain from performing 2330 * I/O to slices being accessed by a client. Failure to open 2331 * a slice results in vds not serving this disk, as the client 2332 * could attempt (and should be able) to access any non-empty 2333 * slice immediately. Any slices successfully opened before a 2334 * failure will get closed by vds_destroy_vd() as a result of 2335 * the error returned by this function. 2336 */ 2337 PR0("Opening device major %u, minor %u = slice %u", 2338 major, minor, slice); 2339 if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, 2340 vd_open_flags, kcred, &vd->ldi_handle[slice], 2341 vd->vds->ldi_ident)) != 0) { 2342 PRN("ldi_open_by_dev() returned errno %d " 2343 "for slice %u", status, slice); 2344 /* vds_destroy_vd() will close any open slices */ 2345 return (status); 2346 } 2347 } 2348 2349 return (0); 2350 } 2351 2352 static int 2353 vd_setup_partition_efi(vd_t *vd) 2354 { 2355 efi_gpt_t *gpt; 2356 efi_gpe_t *gpe; 2357 struct uuid uuid = EFI_RESERVED; 2358 uint32_t crc; 2359 int length; 2360 2361 length = sizeof (efi_gpt_t) + sizeof (efi_gpe_t); 2362 2363 gpt = kmem_zalloc(length, KM_SLEEP); 2364 gpe = (efi_gpe_t *)(gpt + 1); 2365 2366 gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE); 2367 gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); 2368 gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t)); 2369 gpt->efi_gpt_FirstUsableLBA = LE_64(0ULL); 2370 gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1); 2371 gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1); 2372 gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t)); 2373 2374 UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid); 2375 gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA; 2376 gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA; 2377 2378 CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table); 2379 gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); 2380 2381 CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table); 2382 gpt->efi_gpt_HeaderCRC32 = LE_32(~crc); 2383 2384 vd->dk_efi.dki_lba = 0; 2385 vd->dk_efi.dki_length = length; 2386 vd->dk_efi.dki_data = gpt; 2387 2388 return (0); 2389 } 2390 2391 static int 2392 vd_setup_vd(char *device_path, vd_t *vd) 2393 { 2394 int rval, status; 2395 dev_info_t *dip; 2396 struct dk_cinfo dk_cinfo; 2397 2398 /* 2399 * We need to open with FNDELAY so that opening an empty partition 2400 * does not fail. 2401 */ 2402 if ((status = ldi_open_by_name(device_path, vd_open_flags | FNDELAY, 2403 kcred, &vd->ldi_handle[0], vd->vds->ldi_ident)) != 0) { 2404 PRN("ldi_open_by_name(%s) = errno %d", device_path, status); 2405 return (status); 2406 } 2407 2408 /* 2409 * nslices must be updated now so that vds_destroy_vd() will close 2410 * the slice we have just opened in case of an error. 2411 */ 2412 vd->nslices = 1; 2413 2414 /* Get device number and size of backing device */ 2415 if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) { 2416 PRN("ldi_get_dev() returned errno %d for %s", 2417 status, device_path); 2418 return (status); 2419 } 2420 if (ldi_get_size(vd->ldi_handle[0], &vd->vdisk_size) != DDI_SUCCESS) { 2421 PRN("ldi_get_size() failed for %s", device_path); 2422 return (EIO); 2423 } 2424 vd->vdisk_size = lbtodb(vd->vdisk_size); /* convert to blocks */ 2425 2426 /* Verify backing device supports dk_cinfo, dk_geom, and vtoc */ 2427 if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO, 2428 (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred, 2429 &rval)) != 0) { 2430 PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s", 2431 status, device_path); 2432 return (status); 2433 } 2434 if (dk_cinfo.dki_partition >= V_NUMPAR) { 2435 PRN("slice %u >= maximum slice %u for %s", 2436 dk_cinfo.dki_partition, V_NUMPAR, device_path); 2437 return (EIO); 2438 } 2439 2440 status = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc, &vd->vdisk_label); 2441 2442 if (status != 0) { 2443 PRN("vd_read_vtoc returned errno %d for %s", 2444 status, device_path); 2445 return (status); 2446 } 2447 2448 if (vd->vdisk_label == VD_DISK_LABEL_VTOC && 2449 (status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM, 2450 (intptr_t)&vd->dk_geom, (vd_open_flags | FKIOCTL), 2451 kcred, &rval)) != 0) { 2452 PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s", 2453 status, device_path); 2454 return (status); 2455 } 2456 2457 /* Store the device's max transfer size for return to the client */ 2458 vd->max_xfer_sz = dk_cinfo.dki_maxtransfer; 2459 2460 2461 /* Determine if backing device is a pseudo device */ 2462 if ((dip = ddi_hold_devi_by_instance(getmajor(vd->dev[0]), 2463 dev_to_instance(vd->dev[0]), 0)) == NULL) { 2464 PRN("%s is no longer accessible", device_path); 2465 return (EIO); 2466 } 2467 vd->pseudo = is_pseudo_device(dip); 2468 ddi_release_devi(dip); 2469 if (vd->pseudo) { 2470 vd->vdisk_type = VD_DISK_TYPE_SLICE; 2471 vd->nslices = 1; 2472 return (0); /* ...and we're done */ 2473 } 2474 2475 2476 /* If slice is entire-disk slice, initialize for full disk */ 2477 if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE) 2478 return (vd_setup_full_disk(vd)); 2479 2480 2481 /* Otherwise, we have a non-entire slice of a device */ 2482 vd->vdisk_type = VD_DISK_TYPE_SLICE; 2483 vd->nslices = 1; 2484 2485 if (vd->vdisk_label == VD_DISK_LABEL_EFI) { 2486 status = vd_setup_partition_efi(vd); 2487 return (status); 2488 } 2489 2490 /* Initialize dk_geom structure for single-slice device */ 2491 if (vd->dk_geom.dkg_nsect == 0) { 2492 PR0("%s geometry claims 0 sectors per track", device_path); 2493 return (EIO); 2494 } 2495 if (vd->dk_geom.dkg_nhead == 0) { 2496 PR0("%s geometry claims 0 heads", device_path); 2497 return (EIO); 2498 } 2499 vd->dk_geom.dkg_ncyl = 2500 vd->vdisk_size/vd->dk_geom.dkg_nsect/vd->dk_geom.dkg_nhead; 2501 vd->dk_geom.dkg_acyl = 0; 2502 vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl; 2503 2504 2505 /* Initialize vtoc structure for single-slice device */ 2506 bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume, 2507 MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume))); 2508 bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part)); 2509 vd->vtoc.v_nparts = 1; 2510 vd->vtoc.v_part[0].p_tag = V_UNASSIGNED; 2511 vd->vtoc.v_part[0].p_flag = 0; 2512 vd->vtoc.v_part[0].p_start = 0; 2513 vd->vtoc.v_part[0].p_size = vd->vdisk_size; 2514 bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel, 2515 MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel))); 2516 2517 2518 return (0); 2519 } 2520 2521 static int 2522 vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id, 2523 vd_t **vdp) 2524 { 2525 char tq_name[TASKQ_NAMELEN]; 2526 int status; 2527 ddi_iblock_cookie_t iblock = NULL; 2528 ldc_attr_t ldc_attr; 2529 vd_t *vd; 2530 2531 2532 ASSERT(vds != NULL); 2533 ASSERT(device_path != NULL); 2534 ASSERT(vdp != NULL); 2535 PR0("Adding vdisk for %s", device_path); 2536 2537 if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) { 2538 PRN("No memory for virtual disk"); 2539 return (EAGAIN); 2540 } 2541 *vdp = vd; /* assign here so vds_destroy_vd() can cleanup later */ 2542 vd->vds = vds; 2543 2544 2545 /* Open vdisk and initialize parameters */ 2546 if ((status = vd_setup_vd(device_path, vd)) != 0) 2547 return (status); 2548 ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); 2549 PR0("vdisk_type = %s, pseudo = %s, nslices = %u", 2550 ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), 2551 (vd->pseudo ? "yes" : "no"), vd->nslices); 2552 2553 2554 /* Initialize locking */ 2555 if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED, 2556 &iblock) != DDI_SUCCESS) { 2557 PRN("Could not get iblock cookie."); 2558 return (EIO); 2559 } 2560 2561 mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock); 2562 vd->initialized |= VD_LOCKING; 2563 2564 2565 /* Create start and completion task queues for the vdisk */ 2566 (void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id); 2567 PR1("tq_name = %s", tq_name); 2568 if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1, 2569 TASKQ_DEFAULTPRI, 0)) == NULL) { 2570 PRN("Could not create task queue"); 2571 return (EIO); 2572 } 2573 (void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id); 2574 PR1("tq_name = %s", tq_name); 2575 if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1, 2576 TASKQ_DEFAULTPRI, 0)) == NULL) { 2577 PRN("Could not create task queue"); 2578 return (EIO); 2579 } 2580 vd->enabled = 1; /* before callback can dispatch to startq */ 2581 2582 2583 /* Bring up LDC */ 2584 ldc_attr.devclass = LDC_DEV_BLK_SVC; 2585 ldc_attr.instance = ddi_get_instance(vds->dip); 2586 ldc_attr.mode = LDC_MODE_UNRELIABLE; 2587 ldc_attr.mtu = VD_LDC_MTU; 2588 if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) { 2589 PR0("ldc_init(%lu) = errno %d", ldc_id, status); 2590 return (status); 2591 } 2592 vd->initialized |= VD_LDC; 2593 2594 if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events, 2595 (caddr_t)vd)) != 0) { 2596 PR0("ldc_reg_callback() returned errno %d", status); 2597 return (status); 2598 } 2599 2600 if ((status = ldc_open(vd->ldc_handle)) != 0) { 2601 PR0("ldc_open() returned errno %d", status); 2602 return (status); 2603 } 2604 2605 if ((status = ldc_up(vd->ldc_handle)) != 0) { 2606 PRN("ldc_up() returned errno %d", status); 2607 } 2608 2609 /* Allocate the inband task memory handle */ 2610 status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl)); 2611 if (status) { 2612 PRN("ldc_mem_alloc_handle() returned err %d ", status); 2613 return (ENXIO); 2614 } 2615 2616 /* Add the successfully-initialized vdisk to the server's table */ 2617 if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) { 2618 PRN("Error adding vdisk ID %lu to table", id); 2619 return (EIO); 2620 } 2621 2622 /* Allocate the staging buffer */ 2623 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 2624 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 2625 2626 /* store initial state */ 2627 vd->state = VD_STATE_INIT; 2628 2629 return (0); 2630 } 2631 2632 static void 2633 vd_free_dring_task(vd_t *vdp) 2634 { 2635 if (vdp->dring_task != NULL) { 2636 ASSERT(vdp->dring_len != 0); 2637 /* Free all dring_task memory handles */ 2638 for (int i = 0; i < vdp->dring_len; i++) { 2639 (void) ldc_mem_free_handle(vdp->dring_task[i].mhdl); 2640 kmem_free(vdp->dring_task[i].msg, vdp->max_msglen); 2641 vdp->dring_task[i].msg = NULL; 2642 } 2643 kmem_free(vdp->dring_task, 2644 (sizeof (*vdp->dring_task)) * vdp->dring_len); 2645 vdp->dring_task = NULL; 2646 } 2647 } 2648 2649 /* 2650 * Destroy the state associated with a virtual disk 2651 */ 2652 static void 2653 vds_destroy_vd(void *arg) 2654 { 2655 vd_t *vd = (vd_t *)arg; 2656 2657 2658 if (vd == NULL) 2659 return; 2660 2661 PR0("Destroying vdisk state"); 2662 2663 if (vd->dk_efi.dki_data != NULL) 2664 kmem_free(vd->dk_efi.dki_data, vd->dk_efi.dki_length); 2665 2666 /* Disable queuing requests for the vdisk */ 2667 if (vd->initialized & VD_LOCKING) { 2668 mutex_enter(&vd->lock); 2669 vd->enabled = 0; 2670 mutex_exit(&vd->lock); 2671 } 2672 2673 /* Drain and destroy start queue (*before* destroying completionq) */ 2674 if (vd->startq != NULL) 2675 ddi_taskq_destroy(vd->startq); /* waits for queued tasks */ 2676 2677 /* Drain and destroy completion queue (*before* shutting down LDC) */ 2678 if (vd->completionq != NULL) 2679 ddi_taskq_destroy(vd->completionq); /* waits for tasks */ 2680 2681 vd_free_dring_task(vd); 2682 2683 /* Free the staging buffer for msgs */ 2684 if (vd->vio_msgp != NULL) { 2685 kmem_free(vd->vio_msgp, vd->max_msglen); 2686 vd->vio_msgp = NULL; 2687 } 2688 2689 /* Free the inband message buffer */ 2690 if (vd->inband_task.msg != NULL) { 2691 kmem_free(vd->inband_task.msg, vd->max_msglen); 2692 vd->inband_task.msg = NULL; 2693 } 2694 2695 /* Free the inband task memory handle */ 2696 (void) ldc_mem_free_handle(vd->inband_task.mhdl); 2697 2698 /* Shut down LDC */ 2699 if (vd->initialized & VD_LDC) { 2700 if (vd->initialized & VD_DRING) 2701 (void) ldc_mem_dring_unmap(vd->dring_handle); 2702 (void) ldc_unreg_callback(vd->ldc_handle); 2703 (void) ldc_close(vd->ldc_handle); 2704 (void) ldc_fini(vd->ldc_handle); 2705 } 2706 2707 /* Close any open backing-device slices */ 2708 for (uint_t slice = 0; slice < vd->nslices; slice++) { 2709 if (vd->ldi_handle[slice] != NULL) { 2710 PR0("Closing slice %u", slice); 2711 (void) ldi_close(vd->ldi_handle[slice], 2712 vd_open_flags | FNDELAY, kcred); 2713 } 2714 } 2715 2716 /* Free lock */ 2717 if (vd->initialized & VD_LOCKING) 2718 mutex_destroy(&vd->lock); 2719 2720 /* Finally, free the vdisk structure itself */ 2721 kmem_free(vd, sizeof (*vd)); 2722 } 2723 2724 static int 2725 vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id) 2726 { 2727 int status; 2728 vd_t *vd = NULL; 2729 2730 2731 if ((status = vds_do_init_vd(vds, id, device_path, ldc_id, &vd)) != 0) 2732 vds_destroy_vd(vd); 2733 2734 return (status); 2735 } 2736 2737 static int 2738 vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel, 2739 uint64_t *ldc_id) 2740 { 2741 int num_channels; 2742 2743 2744 /* Look for channel endpoint child(ren) of the vdisk MD node */ 2745 if ((num_channels = md_scan_dag(md, vd_node, 2746 md_find_name(md, VD_CHANNEL_ENDPOINT), 2747 md_find_name(md, "fwd"), channel)) <= 0) { 2748 PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT); 2749 return (-1); 2750 } 2751 2752 /* Get the "id" value for the first channel endpoint node */ 2753 if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) { 2754 PRN("No \"%s\" property found for \"%s\" of vdisk", 2755 VD_ID_PROP, VD_CHANNEL_ENDPOINT); 2756 return (-1); 2757 } 2758 2759 if (num_channels > 1) { 2760 PRN("Using ID of first of multiple channels for this vdisk"); 2761 } 2762 2763 return (0); 2764 } 2765 2766 static int 2767 vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id) 2768 { 2769 int num_nodes, status; 2770 size_t size; 2771 mde_cookie_t *channel; 2772 2773 2774 if ((num_nodes = md_node_count(md)) <= 0) { 2775 PRN("Invalid node count in Machine Description subtree"); 2776 return (-1); 2777 } 2778 size = num_nodes*(sizeof (*channel)); 2779 channel = kmem_zalloc(size, KM_SLEEP); 2780 status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id); 2781 kmem_free(channel, size); 2782 2783 return (status); 2784 } 2785 2786 static void 2787 vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) 2788 { 2789 char *device_path = NULL; 2790 uint64_t id = 0, ldc_id = 0; 2791 2792 2793 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { 2794 PRN("Error getting vdisk \"%s\"", VD_ID_PROP); 2795 return; 2796 } 2797 PR0("Adding vdisk ID %lu", id); 2798 if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP, 2799 &device_path) != 0) { 2800 PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); 2801 return; 2802 } 2803 2804 if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) { 2805 PRN("Error getting LDC ID for vdisk %lu", id); 2806 return; 2807 } 2808 2809 if (vds_init_vd(vds, id, device_path, ldc_id) != 0) { 2810 PRN("Failed to add vdisk ID %lu", id); 2811 return; 2812 } 2813 } 2814 2815 static void 2816 vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) 2817 { 2818 uint64_t id = 0; 2819 2820 2821 if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { 2822 PRN("Unable to get \"%s\" property from vdisk's MD node", 2823 VD_ID_PROP); 2824 return; 2825 } 2826 PR0("Removing vdisk ID %lu", id); 2827 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0) 2828 PRN("No vdisk entry found for vdisk ID %lu", id); 2829 } 2830 2831 static void 2832 vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node, 2833 md_t *curr_md, mde_cookie_t curr_vd_node) 2834 { 2835 char *curr_dev, *prev_dev; 2836 uint64_t curr_id = 0, curr_ldc_id = 0; 2837 uint64_t prev_id = 0, prev_ldc_id = 0; 2838 size_t len; 2839 2840 2841 /* Validate that vdisk ID has not changed */ 2842 if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) { 2843 PRN("Error getting previous vdisk \"%s\" property", 2844 VD_ID_PROP); 2845 return; 2846 } 2847 if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) { 2848 PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP); 2849 return; 2850 } 2851 if (curr_id != prev_id) { 2852 PRN("Not changing vdisk: ID changed from %lu to %lu", 2853 prev_id, curr_id); 2854 return; 2855 } 2856 2857 /* Validate that LDC ID has not changed */ 2858 if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) { 2859 PRN("Error getting LDC ID for vdisk %lu", prev_id); 2860 return; 2861 } 2862 2863 if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) { 2864 PRN("Error getting LDC ID for vdisk %lu", curr_id); 2865 return; 2866 } 2867 if (curr_ldc_id != prev_ldc_id) { 2868 _NOTE(NOTREACHED); /* lint is confused */ 2869 PRN("Not changing vdisk: " 2870 "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id); 2871 return; 2872 } 2873 2874 /* Determine whether device path has changed */ 2875 if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP, 2876 &prev_dev) != 0) { 2877 PRN("Error getting previous vdisk \"%s\"", 2878 VD_BLOCK_DEVICE_PROP); 2879 return; 2880 } 2881 if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP, 2882 &curr_dev) != 0) { 2883 PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); 2884 return; 2885 } 2886 if (((len = strlen(curr_dev)) == strlen(prev_dev)) && 2887 (strncmp(curr_dev, prev_dev, len) == 0)) 2888 return; /* no relevant (supported) change */ 2889 2890 PR0("Changing vdisk ID %lu", prev_id); 2891 2892 /* Remove old state, which will close vdisk and reset */ 2893 if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0) 2894 PRN("No entry found for vdisk ID %lu", prev_id); 2895 2896 /* Re-initialize vdisk with new state */ 2897 if (vds_init_vd(vds, curr_id, curr_dev, curr_ldc_id) != 0) { 2898 PRN("Failed to change vdisk ID %lu", curr_id); 2899 return; 2900 } 2901 } 2902 2903 static int 2904 vds_process_md(void *arg, mdeg_result_t *md) 2905 { 2906 int i; 2907 vds_t *vds = arg; 2908 2909 2910 if (md == NULL) 2911 return (MDEG_FAILURE); 2912 ASSERT(vds != NULL); 2913 2914 for (i = 0; i < md->removed.nelem; i++) 2915 vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]); 2916 for (i = 0; i < md->match_curr.nelem; i++) 2917 vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i], 2918 md->match_curr.mdp, md->match_curr.mdep[i]); 2919 for (i = 0; i < md->added.nelem; i++) 2920 vds_add_vd(vds, md->added.mdp, md->added.mdep[i]); 2921 2922 return (MDEG_SUCCESS); 2923 } 2924 2925 static int 2926 vds_do_attach(dev_info_t *dip) 2927 { 2928 static char reg_prop[] = "reg"; /* devinfo ID prop */ 2929 2930 /* MDEG specification for a (particular) vds node */ 2931 static mdeg_prop_spec_t vds_prop_spec[] = { 2932 {MDET_PROP_STR, "name", {VDS_NAME}}, 2933 {MDET_PROP_VAL, "cfg-handle", {0}}, 2934 {MDET_LIST_END, NULL, {0}}}; 2935 static mdeg_node_spec_t vds_spec = {"virtual-device", vds_prop_spec}; 2936 2937 /* MDEG specification for matching a vd node */ 2938 static md_prop_match_t vd_prop_spec[] = { 2939 {MDET_PROP_VAL, VD_ID_PROP}, 2940 {MDET_LIST_END, NULL}}; 2941 static mdeg_node_match_t vd_spec = {"virtual-device-port", 2942 vd_prop_spec}; 2943 2944 int status; 2945 uint64_t cfg_handle; 2946 minor_t instance = ddi_get_instance(dip); 2947 vds_t *vds; 2948 2949 2950 /* 2951 * The "cfg-handle" property of a vds node in an MD contains the MD's 2952 * notion of "instance", or unique identifier, for that node; OBP 2953 * stores the value of the "cfg-handle" MD property as the value of 2954 * the "reg" property on the node in the device tree it builds from 2955 * the MD and passes to Solaris. Thus, we look up the devinfo node's 2956 * "reg" property value to uniquely identify this device instance when 2957 * registering with the MD event-generation framework. If the "reg" 2958 * property cannot be found, the device tree state is presumably so 2959 * broken that there is no point in continuing. 2960 */ 2961 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, reg_prop)) { 2962 PRN("vds \"%s\" property does not exist", reg_prop); 2963 return (DDI_FAILURE); 2964 } 2965 2966 /* Get the MD instance for later MDEG registration */ 2967 cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2968 reg_prop, -1); 2969 2970 if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) { 2971 PRN("Could not allocate state for instance %u", instance); 2972 return (DDI_FAILURE); 2973 } 2974 2975 if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { 2976 PRN("Could not get state for instance %u", instance); 2977 ddi_soft_state_free(vds_state, instance); 2978 return (DDI_FAILURE); 2979 } 2980 2981 2982 vds->dip = dip; 2983 vds->vd_table = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS, 2984 vds_destroy_vd, 2985 sizeof (void *)); 2986 ASSERT(vds->vd_table != NULL); 2987 2988 if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) { 2989 PRN("ldi_ident_from_dip() returned errno %d", status); 2990 return (DDI_FAILURE); 2991 } 2992 vds->initialized |= VDS_LDI; 2993 2994 /* Register for MD updates */ 2995 vds_prop_spec[1].ps_val = cfg_handle; 2996 if (mdeg_register(&vds_spec, &vd_spec, vds_process_md, vds, 2997 &vds->mdeg) != MDEG_SUCCESS) { 2998 PRN("Unable to register for MD updates"); 2999 return (DDI_FAILURE); 3000 } 3001 vds->initialized |= VDS_MDEG; 3002 3003 /* Prevent auto-detaching so driver is available whenever MD changes */ 3004 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 3005 DDI_PROP_SUCCESS) { 3006 PRN("failed to set \"%s\" property for instance %u", 3007 DDI_NO_AUTODETACH, instance); 3008 } 3009 3010 ddi_report_dev(dip); 3011 return (DDI_SUCCESS); 3012 } 3013 3014 static int 3015 vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3016 { 3017 int status; 3018 3019 switch (cmd) { 3020 case DDI_ATTACH: 3021 PR0("Attaching"); 3022 if ((status = vds_do_attach(dip)) != DDI_SUCCESS) 3023 (void) vds_detach(dip, DDI_DETACH); 3024 return (status); 3025 case DDI_RESUME: 3026 PR0("No action required for DDI_RESUME"); 3027 return (DDI_SUCCESS); 3028 default: 3029 return (DDI_FAILURE); 3030 } 3031 } 3032 3033 static struct dev_ops vds_ops = { 3034 DEVO_REV, /* devo_rev */ 3035 0, /* devo_refcnt */ 3036 ddi_no_info, /* devo_getinfo */ 3037 nulldev, /* devo_identify */ 3038 nulldev, /* devo_probe */ 3039 vds_attach, /* devo_attach */ 3040 vds_detach, /* devo_detach */ 3041 nodev, /* devo_reset */ 3042 NULL, /* devo_cb_ops */ 3043 NULL, /* devo_bus_ops */ 3044 nulldev /* devo_power */ 3045 }; 3046 3047 static struct modldrv modldrv = { 3048 &mod_driverops, 3049 "virtual disk server v%I%", 3050 &vds_ops, 3051 }; 3052 3053 static struct modlinkage modlinkage = { 3054 MODREV_1, 3055 &modldrv, 3056 NULL 3057 }; 3058 3059 3060 int 3061 _init(void) 3062 { 3063 int i, status; 3064 3065 3066 if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0) 3067 return (status); 3068 if ((status = mod_install(&modlinkage)) != 0) { 3069 ddi_soft_state_fini(&vds_state); 3070 return (status); 3071 } 3072 3073 /* Fill in the bit-mask of server-supported operations */ 3074 for (i = 0; i < vds_noperations; i++) 3075 vds_operations |= 1 << (vds_operation[i].operation - 1); 3076 3077 return (0); 3078 } 3079 3080 int 3081 _info(struct modinfo *modinfop) 3082 { 3083 return (mod_info(&modlinkage, modinfop)); 3084 } 3085 3086 int 3087 _fini(void) 3088 { 3089 int status; 3090 3091 3092 if ((status = mod_remove(&modlinkage)) != 0) 3093 return (status); 3094 ddi_soft_state_fini(&vds_state); 3095 return (0); 3096 } 3097