/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Virtual disk server
 */


#include <sys/types.h>
#include <sys/conf.h>
#include <sys/crc32.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/file.h>
#include <sys/mdeg.h>
#include <sys/modhash.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/sysmacros.h>
#include <sys/vio_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdsk_common.h>
#include <sys/vtoc.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <vm/seg_map.h>

/* Virtual disk server initialization flags */
#define	VDS_LDI			0x01
#define	VDS_MDEG		0x02

/* Virtual disk server tunable parameters */
#define	VDS_RETRIES		5
#define	VDS_LDC_DELAY		1000		/* 1 msec */
#define	VDS_DEV_DELAY		10000000	/* 10 secs */
#define	VDS_NCHAINS		32

/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
#define	VDS_NAME		"virtual-disk-server"

#define	VD_NAME			"vd"
#define	VD_VOLUME_NAME		"vdisk"
#define	VD_ASCIILABEL		"Virtual Disk"

#define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
#define	VD_ID_PROP		"id"
#define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
#define	VD_REG_PROP		"reg"

/* Virtual disk initialization flags */
#define	VD_DISK_READY		0x01
#define	VD_LOCKING		0x02
#define	VD_LDC			0x04
#define	VD_DRING		0x08
#define	VD_SID			0x10
#define	VD_SEQ_NUM		0x20

/* Flags for opening/closing backing devices via LDI */
#define	VD_OPEN_FLAGS		(FEXCL | FREAD | FWRITE)

/* Flags for writing to a vdisk which is a file */
#define	VD_FILE_WRITE_FLAGS	SM_ASYNC

/*
 * By Solaris convention, slice/partition 2 represents the entire disk;
 * unfortunately, this convention does not appear to be codified.
 */
#define	VD_ENTIRE_DISK_SLICE	2

/* Return a cpp token as a string */
#define	STRINGIZE(token)	#token

/*
 * Print a message prefixed with the current function name to the message log
 * (and optionally to the console for verbose boots); these macros use cpp's
 * concatenation of string literals and C99 variable-length-argument-list
 * macros
 */
#define	PRN(...)	_PRN("?%s(): "__VA_ARGS__, "")
#define	_PRN(format, ...)					\
	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)
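/*
 * Illustrative note (added): with the two-level macro above, a call such as
 * PRN("slice %d", slice) expands roughly to
 *
 *	cmn_err(CE_CONT, "?%s(): " "slice %d" "%s", __func__, slice, "");
 *
 * The trailing "" both consumes the final "%s" in the format string and
 * keeps __VA_ARGS__ non-empty when PRN() is invoked with a format string
 * alone.
 */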
/* Return a pointer to the "i"th vdisk dring element */
#define	VD_DRING_ELEM(i)	((vd_dring_entry_t *)(void *)	\
	    (vd->dring + (i)*vd->descriptor_size))

/* Return the virtual disk client's type as a string (for use in messages) */
#define	VD_CLIENT(vd)							\
	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
	    (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" :	\
		(((vd)->xfer_mode == 0) ? "null client" :		\
		    "unsupported client")))

/* For IO to raw disk on file */
#define	VD_FILE_SLICE_NONE	-1

/* Read disk label from a disk on file */
#define	VD_FILE_LABEL_READ(vd, labelp) \
	vd_file_rw(vd, VD_FILE_SLICE_NONE, VD_OP_BREAD, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/* Write disk label to a disk on file */
#define	VD_FILE_LABEL_WRITE(vd, labelp)	\
	vd_file_rw(vd, VD_FILE_SLICE_NONE, VD_OP_BWRITE, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vds instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t	vds_prop_template[] = {
	{ MDET_PROP_STR,	"name",		VDS_NAME },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};

#define	VDS_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t	vd_prop_match[] = {
	{ MDET_PROP_VAL,	VD_ID_PROP },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vd_match = {"virtual-device-port",
				    vd_prop_match};

/* Debugging macros */
#ifdef DEBUG

static int	vd_msglevel = 0;

#define	PR0 if (vd_msglevel > 0)	PRN
#define	PR1 if (vd_msglevel > 1)	PRN
#define	PR2 if (vd_msglevel > 2)	PRN

#define	VD_DUMP_DRING_ELEM(elem)					\
	PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
	    elem->hdr.dstate,						\
	    elem->payload.operation,					\
	    elem->payload.status,					\
	    elem->payload.nbytes,					\
	    elem->payload.addr,						\
	    elem->payload.ncookies);

char *
vd_decode_state(int state)
{
	char *str;

#define	CASE_STATE(_s)	case _s: str = #_s; break;

	switch (state) {
	CASE_STATE(VD_STATE_INIT)
	CASE_STATE(VD_STATE_VER)
	CASE_STATE(VD_STATE_ATTR)
	CASE_STATE(VD_STATE_DRING)
	CASE_STATE(VD_STATE_RDX)
	CASE_STATE(VD_STATE_DATA)
	default: str = "unknown"; break;
	}

#undef CASE_STATE

	return (str);
}
void
vd_decode_tag(vio_msg_t *msg)
{
	char *tstr, *sstr, *estr;

#define	CASE_TYPE(_s)	case _s: tstr = #_s; break;

	switch (msg->tag.vio_msgtype) {
	CASE_TYPE(VIO_TYPE_CTRL)
	CASE_TYPE(VIO_TYPE_DATA)
	CASE_TYPE(VIO_TYPE_ERR)
	default: tstr = "unknown"; break;
	}

#undef CASE_TYPE

#define	CASE_SUBTYPE(_s) case _s: sstr = #_s; break;

	switch (msg->tag.vio_subtype) {
	CASE_SUBTYPE(VIO_SUBTYPE_INFO)
	CASE_SUBTYPE(VIO_SUBTYPE_ACK)
	CASE_SUBTYPE(VIO_SUBTYPE_NACK)
	default: sstr = "unknown"; break;
	}

#undef CASE_SUBTYPE

#define	CASE_ENV(_s)	case _s: estr = #_s; break;

	switch (msg->tag.vio_subtype_env) {
	CASE_ENV(VIO_VER_INFO)
	CASE_ENV(VIO_ATTR_INFO)
	CASE_ENV(VIO_DRING_REG)
	CASE_ENV(VIO_DRING_UNREG)
	CASE_ENV(VIO_RDX)
	CASE_ENV(VIO_PKT_DATA)
	CASE_ENV(VIO_DESC_DATA)
	CASE_ENV(VIO_DRING_DATA)
	default: estr = "unknown"; break;
	}

#undef CASE_ENV

	PR1("(%x/%x/%x) message : (%s/%s/%s)",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, tstr, sstr, estr);
}

#else	/* !DEBUG */

#define	PR0(...)
#define	PR1(...)
#define	PR2(...)

#define	VD_DUMP_DRING_ELEM(elem)

#define	vd_decode_state(_s)	(NULL)
#define	vd_decode_tag(_s)	(NULL)

#endif	/* DEBUG */


/*
 * Soft state structure for a vds instance
 */
typedef struct vds {
	uint_t		initialized;	/* driver inst initialization flags */
	dev_info_t	*dip;		/* driver inst devinfo pointer */
	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
	mod_hash_t	*vd_table;	/* table of virtual disks served */
	mdeg_node_spec_t *ispecp;	/* mdeg node specification */
	mdeg_handle_t	mdeg;		/* handle for MDEG operations */
} vds_t;

/*
 * Types of descriptor-processing tasks
 */
typedef enum vd_task_type {
	VD_NONFINAL_RANGE_TASK,	/* task for intermediate descriptor in range */
	VD_FINAL_RANGE_TASK,	/* task for last in a range of descriptors */
} vd_task_type_t;

/*
 * Structure describing the task for processing a descriptor
 */
typedef struct vd_task {
	struct vd		*vd;		/* vd instance task is for */
	vd_task_type_t		type;		/* type of descriptor task */
	int			index;		/* dring elem index for task */
	vio_msg_t		*msg;		/* VIO message task is for */
	size_t			msglen;		/* length of message content */
	vd_dring_payload_t	*request;	/* request task will perform */
	struct buf		buf;		/* buf(9s) for I/O request */
	ldc_mem_handle_t	mhdl;		/* task memory handle */
} vd_task_t;
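/*
 * Illustrative note (added): a vd_task_t moves between the two task queues
 * declared in vd_t below.  Message handling runs on "startq" (see
 * vd_mark_in_reset, which redispatches vd_recv_msg there); if an
 * operation's start function returns EINPROGRESS, its completion routine is
 * dispatched to "completionq" (see vd_process_task), which is also the
 * queue vd_reset_if_needed drains before resetting connection state.
 */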
/*
 * Soft state structure for a virtual disk instance
 */
typedef struct vd {
	uint_t			initialized;	/* vdisk initialization flags */
	vds_t			*vds;		/* server for this vdisk */
	ddi_taskq_t		*startq;	/* queue for I/O start tasks */
	ddi_taskq_t		*completionq;	/* queue for completion tasks */
	ldi_handle_t		ldi_handle[V_NUMPAR];	/* LDI slice handles */
	char			device_path[MAXPATHLEN + 1]; /* vdisk device */
	dev_t			dev[V_NUMPAR];	/* dev numbers for slices */
	uint_t			nslices;	/* number of slices */
	size_t			vdisk_size;	/* number of blocks in vdisk */
	vd_disk_type_t		vdisk_type;	/* slice or entire disk */
	vd_disk_label_t		vdisk_label;	/* EFI or VTOC label */
	ushort_t		max_xfer_sz;	/* max xfer size in DEV_BSIZE */
	boolean_t		pseudo;		/* underlying pseudo dev */
	boolean_t		file;		/* underlying file */
	vnode_t			*file_vnode;	/* file vnode */
	size_t			file_size;	/* file size */
	struct dk_efi		dk_efi;		/* synthetic for slice type */
	struct dk_geom		dk_geom;	/* synthetic for slice type */
	struct vtoc		vtoc;		/* synthetic for slice type */
	ldc_status_t		ldc_state;	/* LDC connection state */
	ldc_handle_t		ldc_handle;	/* handle for LDC comm */
	size_t			max_msglen;	/* largest LDC message len */
	vd_state_t		state;		/* client handshake state */
	uint8_t			xfer_mode;	/* transfer mode with client */
	uint32_t		sid;		/* client's session ID */
	uint64_t		seq_num;	/* message sequence number */
	uint64_t		dring_ident;	/* identifier of dring */
	ldc_dring_handle_t	dring_handle;	/* handle for dring ops */
	uint32_t		descriptor_size;	/* num bytes in desc */
	uint32_t		dring_len;	/* number of dring elements */
	caddr_t			dring;		/* address of dring */
	caddr_t			vio_msgp;	/* vio msg staging buffer */
	vd_task_t		inband_task;	/* task for inband descriptor */
	vd_task_t		*dring_task;	/* tasks for dring elements */

	kmutex_t		lock;		/* protects variables below */
	boolean_t		enabled;	/* is vdisk enabled? */
	boolean_t		reset_state;	/* reset connection state? */
	boolean_t		reset_ldc;	/* reset LDC channel? */
} vd_t;

typedef struct vds_operation {
	char	*namep;
	uint8_t	operation;
	int	(*start)(vd_task_t *task);
	void	(*complete)(void *arg);
} vds_operation_t;

typedef struct vd_ioctl {
	uint8_t		operation;		/* vdisk operation */
	const char	*operation_name;	/* vdisk operation name */
	size_t		nbytes;			/* size of operation buffer */
	int		cmd;			/* corresponding ioctl cmd */
	const char	*cmd_name;		/* ioctl cmd name */
	void		*arg;			/* ioctl cmd argument */
	/* convert input vd_buf to output ioctl_arg */
	void		(*copyin)(void *vd_buf, void *ioctl_arg);
	/* convert input ioctl_arg to output vd_buf */
	void		(*copyout)(void *ioctl_arg, void *vd_buf);
} vd_ioctl_t;

/* Define trivial copyin/copyout conversion function flag */
#define	VD_IDENTITY	((void (*)(void *, void *))-1)


static int	vds_ldc_retries = VDS_RETRIES;
static int	vds_ldc_delay = VDS_LDC_DELAY;
static int	vds_dev_retries = VDS_RETRIES;
static int	vds_dev_delay = VDS_DEV_DELAY;
static void	*vds_state;
static uint64_t	vds_operations;	/* see vds_operation[] definition below */

static int	vd_open_flags = VD_OPEN_FLAGS;

static uint_t	vd_file_write_flags = VD_FILE_WRITE_FLAGS;

/*
 * Supported protocol version pairs, from highest (newest) to lowest (oldest)
 *
 * Each supported major version should appear only once, paired with (and only
 * with) its highest supported minor version number (as the protocol requires
 * supporting all lower minor version numbers as well)
 */
static const vio_ver_t	vds_version[] = {{1, 0}};
static const size_t	vds_num_versions =
    sizeof (vds_version)/sizeof (vds_version[0]);

static void vd_free_dring_task(vd_t *vdp);
static int vd_setup_vd(vd_t *vd);
static boolean_t vd_enabled(vd_t *vd);
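/*
 * Illustrative note (added): the delay tunables above are expressed in
 * microseconds.  vds_ldc_delay (1000 us = 1 msec) is passed directly to
 * drv_usecwait() when an LDC write would block (see send_msg), while
 * vds_dev_delay (10 seconds) is converted with drv_usectohz() before the
 * delay() between attempts to reopen a backing device (see
 * vd_process_attr_msg).
 */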
/*
 * Function:
 *	vd_file_rw
 *
 * Description:
 *	Read or write to a disk on file.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	slice		- slice on which the operation is performed,
 *			  VD_FILE_SLICE_NONE indicates that the operation
 *			  is done on the raw disk.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer into which data is read or from which
 *			  data is written.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	n >= 0		- success, n indicates the number of bytes read
 *			  or written.
 *	-1		- error.
 */
static ssize_t
vd_file_rw(vd_t *vd, int slice, int operation, caddr_t data, size_t blk,
    size_t len)
{
	caddr_t	maddr;
	size_t offset, maxlen, moffset, mlen, n;
	uint_t smflags;
	enum seg_rw srw;

	ASSERT(vd->file);
	ASSERT(len > 0);

	if (slice == VD_FILE_SLICE_NONE) {
		/* raw disk access */
		offset = blk * DEV_BSIZE;
	} else {
		ASSERT(slice >= 0 && slice < V_NUMPAR);
		if (blk >= vd->vtoc.v_part[slice].p_size) {
			/* address past the end of the slice */
			PR0("req_addr (0x%lx) > psize (0x%lx)",
			    blk, vd->vtoc.v_part[slice].p_size);
			return (0);
		}

		offset = (vd->vtoc.v_part[slice].p_start + blk) * DEV_BSIZE;

		/*
		 * If the requested size is greater than the size
		 * of the partition, truncate the read/write.
		 */
		maxlen = (vd->vtoc.v_part[slice].p_size - blk) * DEV_BSIZE;

		if (len > maxlen) {
			PR0("I/O size truncated to %lu bytes from %lu bytes",
			    maxlen, len);
			len = maxlen;
		}
	}

	/*
	 * We have to ensure that we are reading/writing into the mmap
	 * range. If we have a partial disk image (e.g. an image of
	 * s0 instead of s2) the system can try to access slices that
	 * are not included in the disk image.
	 */
	if ((offset + len) >= vd->file_size) {
		PR0("offset + nbytes (0x%lx + 0x%lx) >= "
		    "file_size (0x%lx)", offset, len, vd->file_size);
		return (-1);
	}

	srw = (operation == VD_OP_BREAD)? S_READ : S_WRITE;
	smflags = (operation == VD_OP_BREAD)? 0 :
	    (SM_WRITE | vd_file_write_flags);
	n = len;

	do {
		/*
		 * segmap_getmapflt() returns a MAXBSIZE chunk which is
		 * MAXBSIZE aligned.
		 */
		moffset = offset & MAXBOFFSET;
		mlen = MIN(MAXBSIZE - moffset, n);
		maddr = segmap_getmapflt(segkmap, vd->file_vnode, offset,
		    mlen, 1, srw);
		/*
		 * Fault in the pages so we can check for error and ensure
		 * that we can safely use the mapped address.
		 */
		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
		    F_SOFTLOCK, srw) != 0) {
			(void) segmap_release(segkmap, maddr, 0);
			return (-1);
		}

		if (operation == VD_OP_BREAD)
			bcopy(maddr + moffset, data, mlen);
		else
			bcopy(data, maddr + moffset, mlen);

		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
		    F_SOFTUNLOCK, srw) != 0) {
			(void) segmap_release(segkmap, maddr, 0);
			return (-1);
		}
		if (segmap_release(segkmap, maddr, smflags) != 0)
			return (-1);
		n -= mlen;
		offset += mlen;
		data += mlen;

	} while (n > 0);

	return (len);
}
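/*
 * Illustrative example (added): the loop above walks the file in
 * MAXBSIZE-aligned windows.  Assuming MAXBSIZE is 0x2000 (8 KB), a request
 * with offset = 0x1F00 and len = 0x2000 is satisfied in two passes:
 *
 *	pass 1: moffset = 0x1F00, mlen = MIN(0x2000 - 0x1F00, 0x2000) = 0x100
 *	pass 2: offset = 0x2000, moffset = 0, mlen = 0x1F00
 *
 * so no single bcopy() ever crosses a MAXBSIZE boundary.
 */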
static int
vd_start_bio(vd_task_t *task)
{
	int			rv, status = 0;
	vd_t			*vd = task->vd;
	vd_dring_payload_t	*request = task->request;
	struct buf		*buf = &task->buf;
	uint8_t			mtype;
	int			slice;

	ASSERT(vd != NULL);
	ASSERT(request != NULL);

	slice = request->slice;

	ASSERT(slice < vd->nslices);
	ASSERT((request->operation == VD_OP_BREAD) ||
	    (request->operation == VD_OP_BWRITE));

	if (request->nbytes == 0)
		return (EINVAL);	/* no service for trivial requests */

	PR1("%s %lu bytes at block %lu",
	    (request->operation == VD_OP_BREAD) ? "Read" : "Write",
	    request->nbytes, request->addr);

	bioinit(buf);
	buf->b_flags	= B_BUSY;
	buf->b_bcount	= request->nbytes;
	buf->b_lblkno	= request->addr;
	buf->b_edev	= vd->dev[slice];

	mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP;

	/* Map memory exported by client */
	status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies,
	    mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R,
	    &(buf->b_un.b_addr), NULL);
	if (status != 0) {
		PR0("ldc_mem_map() returned err %d ", status);
		biofini(buf);
		return (status);
	}

	status = ldc_mem_acquire(task->mhdl, 0, buf->b_bcount);
	if (status != 0) {
		(void) ldc_mem_unmap(task->mhdl);
		PR0("ldc_mem_acquire() returned err %d ", status);
		biofini(buf);
		return (status);
	}

	buf->b_flags |= (request->operation == VD_OP_BREAD) ? B_READ : B_WRITE;

	/* Start the block I/O */
	if (vd->file) {
		rv = vd_file_rw(vd, slice, request->operation, buf->b_un.b_addr,
		    request->addr, request->nbytes);
		if (rv < 0) {
			request->nbytes = 0;
			status = EIO;
		} else {
			request->nbytes = rv;
			status = 0;
		}
	} else {
		status = ldi_strategy(vd->ldi_handle[slice], buf);
		if (status == 0)
			return (EINPROGRESS); /* will complete on completionq */
	}

	/* Clean up after error */
	rv = ldc_mem_release(task->mhdl, 0, buf->b_bcount);
	if (rv) {
		PR0("ldc_mem_release() returned err %d ", rv);
	}
	rv = ldc_mem_unmap(task->mhdl);
	if (rv) {
		PR0("ldc_mem_unmap() returned err %d ", rv);
	}

	biofini(buf);
	return (status);
}

static int
send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
{
	int	status;
	size_t	nbytes;

	do {
		nbytes = msglen;
		status = ldc_write(ldc_handle, msg, &nbytes);
		if (status != EWOULDBLOCK)
			break;
		drv_usecwait(vds_ldc_delay);
	} while (status == EWOULDBLOCK);

	if (status != 0) {
		if (status != ECONNRESET)
			PR0("ldc_write() returned errno %d", status);
		return (status);
	} else if (nbytes != msglen) {
		PR0("ldc_write() performed only partial write");
		return (EIO);
	}

	PR1("SENT %lu bytes", msglen);
	return (0);
}

static void
vd_need_reset(vd_t *vd, boolean_t reset_ldc)
{
	mutex_enter(&vd->lock);
	vd->reset_state	= B_TRUE;
	vd->reset_ldc	= reset_ldc;
	mutex_exit(&vd->lock);
}
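/*
 * Illustrative note (added): vd_need_reset() distinguishes two levels of
 * reset.  A "soft" reset (reset_ldc == B_FALSE, e.g. on a sequence-number
 * mismatch in vd_check_seq_num) only rewinds the connection handshake
 * state, while a "full" reset (reset_ldc == B_TRUE, e.g. when sending a
 * reply fails in vd_complete_bio) additionally takes the LDC channel down
 * via ldc_down() in vd_reset_if_needed().
 */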
/*
 * Reset the state of the connection with a client, if needed; reset the LDC
 * transport as well, if needed.  This function should only be called from
 * vd_recv_msg(), as it waits for tasks to complete - otherwise a deadlock
 * can occur.
 */
static void
vd_reset_if_needed(vd_t *vd)
{
	int	status = 0;

	mutex_enter(&vd->lock);
	if (!vd->reset_state) {
		ASSERT(!vd->reset_ldc);
		mutex_exit(&vd->lock);
		return;
	}
	mutex_exit(&vd->lock);

	PR0("Resetting connection state with %s", VD_CLIENT(vd));

	/*
	 * Let any asynchronous I/O complete before possibly pulling the rug
	 * out from under it; defer checking vd->reset_ldc, as one of the
	 * asynchronous tasks might set it
	 */
	ddi_taskq_wait(vd->completionq);

	if (vd->file) {
		status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred);
		if (status) {
			PR0("VOP_FSYNC returned errno %d", status);
		}
	}

	if ((vd->initialized & VD_DRING) &&
	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
		PR0("ldc_mem_dring_unmap() returned errno %d", status);

	vd_free_dring_task(vd);

	/* Free the staging buffer for msgs */
	if (vd->vio_msgp != NULL) {
		kmem_free(vd->vio_msgp, vd->max_msglen);
		vd->vio_msgp = NULL;
	}

	/* Free the inband message buffer */
	if (vd->inband_task.msg != NULL) {
		kmem_free(vd->inband_task.msg, vd->max_msglen);
		vd->inband_task.msg = NULL;
	}

	mutex_enter(&vd->lock);

	if (vd->reset_ldc)
		PR0("taking down LDC channel");
	if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0))
		PR0("ldc_down() returned errno %d", status);

	vd->initialized	&= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
	vd->state	= VD_STATE_INIT;
	vd->max_msglen	= sizeof (vio_msg_t);	/* baseline vio message size */

	/* Allocate the staging buffer */
	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);

	PR0("calling ldc_up\n");
	(void) ldc_up(vd->ldc_handle);

	vd->reset_state	= B_FALSE;
	vd->reset_ldc	= B_FALSE;

	mutex_exit(&vd->lock);
}

static void vd_recv_msg(void *arg);

static void
vd_mark_in_reset(vd_t *vd)
{
	int status;

	PR0("vd_mark_in_reset: marking vd in reset\n");

	vd_need_reset(vd, B_FALSE);
	status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP);
	if (status == DDI_FAILURE) {
		PR0("cannot schedule task to recv msg\n");
		vd_need_reset(vd, B_TRUE);
		return;
	}
}
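/*
 * Illustrative note (added): each dring descriptor cycles through the VIO
 * states
 *
 *	VIO_DESC_READY -> VIO_DESC_ACCEPTED -> VIO_DESC_DONE
 *
 * vd_process_element() moves a descriptor from READY to ACCEPTED when it
 * picks the element up, and vd_mark_elem_done() below moves it from
 * ACCEPTED to DONE once the request's status and byte count have been
 * filled in.
 */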
static int
vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes)
{
	boolean_t		accepted;
	int			status;
	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);

	if (vd->reset_state)
		return (0);

	/* Acquire the element */
	if (!vd->reset_state &&
	    (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
			return (0);
		} else {
			PR0("ldc_mem_dring_acquire() returned errno %d",
			    status);
			return (status);
		}
	}

	/* Set the element's status and mark it done */
	accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED);
	if (accepted) {
		elem->payload.nbytes	= elem_nbytes;
		elem->payload.status	= elem_status;
		elem->hdr.dstate	= VIO_DESC_DONE;
	} else {
		/* Perhaps client timed out waiting for I/O... */
		PR0("element %u no longer \"accepted\"", idx);
		VD_DUMP_DRING_ELEM(elem);
	}
	/* Release the element */
	if (!vd->reset_state &&
	    (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
			return (0);
		} else {
			PR0("ldc_mem_dring_release() returned errno %d",
			    status);
			return (status);
		}
	}

	return (accepted ? 0 : EINVAL);
}

static void
vd_complete_bio(void *arg)
{
	int			status		= 0;
	vd_task_t		*task		= (vd_task_t *)arg;
	vd_t			*vd		= task->vd;
	vd_dring_payload_t	*request	= task->request;
	struct buf		*buf		= &task->buf;


	ASSERT(vd != NULL);
	ASSERT(request != NULL);
	ASSERT(task->msg != NULL);
	ASSERT(task->msglen >= sizeof (*task->msg));
	ASSERT(!vd->file);

	/* Wait for the I/O to complete */
	request->status = biowait(buf);

	/* return back the number of bytes read/written */
	request->nbytes = buf->b_bcount - buf->b_resid;

	/* Release the buffer */
	if (!vd->reset_state)
		status = ldc_mem_release(task->mhdl, 0, buf->b_bcount);
	if (status) {
		PR0("ldc_mem_release() returned errno %d copying to "
		    "client", status);
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
		}
	}

	/* Unmap the memory, even if in reset */
	status = ldc_mem_unmap(task->mhdl);
	if (status) {
		PR0("ldc_mem_unmap() returned errno %d copying to client",
		    status);
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
		}
	}

	biofini(buf);

	/* Update the dring element for a dring client */
	if (!vd->reset_state && (status == 0) &&
	    (vd->xfer_mode == VIO_DRING_MODE)) {
		status = vd_mark_elem_done(vd, task->index,
		    request->status, request->nbytes);
		if (status == ECONNRESET)
			vd_mark_in_reset(vd);
	}

	/*
	 * If a transport error occurred, arrange to "nack" the message when
	 * the final task in the descriptor element range completes
	 */
	if (status != 0)
		task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK;

	/*
	 * Only the final task for a range of elements will respond to and
	 * free the message
	 */
	if (task->type == VD_NONFINAL_RANGE_TASK) {
		return;
	}

	/*
	 * Send the "ack" or "nack" back to the client; if sending the message
	 * via LDC fails, arrange to reset both the connection state and LDC
	 * itself
	 */
	PR1("Sending %s",
	    (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
	if (!vd->reset_state) {
		status = send_msg(vd->ldc_handle, task->msg, task->msglen);
		switch (status) {
		case 0:
			break;
		case ECONNRESET:
			vd_mark_in_reset(vd);
			break;
		default:
			PR0("initiating full reset");
			vd_need_reset(vd, B_TRUE);
			break;
		}
	}
}
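/*
 * Illustrative note (added): the helpers that follow convert between the
 * structures exchanged on the wire (vd_geom_t, vd_vtoc_t, vd_efi_t) and the
 * native dkio(7i) structures (struct dk_geom, struct vtoc, dk_efi_t) that
 * the ioctls operate on.  They are installed as the "copyin"/"copyout"
 * hooks of the vd_ioctl_t entries in vd_ioctl(); VD_IDENTITY marks
 * operations whose client buffer can be used directly, with no conversion.
 */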
"ACK" : "NACK"); 851 if (!vd->reset_state) { 852 status = send_msg(vd->ldc_handle, task->msg, task->msglen); 853 switch (status) { 854 case 0: 855 break; 856 case ECONNRESET: 857 vd_mark_in_reset(vd); 858 break; 859 default: 860 PR0("initiating full reset"); 861 vd_need_reset(vd, B_TRUE); 862 break; 863 } 864 } 865 } 866 867 static void 868 vd_geom2dk_geom(void *vd_buf, void *ioctl_arg) 869 { 870 VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg); 871 } 872 873 static void 874 vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg) 875 { 876 VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg); 877 } 878 879 static void 880 dk_geom2vd_geom(void *ioctl_arg, void *vd_buf) 881 { 882 DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf); 883 } 884 885 static void 886 vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf) 887 { 888 VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf); 889 } 890 891 static void 892 vd_get_efi_in(void *vd_buf, void *ioctl_arg) 893 { 894 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 895 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 896 897 dk_efi->dki_lba = vd_efi->lba; 898 dk_efi->dki_length = vd_efi->length; 899 dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP); 900 } 901 902 static void 903 vd_get_efi_out(void *ioctl_arg, void *vd_buf) 904 { 905 int len; 906 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 907 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 908 909 len = vd_efi->length; 910 DK_EFI2VD_EFI(dk_efi, vd_efi); 911 kmem_free(dk_efi->dki_data, len); 912 } 913 914 static void 915 vd_set_efi_in(void *vd_buf, void *ioctl_arg) 916 { 917 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 918 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 919 920 dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP); 921 VD_EFI2DK_EFI(vd_efi, dk_efi); 922 } 923 924 static void 925 vd_set_efi_out(void *ioctl_arg, void *vd_buf) 926 { 927 vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; 928 dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; 929 930 kmem_free(dk_efi->dki_data, vd_efi->length); 931 } 932 933 static int 934 vd_read_vtoc(ldi_handle_t handle, struct vtoc *vtoc, vd_disk_label_t *label) 935 { 936 int status, rval; 937 struct dk_gpt *efi; 938 size_t efi_len; 939 940 *label = VD_DISK_LABEL_UNK; 941 942 status = ldi_ioctl(handle, DKIOCGVTOC, (intptr_t)vtoc, 943 (vd_open_flags | FKIOCTL), kcred, &rval); 944 945 if (status == 0) { 946 *label = VD_DISK_LABEL_VTOC; 947 return (0); 948 } else if (status != ENOTSUP) { 949 PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status); 950 return (status); 951 } 952 953 status = vds_efi_alloc_and_read(handle, &efi, &efi_len); 954 955 if (status) { 956 PR0("vds_efi_alloc_and_read returned error %d", status); 957 return (status); 958 } 959 960 *label = VD_DISK_LABEL_EFI; 961 vd_efi_to_vtoc(efi, vtoc); 962 vd_efi_free(efi, efi_len); 963 964 return (0); 965 } 966 967 static ushort_t 968 vd_lbl2cksum(struct dk_label *label) 969 { 970 int count; 971 ushort_t sum, *sp; 972 973 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 974 sp = (ushort_t *)label; 975 sum = 0; 976 while (count--) { 977 sum ^= *sp++; 978 } 979 980 return (sum); 981 } 982 983 static int 984 vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg) 985 { 986 dk_efi_t *dk_ioc; 987 struct dk_label label; 988 struct vtoc *vtoc; 989 int i; 990 991 switch (vd->vdisk_label) { 992 993 case VD_DISK_LABEL_VTOC: 994 995 switch (cmd) { 996 case DKIOCGGEOM: 997 ASSERT(ioctl_arg != NULL); 998 bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom)); 999 return (0); 1000 case DKIOCGVTOC: 1001 ASSERT(ioctl_arg != NULL); 
static int
vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
{
	dk_efi_t *dk_ioc;
	struct dk_label label;
	struct vtoc *vtoc;
	int i;

	switch (vd->vdisk_label) {

	case VD_DISK_LABEL_VTOC:

		switch (cmd) {
		case DKIOCGGEOM:
			ASSERT(ioctl_arg != NULL);
			bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom));
			return (0);
		case DKIOCGVTOC:
			ASSERT(ioctl_arg != NULL);
			bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc));
			return (0);
		case DKIOCSVTOC:
			if (!vd->file)
				return (ENOTSUP);
			ASSERT(ioctl_arg != NULL);
			vtoc = (struct vtoc *)ioctl_arg;

			if (vtoc->v_sanity != VTOC_SANE ||
			    vtoc->v_sectorsz != DEV_BSIZE ||
			    vtoc->v_nparts != V_NUMPAR)
				return (EINVAL);

			bzero(&label, sizeof (label));
			label.dkl_ncyl = vd->dk_geom.dkg_ncyl;
			label.dkl_acyl = vd->dk_geom.dkg_acyl;
			label.dkl_pcyl = vd->dk_geom.dkg_pcyl;
			label.dkl_nhead = vd->dk_geom.dkg_nhead;
			label.dkl_nsect = vd->dk_geom.dkg_nsect;
			label.dkl_intrlv = vd->dk_geom.dkg_intrlv;
			label.dkl_apc = vd->dk_geom.dkg_apc;
			label.dkl_rpm = vd->dk_geom.dkg_rpm;
			label.dkl_write_reinstruct =
			    vd->dk_geom.dkg_write_reinstruct;
			label.dkl_read_reinstruct =
			    vd->dk_geom.dkg_read_reinstruct;

			label.dkl_vtoc.v_nparts = vtoc->v_nparts;
			label.dkl_vtoc.v_sanity = vtoc->v_sanity;
			label.dkl_vtoc.v_version = vtoc->v_version;
			for (i = 0; i < vtoc->v_nparts; i++) {
				label.dkl_vtoc.v_timestamp[i] =
				    vtoc->timestamp[i];
				label.dkl_vtoc.v_part[i].p_tag =
				    vtoc->v_part[i].p_tag;
				label.dkl_vtoc.v_part[i].p_flag =
				    vtoc->v_part[i].p_flag;
				label.dkl_map[i].dkl_cylno =
				    vtoc->v_part[i].p_start /
				    (label.dkl_nhead * label.dkl_nsect);
				label.dkl_map[i].dkl_nblk =
				    vtoc->v_part[i].p_size;
			}
			bcopy(vtoc->v_asciilabel, label.dkl_asciilabel,
			    LEN_DKL_ASCII);
			bcopy(vtoc->v_volume, label.dkl_vtoc.v_volume,
			    LEN_DKL_VVOL);
			bcopy(vtoc->v_bootinfo, label.dkl_vtoc.v_bootinfo,
			    sizeof (vtoc->v_bootinfo));

			/* re-compute checksum */
			label.dkl_magic = DKL_MAGIC;
			label.dkl_cksum = vd_lbl2cksum(&label);

			/* write label to file */
			if (VD_FILE_LABEL_WRITE(vd, &label) < 0)
				return (EIO);

			/* update the cached vdisk VTOC */
			bcopy(vtoc, &vd->vtoc, sizeof (vd->vtoc));

			return (0);
		default:
			return (ENOTSUP);
		}

	case VD_DISK_LABEL_EFI:

		switch (cmd) {
		case DKIOCGETEFI:
			ASSERT(ioctl_arg != NULL);
			dk_ioc = (dk_efi_t *)ioctl_arg;
			if (dk_ioc->dki_length < vd->dk_efi.dki_length)
				return (EINVAL);
			bcopy(vd->dk_efi.dki_data, dk_ioc->dki_data,
			    vd->dk_efi.dki_length);
			return (0);
		default:
			return (ENOTSUP);
		}

	default:
		return (ENOTSUP);
	}
}
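/*
 * Illustrative example (added): the dkl_cylno computation above converts a
 * partition's starting block into a cylinder number using the synthetic
 * geometry.  With, say, dkl_nhead = 16 and dkl_nsect = 63 there are
 * 16 * 63 = 1008 blocks per cylinder, so a partition starting at block 2016
 * is recorded as starting at cylinder 2.  (These geometry values are only
 * an example; the real ones come from vd->dk_geom.)
 */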
static int
vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void *buf, vd_ioctl_t *ioctl)
{
	int	rval = 0, status;
	size_t	nbytes = request->nbytes;	/* modifiable copy */


	ASSERT(request->slice < vd->nslices);
	PR0("Performing %s", ioctl->operation_name);

	/* Get data from client and convert, if necessary */
	if (ioctl->copyin != NULL) {
		ASSERT(nbytes != 0 && buf != NULL);
		PR1("Getting \"arg\" data from client");
		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
		    request->cookie, request->ncookies,
		    LDC_COPY_IN)) != 0) {
			PR0("ldc_mem_copy() returned errno %d "
			    "copying from client", status);
			return (status);
		}

		/* Convert client's data, if necessary */
		if (ioctl->copyin == VD_IDENTITY)	/* use client buffer */
			ioctl->arg = buf;
		else	/* convert client vdisk operation data to ioctl data */
			(ioctl->copyin)(buf, (void *)ioctl->arg);
	}

	/*
	 * Handle single-slice block devices internally; otherwise, have the
	 * real driver perform the ioctl()
	 */
	if (vd->file || (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo)) {
		if ((status = vd_do_slice_ioctl(vd, ioctl->cmd,
		    (void *)ioctl->arg)) != 0)
			return (status);
	} else if ((status = ldi_ioctl(vd->ldi_handle[request->slice],
	    ioctl->cmd, (intptr_t)ioctl->arg, (vd_open_flags | FKIOCTL),
	    kcred, &rval)) != 0) {
		PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status);
		return (status);
	}
#ifdef DEBUG
	if (rval != 0) {
		PR0("%s set rval = %d, which is not being returned to client",
		    ioctl->cmd_name, rval);
	}
#endif /* DEBUG */

	/* Convert data and send to client, if necessary */
	if (ioctl->copyout != NULL) {
		ASSERT(nbytes != 0 && buf != NULL);
		PR1("Sending \"arg\" data to client");

		/* Convert ioctl data to vdisk operation data, if necessary */
		if (ioctl->copyout != VD_IDENTITY)
			(ioctl->copyout)((void *)ioctl->arg, buf);

		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
		    request->cookie, request->ncookies,
		    LDC_COPY_OUT)) != 0) {
			PR0("ldc_mem_copy() returned errno %d "
			    "copying to client", status);
			return (status);
		}
	}

	return (status);
}

#define	RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t))
static int
vd_ioctl(vd_task_t *task)
{
	int		i, status, rc;
	void		*buf = NULL;
	struct dk_geom	dk_geom = {0};
	struct vtoc	vtoc = {0};
	struct dk_efi	dk_efi = {0};
	vd_t		*vd = task->vd;
	vd_dring_payload_t	*request = task->request;
	vd_ioctl_t	ioctl[] = {
		/* Command (no-copy) operations */
		{VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0,
		    DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE),
		    NULL, NULL, NULL},

		/* "Get" (copy-out) operations */
		{VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int),
		    DKIOCGETWCE, STRINGIZE(DKIOCGETWCE),
		    NULL, VD_IDENTITY, VD_IDENTITY},
		{VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM),
		    RNDSIZE(vd_geom_t),
		    DKIOCGGEOM, STRINGIZE(DKIOCGGEOM),
		    &dk_geom, NULL, dk_geom2vd_geom},
		{VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t),
		    DKIOCGVTOC, STRINGIZE(DKIOCGVTOC),
		    &vtoc, NULL, vtoc2vd_vtoc},
		{VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t),
		    DKIOCGETEFI, STRINGIZE(DKIOCGETEFI),
		    &dk_efi, vd_get_efi_in, vd_get_efi_out},

		/* "Set" (copy-in) operations */
		{VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int),
		    DKIOCSETWCE, STRINGIZE(DKIOCSETWCE),
		    NULL, VD_IDENTITY, VD_IDENTITY},
		{VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM),
		    RNDSIZE(vd_geom_t),
		    DKIOCSGEOM, STRINGIZE(DKIOCSGEOM),
		    &dk_geom, vd_geom2dk_geom, NULL},
		{VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t),
		    DKIOCSVTOC, STRINGIZE(DKIOCSVTOC),
		    &vtoc, vd_vtoc2vtoc, NULL},
		{VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t),
		    DKIOCSETEFI, STRINGIZE(DKIOCSETEFI),
		    &dk_efi, vd_set_efi_in, vd_set_efi_out},
	};
	size_t		nioctls = (sizeof (ioctl))/(sizeof (ioctl[0]));


	ASSERT(vd != NULL);
	ASSERT(request != NULL);
	ASSERT(request->slice < vd->nslices);

	/*
	 * Determine ioctl corresponding to caller's "operation" and
	 * validate caller's "nbytes"
	 */
	for (i = 0; i < nioctls; i++) {
		if (request->operation == ioctl[i].operation) {
			/* LDC memory operations require 8-byte multiples */
			ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0);

			if (request->operation == VD_OP_GET_EFI ||
			    request->operation == VD_OP_SET_EFI) {
				if (request->nbytes >= ioctl[i].nbytes)
					break;
				PR0("%s: Expected at least nbytes = %lu, "
				    "got %lu", ioctl[i].operation_name,
				    ioctl[i].nbytes, request->nbytes);
				return (EINVAL);
			}

			if (request->nbytes != ioctl[i].nbytes) {
				PR0("%s: Expected nbytes = %lu, got %lu",
				    ioctl[i].operation_name, ioctl[i].nbytes,
				    request->nbytes);
				return (EINVAL);
			}

			break;
		}
	}
	ASSERT(i < nioctls);	/* because "operation" already validated */

	if (request->nbytes)
		buf = kmem_zalloc(request->nbytes, KM_SLEEP);
	status = vd_do_ioctl(vd, request, buf, &ioctl[i]);
	if (request->nbytes)
		kmem_free(buf, request->nbytes);
	if (!vd->file && vd->vdisk_type == VD_DISK_TYPE_DISK &&
	    (request->operation == VD_OP_SET_VTOC ||
	    request->operation == VD_OP_SET_EFI)) {
		/* update disk information */
		rc = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc,
		    &vd->vdisk_label);
		if (rc != 0)
			PR0("vd_read_vtoc returned error %d", rc);
	}
	PR0("Returning %d", status);
	return (status);
}
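/*
 * Illustrative example (added): RNDSIZE() pads a structure size up to the
 * 8-byte multiple that LDC memory copies require.  For instance,
 * RNDSIZE(int) is P2ROUNDUP(4, 8) == 8, which is why VD_OP_GET_WCE and
 * VD_OP_SET_WCE advertise an 8-byte buffer even though the underlying
 * ioctl argument is a 4-byte int.
 */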
static int
vd_get_devid(vd_task_t *task)
{
	vd_t *vd = task->vd;
	vd_dring_payload_t *request = task->request;
	vd_devid_t *vd_devid;
	impl_devid_t *devid;
	int status, bufid_len, devid_len, len;
	int bufbytes;

	PR1("Get Device ID, nbytes=%ld", request->nbytes);

	if (vd->file) {
		/* no devid for disk on file */
		return (ENOENT);
	}

	if (ddi_lyr_get_devid(vd->dev[request->slice],
	    (ddi_devid_t *)&devid) != DDI_SUCCESS) {
		/* the most common failure is that no devid is available */
		PR2("No Device ID");
		return (ENOENT);
	}

	bufid_len = request->nbytes - sizeof (vd_devid_t) + 1;
	devid_len = DEVID_GETLEN(devid);

	/*
	 * Save the buffer size here for use in deallocation.
	 * The actual number of bytes copied is returned in
	 * the 'nbytes' field of the request structure.
	 */
	bufbytes = request->nbytes;

	vd_devid = kmem_zalloc(bufbytes, KM_SLEEP);
	vd_devid->length = devid_len;
	vd_devid->type = DEVID_GETTYPE(devid);

	len = (devid_len > bufid_len)? bufid_len : devid_len;

	bcopy(devid->did_id, vd_devid->id, len);

	/* LDC memory operations require 8-byte multiples */
	ASSERT(request->nbytes % sizeof (uint64_t) == 0);

	if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0,
	    &request->nbytes, request->cookie, request->ncookies,
	    LDC_COPY_OUT)) != 0) {
		PR0("ldc_mem_copy() returned errno %d copying to client",
		    status);
	}
	PR1("post mem_copy: nbytes=%ld", request->nbytes);

	kmem_free(vd_devid, bufbytes);
	ddi_devid_free((ddi_devid_t)devid);

	return (status);
}

/*
 * Define the supported operations once the functions for performing them have
 * been defined
 */
static const vds_operation_t	vds_operation[] = {
#define	X(_s)	#_s, _s
	{X(VD_OP_BREAD),	vd_start_bio,	vd_complete_bio},
	{X(VD_OP_BWRITE),	vd_start_bio,	vd_complete_bio},
	{X(VD_OP_FLUSH),	vd_ioctl,	NULL},
	{X(VD_OP_GET_WCE),	vd_ioctl,	NULL},
	{X(VD_OP_SET_WCE),	vd_ioctl,	NULL},
	{X(VD_OP_GET_VTOC),	vd_ioctl,	NULL},
	{X(VD_OP_SET_VTOC),	vd_ioctl,	NULL},
	{X(VD_OP_GET_DISKGEOM),	vd_ioctl,	NULL},
	{X(VD_OP_SET_DISKGEOM),	vd_ioctl,	NULL},
	{X(VD_OP_GET_EFI),	vd_ioctl,	NULL},
	{X(VD_OP_SET_EFI),	vd_ioctl,	NULL},
	{X(VD_OP_GET_DEVID),	vd_get_devid,	NULL},
#undef	X
};

static const size_t	vds_noperations =
    (sizeof (vds_operation))/(sizeof (vds_operation[0]));
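/*
 * Illustrative sketch (added; an assumption about code outside this
 * excerpt): vds_operations, declared earlier and reported to clients in
 * vd_process_attr_msg(), is presumably derived from this table as a bitmask
 * of supported operations, along the lines of
 *
 *	for (i = 0; i < vds_noperations; i++)
 *		vds_operations |= 1 << (vds_operation[i].operation - 1);
 *
 * so that bit (op - 1) is set for each entry in vds_operation[].
 */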
/*
 * Process a task specifying a client I/O request
 */
static int
vd_process_task(vd_task_t *task)
{
	int	i, status;
	vd_t	*vd = task->vd;
	vd_dring_payload_t	*request = task->request;


	ASSERT(vd != NULL);
	ASSERT(request != NULL);

	/* Find the requested operation */
	for (i = 0; i < vds_noperations; i++)
		if (request->operation == vds_operation[i].operation)
			break;
	if (i == vds_noperations) {
		PR0("Unsupported operation %u", request->operation);
		return (ENOTSUP);
	}

	/* Handle client using absolute disk offsets */
	if ((vd->vdisk_type == VD_DISK_TYPE_DISK) &&
	    (request->slice == UINT8_MAX))
		request->slice = VD_ENTIRE_DISK_SLICE;

	/* Range-check slice */
	if (request->slice >= vd->nslices) {
		PR0("Invalid \"slice\" %u (max %u) for virtual disk",
		    request->slice, (vd->nslices - 1));
		return (EINVAL);
	}

	PR1("operation : %s", vds_operation[i].namep);

	/* Start the operation */
	if ((status = vds_operation[i].start(task)) != EINPROGRESS) {
		PR0("operation : %s returned status %d",
		    vds_operation[i].namep, status);
		request->status = status;	/* op succeeded or failed */
		return (0);			/* but request completed */
	}

	ASSERT(vds_operation[i].complete != NULL);	/* debug case */
	if (vds_operation[i].complete == NULL) {	/* non-debug case */
		PR0("Unexpected return of EINPROGRESS "
		    "with no I/O completion handler");
		request->status = EIO;	/* operation failed */
		return (0);		/* but request completed */
	}

	PR1("operation : kick off taskq entry for %s", vds_operation[i].namep);

	/* Queue a task to complete the operation */
	status = ddi_taskq_dispatch(vd->completionq, vds_operation[i].complete,
	    task, DDI_SLEEP);
	/* ddi_taskq_dispatch(9f) guarantees success with DDI_SLEEP */
	ASSERT(status == DDI_SUCCESS);

	PR1("Operation in progress");
	return (EINPROGRESS);	/* completion handler will finish request */
}

/*
 * Return true if the "type", "subtype", and "env" fields of the "tag" first
 * argument match the corresponding remaining arguments; otherwise, return false
 */
boolean_t
vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env)
{
	return ((tag->vio_msgtype == type) &&
	    (tag->vio_subtype == subtype) &&
	    (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE;
}

/*
 * Check whether the major/minor version specified in "ver_msg" is supported
 * by this server.
 */
static boolean_t
vds_supported_version(vio_ver_msg_t *ver_msg)
{
	for (int i = 0; i < vds_num_versions; i++) {
		ASSERT(vds_version[i].major > 0);
		ASSERT((i == 0) ||
		    (vds_version[i].major < vds_version[i-1].major));

		/*
		 * If the major versions match, adjust the minor version, if
		 * necessary, down to the highest value supported by this
		 * server and return true so this message will get "ack"ed;
		 * the client should also support all minor versions lower
		 * than the value it sent
		 */
		if (ver_msg->ver_major == vds_version[i].major) {
			if (ver_msg->ver_minor > vds_version[i].minor) {
				PR0("Adjusting minor version from %u to %u",
				    ver_msg->ver_minor, vds_version[i].minor);
				ver_msg->ver_minor = vds_version[i].minor;
			}
			return (B_TRUE);
		}

		/*
		 * If the message contains a higher major version number, set
		 * the message's major/minor versions to the current values
		 * and return false, so this message will get "nack"ed with
		 * these values, and the client will potentially try again
		 * with the same or a lower version
		 */
		if (ver_msg->ver_major > vds_version[i].major) {
			ver_msg->ver_major = vds_version[i].major;
			ver_msg->ver_minor = vds_version[i].minor;
			return (B_FALSE);
		}

		/*
		 * Otherwise, the message's major version is less than the
		 * current major version, so continue the loop to the next
		 * (lower) supported version
		 */
	}

	/*
	 * No common version was found; "ground" the version pair in the
	 * message to terminate negotiation
	 */
	ver_msg->ver_major = 0;
	ver_msg->ver_minor = 0;
	return (B_FALSE);
}
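/*
 * Illustrative example (added): with vds_version[] = {{1, 0}} as defined
 * above, a client offering version 1.2 is "ack"ed with the minor number
 * adjusted down to 1.0; a client offering 2.0 is "nack"ed with 1.0 so it
 * can retry lower; and a client offering 0.9 falls through the loop and is
 * "nack"ed with the grounded pair 0.0, ending negotiation.
 */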
/*
 * Process a version message from a client.  vds expects to receive version
 * messages from clients seeking service, but never issues version messages
 * itself; therefore, vds can ACK or NACK client version messages, but does
 * not expect to receive version-message ACKs or NACKs (and will treat such
 * messages as invalid).
 */
static int
vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	vio_ver_msg_t	*ver_msg = (vio_ver_msg_t *)msg;


	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
	    VIO_VER_INFO)) {
		return (ENOMSG);	/* not a version message */
	}

	if (msglen != sizeof (*ver_msg)) {
		PR0("Expected %lu-byte version message; "
		    "received %lu bytes", sizeof (*ver_msg), msglen);
		return (EBADMSG);
	}

	if (ver_msg->dev_class != VDEV_DISK) {
		PR0("Expected device class %u (disk); received %u",
		    VDEV_DISK, ver_msg->dev_class);
		return (EBADMSG);
	}

	/*
	 * We're talking to the expected kind of client; set our device class
	 * for "ack/nack" back to the client
	 */
	ver_msg->dev_class = VDEV_DISK_SERVER;

	/*
	 * Check whether the (valid) version message specifies a version
	 * supported by this server.  If the version is not supported, return
	 * EBADMSG so the message will get "nack"ed; vds_supported_version()
	 * will have updated the message with a supported version for the
	 * client to consider
	 */
	if (!vds_supported_version(ver_msg))
		return (EBADMSG);


	/*
	 * A version has been agreed upon; use the client's SID for
	 * communication on this channel now
	 */
	ASSERT(!(vd->initialized & VD_SID));
	vd->sid = ver_msg->tag.vio_sid;
	vd->initialized |= VD_SID;

	/*
	 * When multiple versions are supported, this function should store
	 * the negotiated major and minor version values in the "vd" data
	 * structure to govern further communication; in particular, note
	 * that the client might have specified a lower minor version for the
	 * agreed major version than specified in the vds_version[] array.
	 * The following assertions should help remind future maintainers to
	 * make the appropriate changes to support multiple versions.
	 */
	ASSERT(vds_num_versions == 1);
	ASSERT(ver_msg->ver_major == vds_version[0].major);
	ASSERT(ver_msg->ver_minor == vds_version[0].minor);

	PR0("Using major version %u, minor version %u",
	    ver_msg->ver_major, ver_msg->ver_minor);
	return (0);
}
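/*
 * Illustrative note (added, inferred from the VD_STATE_* values and the
 * handlers below): a client connection is expected to walk the handshake in
 * order - version (VD_STATE_INIT), attributes (VD_STATE_VER), dring
 * registration (for dring-mode clients), then RDX, after which the
 * connection is in VD_STATE_DATA and data messages are processed.
 * vd_do_process_msg() dispatches on vd->state accordingly.
 */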
static int
vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	vd_attr_msg_t	*attr_msg = (vd_attr_msg_t *)msg;
	int		status, retry = 0;


	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
	    VIO_ATTR_INFO)) {
		PR0("Message is not an attribute message");
		return (ENOMSG);
	}

	if (msglen != sizeof (*attr_msg)) {
		PR0("Expected %lu-byte attribute message; "
		    "received %lu bytes", sizeof (*attr_msg), msglen);
		return (EBADMSG);
	}

	if (attr_msg->max_xfer_sz == 0) {
		PR0("Received maximum transfer size of 0 from client");
		return (EBADMSG);
	}

	if ((attr_msg->xfer_mode != VIO_DESC_MODE) &&
	    (attr_msg->xfer_mode != VIO_DRING_MODE)) {
		PR0("Client requested unsupported transfer mode");
		return (EBADMSG);
	}

	/*
	 * Check whether the underlying disk is ready; if not, try accessing
	 * the device again.  Open the vdisk device and extract info about
	 * it, as this is needed to respond to the attr info msg
	 */
	if ((vd->initialized & VD_DISK_READY) == 0) {
		PR0("Retry setting up disk (%s)", vd->device_path);
		do {
			status = vd_setup_vd(vd);
			if (status != EAGAIN || ++retry > vds_dev_retries)
				break;

			/* incremental delay */
			delay(drv_usectohz(vds_dev_delay));

			/* if vdisk is no longer enabled - return error */
			if (!vd_enabled(vd))
				return (ENXIO);

		} while (status == EAGAIN);

		if (status)
			return (ENXIO);

		vd->initialized |= VD_DISK_READY;
		ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
		PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u",
		    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
		    (vd->pseudo ? "yes" : "no"),
		    (vd->file ? "yes" : "no"),
		    vd->nslices);
	}

	/* Success:  valid message and transfer mode */
	vd->xfer_mode = attr_msg->xfer_mode;

	if (vd->xfer_mode == VIO_DESC_MODE) {

		/*
		 * The vd_dring_inband_msg_t contains one cookie; need room
		 * for up to n-1 more cookies, where "n" is the number of full
		 * pages plus possibly one partial page required to cover
		 * "max_xfer_sz".  Add room for one more cookie if
		 * "max_xfer_sz" isn't an integral multiple of the page size.
		 * Must first get the maximum transfer size in bytes.
		 */
		size_t max_xfer_bytes = attr_msg->vdisk_block_size ?
		    attr_msg->vdisk_block_size*attr_msg->max_xfer_sz :
		    attr_msg->max_xfer_sz;
		size_t max_inband_msglen =
		    sizeof (vd_dring_inband_msg_t) +
		    ((max_xfer_bytes/PAGESIZE +
			((max_xfer_bytes % PAGESIZE) ? 1 : 0))*
			(sizeof (ldc_mem_cookie_t)));

		/*
		 * Set the maximum expected message length to
		 * accommodate in-band-descriptor messages with all
		 * their cookies
		 */
		vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen);

		/*
		 * Initialize the data structure for processing in-band I/O
		 * request descriptors
		 */
		vd->inband_task.vd	= vd;
		vd->inband_task.msg	= kmem_alloc(vd->max_msglen, KM_SLEEP);
		vd->inband_task.index	= 0;
		vd->inband_task.type	= VD_FINAL_RANGE_TASK;	/* range == 1 */
	}

	/* Return the device's block size and max transfer size to the client */
	attr_msg->vdisk_block_size = DEV_BSIZE;
	attr_msg->max_xfer_sz = vd->max_xfer_sz;

	attr_msg->vdisk_size = vd->vdisk_size;
	attr_msg->vdisk_type = vd->vdisk_type;
	attr_msg->operations = vds_operations;
	PR0("%s", VD_CLIENT(vd));

	ASSERT(vd->dring_task == NULL);

	return (0);
}
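/*
 * Illustrative example (added): assume PAGESIZE is 8192, the client's
 * vdisk_block_size is 512, and max_xfer_sz is 17 blocks.  Then
 * max_xfer_bytes = 512 * 17 = 8704, i.e. one full page plus a partial page,
 * so the sizing above reserves (8704/8192 + 1) = 2 cookie slots beyond the
 * base vd_dring_inband_msg_t (which already holds one cookie).
 */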
static int
vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	int		status;
	size_t		expected;
	ldc_mem_info_t	dring_minfo;
	vio_dring_reg_msg_t	*reg_msg = (vio_dring_reg_msg_t *)msg;


	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
	    VIO_DRING_REG)) {
		PR0("Message is not a register-dring message");
		return (ENOMSG);
	}

	if (msglen < sizeof (*reg_msg)) {
		PR0("Expected at least %lu-byte register-dring message; "
		    "received %lu bytes", sizeof (*reg_msg), msglen);
		return (EBADMSG);
	}

	expected = sizeof (*reg_msg) +
	    (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0]));
	if (msglen != expected) {
		PR0("Expected %lu-byte register-dring message; "
		    "received %lu bytes", expected, msglen);
		return (EBADMSG);
	}

	if (vd->initialized & VD_DRING) {
		PR0("A dring was previously registered; only support one");
		return (EBADMSG);
	}

	if (reg_msg->num_descriptors > INT32_MAX) {
		PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)",
		    reg_msg->num_descriptors, INT32_MAX, STRINGIZE(INT32_MAX));
		return (EBADMSG);
	}

	if (reg_msg->ncookies != 1) {
		/*
		 * In addition to fixing the assertion in the success case
		 * below, supporting drings which require more than one
		 * "cookie" requires increasing the value of vd->max_msglen
		 * somewhere in the code path prior to receiving the message
		 * which results in calling this function.  Note that without
		 * making this change, the larger message size required to
		 * accommodate multiple cookies cannot be successfully
		 * received, so this function will not even get called.
		 * Gracefully accommodating more dring cookies might
		 * reasonably demand exchanging an additional attribute or
		 * making a minor protocol adjustment
		 */
		PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies);
		return (EBADMSG);
	}

	status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie,
	    reg_msg->ncookies, reg_msg->num_descriptors,
	    reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle);
	if (status != 0) {
		PR0("ldc_mem_dring_map() returned errno %d", status);
		return (status);
	}

	/*
	 * To remove the need for this assertion, must call
	 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a
	 * successful call to ldc_mem_dring_map()
	 */
	ASSERT(reg_msg->ncookies == 1);

	if ((status =
	    ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) {
		PR0("ldc_mem_dring_info() returned errno %d", status);
		if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)
			PR0("ldc_mem_dring_unmap() returned errno %d", status);
		return (status);
	}

	if (dring_minfo.vaddr == NULL) {
		PR0("Descriptor ring virtual address is NULL");
		return (ENXIO);
	}


	/* Initialize for valid message and mapped dring */
	vd->initialized |= VD_DRING;
	vd->dring_ident = 1;	/* "There Can Be Only One" */
	vd->dring = dring_minfo.vaddr;
	vd->descriptor_size = reg_msg->descriptor_size;
	vd->dring_len = reg_msg->num_descriptors;
	reg_msg->dring_ident = vd->dring_ident;
	PR1("descriptor size = %u, dring length = %u",
	    vd->descriptor_size, vd->dring_len);

	/*
	 * Allocate and initialize a "shadow" array of data structures for
	 * tasks to process I/O requests in dring elements
	 */
	vd->dring_task =
	    kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP);
	for (int i = 0; i < vd->dring_len; i++) {
		vd->dring_task[i].vd		= vd;
		vd->dring_task[i].index		= i;
		vd->dring_task[i].request	= &VD_DRING_ELEM(i)->payload;

		status = ldc_mem_alloc_handle(vd->ldc_handle,
		    &(vd->dring_task[i].mhdl));
		if (status) {
			PR0("ldc_mem_alloc_handle() returned err %d ", status);
			return (ENXIO);
		}

		vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP);
	}

	return (0);
}
static int
vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	vio_dring_unreg_msg_t	*unreg_msg = (vio_dring_unreg_msg_t *)msg;


	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
	    VIO_DRING_UNREG)) {
		PR0("Message is not an unregister-dring message");
		return (ENOMSG);
	}

	if (msglen != sizeof (*unreg_msg)) {
		PR0("Expected %lu-byte unregister-dring message; "
		    "received %lu bytes", sizeof (*unreg_msg), msglen);
		return (EBADMSG);
	}

	if (unreg_msg->dring_ident != vd->dring_ident) {
		PR0("Expected dring ident %lu; received %lu",
		    vd->dring_ident, unreg_msg->dring_ident);
		return (EBADMSG);
	}

	return (0);
}

static int
process_rdx_msg(vio_msg_t *msg, size_t msglen)
{
	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) {
		PR0("Message is not an RDX message");
		return (ENOMSG);
	}

	if (msglen != sizeof (vio_rdx_msg_t)) {
		PR0("Expected %lu-byte RDX message; received %lu bytes",
		    sizeof (vio_rdx_msg_t), msglen);
		return (EBADMSG);
	}

	PR0("Valid RDX message");
	return (0);
}

static int
vd_check_seq_num(vd_t *vd, uint64_t seq_num)
{
	if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) {
		PR0("Received seq_num %lu; expected %lu",
		    seq_num, (vd->seq_num + 1));
		PR0("initiating soft reset");
		vd_need_reset(vd, B_FALSE);
		return (1);
	}

	vd->seq_num = seq_num;
	vd->initialized |= VD_SEQ_NUM;	/* superfluous after first time... */
	return (0);
}

/*
 * Return the expected size of an inband-descriptor message with all the
 * cookies it claims to include
 */
static size_t
expected_inband_size(vd_dring_inband_msg_t *msg)
{
	return ((sizeof (*msg)) +
	    (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0])));
}

/*
 * Process an in-band descriptor message:  used with clients like OBP, with
 * which vds exchanges descriptors within VIO message payloads, rather than
 * operating on them within a descriptor ring
 */
static int
vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	size_t			expected;
	vd_dring_inband_msg_t	*desc_msg = (vd_dring_inband_msg_t *)msg;


	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
	    VIO_DESC_DATA)) {
		PR1("Message is not an in-band-descriptor message");
		return (ENOMSG);
	}

	if (msglen < sizeof (*desc_msg)) {
		PR0("Expected at least %lu-byte descriptor message; "
		    "received %lu bytes", sizeof (*desc_msg), msglen);
		return (EBADMSG);
	}

	if (msglen != (expected = expected_inband_size(desc_msg))) {
		PR0("Expected %lu-byte descriptor message; "
		    "received %lu bytes", expected, msglen);
		return (EBADMSG);
	}

	if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0)
		return (EBADMSG);
/*
 * Process an in-band descriptor message: used with clients like OBP, with
 * which vds exchanges descriptors within VIO message payloads, rather than
 * operating on them within a descriptor ring
 */
static int
vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	size_t expected;
	vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg;


	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
	    VIO_DESC_DATA)) {
		PR1("Message is not an in-band-descriptor message");
		return (ENOMSG);
	}

	if (msglen < sizeof (*desc_msg)) {
		PR0("Expected at least %lu-byte descriptor message; "
		    "received %lu bytes", sizeof (*desc_msg), msglen);
		return (EBADMSG);
	}

	if (msglen != (expected = expected_inband_size(desc_msg))) {
		PR0("Expected %lu-byte descriptor message; "
		    "received %lu bytes", expected, msglen);
		return (EBADMSG);
	}

	if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0)
		return (EBADMSG);

	/*
	 * Valid message:  Set up the in-band descriptor task and process the
	 * request.  Arrange to acknowledge the client's message, unless an
	 * error processing the descriptor task results in setting
	 * VIO_SUBTYPE_NACK
	 */
	PR1("Valid in-band-descriptor message");
	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;

	ASSERT(vd->inband_task.msg != NULL);

	bcopy(msg, vd->inband_task.msg, msglen);
	vd->inband_task.msglen = msglen;

	/*
	 * The task request is now the payload of the message
	 * that was just copied into the body of the task.
	 */
	desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg;
	vd->inband_task.request = &desc_msg->payload;

	return (vd_process_task(&vd->inband_task));
}

static int
vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx,
    vio_msg_t *msg, size_t msglen)
{
	int status;
	boolean_t ready;
	vd_dring_entry_t *elem = VD_DRING_ELEM(idx);


	/* Accept the updated dring element */
	if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
		PR0("ldc_mem_dring_acquire() returned errno %d", status);
		return (status);
	}
	ready = (elem->hdr.dstate == VIO_DESC_READY);
	if (ready) {
		elem->hdr.dstate = VIO_DESC_ACCEPTED;
	} else {
		PR0("descriptor %u not ready", idx);
		VD_DUMP_DRING_ELEM(elem);
	}
	if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
		PR0("ldc_mem_dring_release() returned errno %d", status);
		return (status);
	}
	if (!ready)
		return (EBUSY);


	/* Initialize a task and process the accepted element */
	PR1("Processing dring element %u", idx);
	vd->dring_task[idx].type = type;

	/* duplicate msg buf for cookies etc. */
	bcopy(msg, vd->dring_task[idx].msg, msglen);

	vd->dring_task[idx].msglen = msglen;
	if ((status = vd_process_task(&vd->dring_task[idx])) != EINPROGRESS)
		status = vd_mark_elem_done(vd, idx,
		    vd->dring_task[idx].request->status,
		    vd->dring_task[idx].request->nbytes);

	return (status);
}
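/*
 * A note on descriptor states, as handled above: the client marks an element
 * VIO_DESC_READY before sending the dring-data message; vd_process_element()
 * moves it to VIO_DESC_ACCEPTED under the ldc_mem_dring_acquire()/release()
 * pair, and vd_mark_elem_done() later moves it to its terminal state once
 * the I/O completes.  An element found in any state other than READY is
 * skipped and reported as EBUSY.
 */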
static int
vd_process_element_range(vd_t *vd, int start, int end,
    vio_msg_t *msg, size_t msglen)
{
	int i, n, nelem, status = 0;
	boolean_t inprogress = B_FALSE;
	vd_task_type_t type;


	ASSERT(start >= 0);
	ASSERT(end >= 0);

	/*
	 * Arrange to acknowledge the client's message, unless an error
	 * processing one of the dring elements results in setting
	 * VIO_SUBTYPE_NACK
	 */
	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;

	/*
	 * Process the dring elements in the range
	 */
	nelem = ((end < start) ? end + vd->dring_len : end) - start + 1;
	for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) {
		((vio_dring_msg_t *)msg)->end_idx = i;
		type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK;
		status = vd_process_element(vd, type, i, msg, msglen);
		if (status == EINPROGRESS)
			inprogress = B_TRUE;
		else if (status != 0)
			break;
	}

	/*
	 * If some, but not all, operations of a multi-element range are in
	 * progress, wait for other operations to complete before returning
	 * (which will result in "ack" or "nack" of the message).  Note that
	 * all outstanding operations will need to complete, not just the ones
	 * corresponding to the current range of dring elements; however, as
	 * this situation is an error case, performance is less critical.
	 */
	if ((nelem > 1) && (status != EINPROGRESS) && inprogress)
		ddi_taskq_wait(vd->completionq);

	return (status);
}
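/*
 * The range arithmetic above wraps modulo the ring size.  For instance,
 * with a (hypothetical) 8-entry dring, start = 6 and end = 1 yield
 * nelem = (1 + 8) - 6 + 1 = 4, so elements 6, 7, 0, and 1 are processed
 * in that order.
 */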
static int
vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg;


	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
	    VIO_DRING_DATA)) {
		PR1("Message is not a dring-data message");
		return (ENOMSG);
	}

	if (msglen != sizeof (*dring_msg)) {
		PR0("Expected %lu-byte dring message; received %lu bytes",
		    sizeof (*dring_msg), msglen);
		return (EBADMSG);
	}

	if (vd_check_seq_num(vd, dring_msg->seq_num) != 0)
		return (EBADMSG);

	if (dring_msg->dring_ident != vd->dring_ident) {
		PR0("Expected dring ident %lu; received ident %lu",
		    vd->dring_ident, dring_msg->dring_ident);
		return (EBADMSG);
	}

	if (dring_msg->start_idx >= vd->dring_len) {
		PR0("\"start_idx\" = %u; must be less than %u",
		    dring_msg->start_idx, vd->dring_len);
		return (EBADMSG);
	}

	if ((dring_msg->end_idx < 0) ||
	    (dring_msg->end_idx >= vd->dring_len)) {
		PR0("\"end_idx\" = %u; must be >= 0 and less than %u",
		    dring_msg->end_idx, vd->dring_len);
		return (EBADMSG);
	}

	/* Valid message; process range of updated dring elements */
	PR1("Processing descriptor range, start = %u, end = %u",
	    dring_msg->start_idx, dring_msg->end_idx);
	return (vd_process_element_range(vd, dring_msg->start_idx,
	    dring_msg->end_idx, msg, msglen));
}

static int
recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes)
{
	int retry, status;
	size_t size = *nbytes;


	for (retry = 0, status = ETIMEDOUT;
	    retry < vds_ldc_retries && status == ETIMEDOUT;
	    retry++) {
		PR1("ldc_read() attempt %d", (retry + 1));
		*nbytes = size;
		status = ldc_read(ldc_handle, msg, nbytes);
	}

	if (status) {
		PR0("ldc_read() returned errno %d", status);
		if (status != ECONNRESET)
			return (ENOMSG);
		return (status);
	} else if (*nbytes == 0) {
		PR1("ldc_read() returned 0 and no message read");
		return (ENOMSG);
	}

	PR1("RCVD %lu-byte message", *nbytes);
	return (0);
}
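/*
 * Note on the retry loop above: ldc_read() updates *nbytes with the size of
 * the message actually read, so each ETIMEDOUT retry first restores *nbytes
 * to the full buffer size saved in "size".  At most vds_ldc_retries attempts
 * are made; any error other than ECONNRESET is collapsed into ENOMSG so that
 * callers treat it as "no message available".
 */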
static int
vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	int status;


	PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype,
	    msg->tag.vio_subtype, msg->tag.vio_subtype_env);
#ifdef DEBUG
	vd_decode_tag(msg);
#endif

	/*
	 * Validate session ID up front, since it applies to all messages
	 * once set
	 */
	if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) {
		PR0("Expected SID %u, received %u", vd->sid,
		    msg->tag.vio_sid);
		return (EBADMSG);
	}

	PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state));

	/*
	 * Process the received message based on connection state
	 */
	switch (vd->state) {
	case VD_STATE_INIT:	/* expect version message */
		if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0)
			return (status);

		/* Version negotiated, move to that state */
		vd->state = VD_STATE_VER;
		return (0);

	case VD_STATE_VER:	/* expect attribute message */
		if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0)
			return (status);

		/* Attributes exchanged, move to that state */
		vd->state = VD_STATE_ATTR;
		return (0);

	case VD_STATE_ATTR:
		switch (vd->xfer_mode) {
		case VIO_DESC_MODE:	/* expect RDX message */
			if ((status = process_rdx_msg(msg, msglen)) != 0)
				return (status);

			/* Ready to receive in-band descriptors */
			vd->state = VD_STATE_DATA;
			return (0);

		case VIO_DRING_MODE:	/* expect register-dring message */
			if ((status =
			    vd_process_dring_reg_msg(vd, msg, msglen)) != 0)
				return (status);

			/* One dring negotiated, move to that state */
			vd->state = VD_STATE_DRING;
			return (0);

		default:
			ASSERT("Unsupported transfer mode");
			PR0("Unsupported transfer mode");
			return (ENOTSUP);
		}

	case VD_STATE_DRING:	/* expect RDX, register-dring, or unreg-dring */
		if ((status = process_rdx_msg(msg, msglen)) == 0) {
			/* Ready to receive data */
			vd->state = VD_STATE_DATA;
			return (0);
		} else if (status != ENOMSG) {
			return (status);
		}


		/*
		 * If another register-dring message is received, stay in
		 * dring state in case the client sends RDX; although the
		 * protocol allows multiple drings, this server does not
		 * support using more than one
		 */
		if ((status =
		    vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG)
			return (status);

		/*
		 * Acknowledge an unregister-dring message, but reset the
		 * connection anyway:  Although the protocol allows
		 * unregistering drings, this server cannot serve a vdisk
		 * without its only dring
		 */
		status = vd_process_dring_unreg_msg(vd, msg, msglen);
		return ((status == 0) ? ENOTSUP : status);

	case VD_STATE_DATA:
		switch (vd->xfer_mode) {
		case VIO_DESC_MODE:	/* expect in-band-descriptor message */
			return (vd_process_desc_msg(vd, msg, msglen));

		case VIO_DRING_MODE:	/* expect dring-data or unreg-dring */
			/*
			 * Typically expect dring-data messages, so handle
			 * them first
			 */
			if ((status = vd_process_dring_msg(vd, msg,
			    msglen)) != ENOMSG)
				return (status);

			/*
			 * Acknowledge an unregister-dring message, but reset
			 * the connection anyway:  Although the protocol
			 * allows unregistering drings, this server cannot
			 * serve a vdisk without its only dring
			 */
			status = vd_process_dring_unreg_msg(vd, msg, msglen);
			return ((status == 0) ? ENOTSUP : status);

		default:
			ASSERT("Unsupported transfer mode");
			PR0("Unsupported transfer mode");
			return (ENOTSUP);
		}

	default:
		ASSERT("Invalid client connection state");
		PR0("Invalid client connection state");
		return (ENOTSUP);
	}
}
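/*
 * For reference, the handshake implemented above walks the following states
 * (one direction only; any failure triggers a reset instead):
 *
 *	INIT  --version-->  VER  --attributes-->  ATTR
 *	ATTR  --RDX-->  DATA			(in-band descriptor clients)
 *	ATTR  --register-dring-->  DRING  --RDX-->  DATA  (dring clients)
 */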
"full" : "soft"); 2295 vd_need_reset(vd, reset_ldc); 2296 } 2297 2298 return (status); 2299 } 2300 2301 static boolean_t 2302 vd_enabled(vd_t *vd) 2303 { 2304 boolean_t enabled; 2305 2306 2307 mutex_enter(&vd->lock); 2308 enabled = vd->enabled; 2309 mutex_exit(&vd->lock); 2310 return (enabled); 2311 } 2312 2313 static void 2314 vd_recv_msg(void *arg) 2315 { 2316 vd_t *vd = (vd_t *)arg; 2317 int rv = 0, status = 0; 2318 2319 ASSERT(vd != NULL); 2320 2321 PR2("New task to receive incoming message(s)"); 2322 2323 2324 while (vd_enabled(vd) && status == 0) { 2325 size_t msglen, msgsize; 2326 ldc_status_t lstatus; 2327 2328 /* 2329 * Receive and process a message 2330 */ 2331 vd_reset_if_needed(vd); /* can change vd->max_msglen */ 2332 2333 /* 2334 * check if channel is UP - else break out of loop 2335 */ 2336 status = ldc_status(vd->ldc_handle, &lstatus); 2337 if (lstatus != LDC_UP) { 2338 PR0("channel not up (status=%d), exiting recv loop\n", 2339 lstatus); 2340 break; 2341 } 2342 2343 ASSERT(vd->max_msglen != 0); 2344 2345 msgsize = vd->max_msglen; /* stable copy for alloc/free */ 2346 msglen = msgsize; /* actual len after recv_msg() */ 2347 2348 status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen); 2349 switch (status) { 2350 case 0: 2351 rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp, 2352 msglen); 2353 /* check if max_msglen changed */ 2354 if (msgsize != vd->max_msglen) { 2355 PR0("max_msglen changed 0x%lx to 0x%lx bytes\n", 2356 msgsize, vd->max_msglen); 2357 kmem_free(vd->vio_msgp, msgsize); 2358 vd->vio_msgp = 2359 kmem_alloc(vd->max_msglen, KM_SLEEP); 2360 } 2361 if (rv == EINPROGRESS) 2362 continue; 2363 break; 2364 2365 case ENOMSG: 2366 break; 2367 2368 case ECONNRESET: 2369 PR0("initiating soft reset (ECONNRESET)\n"); 2370 vd_need_reset(vd, B_FALSE); 2371 status = 0; 2372 break; 2373 2374 default: 2375 /* Probably an LDC failure; arrange to reset it */ 2376 PR0("initiating full reset (status=0x%x)", status); 2377 vd_need_reset(vd, B_TRUE); 2378 break; 2379 } 2380 } 2381 2382 PR2("Task finished"); 2383 } 2384 2385 static uint_t 2386 vd_handle_ldc_events(uint64_t event, caddr_t arg) 2387 { 2388 vd_t *vd = (vd_t *)(void *)arg; 2389 int status; 2390 2391 ASSERT(vd != NULL); 2392 2393 if (!vd_enabled(vd)) 2394 return (LDC_SUCCESS); 2395 2396 if (event & LDC_EVT_DOWN) { 2397 PR0("LDC_EVT_DOWN: LDC channel went down"); 2398 2399 vd_need_reset(vd, B_TRUE); 2400 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, 2401 DDI_SLEEP); 2402 if (status == DDI_FAILURE) { 2403 PR0("cannot schedule task to recv msg\n"); 2404 vd_need_reset(vd, B_TRUE); 2405 } 2406 } 2407 2408 if (event & LDC_EVT_RESET) { 2409 PR0("LDC_EVT_RESET: LDC channel was reset"); 2410 2411 if (vd->state != VD_STATE_INIT) { 2412 PR0("scheduling full reset"); 2413 vd_need_reset(vd, B_FALSE); 2414 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 2415 vd, DDI_SLEEP); 2416 if (status == DDI_FAILURE) { 2417 PR0("cannot schedule task to recv msg\n"); 2418 vd_need_reset(vd, B_TRUE); 2419 } 2420 2421 } else { 2422 PR0("channel already reset, ignoring...\n"); 2423 PR0("doing ldc up...\n"); 2424 (void) ldc_up(vd->ldc_handle); 2425 } 2426 2427 return (LDC_SUCCESS); 2428 } 2429 2430 if (event & LDC_EVT_UP) { 2431 PR0("EVT_UP: LDC is up\nResetting client connection state"); 2432 PR0("initiating soft reset"); 2433 vd_need_reset(vd, B_FALSE); 2434 status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, 2435 vd, DDI_SLEEP); 2436 if (status == DDI_FAILURE) { 2437 PR0("cannot schedule task to recv msg\n"); 2438 vd_need_reset(vd, 
static boolean_t
vd_enabled(vd_t *vd)
{
	boolean_t enabled;


	mutex_enter(&vd->lock);
	enabled = vd->enabled;
	mutex_exit(&vd->lock);
	return (enabled);
}

static void
vd_recv_msg(void *arg)
{
	vd_t *vd = (vd_t *)arg;
	int rv = 0, status = 0;

	ASSERT(vd != NULL);

	PR2("New task to receive incoming message(s)");


	while (vd_enabled(vd) && status == 0) {
		size_t msglen, msgsize;
		ldc_status_t lstatus;

		/*
		 * Receive and process a message
		 */
		vd_reset_if_needed(vd);	/* can change vd->max_msglen */

		/*
		 * check if channel is UP - else break out of loop
		 */
		status = ldc_status(vd->ldc_handle, &lstatus);
		if (lstatus != LDC_UP) {
			PR0("channel not up (status=%d), exiting recv loop\n",
			    lstatus);
			break;
		}

		ASSERT(vd->max_msglen != 0);

		msgsize = vd->max_msglen;	/* stable copy for alloc/free */
		msglen = msgsize;	/* actual len after recv_msg() */

		status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen);
		switch (status) {
		case 0:
			rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp,
			    msglen);
			/* check if max_msglen changed */
			if (msgsize != vd->max_msglen) {
				PR0("max_msglen changed 0x%lx to 0x%lx bytes\n",
				    msgsize, vd->max_msglen);
				kmem_free(vd->vio_msgp, msgsize);
				vd->vio_msgp =
				    kmem_alloc(vd->max_msglen, KM_SLEEP);
			}
			if (rv == EINPROGRESS)
				continue;
			break;

		case ENOMSG:
			break;

		case ECONNRESET:
			PR0("initiating soft reset (ECONNRESET)\n");
			vd_need_reset(vd, B_FALSE);
			status = 0;
			break;

		default:
			/* Probably an LDC failure; arrange to reset it */
			PR0("initiating full reset (status=0x%x)", status);
			vd_need_reset(vd, B_TRUE);
			break;
		}
	}

	PR2("Task finished");
}
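/*
 * Why the "stable copy" above matters: processing a message (e.g., an
 * attribute or dring-registration exchange) can raise vd->max_msglen as a
 * side effect, so the loop snapshots the value in "msgsize" and uses that
 * same value for both the size check and kmem_free().  Freeing with the
 * *new* length after a mid-loop change would corrupt the kmem caches.
 */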
static uint_t
vd_handle_ldc_events(uint64_t event, caddr_t arg)
{
	vd_t *vd = (vd_t *)(void *)arg;
	int status;

	ASSERT(vd != NULL);

	if (!vd_enabled(vd))
		return (LDC_SUCCESS);

	if (event & LDC_EVT_DOWN) {
		PR0("LDC_EVT_DOWN: LDC channel went down");

		vd_need_reset(vd, B_TRUE);
		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
		    DDI_SLEEP);
		if (status == DDI_FAILURE) {
			PR0("cannot schedule task to recv msg\n");
			vd_need_reset(vd, B_TRUE);
		}
	}

	if (event & LDC_EVT_RESET) {
		PR0("LDC_EVT_RESET: LDC channel was reset");

		if (vd->state != VD_STATE_INIT) {
			PR0("scheduling soft reset");
			vd_need_reset(vd, B_FALSE);
			status = ddi_taskq_dispatch(vd->startq, vd_recv_msg,
			    vd, DDI_SLEEP);
			if (status == DDI_FAILURE) {
				PR0("cannot schedule task to recv msg\n");
				vd_need_reset(vd, B_TRUE);
			}

		} else {
			PR0("channel already reset, ignoring...\n");
			PR0("doing ldc up...\n");
			(void) ldc_up(vd->ldc_handle);
		}

		return (LDC_SUCCESS);
	}

	if (event & LDC_EVT_UP) {
		PR0("EVT_UP: LDC is up\nResetting client connection state");
		PR0("initiating soft reset");
		vd_need_reset(vd, B_FALSE);
		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg,
		    vd, DDI_SLEEP);
		if (status == DDI_FAILURE) {
			PR0("cannot schedule task to recv msg\n");
			vd_need_reset(vd, B_TRUE);
			return (LDC_SUCCESS);
		}
	}

	if (event & LDC_EVT_READ) {
		int status;

		PR1("New data available");
		/* Queue a task to receive the new data */
		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
		    DDI_SLEEP);

		if (status == DDI_FAILURE) {
			PR0("cannot schedule task to recv msg\n");
			vd_need_reset(vd, B_TRUE);
		}
	}

	return (LDC_SUCCESS);
}
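/*
 * Note that "event" is a bit-mask, so several LDC_EVT_* conditions can be
 * reported in a single callback invocation.  The handler above checks DOWN
 * first, then RESET (which returns without examining the remaining bits);
 * UP and READ are only handled when no reset was seen.  In every case the
 * actual work is pushed onto vd->startq so the LDC callback itself returns
 * quickly.
 */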
static uint_t
vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
{
	_NOTE(ARGUNUSED(key, val))
	(*((uint_t *)arg))++;
	return (MH_WALK_TERMINATE);
}


static int
vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	uint_t vd_present = 0;
	minor_t instance;
	vds_t *vds;


	switch (cmd) {
	case DDI_DETACH:
		/* the real work happens below */
		break;
	case DDI_SUSPEND:
		PR0("No action required for DDI_SUSPEND");
		return (DDI_SUCCESS);
	default:
		PR0("Unrecognized \"cmd\"");
		return (DDI_FAILURE);
	}

	ASSERT(cmd == DDI_DETACH);
	instance = ddi_get_instance(dip);
	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
		PR0("Could not get state for instance %u", instance);
		ddi_soft_state_free(vds_state, instance);
		return (DDI_FAILURE);
	}

	/* Do not detach while serving any vdisks */
	mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present);
	if (vd_present) {
		PR0("Not detaching because serving vdisks");
		return (DDI_FAILURE);
	}

	PR0("Detaching");
	if (vds->initialized & VDS_MDEG) {
		(void) mdeg_unregister(vds->mdeg);
		kmem_free(vds->ispecp->specp, sizeof (vds_prop_template));
		kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t));
		vds->ispecp = NULL;
		vds->mdeg = NULL;
	}

	if (vds->initialized & VDS_LDI)
		(void) ldi_ident_release(vds->ldi_ident);
	mod_hash_destroy_hash(vds->vd_table);
	ddi_soft_state_free(vds_state, instance);
	return (DDI_SUCCESS);
}

static boolean_t
is_pseudo_device(dev_info_t *dip)
{
	dev_info_t *parent, *root = ddi_root_node();


	for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root);
	    parent = ddi_get_parent(parent)) {
		if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}
static int
vd_setup_full_disk(vd_t *vd)
{
	int rval, status;
	major_t major = getmajor(vd->dev[0]);
	minor_t minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE;
	struct dk_minfo dk_minfo;

	/*
	 * At this point, vdisk_size is set to the size of partition 2 but
	 * this does not represent the size of the disk because partition 2
	 * may not cover the entire disk and its size does not include reserved
	 * blocks.  So we update vdisk_size to be the size of the entire disk.
	 */
	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO,
	    (intptr_t)&dk_minfo, (vd_open_flags | FKIOCTL),
	    kcred, &rval)) != 0) {
		PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d",
		    status);
		return (status);
	}
	vd->vdisk_size = dk_minfo.dki_capacity;

	/* Set full-disk parameters */
	vd->vdisk_type = VD_DISK_TYPE_DISK;
	vd->nslices = (sizeof (vd->dev))/(sizeof (vd->dev[0]));

	/* Move dev number and LDI handle to entire-disk-slice array elements */
	vd->dev[VD_ENTIRE_DISK_SLICE] = vd->dev[0];
	vd->dev[0] = 0;
	vd->ldi_handle[VD_ENTIRE_DISK_SLICE] = vd->ldi_handle[0];
	vd->ldi_handle[0] = NULL;

	/* Initialize device numbers for remaining slices and open them */
	for (int slice = 0; slice < vd->nslices; slice++) {
		/*
		 * Skip the entire-disk slice, as it's already open and its
		 * device known
		 */
		if (slice == VD_ENTIRE_DISK_SLICE)
			continue;
		ASSERT(vd->dev[slice] == 0);
		ASSERT(vd->ldi_handle[slice] == NULL);

		/*
		 * Construct the device number for the current slice
		 */
		vd->dev[slice] = makedevice(major, (minor + slice));

		/*
		 * Open all slices of the disk to serve them to the client.
		 * Slices are opened exclusively to prevent other threads or
		 * processes in the service domain from performing I/O to
		 * slices being accessed by a client.  Failure to open a slice
		 * results in vds not serving this disk, as the client could
		 * attempt (and should be able) to access any slice immediately.
		 * Any slices successfully opened before a failure will get
		 * closed by vds_destroy_vd() as a result of the error returned
		 * by this function.
		 *
		 * We need to do the open with FNDELAY so that opening an empty
		 * slice does not fail.
		 */
		PR0("Opening device major %u, minor %u = slice %u",
		    major, minor, slice);
		if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
		    vd_open_flags | FNDELAY, kcred, &vd->ldi_handle[slice],
		    vd->vds->ldi_ident)) != 0) {
			PRN("ldi_open_by_dev() returned errno %d "
			    "for slice %u", status, slice);
			/* vds_destroy_vd() will close any open slices */
			vd->ldi_handle[slice] = NULL;
			return (status);
		}
	}

	return (0);
}
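/*
 * Slice minor numbers are contiguous per disk, so the device number for any
 * slice can be derived from the entire-disk slice.  For example (numbers
 * hypothetical): if slice 2 has minor number 18, then "minor" above is
 * 18 - VD_ENTIRE_DISK_SLICE = 16, and slices 0 through 7 get minors 16
 * through 23 via makedevice(major, minor + slice).
 */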
static int
vd_setup_partition_efi(vd_t *vd)
{
	efi_gpt_t *gpt;
	efi_gpe_t *gpe;
	struct uuid uuid = EFI_RESERVED;
	uint32_t crc;
	int length;

	length = sizeof (efi_gpt_t) + sizeof (efi_gpe_t);

	gpt = kmem_zalloc(length, KM_SLEEP);
	gpe = (efi_gpe_t *)(gpt + 1);

	gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE);
	gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
	gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t));
	gpt->efi_gpt_FirstUsableLBA = LE_64(0ULL);
	gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1);
	gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1);
	gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t));

	UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid);
	gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA;
	gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA;

	CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table);
	gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);

	CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table);
	gpt->efi_gpt_HeaderCRC32 = LE_32(~crc);

	vd->dk_efi.dki_lba = 0;
	vd->dk_efi.dki_length = length;
	vd->dk_efi.dki_data = gpt;

	return (0);
}
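/*
 * A note on the CRC computations above: the CRC32() macro from sys/crc32.h
 * accumulates a running CRC seeded here with -1U, and the one's complement
 * (~crc) is the standard CRC-32 finalization that gets stored.  The
 * partition-entry-array CRC must be computed first, because the header CRC
 * covers the header field that holds it; the header CRC field itself is
 * still zero (from kmem_zalloc()) when the header is checksummed, as EFI
 * requires.
 */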
static int
vd_setup_file(vd_t *vd)
{
	int i, rval, status;
	ushort_t sum;
	vattr_t vattr;
	dev_t dev;
	char *file_path = vd->device_path;
	char dev_path[MAXPATHLEN + 1];
	ldi_handle_t lhandle;
	struct dk_cinfo dk_cinfo;
	struct dk_label label;

	/* make sure the file is valid */
	if ((status = lookupname(file_path, UIO_SYSSPACE, FOLLOW,
	    NULLVPP, &vd->file_vnode)) != 0) {
		PRN("Cannot lookup file(%s) errno %d", file_path, status);
		return (status);
	}

	if (vd->file_vnode->v_type != VREG) {
		PRN("Invalid file type (%s)\n", file_path);
		VN_RELE(vd->file_vnode);
		return (EBADF);
	}
	VN_RELE(vd->file_vnode);

	if ((status = vn_open(file_path, UIO_SYSSPACE, vd_open_flags | FOFFMAX,
	    0, &vd->file_vnode, 0, 0)) != 0) {
		PRN("vn_open(%s) = errno %d", file_path, status);
		return (status);
	}

	/*
	 * We set vd->file now so that vds_destroy_vd will take care of
	 * closing the file and releasing the vnode in case of an error.
	 */
	vd->file = B_TRUE;
	vd->pseudo = B_FALSE;

	vattr.va_mask = AT_SIZE;
	if ((status = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred)) != 0) {
		PRN("VOP_GETATTR(%s) = errno %d", file_path, status);
		return (EIO);
	}

	vd->file_size = vattr.va_size;
	/* size should be at least sizeof (dk_label) */
	if (vd->file_size < sizeof (struct dk_label)) {
		PRN("Size of file has to be at least %ld bytes",
		    sizeof (struct dk_label));
		return (EIO);
	}

	if (vd->file_vnode->v_flag & VNOMAP) {
		PRN("File %s cannot be mapped", file_path);
		return (EIO);
	}

	/* read label from file */
	if (VD_FILE_LABEL_READ(vd, &label) < 0) {
		PRN("Can't read label from %s", file_path);
		return (EIO);
	}

	/* label checksum */
	sum = vd_lbl2cksum(&label);

	if (label.dkl_magic != DKL_MAGIC || label.dkl_cksum != sum) {
		PR0("%s has an invalid disk label "
		    "(magic=%x cksum=%x (expect %x))",
		    file_path, label.dkl_magic, label.dkl_cksum, sum);

		/* default label */
		bzero(&label, sizeof (struct dk_label));

		/*
		 * We must have a reasonable number of cylinders and sectors so
		 * that newfs can run using default values.
		 *
		 * if (disk_size < 2MB)
		 *	phys_cylinders = disk_size / 100K
		 * else
		 *	phys_cylinders = disk_size / 300K
		 *
		 * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders
		 * alt_cylinders = (phys_cylinders > 2) ? 2 : 0;
		 * data_cylinders = phys_cylinders - alt_cylinders
		 *
		 * sectors = disk_size / (phys_cylinders * blk_size)
		 */
		if (vd->file_size < (2 * 1024 * 1024))
			label.dkl_pcyl = vd->file_size / (100 * 1024);
		else
			label.dkl_pcyl = vd->file_size / (300 * 1024);

		if (label.dkl_pcyl == 0)
			label.dkl_pcyl = 1;

		if (label.dkl_pcyl > 2)
			label.dkl_acyl = 2;
		else
			label.dkl_acyl = 0;

		label.dkl_nsect = vd->file_size /
		    (DEV_BSIZE * label.dkl_pcyl);
		label.dkl_ncyl = label.dkl_pcyl - label.dkl_acyl;
		label.dkl_nhead = 1;
		label.dkl_write_reinstruct = 0;
		label.dkl_read_reinstruct = 0;
		label.dkl_rpm = 7200;
		label.dkl_apc = 0;
		label.dkl_intrlv = 0;
		label.dkl_magic = DKL_MAGIC;

		PR0("requested disk size: %ld bytes\n", vd->file_size);
		PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label.dkl_pcyl,
		    label.dkl_nhead, label.dkl_nsect);
		PR0("provided disk size: %ld bytes\n", (uint64_t)
		    (label.dkl_pcyl *
		    label.dkl_nhead * label.dkl_nsect * DEV_BSIZE));
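		/*
		 * Worked example (hypothetical 10MB backing file):
		 * pcyl = 10485760 / 307200 = 34, acyl = 2, ncyl = 32,
		 * nsect = 10485760 / (512 * 34) = 602, nhead = 1, so the
		 * backup slice covers ncyl * nhead * nsect = 19,264 blocks
		 * (9,863,168 bytes); the remainder of the file is simply
		 * not exposed.
		 */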

		/*
		 * We must have a correct label name otherwise format(1M) will
		 * not recognize the disk as labeled.
		 */
		(void) snprintf(label.dkl_asciilabel, LEN_DKL_ASCII,
		    "SUNVDSK cyl %d alt %d hd %d sec %d",
		    label.dkl_ncyl, label.dkl_acyl, label.dkl_nhead,
		    label.dkl_nsect);

		/* default VTOC */
		label.dkl_vtoc.v_version = V_VERSION;
		label.dkl_vtoc.v_nparts = V_NUMPAR;
		label.dkl_vtoc.v_sanity = VTOC_SANE;
		label.dkl_vtoc.v_part[2].p_tag = V_BACKUP;
		label.dkl_map[2].dkl_cylno = 0;
		label.dkl_map[2].dkl_nblk = label.dkl_ncyl *
		    label.dkl_nhead * label.dkl_nsect;
		label.dkl_map[0] = label.dkl_map[2];
		label.dkl_cksum = vd_lbl2cksum(&label);

		/* write default label to file */
		if (VD_FILE_LABEL_WRITE(vd, &label) < 0) {
			PRN("Can't write label to %s", file_path);
			return (EIO);
		}
	}

	vd->nslices = label.dkl_vtoc.v_nparts;

	/* sector size = block size = DEV_BSIZE; vdisk_size is in blocks */
	vd->vdisk_size = label.dkl_pcyl *
	    label.dkl_nhead * label.dkl_nsect;
	vd->vdisk_type = VD_DISK_TYPE_DISK;
	vd->vdisk_label = VD_DISK_LABEL_VTOC;
	vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */

	/* Get max_xfer_sz from the device where the file is */
	dev = vd->file_vnode->v_vfsp->vfs_dev;
	dev_path[0] = '\0';
	if (ddi_dev_pathname(dev, S_IFBLK, dev_path) == DDI_SUCCESS) {
		PR0("underlying device = %s\n", dev_path);
	}

	if ((status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD,
	    kcred, &lhandle, vd->vds->ldi_ident)) != 0) {
		PR0("ldi_open_by_dev() returned errno %d for device %s",
		    status, dev_path);
	} else {
		if ((status = ldi_ioctl(lhandle, DKIOCINFO,
		    (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred,
		    &rval)) != 0) {
			PR0("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
			    status, dev_path);
		} else {
			/*
			 * Store the device's max transfer size for
			 * return to the client
			 */
			vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
		}

		PR0("close the device %s", dev_path);
		(void) ldi_close(lhandle, FREAD, kcred);
	}

	PR0("using file %s, dev %s, max_xfer = %u blks",
	    file_path, dev_path, vd->max_xfer_sz);

	vd->dk_geom.dkg_ncyl = label.dkl_ncyl;
	vd->dk_geom.dkg_acyl = label.dkl_acyl;
	vd->dk_geom.dkg_pcyl = label.dkl_pcyl;
	vd->dk_geom.dkg_nhead = label.dkl_nhead;
	vd->dk_geom.dkg_nsect = label.dkl_nsect;
	vd->dk_geom.dkg_intrlv = label.dkl_intrlv;
	vd->dk_geom.dkg_apc = label.dkl_apc;
	vd->dk_geom.dkg_rpm = label.dkl_rpm;
	vd->dk_geom.dkg_write_reinstruct = label.dkl_write_reinstruct;
	vd->dk_geom.dkg_read_reinstruct = label.dkl_read_reinstruct;

	vd->vtoc.v_sanity = label.dkl_vtoc.v_sanity;
	vd->vtoc.v_version = label.dkl_vtoc.v_version;
	vd->vtoc.v_sectorsz = DEV_BSIZE;
	vd->vtoc.v_nparts = label.dkl_vtoc.v_nparts;

	bcopy(label.dkl_vtoc.v_volume, vd->vtoc.v_volume,
	    LEN_DKL_VVOL);
	bcopy(label.dkl_asciilabel, vd->vtoc.v_asciilabel,
	    LEN_DKL_ASCII);

	for (i = 0; i < vd->nslices; i++) {
		vd->vtoc.timestamp[i] = label.dkl_vtoc.v_timestamp[i];
		vd->vtoc.v_part[i].p_tag = label.dkl_vtoc.v_part[i].p_tag;
		vd->vtoc.v_part[i].p_flag = label.dkl_vtoc.v_part[i].p_flag;
		vd->vtoc.v_part[i].p_start = label.dkl_map[i].dkl_cylno *
		    label.dkl_nhead * label.dkl_nsect;
		vd->vtoc.v_part[i].p_size = label.dkl_map[i].dkl_nblk;
		vd->ldi_handle[i] = NULL;
		vd->dev[i] = 0;
	}

	return (0);
}
"yes" : "no"), 3054 vd->nslices); 3055 } else { 3056 if (status != EAGAIN) 3057 return (status); 3058 } 3059 3060 /* Initialize locking */ 3061 if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED, 3062 &iblock) != DDI_SUCCESS) { 3063 PRN("Could not get iblock cookie."); 3064 return (EIO); 3065 } 3066 3067 mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock); 3068 vd->initialized |= VD_LOCKING; 3069 3070 3071 /* Create start and completion task queues for the vdisk */ 3072 (void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id); 3073 PR1("tq_name = %s", tq_name); 3074 if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1, 3075 TASKQ_DEFAULTPRI, 0)) == NULL) { 3076 PRN("Could not create task queue"); 3077 return (EIO); 3078 } 3079 (void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id); 3080 PR1("tq_name = %s", tq_name); 3081 if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1, 3082 TASKQ_DEFAULTPRI, 0)) == NULL) { 3083 PRN("Could not create task queue"); 3084 return (EIO); 3085 } 3086 vd->enabled = 1; /* before callback can dispatch to startq */ 3087 3088 3089 /* Bring up LDC */ 3090 ldc_attr.devclass = LDC_DEV_BLK_SVC; 3091 ldc_attr.instance = ddi_get_instance(vds->dip); 3092 ldc_attr.mode = LDC_MODE_UNRELIABLE; 3093 ldc_attr.mtu = VD_LDC_MTU; 3094 if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) { 3095 PRN("Could not initialize LDC channel %lu, " 3096 "init failed with error %d", ldc_id, status); 3097 return (status); 3098 } 3099 vd->initialized |= VD_LDC; 3100 3101 if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events, 3102 (caddr_t)vd)) != 0) { 3103 PRN("Could not initialize LDC channel %lu," 3104 "reg_callback failed with error %d", ldc_id, status); 3105 return (status); 3106 } 3107 3108 if ((status = ldc_open(vd->ldc_handle)) != 0) { 3109 PRN("Could not initialize LDC channel %lu," 3110 "open failed with error %d", ldc_id, status); 3111 return (status); 3112 } 3113 3114 if ((status = ldc_up(vd->ldc_handle)) != 0) { 3115 PR0("ldc_up() returned errno %d", status); 3116 } 3117 3118 /* Allocate the inband task memory handle */ 3119 status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl)); 3120 if (status) { 3121 PRN("Could not initialize LDC channel %lu," 3122 "alloc_handle failed with error %d", ldc_id, status); 3123 return (ENXIO); 3124 } 3125 3126 /* Add the successfully-initialized vdisk to the server's table */ 3127 if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) { 3128 PRN("Error adding vdisk ID %lu to table", id); 3129 return (EIO); 3130 } 3131 3132 /* Allocate the staging buffer */ 3133 vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ 3134 vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP); 3135 3136 /* store initial state */ 3137 vd->state = VD_STATE_INIT; 3138 3139 return (0); 3140 } 3141 3142 static void 3143 vd_free_dring_task(vd_t *vdp) 3144 { 3145 if (vdp->dring_task != NULL) { 3146 ASSERT(vdp->dring_len != 0); 3147 /* Free all dring_task memory handles */ 3148 for (int i = 0; i < vdp->dring_len; i++) { 3149 (void) ldc_mem_free_handle(vdp->dring_task[i].mhdl); 3150 kmem_free(vdp->dring_task[i].msg, vdp->max_msglen); 3151 vdp->dring_task[i].msg = NULL; 3152 } 3153 kmem_free(vdp->dring_task, 3154 (sizeof (*vdp->dring_task)) * vdp->dring_len); 3155 vdp->dring_task = NULL; 3156 } 3157 } 3158 3159 /* 3160 * Destroy the state associated with a virtual disk 3161 */ 3162 static void 3163 vds_destroy_vd(void *arg) 3164 { 3165 vd_t *vd = (vd_t *)arg; 3166 int retry = 0, 
static int
vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id,
    vd_t **vdp)
{
	char tq_name[TASKQ_NAMELEN];
	int status;
	ddi_iblock_cookie_t iblock = NULL;
	ldc_attr_t ldc_attr;
	vd_t *vd;


	ASSERT(vds != NULL);
	ASSERT(device_path != NULL);
	ASSERT(vdp != NULL);
	PR0("Adding vdisk for %s", device_path);

	if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) {
		PRN("No memory for virtual disk");
		return (EAGAIN);
	}
	*vdp = vd;	/* assign here so vds_destroy_vd() can cleanup later */
	vd->vds = vds;
	(void) strncpy(vd->device_path, device_path, MAXPATHLEN);

	/* Open vdisk and initialize parameters */
	if ((status = vd_setup_vd(vd)) == 0) {
		vd->initialized |= VD_DISK_READY;

		ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
		PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u",
		    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
		    (vd->pseudo ? "yes" : "no"), (vd->file ? "yes" : "no"),
		    vd->nslices);
	} else {
		if (status != EAGAIN)
			return (status);
	}

	/* Initialize locking */
	if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED,
	    &iblock) != DDI_SUCCESS) {
		PRN("Could not get iblock cookie.");
		return (EIO);
	}

	mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock);
	vd->initialized |= VD_LOCKING;


	/* Create start and completion task queues for the vdisk */
	(void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id);
	PR1("tq_name = %s", tq_name);
	if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1,
	    TASKQ_DEFAULTPRI, 0)) == NULL) {
		PRN("Could not create task queue");
		return (EIO);
	}
	(void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id);
	PR1("tq_name = %s", tq_name);
	if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1,
	    TASKQ_DEFAULTPRI, 0)) == NULL) {
		PRN("Could not create task queue");
		return (EIO);
	}
	vd->enabled = 1;	/* before callback can dispatch to startq */


	/* Bring up LDC */
	ldc_attr.devclass = LDC_DEV_BLK_SVC;
	ldc_attr.instance = ddi_get_instance(vds->dip);
	ldc_attr.mode = LDC_MODE_UNRELIABLE;
	ldc_attr.mtu = VD_LDC_MTU;
	if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) {
		PRN("Could not initialize LDC channel %lu, "
		    "init failed with error %d", ldc_id, status);
		return (status);
	}
	vd->initialized |= VD_LDC;

	if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events,
	    (caddr_t)vd)) != 0) {
		PRN("Could not initialize LDC channel %lu, "
		    "reg_callback failed with error %d", ldc_id, status);
		return (status);
	}

	if ((status = ldc_open(vd->ldc_handle)) != 0) {
		PRN("Could not initialize LDC channel %lu, "
		    "open failed with error %d", ldc_id, status);
		return (status);
	}

	if ((status = ldc_up(vd->ldc_handle)) != 0) {
		PR0("ldc_up() returned errno %d", status);
	}

	/* Allocate the inband task memory handle */
	status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl));
	if (status) {
		PRN("Could not initialize LDC channel %lu, "
		    "alloc_handle failed with error %d", ldc_id, status);
		return (ENXIO);
	}

	/* Add the successfully-initialized vdisk to the server's table */
	if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) {
		PRN("Error adding vdisk ID %lu to table", id);
		return (EIO);
	}

	/* Allocate the staging buffer */
	vd->max_msglen = sizeof (vio_msg_t);	/* baseline vio message size */
	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);

	/* store initial state */
	vd->state = VD_STATE_INIT;

	return (0);
}
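/*
 * The LDC bring-up order above is deliberate: ldc_init() creates the
 * channel, the callback is registered before ldc_open() so the handler is
 * in place before events can arrive, and ldc_up() merely requests that the
 * link come up (its failure is only logged, since the peer may not be ready
 * yet; the LDC_EVT_UP callback completes the job).  Progress is recorded in
 * vd->initialized (e.g., VD_LDC) so vds_destroy_vd() can unwind exactly
 * what was done.
 */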
static void
vd_free_dring_task(vd_t *vdp)
{
	if (vdp->dring_task != NULL) {
		ASSERT(vdp->dring_len != 0);
		/* Free all dring_task memory handles */
		for (int i = 0; i < vdp->dring_len; i++) {
			(void) ldc_mem_free_handle(vdp->dring_task[i].mhdl);
			kmem_free(vdp->dring_task[i].msg, vdp->max_msglen);
			vdp->dring_task[i].msg = NULL;
		}
		kmem_free(vdp->dring_task,
		    (sizeof (*vdp->dring_task)) * vdp->dring_len);
		vdp->dring_task = NULL;
	}
}

/*
 * Destroy the state associated with a virtual disk
 */
static void
vds_destroy_vd(void *arg)
{
	vd_t *vd = (vd_t *)arg;
	int retry = 0, rv;

	if (vd == NULL)
		return;

	PR0("Destroying vdisk state");

	if (vd->dk_efi.dki_data != NULL)
		kmem_free(vd->dk_efi.dki_data, vd->dk_efi.dki_length);

	/* Disable queuing requests for the vdisk */
	if (vd->initialized & VD_LOCKING) {
		mutex_enter(&vd->lock);
		vd->enabled = 0;
		mutex_exit(&vd->lock);
	}

	/* Drain and destroy start queue (*before* destroying completionq) */
	if (vd->startq != NULL)
		ddi_taskq_destroy(vd->startq);	/* waits for queued tasks */

	/* Drain and destroy completion queue (*before* shutting down LDC) */
	if (vd->completionq != NULL)
		ddi_taskq_destroy(vd->completionq);	/* waits for tasks */

	vd_free_dring_task(vd);

	/* Free the inband task memory handle */
	(void) ldc_mem_free_handle(vd->inband_task.mhdl);

	/* Shut down LDC */
	if (vd->initialized & VD_LDC) {
		/* unmap the dring */
		if (vd->initialized & VD_DRING)
			(void) ldc_mem_dring_unmap(vd->dring_handle);

		/* close LDC channel - retry on EAGAIN */
		while ((rv = ldc_close(vd->ldc_handle)) == EAGAIN) {
			if (++retry > vds_ldc_retries) {
				PR0("Timed out closing channel");
				break;
			}
			drv_usecwait(vds_ldc_delay);
		}
		if (rv == 0) {
			(void) ldc_unreg_callback(vd->ldc_handle);
			(void) ldc_fini(vd->ldc_handle);
		} else {
			/*
			 * Closing the LDC channel has failed.  Ideally we
			 * should fail here but there is no Zeus level
			 * infrastructure to handle this.  The MD has already
			 * been changed and we have to do the close.  So we
			 * try to do as much clean up as we can.
			 */
			(void) ldc_set_cb_mode(vd->ldc_handle, LDC_CB_DISABLE);
			while (ldc_unreg_callback(vd->ldc_handle) == EAGAIN)
				drv_usecwait(vds_ldc_delay);
		}
	}

	/* Free the staging buffer for msgs */
	if (vd->vio_msgp != NULL) {
		kmem_free(vd->vio_msgp, vd->max_msglen);
		vd->vio_msgp = NULL;
	}

	/* Free the inband message buffer */
	if (vd->inband_task.msg != NULL) {
		kmem_free(vd->inband_task.msg, vd->max_msglen);
		vd->inband_task.msg = NULL;
	}
	if (vd->file) {
		/* Close file */
		(void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1,
		    0, kcred);
		VN_RELE(vd->file_vnode);
	} else {
		/* Close any open backing-device slices */
		for (uint_t slice = 0; slice < vd->nslices; slice++) {
			if (vd->ldi_handle[slice] != NULL) {
				PR0("Closing slice %u", slice);
				(void) ldi_close(vd->ldi_handle[slice],
				    vd_open_flags | FNDELAY, kcred);
			}
		}
	}

	/* Free lock */
	if (vd->initialized & VD_LOCKING)
		mutex_destroy(&vd->lock);

	/* Finally, free the vdisk structure itself */
	kmem_free(vd, sizeof (*vd));
}
static int
vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id)
{
	int status;
	vd_t *vd = NULL;


	if ((status = vds_do_init_vd(vds, id, device_path, ldc_id, &vd)) != 0)
		vds_destroy_vd(vd);

	return (status);
}

static int
vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel,
    uint64_t *ldc_id)
{
	int num_channels;


	/* Look for channel endpoint child(ren) of the vdisk MD node */
	if ((num_channels = md_scan_dag(md, vd_node,
	    md_find_name(md, VD_CHANNEL_ENDPOINT),
	    md_find_name(md, "fwd"), channel)) <= 0) {
		PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT);
		return (-1);
	}

	/* Get the "id" value for the first channel endpoint node */
	if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) {
		PRN("No \"%s\" property found for \"%s\" of vdisk",
		    VD_ID_PROP, VD_CHANNEL_ENDPOINT);
		return (-1);
	}

	if (num_channels > 1) {
		PRN("Using ID of first of multiple channels for this vdisk");
	}

	return (0);
}

static int
vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id)
{
	int num_nodes, status;
	size_t size;
	mde_cookie_t *channel;


	if ((num_nodes = md_node_count(md)) <= 0) {
		PRN("Invalid node count in Machine Description subtree");
		return (-1);
	}
	size = num_nodes*(sizeof (*channel));
	channel = kmem_zalloc(size, KM_SLEEP);
	status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id);
	kmem_free(channel, size);

	return (status);
}

static void
vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
{
	char *device_path = NULL;
	uint64_t id = 0, ldc_id = 0;


	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
		PRN("Error getting vdisk \"%s\"", VD_ID_PROP);
		return;
	}
	PR0("Adding vdisk ID %lu", id);
	if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP,
	    &device_path) != 0) {
		PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
		return;
	}

	if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) {
		PRN("Error getting LDC ID for vdisk %lu", id);
		return;
	}

	if (vds_init_vd(vds, id, device_path, ldc_id) != 0) {
		PRN("Failed to add vdisk ID %lu", id);
		return;
	}
}
static void
vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
{
	uint64_t id = 0;


	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
		PRN("Unable to get \"%s\" property from vdisk's MD node",
		    VD_ID_PROP);
		return;
	}
	PR0("Removing vdisk ID %lu", id);
	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
		PRN("No vdisk entry found for vdisk ID %lu", id);
}

static void
vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node,
    md_t *curr_md, mde_cookie_t curr_vd_node)
{
	char *curr_dev, *prev_dev;
	uint64_t curr_id = 0, curr_ldc_id = 0;
	uint64_t prev_id = 0, prev_ldc_id = 0;
	size_t len;


	/* Validate that vdisk ID has not changed */
	if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) {
		PRN("Error getting previous vdisk \"%s\" property",
		    VD_ID_PROP);
		return;
	}
	if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) {
		PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP);
		return;
	}
	if (curr_id != prev_id) {
		PRN("Not changing vdisk:  ID changed from %lu to %lu",
		    prev_id, curr_id);
		return;
	}

	/* Validate that LDC ID has not changed */
	if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) {
		PRN("Error getting LDC ID for vdisk %lu", prev_id);
		return;
	}

	if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) {
		PRN("Error getting LDC ID for vdisk %lu", curr_id);
		return;
	}
	if (curr_ldc_id != prev_ldc_id) {
		_NOTE(NOTREACHED);	/* lint is confused */
		PRN("Not changing vdisk:  "
		    "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id);
		return;
	}

	/* Determine whether device path has changed */
	if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP,
	    &prev_dev) != 0) {
		PRN("Error getting previous vdisk \"%s\"",
		    VD_BLOCK_DEVICE_PROP);
		return;
	}
	if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP,
	    &curr_dev) != 0) {
		PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
		return;
	}
	if (((len = strlen(curr_dev)) == strlen(prev_dev)) &&
	    (strncmp(curr_dev, prev_dev, len) == 0))
		return;	/* no relevant (supported) change */

	PR0("Changing vdisk ID %lu", prev_id);

	/* Remove old state, which will close vdisk and reset */
	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0)
		PRN("No entry found for vdisk ID %lu", prev_id);

	/* Re-initialize vdisk with new state */
	if (vds_init_vd(vds, curr_id, curr_dev, curr_ldc_id) != 0) {
		PRN("Failed to change vdisk ID %lu", curr_id);
		return;
	}
}

static int
vds_process_md(void *arg, mdeg_result_t *md)
{
	int i;
	vds_t *vds = arg;


	if (md == NULL)
		return (MDEG_FAILURE);
	ASSERT(vds != NULL);

	for (i = 0; i < md->removed.nelem; i++)
		vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]);
	for (i = 0; i < md->match_curr.nelem; i++)
		vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i],
		    md->match_curr.mdp, md->match_curr.mdep[i]);
	for (i = 0; i < md->added.nelem; i++)
		vds_add_vd(vds, md->added.mdp, md->added.mdep[i]);

	return (MDEG_SUCCESS);
}


static int
vds_do_attach(dev_info_t *dip)
{
	int status, sz;
	int cfg_handle;
	minor_t instance = ddi_get_instance(dip);
	vds_t *vds;
	mdeg_prop_spec_t *pspecp;
	mdeg_node_spec_t *ispecp;

	/*
	 * The "cfg-handle" property of a vds node in an MD contains the MD's
	 * notion of "instance", or unique identifier, for that node; OBP
	 * stores the value of the "cfg-handle" MD property as the value of
	 * the "reg" property on the node in the device tree it builds from
	 * the MD and passes to Solaris.  Thus, we look up the devinfo node's
	 * "reg" property value to uniquely identify this device instance when
	 * registering with the MD event-generation framework.  If the "reg"
	 * property cannot be found, the device tree state is presumably so
	 * broken that there is no point in continuing.
	 */
	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    VD_REG_PROP)) {
		PRN("vds \"%s\" property does not exist", VD_REG_PROP);
		return (DDI_FAILURE);
	}

	/* Get the MD instance for later MDEG registration */
	cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    VD_REG_PROP, -1);

	if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) {
		PRN("Could not allocate state for instance %u", instance);
		return (DDI_FAILURE);
	}

	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
		PRN("Could not get state for instance %u", instance);
		ddi_soft_state_free(vds_state, instance);
		return (DDI_FAILURE);
	}


	vds->dip = dip;
	vds->vd_table = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS,
	    vds_destroy_vd, sizeof (void *));
	ASSERT(vds->vd_table != NULL);

	if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
		PRN("ldi_ident_from_dip() returned errno %d", status);
		return (DDI_FAILURE);
	}
	vds->initialized |= VDS_LDI;

	/* Register for MD updates */
	sz = sizeof (vds_prop_template);
	pspecp = kmem_alloc(sz, KM_SLEEP);
	bcopy(vds_prop_template, pspecp, sz);

	VDS_SET_MDEG_PROP_INST(pspecp, cfg_handle);

	/* initialize the complete prop spec structure */
	ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
	ispecp->namep = "virtual-device";
	ispecp->specp = pspecp;

	if (mdeg_register(ispecp, &vd_match, vds_process_md, vds,
	    &vds->mdeg) != MDEG_SUCCESS) {
		PRN("Unable to register for MD updates");
		kmem_free(ispecp, sizeof (mdeg_node_spec_t));
		kmem_free(pspecp, sz);
		return (DDI_FAILURE);
	}

	vds->ispecp = ispecp;
	vds->initialized |= VDS_MDEG;

	/* Prevent auto-detaching so driver is available whenever MD changes */
	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
	    DDI_PROP_SUCCESS) {
		PRN("failed to set \"%s\" property for instance %u",
		    DDI_NO_AUTODETACH, instance);
	}

	ddi_report_dev(dip);
	return (DDI_SUCCESS);
}

static int
vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int status;

	switch (cmd) {
	case DDI_ATTACH:
		PR0("Attaching");
		if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
			(void) vds_detach(dip, DDI_DETACH);
		return (status);
	case DDI_RESUME:
		PR0("No action required for DDI_RESUME");
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}

static struct dev_ops vds_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	ddi_no_info,	/* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	vds_attach,	/* devo_attach */
	vds_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	NULL,		/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	nulldev		/* devo_power */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"virtual disk server v%I%",
	&vds_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};
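/*
 * Illustrative note on the operations bit-mask filled in by _init() below:
 * each supported operation contributes bit (opcode - 1).  Assuming the
 * vdsk_common.h opcode values (e.g., VD_OP_BREAD == 1 and VD_OP_BWRITE == 2),
 * a server supporting only reads and writes would advertise
 * vds_operations == 0x3.
 */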
int
_init(void)
{
	int i, status;


	if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
		return (status);
	if ((status = mod_install(&modlinkage)) != 0) {
		ddi_soft_state_fini(&vds_state);
		return (status);
	}

	/* Fill in the bit-mask of server-supported operations */
	for (i = 0; i < vds_noperations; i++)
		vds_operations |= 1 << (vds_operation[i].operation - 1);

	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int status;


	if ((status = mod_remove(&modlinkage)) != 0)
		return (status);
	ddi_soft_state_fini(&vds_state);
	return (0);
}