/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Virtual disk server
 */


#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/file.h>
#include <sys/mdeg.h>
#include <sys/modhash.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/sysmacros.h>
#include <sys/vio_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdsk_common.h>
#include <sys/vtoc.h>


/* Virtual disk server initialization flags */
#define	VDS_LOCKING		0x01
#define	VDS_LDI			0x02
#define	VDS_MDEG		0x04

/* Virtual disk server tunable parameters */
#define	VDS_LDC_RETRIES		3
#define	VDS_NCHAINS		32

/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
#define	VDS_NAME		"virtual-disk-server"

#define	VD_NAME			"vd"
#define	VD_VOLUME_NAME		"vdisk"
#define	VD_ASCIILABEL		"Virtual Disk"

#define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
#define	VD_ID_PROP		"id"
#define	VD_BLOCK_DEVICE_PROP	"vds-block-device"

/* Virtual disk initialization flags */
#define	VD_LOCKING		0x01
#define	VD_TASKQ		0x02
#define	VD_LDC			0x04
#define	VD_DRING		0x08
#define	VD_SID			0x10
#define	VD_SEQ_NUM		0x20

/* Flags for opening/closing backing devices via LDI */
#define	VD_OPEN_FLAGS		(FEXCL | FREAD | FWRITE)

/*
 * By Solaris convention, slice/partition 2 represents the entire disk;
 * unfortunately, this convention does not appear to be codified.
 */
#define	VD_ENTIRE_DISK_SLICE	2

/* Return a cpp token as a string */
#define	STRINGIZE(token)	#token

/*
 * Print a message prefixed with the current function name to the message log
 * (and optionally to the console for verbose boots); these macros use cpp's
 * concatenation of string literals and C99 variable-length-argument-list
 * macros
 */
#define	PRN(...)	_PRN("?%s(): "__VA_ARGS__, "")
#define	_PRN(format, ...)					\
	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)
/* Return a pointer to the "i"th vdisk dring element */
#define	VD_DRING_ELEM(i)	((vd_dring_entry_t *)(void *)	\
	    (vd->dring + (i)*vd->descriptor_size))

/* Return the virtual disk client's type as a string (for use in messages) */
#define	VD_CLIENT(vd)							\
	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
	    (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" :	\
		(((vd)->xfer_mode == 0) ? "null client" :		\
		    "unsupported client")))

/* Debugging macros */
#ifdef DEBUG
#define	PR0 if (vd_msglevel > 0)	PRN
#define	PR1 if (vd_msglevel > 1)	PRN
#define	PR2 if (vd_msglevel > 2)	PRN

#define	VD_DUMP_DRING_ELEM(elem)					\
	PRN("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
	    elem->hdr.dstate,						\
	    elem->payload.operation,					\
	    elem->payload.status,					\
	    elem->payload.nbytes,					\
	    elem->payload.addr,						\
	    elem->payload.ncookies);

#else	/* !DEBUG */
#define	PR0(...)
#define	PR1(...)
#define	PR2(...)

#define	VD_DUMP_DRING_ELEM(elem)

#endif	/* DEBUG */


typedef struct vds {
	uint_t		initialized;	/* driver inst initialization flags */
	dev_info_t	*dip;		/* driver inst devinfo pointer */
	kmutex_t	lock;		/* lock for this structure */
	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
	mod_hash_t	*vd_table;	/* table of virtual disks served */
	mdeg_handle_t	mdeg;		/* handle for MDEG operations */
} vds_t;

typedef struct vd {
	uint_t			initialized;	/* vdisk initialization flags */
	kmutex_t		lock;		/* lock for this structure */
	vds_t			*vds;		/* server for this vdisk */
	ddi_taskq_t		*taskq;		/* taskq for this vdisk */
	ldi_handle_t		ldi_handle[V_NUMPAR];	/* LDI slice handles */
	dev_t			dev[V_NUMPAR];	/* dev numbers for slices */
	uint_t			nslices;	/* number of slices */
	size_t			vdisk_size;	/* number of blocks in vdisk */
	vd_disk_type_t		vdisk_type;	/* slice or entire disk */
	boolean_t		pseudo;		/* underlying pseudo dev */
	struct dk_geom		dk_geom;	/* synthetic for slice type */
	struct vtoc		vtoc;		/* synthetic for slice type */
	ldc_status_t		ldc_state;	/* LDC connection state */
	ldc_handle_t		ldc_handle;	/* handle for LDC comm */
	size_t			max_msglen;	/* largest LDC message len */
	boolean_t		enabled;	/* whether vdisk is enabled */
	vd_state_t		state;		/* client handshake state */
	uint8_t			xfer_mode;	/* transfer mode with client */
	uint32_t		sid;		/* client's session ID */
	uint64_t		seq_num;	/* message sequence number */
	uint64_t		dring_ident;	/* identifier of dring */
	ldc_dring_handle_t	dring_handle;	/* handle for dring ops */
	uint32_t		descriptor_size;	/* num bytes in desc */
	uint32_t		dring_len;	/* number of dring elements */
	caddr_t			dring;		/* address of dring */
} vd_t;

typedef struct vds_operation {
	uint8_t	operation;
	int	(*function)(vd_t *vd, vd_dring_payload_t *request);
} vds_operation_t;

typedef struct vd_ioctl {
	uint8_t		operation;	/* vdisk operation */
	const char	*operation_name;	/* vdisk operation name */
	size_t		nbytes;		/* size of operation buffer */
	int		cmd;		/* corresponding ioctl cmd */
	const char	*cmd_name;	/* ioctl cmd name */
	void		*arg;		/* ioctl cmd argument */
	/* convert input vd_buf to output ioctl_arg */
	void		(*copyin)(void *vd_buf, void *ioctl_arg);
	/* convert input ioctl_arg to output vd_buf */
	void		(*copyout)(void *ioctl_arg, void *vd_buf);
} vd_ioctl_t;

/* Define trivial copyin/copyout conversion function flag */
#define	VD_IDENTITY	((void (*)(void *, void *))-1)


static int	vds_ldc_retries = VDS_LDC_RETRIES;
static void	*vds_state;
static uint64_t	vds_operations;	/* see vds_operation[] definition below */

static int	vd_open_flags = VD_OPEN_FLAGS;

/*
 * Supported protocol version pairs, from highest (newest) to lowest (oldest)
 *
 * Each supported major version should appear only once, paired with (and only
 * with) its highest supported minor version number (as the protocol requires
 * supporting all lower minor version numbers as well)
 */
static const vio_ver_t	vds_version[] = {{1, 0}};
static const size_t	vds_num_versions =
    sizeof (vds_version)/sizeof (vds_version[0]);
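/*
 * Hypothetical example of the layout the comment above requires:  a
 * server supporting protocol versions up to 2.1 and up to 1.3 would
 * define
 *
 *	static const vio_ver_t	vds_version[] = {{2, 1}, {1, 3}};
 *
 * one entry per major version, highest major first, each entry carrying
 * its highest supported minor; vds_supported_version() below asserts
 * this ordering.
 */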
#ifdef DEBUG
static int	vd_msglevel;
#endif /* DEBUG */


static int
vd_bread(vd_t *vd, vd_dring_payload_t *request)
{
	int		status;
	struct buf	buf;

	PR1("Read %lu bytes at block %lu", request->nbytes, request->addr);
	if (request->nbytes == 0)
		return (EINVAL);	/* no service for trivial requests */
	ASSERT(mutex_owned(&vd->lock));
	ASSERT(request->slice < vd->nslices);

	bioinit(&buf);
	buf.b_flags	= B_BUSY | B_READ;
	buf.b_bcount	= request->nbytes;
	buf.b_un.b_addr	= kmem_alloc(buf.b_bcount, KM_SLEEP);
	buf.b_lblkno	= request->addr;
	buf.b_edev	= vd->dev[request->slice];

	if ((status = ldi_strategy(vd->ldi_handle[request->slice], &buf)) == 0)
		status = biowait(&buf);
	biofini(&buf);
	if ((status == 0) &&
	    ((status = ldc_mem_copy(vd->ldc_handle, buf.b_un.b_addr, 0,
		&request->nbytes, request->cookie, request->ncookies,
		LDC_COPY_OUT)) != 0)) {
		PRN("ldc_mem_copy() returned errno %d copying to client",
		    status);
	}
	kmem_free(buf.b_un.b_addr, buf.b_bcount);	/* nbytes can change */
	return (status);
}

static int
vd_do_bwrite(vd_t *vd, uint_t slice, diskaddr_t block, size_t nbytes,
    ldc_mem_cookie_t *cookie, uint64_t ncookies, caddr_t data)
{
	int		status;
	struct buf	buf;

	ASSERT(mutex_owned(&vd->lock));
	ASSERT(slice < vd->nslices);
	ASSERT(nbytes != 0);
	ASSERT(data != NULL);

	/* Get data from client */
	if ((status = ldc_mem_copy(vd->ldc_handle, data, 0, &nbytes,
	    cookie, ncookies, LDC_COPY_IN)) != 0) {
		PRN("ldc_mem_copy() returned errno %d copying from client",
		    status);
		return (status);
	}

	bioinit(&buf);
	buf.b_flags	= B_BUSY | B_WRITE;
	buf.b_bcount	= nbytes;
	buf.b_un.b_addr	= data;
	buf.b_lblkno	= block;
	buf.b_edev	= vd->dev[slice];

	if ((status = ldi_strategy(vd->ldi_handle[slice], &buf)) == 0)
		status = biowait(&buf);
	biofini(&buf);
	return (status);
}

static int
vd_bwrite(vd_t *vd, vd_dring_payload_t *request)
{
	int	status;
	caddr_t	data;


	PR1("Write %ld bytes at block %lu", request->nbytes, request->addr);
	if (request->nbytes == 0)
		return (EINVAL);	/* no service for trivial requests */
	data = kmem_alloc(request->nbytes, KM_SLEEP);
	status = vd_do_bwrite(vd, request->slice, request->addr,
	    request->nbytes, request->cookie, request->ncookies, data);
	kmem_free(data, request->nbytes);
	return (status);
}

static void
vd_geom2dk_geom(void *vd_buf, void *ioctl_arg)
{
	VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg);
}

static void
vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg)
{
	VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg);
}

static void
dk_geom2vd_geom(void *ioctl_arg, void *vd_buf)
{
	DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf);
}

static void
vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf)
{
	VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf);
}

static int
vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
{
	switch (cmd) {
	case DKIOCGGEOM:
		ASSERT(ioctl_arg != NULL);
		bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom));
		return (0);
	case DKIOCGVTOC:
		ASSERT(ioctl_arg != NULL);
		bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc));
		return (0);
	default:
		return (ENOTSUP);
	}
}

static int
vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void *buf,
    vd_ioctl_t *ioctl)
{
	int	rval = 0, status;
	size_t	nbytes = request->nbytes;	/* modifiable copy */


	ASSERT(mutex_owned(&vd->lock));
	ASSERT(request->slice < vd->nslices);
	PR0("Performing %s", ioctl->operation_name);

	/* Get data from client and convert, if necessary */
	if (ioctl->copyin != NULL) {
		ASSERT(nbytes != 0 && buf != NULL);
		PR1("Getting \"arg\" data from client");
		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
		    request->cookie, request->ncookies,
		    LDC_COPY_IN)) != 0) {
			PRN("ldc_mem_copy() returned errno %d "
			    "copying from client", status);
			return (status);
		}

		/* Convert client's data, if necessary */
		if (ioctl->copyin == VD_IDENTITY)	/* use client buffer */
			ioctl->arg = buf;
		else	/* convert client vdisk operation data to ioctl data */
			(ioctl->copyin)(buf, (void *)ioctl->arg);
	}

	/*
	 * Handle single-slice block devices internally; otherwise, have the
	 * real driver perform the ioctl()
	 */
	if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) {
		if ((status = vd_do_slice_ioctl(vd, ioctl->cmd,
		    (void *)ioctl->arg)) != 0)
			return (status);
	} else if ((status = ldi_ioctl(vd->ldi_handle[request->slice],
	    ioctl->cmd, (intptr_t)ioctl->arg, FKIOCTL, kcred,
	    &rval)) != 0) {
		PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status);
		return (status);
	}
#ifdef DEBUG
	if (rval != 0) {
		PRN("%s set rval = %d, which is not being returned to client",
		    ioctl->cmd_name, rval);
	}
#endif /* DEBUG */

	/* Convert data and send to client, if necessary */
	if (ioctl->copyout != NULL) {
		ASSERT(nbytes != 0 && buf != NULL);
		PR1("Sending \"arg\" data to client");

		/* Convert ioctl data to vdisk operation data, if necessary */
		if (ioctl->copyout != VD_IDENTITY)
			(ioctl->copyout)((void *)ioctl->arg, buf);

		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
		    request->cookie, request->ncookies,
		    LDC_COPY_OUT)) != 0) {
			PRN("ldc_mem_copy() returned errno %d "
			    "copying to client", status);
			return (status);
		}
	}

	return (status);
}
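/*
 * Sketch of the stages vd_do_ioctl() runs for one sample "get"
 * operation, VD_OP_GET_VTOC (entries taken from the vd_ioctl_t table
 * in vd_ioctl() below):
 *
 *	copyin == NULL:			nothing is copied in from the client
 *	ldi_ioctl(DKIOCGVTOC):		the backing driver fills a struct vtoc
 *	copyout == vtoc2vd_vtoc:	convert to the wire-format vd_vtoc_t
 *	ldc_mem_copy(LDC_COPY_OUT):	return "nbytes" of data to the client
 *
 * "Set" operations run the corresponding stages in the opposite
 * direction, and VD_IDENTITY skips the conversion step entirely.
 */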
/*
 * Open any slices which have become non-empty as a result of performing a
 * set-VTOC operation for the client.
 *
 * When serving a full disk, vds attempts to exclusively open all of the
 * disk's slices to prevent another thread or process in the service domain
 * from "stealing" a slice or from performing I/O to a slice while a vds
 * client is accessing it.  Unfortunately, underlying drivers, such as sd(7d)
 * and cmdk(7d), return an error when attempting to open the device file for a
 * slice which is currently empty according to the VTOC.  This driver behavior
 * means that vds must skip opening empty slices when initializing a vdisk for
 * full-disk service and try to open slices that become non-empty (via a
 * set-VTOC operation) during use of the full disk in order to begin serving
 * such slices to the client.  This approach has an inherent (and therefore
 * unavoidable) race condition; it also means that failure to open a
 * newly-non-empty slice has different semantics than failure to open an
 * initially-non-empty slice:  Due to driver behavior, opening a
 * newly-non-empty slice is a necessary side effect of vds performing a
 * (successful) set-VTOC operation for a client on an in-service (and in-use)
 * disk in order to begin serving the slice; failure of this side-effect
 * operation does not mean that the client's set-VTOC operation failed or that
 * operations on other slices must fail.  Therefore, this function prints an
 * error message on failure to open a slice, but does not return an error to
 * its caller--unlike failure to open a slice initially, which results in an
 * error that prevents serving the vdisk (and thereby requires an
 * administrator to resolve the problem).  Note that, apart from another
 * thread or process opening a new slice during the race-condition window,
 * failure to open a slice in this function will likely indicate an underlying
 * drive problem, which will also likely become evident in errors returned by
 * operations on other slices, and which will require administrative
 * intervention and possibly servicing the drive.
 */
static void
vd_open_new_slices(vd_t *vd)
{
	int		rval, status;
	struct vtoc	vtoc;


	/* Get the (new) VTOC for updated slice sizes */
	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, (intptr_t)&vtoc,
	    FKIOCTL, kcred, &rval)) != 0) {
		PRN("ldi_ioctl(DKIOCGVTOC) returned errno %d", status);
		return;
	}

	/* Open any newly-non-empty slices */
	for (int slice = 0; slice < vd->nslices; slice++) {
		/* Skip zero-length slices */
		if (vtoc.v_part[slice].p_size == 0) {
			if (vd->ldi_handle[slice] != NULL)
				PR0("Open slice %u now has zero length", slice);
			continue;
		}

		/* Skip already-open slices */
		if (vd->ldi_handle[slice] != NULL)
			continue;

		PR0("Opening newly-non-empty slice %u", slice);
		if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
		    vd_open_flags, kcred, &vd->ldi_handle[slice],
		    vd->vds->ldi_ident)) != 0) {
			PRN("ldi_open_by_dev() returned errno %d "
			    "for slice %u", status, slice);
		}
	}
}

#define	RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t))
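/*
 * Example:  on a system with a 4-byte int, RNDSIZE(int) is
 * P2ROUNDUP(4, 8) == 8, so the buffer sized for VD_OP_GET_WCE below
 * satisfies the 8-byte-multiple requirement that LDC memory operations
 * impose (see the corresponding ASSERT in vd_ioctl()).
 */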
static int
vd_ioctl(vd_t *vd, vd_dring_payload_t *request)
{
	int		i, status;
	void		*buf = NULL;
	struct dk_geom	dk_geom = {0};
	struct vtoc	vtoc = {0};
	vd_ioctl_t	ioctl[] = {
		/* Command (no-copy) operations */
		{VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0,
		    DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE),
		    NULL, NULL, NULL},

		/* "Get" (copy-out) operations */
		{VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int),
		    DKIOCGETWCE, STRINGIZE(DKIOCGETWCE),
		    NULL, NULL, VD_IDENTITY},
		{VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM),
		    RNDSIZE(vd_geom_t),
		    DKIOCGGEOM, STRINGIZE(DKIOCGGEOM),
		    &dk_geom, NULL, dk_geom2vd_geom},
		{VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t),
		    DKIOCGVTOC, STRINGIZE(DKIOCGVTOC),
		    &vtoc, NULL, vtoc2vd_vtoc},

		/* "Set" (copy-in) operations */
		{VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int),
		    DKIOCSETWCE, STRINGIZE(DKIOCSETWCE),
		    NULL, VD_IDENTITY, NULL},
		{VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM),
		    RNDSIZE(vd_geom_t),
		    DKIOCSGEOM, STRINGIZE(DKIOCSGEOM),
		    &dk_geom, vd_geom2dk_geom, NULL},
		{VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t),
		    DKIOCSVTOC, STRINGIZE(DKIOCSVTOC),
		    &vtoc, vd_vtoc2vtoc, NULL},
	};
	size_t		nioctls = (sizeof (ioctl))/(sizeof (ioctl[0]));


	ASSERT(mutex_owned(&vd->lock));
	ASSERT(request->slice < vd->nslices);

	/*
	 * Determine ioctl corresponding to caller's "operation" and
	 * validate caller's "nbytes"
	 */
	for (i = 0; i < nioctls; i++) {
		if (request->operation == ioctl[i].operation) {
			/* LDC memory operations require 8-byte multiples */
			ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0);

			if (request->nbytes != ioctl[i].nbytes) {
				PRN("%s:  Expected nbytes = %lu, got %lu",
				    ioctl[i].operation_name, ioctl[i].nbytes,
				    request->nbytes);
				return (EINVAL);
			}

			break;
		}
	}
	ASSERT(i < nioctls);	/* because "operation" already validated */

	if (request->nbytes)
		buf = kmem_zalloc(request->nbytes, KM_SLEEP);
	status = vd_do_ioctl(vd, request, buf, &ioctl[i]);
	if (request->nbytes)
		kmem_free(buf, request->nbytes);
	if ((request->operation == VD_OP_SET_VTOC) &&
	    (vd->vdisk_type == VD_DISK_TYPE_DISK))
		vd_open_new_slices(vd);
	return (status);
}

/*
 * Define the supported operations once the functions for performing them have
 * been defined
 */
static const vds_operation_t	vds_operation[] = {
	{VD_OP_BREAD,		vd_bread},
	{VD_OP_BWRITE,		vd_bwrite},
	{VD_OP_FLUSH,		vd_ioctl},
	{VD_OP_GET_WCE,		vd_ioctl},
	{VD_OP_SET_WCE,		vd_ioctl},
	{VD_OP_GET_VTOC,	vd_ioctl},
	{VD_OP_SET_VTOC,	vd_ioctl},
	{VD_OP_GET_DISKGEOM,	vd_ioctl},
	{VD_OP_SET_DISKGEOM,	vd_ioctl}
};

static const size_t	vds_noperations =
    (sizeof (vds_operation))/(sizeof (vds_operation[0]));
/*
 * Process a request using a defined operation
 */
static int
vd_process_request(vd_t *vd, vd_dring_payload_t *request)
{
	int	i;


	PR1("Entered");
	ASSERT(mutex_owned(&vd->lock));

	/* Range-check slice */
	if (request->slice >= vd->nslices) {
		PRN("Invalid \"slice\" %u (max %u) for virtual disk",
		    request->slice, (vd->nslices - 1));
		return (EINVAL);
	}

	/* Perform the requested operation */
	for (i = 0; i < vds_noperations; i++)
		if (request->operation == vds_operation[i].operation)
			return (vds_operation[i].function(vd, request));

	/* No matching operation found */
	PRN("Unsupported operation %u", request->operation);
	return (ENOTSUP);
}

static int
send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
{
	int	retry, status;
	size_t	nbytes;


	for (retry = 0, status = EWOULDBLOCK;
	    retry < vds_ldc_retries && status == EWOULDBLOCK;
	    retry++) {
		PR1("ldc_write() attempt %d", (retry + 1));
		nbytes = msglen;
		status = ldc_write(ldc_handle, msg, &nbytes);
	}

	if (status != 0) {
		PRN("ldc_write() returned errno %d", status);
		return (status);
	} else if (nbytes != msglen) {
		PRN("ldc_write() performed only partial write");
		return (EIO);
	}

	PR1("SENT %lu bytes", msglen);
	return (0);
}

/*
 * Return true if the "type", "subtype", and "env" fields of the "tag" first
 * argument match the corresponding remaining arguments; otherwise, return false
 */
boolean_t
vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env)
{
	return ((tag->vio_msgtype == type) &&
	    (tag->vio_subtype == subtype) &&
	    (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE;
}

/*
 * Check whether the major/minor version specified in "ver_msg" is supported
 * by this server.
 */
static boolean_t
vds_supported_version(vio_ver_msg_t *ver_msg)
{
	for (int i = 0; i < vds_num_versions; i++) {
		ASSERT(vds_version[i].major > 0);
		ASSERT((i == 0) ||
		    (vds_version[i].major < vds_version[i-1].major));

		/*
		 * If the major versions match, adjust the minor version, if
		 * necessary, down to the highest value supported by this
		 * server and return true so this message will get "ack"ed;
		 * the client should also support all minor versions lower
		 * than the value it sent
		 */
		if (ver_msg->ver_major == vds_version[i].major) {
			if (ver_msg->ver_minor > vds_version[i].minor) {
				PR0("Adjusting minor version from %u to %u",
				    ver_msg->ver_minor, vds_version[i].minor);
				ver_msg->ver_minor = vds_version[i].minor;
			}
			return (B_TRUE);
		}

		/*
		 * If the message contains a higher major version number, set
		 * the message's major/minor versions to the current values
		 * and return false, so this message will get "nack"ed with
		 * these values, and the client will potentially try again
		 * with the same or a lower version
		 */
		if (ver_msg->ver_major > vds_version[i].major) {
			ver_msg->ver_major = vds_version[i].major;
			ver_msg->ver_minor = vds_version[i].minor;
			return (B_FALSE);
		}

		/*
		 * Otherwise, the message's major version is less than the
		 * current major version, so continue the loop to the next
		 * (lower) supported version
		 */
	}

	/*
	 * No common version was found; "ground" the version pair in the
	 * message to terminate negotiation
	 */
	ver_msg->ver_major = 0;
	ver_msg->ver_minor = 0;
	return (B_FALSE);
}
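/*
 * Worked example, given vds_version[] = {{1, 0}} as defined above:  a
 * client offering 1.2 matches major 1, has its minor adjusted down to
 * 0, and is "ack"ed; a client offering 2.0 is "nack"ed with 1.0,
 * inviting a retry at the lower version; a client offering 0.9 falls
 * off the end of the loop and is "nack"ed with the terminating 0.0
 * pair.
 */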
/*
 * Process a version message from a client.  vds expects to receive version
 * messages from clients seeking service, but never issues version messages
 * itself; therefore, vds can ACK or NACK client version messages, but does
 * not expect to receive version-message ACKs or NACKs (and will treat such
 * messages as invalid).
 */
static int
vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	vio_ver_msg_t	*ver_msg = (vio_ver_msg_t *)msg;


	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
	    VIO_VER_INFO)) {
		return (ENOMSG);	/* not a version message */
	}

	if (msglen != sizeof (*ver_msg)) {
		PRN("Expected %lu-byte version message; "
		    "received %lu bytes", sizeof (*ver_msg), msglen);
		return (EBADMSG);
	}

	if (ver_msg->dev_class != VDEV_DISK) {
		PRN("Expected device class %u (disk); received %u",
		    VDEV_DISK, ver_msg->dev_class);
		return (EBADMSG);
	}

	/*
	 * We're talking to the expected kind of client; set our device class
	 * for "ack/nack" back to the client
	 */
	ver_msg->dev_class = VDEV_DISK_SERVER;

	/*
	 * Check whether the (valid) version message specifies a version
	 * supported by this server.  If the version is not supported, return
	 * EBADMSG so the message will get "nack"ed; vds_supported_version()
	 * will have updated the message with a supported version for the
	 * client to consider
	 */
	if (!vds_supported_version(ver_msg))
		return (EBADMSG);


	/*
	 * A version has been agreed upon; use the client's SID for
	 * communication on this channel now
	 */
	ASSERT(!(vd->initialized & VD_SID));
	vd->sid = ver_msg->tag.vio_sid;
	vd->initialized |= VD_SID;

	/*
	 * When multiple versions are supported, this function should store
	 * the negotiated major and minor version values in the "vd" data
	 * structure to govern further communication; in particular, note that
	 * the client might have specified a lower minor version for the
	 * agreed major version than specified in the vds_version[] array.
	 * The following assertions should help remind future maintainers to
	 * make the appropriate changes to support multiple versions.
	 */
	ASSERT(vds_num_versions == 1);
	ASSERT(ver_msg->ver_major == vds_version[0].major);
	ASSERT(ver_msg->ver_minor == vds_version[0].minor);

	PR0("Using major version %u, minor version %u",
	    ver_msg->ver_major, ver_msg->ver_minor);
	return (0);
}

static int
vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	vd_attr_msg_t	*attr_msg = (vd_attr_msg_t *)msg;


	PR0("Entered");
	ASSERT(mutex_owned(&vd->lock));
	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
	    VIO_ATTR_INFO)) {
		return (ENOMSG);	/* not an attribute message */
	}

	if (msglen != sizeof (*attr_msg)) {
		PRN("Expected %lu-byte attribute message; "
		    "received %lu bytes", sizeof (*attr_msg), msglen);
		return (EBADMSG);
	}

	if (attr_msg->max_xfer_sz == 0) {
		PRN("Received maximum transfer size of 0 from client");
		return (EBADMSG);
	}

	if ((attr_msg->xfer_mode != VIO_DESC_MODE) &&
	    (attr_msg->xfer_mode != VIO_DRING_MODE)) {
		PRN("Client requested unsupported transfer mode");
		return (EBADMSG);
	}


	/* Success:  valid message and transfer mode */
	vd->xfer_mode = attr_msg->xfer_mode;
	if (vd->xfer_mode == VIO_DESC_MODE) {
		/*
		 * The vd_dring_inband_msg_t contains one cookie; need room
		 * for up to n-1 more cookies, where "n" is the number of full
		 * pages plus possibly one partial page required to cover
		 * "max_xfer_sz".  Add room for one more cookie if
		 * "max_xfer_sz" isn't an integral multiple of the page size.
		 * Must first get the maximum transfer size in bytes.
		 */
		size_t	max_xfer_bytes = attr_msg->vdisk_block_size ?
		    attr_msg->vdisk_block_size*attr_msg->max_xfer_sz :
		    attr_msg->max_xfer_sz;
		size_t	max_inband_msglen =
		    sizeof (vd_dring_inband_msg_t) +
		    ((max_xfer_bytes/PAGESIZE +
		    ((max_xfer_bytes % PAGESIZE) ? 1 : 0))*
		    (sizeof (ldc_mem_cookie_t)));

		/*
		 * Set the maximum expected message length to
		 * accommodate in-band-descriptor messages with all
		 * their cookies
		 */
		vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen);
	}

	attr_msg->vdisk_size = vd->vdisk_size;
	attr_msg->vdisk_type = vd->vdisk_type;
	attr_msg->operations = vds_operations;
	PR0("%s", VD_CLIENT(vd));
	return (0);
}
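/*
 * Worked example for the in-band sizing above (illustrative numbers):
 * with an 8K PAGESIZE, a 512-byte vdisk_block_size, and a max_xfer_sz
 * of 33 blocks, max_xfer_bytes is 16896; that is 2 full pages plus a
 * remainder, so 2 + 1 = 3 cookies' worth of space is added to
 * sizeof (vd_dring_inband_msg_t), whose built-in cookie covers the
 * possible extra page spanned by an unaligned transfer.
 */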
static int
vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	int			status;
	size_t			expected;
	ldc_mem_info_t		dring_minfo;
	vio_dring_reg_msg_t	*reg_msg = (vio_dring_reg_msg_t *)msg;


	PR0("Entered");
	ASSERT(mutex_owned(&vd->lock));
	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
	    VIO_DRING_REG)) {
		return (ENOMSG);	/* not a register-dring message */
	}

	if (msglen < sizeof (*reg_msg)) {
		PRN("Expected at least %lu-byte register-dring message; "
		    "received %lu bytes", sizeof (*reg_msg), msglen);
		return (EBADMSG);
	}

	expected = sizeof (*reg_msg) +
	    (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0]));
	if (msglen != expected) {
		PRN("Expected %lu-byte register-dring message; "
		    "received %lu bytes", expected, msglen);
		return (EBADMSG);
	}

	if (vd->initialized & VD_DRING) {
		PRN("A dring was previously registered; only support one");
		return (EBADMSG);
	}

	if (reg_msg->ncookies != 1) {
		/*
		 * In addition to fixing the assertion in the success case
		 * below, supporting drings which require more than one
		 * "cookie" requires increasing the value of vd->max_msglen
		 * somewhere in the code path prior to receiving the message
		 * which results in calling this function.  Note that without
		 * making this change, the larger message size required to
		 * accommodate multiple cookies cannot be successfully
		 * received, so this function will not even get called.
		 * Gracefully accommodating more dring cookies might
		 * reasonably demand exchanging an additional attribute or
		 * making a minor protocol adjustment
		 */
		PRN("reg_msg->ncookies = %u != 1", reg_msg->ncookies);
		return (EBADMSG);
	}

	status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie,
	    reg_msg->ncookies, reg_msg->num_descriptors,
	    reg_msg->descriptor_size, LDC_SHADOW_MAP, &vd->dring_handle);
	if (status != 0) {
		PRN("ldc_mem_dring_map() returned errno %d", status);
		return (status);
	}

	/*
	 * To remove the need for this assertion, must call
	 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a
	 * successful call to ldc_mem_dring_map()
	 */
	ASSERT(reg_msg->ncookies == 1);

	if ((status =
	    ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) {
		int	rv;

		PRN("ldc_mem_dring_info() returned errno %d", status);
		/* use a separate variable so the original error is returned */
		if ((rv = ldc_mem_dring_unmap(vd->dring_handle)) != 0)
			PRN("ldc_mem_dring_unmap() returned errno %d", rv);
		return (status);
	}

	if (dring_minfo.vaddr == NULL) {
		PRN("Descriptor ring virtual address is NULL");
		/* don't leak the mapping when rejecting the dring */
		(void) ldc_mem_dring_unmap(vd->dring_handle);
		return (ENXIO);
	}


	/* Valid message and dring mapped */
	vd->initialized |= VD_DRING;
	vd->dring_ident = 1;	/* "There Can Be Only One" */
	vd->dring = dring_minfo.vaddr;
	vd->descriptor_size = reg_msg->descriptor_size;
	vd->dring_len = reg_msg->num_descriptors;
	reg_msg->dring_ident = vd->dring_ident;
	PR1("descriptor size = %u, dring length = %u",
	    vd->descriptor_size, vd->dring_len);
	return (0);
}
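/*
 * Conceptually, once the ring is mapped, the VD_DRING_ELEM() macro
 * defined earlier resolves descriptor "i" using exactly the values
 * captured here:
 *
 *	(vd_dring_entry_t *)(vd->dring + (i) * vd->descriptor_size)
 *
 * which is why dring, descriptor_size, and dring_len must all be
 * recorded before any dring-data message is processed.
 */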
static int
vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	vio_dring_unreg_msg_t	*unreg_msg = (vio_dring_unreg_msg_t *)msg;


	PR0("Entered");
	ASSERT(mutex_owned(&vd->lock));
	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
	    VIO_DRING_UNREG)) {
		return (ENOMSG);	/* not an unregister-dring message */
	}

	if (msglen != sizeof (*unreg_msg)) {
		PRN("Expected %lu-byte unregister-dring message; "
		    "received %lu bytes", sizeof (*unreg_msg), msglen);
		return (EBADMSG);
	}

	if (unreg_msg->dring_ident != vd->dring_ident) {
		PRN("Expected dring ident %lu; received %lu",
		    vd->dring_ident, unreg_msg->dring_ident);
		return (EBADMSG);
	}

	return (0);
}

static int
process_rdx_msg(vio_msg_t *msg, size_t msglen)
{
	PR0("Entered");
	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX))
		return (ENOMSG);	/* not an RDX message */

	if (msglen != sizeof (vio_rdx_msg_t)) {
		PRN("Expected %lu-byte RDX message; received %lu bytes",
		    sizeof (vio_rdx_msg_t), msglen);
		return (EBADMSG);
	}

	return (0);
}

static void
vd_reset_connection(vd_t *vd, boolean_t reset_ldc)
{
	int	status = 0;


	ASSERT(mutex_owned(&vd->lock));
	PR0("Resetting connection with %s", VD_CLIENT(vd));
	if ((vd->initialized & VD_DRING) &&
	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
		PRN("ldc_mem_dring_unmap() returned errno %d", status);
	if ((reset_ldc == B_TRUE) &&
	    ((status = ldc_reset(vd->ldc_handle)) != 0))
		PRN("ldc_reset() returned errno %d", status);
	vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
	vd->state = VD_STATE_INIT;
	vd->max_msglen = sizeof (vio_msg_t);	/* baseline vio message size */
}

static int
vd_check_seq_num(vd_t *vd, uint64_t seq_num)
{
	ASSERT(mutex_owned(&vd->lock));
	if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) {
		PRN("Received seq_num %lu; expected %lu",
		    seq_num, (vd->seq_num + 1));
		vd_reset_connection(vd, B_FALSE);
		return (1);
	}

	vd->seq_num = seq_num;
	vd->initialized |= VD_SEQ_NUM;	/* superfluous after first time... */
	return (0);
}
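/*
 * Example:  after a message carrying seq_num 41 has been accepted,
 * only 42 is acceptable next; a duplicate (41) or a gap (43) resets
 * the connection.  The first message after vd_reset_connection()
 * clears VD_SEQ_NUM is accepted unconditionally and re-seeds the
 * counter.
 */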
/*
 * Return the expected size of an inband-descriptor message with all the
 * cookies it claims to include
 */
static size_t
expected_inband_size(vd_dring_inband_msg_t *msg)
{
	return ((sizeof (*msg)) +
	    (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0])));
}

/*
 * Process an in-band descriptor message:  used with clients like OBP, with
 * which vds exchanges descriptors within VIO message payloads, rather than
 * operating on them within a descriptor ring
 */
static int
vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	size_t			expected;
	vd_dring_inband_msg_t	*desc_msg = (vd_dring_inband_msg_t *)msg;


	PR1("Entered");
	ASSERT(mutex_owned(&vd->lock));
	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
	    VIO_DESC_DATA))
		return (ENOMSG);	/* not an in-band-descriptor message */

	if (msglen < sizeof (*desc_msg)) {
		PRN("Expected at least %lu-byte descriptor message; "
		    "received %lu bytes", sizeof (*desc_msg), msglen);
		return (EBADMSG);
	}

	if (msglen != (expected = expected_inband_size(desc_msg))) {
		PRN("Expected %lu-byte descriptor message; "
		    "received %lu bytes", expected, msglen);
		return (EBADMSG);
	}

	if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) {
		return (EBADMSG);
	}

	/* Valid message; process the request */
	desc_msg->payload.status = vd_process_request(vd, &desc_msg->payload);
	return (0);
}

static boolean_t
vd_accept_dring_elems(vd_t *vd, uint32_t start, uint32_t ndesc)
{
	uint32_t	i, n;


	/* Check descriptor states */
	for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) {
		if (VD_DRING_ELEM(i)->hdr.dstate != VIO_DESC_READY) {
			PRN("descriptor %u not ready", i);
			VD_DUMP_DRING_ELEM(VD_DRING_ELEM(i));
			return (B_FALSE);
		}
	}

	/* Descriptors are valid; accept them */
	for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len)
		VD_DRING_ELEM(i)->hdr.dstate = VIO_DESC_ACCEPTED;

	return (B_TRUE);
}
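/*
 * Together with vd_process_dring() below, this implements the dstate
 * walk of each descriptor in a batch:
 *
 *	VIO_DESC_READY    -> VIO_DESC_ACCEPTED	(here, under acquire/release)
 *	VIO_DESC_ACCEPTED -> VIO_DESC_DONE	(after the I/O completes)
 *
 * A descriptor found in any other state aborts the batch with EINVAL.
 */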
static int
vd_process_dring(vd_t *vd, uint32_t start, uint32_t end)
{
	int		status;
	boolean_t	accepted;
	uint32_t	i, io_status, n, ndesc;


	ASSERT(mutex_owned(&vd->lock));
	PR1("start = %u, end = %u", start, end);

	/* Validate descriptor range */
	if ((start >= vd->dring_len) || (end >= vd->dring_len)) {
		PRN("\"start\" = %u, \"end\" = %u; both must be less than %u",
		    start, end, vd->dring_len);
		return (EINVAL);
	}

	/* Acquire updated dring elements */
	if ((status = ldc_mem_dring_acquire(vd->dring_handle,
	    start, end)) != 0) {
		PRN("ldc_mem_dring_acquire() returned errno %d", status);
		return (status);
	}
	/* Accept updated dring elements */
	ndesc = ((end < start) ? end + vd->dring_len : end) - start + 1;
	PR1("ndesc = %u", ndesc);
	accepted = vd_accept_dring_elems(vd, start, ndesc);
	/* Release dring elements */
	if ((status = ldc_mem_dring_release(vd->dring_handle,
	    start, end)) != 0) {
		PRN("ldc_mem_dring_release() returned errno %d", status);
		return (status);
	}
	/* If a descriptor was in the wrong state, return an error */
	if (!accepted)
		return (EINVAL);


	/* Process accepted dring elements */
	for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) {
		vd_dring_entry_t	*elem = VD_DRING_ELEM(i);

		/* Process descriptor outside acquire/release bracket */
		PR1("Processing dring element %u", i);
		io_status = vd_process_request(vd, &elem->payload);

		/* Re-acquire client's dring element */
		if ((status = ldc_mem_dring_acquire(vd->dring_handle,
		    i, i)) != 0) {
			PRN("ldc_mem_dring_acquire() returned errno %d",
			    status);
			return (status);
		}
		/* Update processed element */
		if (elem->hdr.dstate == VIO_DESC_ACCEPTED) {
			elem->payload.status	= io_status;
			elem->hdr.dstate	= VIO_DESC_DONE;
		} else {
			/* Perhaps client timed out waiting for I/O... */
			accepted = B_FALSE;
			PRN("element %u no longer \"accepted\"", i);
			VD_DUMP_DRING_ELEM(elem);
		}
		/* Release updated processed element */
		if ((status = ldc_mem_dring_release(vd->dring_handle,
		    i, i)) != 0) {
			PRN("ldc_mem_dring_release() returned errno %d",
			    status);
			return (status);
		}
		/* If the descriptor was in the wrong state, return an error */
		if (!accepted)
			return (EINVAL);
	}

	return (0);
}

static int
vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	vio_dring_msg_t	*dring_msg = (vio_dring_msg_t *)msg;


	PR1("Entered");
	ASSERT(mutex_owned(&vd->lock));
	ASSERT(msglen >= sizeof (msg->tag));

	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
	    VIO_DRING_DATA)) {
		return (ENOMSG);	/* not a dring-data message */
	}

	if (msglen != sizeof (*dring_msg)) {
		PRN("Expected %lu-byte dring message; received %lu bytes",
		    sizeof (*dring_msg), msglen);
		return (EBADMSG);
	}

	if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) {
		return (EBADMSG);
	}

	if (dring_msg->dring_ident != vd->dring_ident) {
		PRN("Expected dring ident %lu; received ident %lu",
		    vd->dring_ident, dring_msg->dring_ident);
		return (EBADMSG);
	}


	/* Valid message; process dring */
	dring_msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
	return (vd_process_dring(vd, dring_msg->start_idx,
	    dring_msg->end_idx));
}

static int
recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes)
{
	int	retry, status;
	size_t	size = *nbytes;


	for (retry = 0, status = ETIMEDOUT;
	    retry < vds_ldc_retries && status == ETIMEDOUT;
	    retry++) {
		PR1("ldc_read() attempt %d", (retry + 1));
		*nbytes = size;
		status = ldc_read(ldc_handle, msg, nbytes);
	}

	if (status != 0) {
		PRN("ldc_read() returned errno %d", status);
		return (status);
	} else if (*nbytes == 0) {
		PR1("ldc_read() returned 0 and no message read");
		return (ENOMSG);
	}

	PR1("RCVD %lu-byte message", *nbytes);
	return (0);
}
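/*
 * Handshake implemented by vd_do_process_msg() below, restated
 * compactly (a summary of this code, not of the VIO specification):
 *
 *	VD_STATE_INIT	--version msg-->	VD_STATE_VER
 *	VD_STATE_VER	--attr msg----->	VD_STATE_ATTR
 *	VD_STATE_ATTR	--RDX---------->	VD_STATE_DATA	(in-band mode)
 *	VD_STATE_ATTR	--reg-dring---->	VD_STATE_DRING	(dring mode)
 *	VD_STATE_DRING	--RDX---------->	VD_STATE_DATA
 */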
static int
vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	int	status;


	PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype,
	    msg->tag.vio_subtype, msg->tag.vio_subtype_env);
	ASSERT(mutex_owned(&vd->lock));

	/*
	 * Validate session ID up front, since it applies to all messages
	 * once set
	 */
	if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) {
		PRN("Expected SID %u, received %u", vd->sid,
		    msg->tag.vio_sid);
		return (EBADMSG);
	}


	/*
	 * Process the received message based on connection state
	 */
	switch (vd->state) {
	case VD_STATE_INIT:	/* expect version message */
		if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0)
			return (status);

		/* Version negotiated, move to that state */
		vd->state = VD_STATE_VER;
		return (0);

	case VD_STATE_VER:	/* expect attribute message */
		if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0)
			return (status);

		/* Attributes exchanged, move to that state */
		vd->state = VD_STATE_ATTR;
		return (0);

	case VD_STATE_ATTR:
		switch (vd->xfer_mode) {
		case VIO_DESC_MODE:	/* expect RDX message */
			if ((status = process_rdx_msg(msg, msglen)) != 0)
				return (status);

			/* Ready to receive in-band descriptors */
			vd->state = VD_STATE_DATA;
			return (0);

		case VIO_DRING_MODE:	/* expect register-dring message */
			if ((status =
			    vd_process_dring_reg_msg(vd, msg, msglen)) != 0)
				return (status);

			/* One dring negotiated, move to that state */
			vd->state = VD_STATE_DRING;
			return (0);

		default:
			ASSERT("Unsupported transfer mode");
			PRN("Unsupported transfer mode");
			return (ENOTSUP);
		}

	case VD_STATE_DRING:	/* expect RDX, register-dring, or unreg-dring */
		if ((status = process_rdx_msg(msg, msglen)) == 0) {
			/* Ready to receive data */
			vd->state = VD_STATE_DATA;
			return (0);
		} else if (status != ENOMSG) {
			return (status);
		}


		/*
		 * If another register-dring message is received, stay in
		 * dring state in case the client sends RDX; although the
		 * protocol allows multiple drings, this server does not
		 * support using more than one
		 */
		if ((status =
		    vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG)
			return (status);

		/*
		 * Acknowledge an unregister-dring message, but reset the
		 * connection anyway:  Although the protocol allows
		 * unregistering drings, this server cannot serve a vdisk
		 * without its only dring
		 */
		status = vd_process_dring_unreg_msg(vd, msg, msglen);
		return ((status == 0) ? ENOTSUP : status);

	case VD_STATE_DATA:
		switch (vd->xfer_mode) {
		case VIO_DESC_MODE:	/* expect in-band-descriptor message */
			return (vd_process_desc_msg(vd, msg, msglen));

		case VIO_DRING_MODE:	/* expect dring-data or unreg-dring */
			/*
			 * Typically expect dring-data messages, so handle
			 * them first
			 */
			if ((status = vd_process_dring_msg(vd, msg,
			    msglen)) != ENOMSG)
				return (status);

			/*
			 * Acknowledge an unregister-dring message, but reset
			 * the connection anyway:  Although the protocol
			 * allows unregistering drings, this server cannot
			 * serve a vdisk without its only dring
			 */
			status = vd_process_dring_unreg_msg(vd, msg, msglen);
			return ((status == 0) ? ENOTSUP : status);

		default:
			ASSERT("Unsupported transfer mode");
			PRN("Unsupported transfer mode");
			return (ENOTSUP);
		}

	default:
		ASSERT("Invalid client connection state");
		PRN("Invalid client connection state");
		return (ENOTSUP);
	}
}
static void
vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	int		status;
	boolean_t	reset_ldc = B_FALSE;


	ASSERT(mutex_owned(&vd->lock));

	/*
	 * Check that the message is at least big enough for a "tag", so that
	 * message processing can proceed based on tag-specified message type
	 */
	if (msglen < sizeof (vio_msg_tag_t)) {
		PRN("Received short (%lu-byte) message", msglen);
		/* Can't "nack" short message, so drop the big hammer */
		vd_reset_connection(vd, B_TRUE);
		return;
	}

	/*
	 * Process the message
	 */
	switch (status = vd_do_process_msg(vd, msg, msglen)) {
	case 0:
		/* "ack" valid, successfully-processed messages */
		msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
		break;

	case ENOMSG:
		PRN("Received unexpected message");
		_NOTE(FALLTHROUGH);
	case EBADMSG:
	case ENOTSUP:
		/* "nack" invalid messages */
		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
		break;

	default:
		/* "nack" failed messages */
		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
		/* An LDC error probably occurred, so try resetting it */
		reset_ldc = B_TRUE;
		break;
	}

	/* "ack" or "nack" the message */
	PR1("Sending %s",
	    (msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
	if (send_msg(vd->ldc_handle, msg, msglen) != 0)
		reset_ldc = B_TRUE;

	/* Reset the connection for nack'ed or failed messages */
	if ((status != 0) || reset_ldc)
		vd_reset_connection(vd, reset_ldc);
}

static void
vd_recv_msg(void *arg)
{
	vd_t	*vd = (vd_t *)arg;
	int	status = 0;


	PR2("Entered");
	ASSERT(vd != NULL);
	mutex_enter(&vd->lock);
	/*
	 * Receive and process any messages in the LDC queue; max_msglen is
	 * reset each time through the loop, as vd->max_msglen can increase
	 * during connection handshake
	 */
	for (size_t max_msglen = vd->max_msglen;
	    vd->enabled && status == 0;
	    max_msglen = vd->max_msglen) {
		size_t		msglen = max_msglen;
		vio_msg_t	*vio_msg = kmem_alloc(max_msglen, KM_SLEEP);

		if ((status = recv_msg(vd->ldc_handle, vio_msg, &msglen)) == 0)
			vd_process_msg(vd, vio_msg, msglen);
		else if (status != ENOMSG)
			vd_reset_connection(vd, B_TRUE);
		kmem_free(vio_msg, max_msglen);
	}
	mutex_exit(&vd->lock);
	PR2("Returning");
}

static uint_t
vd_do_handle_ldc_events(vd_t *vd, uint64_t event)
{
	ASSERT(mutex_owned(&vd->lock));

	if (!vd->enabled)
		return (LDC_SUCCESS);

	if (event & LDC_EVT_RESET) {
		PR0("Channel was reset");
		return (LDC_SUCCESS);
	}

	if (event & LDC_EVT_UP) {
		/* Reset the connection state when channel comes (back) up */
		vd_reset_connection(vd, B_FALSE);
	}

	if (event & LDC_EVT_READ) {
		PR1("New data available");
		/* Queue a task to receive the new data */
		if (ddi_taskq_dispatch(vd->taskq, vd_recv_msg, vd, DDI_SLEEP) !=
		    DDI_SUCCESS)
			PRN("Unable to dispatch vd_recv_msg()");
	}

	return (LDC_SUCCESS);
}
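/*
 * Note the division of labor:  the LDC callback below runs
 * vd_do_handle_ldc_events() in callback context and therefore only
 * queues message processing, while vd_recv_msg() drains the channel
 * from the single-threaded per-vdisk taskq created in
 * vds_do_init_vd(); vd->lock serializes the two paths.
 */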
static uint_t
vd_handle_ldc_events(uint64_t event, caddr_t arg)
{
	uint_t	status;
	vd_t	*vd = (vd_t *)(void *)arg;


	ASSERT(vd != NULL);
	mutex_enter(&vd->lock);
	status = vd_do_handle_ldc_events(vd, event);
	mutex_exit(&vd->lock);
	return (status);
}

static uint_t
vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
{
	_NOTE(ARGUNUSED(key, val))
	(*((uint_t *)arg))++;
	return (MH_WALK_TERMINATE);
}


static int
vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	uint_t	vd_present = 0;
	minor_t	instance;
	vds_t	*vds;


	PR0("Entered");
	switch (cmd) {
	case DDI_DETACH:
		/* the real work happens below */
		break;
	case DDI_SUSPEND:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	ASSERT(cmd == DDI_DETACH);
	instance = ddi_get_instance(dip);
	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
		PRN("Could not get state for instance %u", instance);
		ddi_soft_state_free(vds_state, instance);
		return (DDI_FAILURE);
	}

	/* Do not detach while serving any vdisks */
	mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present);
	if (vd_present) {
		PR0("Not detaching because serving vdisks");
		return (DDI_FAILURE);
	}

	PR0("Detaching");
	if (vds->initialized & VDS_MDEG)
		(void) mdeg_unregister(vds->mdeg);
	if (vds->initialized & VDS_LDI)
		(void) ldi_ident_release(vds->ldi_ident);
	mod_hash_destroy_hash(vds->vd_table);
	if (vds->initialized & VDS_LOCKING)
		mutex_destroy(&vds->lock);
	ddi_soft_state_free(vds_state, instance);
	return (DDI_SUCCESS);
}

static boolean_t
is_pseudo_device(dev_info_t *dip)
{
	dev_info_t	*parent, *root = ddi_root_node();


	for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root);
	    parent = ddi_get_parent(parent)) {
		if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}
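/*
 * For example (path shown for illustration only), a lofi(7D) volume
 * such as /pseudo/lofi@0 has the pseudo nexus as an ancestor and is
 * caught by the loop above, whereas a slice of a physical disk under
 * /pci@... is not.
 */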
static int
vd_setup_full_disk(vd_t *vd)
{
	int		rval, status;
	major_t		major = getmajor(vd->dev[0]);
	minor_t		minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE;
	struct vtoc	vtoc;


	/* Get the VTOC for slice sizes */
	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, (intptr_t)&vtoc,
	    FKIOCTL, kcred, &rval)) != 0) {
		PRN("ldi_ioctl(DKIOCGVTOC) returned errno %d", status);
		return (status);
	}

	/* Set full-disk parameters */
	vd->vdisk_type	= VD_DISK_TYPE_DISK;
	vd->nslices	= (sizeof (vd->dev))/(sizeof (vd->dev[0]));

	/* Move dev number and LDI handle to entire-disk-slice array elements */
	vd->dev[VD_ENTIRE_DISK_SLICE]		= vd->dev[0];
	vd->dev[0]				= 0;
	vd->ldi_handle[VD_ENTIRE_DISK_SLICE]	= vd->ldi_handle[0];
	vd->ldi_handle[0]			= NULL;

	/* Initialize device numbers for remaining slices and open them */
	for (int slice = 0; slice < vd->nslices; slice++) {
		/*
		 * Skip the entire-disk slice, as it's already open and its
		 * device known
		 */
		if (slice == VD_ENTIRE_DISK_SLICE)
			continue;
		ASSERT(vd->dev[slice] == 0);
		ASSERT(vd->ldi_handle[slice] == NULL);

		/*
		 * Construct the device number for the current slice
		 */
		vd->dev[slice] = makedevice(major, (minor + slice));

		/*
		 * At least some underlying drivers refuse to open
		 * devices for (currently) zero-length slices, so skip
		 * them for now
		 */
		if (vtoc.v_part[slice].p_size == 0) {
			PR0("Skipping zero-length slice %u", slice);
			continue;
		}

		/*
		 * Open all non-empty slices of the disk to serve them to the
		 * client.  Slices are opened exclusively to prevent other
		 * threads or processes in the service domain from performing
		 * I/O to slices being accessed by a client.  Failure to open
		 * a slice results in vds not serving this disk, as the client
		 * could attempt (and should be able) to access any non-empty
		 * slice immediately.  Any slices successfully opened before a
		 * failure will get closed by vds_destroy_vd() as a result of
		 * the error returned by this function.
		 */
		PR0("Opening device major %u, minor %u = slice %u",
		    major, (minor + slice), slice);
		if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
		    vd_open_flags, kcred, &vd->ldi_handle[slice],
		    vd->vds->ldi_ident)) != 0) {
			PRN("ldi_open_by_dev() returned errno %d "
			    "for slice %u", status, slice);
			/* vds_destroy_vd() will close any open slices */
			return (status);
		}
	}

	return (0);
}
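/*
 * The slice-numbering arithmetic above assumes the conventional
 * Solaris block-device minor layout, in which a disk's slices occupy
 * consecutive minor numbers.  Illustrative numbers:  if slice 2 is
 * minor 18, "minor" is computed as 16, so slice 0 becomes
 * makedevice(major, 16), slice 5 becomes makedevice(major, 21), and
 * so on.
 */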

static int
vds_do_init_vd(vds_t *vds, uint64_t id, char *block_device, uint64_t ldc_id,
    vd_t **vdp)
{
        char                    tq_name[TASKQ_NAMELEN];
        int                     status;
        ddi_iblock_cookie_t     iblock = NULL;
        ldc_attr_t              ldc_attr;
        vd_t                    *vd;


        ASSERT(vds != NULL);
        ASSERT(block_device != NULL);
        ASSERT(vdp != NULL);
        PR0("Adding vdisk for %s", block_device);

        if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) {
                PRN("No memory for virtual disk");
                return (EAGAIN);
        }
        *vdp = vd;      /* assign here so vds_destroy_vd() can cleanup later */
        vd->vds = vds;


        /* Open vdisk and initialize parameters */
        if ((status = vd_setup_vd(block_device, vd)) != 0)
                return (status);
        ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
        PR0("vdisk_type = %s, pseudo = %s, nslices = %u",
            ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
            (vd->pseudo ? "yes" : "no"), vd->nslices);


        /* Initialize locking */
        if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED,
            &iblock) != DDI_SUCCESS) {
                PRN("Could not get iblock cookie.");
                return (EIO);
        }

        mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock);
        vd->initialized |= VD_LOCKING;


        /* Create the task queue for the vdisk */
        (void) snprintf(tq_name, sizeof (tq_name), "vd%lu", id);
        PR1("tq_name = %s", tq_name);
        if ((vd->taskq = ddi_taskq_create(vds->dip, tq_name, 1,
            TASKQ_DEFAULTPRI, 0)) == NULL) {
                PRN("Could not create task queue");
                return (EIO);
        }
        vd->initialized |= VD_TASKQ;
        vd->enabled = 1;        /* before callback can dispatch to taskq */


        /* Bring up LDC */
        ldc_attr.devclass       = LDC_DEV_BLK_SVC;
        ldc_attr.instance       = ddi_get_instance(vds->dip);
        ldc_attr.mode           = LDC_MODE_UNRELIABLE;
        ldc_attr.qlen           = VD_LDC_QLEN;
        if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) {
                PRN("ldc_init(%lu) = errno %d", ldc_id, status);
                return (status);
        }
        vd->initialized |= VD_LDC;

        if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events,
            (caddr_t)vd)) != 0) {
                PRN("ldc_reg_callback() returned errno %d", status);
                return (status);
        }

        if ((status = ldc_open(vd->ldc_handle)) != 0) {
                PRN("ldc_open() returned errno %d", status);
                return (status);
        }


        /* Add the successfully-initialized vdisk to the server's table */
        if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) {
                PRN("Error adding vdisk ID %lu to table", id);
                return (EIO);
        }

        return (0);
}

/*
 * Destroy the state associated with a virtual disk
 */
static void
vds_destroy_vd(void *arg)
{
        vd_t    *vd = (vd_t *)arg;


        PR0("Entered");
        if (vd == NULL)
                return;

        /* Disable queuing requests for the vdisk */
        if (vd->initialized & VD_LOCKING) {
                mutex_enter(&vd->lock);
                vd->enabled = 0;
                mutex_exit(&vd->lock);
        }

        /* Drain and destroy the task queue (*before* shutting down LDC) */
        if (vd->initialized & VD_TASKQ)
                ddi_taskq_destroy(vd->taskq);   /* waits for queued tasks */

        /* Shut down LDC */
        if (vd->initialized & VD_LDC) {
                if (vd->initialized & VD_DRING)
                        (void) ldc_mem_dring_unmap(vd->dring_handle);
                (void) ldc_unreg_callback(vd->ldc_handle);
                (void) ldc_close(vd->ldc_handle);
                (void) ldc_fini(vd->ldc_handle);
        }

        /* Close any open backing-device slices */
        for (uint_t slice = 0; slice < vd->nslices; slice++) {
                if (vd->ldi_handle[slice] != NULL) {
                        PR0("Closing slice %u", slice);
                        (void) ldi_close(vd->ldi_handle[slice],
                            vd_open_flags, kcred);
                }
        }

        /* Free lock */
        if (vd->initialized & VD_LOCKING)
                mutex_destroy(&vd->lock);

        /* Finally, free the vdisk structure itself */
        kmem_free(vd, sizeof (*vd));
}

static int
vds_init_vd(vds_t *vds, uint64_t id, char *block_device, uint64_t ldc_id)
{
        int     status;
        vd_t    *vd = NULL;


#ifdef lint
        (void) vd;
#endif /* lint */

        if ((status = vds_do_init_vd(vds, id, block_device, ldc_id, &vd)) != 0)
                vds_destroy_vd(vd);

        return (status);
}

static int
vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel,
    uint64_t *ldc_id)
{
        int     num_channels;


        /* Look for channel endpoint child(ren) of the vdisk MD node */
        if ((num_channels = md_scan_dag(md, vd_node,
            md_find_name(md, VD_CHANNEL_ENDPOINT),
            md_find_name(md, "fwd"), channel)) <= 0) {
                PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT);
                return (-1);
        }

        /* Get the "id" value for the first channel endpoint node */
        if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) {
                PRN("No \"%s\" property found for \"%s\" of vdisk",
                    VD_ID_PROP, VD_CHANNEL_ENDPOINT);
                return (-1);
        }

        if (num_channels > 1) {
                PRN("Using ID of first of multiple channels for this vdisk");
        }

        return (0);
}

static int
vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id)
{
        int             num_nodes, status;
        size_t          size;
        mde_cookie_t    *channel;


        if ((num_nodes = md_node_count(md)) <= 0) {
                PRN("Invalid node count in Machine Description subtree");
                return (-1);
        }
        size = num_nodes*(sizeof (*channel));
        channel = kmem_zalloc(size, KM_SLEEP);
        status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id);
        kmem_free(channel, size);

        return (status);
}
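
/*
 * Illustrative sketch (node and property values hypothetical) of the MD
 * subtree that the functions above and below consume.  The property names
 * are those defined by VD_ID_PROP, VD_BLOCK_DEVICE_PROP, and
 * VD_CHANNEL_ENDPOINT:
 *
 *	virtual-device-port
 *	 |- id = 0
 *	 |- vds-block-device = "/dev/dsk/c1t48d0s2"
 *	 |- channel-endpoint		(reached via a "fwd" arc)
 *	     |- id = 25			(the LDC ID)
 */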

static void
vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
{
        char            *block_device = NULL;
        uint64_t        id = 0, ldc_id = 0;


        if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
                PRN("Error getting vdisk \"%s\"", VD_ID_PROP);
                return;
        }
        PR0("Adding vdisk ID %lu", id);
        if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP,
            &block_device) != 0) {
                PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
                return;
        }

        if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) {
                PRN("Error getting LDC ID for vdisk %lu", id);
                return;
        }

        if (vds_init_vd(vds, id, block_device, ldc_id) != 0) {
                PRN("Failed to add vdisk ID %lu", id);
                return;
        }
}

static void
vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
{
        uint64_t        id = 0;


        if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
                PRN("Unable to get \"%s\" property from vdisk's MD node",
                    VD_ID_PROP);
                return;
        }
        PR0("Removing vdisk ID %lu", id);
        if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
                PRN("No vdisk entry found for vdisk ID %lu", id);
}

static void
vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node,
    md_t *curr_md, mde_cookie_t curr_vd_node)
{
        char            *curr_dev, *prev_dev;
        uint64_t        curr_id = 0, curr_ldc_id = 0;
        uint64_t        prev_id = 0, prev_ldc_id = 0;
        size_t          len;


        /* Validate that vdisk ID has not changed */
        if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) {
                PRN("Error getting previous vdisk \"%s\" property",
                    VD_ID_PROP);
                return;
        }
        if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) {
                PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP);
                return;
        }
        if (curr_id != prev_id) {
                PRN("Not changing vdisk: ID changed from %lu to %lu",
                    prev_id, curr_id);
                return;
        }

        /* Validate that LDC ID has not changed */
        if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) {
                PRN("Error getting LDC ID for vdisk %lu", prev_id);
                return;
        }
        if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) {
                PRN("Error getting LDC ID for vdisk %lu", curr_id);
                return;
        }
        if (curr_ldc_id != prev_ldc_id) {
                _NOTE(NOTREACHED);      /* lint is confused */
                PRN("Not changing vdisk: "
                    "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id);
                return;
        }

        /* Determine whether device path has changed */
        if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP,
            &prev_dev) != 0) {
                PRN("Error getting previous vdisk \"%s\"",
                    VD_BLOCK_DEVICE_PROP);
                return;
        }
        if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP,
            &curr_dev) != 0) {
                PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
                return;
        }
        if (((len = strlen(curr_dev)) == strlen(prev_dev)) &&
            (strncmp(curr_dev, prev_dev, len) == 0))
                return; /* no relevant (supported) change */

        PR0("Changing vdisk ID %lu", prev_id);
        /* Remove old state, which will close vdisk and reset */
        if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0)
                PRN("No entry found for vdisk ID %lu", prev_id);
        /* Re-initialize vdisk with new state */
        if (vds_init_vd(vds, curr_id, curr_dev, curr_ldc_id) != 0) {
                PRN("Failed to change vdisk ID %lu", curr_id);
                return;
        }
}

static int
vds_process_md(void *arg, mdeg_result_t *md)
{
        int     i;
        vds_t   *vds = arg;


        if (md == NULL)
                return (MDEG_FAILURE);
        ASSERT(vds != NULL);

        for (i = 0; i < md->removed.nelem; i++)
                vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]);
        for (i = 0; i < md->match_curr.nelem; i++)
                vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i],
                    md->match_curr.mdp, md->match_curr.mdep[i]);
        for (i = 0; i < md->added.nelem; i++)
                vds_add_vd(vds, md->added.mdp, md->added.mdep[i]);

        return (MDEG_SUCCESS);
}
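
/*
 * Note that vds_process_md() above handles the three MDEG result lists in
 * a fixed order:  removed vdisks first, then changed ones, then additions.
 * Also note that removal and change both go through mod_hash_destroy() on
 * vd_table, which (because the table is created below with vds_destroy_vd()
 * as its value destructor) is what actually tears the vdisk down;
 * presumably the ordering ensures that resources such as an LDC channel or
 * backing device released by a removal or change are free again before any
 * addition that might reuse them is processed.
 */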

static int
vds_do_attach(dev_info_t *dip)
{
        static char     reg_prop[] = "reg";     /* devinfo ID prop */

        /* MDEG specification for a (particular) vds node */
        static mdeg_prop_spec_t vds_prop_spec[] = {
                {MDET_PROP_STR, "name", {VDS_NAME}},
                {MDET_PROP_VAL, "cfg-handle", {0}},
                {MDET_LIST_END, NULL, {0}}};
        static mdeg_node_spec_t vds_spec = {"virtual-device", vds_prop_spec};

        /* MDEG specification for matching a vd node */
        static md_prop_match_t  vd_prop_spec[] = {
                {MDET_PROP_VAL, VD_ID_PROP},
                {MDET_LIST_END, NULL}};
        static mdeg_node_match_t vd_spec = {"virtual-device-port",
            vd_prop_spec};

        int             status;
        uint64_t        cfg_handle;
        minor_t         instance = ddi_get_instance(dip);
        vds_t           *vds;


        /*
         * The "cfg-handle" property of a vds node in an MD contains the MD's
         * notion of "instance", or unique identifier, for that node; OBP
         * stores the value of the "cfg-handle" MD property as the value of
         * the "reg" property on the node in the device tree it builds from
         * the MD and passes to Solaris.  Thus, we look up the devinfo node's
         * "reg" property value to uniquely identify this device instance when
         * registering with the MD event-generation framework.  If the "reg"
         * property cannot be found, the device tree state is presumably so
         * broken that there is no point in continuing.
         */
        if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, reg_prop)) {
                PRN("vds \"%s\" property does not exist", reg_prop);
                return (DDI_FAILURE);
        }

        /* Get the MD instance for later MDEG registration */
        cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
            reg_prop, -1);

        if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) {
                PRN("Could not allocate state for instance %u", instance);
                return (DDI_FAILURE);
        }

        if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
                PRN("Could not get state for instance %u", instance);
                ddi_soft_state_free(vds_state, instance);
                return (DDI_FAILURE);
        }


        vds->dip        = dip;
        vds->vd_table   = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS,
            vds_destroy_vd, sizeof (void *));
        ASSERT(vds->vd_table != NULL);

        mutex_init(&vds->lock, NULL, MUTEX_DRIVER, NULL);
        vds->initialized |= VDS_LOCKING;

        if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
                PRN("ldi_ident_from_dip() returned errno %d", status);
                return (DDI_FAILURE);
        }
        vds->initialized |= VDS_LDI;

        /* Register for MD updates */
        vds_prop_spec[1].ps_val = cfg_handle;
        if (mdeg_register(&vds_spec, &vd_spec, vds_process_md, vds,
            &vds->mdeg) != MDEG_SUCCESS) {
                PRN("Unable to register for MD updates");
                return (DDI_FAILURE);
        }
        vds->initialized |= VDS_MDEG;

        /* Prevent auto-detaching so driver is available whenever MD changes */
        if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
            DDI_PROP_SUCCESS) {
                PRN("failed to set \"%s\" property for instance %u",
                    DDI_NO_AUTODETACH, instance);
        }

        ddi_report_dev(dip);
        return (DDI_SUCCESS);
}

static int
vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        int     status;

        PR0("Entered");
        switch (cmd) {
        case DDI_ATTACH:
                if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
                        (void) vds_detach(dip, DDI_DETACH);
                return (status);
        case DDI_RESUME:
                /* nothing to do for this non-device */
                return (DDI_SUCCESS);
        default:
                return (DDI_FAILURE);
        }
}

static struct dev_ops vds_ops = {
        DEVO_REV,       /* devo_rev */
        0,              /* devo_refcnt */
        ddi_no_info,    /* devo_getinfo */
        nulldev,        /* devo_identify */
        nulldev,        /* devo_probe */
        vds_attach,     /* devo_attach */
        vds_detach,     /* devo_detach */
        nodev,          /* devo_reset */
        NULL,           /* devo_cb_ops */
        NULL,           /* devo_bus_ops */
        nulldev         /* devo_power */
};

static struct modldrv modldrv = {
        &mod_driverops,
        "virtual disk server v%I%",
        &vds_ops,
};

static struct modlinkage modlinkage = {
        MODREV_1,
        &modldrv,
        NULL
};


int
_init(void)
{
        int     i, status;


        PR0("Built %s %s", __DATE__, __TIME__);
        if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
                return (status);
        if ((status = mod_install(&modlinkage)) != 0) {
                ddi_soft_state_fini(&vds_state);
                return (status);
        }

        /* Fill in the bit-mask of server-supported operations */
        for (i = 0; i < vds_noperations; i++)
                vds_operations |= 1 << (vds_operation[i].operation - 1);

        return (0);
}
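
/*
 * Worked example for the bit-mask computed in _init() (operation codes
 * hypothetical):  if vds_operation[] contained entries with operation
 * codes 1, 2, and 4, the loop would set vds_operations to
 *
 *	(1 << 0) | (1 << 1) | (1 << 3) = 0xb
 *
 * i.e. bit (N - 1) is set for each supported operation code N.
 */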

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
        int     status;


        PR0("Entered");
        if ((status = mod_remove(&modlinkage)) != 0)
                return (status);
        ddi_soft_state_fini(&vds_state);
        return (0);
}