1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * LDoms virtual disk client (vdc) device driver 31 * 32 * This driver runs on a guest logical domain and communicates with the virtual 33 * disk server (vds) driver running on the service domain which is exporting 34 * virtualized "disks" to the guest logical domain. 35 * 36 * The driver can be divided into four sections: 37 * 38 * 1) generic device driver housekeeping 39 * _init, _fini, attach, detach, ops structures, etc. 40 * 41 * 2) communication channel setup 42 * Setup the communications link over the LDC channel that vdc uses to 43 * talk to the vDisk server. Initialise the descriptor ring which 44 * allows the LDC clients to transfer data via memory mappings. 45 * 46 * 3) Support exported to upper layers (filesystems, etc) 47 * The upper layers call into vdc via strategy(9E) and DKIO(7I) 48 * ioctl calls. 
vdc will copy the data to be written to the descriptor 49 * ring or maps the buffer to store the data read by the vDisk 50 * server into the descriptor ring. It then sends a message to the 51 * vDisk server requesting it to complete the operation. 52 * 53 * 4) Handling responses from vDisk server. 54 * The vDisk server will ACK some or all of the messages vdc sends to it 55 * (this is configured during the handshake). Upon receipt of an ACK 56 * vdc will check the descriptor ring and signal to the upper layer 57 * code waiting on the IO. 58 */ 59 60 #include <sys/atomic.h> 61 #include <sys/conf.h> 62 #include <sys/disp.h> 63 #include <sys/ddi.h> 64 #include <sys/dkio.h> 65 #include <sys/efi_partition.h> 66 #include <sys/fcntl.h> 67 #include <sys/file.h> 68 #include <sys/mach_descrip.h> 69 #include <sys/modctl.h> 70 #include <sys/mdeg.h> 71 #include <sys/note.h> 72 #include <sys/open.h> 73 #include <sys/sdt.h> 74 #include <sys/stat.h> 75 #include <sys/sunddi.h> 76 #include <sys/types.h> 77 #include <sys/promif.h> 78 #include <sys/var.h> 79 #include <sys/vtoc.h> 80 #include <sys/archsystm.h> 81 #include <sys/sysmacros.h> 82 83 #include <sys/cdio.h> 84 #include <sys/dktp/fdisk.h> 85 #include <sys/dktp/dadkio.h> 86 #include <sys/mhd.h> 87 #include <sys/scsi/generic/sense.h> 88 #include <sys/scsi/impl/uscsi.h> 89 #include <sys/scsi/impl/services.h> 90 #include <sys/scsi/targets/sddef.h> 91 92 #include <sys/ldoms.h> 93 #include <sys/ldc.h> 94 #include <sys/vio_common.h> 95 #include <sys/vio_mailbox.h> 96 #include <sys/vio_util.h> 97 #include <sys/vdsk_common.h> 98 #include <sys/vdsk_mailbox.h> 99 #include <sys/vdc.h> 100 101 /* 102 * function prototypes 103 */ 104 105 /* standard driver functions */ 106 static int vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred); 107 static int vdc_close(dev_t dev, int flag, int otyp, cred_t *cred); 108 static int vdc_strategy(struct buf *buf); 109 static int vdc_print(dev_t dev, char *str); 110 static int vdc_dump(dev_t dev, 
caddr_t addr, daddr_t blkno, int nblk); 111 static int vdc_read(dev_t dev, struct uio *uio, cred_t *cred); 112 static int vdc_write(dev_t dev, struct uio *uio, cred_t *cred); 113 static int vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, 114 cred_t *credp, int *rvalp); 115 static int vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred); 116 static int vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred); 117 118 static int vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, 119 void *arg, void **resultp); 120 static int vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 121 static int vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 122 123 /* setup */ 124 static void vdc_min(struct buf *bufp); 125 static int vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen); 126 static int vdc_do_ldc_init(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_node); 127 static int vdc_start_ldc_connection(vdc_t *vdc); 128 static int vdc_create_device_nodes(vdc_t *vdc); 129 static int vdc_create_device_nodes_efi(vdc_t *vdc); 130 static int vdc_create_device_nodes_vtoc(vdc_t *vdc); 131 static int vdc_create_device_nodes_props(vdc_t *vdc); 132 static int vdc_get_md_node(dev_info_t *dip, md_t **mdpp, 133 mde_cookie_t *vd_nodep, mde_cookie_t *vd_portp); 134 static int vdc_get_ldc_id(md_t *, mde_cookie_t, uint64_t *); 135 static int vdc_do_ldc_up(vdc_t *vdc); 136 static void vdc_terminate_ldc(vdc_t *vdc); 137 static int vdc_init_descriptor_ring(vdc_t *vdc); 138 static void vdc_destroy_descriptor_ring(vdc_t *vdc); 139 static int vdc_setup_devid(vdc_t *vdc); 140 static void vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *); 141 static void vdc_store_label_vtoc(vdc_t *, struct dk_geom *, struct vtoc *); 142 static void vdc_store_label_unk(vdc_t *vdc); 143 static boolean_t vdc_is_opened(vdc_t *vdc); 144 145 /* handshake with vds */ 146 static int vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver); 147 static int vdc_ver_negotiation(vdc_t *vdcp); 148 static int 
vdc_init_attr_negotiation(vdc_t *vdc); 149 static int vdc_attr_negotiation(vdc_t *vdcp); 150 static int vdc_init_dring_negotiate(vdc_t *vdc); 151 static int vdc_dring_negotiation(vdc_t *vdcp); 152 static int vdc_send_rdx(vdc_t *vdcp); 153 static int vdc_rdx_exchange(vdc_t *vdcp); 154 static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg); 155 156 /* processing incoming messages from vDisk server */ 157 static void vdc_process_msg_thread(vdc_t *vdc); 158 static int vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp); 159 160 static uint_t vdc_handle_cb(uint64_t event, caddr_t arg); 161 static int vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg); 162 static int vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg); 163 static int vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg); 164 static int vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg); 165 static int vdc_send_request(vdc_t *vdcp, int operation, 166 caddr_t addr, size_t nbytes, int slice, diskaddr_t offset, 167 int cb_type, void *cb_arg, vio_desc_direction_t dir); 168 static int vdc_map_to_shared_dring(vdc_t *vdcp, int idx); 169 static int vdc_populate_descriptor(vdc_t *vdcp, int operation, 170 caddr_t addr, size_t nbytes, int slice, diskaddr_t offset, 171 int cb_type, void *cb_arg, vio_desc_direction_t dir); 172 static int vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, 173 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 174 void *cb_arg, vio_desc_direction_t dir, boolean_t); 175 176 static int vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp); 177 static int vdc_drain_response(vdc_t *vdcp); 178 static int vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx); 179 static int vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep); 180 static int vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg); 181 182 /* dkio */ 183 static int vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, 184 int *rvalp); 185 static int vd_process_efi_ioctl(void 
*vdisk, int cmd, uintptr_t arg); 186 static void vdc_create_fake_geometry(vdc_t *vdc); 187 static int vdc_validate_geometry(vdc_t *vdc); 188 static void vdc_validate(vdc_t *vdc); 189 static void vdc_validate_task(void *arg); 190 static int vdc_null_copy_func(vdc_t *vdc, void *from, void *to, 191 int mode, int dir); 192 static int vdc_get_wce_convert(vdc_t *vdc, void *from, void *to, 193 int mode, int dir); 194 static int vdc_set_wce_convert(vdc_t *vdc, void *from, void *to, 195 int mode, int dir); 196 static int vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, 197 int mode, int dir); 198 static int vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, 199 int mode, int dir); 200 static int vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, 201 int mode, int dir); 202 static int vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, 203 int mode, int dir); 204 static int vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, 205 int mode, int dir); 206 static int vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, 207 int mode, int dir); 208 209 static void vdc_ownership_update(vdc_t *vdc, int ownership_flags); 210 static int vdc_access_set(vdc_t *vdc, uint64_t flags, int mode); 211 static vdc_io_t *vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf); 212 static int vdc_failfast_check_resv(vdc_t *vdc); 213 214 /* 215 * Module variables 216 */ 217 218 /* 219 * Tunable variables to control how long vdc waits before timing out on 220 * various operations 221 */ 222 static int vdc_hshake_retries = 3; 223 224 static int vdc_timeout = 0; /* units: seconds */ 225 226 static uint64_t vdc_hz_min_ldc_delay; 227 static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC; 228 static uint64_t vdc_hz_max_ldc_delay; 229 static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC; 230 231 static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC; 232 static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC; 233 234 /* values for dumping - need to run in a tighter loop */ 235 static 
uint64_t vdc_usec_timeout_dump = 100 * MILLISEC; /* 0.1s units: ns */ 236 static int vdc_dump_retries = 100; 237 238 static uint16_t vdc_scsi_timeout = 60; /* 60s units: seconds */ 239 240 static uint64_t vdc_ownership_delay = 6 * MICROSEC; /* 6s units: usec */ 241 242 /* Count of the number of vdc instances attached */ 243 static volatile uint32_t vdc_instance_count = 0; 244 245 /* Tunable to log all SCSI errors */ 246 static boolean_t vdc_scsi_log_error = B_FALSE; 247 248 /* Soft state pointer */ 249 static void *vdc_state; 250 251 /* 252 * Controlling the verbosity of the error/debug messages 253 * 254 * vdc_msglevel - controls level of messages 255 * vdc_matchinst - 64-bit variable where each bit corresponds 256 * to the vdc instance the vdc_msglevel applies. 257 */ 258 int vdc_msglevel = 0x0; 259 uint64_t vdc_matchinst = 0ull; 260 261 /* 262 * Supported vDisk protocol version pairs. 263 * 264 * The first array entry is the latest and preferred version. 265 */ 266 static const vio_ver_t vdc_version[] = {{1, 1}}; 267 268 static struct cb_ops vdc_cb_ops = { 269 vdc_open, /* cb_open */ 270 vdc_close, /* cb_close */ 271 vdc_strategy, /* cb_strategy */ 272 vdc_print, /* cb_print */ 273 vdc_dump, /* cb_dump */ 274 vdc_read, /* cb_read */ 275 vdc_write, /* cb_write */ 276 vdc_ioctl, /* cb_ioctl */ 277 nodev, /* cb_devmap */ 278 nodev, /* cb_mmap */ 279 nodev, /* cb_segmap */ 280 nochpoll, /* cb_chpoll */ 281 ddi_prop_op, /* cb_prop_op */ 282 NULL, /* cb_str */ 283 D_MP | D_64BIT, /* cb_flag */ 284 CB_REV, /* cb_rev */ 285 vdc_aread, /* cb_aread */ 286 vdc_awrite /* cb_awrite */ 287 }; 288 289 static struct dev_ops vdc_ops = { 290 DEVO_REV, /* devo_rev */ 291 0, /* devo_refcnt */ 292 vdc_getinfo, /* devo_getinfo */ 293 nulldev, /* devo_identify */ 294 nulldev, /* devo_probe */ 295 vdc_attach, /* devo_attach */ 296 vdc_detach, /* devo_detach */ 297 nodev, /* devo_reset */ 298 &vdc_cb_ops, /* devo_cb_ops */ 299 NULL, /* devo_bus_ops */ 300 nulldev /* devo_power */ 301 }; 
302 303 static struct modldrv modldrv = { 304 &mod_driverops, 305 "virtual disk client", 306 &vdc_ops, 307 }; 308 309 static struct modlinkage modlinkage = { 310 MODREV_1, 311 &modldrv, 312 NULL 313 }; 314 315 /* -------------------------------------------------------------------------- */ 316 317 /* 318 * Device Driver housekeeping and setup 319 */ 320 321 int 322 _init(void) 323 { 324 int status; 325 326 if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0) 327 return (status); 328 if ((status = mod_install(&modlinkage)) != 0) 329 ddi_soft_state_fini(&vdc_state); 330 return (status); 331 } 332 333 int 334 _info(struct modinfo *modinfop) 335 { 336 return (mod_info(&modlinkage, modinfop)); 337 } 338 339 int 340 _fini(void) 341 { 342 int status; 343 344 if ((status = mod_remove(&modlinkage)) != 0) 345 return (status); 346 ddi_soft_state_fini(&vdc_state); 347 return (0); 348 } 349 350 static int 351 vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) 352 { 353 _NOTE(ARGUNUSED(dip)) 354 355 int instance = VDCUNIT((dev_t)arg); 356 vdc_t *vdc = NULL; 357 358 switch (cmd) { 359 case DDI_INFO_DEVT2DEVINFO: 360 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 361 *resultp = NULL; 362 return (DDI_FAILURE); 363 } 364 *resultp = vdc->dip; 365 return (DDI_SUCCESS); 366 case DDI_INFO_DEVT2INSTANCE: 367 *resultp = (void *)(uintptr_t)instance; 368 return (DDI_SUCCESS); 369 default: 370 *resultp = NULL; 371 return (DDI_FAILURE); 372 } 373 } 374 375 static int 376 vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 377 { 378 kt_did_t failfast_tid, ownership_tid; 379 int instance; 380 int rv; 381 vdc_t *vdc = NULL; 382 383 switch (cmd) { 384 case DDI_DETACH: 385 /* the real work happens below */ 386 break; 387 case DDI_SUSPEND: 388 /* nothing to do for this non-device */ 389 return (DDI_SUCCESS); 390 default: 391 return (DDI_FAILURE); 392 } 393 394 ASSERT(cmd == DDI_DETACH); 395 instance = ddi_get_instance(dip); 396 DMSGX(1, 
"[%d] Entered\n", instance); 397 398 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 399 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 400 return (DDI_FAILURE); 401 } 402 403 /* 404 * This function is called when vdc is detached or if it has failed to 405 * attach. In that case, the attach may have fail before the vdisk type 406 * has been set so we can't call vdc_is_opened(). However as the attach 407 * has failed, we know that the vdisk is not opened and we can safely 408 * detach. 409 */ 410 if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) { 411 DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance); 412 return (DDI_FAILURE); 413 } 414 415 if (vdc->dkio_flush_pending) { 416 DMSG(vdc, 0, 417 "[%d] Cannot detach: %d outstanding DKIO flushes\n", 418 instance, vdc->dkio_flush_pending); 419 return (DDI_FAILURE); 420 } 421 422 if (vdc->validate_pending) { 423 DMSG(vdc, 0, 424 "[%d] Cannot detach: %d outstanding validate request\n", 425 instance, vdc->validate_pending); 426 return (DDI_FAILURE); 427 } 428 429 DMSG(vdc, 0, "[%d] proceeding...\n", instance); 430 431 /* If we took ownership, release ownership */ 432 mutex_enter(&vdc->ownership_lock); 433 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) { 434 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL); 435 if (rv == 0) { 436 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 437 } 438 } 439 mutex_exit(&vdc->ownership_lock); 440 441 /* mark instance as detaching */ 442 vdc->lifecycle = VDC_LC_DETACHING; 443 444 /* 445 * try and disable callbacks to prevent another handshake 446 */ 447 rv = ldc_set_cb_mode(vdc->ldc_handle, LDC_CB_DISABLE); 448 DMSG(vdc, 0, "callback disabled (rv=%d)\n", rv); 449 450 if (vdc->initialized & VDC_THREAD) { 451 mutex_enter(&vdc->read_lock); 452 if ((vdc->read_state == VDC_READ_WAITING) || 453 (vdc->read_state == VDC_READ_RESET)) { 454 vdc->read_state = VDC_READ_RESET; 455 cv_signal(&vdc->read_cv); 456 } 457 458 mutex_exit(&vdc->read_lock); 
459 460 /* wake up any thread waiting for connection to come online */ 461 mutex_enter(&vdc->lock); 462 if (vdc->state == VDC_STATE_INIT_WAITING) { 463 DMSG(vdc, 0, 464 "[%d] write reset - move to resetting state...\n", 465 instance); 466 vdc->state = VDC_STATE_RESETTING; 467 cv_signal(&vdc->initwait_cv); 468 } 469 mutex_exit(&vdc->lock); 470 471 /* now wait until state transitions to VDC_STATE_DETACH */ 472 thread_join(vdc->msg_proc_thr->t_did); 473 ASSERT(vdc->state == VDC_STATE_DETACH); 474 DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n", 475 vdc->instance); 476 } 477 478 mutex_enter(&vdc->lock); 479 480 if (vdc->initialized & VDC_DRING) 481 vdc_destroy_descriptor_ring(vdc); 482 483 if (vdc->initialized & VDC_LDC) 484 vdc_terminate_ldc(vdc); 485 486 if (vdc->failfast_thread) { 487 failfast_tid = vdc->failfast_thread->t_did; 488 vdc->failfast_interval = 0; 489 cv_signal(&vdc->failfast_cv); 490 } else { 491 failfast_tid = 0; 492 } 493 494 if (vdc->ownership & VDC_OWNERSHIP_WANTED) { 495 ownership_tid = vdc->ownership_thread->t_did; 496 vdc->ownership = VDC_OWNERSHIP_NONE; 497 cv_signal(&vdc->ownership_cv); 498 } else { 499 ownership_tid = 0; 500 } 501 502 mutex_exit(&vdc->lock); 503 504 if (failfast_tid != 0) 505 thread_join(failfast_tid); 506 507 if (ownership_tid != 0) 508 thread_join(ownership_tid); 509 510 if (vdc->initialized & VDC_MINOR) { 511 ddi_prop_remove_all(dip); 512 ddi_remove_minor_node(dip, NULL); 513 } 514 515 if (vdc->initialized & VDC_LOCKS) { 516 mutex_destroy(&vdc->lock); 517 mutex_destroy(&vdc->read_lock); 518 mutex_destroy(&vdc->ownership_lock); 519 cv_destroy(&vdc->initwait_cv); 520 cv_destroy(&vdc->dring_free_cv); 521 cv_destroy(&vdc->membind_cv); 522 cv_destroy(&vdc->sync_pending_cv); 523 cv_destroy(&vdc->sync_blocked_cv); 524 cv_destroy(&vdc->read_cv); 525 cv_destroy(&vdc->running_cv); 526 cv_destroy(&vdc->ownership_cv); 527 cv_destroy(&vdc->failfast_cv); 528 cv_destroy(&vdc->failfast_io_cv); 529 } 530 531 if (vdc->minfo) 532 
kmem_free(vdc->minfo, sizeof (struct dk_minfo)); 533 534 if (vdc->cinfo) 535 kmem_free(vdc->cinfo, sizeof (struct dk_cinfo)); 536 537 if (vdc->vtoc) 538 kmem_free(vdc->vtoc, sizeof (struct vtoc)); 539 540 if (vdc->geom) 541 kmem_free(vdc->geom, sizeof (struct dk_geom)); 542 543 if (vdc->devid) { 544 ddi_devid_unregister(dip); 545 ddi_devid_free(vdc->devid); 546 } 547 548 if (vdc->initialized & VDC_SOFT_STATE) 549 ddi_soft_state_free(vdc_state, instance); 550 551 DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc); 552 553 return (DDI_SUCCESS); 554 } 555 556 557 static int 558 vdc_do_attach(dev_info_t *dip) 559 { 560 int instance; 561 vdc_t *vdc = NULL; 562 int status; 563 md_t *mdp; 564 mde_cookie_t vd_node, vd_port; 565 566 ASSERT(dip != NULL); 567 568 instance = ddi_get_instance(dip); 569 if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) { 570 cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure", 571 instance); 572 return (DDI_FAILURE); 573 } 574 575 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 576 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 577 return (DDI_FAILURE); 578 } 579 580 /* 581 * We assign the value to initialized in this case to zero out the 582 * variable and then set bits in it to indicate what has been done 583 */ 584 vdc->initialized = VDC_SOFT_STATE; 585 586 vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc); 587 vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc); 588 589 vdc->dip = dip; 590 vdc->instance = instance; 591 vdc->vdisk_type = VD_DISK_TYPE_UNK; 592 vdc->vdisk_label = VD_DISK_LABEL_UNK; 593 vdc->state = VDC_STATE_INIT; 594 vdc->lifecycle = VDC_LC_ATTACHING; 595 vdc->ldc_state = 0; 596 vdc->session_id = 0; 597 vdc->block_size = DEV_BSIZE; 598 vdc->max_xfer_sz = maxphys / DEV_BSIZE; 599 600 /* 601 * We assume, for now, that the vDisk server will export 'read' 602 * operations to us at a minimum (this is needed because of checks 603 * in vdc for supported operations early 
in the handshake process). 604 * The vDisk server will return ENOTSUP if this is not the case. 605 * The value will be overwritten during the attribute exchange with 606 * the bitmask of operations exported by server. 607 */ 608 vdc->operations = VD_OP_MASK_READ; 609 610 vdc->vtoc = NULL; 611 vdc->geom = NULL; 612 vdc->cinfo = NULL; 613 vdc->minfo = NULL; 614 615 mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL); 616 cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL); 617 cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL); 618 cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL); 619 cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL); 620 621 vdc->threads_pending = 0; 622 vdc->sync_op_pending = B_FALSE; 623 vdc->sync_op_blocked = B_FALSE; 624 cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL); 625 cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL); 626 627 mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL); 628 cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL); 629 cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL); 630 cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL); 631 632 /* init blocking msg read functionality */ 633 mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL); 634 cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL); 635 vdc->read_state = VDC_READ_IDLE; 636 637 vdc->initialized |= VDC_LOCKS; 638 639 /* get device and port MD node for this disk instance */ 640 if (vdc_get_md_node(dip, &mdp, &vd_node, &vd_port) != 0) { 641 cmn_err(CE_NOTE, "[%d] Could not get machine description node", 642 instance); 643 return (DDI_FAILURE); 644 } 645 646 /* set the connection timeout */ 647 if (vd_port == NULL || (md_get_prop_val(mdp, vd_port, 648 VDC_MD_TIMEOUT, &vdc->ctimeout) != 0)) { 649 vdc->ctimeout = 0; 650 } 651 652 /* initialise LDC channel which will be used to communicate with vds */ 653 status = vdc_do_ldc_init(vdc, mdp, vd_node); 654 655 (void) md_fini_handle(mdp); 656 657 if (status != 0) { 658 cmn_err(CE_NOTE, "[%d] Couldn't initialize LDC", 
instance); 659 goto return_status; 660 } 661 662 /* initialize the thread responsible for managing state with server */ 663 vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread, 664 vdc, 0, &p0, TS_RUN, minclsyspri); 665 if (vdc->msg_proc_thr == NULL) { 666 cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread", 667 instance); 668 return (DDI_FAILURE); 669 } 670 671 vdc->initialized |= VDC_THREAD; 672 673 atomic_inc_32(&vdc_instance_count); 674 675 /* 676 * Check the disk label. This will send requests and do the handshake. 677 * We don't really care about the disk label now. What we really need is 678 * the handshake do be done so that we know the type of the disk (slice 679 * or full disk) and the appropriate device nodes can be created. 680 */ 681 vdc->vdisk_label = VD_DISK_LABEL_UNK; 682 vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP); 683 vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP); 684 vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); 685 686 mutex_enter(&vdc->lock); 687 (void) vdc_validate_geometry(vdc); 688 mutex_exit(&vdc->lock); 689 690 /* 691 * Now that we have the device info we can create the 692 * device nodes and properties 693 */ 694 status = vdc_create_device_nodes(vdc); 695 if (status) { 696 DMSG(vdc, 0, "[%d] Failed to create device nodes", 697 instance); 698 goto return_status; 699 } 700 status = vdc_create_device_nodes_props(vdc); 701 if (status) { 702 DMSG(vdc, 0, "[%d] Failed to create device nodes" 703 " properties (%d)", instance, status); 704 goto return_status; 705 } 706 707 /* 708 * Setup devid 709 */ 710 if (vdc_setup_devid(vdc)) { 711 DMSG(vdc, 0, "[%d] No device id available\n", instance); 712 } 713 714 ddi_report_dev(dip); 715 vdc->lifecycle = VDC_LC_ONLINE; 716 DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance); 717 718 return_status: 719 DMSG(vdc, 0, "[%d] Attach completed\n", instance); 720 return (status); 721 } 722 723 static int 724 vdc_attach(dev_info_t *dip, 
ddi_attach_cmd_t cmd) 725 { 726 int status; 727 728 switch (cmd) { 729 case DDI_ATTACH: 730 if ((status = vdc_do_attach(dip)) != 0) 731 (void) vdc_detach(dip, DDI_DETACH); 732 return (status); 733 case DDI_RESUME: 734 /* nothing to do for this non-device */ 735 return (DDI_SUCCESS); 736 default: 737 return (DDI_FAILURE); 738 } 739 } 740 741 static int 742 vdc_do_ldc_init(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_node) 743 { 744 int status = 0; 745 ldc_status_t ldc_state; 746 ldc_attr_t ldc_attr; 747 uint64_t ldc_id = 0; 748 749 ASSERT(vdc != NULL); 750 751 vdc->initialized |= VDC_LDC; 752 753 if ((status = vdc_get_ldc_id(mdp, vd_node, &ldc_id)) != 0) { 754 DMSG(vdc, 0, "[%d] Failed to get LDC channel ID property", 755 vdc->instance); 756 return (EIO); 757 } 758 759 DMSGX(0, "[%d] LDC id is 0x%lx\n", vdc->instance, ldc_id); 760 761 vdc->ldc_id = ldc_id; 762 763 ldc_attr.devclass = LDC_DEV_BLK; 764 ldc_attr.instance = vdc->instance; 765 ldc_attr.mode = LDC_MODE_UNRELIABLE; /* unreliable transport */ 766 ldc_attr.mtu = VD_LDC_MTU; 767 768 if ((vdc->initialized & VDC_LDC_INIT) == 0) { 769 status = ldc_init(ldc_id, &ldc_attr, &vdc->ldc_handle); 770 if (status != 0) { 771 DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d", 772 vdc->instance, ldc_id, status); 773 return (status); 774 } 775 vdc->initialized |= VDC_LDC_INIT; 776 } 777 status = ldc_status(vdc->ldc_handle, &ldc_state); 778 if (status != 0) { 779 DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]", 780 vdc->instance, status); 781 return (status); 782 } 783 vdc->ldc_state = ldc_state; 784 785 if ((vdc->initialized & VDC_LDC_CB) == 0) { 786 status = ldc_reg_callback(vdc->ldc_handle, vdc_handle_cb, 787 (caddr_t)vdc); 788 if (status != 0) { 789 DMSG(vdc, 0, "[%d] LDC callback reg. 
failed (%d)", 790 vdc->instance, status); 791 return (status); 792 } 793 vdc->initialized |= VDC_LDC_CB; 794 } 795 796 vdc->initialized |= VDC_LDC; 797 798 /* 799 * At this stage we have initialised LDC, we will now try and open 800 * the connection. 801 */ 802 if (vdc->ldc_state == LDC_INIT) { 803 status = ldc_open(vdc->ldc_handle); 804 if (status != 0) { 805 DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d", 806 vdc->instance, vdc->ldc_id, status); 807 return (status); 808 } 809 vdc->initialized |= VDC_LDC_OPEN; 810 } 811 812 return (status); 813 } 814 815 static int 816 vdc_start_ldc_connection(vdc_t *vdc) 817 { 818 int status = 0; 819 820 ASSERT(vdc != NULL); 821 822 ASSERT(MUTEX_HELD(&vdc->lock)); 823 824 status = vdc_do_ldc_up(vdc); 825 826 DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance); 827 828 return (status); 829 } 830 831 static int 832 vdc_stop_ldc_connection(vdc_t *vdcp) 833 { 834 int status; 835 836 DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n", 837 vdcp->state); 838 839 status = ldc_down(vdcp->ldc_handle); 840 DMSG(vdcp, 0, "ldc_down() = %d\n", status); 841 842 vdcp->initialized &= ~VDC_HANDSHAKE; 843 DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized); 844 845 return (status); 846 } 847 848 static int 849 vdc_create_device_nodes_efi(vdc_t *vdc) 850 { 851 ddi_remove_minor_node(vdc->dip, "h"); 852 ddi_remove_minor_node(vdc->dip, "h,raw"); 853 854 if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK, 855 VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE), 856 DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 857 cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'", 858 vdc->instance); 859 return (EIO); 860 } 861 862 /* if any device node is created we set this flag */ 863 vdc->initialized |= VDC_MINOR; 864 865 if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR, 866 VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE), 867 DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 868 cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'", 869 vdc->instance); 870 
return (EIO); 871 } 872 873 return (0); 874 } 875 876 static int 877 vdc_create_device_nodes_vtoc(vdc_t *vdc) 878 { 879 ddi_remove_minor_node(vdc->dip, "wd"); 880 ddi_remove_minor_node(vdc->dip, "wd,raw"); 881 882 if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK, 883 VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE), 884 DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 885 cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'", 886 vdc->instance); 887 return (EIO); 888 } 889 890 /* if any device node is created we set this flag */ 891 vdc->initialized |= VDC_MINOR; 892 893 if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR, 894 VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE), 895 DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 896 cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'", 897 vdc->instance); 898 return (EIO); 899 } 900 901 return (0); 902 } 903 904 /* 905 * Function: 906 * vdc_create_device_nodes 907 * 908 * Description: 909 * This function creates the block and character device nodes under 910 * /devices along with the node properties. It is called as part of 911 * the attach(9E) of the instance during the handshake with vds after 912 * vds has sent the attributes to vdc. 913 * 914 * If the device is of type VD_DISK_TYPE_SLICE then the minor node 915 * of 2 is used in keeping with the Solaris convention that slice 2 916 * refers to a whole disk. 
Slices start at 'a' 917 * 918 * Parameters: 919 * vdc - soft state pointer 920 * 921 * Return Values 922 * 0 - Success 923 * EIO - Failed to create node 924 * EINVAL - Unknown type of disk exported 925 */ 926 static int 927 vdc_create_device_nodes(vdc_t *vdc) 928 { 929 char name[sizeof ("s,raw")]; 930 dev_info_t *dip = NULL; 931 int instance, status; 932 int num_slices = 1; 933 int i; 934 935 ASSERT(vdc != NULL); 936 937 instance = vdc->instance; 938 dip = vdc->dip; 939 940 switch (vdc->vdisk_type) { 941 case VD_DISK_TYPE_DISK: 942 num_slices = V_NUMPAR; 943 break; 944 case VD_DISK_TYPE_SLICE: 945 num_slices = 1; 946 break; 947 case VD_DISK_TYPE_UNK: 948 default: 949 return (EINVAL); 950 } 951 952 /* 953 * Minor nodes are different for EFI disks: EFI disks do not have 954 * a minor node 'g' for the minor number corresponding to slice 955 * VD_EFI_WD_SLICE (slice 7) instead they have a minor node 'wd' 956 * representing the whole disk. 957 */ 958 for (i = 0; i < num_slices; i++) { 959 960 if (i == VD_EFI_WD_SLICE) { 961 if (vdc->vdisk_label == VD_DISK_LABEL_EFI) 962 status = vdc_create_device_nodes_efi(vdc); 963 else 964 status = vdc_create_device_nodes_vtoc(vdc); 965 if (status != 0) 966 return (status); 967 continue; 968 } 969 970 (void) snprintf(name, sizeof (name), "%c", 'a' + i); 971 if (ddi_create_minor_node(dip, name, S_IFBLK, 972 VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 973 cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'", 974 instance, name); 975 return (EIO); 976 } 977 978 /* if any device node is created we set this flag */ 979 vdc->initialized |= VDC_MINOR; 980 981 (void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw"); 982 983 if (ddi_create_minor_node(dip, name, S_IFCHR, 984 VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 985 cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'", 986 instance, name); 987 return (EIO); 988 } 989 } 990 991 return (0); 992 } 993 994 /* 995 * Function: 996 * 
vdc_create_device_nodes_props 997 * 998 * Description: 999 * This function creates the block and character device nodes under 1000 * /devices along with the node properties. It is called as part of 1001 * the attach(9E) of the instance during the handshake with vds after 1002 * vds has sent the attributes to vdc. 1003 * 1004 * Parameters: 1005 * vdc - soft state pointer 1006 * 1007 * Return Values 1008 * 0 - Success 1009 * EIO - Failed to create device node property 1010 * EINVAL - Unknown type of disk exported 1011 */ 1012 static int 1013 vdc_create_device_nodes_props(vdc_t *vdc) 1014 { 1015 dev_info_t *dip = NULL; 1016 int instance; 1017 int num_slices = 1; 1018 int64_t size = 0; 1019 dev_t dev; 1020 int rv; 1021 int i; 1022 1023 ASSERT(vdc != NULL); 1024 1025 instance = vdc->instance; 1026 dip = vdc->dip; 1027 1028 switch (vdc->vdisk_type) { 1029 case VD_DISK_TYPE_DISK: 1030 num_slices = V_NUMPAR; 1031 break; 1032 case VD_DISK_TYPE_SLICE: 1033 num_slices = 1; 1034 break; 1035 case VD_DISK_TYPE_UNK: 1036 default: 1037 return (EINVAL); 1038 } 1039 1040 if (vdc->vdisk_label == VD_DISK_LABEL_UNK) { 1041 /* remove all properties */ 1042 for (i = 0; i < num_slices; i++) { 1043 dev = makedevice(ddi_driver_major(dip), 1044 VD_MAKE_DEV(instance, i)); 1045 (void) ddi_prop_remove(dev, dip, VDC_SIZE_PROP_NAME); 1046 (void) ddi_prop_remove(dev, dip, VDC_NBLOCKS_PROP_NAME); 1047 } 1048 return (0); 1049 } 1050 1051 for (i = 0; i < num_slices; i++) { 1052 dev = makedevice(ddi_driver_major(dip), 1053 VD_MAKE_DEV(instance, i)); 1054 1055 size = vdc->slice[i].nblocks * vdc->block_size; 1056 DMSG(vdc, 0, "[%d] sz %ld (%ld Mb) p_size %lx\n", 1057 instance, size, size / (1024 * 1024), 1058 vdc->slice[i].nblocks); 1059 1060 rv = ddi_prop_update_int64(dev, dip, VDC_SIZE_PROP_NAME, size); 1061 if (rv != DDI_PROP_SUCCESS) { 1062 cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop of [%ld]", 1063 instance, VDC_SIZE_PROP_NAME, size); 1064 return (EIO); 1065 } 1066 1067 rv = 
ddi_prop_update_int64(dev, dip, VDC_NBLOCKS_PROP_NAME, 1068 lbtodb(size)); 1069 if (rv != DDI_PROP_SUCCESS) { 1070 cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop [%llu]", 1071 instance, VDC_NBLOCKS_PROP_NAME, lbtodb(size)); 1072 return (EIO); 1073 } 1074 } 1075 1076 return (0); 1077 } 1078 1079 /* 1080 * Function: 1081 * vdc_is_opened 1082 * 1083 * Description: 1084 * This function checks if any slice of a given virtual disk is 1085 * currently opened. 1086 * 1087 * Parameters: 1088 * vdc - soft state pointer 1089 * 1090 * Return Values 1091 * B_TRUE - at least one slice is opened. 1092 * B_FALSE - no slice is opened. 1093 */ 1094 static boolean_t 1095 vdc_is_opened(vdc_t *vdc) 1096 { 1097 int i, nslices; 1098 1099 switch (vdc->vdisk_type) { 1100 case VD_DISK_TYPE_DISK: 1101 nslices = V_NUMPAR; 1102 break; 1103 case VD_DISK_TYPE_SLICE: 1104 nslices = 1; 1105 break; 1106 case VD_DISK_TYPE_UNK: 1107 default: 1108 ASSERT(0); 1109 } 1110 1111 /* check if there's any layered open */ 1112 for (i = 0; i < nslices; i++) { 1113 if (vdc->open_lyr[i] > 0) 1114 return (B_TRUE); 1115 } 1116 1117 /* check if there is any other kind of open */ 1118 for (i = 0; i < OTYPCNT; i++) { 1119 if (vdc->open[i] != 0) 1120 return (B_TRUE); 1121 } 1122 1123 return (B_FALSE); 1124 } 1125 1126 static int 1127 vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp) 1128 { 1129 uint8_t slicemask; 1130 int i; 1131 1132 ASSERT(otyp < OTYPCNT); 1133 ASSERT(slice < V_NUMPAR); 1134 ASSERT(MUTEX_HELD(&vdc->lock)); 1135 1136 slicemask = 1 << slice; 1137 1138 /* check if slice is already exclusively opened */ 1139 if (vdc->open_excl & slicemask) 1140 return (EBUSY); 1141 1142 /* if open exclusive, check if slice is already opened */ 1143 if (flag & FEXCL) { 1144 if (vdc->open_lyr[slice] > 0) 1145 return (EBUSY); 1146 for (i = 0; i < OTYPCNT; i++) { 1147 if (vdc->open[i] & slicemask) 1148 return (EBUSY); 1149 } 1150 vdc->open_excl |= slicemask; 1151 } 1152 1153 /* mark slice as opened */ 1154 if (otyp 
== OTYP_LYR) { 1155 vdc->open_lyr[slice]++; 1156 } else { 1157 vdc->open[otyp] |= slicemask; 1158 } 1159 1160 return (0); 1161 } 1162 1163 static void 1164 vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp) 1165 { 1166 uint8_t slicemask; 1167 1168 ASSERT(otyp < OTYPCNT); 1169 ASSERT(slice < V_NUMPAR); 1170 ASSERT(MUTEX_HELD(&vdc->lock)); 1171 1172 slicemask = 1 << slice; 1173 1174 if (otyp == OTYP_LYR) { 1175 ASSERT(vdc->open_lyr[slice] > 0); 1176 vdc->open_lyr[slice]--; 1177 } else { 1178 vdc->open[otyp] &= ~slicemask; 1179 } 1180 1181 if (flag & FEXCL) 1182 vdc->open_excl &= ~slicemask; 1183 } 1184 1185 static int 1186 vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred) 1187 { 1188 _NOTE(ARGUNUSED(cred)) 1189 1190 int instance; 1191 int slice, status = 0; 1192 vdc_t *vdc; 1193 1194 ASSERT(dev != NULL); 1195 instance = VDCUNIT(*dev); 1196 1197 if (otyp >= OTYPCNT) 1198 return (EINVAL); 1199 1200 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 1201 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 1202 return (ENXIO); 1203 } 1204 1205 DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n", 1206 getminor(*dev), flag, otyp); 1207 1208 slice = VDCPART(*dev); 1209 1210 mutex_enter(&vdc->lock); 1211 1212 status = vdc_mark_opened(vdc, slice, flag, otyp); 1213 1214 if (status != 0) { 1215 mutex_exit(&vdc->lock); 1216 return (status); 1217 } 1218 1219 if (flag & (FNDELAY | FNONBLOCK)) { 1220 1221 /* don't resubmit a validate request if there's already one */ 1222 if (vdc->validate_pending > 0) { 1223 mutex_exit(&vdc->lock); 1224 return (0); 1225 } 1226 1227 /* call vdc_validate() asynchronously to avoid blocking */ 1228 if (taskq_dispatch(system_taskq, vdc_validate_task, 1229 (void *)vdc, TQ_NOSLEEP) == NULL) { 1230 vdc_mark_closed(vdc, slice, flag, otyp); 1231 mutex_exit(&vdc->lock); 1232 return (ENXIO); 1233 } 1234 1235 vdc->validate_pending++; 1236 mutex_exit(&vdc->lock); 1237 return (0); 1238 } 1239 1240 mutex_exit(&vdc->lock); 1241 
1242 vdc_validate(vdc); 1243 1244 mutex_enter(&vdc->lock); 1245 1246 if (vdc->vdisk_label == VD_DISK_LABEL_UNK || 1247 vdc->slice[slice].nblocks == 0) { 1248 vdc_mark_closed(vdc, slice, flag, otyp); 1249 status = EIO; 1250 } 1251 1252 mutex_exit(&vdc->lock); 1253 1254 return (status); 1255 } 1256 1257 static int 1258 vdc_close(dev_t dev, int flag, int otyp, cred_t *cred) 1259 { 1260 _NOTE(ARGUNUSED(cred)) 1261 1262 int instance; 1263 int slice; 1264 int rv, rval; 1265 vdc_t *vdc; 1266 1267 instance = VDCUNIT(dev); 1268 1269 if (otyp >= OTYPCNT) 1270 return (EINVAL); 1271 1272 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 1273 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 1274 return (ENXIO); 1275 } 1276 1277 DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp); 1278 1279 slice = VDCPART(dev); 1280 1281 /* 1282 * Attempt to flush the W$ on a close operation. If this is 1283 * not a supported IOCTL command or the backing device is read-only 1284 * do not fail the close operation. 
1285 */ 1286 rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval); 1287 1288 if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) { 1289 DMSG(vdc, 0, "[%d] flush failed with error %d on close\n", 1290 instance, rv); 1291 return (EIO); 1292 } 1293 1294 mutex_enter(&vdc->lock); 1295 vdc_mark_closed(vdc, slice, flag, otyp); 1296 mutex_exit(&vdc->lock); 1297 1298 return (0); 1299 } 1300 1301 static int 1302 vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) 1303 { 1304 _NOTE(ARGUNUSED(credp)) 1305 1306 return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp)); 1307 } 1308 1309 static int 1310 vdc_print(dev_t dev, char *str) 1311 { 1312 cmn_err(CE_NOTE, "vdc%d: %s", VDCUNIT(dev), str); 1313 return (0); 1314 } 1315 1316 static int 1317 vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 1318 { 1319 int rv; 1320 size_t nbytes = nblk * DEV_BSIZE; 1321 int instance = VDCUNIT(dev); 1322 vdc_t *vdc = NULL; 1323 1324 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 1325 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 1326 return (ENXIO); 1327 } 1328 1329 DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n", 1330 instance, nbytes, blkno, (void *)addr); 1331 rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes, 1332 VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir); 1333 if (rv) { 1334 DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv); 1335 return (rv); 1336 } 1337 1338 if (ddi_in_panic()) 1339 (void) vdc_drain_response(vdc); 1340 1341 DMSG(vdc, 0, "[%d] End\n", instance); 1342 1343 return (0); 1344 } 1345 1346 /* -------------------------------------------------------------------------- */ 1347 1348 /* 1349 * Disk access routines 1350 * 1351 */ 1352 1353 /* 1354 * vdc_strategy() 1355 * 1356 * Return Value: 1357 * 0: As per strategy(9E), the strategy() function must return 0 1358 * [ bioerror(9f) sets b_flags to the proper error code ] 1359 */ 1360 static int 
1361 vdc_strategy(struct buf *buf) 1362 { 1363 int rv = -1; 1364 vdc_t *vdc = NULL; 1365 int instance = VDCUNIT(buf->b_edev); 1366 int op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE; 1367 int slice; 1368 1369 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 1370 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 1371 bioerror(buf, ENXIO); 1372 biodone(buf); 1373 return (0); 1374 } 1375 1376 DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n", 1377 instance, (buf->b_flags & B_READ) ? "Read" : "Write", 1378 buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr); 1379 DTRACE_IO2(vstart, buf_t *, buf, vdc_t *, vdc); 1380 1381 bp_mapin(buf); 1382 1383 if ((long)buf->b_private == VD_SLICE_NONE) { 1384 /* I/O using an absolute disk offset */ 1385 slice = VD_SLICE_NONE; 1386 } else { 1387 slice = VDCPART(buf->b_edev); 1388 } 1389 1390 rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr, 1391 buf->b_bcount, slice, buf->b_lblkno, 1392 CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir : 1393 VIO_write_dir); 1394 1395 /* 1396 * If the request was successfully sent, the strategy call returns and 1397 * the ACK handler calls the bioxxx functions when the vDisk server is 1398 * done. 1399 */ 1400 if (rv) { 1401 DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv); 1402 bioerror(buf, rv); 1403 biodone(buf); 1404 } 1405 1406 return (0); 1407 } 1408 1409 /* 1410 * Function: 1411 * vdc_min 1412 * 1413 * Description: 1414 * Routine to limit the size of a data transfer. Used in 1415 * conjunction with physio(9F). 1416 * 1417 * Arguments: 1418 * bp - pointer to the indicated buf(9S) struct. 
1419 * 1420 */ 1421 static void 1422 vdc_min(struct buf *bufp) 1423 { 1424 vdc_t *vdc = NULL; 1425 int instance = VDCUNIT(bufp->b_edev); 1426 1427 vdc = ddi_get_soft_state(vdc_state, instance); 1428 VERIFY(vdc != NULL); 1429 1430 if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) { 1431 bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size; 1432 } 1433 } 1434 1435 static int 1436 vdc_read(dev_t dev, struct uio *uio, cred_t *cred) 1437 { 1438 _NOTE(ARGUNUSED(cred)) 1439 1440 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1441 return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio)); 1442 } 1443 1444 static int 1445 vdc_write(dev_t dev, struct uio *uio, cred_t *cred) 1446 { 1447 _NOTE(ARGUNUSED(cred)) 1448 1449 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1450 return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio)); 1451 } 1452 1453 static int 1454 vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred) 1455 { 1456 _NOTE(ARGUNUSED(cred)) 1457 1458 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1459 return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio)); 1460 } 1461 1462 static int 1463 vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred) 1464 { 1465 _NOTE(ARGUNUSED(cred)) 1466 1467 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1468 return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio)); 1469 } 1470 1471 1472 /* -------------------------------------------------------------------------- */ 1473 1474 /* 1475 * Handshake support 1476 */ 1477 1478 1479 /* 1480 * Function: 1481 * vdc_init_ver_negotiation() 1482 * 1483 * Description: 1484 * 1485 * Arguments: 1486 * vdc - soft state pointer for this instance of the device driver. 
1487 * 1488 * Return Code: 1489 * 0 - Success 1490 */ 1491 static int 1492 vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver) 1493 { 1494 vio_ver_msg_t pkt; 1495 size_t msglen = sizeof (pkt); 1496 int status = -1; 1497 1498 ASSERT(vdc != NULL); 1499 ASSERT(mutex_owned(&vdc->lock)); 1500 1501 DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance); 1502 1503 /* 1504 * set the Session ID to a unique value 1505 * (the lower 32 bits of the clock tick) 1506 */ 1507 vdc->session_id = ((uint32_t)gettick() & 0xffffffff); 1508 DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id); 1509 1510 pkt.tag.vio_msgtype = VIO_TYPE_CTRL; 1511 pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 1512 pkt.tag.vio_subtype_env = VIO_VER_INFO; 1513 pkt.tag.vio_sid = vdc->session_id; 1514 pkt.dev_class = VDEV_DISK; 1515 pkt.ver_major = ver.major; 1516 pkt.ver_minor = ver.minor; 1517 1518 status = vdc_send(vdc, (caddr_t)&pkt, &msglen); 1519 DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n", 1520 vdc->instance, status); 1521 if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) { 1522 DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: " 1523 "id(%lx) rv(%d) size(%ld)", vdc->instance, vdc->ldc_handle, 1524 status, msglen); 1525 if (msglen != sizeof (vio_ver_msg_t)) 1526 status = ENOMSG; 1527 } 1528 1529 return (status); 1530 } 1531 1532 /* 1533 * Function: 1534 * vdc_ver_negotiation() 1535 * 1536 * Description: 1537 * 1538 * Arguments: 1539 * vdcp - soft state pointer for this instance of the device driver. 
1540 * 1541 * Return Code: 1542 * 0 - Success 1543 */ 1544 static int 1545 vdc_ver_negotiation(vdc_t *vdcp) 1546 { 1547 vio_msg_t vio_msg; 1548 int status; 1549 1550 if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0])) 1551 return (status); 1552 1553 /* release lock and wait for response */ 1554 mutex_exit(&vdcp->lock); 1555 status = vdc_wait_for_response(vdcp, &vio_msg); 1556 mutex_enter(&vdcp->lock); 1557 if (status) { 1558 DMSG(vdcp, 0, 1559 "[%d] Failed waiting for Ver negotiation response, rv(%d)", 1560 vdcp->instance, status); 1561 return (status); 1562 } 1563 1564 /* check type and sub_type ... */ 1565 if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL || 1566 vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) { 1567 DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n", 1568 vdcp->instance); 1569 return (EPROTO); 1570 } 1571 1572 return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg)); 1573 } 1574 1575 /* 1576 * Function: 1577 * vdc_init_attr_negotiation() 1578 * 1579 * Description: 1580 * 1581 * Arguments: 1582 * vdc - soft state pointer for this instance of the device driver. 
1583 * 1584 * Return Code: 1585 * 0 - Success 1586 */ 1587 static int 1588 vdc_init_attr_negotiation(vdc_t *vdc) 1589 { 1590 vd_attr_msg_t pkt; 1591 size_t msglen = sizeof (pkt); 1592 int status; 1593 1594 ASSERT(vdc != NULL); 1595 ASSERT(mutex_owned(&vdc->lock)); 1596 1597 DMSG(vdc, 0, "[%d] entered\n", vdc->instance); 1598 1599 /* fill in tag */ 1600 pkt.tag.vio_msgtype = VIO_TYPE_CTRL; 1601 pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 1602 pkt.tag.vio_subtype_env = VIO_ATTR_INFO; 1603 pkt.tag.vio_sid = vdc->session_id; 1604 /* fill in payload */ 1605 pkt.max_xfer_sz = vdc->max_xfer_sz; 1606 pkt.vdisk_block_size = vdc->block_size; 1607 pkt.xfer_mode = VIO_DRING_MODE; 1608 pkt.operations = 0; /* server will set bits of valid operations */ 1609 pkt.vdisk_type = 0; /* server will set to valid device type */ 1610 pkt.vdisk_media = 0; /* server will set to valid media type */ 1611 pkt.vdisk_size = 0; /* server will set to valid size */ 1612 1613 status = vdc_send(vdc, (caddr_t)&pkt, &msglen); 1614 DMSG(vdc, 0, "Attr info sent (status = %d)\n", status); 1615 1616 if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) { 1617 DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: " 1618 "id(%lx) rv(%d) size(%ld)", vdc->instance, vdc->ldc_handle, 1619 status, msglen); 1620 if (msglen != sizeof (vio_ver_msg_t)) 1621 status = ENOMSG; 1622 } 1623 1624 return (status); 1625 } 1626 1627 /* 1628 * Function: 1629 * vdc_attr_negotiation() 1630 * 1631 * Description: 1632 * 1633 * Arguments: 1634 * vdc - soft state pointer for this instance of the device driver. 
1635 * 1636 * Return Code: 1637 * 0 - Success 1638 */ 1639 static int 1640 vdc_attr_negotiation(vdc_t *vdcp) 1641 { 1642 int status; 1643 vio_msg_t vio_msg; 1644 1645 if (status = vdc_init_attr_negotiation(vdcp)) 1646 return (status); 1647 1648 /* release lock and wait for response */ 1649 mutex_exit(&vdcp->lock); 1650 status = vdc_wait_for_response(vdcp, &vio_msg); 1651 mutex_enter(&vdcp->lock); 1652 if (status) { 1653 DMSG(vdcp, 0, 1654 "[%d] Failed waiting for Attr negotiation response, rv(%d)", 1655 vdcp->instance, status); 1656 return (status); 1657 } 1658 1659 /* check type and sub_type ... */ 1660 if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL || 1661 vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) { 1662 DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n", 1663 vdcp->instance); 1664 return (EPROTO); 1665 } 1666 1667 return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg)); 1668 } 1669 1670 1671 /* 1672 * Function: 1673 * vdc_init_dring_negotiate() 1674 * 1675 * Description: 1676 * 1677 * Arguments: 1678 * vdc - soft state pointer for this instance of the device driver. 
1679 * 1680 * Return Code: 1681 * 0 - Success 1682 */ 1683 static int 1684 vdc_init_dring_negotiate(vdc_t *vdc) 1685 { 1686 vio_dring_reg_msg_t pkt; 1687 size_t msglen = sizeof (pkt); 1688 int status = -1; 1689 int retry; 1690 int nretries = 10; 1691 1692 ASSERT(vdc != NULL); 1693 ASSERT(mutex_owned(&vdc->lock)); 1694 1695 for (retry = 0; retry < nretries; retry++) { 1696 status = vdc_init_descriptor_ring(vdc); 1697 if (status != EAGAIN) 1698 break; 1699 drv_usecwait(vdc_min_timeout_ldc); 1700 } 1701 1702 if (status != 0) { 1703 DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n", 1704 vdc->instance, status); 1705 return (status); 1706 } 1707 1708 DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n", 1709 vdc->instance, status); 1710 1711 /* fill in tag */ 1712 pkt.tag.vio_msgtype = VIO_TYPE_CTRL; 1713 pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 1714 pkt.tag.vio_subtype_env = VIO_DRING_REG; 1715 pkt.tag.vio_sid = vdc->session_id; 1716 /* fill in payload */ 1717 pkt.dring_ident = 0; 1718 pkt.num_descriptors = vdc->dring_len; 1719 pkt.descriptor_size = vdc->dring_entry_size; 1720 pkt.options = (VIO_TX_DRING | VIO_RX_DRING); 1721 pkt.ncookies = vdc->dring_cookie_count; 1722 pkt.cookie[0] = vdc->dring_cookie[0]; /* for now just one cookie */ 1723 1724 status = vdc_send(vdc, (caddr_t)&pkt, &msglen); 1725 if (status != 0) { 1726 DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)", 1727 vdc->instance, status); 1728 } 1729 1730 return (status); 1731 } 1732 1733 1734 /* 1735 * Function: 1736 * vdc_dring_negotiation() 1737 * 1738 * Description: 1739 * 1740 * Arguments: 1741 * vdc - soft state pointer for this instance of the device driver. 
1742 * 1743 * Return Code: 1744 * 0 - Success 1745 */ 1746 static int 1747 vdc_dring_negotiation(vdc_t *vdcp) 1748 { 1749 int status; 1750 vio_msg_t vio_msg; 1751 1752 if (status = vdc_init_dring_negotiate(vdcp)) 1753 return (status); 1754 1755 /* release lock and wait for response */ 1756 mutex_exit(&vdcp->lock); 1757 status = vdc_wait_for_response(vdcp, &vio_msg); 1758 mutex_enter(&vdcp->lock); 1759 if (status) { 1760 DMSG(vdcp, 0, 1761 "[%d] Failed waiting for Dring negotiation response," 1762 " rv(%d)", vdcp->instance, status); 1763 return (status); 1764 } 1765 1766 /* check type and sub_type ... */ 1767 if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL || 1768 vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) { 1769 DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n", 1770 vdcp->instance); 1771 return (EPROTO); 1772 } 1773 1774 return (vdc_handle_dring_reg_msg(vdcp, 1775 (vio_dring_reg_msg_t *)&vio_msg)); 1776 } 1777 1778 1779 /* 1780 * Function: 1781 * vdc_send_rdx() 1782 * 1783 * Description: 1784 * 1785 * Arguments: 1786 * vdc - soft state pointer for this instance of the device driver. 1787 * 1788 * Return Code: 1789 * 0 - Success 1790 */ 1791 static int 1792 vdc_send_rdx(vdc_t *vdcp) 1793 { 1794 vio_msg_t msg; 1795 size_t msglen = sizeof (vio_msg_t); 1796 int status; 1797 1798 /* 1799 * Send an RDX message to vds to indicate we are ready 1800 * to send data 1801 */ 1802 msg.tag.vio_msgtype = VIO_TYPE_CTRL; 1803 msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 1804 msg.tag.vio_subtype_env = VIO_RDX; 1805 msg.tag.vio_sid = vdcp->session_id; 1806 status = vdc_send(vdcp, (caddr_t)&msg, &msglen); 1807 if (status != 0) { 1808 DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)", 1809 vdcp->instance, status); 1810 } 1811 1812 return (status); 1813 } 1814 1815 /* 1816 * Function: 1817 * vdc_handle_rdx() 1818 * 1819 * Description: 1820 * 1821 * Arguments: 1822 * vdc - soft state pointer for this instance of the device driver. 
1823 * msgp - received msg 1824 * 1825 * Return Code: 1826 * 0 - Success 1827 */ 1828 static int 1829 vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp) 1830 { 1831 _NOTE(ARGUNUSED(vdcp)) 1832 _NOTE(ARGUNUSED(msgp)) 1833 1834 ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL); 1835 ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK); 1836 ASSERT(msgp->tag.vio_subtype_env == VIO_RDX); 1837 1838 DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance); 1839 1840 return (0); 1841 } 1842 1843 /* 1844 * Function: 1845 * vdc_rdx_exchange() 1846 * 1847 * Description: 1848 * 1849 * Arguments: 1850 * vdc - soft state pointer for this instance of the device driver. 1851 * 1852 * Return Code: 1853 * 0 - Success 1854 */ 1855 static int 1856 vdc_rdx_exchange(vdc_t *vdcp) 1857 { 1858 int status; 1859 vio_msg_t vio_msg; 1860 1861 if (status = vdc_send_rdx(vdcp)) 1862 return (status); 1863 1864 /* release lock and wait for response */ 1865 mutex_exit(&vdcp->lock); 1866 status = vdc_wait_for_response(vdcp, &vio_msg); 1867 mutex_enter(&vdcp->lock); 1868 if (status) { 1869 DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)", 1870 vdcp->instance, status); 1871 return (status); 1872 } 1873 1874 /* check type and sub_type ... 
*/ 1875 if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL || 1876 vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) { 1877 DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance); 1878 return (EPROTO); 1879 } 1880 1881 return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg)); 1882 } 1883 1884 1885 /* -------------------------------------------------------------------------- */ 1886 1887 /* 1888 * LDC helper routines 1889 */ 1890 1891 static int 1892 vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp) 1893 { 1894 int status; 1895 boolean_t q_has_pkts = B_FALSE; 1896 uint64_t delay_time; 1897 size_t len; 1898 1899 mutex_enter(&vdc->read_lock); 1900 1901 if (vdc->read_state == VDC_READ_IDLE) 1902 vdc->read_state = VDC_READ_WAITING; 1903 1904 while (vdc->read_state != VDC_READ_PENDING) { 1905 1906 /* detect if the connection has been reset */ 1907 if (vdc->read_state == VDC_READ_RESET) { 1908 status = ECONNRESET; 1909 goto done; 1910 } 1911 1912 cv_wait(&vdc->read_cv, &vdc->read_lock); 1913 } 1914 1915 /* 1916 * Until we get a blocking ldc read we have to retry 1917 * until the entire LDC message has arrived before 1918 * ldc_read() will succeed. Note we also bail out if 1919 * the channel is reset or goes away. 1920 */ 1921 delay_time = vdc_ldc_read_init_delay; 1922 loop: 1923 len = *nbytesp; 1924 status = ldc_read(vdc->ldc_handle, (caddr_t)msgp, &len); 1925 switch (status) { 1926 case EAGAIN: 1927 delay_time *= 2; 1928 if (delay_time >= vdc_ldc_read_max_delay) 1929 delay_time = vdc_ldc_read_max_delay; 1930 delay(delay_time); 1931 goto loop; 1932 1933 case 0: 1934 if (len == 0) { 1935 DMSG(vdc, 1, "[%d] ldc_read returned 0 bytes with " 1936 "no error!\n", vdc->instance); 1937 goto loop; 1938 } 1939 1940 *nbytesp = len; 1941 1942 /* 1943 * If there are pending messages, leave the 1944 * read state as pending. Otherwise, set the state 1945 * back to idle. 
1946 */ 1947 status = ldc_chkq(vdc->ldc_handle, &q_has_pkts); 1948 if (status == 0 && !q_has_pkts) 1949 vdc->read_state = VDC_READ_IDLE; 1950 1951 break; 1952 default: 1953 DMSG(vdc, 0, "ldc_read returned %d\n", status); 1954 break; 1955 } 1956 1957 done: 1958 mutex_exit(&vdc->read_lock); 1959 1960 return (status); 1961 } 1962 1963 1964 1965 #ifdef DEBUG 1966 void 1967 vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg) 1968 { 1969 char *ms, *ss, *ses; 1970 switch (msg->tag.vio_msgtype) { 1971 #define Q(_s) case _s : ms = #_s; break; 1972 Q(VIO_TYPE_CTRL) 1973 Q(VIO_TYPE_DATA) 1974 Q(VIO_TYPE_ERR) 1975 #undef Q 1976 default: ms = "unknown"; break; 1977 } 1978 1979 switch (msg->tag.vio_subtype) { 1980 #define Q(_s) case _s : ss = #_s; break; 1981 Q(VIO_SUBTYPE_INFO) 1982 Q(VIO_SUBTYPE_ACK) 1983 Q(VIO_SUBTYPE_NACK) 1984 #undef Q 1985 default: ss = "unknown"; break; 1986 } 1987 1988 switch (msg->tag.vio_subtype_env) { 1989 #define Q(_s) case _s : ses = #_s; break; 1990 Q(VIO_VER_INFO) 1991 Q(VIO_ATTR_INFO) 1992 Q(VIO_DRING_REG) 1993 Q(VIO_DRING_UNREG) 1994 Q(VIO_RDX) 1995 Q(VIO_PKT_DATA) 1996 Q(VIO_DESC_DATA) 1997 Q(VIO_DRING_DATA) 1998 #undef Q 1999 default: ses = "unknown"; break; 2000 } 2001 2002 DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n", 2003 msg->tag.vio_msgtype, msg->tag.vio_subtype, 2004 msg->tag.vio_subtype_env, ms, ss, ses); 2005 } 2006 #endif 2007 2008 /* 2009 * Function: 2010 * vdc_send() 2011 * 2012 * Description: 2013 * The function encapsulates the call to write a message using LDC. 2014 * If LDC indicates that the call failed due to the queue being full, 2015 * we retry the ldc_write(), otherwise we return the error returned by LDC. 2016 * 2017 * Arguments: 2018 * ldc_handle - LDC handle for the channel this instance of vdc uses 2019 * pkt - address of LDC message to be sent 2020 * msglen - the size of the message being sent. When the function 2021 * returns, this contains the number of bytes written. 2022 * 2023 * Return Code: 2024 * 0 - Success. 
2025 * EINVAL - pkt or msglen were NULL 2026 * ECONNRESET - The connection was not up. 2027 * EWOULDBLOCK - LDC queue is full 2028 * xxx - other error codes returned by ldc_write 2029 */ 2030 static int 2031 vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen) 2032 { 2033 size_t size = 0; 2034 int status = 0; 2035 clock_t delay_ticks; 2036 2037 ASSERT(vdc != NULL); 2038 ASSERT(mutex_owned(&vdc->lock)); 2039 ASSERT(msglen != NULL); 2040 ASSERT(*msglen != 0); 2041 2042 #ifdef DEBUG 2043 vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt); 2044 #endif 2045 /* 2046 * Wait indefinitely to send if channel 2047 * is busy, but bail out if we succeed or 2048 * if the channel closes or is reset. 2049 */ 2050 delay_ticks = vdc_hz_min_ldc_delay; 2051 do { 2052 size = *msglen; 2053 status = ldc_write(vdc->ldc_handle, pkt, &size); 2054 if (status == EWOULDBLOCK) { 2055 delay(delay_ticks); 2056 /* geometric backoff */ 2057 delay_ticks *= 2; 2058 if (delay_ticks > vdc_hz_max_ldc_delay) 2059 delay_ticks = vdc_hz_max_ldc_delay; 2060 } 2061 } while (status == EWOULDBLOCK); 2062 2063 /* if LDC had serious issues --- reset vdc state */ 2064 if (status == EIO || status == ECONNRESET) { 2065 /* LDC had serious issues --- reset vdc state */ 2066 mutex_enter(&vdc->read_lock); 2067 if ((vdc->read_state == VDC_READ_WAITING) || 2068 (vdc->read_state == VDC_READ_RESET)) 2069 cv_signal(&vdc->read_cv); 2070 vdc->read_state = VDC_READ_RESET; 2071 mutex_exit(&vdc->read_lock); 2072 2073 /* wake up any waiters in the reset thread */ 2074 if (vdc->state == VDC_STATE_INIT_WAITING) { 2075 DMSG(vdc, 0, "[%d] write reset - " 2076 "vdc is resetting ..\n", vdc->instance); 2077 vdc->state = VDC_STATE_RESETTING; 2078 cv_signal(&vdc->initwait_cv); 2079 } 2080 2081 return (ECONNRESET); 2082 } 2083 2084 /* return the last size written */ 2085 *msglen = size; 2086 2087 return (status); 2088 } 2089 2090 /* 2091 * Function: 2092 * vdc_get_md_node 2093 * 2094 * Description: 2095 * Get the MD, the device node and the port 
node for the given
 *	disk instance. The caller is responsible for cleaning up the
 *	reference to the returned MD (mdpp) by calling md_fini_handle().
 *	When the function fails no MD is returned and the reference taken
 *	internally is released before returning.
 *
 * Arguments:
 *	dip	- dev info pointer for this instance of the device driver.
 *	mdpp	- the returned MD.
 *	vd_nodep - the returned device node.
 *	vd_portp - the returned port node. The returned port node is NULL
 *		   if no port node is found.
 *
 * Return Code:
 *	0	- Success.
 *	ENOENT	- Expected node or property did not exist.
 *	ENXIO	- Unexpected error communicating with MD framework
 */
static int
vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep,
    mde_cookie_t *vd_portp)
{
	int		status = ENOENT;
	char		*node_name = NULL;
	md_t		*mdp = NULL;
	int		num_nodes;
	int		num_vdevs;
	int		num_vports;
	mde_cookie_t	rootnode;
	mde_cookie_t	*listp = NULL;
	boolean_t	found_inst = B_FALSE;
	int		listsz;
	int		idx;
	uint64_t	md_inst = 0;	/* printed even if the lookup fails */
	int		obp_inst;
	int		instance = ddi_get_instance(dip);

	/*
	 * Get the OBP instance number for comparison with the MD instance
	 *
	 * The "cfg-handle" property of a vdc node in an MD contains the MD's
	 * notion of "instance", or unique identifier, for that node; OBP
	 * stores the value of the "cfg-handle" MD property as the value of
	 * the "reg" property on the node in the device tree it builds from
	 * the MD and passes to Solaris.  Thus, we look up the devinfo node's
	 * "reg" property value to uniquely identify this device instance.
	 * If the "reg" property cannot be found, the device tree state is
	 * presumably so broken that there is no point in continuing.
	 */
	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) {
		cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG);
		return (ENOENT);
	}
	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    OBP_REG, -1);
	DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst);

	/*
	 * We now walk the MD nodes to find the node for this vdisk.
	 */
	if ((mdp = md_get_handle()) == NULL) {
		cmn_err(CE_WARN, "unable to init machine description");
		return (ENXIO);
	}

	num_nodes = md_node_count(mdp);
	ASSERT(num_nodes > 0);

	listsz = num_nodes * sizeof (mde_cookie_t);

	/* allocate memory for nodes */
	listp = kmem_zalloc(listsz, KM_SLEEP);

	rootnode = md_root_node(mdp);
	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);

	/*
	 * Search for all the virtual devices, we will then check to see which
	 * ones are disk nodes.
	 */
	num_vdevs = md_scan_dag(mdp, rootnode,
	    md_find_name(mdp, VDC_MD_VDEV_NAME),
	    md_find_name(mdp, "fwd"), listp);

	if (num_vdevs <= 0) {
		cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME);
		status = ENOENT;
		goto done;
	}

	DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs);
	for (idx = 0; idx < num_vdevs; idx++) {
		status = md_get_prop_str(mdp, listp[idx], "name", &node_name);
		if ((status != 0) || (node_name == NULL)) {
			cmn_err(CE_NOTE, "Unable to get name of node type '%s'"
			    ": err %d", VDC_MD_VDEV_NAME, status);
			continue;
		}

		DMSGX(1, "[%d] Found node '%s'\n", instance, node_name);
		if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) {
			status = md_get_prop_val(mdp, listp[idx],
			    VDC_MD_CFG_HDL, &md_inst);
			DMSGX(1, "[%d] vdc inst in MD=%lx\n",
			    instance, md_inst);
			if ((status == 0) && (md_inst == obp_inst)) {
				found_inst = B_TRUE;
				break;
			}
		}
	}

	if (!found_inst) {
		DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME);
		status = ENOENT;
		goto done;
	}
	DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst);

	*vd_nodep = listp[idx];
	*mdpp = mdp;

	/* listp is reused to collect the port nodes below the disk node */
	num_vports = md_scan_dag(mdp, *vd_nodep,
	    md_find_name(mdp, VDC_MD_PORT_NAME),
	    md_find_name(mdp, "fwd"), listp);

	if (num_vports != 1) {
		DMSGX(0, "Expected 1 '%s' node for '%s' port, found %d\n",
		    VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME, num_vports);
	}

	/*
	 * Only trust listp[0] when the scan actually found a port: if the
	 * scan returned an error (negative count) listp still holds the
	 * device nodes of the previous scan.
	 */
	*vd_portp = (num_vports <= 0)? NULL: listp[0];

	status = 0;

done:
	kmem_free(listp, listsz);

	/*
	 * On failure the MD handle is not handed to the caller, so it has to
	 * be released here or the reference would be leaked.
	 */
	if (status != 0)
		(void) md_fini_handle(mdp);

	return (status);
}

/*
 * Function:
 *	vdc_get_ldc_id()
 *
 * Description:
 *	This function gets the 'ldc-id' for this particular instance of vdc.
 *	The id returned is the guest domain channel endpoint LDC uses for
 *	communication with vds.
 *
 * Arguments:
 *	mdp	- pointer to the machine description.
 *	vd_node	- the vdisk element from the MD.
 *	ldc_id	- pointer to variable used to return the 'ldc-id' found.
 *
 * Return Code:
 *	0	- Success.
 *	ENOENT	- Expected node or property did not exist.
2248 */ 2249 static int 2250 vdc_get_ldc_id(md_t *mdp, mde_cookie_t vd_node, uint64_t *ldc_id) 2251 { 2252 mde_cookie_t *chanp = NULL; 2253 int listsz; 2254 int num_chans; 2255 int num_nodes; 2256 int status = 0; 2257 2258 num_nodes = md_node_count(mdp); 2259 ASSERT(num_nodes > 0); 2260 2261 listsz = num_nodes * sizeof (mde_cookie_t); 2262 2263 /* allocate memory for nodes */ 2264 chanp = kmem_zalloc(listsz, KM_SLEEP); 2265 2266 /* get the channels for this node */ 2267 num_chans = md_scan_dag(mdp, vd_node, 2268 md_find_name(mdp, VDC_MD_CHAN_NAME), 2269 md_find_name(mdp, "fwd"), chanp); 2270 2271 /* expecting at least one channel */ 2272 if (num_chans <= 0) { 2273 cmn_err(CE_NOTE, "No '%s' node for '%s' port", 2274 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); 2275 status = ENOENT; 2276 goto done; 2277 2278 } else if (num_chans != 1) { 2279 DMSGX(0, "Expected 1 '%s' node for '%s' port, found %d\n", 2280 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, num_chans); 2281 } 2282 2283 /* 2284 * We use the first channel found (index 0), irrespective of how 2285 * many are there in total. 2286 */ 2287 if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, ldc_id) != 0) { 2288 cmn_err(CE_NOTE, "Channel '%s' property not found", VDC_MD_ID); 2289 status = ENOENT; 2290 } 2291 2292 done: 2293 kmem_free(chanp, listsz); 2294 return (status); 2295 } 2296 2297 static int 2298 vdc_do_ldc_up(vdc_t *vdc) 2299 { 2300 int status; 2301 ldc_status_t ldc_state; 2302 2303 DMSG(vdc, 0, "[%d] Bringing up channel %lx\n", 2304 vdc->instance, vdc->ldc_id); 2305 2306 if (vdc->lifecycle == VDC_LC_DETACHING) 2307 return (EINVAL); 2308 2309 if ((status = ldc_up(vdc->ldc_handle)) != 0) { 2310 switch (status) { 2311 case ECONNREFUSED: /* listener not ready at other end */ 2312 DMSG(vdc, 0, "[%d] ldc_up(%lx,...) 
return %d\n", 2313 vdc->instance, vdc->ldc_id, status); 2314 status = 0; 2315 break; 2316 default: 2317 DMSG(vdc, 0, "[%d] Failed to bring up LDC: " 2318 "channel=%ld, err=%d", vdc->instance, vdc->ldc_id, 2319 status); 2320 break; 2321 } 2322 } 2323 2324 if (ldc_status(vdc->ldc_handle, &ldc_state) == 0) { 2325 vdc->ldc_state = ldc_state; 2326 if (ldc_state == LDC_UP) { 2327 DMSG(vdc, 0, "[%d] LDC channel already up\n", 2328 vdc->instance); 2329 vdc->seq_num = 1; 2330 vdc->seq_num_reply = 0; 2331 } 2332 } 2333 2334 return (status); 2335 } 2336 2337 /* 2338 * Function: 2339 * vdc_terminate_ldc() 2340 * 2341 * Description: 2342 * 2343 * Arguments: 2344 * vdc - soft state pointer for this instance of the device driver. 2345 * 2346 * Return Code: 2347 * None 2348 */ 2349 static void 2350 vdc_terminate_ldc(vdc_t *vdc) 2351 { 2352 int instance = ddi_get_instance(vdc->dip); 2353 2354 ASSERT(vdc != NULL); 2355 ASSERT(mutex_owned(&vdc->lock)); 2356 2357 DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized); 2358 2359 if (vdc->initialized & VDC_LDC_OPEN) { 2360 DMSG(vdc, 0, "[%d] ldc_close()\n", instance); 2361 (void) ldc_close(vdc->ldc_handle); 2362 } 2363 if (vdc->initialized & VDC_LDC_CB) { 2364 DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); 2365 (void) ldc_unreg_callback(vdc->ldc_handle); 2366 } 2367 if (vdc->initialized & VDC_LDC) { 2368 DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); 2369 (void) ldc_fini(vdc->ldc_handle); 2370 vdc->ldc_handle = NULL; 2371 } 2372 2373 vdc->initialized &= ~(VDC_LDC | VDC_LDC_CB | VDC_LDC_OPEN); 2374 } 2375 2376 /* -------------------------------------------------------------------------- */ 2377 2378 /* 2379 * Descriptor Ring helper routines 2380 */ 2381 2382 /* 2383 * Function: 2384 * vdc_init_descriptor_ring() 2385 * 2386 * Description: 2387 * 2388 * Arguments: 2389 * vdc - soft state pointer for this instance of the device driver. 
2390 * 2391 * Return Code: 2392 * 0 - Success 2393 */ 2394 static int 2395 vdc_init_descriptor_ring(vdc_t *vdc) 2396 { 2397 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2398 int status = 0; 2399 int i; 2400 2401 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2402 2403 ASSERT(vdc != NULL); 2404 ASSERT(mutex_owned(&vdc->lock)); 2405 ASSERT(vdc->ldc_handle != NULL); 2406 2407 /* ensure we have enough room to store max sized block */ 2408 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2409 2410 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2411 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2412 /* 2413 * Calculate the maximum block size we can transmit using one 2414 * Descriptor Ring entry from the attributes returned by the 2415 * vDisk server. This is subject to a minimum of 'maxphys' 2416 * as we do not have the capability to split requests over 2417 * multiple DRing entries. 2418 */ 2419 if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) { 2420 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2421 vdc->instance); 2422 vdc->dring_max_cookies = maxphys / PAGESIZE; 2423 } else { 2424 vdc->dring_max_cookies = 2425 (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE; 2426 } 2427 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2428 (sizeof (ldc_mem_cookie_t) * 2429 (vdc->dring_max_cookies - 1))); 2430 vdc->dring_len = VD_DRING_LEN; 2431 2432 status = ldc_mem_dring_create(vdc->dring_len, 2433 vdc->dring_entry_size, &vdc->ldc_dring_hdl); 2434 if ((vdc->ldc_dring_hdl == NULL) || (status != 0)) { 2435 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2436 vdc->instance); 2437 return (status); 2438 } 2439 vdc->initialized |= VDC_DRING_INIT; 2440 } 2441 2442 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2443 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2444 vdc->dring_cookie = 2445 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2446 2447 status = ldc_mem_dring_bind(vdc->ldc_handle, vdc->ldc_dring_hdl, 2448 
LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2449 &vdc->dring_cookie[0], 2450 &vdc->dring_cookie_count); 2451 if (status != 0) { 2452 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2453 "(%lx) to channel (%lx) status=%d\n", 2454 vdc->instance, vdc->ldc_dring_hdl, 2455 vdc->ldc_handle, status); 2456 return (status); 2457 } 2458 ASSERT(vdc->dring_cookie_count == 1); 2459 vdc->initialized |= VDC_DRING_BOUND; 2460 } 2461 2462 status = ldc_mem_dring_info(vdc->ldc_dring_hdl, &vdc->dring_mem_info); 2463 if (status != 0) { 2464 DMSG(vdc, 0, 2465 "[%d] Failed to get info for descriptor ring (%lx)\n", 2466 vdc->instance, vdc->ldc_dring_hdl); 2467 return (status); 2468 } 2469 2470 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2471 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2472 2473 /* Allocate the local copy of this dring */ 2474 vdc->local_dring = 2475 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 2476 KM_SLEEP); 2477 vdc->initialized |= VDC_DRING_LOCAL; 2478 } 2479 2480 /* 2481 * Mark all DRing entries as free and initialize the private 2482 * descriptor's memory handles. If any entry is initialized, 2483 * we need to free it later so we set the bit in 'initialized' 2484 * at the start. 
2485 */ 2486 vdc->initialized |= VDC_DRING_ENTRY; 2487 for (i = 0; i < vdc->dring_len; i++) { 2488 dep = VDC_GET_DRING_ENTRY_PTR(vdc, i); 2489 dep->hdr.dstate = VIO_DESC_FREE; 2490 2491 status = ldc_mem_alloc_handle(vdc->ldc_handle, 2492 &vdc->local_dring[i].desc_mhdl); 2493 if (status != 0) { 2494 DMSG(vdc, 0, "![%d] Failed to alloc mem handle for" 2495 " descriptor %d", vdc->instance, i); 2496 return (status); 2497 } 2498 vdc->local_dring[i].is_free = B_TRUE; 2499 vdc->local_dring[i].dep = dep; 2500 } 2501 2502 /* Initialize the starting index */ 2503 vdc->dring_curr_idx = 0; 2504 2505 return (status); 2506 } 2507 2508 /* 2509 * Function: 2510 * vdc_destroy_descriptor_ring() 2511 * 2512 * Description: 2513 * 2514 * Arguments: 2515 * vdc - soft state pointer for this instance of the device driver. 2516 * 2517 * Return Code: 2518 * None 2519 */ 2520 static void 2521 vdc_destroy_descriptor_ring(vdc_t *vdc) 2522 { 2523 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 2524 ldc_mem_handle_t mhdl = NULL; 2525 ldc_mem_info_t minfo; 2526 int status = -1; 2527 int i; /* loop */ 2528 2529 ASSERT(vdc != NULL); 2530 ASSERT(mutex_owned(&vdc->lock)); 2531 2532 DMSG(vdc, 0, "[%d] Entered\n", vdc->instance); 2533 2534 if (vdc->initialized & VDC_DRING_ENTRY) { 2535 DMSG(vdc, 0, 2536 "[%d] Removing Local DRing entries\n", vdc->instance); 2537 for (i = 0; i < vdc->dring_len; i++) { 2538 ldep = &vdc->local_dring[i]; 2539 mhdl = ldep->desc_mhdl; 2540 2541 if (mhdl == NULL) 2542 continue; 2543 2544 if ((status = ldc_mem_info(mhdl, &minfo)) != 0) { 2545 DMSG(vdc, 0, 2546 "ldc_mem_info returned an error: %d\n", 2547 status); 2548 2549 /* 2550 * This must mean that the mem handle 2551 * is not valid. Clear it out so that 2552 * no one tries to use it. 
2553 */ 2554 ldep->desc_mhdl = NULL; 2555 continue; 2556 } 2557 2558 if (minfo.status == LDC_BOUND) { 2559 (void) ldc_mem_unbind_handle(mhdl); 2560 } 2561 2562 (void) ldc_mem_free_handle(mhdl); 2563 2564 ldep->desc_mhdl = NULL; 2565 } 2566 vdc->initialized &= ~VDC_DRING_ENTRY; 2567 } 2568 2569 if (vdc->initialized & VDC_DRING_LOCAL) { 2570 DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance); 2571 kmem_free(vdc->local_dring, 2572 vdc->dring_len * sizeof (vdc_local_desc_t)); 2573 vdc->initialized &= ~VDC_DRING_LOCAL; 2574 } 2575 2576 if (vdc->initialized & VDC_DRING_BOUND) { 2577 DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance); 2578 status = ldc_mem_dring_unbind(vdc->ldc_dring_hdl); 2579 if (status == 0) { 2580 vdc->initialized &= ~VDC_DRING_BOUND; 2581 } else { 2582 DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx", 2583 vdc->instance, status, vdc->ldc_dring_hdl); 2584 } 2585 kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t)); 2586 } 2587 2588 if (vdc->initialized & VDC_DRING_INIT) { 2589 DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance); 2590 status = ldc_mem_dring_destroy(vdc->ldc_dring_hdl); 2591 if (status == 0) { 2592 vdc->ldc_dring_hdl = NULL; 2593 bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t)); 2594 vdc->initialized &= ~VDC_DRING_INIT; 2595 } else { 2596 DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)", 2597 vdc->instance, status, vdc->ldc_dring_hdl); 2598 } 2599 } 2600 } 2601 2602 /* 2603 * Function: 2604 * vdc_map_to_shared_ring() 2605 * 2606 * Description: 2607 * Copy contents of the local descriptor to the shared 2608 * memory descriptor. 2609 * 2610 * Arguments: 2611 * vdcp - soft state pointer for this instance of the device driver. 
2612 * idx - descriptor ring index 2613 * 2614 * Return Code: 2615 * None 2616 */ 2617 static int 2618 vdc_map_to_shared_dring(vdc_t *vdcp, int idx) 2619 { 2620 vdc_local_desc_t *ldep; 2621 vd_dring_entry_t *dep; 2622 int rv; 2623 2624 ldep = &(vdcp->local_dring[idx]); 2625 2626 /* for now leave in the old pop_mem_hdl stuff */ 2627 if (ldep->nbytes > 0) { 2628 rv = vdc_populate_mem_hdl(vdcp, ldep); 2629 if (rv) { 2630 DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n", 2631 vdcp->instance); 2632 return (rv); 2633 } 2634 } 2635 2636 /* 2637 * fill in the data details into the DRing 2638 */ 2639 dep = ldep->dep; 2640 ASSERT(dep != NULL); 2641 2642 dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp); 2643 dep->payload.operation = ldep->operation; 2644 dep->payload.addr = ldep->offset; 2645 dep->payload.nbytes = ldep->nbytes; 2646 dep->payload.status = (uint32_t)-1; /* vds will set valid value */ 2647 dep->payload.slice = ldep->slice; 2648 dep->hdr.dstate = VIO_DESC_READY; 2649 dep->hdr.ack = 1; /* request an ACK for every message */ 2650 2651 return (0); 2652 } 2653 2654 /* 2655 * Function: 2656 * vdc_send_request 2657 * 2658 * Description: 2659 * This routine writes the data to be transmitted to vds into the 2660 * descriptor, notifies vds that the ring has been updated and 2661 * then waits for the request to be processed. 2662 * 2663 * Arguments: 2664 * vdcp - the soft state pointer 2665 * operation - operation we want vds to perform (VD_OP_XXX) 2666 * addr - address of data buf to be read/written. 2667 * nbytes - number of bytes to read/write 2668 * slice - the disk slice this request is for 2669 * offset - relative disk offset 2670 * cb_type - type of call - STRATEGY or SYNC 2671 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2672 * . mode for ioctl(9e) 2673 * . 
LP64 diskaddr_t (block I/O) 2674 * dir - direction of operation (READ/WRITE/BOTH) 2675 * 2676 * Return Codes: 2677 * 0 2678 * ENXIO 2679 */ 2680 static int 2681 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr, 2682 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2683 void *cb_arg, vio_desc_direction_t dir) 2684 { 2685 ASSERT(vdcp != NULL); 2686 ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR); 2687 2688 mutex_enter(&vdcp->lock); 2689 2690 do { 2691 while (vdcp->state != VDC_STATE_RUNNING) { 2692 2693 /* return error if detaching */ 2694 if (vdcp->state == VDC_STATE_DETACH) { 2695 mutex_exit(&vdcp->lock); 2696 return (ENXIO); 2697 } 2698 2699 /* fail request if connection timeout is reached */ 2700 if (vdcp->ctimeout_reached) { 2701 mutex_exit(&vdcp->lock); 2702 return (EIO); 2703 } 2704 2705 /* 2706 * If we are panicking and the disk is not ready then 2707 * we can't send any request because we can't complete 2708 * the handshake now. 2709 */ 2710 if (ddi_in_panic()) { 2711 mutex_exit(&vdcp->lock); 2712 return (EIO); 2713 } 2714 2715 cv_wait(&vdcp->running_cv, &vdcp->lock); 2716 } 2717 2718 } while (vdc_populate_descriptor(vdcp, operation, addr, 2719 nbytes, slice, offset, cb_type, cb_arg, dir)); 2720 2721 mutex_exit(&vdcp->lock); 2722 return (0); 2723 } 2724 2725 2726 /* 2727 * Function: 2728 * vdc_populate_descriptor 2729 * 2730 * Description: 2731 * This routine writes the data to be transmitted to vds into the 2732 * descriptor, notifies vds that the ring has been updated and 2733 * then waits for the request to be processed. 2734 * 2735 * Arguments: 2736 * vdcp - the soft state pointer 2737 * operation - operation we want vds to perform (VD_OP_XXX) 2738 * addr - address of data buf to be read/written. 
2739 * nbytes - number of bytes to read/write 2740 * slice - the disk slice this request is for 2741 * offset - relative disk offset 2742 * cb_type - type of call - STRATEGY or SYNC 2743 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2744 * . mode for ioctl(9e) 2745 * . LP64 diskaddr_t (block I/O) 2746 * dir - direction of operation (READ/WRITE/BOTH) 2747 * 2748 * Return Codes: 2749 * 0 2750 * EAGAIN 2751 * ECONNRESET 2752 * ENXIO 2753 */ 2754 static int 2755 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 2756 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2757 void *cb_arg, vio_desc_direction_t dir) 2758 { 2759 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 2760 int idx; /* Index of DRing entry used */ 2761 int next_idx; 2762 vio_dring_msg_t dmsg; 2763 size_t msglen; 2764 int rv; 2765 2766 ASSERT(MUTEX_HELD(&vdcp->lock)); 2767 vdcp->threads_pending++; 2768 loop: 2769 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 2770 2771 /* Get next available D-Ring entry */ 2772 idx = vdcp->dring_curr_idx; 2773 local_dep = &(vdcp->local_dring[idx]); 2774 2775 if (!local_dep->is_free) { 2776 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 2777 vdcp->instance); 2778 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 2779 if (vdcp->state == VDC_STATE_RUNNING || 2780 vdcp->state == VDC_STATE_HANDLE_PENDING) { 2781 goto loop; 2782 } 2783 vdcp->threads_pending--; 2784 return (ECONNRESET); 2785 } 2786 2787 next_idx = idx + 1; 2788 if (next_idx >= vdcp->dring_len) 2789 next_idx = 0; 2790 vdcp->dring_curr_idx = next_idx; 2791 2792 ASSERT(local_dep->is_free); 2793 2794 local_dep->operation = operation; 2795 local_dep->addr = addr; 2796 local_dep->nbytes = nbytes; 2797 local_dep->slice = slice; 2798 local_dep->offset = offset; 2799 local_dep->cb_type = cb_type; 2800 local_dep->cb_arg = cb_arg; 2801 local_dep->dir = dir; 2802 2803 local_dep->is_free = B_FALSE; 2804 2805 rv = vdc_map_to_shared_dring(vdcp, 
idx); 2806 if (rv) { 2807 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 2808 vdcp->instance); 2809 /* free the descriptor */ 2810 local_dep->is_free = B_TRUE; 2811 vdcp->dring_curr_idx = idx; 2812 cv_wait(&vdcp->membind_cv, &vdcp->lock); 2813 if (vdcp->state == VDC_STATE_RUNNING || 2814 vdcp->state == VDC_STATE_HANDLE_PENDING) { 2815 goto loop; 2816 } 2817 vdcp->threads_pending--; 2818 return (ECONNRESET); 2819 } 2820 2821 /* 2822 * Send a msg with the DRing details to vds 2823 */ 2824 VIO_INIT_DRING_DATA_TAG(dmsg); 2825 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 2826 dmsg.dring_ident = vdcp->dring_ident; 2827 dmsg.start_idx = idx; 2828 dmsg.end_idx = idx; 2829 vdcp->seq_num++; 2830 2831 DTRACE_IO2(send, vio_dring_msg_t *, &dmsg, vdc_t *, vdcp); 2832 2833 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 2834 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 2835 2836 /* 2837 * note we're still holding the lock here to 2838 * make sure the message goes out in order !!!... 2839 */ 2840 msglen = sizeof (dmsg); 2841 rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen); 2842 switch (rv) { 2843 case ECONNRESET: 2844 /* 2845 * vdc_send initiates the reset on failure. 2846 * Since the transaction has already been put 2847 * on the local dring, it will automatically get 2848 * retried when the channel is reset. Given that, 2849 * it is ok to just return success even though the 2850 * send failed. 2851 */ 2852 rv = 0; 2853 break; 2854 2855 case 0: /* EOK */ 2856 DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv); 2857 break; 2858 2859 default: 2860 goto cleanup_and_exit; 2861 } 2862 2863 vdcp->threads_pending--; 2864 return (rv); 2865 2866 cleanup_and_exit: 2867 DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv); 2868 return (ENXIO); 2869 } 2870 2871 /* 2872 * Function: 2873 * vdc_do_sync_op 2874 * 2875 * Description: 2876 * Wrapper around vdc_populate_descriptor that blocks until the 2877 * response to the message is available. 
2878 * 2879 * Arguments: 2880 * vdcp - the soft state pointer 2881 * operation - operation we want vds to perform (VD_OP_XXX) 2882 * addr - address of data buf to be read/written. 2883 * nbytes - number of bytes to read/write 2884 * slice - the disk slice this request is for 2885 * offset - relative disk offset 2886 * cb_type - type of call - STRATEGY or SYNC 2887 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2888 * . mode for ioctl(9e) 2889 * . LP64 diskaddr_t (block I/O) 2890 * dir - direction of operation (READ/WRITE/BOTH) 2891 * rconflict - check for reservation conflict in case of failure 2892 * 2893 * rconflict should be set to B_TRUE by most callers. Callers invoking the 2894 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 2895 * result of a successful operation with vd_scsi_status(). 2896 * 2897 * Return Codes: 2898 * 0 2899 * EAGAIN 2900 * EFAULT 2901 * ENXIO 2902 * EIO 2903 */ 2904 static int 2905 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 2906 int slice, diskaddr_t offset, int cb_type, void *cb_arg, 2907 vio_desc_direction_t dir, boolean_t rconflict) 2908 { 2909 int status; 2910 vdc_io_t *vio; 2911 boolean_t check_resv_conflict = B_FALSE; 2912 2913 ASSERT(cb_type == CB_SYNC); 2914 2915 /* 2916 * Grab the lock, if blocked wait until the server 2917 * response causes us to wake up again. 
2918 */ 2919 mutex_enter(&vdcp->lock); 2920 vdcp->sync_op_cnt++; 2921 while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH) 2922 cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock); 2923 2924 if (vdcp->state == VDC_STATE_DETACH) { 2925 cv_broadcast(&vdcp->sync_blocked_cv); 2926 vdcp->sync_op_cnt--; 2927 mutex_exit(&vdcp->lock); 2928 return (ENXIO); 2929 } 2930 2931 /* now block anyone other thread entering after us */ 2932 vdcp->sync_op_blocked = B_TRUE; 2933 vdcp->sync_op_pending = B_TRUE; 2934 mutex_exit(&vdcp->lock); 2935 2936 status = vdc_send_request(vdcp, operation, addr, 2937 nbytes, slice, offset, cb_type, cb_arg, dir); 2938 2939 mutex_enter(&vdcp->lock); 2940 2941 if (status != 0) { 2942 vdcp->sync_op_pending = B_FALSE; 2943 } else { 2944 /* 2945 * block until our transaction completes. 2946 * Also anyone else waiting also gets to go next. 2947 */ 2948 while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH) 2949 cv_wait(&vdcp->sync_pending_cv, &vdcp->lock); 2950 2951 DMSG(vdcp, 2, ": operation returned %d\n", 2952 vdcp->sync_op_status); 2953 if (vdcp->state == VDC_STATE_DETACH) { 2954 vdcp->sync_op_pending = B_FALSE; 2955 status = ENXIO; 2956 } else { 2957 status = vdcp->sync_op_status; 2958 if (status != 0 && vdcp->failfast_interval != 0) { 2959 /* 2960 * Operation has failed and failfast is enabled. 2961 * We need to check if the failure is due to a 2962 * reservation conflict if this was requested. 2963 */ 2964 check_resv_conflict = rconflict; 2965 } 2966 2967 } 2968 } 2969 2970 vdcp->sync_op_status = 0; 2971 vdcp->sync_op_blocked = B_FALSE; 2972 vdcp->sync_op_cnt--; 2973 2974 /* signal the next waiting thread */ 2975 cv_signal(&vdcp->sync_blocked_cv); 2976 2977 /* 2978 * We have to check for reservation conflict after unblocking sync 2979 * operations because some sync operations will be used to do this 2980 * check. 
2981 */ 2982 if (check_resv_conflict) { 2983 vio = vdc_failfast_io_queue(vdcp, NULL); 2984 while (vio->vio_qtime != 0) 2985 cv_wait(&vdcp->failfast_io_cv, &vdcp->lock); 2986 kmem_free(vio, sizeof (vdc_io_t)); 2987 } 2988 2989 mutex_exit(&vdcp->lock); 2990 2991 return (status); 2992 } 2993 2994 2995 /* 2996 * Function: 2997 * vdc_drain_response() 2998 * 2999 * Description: 3000 * When a guest is panicking, the completion of requests needs to be 3001 * handled differently because interrupts are disabled and vdc 3002 * will not get messages. We have to poll for the messages instead. 3003 * 3004 * Arguments: 3005 * vdc - soft state pointer for this instance of the device driver. 3006 * 3007 * Return Code: 3008 * 0 - Success 3009 */ 3010 static int 3011 vdc_drain_response(vdc_t *vdc) 3012 { 3013 int rv, idx, retries; 3014 size_t msglen; 3015 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3016 vio_dring_msg_t dmsg; 3017 3018 mutex_enter(&vdc->lock); 3019 3020 retries = 0; 3021 for (;;) { 3022 msglen = sizeof (dmsg); 3023 rv = ldc_read(vdc->ldc_handle, (caddr_t)&dmsg, &msglen); 3024 if (rv) { 3025 rv = EINVAL; 3026 break; 3027 } 3028 3029 /* 3030 * if there are no packets wait and check again 3031 */ 3032 if ((rv == 0) && (msglen == 0)) { 3033 if (retries++ > vdc_dump_retries) { 3034 rv = EAGAIN; 3035 break; 3036 } 3037 3038 drv_usecwait(vdc_usec_timeout_dump); 3039 continue; 3040 } 3041 3042 /* 3043 * Ignore all messages that are not ACKs/NACKs to 3044 * DRing requests. 3045 */ 3046 if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) || 3047 (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) { 3048 DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n", 3049 dmsg.tag.vio_msgtype, 3050 dmsg.tag.vio_subtype, 3051 dmsg.tag.vio_subtype_env); 3052 continue; 3053 } 3054 3055 /* 3056 * set the appropriate return value for the current request. 
3057 */ 3058 switch (dmsg.tag.vio_subtype) { 3059 case VIO_SUBTYPE_ACK: 3060 rv = 0; 3061 break; 3062 case VIO_SUBTYPE_NACK: 3063 rv = EAGAIN; 3064 break; 3065 default: 3066 continue; 3067 } 3068 3069 idx = dmsg.start_idx; 3070 if (idx >= vdc->dring_len) { 3071 DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n", 3072 vdc->instance, idx); 3073 continue; 3074 } 3075 ldep = &vdc->local_dring[idx]; 3076 if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { 3077 DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n", 3078 vdc->instance, idx, ldep->dep->hdr.dstate); 3079 continue; 3080 } 3081 3082 DMSG(vdc, 1, "[%d] Depopulating idx=%d state=%d\n", 3083 vdc->instance, idx, ldep->dep->hdr.dstate); 3084 rv = vdc_depopulate_descriptor(vdc, idx); 3085 if (rv) { 3086 DMSG(vdc, 0, 3087 "[%d] Entry @ %d - depopulate failed ..\n", 3088 vdc->instance, idx); 3089 } 3090 3091 /* if this is the last descriptor - break out of loop */ 3092 if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) 3093 break; 3094 } 3095 3096 mutex_exit(&vdc->lock); 3097 DMSG(vdc, 0, "End idx=%d\n", idx); 3098 3099 return (rv); 3100 } 3101 3102 3103 /* 3104 * Function: 3105 * vdc_depopulate_descriptor() 3106 * 3107 * Description: 3108 * 3109 * Arguments: 3110 * vdc - soft state pointer for this instance of the device driver. 
3111 * idx - Index of the Descriptor Ring entry being modified 3112 * 3113 * Return Code: 3114 * 0 - Success 3115 */ 3116 static int 3117 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) 3118 { 3119 vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ 3120 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3121 int status = ENXIO; 3122 int rv = 0; 3123 3124 ASSERT(vdc != NULL); 3125 ASSERT(idx < vdc->dring_len); 3126 ldep = &vdc->local_dring[idx]; 3127 ASSERT(ldep != NULL); 3128 ASSERT(MUTEX_HELD(&vdc->lock)); 3129 3130 DMSG(vdc, 2, ": idx = %d\n", idx); 3131 dep = ldep->dep; 3132 ASSERT(dep != NULL); 3133 ASSERT((dep->hdr.dstate == VIO_DESC_DONE) || 3134 (dep->payload.status == ECANCELED)); 3135 3136 VDC_MARK_DRING_ENTRY_FREE(vdc, idx); 3137 3138 ldep->is_free = B_TRUE; 3139 status = dep->payload.status; 3140 DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status); 3141 3142 /* 3143 * If no buffers were used to transfer information to the server when 3144 * populating the descriptor then no memory handles need to be unbound 3145 * and we can return now. 3146 */ 3147 if (ldep->nbytes == 0) { 3148 cv_signal(&vdc->dring_free_cv); 3149 return (status); 3150 } 3151 3152 /* 3153 * If the upper layer passed in a misaligned address we copied the 3154 * data into an aligned buffer before sending it to LDC - we now 3155 * copy it back to the original buffer. 
3156 */ 3157 if (ldep->align_addr) { 3158 ASSERT(ldep->addr != NULL); 3159 3160 if (dep->payload.nbytes > 0) 3161 bcopy(ldep->align_addr, ldep->addr, 3162 dep->payload.nbytes); 3163 kmem_free(ldep->align_addr, 3164 sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8)); 3165 ldep->align_addr = NULL; 3166 } 3167 3168 rv = ldc_mem_unbind_handle(ldep->desc_mhdl); 3169 if (rv != 0) { 3170 DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)", 3171 vdc->instance, ldep->desc_mhdl, idx, rv); 3172 /* 3173 * The error returned by the vDisk server is more informative 3174 * and thus has a higher priority but if it isn't set we ensure 3175 * that this function returns an error. 3176 */ 3177 if (status == 0) 3178 status = EINVAL; 3179 } 3180 3181 cv_signal(&vdc->membind_cv); 3182 cv_signal(&vdc->dring_free_cv); 3183 3184 return (status); 3185 } 3186 3187 /* 3188 * Function: 3189 * vdc_populate_mem_hdl() 3190 * 3191 * Description: 3192 * 3193 * Arguments: 3194 * vdc - soft state pointer for this instance of the device driver. 
3195 * idx - Index of the Descriptor Ring entry being modified 3196 * addr - virtual address being mapped in 3197 * nybtes - number of bytes in 'addr' 3198 * operation - the vDisk operation being performed (VD_OP_xxx) 3199 * 3200 * Return Code: 3201 * 0 - Success 3202 */ 3203 static int 3204 vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep) 3205 { 3206 vd_dring_entry_t *dep = NULL; 3207 ldc_mem_handle_t mhdl; 3208 caddr_t vaddr; 3209 size_t nbytes; 3210 uint8_t perm = LDC_MEM_RW; 3211 uint8_t maptype; 3212 int rv = 0; 3213 int i; 3214 3215 ASSERT(vdcp != NULL); 3216 3217 dep = ldep->dep; 3218 mhdl = ldep->desc_mhdl; 3219 3220 switch (ldep->dir) { 3221 case VIO_read_dir: 3222 perm = LDC_MEM_W; 3223 break; 3224 3225 case VIO_write_dir: 3226 perm = LDC_MEM_R; 3227 break; 3228 3229 case VIO_both_dir: 3230 perm = LDC_MEM_RW; 3231 break; 3232 3233 default: 3234 ASSERT(0); /* catch bad programming in vdc */ 3235 } 3236 3237 /* 3238 * LDC expects any addresses passed in to be 8-byte aligned. 
We need 3239 * to copy the contents of any misaligned buffers to a newly allocated 3240 * buffer and bind it instead (and copy the the contents back to the 3241 * original buffer passed in when depopulating the descriptor) 3242 */ 3243 vaddr = ldep->addr; 3244 nbytes = ldep->nbytes; 3245 if (((uint64_t)vaddr & 0x7) != 0) { 3246 ASSERT(ldep->align_addr == NULL); 3247 ldep->align_addr = 3248 kmem_alloc(sizeof (caddr_t) * 3249 P2ROUNDUP(nbytes, 8), KM_SLEEP); 3250 DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating " 3251 "(buf=%p nb=%ld op=%d)\n", 3252 vdcp->instance, (void *)vaddr, (void *)ldep->align_addr, 3253 nbytes, ldep->operation); 3254 if (perm != LDC_MEM_W) 3255 bcopy(vaddr, ldep->align_addr, nbytes); 3256 vaddr = ldep->align_addr; 3257 } 3258 3259 maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP; 3260 rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8), 3261 maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies); 3262 DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n", 3263 vdcp->instance, dep->payload.ncookies); 3264 if (rv != 0) { 3265 DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle " 3266 "(mhdl=%p, buf=%p, err=%d)\n", 3267 vdcp->instance, (void *)mhdl, (void *)vaddr, rv); 3268 if (ldep->align_addr) { 3269 kmem_free(ldep->align_addr, 3270 sizeof (caddr_t) * P2ROUNDUP(nbytes, 8)); 3271 ldep->align_addr = NULL; 3272 } 3273 return (EAGAIN); 3274 } 3275 3276 /* 3277 * Get the other cookies (if any). 
3278 */ 3279 for (i = 1; i < dep->payload.ncookies; i++) { 3280 rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]); 3281 if (rv != 0) { 3282 (void) ldc_mem_unbind_handle(mhdl); 3283 DMSG(vdcp, 0, "?[%d] Failed to get next cookie " 3284 "(mhdl=%lx cnum=%d), err=%d", 3285 vdcp->instance, mhdl, i, rv); 3286 if (ldep->align_addr) { 3287 kmem_free(ldep->align_addr, 3288 sizeof (caddr_t) * ldep->nbytes); 3289 ldep->align_addr = NULL; 3290 } 3291 return (EAGAIN); 3292 } 3293 } 3294 3295 return (rv); 3296 } 3297 3298 /* 3299 * Interrupt handlers for messages from LDC 3300 */ 3301 3302 /* 3303 * Function: 3304 * vdc_handle_cb() 3305 * 3306 * Description: 3307 * 3308 * Arguments: 3309 * event - Type of event (LDC_EVT_xxx) that triggered the callback 3310 * arg - soft state pointer for this instance of the device driver. 3311 * 3312 * Return Code: 3313 * 0 - Success 3314 */ 3315 static uint_t 3316 vdc_handle_cb(uint64_t event, caddr_t arg) 3317 { 3318 ldc_status_t ldc_state; 3319 int rv = 0; 3320 3321 vdc_t *vdc = (vdc_t *)(void *)arg; 3322 3323 ASSERT(vdc != NULL); 3324 3325 DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num); 3326 3327 /* 3328 * Depending on the type of event that triggered this callback, 3329 * we modify the handshake state or read the data. 3330 * 3331 * NOTE: not done as a switch() as event could be triggered by 3332 * a state change and a read request. Also the ordering of the 3333 * check for the event types is deliberate. 3334 */ 3335 if (event & LDC_EVT_UP) { 3336 DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance); 3337 3338 mutex_enter(&vdc->lock); 3339 3340 /* get LDC state */ 3341 rv = ldc_status(vdc->ldc_handle, &ldc_state); 3342 if (rv != 0) { 3343 DMSG(vdc, 0, "[%d] Couldn't get LDC status %d", 3344 vdc->instance, rv); 3345 return (LDC_SUCCESS); 3346 } 3347 if (vdc->ldc_state != LDC_UP && ldc_state == LDC_UP) { 3348 /* 3349 * Reset the transaction sequence numbers when 3350 * LDC comes up. 
We then kick off the handshake 3351 * negotiation with the vDisk server. 3352 */ 3353 vdc->seq_num = 1; 3354 vdc->seq_num_reply = 0; 3355 vdc->ldc_state = ldc_state; 3356 cv_signal(&vdc->initwait_cv); 3357 } 3358 3359 mutex_exit(&vdc->lock); 3360 } 3361 3362 if (event & LDC_EVT_READ) { 3363 DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance); 3364 mutex_enter(&vdc->read_lock); 3365 cv_signal(&vdc->read_cv); 3366 vdc->read_state = VDC_READ_PENDING; 3367 mutex_exit(&vdc->read_lock); 3368 3369 /* that's all we have to do - no need to handle DOWN/RESET */ 3370 return (LDC_SUCCESS); 3371 } 3372 3373 if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) { 3374 3375 DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance); 3376 3377 mutex_enter(&vdc->lock); 3378 /* 3379 * Need to wake up any readers so they will 3380 * detect that a reset has occurred. 3381 */ 3382 mutex_enter(&vdc->read_lock); 3383 if ((vdc->read_state == VDC_READ_WAITING) || 3384 (vdc->read_state == VDC_READ_RESET)) 3385 cv_signal(&vdc->read_cv); 3386 vdc->read_state = VDC_READ_RESET; 3387 mutex_exit(&vdc->read_lock); 3388 3389 /* wake up any threads waiting for connection to come up */ 3390 if (vdc->state == VDC_STATE_INIT_WAITING) { 3391 vdc->state = VDC_STATE_RESETTING; 3392 cv_signal(&vdc->initwait_cv); 3393 } 3394 3395 mutex_exit(&vdc->lock); 3396 } 3397 3398 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) 3399 DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received", 3400 vdc->instance, event); 3401 3402 return (LDC_SUCCESS); 3403 } 3404 3405 /* 3406 * Function: 3407 * vdc_wait_for_response() 3408 * 3409 * Description: 3410 * Block waiting for a response from the server. If there is 3411 * no data the thread block on the read_cv that is signalled 3412 * by the callback when an EVT_READ occurs. 3413 * 3414 * Arguments: 3415 * vdcp - soft state pointer for this instance of the device driver. 
3416 * 3417 * Return Code: 3418 * 0 - Success 3419 */ 3420 static int 3421 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp) 3422 { 3423 size_t nbytes = sizeof (*msgp); 3424 int status; 3425 3426 ASSERT(vdcp != NULL); 3427 3428 DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance); 3429 3430 status = vdc_recv(vdcp, msgp, &nbytes); 3431 DMSG(vdcp, 3, "vdc_read() done.. status=0x%x size=0x%x\n", 3432 status, (int)nbytes); 3433 if (status) { 3434 DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n", 3435 vdcp->instance, status); 3436 return (status); 3437 } 3438 3439 if (nbytes < sizeof (vio_msg_tag_t)) { 3440 DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n", 3441 vdcp->instance, sizeof (vio_msg_tag_t), nbytes); 3442 return (ENOMSG); 3443 } 3444 3445 DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance, 3446 msgp->tag.vio_msgtype, 3447 msgp->tag.vio_subtype, 3448 msgp->tag.vio_subtype_env); 3449 3450 /* 3451 * Verify the Session ID of the message 3452 * 3453 * Every message after the Version has been negotiated should 3454 * have the correct session ID set. 3455 */ 3456 if ((msgp->tag.vio_sid != vdcp->session_id) && 3457 (msgp->tag.vio_subtype_env != VIO_VER_INFO)) { 3458 DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, " 3459 "expected 0x%lx [seq num %lx @ %d]", 3460 vdcp->instance, msgp->tag.vio_sid, 3461 vdcp->session_id, 3462 ((vio_dring_msg_t *)msgp)->seq_num, 3463 ((vio_dring_msg_t *)msgp)->start_idx); 3464 return (ENOMSG); 3465 } 3466 return (0); 3467 } 3468 3469 3470 /* 3471 * Function: 3472 * vdc_resubmit_backup_dring() 3473 * 3474 * Description: 3475 * Resubmit each descriptor in the backed up dring to 3476 * vDisk server. The Dring was backed up during connection 3477 * reset. 3478 * 3479 * Arguments: 3480 * vdcp - soft state pointer for this instance of the device driver. 
3481 * 3482 * Return Code: 3483 * 0 - Success 3484 */ 3485 static int 3486 vdc_resubmit_backup_dring(vdc_t *vdcp) 3487 { 3488 int count; 3489 int b_idx; 3490 int rv; 3491 int dring_size; 3492 int status; 3493 vio_msg_t vio_msg; 3494 vdc_local_desc_t *curr_ldep; 3495 3496 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3497 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3498 3499 if (vdcp->local_dring_backup == NULL) { 3500 /* the pending requests have already been processed */ 3501 return (0); 3502 } 3503 3504 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3505 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3506 3507 /* 3508 * Walk the backup copy of the local descriptor ring and 3509 * resubmit all the outstanding transactions. 3510 */ 3511 b_idx = vdcp->local_dring_backup_tail; 3512 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3513 3514 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3515 3516 /* only resubmit outstanding transactions */ 3517 if (!curr_ldep->is_free) { 3518 3519 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3520 mutex_enter(&vdcp->lock); 3521 rv = vdc_populate_descriptor(vdcp, curr_ldep->operation, 3522 curr_ldep->addr, curr_ldep->nbytes, 3523 curr_ldep->slice, curr_ldep->offset, 3524 curr_ldep->cb_type, curr_ldep->cb_arg, 3525 curr_ldep->dir); 3526 mutex_exit(&vdcp->lock); 3527 if (rv) { 3528 DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", 3529 vdcp->instance, b_idx); 3530 return (rv); 3531 } 3532 3533 /* Wait for the response message. 
*/ 3534 DMSG(vdcp, 1, "waiting for response to idx=%x\n", 3535 b_idx); 3536 status = vdc_wait_for_response(vdcp, &vio_msg); 3537 if (status) { 3538 DMSG(vdcp, 1, "[%d] wait_for_response " 3539 "returned err=%d\n", vdcp->instance, 3540 status); 3541 return (status); 3542 } 3543 3544 DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx); 3545 status = vdc_process_data_msg(vdcp, &vio_msg); 3546 if (status) { 3547 DMSG(vdcp, 1, "[%d] process_data_msg " 3548 "returned err=%d\n", vdcp->instance, 3549 status); 3550 return (status); 3551 } 3552 } 3553 3554 /* get the next element to submit */ 3555 if (++b_idx >= vdcp->local_dring_backup_len) 3556 b_idx = 0; 3557 } 3558 3559 /* all done - now clear up pending dring copy */ 3560 dring_size = vdcp->local_dring_backup_len * 3561 sizeof (vdcp->local_dring_backup[0]); 3562 3563 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3564 3565 vdcp->local_dring_backup = NULL; 3566 3567 return (0); 3568 } 3569 3570 /* 3571 * Function: 3572 * vdc_cancel_backup_dring 3573 * 3574 * Description: 3575 * Cancel each descriptor in the backed up dring to vDisk server. 3576 * The Dring was backed up during connection reset. 3577 * 3578 * Arguments: 3579 * vdcp - soft state pointer for this instance of the device driver. 
3580 * 3581 * Return Code: 3582 * None 3583 */ 3584 void 3585 vdc_cancel_backup_ring(vdc_t *vdcp) 3586 { 3587 vdc_local_desc_t *ldep; 3588 struct buf *bufp; 3589 int count; 3590 int b_idx; 3591 int dring_size; 3592 3593 ASSERT(MUTEX_HELD(&vdcp->lock)); 3594 ASSERT(vdcp->state == VDC_STATE_INIT || 3595 vdcp->state == VDC_STATE_INIT_WAITING || 3596 vdcp->state == VDC_STATE_NEGOTIATE || 3597 vdcp->state == VDC_STATE_RESETTING); 3598 3599 if (vdcp->local_dring_backup == NULL) { 3600 /* the pending requests have already been processed */ 3601 return; 3602 } 3603 3604 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n", 3605 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3606 3607 /* 3608 * Walk the backup copy of the local descriptor ring and 3609 * cancel all the outstanding transactions. 3610 */ 3611 b_idx = vdcp->local_dring_backup_tail; 3612 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3613 3614 ldep = &(vdcp->local_dring_backup[b_idx]); 3615 3616 /* only cancel outstanding transactions */ 3617 if (!ldep->is_free) { 3618 3619 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx); 3620 3621 /* 3622 * All requests have already been cleared from the 3623 * local descriptor ring and the LDC channel has been 3624 * reset so we will never get any reply for these 3625 * requests. Now we just have to notify threads waiting 3626 * for replies that the request has failed. 
3627 */ 3628 switch (ldep->cb_type) { 3629 case CB_SYNC: 3630 ASSERT(vdcp->sync_op_pending); 3631 vdcp->sync_op_status = EIO; 3632 vdcp->sync_op_pending = B_FALSE; 3633 cv_signal(&vdcp->sync_pending_cv); 3634 break; 3635 3636 case CB_STRATEGY: 3637 bufp = ldep->cb_arg; 3638 ASSERT(bufp != NULL); 3639 bufp->b_resid = bufp->b_bcount; 3640 bioerror(bufp, EIO); 3641 biodone(bufp); 3642 break; 3643 3644 default: 3645 ASSERT(0); 3646 } 3647 3648 } 3649 3650 /* get the next element to cancel */ 3651 if (++b_idx >= vdcp->local_dring_backup_len) 3652 b_idx = 0; 3653 } 3654 3655 /* all done - now clear up pending dring copy */ 3656 dring_size = vdcp->local_dring_backup_len * 3657 sizeof (vdcp->local_dring_backup[0]); 3658 3659 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3660 3661 vdcp->local_dring_backup = NULL; 3662 3663 DTRACE_IO2(processed, int, count, vdc_t *, vdcp); 3664 } 3665 3666 /* 3667 * Function: 3668 * vdc_connection_timeout 3669 * 3670 * Description: 3671 * This function is invoked if the timeout set to establish the connection 3672 * with vds expires. This will happen if we spend too much time in the 3673 * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. Then we will 3674 * cancel any pending request and mark them as failed. 3675 * 3676 * If the timeout does not expire, it will be cancelled when we reach the 3677 * VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can 3678 * be invoked while we are in the VDC_STATE_HANDLE_PENDING or 3679 * VDC_STATE_RESETTING state in which case we do nothing because the 3680 * timeout is being cancelled. 3681 * 3682 * Arguments: 3683 * arg - argument of the timeout function actually a soft state 3684 * pointer for the instance of the device driver. 
3685 * 3686 * Return Code: 3687 * None 3688 */ 3689 void 3690 vdc_connection_timeout(void *arg) 3691 { 3692 vdc_t *vdcp = (vdc_t *)arg; 3693 3694 mutex_enter(&vdcp->lock); 3695 3696 if (vdcp->state == VDC_STATE_HANDLE_PENDING || 3697 vdcp->state == VDC_STATE_DETACH) { 3698 /* 3699 * The connection has just been re-established or 3700 * we are detaching. 3701 */ 3702 vdcp->ctimeout_reached = B_FALSE; 3703 mutex_exit(&vdcp->lock); 3704 return; 3705 } 3706 3707 vdcp->ctimeout_reached = B_TRUE; 3708 3709 /* notify requests waiting for sending */ 3710 cv_broadcast(&vdcp->running_cv); 3711 3712 /* cancel requests waiting for a result */ 3713 vdc_cancel_backup_ring(vdcp); 3714 3715 mutex_exit(&vdcp->lock); 3716 3717 cmn_err(CE_NOTE, "[%d] connection to service domain timeout", 3718 vdcp->instance); 3719 } 3720 3721 /* 3722 * Function: 3723 * vdc_backup_local_dring() 3724 * 3725 * Description: 3726 * Backup the current dring in the event of a reset. The Dring 3727 * transactions will be resubmitted to the server when the 3728 * connection is restored. 3729 * 3730 * Arguments: 3731 * vdcp - soft state pointer for this instance of the device driver. 3732 * 3733 * Return Code: 3734 * NONE 3735 */ 3736 static void 3737 vdc_backup_local_dring(vdc_t *vdcp) 3738 { 3739 int dring_size; 3740 3741 ASSERT(MUTEX_HELD(&vdcp->lock)); 3742 ASSERT(vdcp->state == VDC_STATE_RESETTING); 3743 3744 /* 3745 * If the backup dring is stil around, it means 3746 * that the last restore did not complete. However, 3747 * since we never got back into the running state, 3748 * the backup copy we have is still valid. 3749 */ 3750 if (vdcp->local_dring_backup != NULL) { 3751 DMSG(vdcp, 1, "reusing local descriptor ring backup " 3752 "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len, 3753 vdcp->local_dring_backup_tail); 3754 return; 3755 } 3756 3757 /* 3758 * The backup dring can be NULL and the local dring may not be 3759 * initialized. 
This can happen if we had a reset while establishing 3760 * a new connection but after the connection has timed out. In that 3761 * case the backup dring is NULL because the requests have been 3762 * cancelled and the request occured before the local dring is 3763 * initialized. 3764 */ 3765 if (!(vdcp->initialized & VDC_DRING_LOCAL)) 3766 return; 3767 3768 DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, " 3769 "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx); 3770 3771 dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]); 3772 3773 vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP); 3774 bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size); 3775 3776 vdcp->local_dring_backup_tail = vdcp->dring_curr_idx; 3777 vdcp->local_dring_backup_len = vdcp->dring_len; 3778 } 3779 3780 /* -------------------------------------------------------------------------- */ 3781 3782 /* 3783 * The following functions process the incoming messages from vds 3784 */ 3785 3786 /* 3787 * Function: 3788 * vdc_process_msg_thread() 3789 * 3790 * Description: 3791 * 3792 * Main VDC message processing thread. Each vDisk instance 3793 * consists of a copy of this thread. This thread triggers 3794 * all the handshakes and data exchange with the server. It 3795 * also handles all channel resets 3796 * 3797 * Arguments: 3798 * vdc - soft state pointer for this instance of the device driver. 3799 * 3800 * Return Code: 3801 * None 3802 */ 3803 static void 3804 vdc_process_msg_thread(vdc_t *vdcp) 3805 { 3806 int status; 3807 int ctimeout; 3808 timeout_id_t tmid = 0; 3809 3810 mutex_enter(&vdcp->lock); 3811 3812 for (;;) { 3813 3814 #define Q(_s) (vdcp->state == _s) ? 
#_s : 3815 DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state, 3816 Q(VDC_STATE_INIT) 3817 Q(VDC_STATE_INIT_WAITING) 3818 Q(VDC_STATE_NEGOTIATE) 3819 Q(VDC_STATE_HANDLE_PENDING) 3820 Q(VDC_STATE_RUNNING) 3821 Q(VDC_STATE_RESETTING) 3822 Q(VDC_STATE_DETACH) 3823 "UNKNOWN"); 3824 3825 switch (vdcp->state) { 3826 case VDC_STATE_INIT: 3827 3828 /* 3829 * If requested, start a timeout to check if the 3830 * connection with vds is established in the 3831 * specified delay. If the timeout expires, we 3832 * will cancel any pending request. 3833 * 3834 * If some reset have occurred while establishing 3835 * the connection, we already have a timeout armed 3836 * and in that case we don't need to arm a new one. 3837 */ 3838 ctimeout = (vdc_timeout != 0)? 3839 vdc_timeout : vdcp->ctimeout; 3840 3841 if (ctimeout != 0 && tmid == 0) { 3842 tmid = timeout(vdc_connection_timeout, vdcp, 3843 ctimeout * drv_usectohz(1000000)); 3844 } 3845 3846 /* Check if have re-initializing repeatedly */ 3847 if (vdcp->hshake_cnt++ > vdc_hshake_retries && 3848 vdcp->lifecycle != VDC_LC_ONLINE) { 3849 cmn_err(CE_NOTE, "[%d] disk access failed.\n", 3850 vdcp->instance); 3851 vdcp->state = VDC_STATE_DETACH; 3852 break; 3853 } 3854 3855 /* Bring up connection with vds via LDC */ 3856 status = vdc_start_ldc_connection(vdcp); 3857 if (status == EINVAL) { 3858 DMSG(vdcp, 0, "[%d] Could not start LDC", 3859 vdcp->instance); 3860 vdcp->state = VDC_STATE_DETACH; 3861 } else { 3862 vdcp->state = VDC_STATE_INIT_WAITING; 3863 } 3864 break; 3865 3866 case VDC_STATE_INIT_WAITING: 3867 3868 /* 3869 * Let the callback event move us on 3870 * when channel is open to server 3871 */ 3872 while (vdcp->ldc_state != LDC_UP) { 3873 cv_wait(&vdcp->initwait_cv, &vdcp->lock); 3874 if (vdcp->state != VDC_STATE_INIT_WAITING) { 3875 DMSG(vdcp, 0, 3876 "state moved to %d out from under us...\n", 3877 vdcp->state); 3878 3879 break; 3880 } 3881 } 3882 if (vdcp->state == VDC_STATE_INIT_WAITING && 3883 vdcp->ldc_state == LDC_UP) { 
3884 vdcp->state = VDC_STATE_NEGOTIATE; 3885 } 3886 break; 3887 3888 case VDC_STATE_NEGOTIATE: 3889 switch (status = vdc_ver_negotiation(vdcp)) { 3890 case 0: 3891 break; 3892 default: 3893 DMSG(vdcp, 0, "ver negotiate failed (%d)..\n", 3894 status); 3895 goto reset; 3896 } 3897 3898 switch (status = vdc_attr_negotiation(vdcp)) { 3899 case 0: 3900 break; 3901 default: 3902 DMSG(vdcp, 0, "attr negotiate failed (%d)..\n", 3903 status); 3904 goto reset; 3905 } 3906 3907 switch (status = vdc_dring_negotiation(vdcp)) { 3908 case 0: 3909 break; 3910 default: 3911 DMSG(vdcp, 0, "dring negotiate failed (%d)..\n", 3912 status); 3913 goto reset; 3914 } 3915 3916 switch (status = vdc_rdx_exchange(vdcp)) { 3917 case 0: 3918 vdcp->state = VDC_STATE_HANDLE_PENDING; 3919 goto done; 3920 default: 3921 DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n", 3922 status); 3923 goto reset; 3924 } 3925 reset: 3926 DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n", 3927 status); 3928 vdcp->state = VDC_STATE_RESETTING; 3929 vdcp->self_reset = B_TRUE; 3930 done: 3931 DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", 3932 vdcp->state); 3933 break; 3934 3935 case VDC_STATE_HANDLE_PENDING: 3936 3937 if (vdcp->ctimeout_reached) { 3938 /* 3939 * The connection timeout had been reached so 3940 * pending requests have been cancelled. Now 3941 * that the connection is back we can reset 3942 * the timeout. 
3943 */ 3944 ASSERT(vdcp->local_dring_backup == NULL); 3945 ASSERT(tmid != 0); 3946 tmid = 0; 3947 vdcp->ctimeout_reached = B_FALSE; 3948 vdcp->state = VDC_STATE_RUNNING; 3949 DMSG(vdcp, 0, "[%d] connection to service " 3950 "domain is up", vdcp->instance); 3951 break; 3952 } 3953 3954 mutex_exit(&vdcp->lock); 3955 if (tmid != 0) { 3956 (void) untimeout(tmid); 3957 tmid = 0; 3958 } 3959 status = vdc_resubmit_backup_dring(vdcp); 3960 mutex_enter(&vdcp->lock); 3961 3962 if (status) 3963 vdcp->state = VDC_STATE_RESETTING; 3964 else 3965 vdcp->state = VDC_STATE_RUNNING; 3966 3967 break; 3968 3969 /* enter running state */ 3970 case VDC_STATE_RUNNING: 3971 /* 3972 * Signal anyone waiting for the connection 3973 * to come on line. 3974 */ 3975 vdcp->hshake_cnt = 0; 3976 cv_broadcast(&vdcp->running_cv); 3977 3978 /* failfast has to been checked after reset */ 3979 cv_signal(&vdcp->failfast_cv); 3980 3981 /* ownership is lost during reset */ 3982 if (vdcp->ownership & VDC_OWNERSHIP_WANTED) 3983 vdcp->ownership |= VDC_OWNERSHIP_RESET; 3984 cv_signal(&vdcp->ownership_cv); 3985 3986 mutex_exit(&vdcp->lock); 3987 3988 for (;;) { 3989 vio_msg_t msg; 3990 status = vdc_wait_for_response(vdcp, &msg); 3991 if (status) break; 3992 3993 DMSG(vdcp, 1, "[%d] new pkt(s) available\n", 3994 vdcp->instance); 3995 status = vdc_process_data_msg(vdcp, &msg); 3996 if (status) { 3997 DMSG(vdcp, 1, "[%d] process_data_msg " 3998 "returned err=%d\n", vdcp->instance, 3999 status); 4000 break; 4001 } 4002 4003 } 4004 4005 mutex_enter(&vdcp->lock); 4006 4007 vdcp->state = VDC_STATE_RESETTING; 4008 vdcp->self_reset = B_TRUE; 4009 break; 4010 4011 case VDC_STATE_RESETTING: 4012 /* 4013 * When we reach this state, we either come from the 4014 * VDC_STATE_RUNNING state and we can have pending 4015 * request but no timeout is armed; or we come from 4016 * the VDC_STATE_INIT_WAITING, VDC_NEGOTIATE or 4017 * VDC_HANDLE_PENDING state and there is no pending 4018 * request or pending requests have already 
been copied 4019 * into the backup dring. So we can safely keep the 4020 * connection timeout armed while we are in this state. 4021 */ 4022 4023 DMSG(vdcp, 0, "Initiating channel reset " 4024 "(pending = %d)\n", (int)vdcp->threads_pending); 4025 4026 if (vdcp->self_reset) { 4027 DMSG(vdcp, 0, 4028 "[%d] calling stop_ldc_connection.\n", 4029 vdcp->instance); 4030 status = vdc_stop_ldc_connection(vdcp); 4031 vdcp->self_reset = B_FALSE; 4032 } 4033 4034 /* 4035 * Wait for all threads currently waiting 4036 * for a free dring entry to use. 4037 */ 4038 while (vdcp->threads_pending) { 4039 cv_broadcast(&vdcp->membind_cv); 4040 cv_broadcast(&vdcp->dring_free_cv); 4041 mutex_exit(&vdcp->lock); 4042 /* give the waiters enough time to wake up */ 4043 delay(vdc_hz_min_ldc_delay); 4044 mutex_enter(&vdcp->lock); 4045 } 4046 4047 ASSERT(vdcp->threads_pending == 0); 4048 4049 /* Sanity check that no thread is receiving */ 4050 ASSERT(vdcp->read_state != VDC_READ_WAITING); 4051 4052 vdcp->read_state = VDC_READ_IDLE; 4053 4054 vdc_backup_local_dring(vdcp); 4055 4056 /* cleanup the old d-ring */ 4057 vdc_destroy_descriptor_ring(vdcp); 4058 4059 /* go and start again */ 4060 vdcp->state = VDC_STATE_INIT; 4061 4062 break; 4063 4064 case VDC_STATE_DETACH: 4065 DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n", 4066 vdcp->instance); 4067 4068 /* cancel any pending timeout */ 4069 mutex_exit(&vdcp->lock); 4070 if (tmid != 0) { 4071 (void) untimeout(tmid); 4072 tmid = 0; 4073 } 4074 mutex_enter(&vdcp->lock); 4075 4076 /* 4077 * Signal anyone waiting for connection 4078 * to come online 4079 */ 4080 cv_broadcast(&vdcp->running_cv); 4081 4082 while (vdcp->sync_op_pending) { 4083 cv_signal(&vdcp->sync_pending_cv); 4084 cv_signal(&vdcp->sync_blocked_cv); 4085 mutex_exit(&vdcp->lock); 4086 /* give the waiters enough time to wake up */ 4087 delay(vdc_hz_min_ldc_delay); 4088 mutex_enter(&vdcp->lock); 4089 } 4090 4091 mutex_exit(&vdcp->lock); 4092 4093 DMSG(vdcp, 0, "[%d] Msg processing 
thread exiting ..\n", 4094 vdcp->instance); 4095 thread_exit(); 4096 break; 4097 } 4098 } 4099 } 4100 4101 4102 /* 4103 * Function: 4104 * vdc_process_data_msg() 4105 * 4106 * Description: 4107 * This function is called by the message processing thread each time 4108 * a message with a msgtype of VIO_TYPE_DATA is received. It will either 4109 * be an ACK or NACK from vds[1] which vdc handles as follows. 4110 * ACK - wake up the waiting thread 4111 * NACK - resend any messages necessary 4112 * 4113 * [1] Although the message format allows it, vds should not send a 4114 * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for 4115 * some bizarre reason it does, vdc will reset the connection. 4116 * 4117 * Arguments: 4118 * vdc - soft state pointer for this instance of the device driver. 4119 * msg - the LDC message sent by vds 4120 * 4121 * Return Code: 4122 * 0 - Success. 4123 * > 0 - error value returned by LDC 4124 */ 4125 static int 4126 vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg) 4127 { 4128 int status = 0; 4129 vio_dring_msg_t *dring_msg; 4130 vdc_local_desc_t *ldep = NULL; 4131 int start, end; 4132 int idx; 4133 4134 dring_msg = (vio_dring_msg_t *)msg; 4135 4136 ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA); 4137 ASSERT(vdcp != NULL); 4138 4139 mutex_enter(&vdcp->lock); 4140 4141 /* 4142 * Check to see if the message has bogus data 4143 */ 4144 idx = start = dring_msg->start_idx; 4145 end = dring_msg->end_idx; 4146 if ((start >= vdcp->dring_len) || 4147 (end >= vdcp->dring_len) || (end < -1)) { 4148 DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n", 4149 vdcp->instance, start, end); 4150 mutex_exit(&vdcp->lock); 4151 return (EINVAL); 4152 } 4153 4154 /* 4155 * Verify that the sequence number is what vdc expects. 
4156 */ 4157 switch (vdc_verify_seq_num(vdcp, dring_msg)) { 4158 case VDC_SEQ_NUM_TODO: 4159 break; /* keep processing this message */ 4160 case VDC_SEQ_NUM_SKIP: 4161 mutex_exit(&vdcp->lock); 4162 return (0); 4163 case VDC_SEQ_NUM_INVALID: 4164 mutex_exit(&vdcp->lock); 4165 DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance); 4166 return (ENXIO); 4167 } 4168 4169 if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) { 4170 DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance); 4171 VDC_DUMP_DRING_MSG(dring_msg); 4172 mutex_exit(&vdcp->lock); 4173 return (EIO); 4174 4175 } else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) { 4176 mutex_exit(&vdcp->lock); 4177 return (EPROTO); 4178 } 4179 4180 DTRACE_IO2(recv, vio_dring_msg_t, dring_msg, vdc_t *, vdcp); 4181 DMSG(vdcp, 1, ": start %d end %d\n", start, end); 4182 ASSERT(start == end); 4183 4184 ldep = &vdcp->local_dring[idx]; 4185 4186 DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n", 4187 ldep->dep->hdr.dstate, ldep->cb_type); 4188 4189 if (ldep->dep->hdr.dstate == VIO_DESC_DONE) { 4190 struct buf *bufp; 4191 4192 switch (ldep->cb_type) { 4193 case CB_SYNC: 4194 ASSERT(vdcp->sync_op_pending); 4195 4196 status = vdc_depopulate_descriptor(vdcp, idx); 4197 vdcp->sync_op_status = status; 4198 vdcp->sync_op_pending = B_FALSE; 4199 cv_signal(&vdcp->sync_pending_cv); 4200 break; 4201 4202 case CB_STRATEGY: 4203 bufp = ldep->cb_arg; 4204 ASSERT(bufp != NULL); 4205 bufp->b_resid = 4206 bufp->b_bcount - ldep->dep->payload.nbytes; 4207 status = ldep->dep->payload.status; /* Future:ntoh */ 4208 if (status != 0) { 4209 DMSG(vdcp, 1, "strategy status=%d\n", status); 4210 bioerror(bufp, status); 4211 } 4212 4213 (void) vdc_depopulate_descriptor(vdcp, idx); 4214 4215 DMSG(vdcp, 1, 4216 "strategy complete req=%ld bytes resp=%ld bytes\n", 4217 bufp->b_bcount, ldep->dep->payload.nbytes); 4218 4219 if (status != 0 && vdcp->failfast_interval != 0) { 4220 /* 4221 * The I/O has failed and failfast is enabled. 
4222 * We need the failfast thread to check if the 4223 * failure is due to a reservation conflict. 4224 */ 4225 (void) vdc_failfast_io_queue(vdcp, bufp); 4226 } else { 4227 biodone(bufp); 4228 } 4229 break; 4230 4231 default: 4232 ASSERT(0); 4233 } 4234 } 4235 4236 /* let the arrival signal propogate */ 4237 mutex_exit(&vdcp->lock); 4238 4239 /* probe gives the count of how many entries were processed */ 4240 DTRACE_IO2(processed, int, 1, vdc_t *, vdcp); 4241 4242 return (0); 4243 } 4244 4245 4246 /* 4247 * Function: 4248 * vdc_handle_ver_msg() 4249 * 4250 * Description: 4251 * 4252 * Arguments: 4253 * vdc - soft state pointer for this instance of the device driver. 4254 * ver_msg - LDC message sent by vDisk server 4255 * 4256 * Return Code: 4257 * 0 - Success 4258 */ 4259 static int 4260 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg) 4261 { 4262 int status = 0; 4263 4264 ASSERT(vdc != NULL); 4265 ASSERT(mutex_owned(&vdc->lock)); 4266 4267 if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) { 4268 return (EPROTO); 4269 } 4270 4271 if (ver_msg->dev_class != VDEV_DISK_SERVER) { 4272 return (EINVAL); 4273 } 4274 4275 switch (ver_msg->tag.vio_subtype) { 4276 case VIO_SUBTYPE_ACK: 4277 /* 4278 * We check to see if the version returned is indeed supported 4279 * (The server may have also adjusted the minor number downwards 4280 * and if so 'ver_msg' will contain the actual version agreed) 4281 */ 4282 if (vdc_is_supported_version(ver_msg)) { 4283 vdc->ver.major = ver_msg->ver_major; 4284 vdc->ver.minor = ver_msg->ver_minor; 4285 ASSERT(vdc->ver.major > 0); 4286 } else { 4287 status = EPROTO; 4288 } 4289 break; 4290 4291 case VIO_SUBTYPE_NACK: 4292 /* 4293 * call vdc_is_supported_version() which will return the next 4294 * supported version (if any) in 'ver_msg' 4295 */ 4296 (void) vdc_is_supported_version(ver_msg); 4297 if (ver_msg->ver_major > 0) { 4298 size_t len = sizeof (*ver_msg); 4299 4300 ASSERT(vdc->ver.major > 0); 4301 4302 /* reset the necessary fields 
and resend */ 4303 ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO; 4304 ver_msg->dev_class = VDEV_DISK; 4305 4306 status = vdc_send(vdc, (caddr_t)ver_msg, &len); 4307 DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n", 4308 vdc->instance, status); 4309 if (len != sizeof (*ver_msg)) 4310 status = EBADMSG; 4311 } else { 4312 DMSG(vdc, 0, "[%d] No common version with vDisk server", 4313 vdc->instance); 4314 status = ENOTSUP; 4315 } 4316 4317 break; 4318 case VIO_SUBTYPE_INFO: 4319 /* 4320 * Handle the case where vds starts handshake 4321 * (for now only vdc is the instigator) 4322 */ 4323 status = ENOTSUP; 4324 break; 4325 4326 default: 4327 status = EINVAL; 4328 break; 4329 } 4330 4331 return (status); 4332 } 4333 4334 /* 4335 * Function: 4336 * vdc_handle_attr_msg() 4337 * 4338 * Description: 4339 * 4340 * Arguments: 4341 * vdc - soft state pointer for this instance of the device driver. 4342 * attr_msg - LDC message sent by vDisk server 4343 * 4344 * Return Code: 4345 * 0 - Success 4346 */ 4347 static int 4348 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg) 4349 { 4350 int status = 0; 4351 4352 ASSERT(vdc != NULL); 4353 ASSERT(mutex_owned(&vdc->lock)); 4354 4355 if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) { 4356 return (EPROTO); 4357 } 4358 4359 switch (attr_msg->tag.vio_subtype) { 4360 case VIO_SUBTYPE_ACK: 4361 /* 4362 * We now verify the attributes sent by vds. 4363 */ 4364 if (attr_msg->vdisk_size == 0) { 4365 DMSG(vdc, 0, "[%d] Invalid disk size from vds", 4366 vdc->instance); 4367 status = EINVAL; 4368 break; 4369 } 4370 4371 if (attr_msg->max_xfer_sz == 0) { 4372 DMSG(vdc, 0, "[%d] Invalid transfer size from vds", 4373 vdc->instance); 4374 status = EINVAL; 4375 break; 4376 } 4377 4378 if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) { 4379 DMSG(vdc, 0, "[%d] Unknown disk size from vds", 4380 vdc->instance); 4381 attr_msg->vdisk_size = 0; 4382 } 4383 4384 /* 4385 * If the disk size is already set check that it hasn't changed. 
4386 */ 4387 if ((vdc->vdisk_size != 0) && (attr_msg->vdisk_size != 0) && 4388 (vdc->vdisk_size != attr_msg->vdisk_size)) { 4389 DMSG(vdc, 0, "[%d] Different disk size from vds " 4390 "(old=0x%lx - new=0x%lx", vdc->instance, 4391 vdc->vdisk_size, attr_msg->vdisk_size) 4392 status = EINVAL; 4393 break; 4394 } 4395 4396 vdc->vdisk_size = attr_msg->vdisk_size; 4397 vdc->vdisk_type = attr_msg->vdisk_type; 4398 vdc->operations = attr_msg->operations; 4399 if (vio_ver_is_supported(vdc->ver, 1, 1)) 4400 vdc->vdisk_media = attr_msg->vdisk_media; 4401 else 4402 vdc->vdisk_media = 0; 4403 4404 DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n", 4405 vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz); 4406 DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n", 4407 vdc->instance, vdc->block_size, 4408 attr_msg->vdisk_block_size); 4409 4410 /* 4411 * We don't know at compile time what the vDisk server will 4412 * think are good values but we apply a large (arbitrary) 4413 * upper bound to prevent memory exhaustion in vdc if it was 4414 * allocating a DRing based of huge values sent by the server. 4415 * We probably will never exceed this except if the message 4416 * was garbage. 4417 */ 4418 if ((attr_msg->max_xfer_sz * attr_msg->vdisk_block_size) <= 4419 (PAGESIZE * DEV_BSIZE)) { 4420 vdc->max_xfer_sz = attr_msg->max_xfer_sz; 4421 vdc->block_size = attr_msg->vdisk_block_size; 4422 } else { 4423 DMSG(vdc, 0, "[%d] vds block transfer size too big;" 4424 " using max supported by vdc", vdc->instance); 4425 } 4426 4427 if ((attr_msg->xfer_mode != VIO_DRING_MODE) || 4428 (attr_msg->vdisk_size > INT64_MAX) || 4429 (attr_msg->operations == 0) || 4430 (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) { 4431 DMSG(vdc, 0, "[%d] Invalid attributes from vds", 4432 vdc->instance); 4433 status = EINVAL; 4434 break; 4435 } 4436 4437 /* 4438 * Now that we have received all attributes we can create a 4439 * fake geometry for the disk. 
4440 */ 4441 vdc_create_fake_geometry(vdc); 4442 break; 4443 4444 case VIO_SUBTYPE_NACK: 4445 /* 4446 * vds could not handle the attributes we sent so we 4447 * stop negotiating. 4448 */ 4449 status = EPROTO; 4450 break; 4451 4452 case VIO_SUBTYPE_INFO: 4453 /* 4454 * Handle the case where vds starts the handshake 4455 * (for now; vdc is the only supported instigatior) 4456 */ 4457 status = ENOTSUP; 4458 break; 4459 4460 default: 4461 status = ENOTSUP; 4462 break; 4463 } 4464 4465 return (status); 4466 } 4467 4468 /* 4469 * Function: 4470 * vdc_handle_dring_reg_msg() 4471 * 4472 * Description: 4473 * 4474 * Arguments: 4475 * vdc - soft state pointer for this instance of the driver. 4476 * dring_msg - LDC message sent by vDisk server 4477 * 4478 * Return Code: 4479 * 0 - Success 4480 */ 4481 static int 4482 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg) 4483 { 4484 int status = 0; 4485 4486 ASSERT(vdc != NULL); 4487 ASSERT(mutex_owned(&vdc->lock)); 4488 4489 if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) { 4490 return (EPROTO); 4491 } 4492 4493 switch (dring_msg->tag.vio_subtype) { 4494 case VIO_SUBTYPE_ACK: 4495 /* save the received dring_ident */ 4496 vdc->dring_ident = dring_msg->dring_ident; 4497 DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n", 4498 vdc->instance, vdc->dring_ident); 4499 break; 4500 4501 case VIO_SUBTYPE_NACK: 4502 /* 4503 * vds could not handle the DRing info we sent so we 4504 * stop negotiating. 
4505 */ 4506 DMSG(vdc, 0, "[%d] server could not register DRing\n", 4507 vdc->instance); 4508 status = EPROTO; 4509 break; 4510 4511 case VIO_SUBTYPE_INFO: 4512 /* 4513 * Handle the case where vds starts handshake 4514 * (for now only vdc is the instigatior) 4515 */ 4516 status = ENOTSUP; 4517 break; 4518 default: 4519 status = ENOTSUP; 4520 } 4521 4522 return (status); 4523 } 4524 4525 /* 4526 * Function: 4527 * vdc_verify_seq_num() 4528 * 4529 * Description: 4530 * This functions verifies that the sequence number sent back by the vDisk 4531 * server with the latest message is what is expected (i.e. it is greater 4532 * than the last seq num sent by the vDisk server and less than or equal 4533 * to the last seq num generated by vdc). 4534 * 4535 * It then checks the request ID to see if any requests need processing 4536 * in the DRing. 4537 * 4538 * Arguments: 4539 * vdc - soft state pointer for this instance of the driver. 4540 * dring_msg - pointer to the LDC message sent by vds 4541 * 4542 * Return Code: 4543 * VDC_SEQ_NUM_TODO - Message needs to be processed 4544 * VDC_SEQ_NUM_SKIP - Message has already been processed 4545 * VDC_SEQ_NUM_INVALID - The seq numbers are so out of sync, 4546 * vdc cannot deal with them 4547 */ 4548 static int 4549 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg) 4550 { 4551 ASSERT(vdc != NULL); 4552 ASSERT(dring_msg != NULL); 4553 ASSERT(mutex_owned(&vdc->lock)); 4554 4555 /* 4556 * Check to see if the messages were responded to in the correct 4557 * order by vds. 
4558 */ 4559 if ((dring_msg->seq_num <= vdc->seq_num_reply) || 4560 (dring_msg->seq_num > vdc->seq_num)) { 4561 DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: " 4562 "%lu > expected <= %lu (last proc req %lu sent %lu)\n", 4563 vdc->instance, dring_msg->seq_num, 4564 vdc->seq_num_reply, vdc->seq_num, 4565 vdc->req_id_proc, vdc->req_id); 4566 return (VDC_SEQ_NUM_INVALID); 4567 } 4568 vdc->seq_num_reply = dring_msg->seq_num; 4569 4570 if (vdc->req_id_proc < vdc->req_id) 4571 return (VDC_SEQ_NUM_TODO); 4572 else 4573 return (VDC_SEQ_NUM_SKIP); 4574 } 4575 4576 4577 /* 4578 * Function: 4579 * vdc_is_supported_version() 4580 * 4581 * Description: 4582 * This routine checks if the major/minor version numbers specified in 4583 * 'ver_msg' are supported. If not it finds the next version that is 4584 * in the supported version list 'vdc_version[]' and sets the fields in 4585 * 'ver_msg' to those values 4586 * 4587 * Arguments: 4588 * ver_msg - LDC message sent by vDisk server 4589 * 4590 * Return Code: 4591 * B_TRUE - Success 4592 * B_FALSE - Version not supported 4593 */ 4594 static boolean_t 4595 vdc_is_supported_version(vio_ver_msg_t *ver_msg) 4596 { 4597 int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]); 4598 4599 for (int i = 0; i < vdc_num_versions; i++) { 4600 ASSERT(vdc_version[i].major > 0); 4601 ASSERT((i == 0) || 4602 (vdc_version[i].major < vdc_version[i-1].major)); 4603 4604 /* 4605 * If the major versions match, adjust the minor version, if 4606 * necessary, down to the highest value supported by this 4607 * client. 
The server should support all minor versions lower 4608 * than the value it sent 4609 */ 4610 if (ver_msg->ver_major == vdc_version[i].major) { 4611 if (ver_msg->ver_minor > vdc_version[i].minor) { 4612 DMSGX(0, 4613 "Adjusting minor version from %u to %u", 4614 ver_msg->ver_minor, vdc_version[i].minor); 4615 ver_msg->ver_minor = vdc_version[i].minor; 4616 } 4617 return (B_TRUE); 4618 } 4619 4620 /* 4621 * If the message contains a higher major version number, set 4622 * the message's major/minor versions to the current values 4623 * and return false, so this message will get resent with 4624 * these values, and the server will potentially try again 4625 * with the same or a lower version 4626 */ 4627 if (ver_msg->ver_major > vdc_version[i].major) { 4628 ver_msg->ver_major = vdc_version[i].major; 4629 ver_msg->ver_minor = vdc_version[i].minor; 4630 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n", 4631 ver_msg->ver_major, ver_msg->ver_minor); 4632 4633 return (B_FALSE); 4634 } 4635 4636 /* 4637 * Otherwise, the message's major version is less than the 4638 * current major version, so continue the loop to the next 4639 * (lower) supported version 4640 */ 4641 } 4642 4643 /* 4644 * No common version was found; "ground" the version pair in the 4645 * message to terminate negotiation 4646 */ 4647 ver_msg->ver_major = 0; 4648 ver_msg->ver_minor = 0; 4649 4650 return (B_FALSE); 4651 } 4652 /* -------------------------------------------------------------------------- */ 4653 4654 /* 4655 * DKIO(7) support 4656 */ 4657 4658 typedef struct vdc_dk_arg { 4659 struct dk_callback dkc; 4660 int mode; 4661 dev_t dev; 4662 vdc_t *vdc; 4663 } vdc_dk_arg_t; 4664 4665 /* 4666 * Function: 4667 * vdc_dkio_flush_cb() 4668 * 4669 * Description: 4670 * This routine is a callback for DKIOCFLUSHWRITECACHE which can be called 4671 * by kernel code. 4672 * 4673 * Arguments: 4674 * arg - a pointer to a vdc_dk_arg_t structure. 
4675 */ 4676 void 4677 vdc_dkio_flush_cb(void *arg) 4678 { 4679 struct vdc_dk_arg *dk_arg = (struct vdc_dk_arg *)arg; 4680 struct dk_callback *dkc = NULL; 4681 vdc_t *vdc = NULL; 4682 int rv; 4683 4684 if (dk_arg == NULL) { 4685 cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n"); 4686 return; 4687 } 4688 dkc = &dk_arg->dkc; 4689 vdc = dk_arg->vdc; 4690 ASSERT(vdc != NULL); 4691 4692 rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, 4693 VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 4694 if (rv != 0) { 4695 DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n", 4696 vdc->instance, rv, 4697 ddi_model_convert_from(dk_arg->mode & FMODELS)); 4698 } 4699 4700 /* 4701 * Trigger the call back to notify the caller the the ioctl call has 4702 * been completed. 4703 */ 4704 if ((dk_arg->mode & FKIOCTL) && 4705 (dkc != NULL) && 4706 (dkc->dkc_callback != NULL)) { 4707 ASSERT(dkc->dkc_cookie != NULL); 4708 (*dkc->dkc_callback)(dkc->dkc_cookie, rv); 4709 } 4710 4711 /* Indicate that one less DKIO write flush is outstanding */ 4712 mutex_enter(&vdc->lock); 4713 vdc->dkio_flush_pending--; 4714 ASSERT(vdc->dkio_flush_pending >= 0); 4715 mutex_exit(&vdc->lock); 4716 4717 /* free the mem that was allocated when the callback was dispatched */ 4718 kmem_free(arg, sizeof (vdc_dk_arg_t)); 4719 } 4720 4721 /* 4722 * Function: 4723 * vdc_dkio_get_partition() 4724 * 4725 * Description: 4726 * This function implements the DKIOCGAPART ioctl. 
4727 * 4728 * Arguments: 4729 * vdc - soft state pointer 4730 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 4731 * flag - ioctl flags 4732 */ 4733 static int 4734 vdc_dkio_get_partition(vdc_t *vdc, caddr_t arg, int flag) 4735 { 4736 struct dk_geom *geom; 4737 struct vtoc *vtoc; 4738 union { 4739 struct dk_map map[NDKMAP]; 4740 struct dk_map32 map32[NDKMAP]; 4741 } data; 4742 int i, rv, size; 4743 4744 mutex_enter(&vdc->lock); 4745 4746 if ((rv = vdc_validate_geometry(vdc)) != 0) { 4747 mutex_exit(&vdc->lock); 4748 return (rv); 4749 } 4750 4751 vtoc = vdc->vtoc; 4752 geom = vdc->geom; 4753 4754 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 4755 4756 for (i = 0; i < vtoc->v_nparts; i++) { 4757 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 4758 (geom->dkg_nhead * geom->dkg_nsect); 4759 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 4760 } 4761 size = NDKMAP * sizeof (struct dk_map32); 4762 4763 } else { 4764 4765 for (i = 0; i < vtoc->v_nparts; i++) { 4766 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 4767 (geom->dkg_nhead * geom->dkg_nsect); 4768 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 4769 } 4770 size = NDKMAP * sizeof (struct dk_map); 4771 4772 } 4773 4774 mutex_exit(&vdc->lock); 4775 4776 if (ddi_copyout(&data, arg, size, flag) != 0) 4777 return (EFAULT); 4778 4779 return (0); 4780 } 4781 4782 /* 4783 * Function: 4784 * vdc_dioctl_rwcmd() 4785 * 4786 * Description: 4787 * This function implements the DIOCTL_RWCMD ioctl. This ioctl is used 4788 * for DKC_DIRECT disks to read or write at an absolute disk offset. 
4789 * 4790 * Arguments: 4791 * dev - device 4792 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 4793 * flag - ioctl flags 4794 */ 4795 static int 4796 vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag) 4797 { 4798 struct dadkio_rwcmd32 rwcmd32; 4799 struct dadkio_rwcmd rwcmd; 4800 struct iovec aiov; 4801 struct uio auio; 4802 int rw, status; 4803 struct buf *buf; 4804 4805 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 4806 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 4807 sizeof (struct dadkio_rwcmd32), flag)) { 4808 return (EFAULT); 4809 } 4810 rwcmd.cmd = rwcmd32.cmd; 4811 rwcmd.flags = rwcmd32.flags; 4812 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 4813 rwcmd.buflen = rwcmd32.buflen; 4814 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 4815 } else { 4816 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 4817 sizeof (struct dadkio_rwcmd), flag)) { 4818 return (EFAULT); 4819 } 4820 } 4821 4822 switch (rwcmd.cmd) { 4823 case DADKIO_RWCMD_READ: 4824 rw = B_READ; 4825 break; 4826 case DADKIO_RWCMD_WRITE: 4827 rw = B_WRITE; 4828 break; 4829 default: 4830 return (EINVAL); 4831 } 4832 4833 bzero((caddr_t)&aiov, sizeof (struct iovec)); 4834 aiov.iov_base = rwcmd.bufaddr; 4835 aiov.iov_len = rwcmd.buflen; 4836 4837 bzero((caddr_t)&auio, sizeof (struct uio)); 4838 auio.uio_iov = &aiov; 4839 auio.uio_iovcnt = 1; 4840 auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE; 4841 auio.uio_resid = rwcmd.buflen; 4842 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 4843 4844 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 4845 bioinit(buf); 4846 /* 4847 * We use the private field of buf to specify that this is an 4848 * I/O using an absolute offset. 4849 */ 4850 buf->b_private = (void *)VD_SLICE_NONE; 4851 4852 status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio); 4853 4854 biofini(buf); 4855 kmem_free(buf, sizeof (buf_t)); 4856 4857 return (status); 4858 } 4859 4860 /* 4861 * Allocate a buffer for a VD_OP_SCSICMD operation. 
The size of the allocated
 * buffer is returned in alloc_len.
 *
 * The buffer is made of the vd_scsi_t header followed by the cdb, sense,
 * data-in and data-out areas, each one rounded up to a multiple of
 * sizeof (uint64_t).  It is returned zero-filled, with the requested
 * (unrounded) lengths recorded in the header.
 */
static vd_scsi_t *
vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len,
    int *alloc_len)
{
	vd_scsi_t	*req;
	int		total = VD_SCSI_SIZE;

	/* add up the four 64-bit aligned areas following the header */
	total += P2ROUNDUP(cdb_len, sizeof (uint64_t));
	total += P2ROUNDUP(sense_len, sizeof (uint64_t));
	total += P2ROUNDUP(datain_len, sizeof (uint64_t));
	total += P2ROUNDUP(dataout_len, sizeof (uint64_t));

	ASSERT(total % sizeof (uint64_t) == 0);

	req = kmem_zalloc(total, KM_SLEEP);

	req->cdb_len = cdb_len;
	req->sense_len = sense_len;
	req->datain_len = datain_len;
	req->dataout_len = dataout_len;

	/* the caller needs the size to release the buffer with kmem_free() */
	*alloc_len = total;

	return (req);
}

/*
 * Convert the status of a SCSI command to a Solaris return code.
 *
 * Arguments:
 *	vd_scsi		- The SCSI operation buffer.
 *	log_error	- indicate if an error message should be logged.
 *
 * Note that our SCSI error messages are rather primitive for the moment
 * and could be improved by decoding some data like the SCSI command and
 * the sense key.
 *
 * Return value:
 *	0		- Status is good.
 *	EACCES		- Status reports a reservation conflict.
 *	ENOTSUP		- Status reports a check condition and sense key
 *			  reports an illegal request.
 *	EIO		- Any other status.
4907 */ 4908 static int 4909 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error) 4910 { 4911 int rv; 4912 char path_str[MAXPATHLEN]; 4913 char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN]; 4914 union scsi_cdb *cdb; 4915 struct scsi_extended_sense *sense; 4916 4917 if (vd_scsi->cmd_status == STATUS_GOOD) 4918 /* no error */ 4919 return (0); 4920 4921 /* when the tunable vdc_scsi_log_error is true we log all errors */ 4922 if (vdc_scsi_log_error) 4923 log_error = B_TRUE; 4924 4925 if (log_error) { 4926 cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x)\n", 4927 ddi_pathname(vdc->dip, path_str), vdc->instance, 4928 GETCMD(VD_SCSI_DATA_CDB(vd_scsi))); 4929 } 4930 4931 /* default returned value */ 4932 rv = EIO; 4933 4934 switch (vd_scsi->cmd_status) { 4935 4936 case STATUS_CHECK: 4937 case STATUS_TERMINATED: 4938 if (log_error) 4939 cmn_err(CE_CONT, "\tCheck Condition Error\n"); 4940 4941 /* check sense buffer */ 4942 if (vd_scsi->sense_len == 0 || 4943 vd_scsi->sense_status != STATUS_GOOD) { 4944 if (log_error) 4945 cmn_err(CE_CONT, "\tNo Sense Data Available\n"); 4946 break; 4947 } 4948 4949 sense = VD_SCSI_DATA_SENSE(vd_scsi); 4950 4951 if (log_error) { 4952 cmn_err(CE_CONT, "\tSense Key: 0x%x\n" 4953 "\tASC: 0x%x, ASCQ: 0x%x\n", 4954 scsi_sense_key((uint8_t *)sense), 4955 scsi_sense_asc((uint8_t *)sense), 4956 scsi_sense_ascq((uint8_t *)sense)); 4957 } 4958 4959 if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST) 4960 rv = ENOTSUP; 4961 break; 4962 4963 case STATUS_BUSY: 4964 if (log_error) 4965 cmn_err(CE_NOTE, "\tDevice Busy\n"); 4966 break; 4967 4968 case STATUS_RESERVATION_CONFLICT: 4969 /* 4970 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then 4971 * reservation conflict could be due to various reasons like 4972 * incorrect keys, not registered or not reserved etc. So, 4973 * we should not panic in that case. 
4974 */ 4975 cdb = VD_SCSI_DATA_CDB(vd_scsi); 4976 if (vdc->failfast_interval != 0 && 4977 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN && 4978 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) { 4979 /* failfast is enabled so we have to panic */ 4980 (void) snprintf(panic_str, sizeof (panic_str), 4981 VDC_RESV_CONFLICT_FMT_STR "%s", 4982 ddi_pathname(vdc->dip, path_str)); 4983 panic(panic_str); 4984 } 4985 if (log_error) 4986 cmn_err(CE_NOTE, "\tReservation Conflict\n"); 4987 rv = EACCES; 4988 break; 4989 4990 case STATUS_QFULL: 4991 if (log_error) 4992 cmn_err(CE_NOTE, "\tQueue Full\n"); 4993 break; 4994 4995 case STATUS_MET: 4996 case STATUS_INTERMEDIATE: 4997 case STATUS_SCSI2: 4998 case STATUS_INTERMEDIATE_MET: 4999 case STATUS_ACA_ACTIVE: 5000 if (log_error) 5001 cmn_err(CE_CONT, 5002 "\tUnexpected SCSI status received: 0x%x\n", 5003 vd_scsi->cmd_status); 5004 break; 5005 5006 default: 5007 if (log_error) 5008 cmn_err(CE_CONT, 5009 "\tInvalid SCSI status received: 0x%x\n", 5010 vd_scsi->cmd_status); 5011 break; 5012 } 5013 5014 return (rv); 5015 } 5016 5017 /* 5018 * Implemented the USCSICMD uscsi(7I) ioctl. This ioctl is converted to 5019 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI 5020 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is 5021 * converted to a VD_OP_RESET operation. 
5022 */ 5023 static int 5024 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) 5025 { 5026 struct uscsi_cmd uscsi; 5027 struct uscsi_cmd32 uscsi32; 5028 vd_scsi_t *vd_scsi; 5029 int vd_scsi_len; 5030 union scsi_cdb *cdb; 5031 struct scsi_extended_sense *sense; 5032 char *datain, *dataout; 5033 size_t cdb_len, datain_len, dataout_len, sense_len; 5034 int rv; 5035 5036 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5037 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), 5038 mode) != 0) 5039 return (EFAULT); 5040 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); 5041 } else { 5042 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), 5043 mode) != 0) 5044 return (EFAULT); 5045 } 5046 5047 /* a uscsi reset is converted to a VD_OP_RESET operation */ 5048 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | 5049 USCSI_RESET_ALL)) { 5050 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, 5051 (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); 5052 return (rv); 5053 } 5054 5055 /* cdb buffer length */ 5056 cdb_len = uscsi.uscsi_cdblen; 5057 5058 /* data in and out buffers length */ 5059 if (uscsi.uscsi_flags & USCSI_READ) { 5060 datain_len = uscsi.uscsi_buflen; 5061 dataout_len = 0; 5062 } else { 5063 datain_len = 0; 5064 dataout_len = uscsi.uscsi_buflen; 5065 } 5066 5067 /* sense buffer length */ 5068 if (uscsi.uscsi_flags & USCSI_RQENABLE) 5069 sense_len = uscsi.uscsi_rqlen; 5070 else 5071 sense_len = 0; 5072 5073 /* allocate buffer for the VD_SCSICMD_OP operation */ 5074 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5075 &vd_scsi_len); 5076 5077 /* 5078 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, 5079 * but basically they prevent a SCSI command from being retried in case 5080 * of an error. 
5081 */ 5082 if ((uscsi.uscsi_flags & USCSI_ISOLATE) || 5083 (uscsi.uscsi_flags & USCSI_DIAGNOSE)) 5084 vd_scsi->options |= VD_SCSI_OPT_NORETRY; 5085 5086 /* set task attribute */ 5087 if (uscsi.uscsi_flags & USCSI_NOTAG) { 5088 vd_scsi->task_attribute = 0; 5089 } else { 5090 if (uscsi.uscsi_flags & USCSI_HEAD) 5091 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5092 else if (uscsi.uscsi_flags & USCSI_HTAG) 5093 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; 5094 else if (uscsi.uscsi_flags & USCSI_OTAG) 5095 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; 5096 else 5097 vd_scsi->task_attribute = 0; 5098 } 5099 5100 /* set timeout */ 5101 vd_scsi->timeout = uscsi.uscsi_timeout; 5102 5103 /* copy-in cdb data */ 5104 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5105 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { 5106 rv = EFAULT; 5107 goto done; 5108 } 5109 5110 /* keep a pointer to the sense buffer */ 5111 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5112 5113 /* keep a pointer to the data-in buffer */ 5114 datain = (char *)VD_SCSI_DATA_IN(vd_scsi); 5115 5116 /* copy-in request data to the data-out buffer */ 5117 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); 5118 if (!(uscsi.uscsi_flags & USCSI_READ)) { 5119 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, 5120 mode)) { 5121 rv = EFAULT; 5122 goto done; 5123 } 5124 } 5125 5126 /* submit the request */ 5127 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5128 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5129 5130 if (rv != 0) 5131 goto done; 5132 5133 /* update scsi status */ 5134 uscsi.uscsi_status = vd_scsi->cmd_status; 5135 5136 /* update sense data */ 5137 if ((uscsi.uscsi_flags & USCSI_RQENABLE) && 5138 (uscsi.uscsi_status == STATUS_CHECK || 5139 uscsi.uscsi_status == STATUS_TERMINATED)) { 5140 5141 uscsi.uscsi_rqstatus = vd_scsi->sense_status; 5142 5143 if (uscsi.uscsi_rqstatus == STATUS_GOOD) { 5144 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - 5145 vd_scsi->sense_len; 5146 if 
(ddi_copyout(sense, uscsi.uscsi_rqbuf, 5147 vd_scsi->sense_len, mode) != 0) { 5148 rv = EFAULT; 5149 goto done; 5150 } 5151 } 5152 } 5153 5154 /* update request data */ 5155 if (uscsi.uscsi_status == STATUS_GOOD) { 5156 if (uscsi.uscsi_flags & USCSI_READ) { 5157 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5158 vd_scsi->datain_len; 5159 if (ddi_copyout(datain, uscsi.uscsi_bufaddr, 5160 vd_scsi->datain_len, mode) != 0) { 5161 rv = EFAULT; 5162 goto done; 5163 } 5164 } else { 5165 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5166 vd_scsi->dataout_len; 5167 } 5168 } 5169 5170 /* copy-out result */ 5171 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5172 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); 5173 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), 5174 mode) != 0) { 5175 rv = EFAULT; 5176 goto done; 5177 } 5178 } else { 5179 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), 5180 mode) != 0) { 5181 rv = EFAULT; 5182 goto done; 5183 } 5184 } 5185 5186 /* get the return code from the SCSI command status */ 5187 rv = vdc_scsi_status(vdc, vd_scsi, 5188 !(uscsi.uscsi_flags & USCSI_SILENT)); 5189 5190 done: 5191 kmem_free(vd_scsi, vd_scsi_len); 5192 return (rv); 5193 } 5194 5195 /* 5196 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. 5197 * 5198 * Arguments: 5199 * cmd - SCSI PERSISTENT IN command 5200 * len - length of the SCSI input buffer 5201 * vd_scsi_len - return the length of the allocated buffer 5202 * 5203 * Returned Value: 5204 * a pointer to the allocated VD_OP_SCSICMD buffer. 
5205 */ 5206 static vd_scsi_t * 5207 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len) 5208 { 5209 int cdb_len, sense_len, datain_len, dataout_len; 5210 vd_scsi_t *vd_scsi; 5211 union scsi_cdb *cdb; 5212 5213 cdb_len = CDB_GROUP1; 5214 sense_len = sizeof (struct scsi_extended_sense); 5215 datain_len = len; 5216 dataout_len = 0; 5217 5218 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5219 vd_scsi_len); 5220 5221 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5222 5223 /* set cdb */ 5224 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN; 5225 cdb->cdb_opaque[1] = cmd; 5226 FORMG1COUNT(cdb, datain_len); 5227 5228 vd_scsi->timeout = vdc_scsi_timeout; 5229 5230 return (vd_scsi); 5231 } 5232 5233 /* 5234 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command. 5235 * 5236 * Arguments: 5237 * cmd - SCSI PERSISTENT OUT command 5238 * len - length of the SCSI output buffer 5239 * vd_scsi_len - return the length of the allocated buffer 5240 * 5241 * Returned Code: 5242 * a pointer to the allocated VD_OP_SCSICMD buffer. 5243 */ 5244 static vd_scsi_t * 5245 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len) 5246 { 5247 int cdb_len, sense_len, datain_len, dataout_len; 5248 vd_scsi_t *vd_scsi; 5249 union scsi_cdb *cdb; 5250 5251 cdb_len = CDB_GROUP1; 5252 sense_len = sizeof (struct scsi_extended_sense); 5253 datain_len = 0; 5254 dataout_len = len; 5255 5256 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5257 vd_scsi_len); 5258 5259 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5260 5261 /* set cdb */ 5262 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT; 5263 cdb->cdb_opaque[1] = cmd; 5264 FORMG1COUNT(cdb, dataout_len); 5265 5266 vd_scsi->timeout = vdc_scsi_timeout; 5267 5268 return (vd_scsi); 5269 } 5270 5271 /* 5272 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted 5273 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk 5274 * server with a VD_OP_SCSICMD operation. 
5275 */ 5276 static int 5277 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5278 { 5279 vd_scsi_t *vd_scsi; 5280 mhioc_inkeys_t inkeys; 5281 mhioc_key_list_t klist; 5282 struct mhioc_inkeys32 inkeys32; 5283 struct mhioc_key_list32 klist32; 5284 sd_prin_readkeys_t *scsi_keys; 5285 void *user_keys; 5286 int vd_scsi_len; 5287 int listsize, listlen, rv; 5288 5289 /* copyin arguments */ 5290 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5291 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5292 if (rv != 0) 5293 return (EFAULT); 5294 5295 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5296 sizeof (klist32), mode); 5297 if (rv != 0) 5298 return (EFAULT); 5299 5300 listsize = klist32.listsize; 5301 } else { 5302 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5303 if (rv != 0) 5304 return (EFAULT); 5305 5306 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5307 if (rv != 0) 5308 return (EFAULT); 5309 5310 listsize = klist.listsize; 5311 } 5312 5313 /* build SCSI VD_OP request */ 5314 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5315 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5316 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5317 5318 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5319 5320 /* submit the request */ 5321 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5322 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5323 5324 if (rv != 0) 5325 goto done; 5326 5327 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5328 5329 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5330 inkeys32.generation = scsi_keys->generation; 5331 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5332 if (rv != 0) { 5333 rv = EFAULT; 5334 goto done; 5335 } 5336 5337 klist32.listlen = listlen; 5338 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5339 sizeof (klist32), mode); 5340 if (rv != 0) { 5341 rv = EFAULT; 5342 goto done; 
5343 } 5344 5345 user_keys = (caddr_t)(uintptr_t)klist32.list; 5346 } else { 5347 inkeys.generation = scsi_keys->generation; 5348 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5349 if (rv != 0) { 5350 rv = EFAULT; 5351 goto done; 5352 } 5353 5354 klist.listlen = listlen; 5355 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5356 if (rv != 0) { 5357 rv = EFAULT; 5358 goto done; 5359 } 5360 5361 user_keys = klist.list; 5362 } 5363 5364 /* copy out keys */ 5365 if (listlen > 0 && listsize > 0) { 5366 if (listsize < listlen) 5367 listlen = listsize; 5368 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5369 listlen * MHIOC_RESV_KEY_SIZE, mode); 5370 if (rv != 0) 5371 rv = EFAULT; 5372 } 5373 5374 if (rv == 0) 5375 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5376 5377 done: 5378 kmem_free(vd_scsi, vd_scsi_len); 5379 5380 return (rv); 5381 } 5382 5383 /* 5384 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5385 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5386 * the vdisk server with a VD_OP_SCSICMD operation. 
5387 */ 5388 static int 5389 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5390 { 5391 vd_scsi_t *vd_scsi; 5392 mhioc_inresvs_t inresv; 5393 mhioc_resv_desc_list_t rlist; 5394 struct mhioc_inresvs32 inresv32; 5395 struct mhioc_resv_desc_list32 rlist32; 5396 mhioc_resv_desc_t mhd_resv; 5397 sd_prin_readresv_t *scsi_resv; 5398 sd_readresv_desc_t *resv; 5399 mhioc_resv_desc_t *user_resv; 5400 int vd_scsi_len; 5401 int listsize, listlen, i, rv; 5402 5403 /* copyin arguments */ 5404 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5405 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5406 if (rv != 0) 5407 return (EFAULT); 5408 5409 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5410 sizeof (rlist32), mode); 5411 if (rv != 0) 5412 return (EFAULT); 5413 5414 listsize = rlist32.listsize; 5415 } else { 5416 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5417 if (rv != 0) 5418 return (EFAULT); 5419 5420 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5421 if (rv != 0) 5422 return (EFAULT); 5423 5424 listsize = rlist.listsize; 5425 } 5426 5427 /* build SCSI VD_OP request */ 5428 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5429 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5430 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5431 5432 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5433 5434 /* submit the request */ 5435 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5436 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5437 5438 if (rv != 0) 5439 goto done; 5440 5441 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5442 5443 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5444 inresv32.generation = scsi_resv->generation; 5445 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5446 if (rv != 0) { 5447 rv = EFAULT; 5448 goto done; 5449 } 5450 5451 rlist32.listlen = listlen; 5452 rv = ddi_copyout(&rlist32, 
(caddr_t)(uintptr_t)inresv32.li, 5453 sizeof (rlist32), mode); 5454 if (rv != 0) { 5455 rv = EFAULT; 5456 goto done; 5457 } 5458 5459 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 5460 } else { 5461 inresv.generation = scsi_resv->generation; 5462 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 5463 if (rv != 0) { 5464 rv = EFAULT; 5465 goto done; 5466 } 5467 5468 rlist.listlen = listlen; 5469 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 5470 if (rv != 0) { 5471 rv = EFAULT; 5472 goto done; 5473 } 5474 5475 user_resv = rlist.list; 5476 } 5477 5478 /* copy out reservations */ 5479 if (listsize > 0 && listlen > 0) { 5480 if (listsize < listlen) 5481 listlen = listsize; 5482 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 5483 5484 for (i = 0; i < listlen; i++) { 5485 mhd_resv.type = resv->type; 5486 mhd_resv.scope = resv->scope; 5487 mhd_resv.scope_specific_addr = 5488 BE_32(resv->scope_specific_addr); 5489 bcopy(&resv->resvkey, &mhd_resv.key, 5490 MHIOC_RESV_KEY_SIZE); 5491 5492 rv = ddi_copyout(&mhd_resv, user_resv, 5493 sizeof (mhd_resv), mode); 5494 if (rv != 0) { 5495 rv = EFAULT; 5496 goto done; 5497 } 5498 resv++; 5499 user_resv++; 5500 } 5501 } 5502 5503 if (rv == 0) 5504 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5505 5506 done: 5507 kmem_free(vd_scsi, vd_scsi_len); 5508 return (rv); 5509 } 5510 5511 /* 5512 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 5513 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 5514 * server with a VD_OP_SCSICMD operation. 
5515 */ 5516 static int 5517 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 5518 { 5519 vd_scsi_t *vd_scsi; 5520 sd_prout_t *scsi_prout; 5521 mhioc_register_t mhd_reg; 5522 int vd_scsi_len, rv; 5523 5524 /* copyin arguments */ 5525 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 5526 if (rv != 0) 5527 return (EFAULT); 5528 5529 /* build SCSI VD_OP request */ 5530 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 5531 sizeof (sd_prout_t), &vd_scsi_len); 5532 5533 /* set parameters */ 5534 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5535 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 5536 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 5537 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 5538 5539 /* submit the request */ 5540 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5541 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5542 5543 if (rv == 0) 5544 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5545 5546 kmem_free(vd_scsi, vd_scsi_len); 5547 return (rv); 5548 } 5549 5550 /* 5551 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 5552 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 5553 * server with a VD_OP_SCSICMD operation. 
5554 */ 5555 static int 5556 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 5557 { 5558 union scsi_cdb *cdb; 5559 vd_scsi_t *vd_scsi; 5560 sd_prout_t *scsi_prout; 5561 mhioc_resv_desc_t mhd_resv; 5562 int vd_scsi_len, rv; 5563 5564 /* copyin arguments */ 5565 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 5566 if (rv != 0) 5567 return (EFAULT); 5568 5569 /* build SCSI VD_OP request */ 5570 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 5571 sizeof (sd_prout_t), &vd_scsi_len); 5572 5573 /* set parameters */ 5574 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5575 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5576 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 5577 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 5578 cdb->cdb_opaque[2] = mhd_resv.type; 5579 5580 /* submit the request */ 5581 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5582 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5583 5584 if (rv == 0) 5585 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5586 5587 kmem_free(vd_scsi, vd_scsi_len); 5588 return (rv); 5589 } 5590 5591 /* 5592 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 5593 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 5594 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
5595 */ 5596 static int 5597 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode) 5598 { 5599 union scsi_cdb *cdb; 5600 vd_scsi_t *vd_scsi; 5601 sd_prout_t *scsi_prout; 5602 mhioc_preemptandabort_t mhd_preempt; 5603 int vd_scsi_len, rv; 5604 5605 /* copyin arguments */ 5606 rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode); 5607 if (rv != 0) 5608 return (EFAULT); 5609 5610 /* build SCSI VD_OP request */ 5611 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT, 5612 sizeof (sd_prout_t), &vd_scsi_len); 5613 5614 /* set parameters */ 5615 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5616 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5617 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5618 bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key, 5619 MHIOC_RESV_KEY_SIZE); 5620 bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key, 5621 MHIOC_RESV_KEY_SIZE); 5622 scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr; 5623 cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type; 5624 5625 /* submit the request */ 5626 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5627 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5628 5629 if (rv == 0) 5630 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5631 5632 kmem_free(vd_scsi, vd_scsi_len); 5633 return (rv); 5634 } 5635 5636 /* 5637 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl 5638 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY 5639 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation. 
5640 */ 5641 static int 5642 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode) 5643 { 5644 vd_scsi_t *vd_scsi; 5645 sd_prout_t *scsi_prout; 5646 mhioc_registerandignorekey_t mhd_regi; 5647 int vd_scsi_len, rv; 5648 5649 /* copyin arguments */ 5650 rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode); 5651 if (rv != 0) 5652 return (EFAULT); 5653 5654 /* build SCSI VD_OP request */ 5655 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY, 5656 sizeof (sd_prout_t), &vd_scsi_len); 5657 5658 /* set parameters */ 5659 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5660 bcopy(mhd_regi.newkey.key, scsi_prout->service_key, 5661 MHIOC_RESV_KEY_SIZE); 5662 scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl; 5663 5664 /* submit the request */ 5665 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5666 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5667 5668 if (rv == 0) 5669 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5670 5671 kmem_free(vd_scsi, vd_scsi_len); 5672 return (rv); 5673 } 5674 5675 /* 5676 * This function is used by the failfast mechanism to send a SCSI command 5677 * to check for reservation conflict. 5678 */ 5679 static int 5680 vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) 5681 { 5682 int cdb_len, sense_len, vd_scsi_len; 5683 vd_scsi_t *vd_scsi; 5684 union scsi_cdb *cdb; 5685 int rv; 5686 5687 ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1); 5688 5689 if (scmd == SCMD_WRITE_G1) 5690 cdb_len = CDB_GROUP1; 5691 else 5692 cdb_len = CDB_GROUP0; 5693 5694 sense_len = sizeof (struct scsi_extended_sense); 5695 5696 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len); 5697 5698 /* set cdb */ 5699 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5700 cdb->scc_cmd = scmd; 5701 5702 vd_scsi->timeout = vdc_scsi_timeout; 5703 5704 /* 5705 * Submit the request. 
The last argument has to be B_FALSE so that 5706 * vdc_do_sync_op does not loop checking for reservation conflict if 5707 * the operation returns an error. 5708 */ 5709 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5710 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE); 5711 5712 if (rv == 0) 5713 (void) vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5714 5715 kmem_free(vd_scsi, vd_scsi_len); 5716 return (rv); 5717 } 5718 5719 /* 5720 * This function is used by the failfast mechanism to check for reservation 5721 * conflict. It sends some SCSI commands which will fail with a reservation 5722 * conflict error if the system does not have access to the disk and this 5723 * will panic the system. 5724 * 5725 * Returned Code: 5726 * 0 - disk is accessible without reservation conflict error 5727 * != 0 - unable to check if disk is accessible 5728 */ 5729 int 5730 vdc_failfast_check_resv(vdc_t *vdc) 5731 { 5732 int failure = 0; 5733 5734 /* 5735 * Send a TEST UNIT READY command. The command will panic 5736 * the system if it fails with a reservation conflict. 5737 */ 5738 if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0) 5739 failure++; 5740 5741 /* 5742 * With SPC-3 compliant devices TEST UNIT READY will succeed on 5743 * a reserved device, so we also do a WRITE(10) of zero byte in 5744 * order to provoke a Reservation Conflict status on those newer 5745 * devices. 5746 */ 5747 if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0) 5748 failure++; 5749 5750 return (failure); 5751 } 5752 5753 /* 5754 * Add a pending I/O to the failfast I/O queue. An I/O is added to this 5755 * queue when it has failed and failfast is enabled. Then we have to check 5756 * if it has failed because of a reservation conflict in which case we have 5757 * to panic the system. 5758 * 5759 * Async I/O should be queued with their block I/O data transfer structure 5760 * (buf). Sync I/O should be queued with buf = NULL. 
/*
 * Add a pending I/O to the failfast I/O queue. An I/O is added to this
 * queue when it has failed and failfast is enabled. Then we have to check
 * if it has failed because of a reservation conflict in which case we have
 * to panic the system.
 *
 * Async I/O should be queued with their block I/O data transfer structure
 * (buf). Sync I/O should be queued with buf = NULL.
 *
 * The caller must hold vdc->lock. The new entry is inserted at the head of
 * the queue and stamped with the current lbolt value, and the failfast
 * thread is signaled. The newly allocated entry is returned.
 */
static vdc_io_t *
vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf)
{
	vdc_io_t *vio;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP);
	vio->vio_next = vdc->failfast_io_queue;
	vio->vio_buf = buf;
	vio->vio_qtime = ddi_get_lbolt();

	vdc->failfast_io_queue = vio;

	/* notify the failfast thread that a new I/O is queued */
	cv_signal(&vdc->failfast_cv);

	return (vio);
}

/*
 * Remove and complete the I/Os in the failfast I/O queue which were queued
 * at or before the indicated deadline. I/Os queued after the deadline are
 * left in the queue. A deadline of 0 means that all I/O have to be unqueued
 * and marked as completed.
 *
 * The caller must hold vdc->lock.
 */
static void
vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline)
{
	vdc_io_t *vio, *vio_tmp;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vio_tmp = NULL;
	vio = vdc->failfast_io_queue;

	if (deadline != 0) {
		/*
		 * Skip any io queued after the deadline. The failfast
		 * I/O queue is ordered starting with the last I/O added
		 * to the queue.
		 */
		while (vio != NULL && vio->vio_qtime > deadline) {
			vio_tmp = vio;
			vio = vio->vio_next;
		}
	}

	if (vio == NULL)
		/* nothing to unqueue */
		return;

	/*
	 * update the queue: vio_tmp is the last entry to keep (NULL if
	 * every entry is being unqueued), so the list is cut after it.
	 */
	if (vio_tmp == NULL)
		vdc->failfast_io_queue = NULL;
	else
		vio_tmp->vio_next = NULL;

	/*
	 * Complete unqueued I/O. Async I/O have a block I/O data transfer
	 * structure (buf) and they are completed by calling biodone(). Sync
	 * I/O do not have a buf and they are completed by setting the
	 * vio_qtime to zero and signaling failfast_io_cv. In that case, the
	 * thread waiting for the I/O to complete is responsible for freeing
	 * the vio structure.
	 */
	while (vio != NULL) {
		vio_tmp = vio->vio_next;
		if (vio->vio_buf != NULL) {
			biodone(vio->vio_buf);
			kmem_free(vio, sizeof (vdc_io_t));
		} else {
			vio->vio_qtime = 0;
		}
		vio = vio_tmp;
	}

	cv_broadcast(&vdc->failfast_io_cv);
}

/*
 * Failfast Thread.
 *
 * While failfast is enabled, the failfast thread sends a TEST UNIT READY
 * and a zero size WRITE(10) SCSI commands on a regular basis to check that
 * we still have access to the disk. If a command fails with a RESERVATION
 * CONFLICT error then the system will immediately panic.
 *
 * The failfast thread is also woken up when an I/O has failed. It then checks
 * the access to the disk to ensure that the I/O failure was not due to a
 * reservation conflict.
 *
 * There is one failfast thread for each virtual disk for which failfast is
 * enabled. We could have only one thread sending requests for all disks but
 * this would need vdc to send asynchronous requests and to have callbacks to
 * process replies.
 *
 * The thread runs until failfast_interval is set to zero; it then completes
 * all the queued I/Os, clears vdc->failfast_thread and exits.
 */
static void
vdc_failfast_thread(void *arg)
{
	int		status;
	vdc_t		*vdc = (vdc_t *)arg;
	clock_t		timeout, starttime;

	mutex_enter(&vdc->lock);

	while (vdc->failfast_interval != 0) {

		/*
		 * Remember when this check started: only the I/Os queued
		 * up to this point can be vouched for by this check.
		 */
		starttime = ddi_get_lbolt();

		mutex_exit(&vdc->lock);

		/* check for reservation conflict */
		status = vdc_failfast_check_resv(vdc);

		mutex_enter(&vdc->lock);
		/*
		 * We have dropped the lock to send the SCSI command so we have
		 * to check that failfast is still enabled.
		 */
		if (vdc->failfast_interval == 0)
			break;

		/*
		 * If we have successfully checked the disk access and there
		 * was no reservation conflict then we can complete any I/O
		 * queued before the last check.
		 */
		if (status == 0)
			vdc_failfast_io_unqueue(vdc, starttime);

		/* proceed again if some I/O are still in the queue */
		if (vdc->failfast_io_queue != NULL)
			continue;

		timeout = ddi_get_lbolt() +
		    drv_usectohz(vdc->failfast_interval);
		(void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout);
	}

	/*
	 * Failfast is being stopped so we can complete any queued I/O.
	 */
	vdc_failfast_io_unqueue(vdc, 0);
	vdc->failfast_thread = NULL;
	mutex_exit(&vdc->lock);
	thread_exit();
}

/*
 * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
 *
 * The ioctl argument is copied in as an integer. A non-zero value starts
 * the failfast thread if it is not already running; the value multiplied
 * by 1000 becomes failfast_interval, which the failfast thread passes to
 * drv_usectohz(). A value of zero makes the failfast thread terminate.
 *
 * Return Code:
 *	0	- Success
 *	EFAULT	- the ioctl argument could not be copied in
 */
static int
vdc_failfast(vdc_t *vdc, caddr_t arg, int mode)
{
	unsigned int mh_time;

	if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode))
		return (EFAULT);

	mutex_enter(&vdc->lock);
	if (mh_time != 0 && vdc->failfast_thread == NULL) {
		vdc->failfast_thread = thread_create(NULL, 0,
		    vdc_failfast_thread, vdc, 0, &p0, TS_RUN,
		    v.v_maxsyspri - 2);
	}

	vdc->failfast_interval = mh_time * 1000;
	/* wake up the failfast thread so that it notices the new interval */
	cv_signal(&vdc->failfast_cv);
	mutex_exit(&vdc->lock);

	return (0);
}

/*
 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are
 * converted to VD_OP_SET_ACCESS operations.
 *
 * The flags (VD_ACCESS_SET_*) are sent as the 64-bit payload of the
 * operation. The value returned by vdc_do_sync_op() is returned unchanged.
 */
static int
vdc_access_set(vdc_t *vdc, uint64_t flags, int mode)
{
	int rv;

	/* submit ownership command request */
	rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags,
	    sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
	    VIO_both_dir, B_TRUE);

	return (rv);
}
5955 */ 5956 static int 5957 vdc_access_get(vdc_t *vdc, uint64_t *status, int mode) 5958 { 5959 int rv; 5960 5961 /* submit owership command request */ 5962 rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status, 5963 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 5964 VIO_both_dir, B_TRUE); 5965 5966 return (rv); 5967 } 5968 5969 /* 5970 * Disk Ownership Thread. 5971 * 5972 * When we have taken the ownership of a disk, this thread waits to be 5973 * notified when the LDC channel is reset so that it can recover the 5974 * ownership. 5975 * 5976 * Note that the thread handling the LDC reset (vdc_process_msg_thread()) 5977 * can not be used to do the ownership recovery because it has to be 5978 * running to handle the reply message to the ownership operation. 5979 */ 5980 static void 5981 vdc_ownership_thread(void *arg) 5982 { 5983 vdc_t *vdc = (vdc_t *)arg; 5984 clock_t timeout; 5985 uint64_t status; 5986 5987 mutex_enter(&vdc->ownership_lock); 5988 mutex_enter(&vdc->lock); 5989 5990 while (vdc->ownership & VDC_OWNERSHIP_WANTED) { 5991 5992 if ((vdc->ownership & VDC_OWNERSHIP_RESET) || 5993 !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) { 5994 /* 5995 * There was a reset so the ownership has been lost, 5996 * try to recover. We do this without using the preempt 5997 * option so that we don't steal the ownership from 5998 * someone who has preempted us. 
5999 */ 6000 DMSG(vdc, 0, "[%d] Ownership lost, recovering", 6001 vdc->instance); 6002 6003 vdc->ownership &= ~(VDC_OWNERSHIP_RESET | 6004 VDC_OWNERSHIP_GRANTED); 6005 6006 mutex_exit(&vdc->lock); 6007 6008 status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6009 VD_ACCESS_SET_PRESERVE, FKIOCTL); 6010 6011 mutex_enter(&vdc->lock); 6012 6013 if (status == 0) { 6014 DMSG(vdc, 0, "[%d] Ownership recovered", 6015 vdc->instance); 6016 vdc->ownership |= VDC_OWNERSHIP_GRANTED; 6017 } else { 6018 DMSG(vdc, 0, "[%d] Fail to recover ownership", 6019 vdc->instance); 6020 } 6021 6022 } 6023 6024 /* 6025 * If we have the ownership then we just wait for an event 6026 * to happen (LDC reset), otherwise we will retry to recover 6027 * after a delay. 6028 */ 6029 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) 6030 timeout = 0; 6031 else 6032 timeout = ddi_get_lbolt() + 6033 drv_usectohz(vdc_ownership_delay); 6034 6035 /* Release the ownership_lock and wait on the vdc lock */ 6036 mutex_exit(&vdc->ownership_lock); 6037 6038 if (timeout == 0) 6039 (void) cv_wait(&vdc->ownership_cv, &vdc->lock); 6040 else 6041 (void) cv_timedwait(&vdc->ownership_cv, 6042 &vdc->lock, timeout); 6043 6044 mutex_exit(&vdc->lock); 6045 6046 mutex_enter(&vdc->ownership_lock); 6047 mutex_enter(&vdc->lock); 6048 } 6049 6050 vdc->ownership_thread = NULL; 6051 mutex_exit(&vdc->lock); 6052 mutex_exit(&vdc->ownership_lock); 6053 6054 thread_exit(); 6055 } 6056 6057 static void 6058 vdc_ownership_update(vdc_t *vdc, int ownership_flags) 6059 { 6060 ASSERT(MUTEX_HELD(&vdc->ownership_lock)); 6061 6062 mutex_enter(&vdc->lock); 6063 vdc->ownership = ownership_flags; 6064 if ((vdc->ownership & VDC_OWNERSHIP_WANTED) && 6065 vdc->ownership_thread == NULL) { 6066 /* start ownership thread */ 6067 vdc->ownership_thread = thread_create(NULL, 0, 6068 vdc_ownership_thread, vdc, 0, &p0, TS_RUN, 6069 v.v_maxsyspri - 2); 6070 } else { 6071 /* notify the ownership thread */ 6072 cv_signal(&vdc->ownership_cv); 6073 } 6074 
mutex_exit(&vdc->lock); 6075 } 6076 6077 /* 6078 * Get the size and the block size of a virtual disk from the vdisk server. 6079 * We need to use this operation when the vdisk_size attribute was not 6080 * available during the handshake with the vdisk server. 6081 */ 6082 static int 6083 vdc_check_capacity(vdc_t *vdc) 6084 { 6085 int rv = 0; 6086 size_t alloc_len; 6087 vd_capacity_t *vd_cap; 6088 6089 if (vdc->vdisk_size != 0) 6090 return (0); 6091 6092 alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t)); 6093 6094 vd_cap = kmem_zalloc(alloc_len, KM_SLEEP); 6095 6096 rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len, 6097 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE); 6098 6099 if (rv == 0) { 6100 if (vd_cap->vdisk_block_size != vdc->block_size || 6101 vd_cap->vdisk_size == VD_SIZE_UNKNOWN || 6102 vd_cap->vdisk_size == 0) 6103 rv = EINVAL; 6104 else 6105 vdc->vdisk_size = vd_cap->vdisk_size; 6106 } 6107 6108 kmem_free(vd_cap, alloc_len); 6109 return (rv); 6110 } 6111 6112 /* 6113 * This structure is used in the DKIO(7I) array below. 
/*
 * This structure is used in the DKIO(7I) array below. Each entry associates
 * a Solaris ioctl with the vDisk operation it is converted to (0 when the
 * ioctl is not directly converted to a single VD operation), the size of the
 * buffer exchanged with the vDisk server and the function converting the
 * ioctl argument to and from the vDisk format.
 */
typedef struct vdc_dk_ioctl {
	uint8_t		op;		/* VD_OP_XXX value */
	int		cmd;		/* Solaris ioctl operation number */
	size_t		nbytes;		/* size of structure to be copied */

	/* function to convert between vDisk and Solaris structure formats */
	int	(*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg,
	    int mode, int dir);
} vdc_dk_ioctl_t;

/*
 * Subset of DKIO(7I) operations currently supported. vd_process_ioctl()
 * looks up the ioctl command in this array and rejects with ENOTSUP any
 * command which is not listed.
 */
static vdc_dk_ioctl_t	dk_ioctl[] = {
	{VD_OP_FLUSH,		DKIOCFLUSHWRITECACHE,	0,
		vdc_null_copy_func},
	{VD_OP_GET_WCE,		DKIOCGETWCE,		sizeof (int),
		vdc_get_wce_convert},
	{VD_OP_SET_WCE,		DKIOCSETWCE,		sizeof (int),
		vdc_set_wce_convert},
	{VD_OP_GET_VTOC,	DKIOCGVTOC,		sizeof (vd_vtoc_t),
		vdc_get_vtoc_convert},
	{VD_OP_SET_VTOC,	DKIOCSVTOC,		sizeof (vd_vtoc_t),
		vdc_set_vtoc_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCGGEOM,		sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCG_PHYGEOM,		sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCG_VIRTGEOM,	sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_SET_DISKGEOM,	DKIOCSGEOM,		sizeof (vd_geom_t),
		vdc_set_geom_convert},
	/* the size of the EFI ioctls depends on the ioctl argument */
	{VD_OP_GET_EFI,		DKIOCGETEFI,		0,
		vdc_get_efi_convert},
	{VD_OP_SET_EFI,		DKIOCSETEFI,		0,
		vdc_set_efi_convert},

	/* DIOCTL_RWCMD is converted to a read or a write */
	{0, DIOCTL_RWCMD,  sizeof (struct dadkio_rwcmd), NULL},

	/* mhd(7I) non-shared multihost disks ioctls */
	{0, MHIOCTKOWN,				0, vdc_null_copy_func},
	{0, MHIOCRELEASE,			0, vdc_null_copy_func},
	{0, MHIOCSTATUS,			0, vdc_null_copy_func},
	{0, MHIOCQRESERVE,			0, vdc_null_copy_func},

	/* mhd(7I) shared multihost disks ioctls */
	{0, MHIOCGRP_INKEYS,			0, vdc_null_copy_func},
	{0, MHIOCGRP_INRESV,			0, vdc_null_copy_func},
	{0, MHIOCGRP_REGISTER,			0, vdc_null_copy_func},
	{0, MHIOCGRP_RESERVE, 			0, vdc_null_copy_func},
	{0, MHIOCGRP_PREEMPTANDABORT,		0, vdc_null_copy_func},
	{0, MHIOCGRP_REGISTERANDIGNOREKEY,	0, vdc_null_copy_func},

	/* mhd(7I) failfast ioctl */
	{0, MHIOCENFAILFAST,			0, vdc_null_copy_func},

	/*
	 * These particular ioctls are not sent to the server - vdc fakes up
	 * the necessary info.
	 */
	{0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func},
	{0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func},
	{0, USCSICMD,	sizeof (struct uscsi_cmd), vdc_null_copy_func},
	{0, DKIOCGAPART, 0, vdc_null_copy_func },
	{0, DKIOCREMOVABLE, 0, vdc_null_copy_func},
	{0, CDROMREADOFFSET, 0, vdc_null_copy_func}
};

/*
 * This function handles ioctl requests from the vd_efi_alloc_and_read()
 * function and forwards them to the vdisk.
 *
 * The request is issued as a kernel ioctl (FKIOCTL) on the device built
 * with VD_MAKE_DEV(instance, 0). The value stored by vd_process_ioctl()
 * through its rvalp argument is discarded.
 */
static int
vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg)
{
	vdc_t *vdc = (vdc_t *)vdisk;
	dev_t dev;
	int rval;

	dev = makedevice(ddi_driver_major(vdc->dip),
	    VD_MAKE_DEV(vdc->instance, 0));

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval));
}
6215 * 6216 * Return Code: 6217 * 0 6218 * EFAULT 6219 * ENXIO 6220 * EIO 6221 * ENOTSUP 6222 */ 6223 static int 6224 vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp) 6225 { 6226 int instance = VDCUNIT(dev); 6227 vdc_t *vdc = NULL; 6228 int rv = -1; 6229 int idx = 0; /* index into dk_ioctl[] */ 6230 size_t len = 0; /* #bytes to send to vds */ 6231 size_t alloc_len = 0; /* #bytes to allocate mem for */ 6232 caddr_t mem_p = NULL; 6233 size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0])); 6234 vdc_dk_ioctl_t *iop; 6235 6236 vdc = ddi_get_soft_state(vdc_state, instance); 6237 if (vdc == NULL) { 6238 cmn_err(CE_NOTE, "![%d] Could not get soft state structure", 6239 instance); 6240 return (ENXIO); 6241 } 6242 6243 DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n", 6244 instance, cmd, dev, ddi_model_convert_from(mode & FMODELS)); 6245 6246 if (rvalp != NULL) { 6247 /* the return value of the ioctl is 0 by default */ 6248 *rvalp = 0; 6249 } 6250 6251 /* 6252 * Validate the ioctl operation to be performed. 6253 * 6254 * If we have looped through the array without finding a match then we 6255 * don't support this ioctl. 
6256 */ 6257 for (idx = 0; idx < nioctls; idx++) { 6258 if (cmd == dk_ioctl[idx].cmd) 6259 break; 6260 } 6261 6262 if (idx >= nioctls) { 6263 DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n", 6264 vdc->instance, cmd); 6265 return (ENOTSUP); 6266 } 6267 6268 iop = &(dk_ioctl[idx]); 6269 6270 if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) { 6271 /* size is not fixed for EFI ioctls, it depends on ioctl arg */ 6272 dk_efi_t dk_efi; 6273 6274 rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode); 6275 if (rv != 0) 6276 return (EFAULT); 6277 6278 len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length; 6279 } else { 6280 len = iop->nbytes; 6281 } 6282 6283 /* check if the ioctl is applicable */ 6284 switch (cmd) { 6285 case CDROMREADOFFSET: 6286 case DKIOCREMOVABLE: 6287 return (ENOTTY); 6288 6289 case USCSICMD: 6290 case MHIOCTKOWN: 6291 case MHIOCSTATUS: 6292 case MHIOCQRESERVE: 6293 case MHIOCRELEASE: 6294 case MHIOCGRP_INKEYS: 6295 case MHIOCGRP_INRESV: 6296 case MHIOCGRP_REGISTER: 6297 case MHIOCGRP_RESERVE: 6298 case MHIOCGRP_PREEMPTANDABORT: 6299 case MHIOCGRP_REGISTERANDIGNOREKEY: 6300 case MHIOCENFAILFAST: 6301 if (vdc->cinfo == NULL) 6302 return (ENXIO); 6303 if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS) 6304 return (ENOTTY); 6305 break; 6306 6307 case DIOCTL_RWCMD: 6308 if (vdc->cinfo == NULL) 6309 return (ENXIO); 6310 if (vdc->cinfo->dki_ctype != DKC_DIRECT) 6311 return (ENOTTY); 6312 break; 6313 6314 case DKIOCINFO: 6315 if (vdc->cinfo == NULL) 6316 return (ENXIO); 6317 break; 6318 6319 case DKIOCGMEDIAINFO: 6320 if (vdc->minfo == NULL) 6321 return (ENXIO); 6322 if (vdc_check_capacity(vdc) != 0) 6323 /* disk capacity is not available */ 6324 return (EIO); 6325 break; 6326 } 6327 6328 /* 6329 * Deal with ioctls which require a processing different than 6330 * converting ioctl arguments and sending a corresponding 6331 * VD operation. 
6332 */ 6333 switch (cmd) { 6334 6335 case USCSICMD: 6336 { 6337 return (vdc_uscsi_cmd(vdc, arg, mode)); 6338 } 6339 6340 case MHIOCTKOWN: 6341 { 6342 mutex_enter(&vdc->ownership_lock); 6343 /* 6344 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership 6345 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset 6346 * while we are processing the ioctl. 6347 */ 6348 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED); 6349 6350 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6351 VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode); 6352 if (rv == 0) { 6353 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED | 6354 VDC_OWNERSHIP_GRANTED); 6355 } else { 6356 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6357 } 6358 mutex_exit(&vdc->ownership_lock); 6359 return (rv); 6360 } 6361 6362 case MHIOCRELEASE: 6363 { 6364 mutex_enter(&vdc->ownership_lock); 6365 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode); 6366 if (rv == 0) { 6367 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6368 } 6369 mutex_exit(&vdc->ownership_lock); 6370 return (rv); 6371 } 6372 6373 case MHIOCSTATUS: 6374 { 6375 uint64_t status; 6376 6377 rv = vdc_access_get(vdc, &status, mode); 6378 if (rv == 0 && rvalp != NULL) 6379 *rvalp = (status & VD_ACCESS_ALLOWED)? 
0 : 1; 6380 return (rv); 6381 } 6382 6383 case MHIOCQRESERVE: 6384 { 6385 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode); 6386 return (rv); 6387 } 6388 6389 case MHIOCGRP_INKEYS: 6390 { 6391 return (vdc_mhd_inkeys(vdc, arg, mode)); 6392 } 6393 6394 case MHIOCGRP_INRESV: 6395 { 6396 return (vdc_mhd_inresv(vdc, arg, mode)); 6397 } 6398 6399 case MHIOCGRP_REGISTER: 6400 { 6401 return (vdc_mhd_register(vdc, arg, mode)); 6402 } 6403 6404 case MHIOCGRP_RESERVE: 6405 { 6406 return (vdc_mhd_reserve(vdc, arg, mode)); 6407 } 6408 6409 case MHIOCGRP_PREEMPTANDABORT: 6410 { 6411 return (vdc_mhd_preemptabort(vdc, arg, mode)); 6412 } 6413 6414 case MHIOCGRP_REGISTERANDIGNOREKEY: 6415 { 6416 return (vdc_mhd_registerignore(vdc, arg, mode)); 6417 } 6418 6419 case MHIOCENFAILFAST: 6420 { 6421 rv = vdc_failfast(vdc, arg, mode); 6422 return (rv); 6423 } 6424 6425 case DIOCTL_RWCMD: 6426 { 6427 return (vdc_dioctl_rwcmd(dev, arg, mode)); 6428 } 6429 6430 case DKIOCGAPART: 6431 { 6432 return (vdc_dkio_get_partition(vdc, arg, mode)); 6433 } 6434 6435 case DKIOCINFO: 6436 { 6437 struct dk_cinfo cinfo; 6438 6439 bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo)); 6440 cinfo.dki_partition = VDCPART(dev); 6441 6442 rv = ddi_copyout(&cinfo, (void *)arg, 6443 sizeof (struct dk_cinfo), mode); 6444 if (rv != 0) 6445 return (EFAULT); 6446 6447 return (0); 6448 } 6449 6450 case DKIOCGMEDIAINFO: 6451 { 6452 ASSERT(vdc->vdisk_size != 0); 6453 if (vdc->minfo->dki_capacity == 0) 6454 vdc->minfo->dki_capacity = vdc->vdisk_size; 6455 rv = ddi_copyout(vdc->minfo, (void *)arg, 6456 sizeof (struct dk_minfo), mode); 6457 if (rv != 0) 6458 return (EFAULT); 6459 6460 return (0); 6461 } 6462 6463 case DKIOCFLUSHWRITECACHE: 6464 { 6465 struct dk_callback *dkc = 6466 (struct dk_callback *)(uintptr_t)arg; 6467 vdc_dk_arg_t *dkarg = NULL; 6468 6469 DMSG(vdc, 1, "[%d] Flush W$: mode %x\n", 6470 instance, mode); 6471 6472 /* 6473 * If arg is NULL, then there is no callback function 6474 * registered and the 
call operates synchronously; we 6475 * break and continue with the rest of the function and 6476 * wait for vds to return (i.e. after the request to 6477 * vds returns successfully, all writes completed prior 6478 * to the ioctl will have been flushed from the disk 6479 * write cache to persistent media. 6480 * 6481 * If a callback function is registered, we dispatch 6482 * the request on a task queue and return immediately. 6483 * The callback will deal with informing the calling 6484 * thread that the flush request is completed. 6485 */ 6486 if (dkc == NULL) 6487 break; 6488 6489 /* 6490 * the asynchronous callback is only supported if 6491 * invoked from within the kernel 6492 */ 6493 if ((mode & FKIOCTL) == 0) 6494 return (ENOTSUP); 6495 6496 dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP); 6497 6498 dkarg->mode = mode; 6499 dkarg->dev = dev; 6500 bcopy(dkc, &dkarg->dkc, sizeof (*dkc)); 6501 6502 mutex_enter(&vdc->lock); 6503 vdc->dkio_flush_pending++; 6504 dkarg->vdc = vdc; 6505 mutex_exit(&vdc->lock); 6506 6507 /* put the request on a task queue */ 6508 rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb, 6509 (void *)dkarg, DDI_SLEEP); 6510 if (rv == NULL) { 6511 /* clean up if dispatch fails */ 6512 mutex_enter(&vdc->lock); 6513 vdc->dkio_flush_pending--; 6514 mutex_exit(&vdc->lock); 6515 kmem_free(dkarg, sizeof (vdc_dk_arg_t)); 6516 } 6517 6518 return (rv == NULL ? 
ENOMEM : 0); 6519 } 6520 } 6521 6522 /* catch programming error in vdc - should be a VD_OP_XXX ioctl */ 6523 ASSERT(iop->op != 0); 6524 6525 /* check if the vDisk server handles the operation for this vDisk */ 6526 if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) { 6527 DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n", 6528 vdc->instance, iop->op); 6529 return (ENOTSUP); 6530 } 6531 6532 /* LDC requires that the memory being mapped is 8-byte aligned */ 6533 alloc_len = P2ROUNDUP(len, sizeof (uint64_t)); 6534 DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n", 6535 instance, len, alloc_len); 6536 6537 if (alloc_len > 0) 6538 mem_p = kmem_zalloc(alloc_len, KM_SLEEP); 6539 6540 /* 6541 * Call the conversion function for this ioctl which, if necessary, 6542 * converts from the Solaris format to the format ARC'ed 6543 * as part of the vDisk protocol (FWARC 2006/195) 6544 */ 6545 ASSERT(iop->convert != NULL); 6546 rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN); 6547 if (rv != 0) { 6548 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 6549 instance, rv, cmd); 6550 if (mem_p != NULL) 6551 kmem_free(mem_p, alloc_len); 6552 return (rv); 6553 } 6554 6555 /* 6556 * send request to vds to service the ioctl. 6557 */ 6558 rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len, 6559 VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode, 6560 VIO_both_dir, B_TRUE); 6561 6562 if (rv != 0) { 6563 /* 6564 * This is not necessarily an error. The ioctl could 6565 * be returning a value such as ENOTTY to indicate 6566 * that the ioctl is not applicable. 6567 */ 6568 DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n", 6569 instance, rv, cmd); 6570 if (mem_p != NULL) 6571 kmem_free(mem_p, alloc_len); 6572 6573 return (rv); 6574 } 6575 6576 /* 6577 * Call the conversion function (if it exists) for this ioctl 6578 * which converts from the format ARC'ed as part of the vDisk 6579 * protocol (FWARC 2006/195) back to a format understood by 6580 * the rest of Solaris. 
6581 */ 6582 rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT); 6583 if (rv != 0) { 6584 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 6585 instance, rv, cmd); 6586 if (mem_p != NULL) 6587 kmem_free(mem_p, alloc_len); 6588 return (rv); 6589 } 6590 6591 if (mem_p != NULL) 6592 kmem_free(mem_p, alloc_len); 6593 6594 return (rv); 6595 } 6596 6597 /* 6598 * Function: 6599 * 6600 * Description: 6601 * This is an empty conversion function used by ioctl calls which 6602 * do not need to convert the data being passed in/out to userland 6603 */ 6604 static int 6605 vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir) 6606 { 6607 _NOTE(ARGUNUSED(vdc)) 6608 _NOTE(ARGUNUSED(from)) 6609 _NOTE(ARGUNUSED(to)) 6610 _NOTE(ARGUNUSED(mode)) 6611 _NOTE(ARGUNUSED(dir)) 6612 6613 return (0); 6614 } 6615 6616 static int 6617 vdc_get_wce_convert(vdc_t *vdc, void *from, void *to, 6618 int mode, int dir) 6619 { 6620 _NOTE(ARGUNUSED(vdc)) 6621 6622 if (dir == VD_COPYIN) 6623 return (0); /* nothing to do */ 6624 6625 if (ddi_copyout(from, to, sizeof (int), mode) != 0) 6626 return (EFAULT); 6627 6628 return (0); 6629 } 6630 6631 static int 6632 vdc_set_wce_convert(vdc_t *vdc, void *from, void *to, 6633 int mode, int dir) 6634 { 6635 _NOTE(ARGUNUSED(vdc)) 6636 6637 if (dir == VD_COPYOUT) 6638 return (0); /* nothing to do */ 6639 6640 if (ddi_copyin(from, to, sizeof (int), mode) != 0) 6641 return (EFAULT); 6642 6643 return (0); 6644 } 6645 6646 /* 6647 * Function: 6648 * vdc_get_vtoc_convert() 6649 * 6650 * Description: 6651 * This routine performs the necessary convertions from the DKIOCGVTOC 6652 * Solaris structure to the format defined in FWARC 2006/195. 6653 * 6654 * In the struct vtoc definition, the timestamp field is marked as not 6655 * supported so it is not part of vDisk protocol (FWARC 2006/195). 6656 * However SVM uses that field to check it can write into the VTOC, 6657 * so we fake up the info of that field. 
6658 * 6659 * Arguments: 6660 * vdc - the vDisk client 6661 * from - the buffer containing the data to be copied from 6662 * to - the buffer to be copied to 6663 * mode - flags passed to ioctl() call 6664 * dir - the "direction" of the copy - VD_COPYIN or VD_COPYOUT 6665 * 6666 * Return Code: 6667 * 0 - Success 6668 * ENXIO - incorrect buffer passed in. 6669 * EFAULT - ddi_copyout routine encountered an error. 6670 */ 6671 static int 6672 vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 6673 { 6674 int i; 6675 void *tmp_mem = NULL; 6676 void *tmp_memp; 6677 struct vtoc vt; 6678 struct vtoc32 vt32; 6679 int copy_len = 0; 6680 int rv = 0; 6681 6682 if (dir != VD_COPYOUT) 6683 return (0); /* nothing to do */ 6684 6685 if ((from == NULL) || (to == NULL)) 6686 return (ENXIO); 6687 6688 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) 6689 copy_len = sizeof (struct vtoc32); 6690 else 6691 copy_len = sizeof (struct vtoc); 6692 6693 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 6694 6695 VD_VTOC2VTOC((vd_vtoc_t *)from, &vt); 6696 6697 /* fake the VTOC timestamp field */ 6698 for (i = 0; i < V_NUMPAR; i++) { 6699 vt.timestamp[i] = vdc->vtoc->timestamp[i]; 6700 } 6701 6702 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 6703 /* LINTED E_ASSIGN_NARROW_CONV */ 6704 vtoctovtoc32(vt, vt32); 6705 tmp_memp = &vt32; 6706 } else { 6707 tmp_memp = &vt; 6708 } 6709 rv = ddi_copyout(tmp_memp, to, copy_len, mode); 6710 if (rv != 0) 6711 rv = EFAULT; 6712 6713 kmem_free(tmp_mem, copy_len); 6714 return (rv); 6715 } 6716 6717 /* 6718 * Function: 6719 * vdc_set_vtoc_convert() 6720 * 6721 * Description: 6722 * This routine performs the necessary convertions from the DKIOCSVTOC 6723 * Solaris structure to the format defined in FWARC 2006/195. 
6724 * 6725 * Arguments: 6726 * vdc - the vDisk client 6727 * from - Buffer with data 6728 * to - Buffer where data is to be copied to 6729 * mode - flags passed to ioctl 6730 * dir - direction of copy (in or out) 6731 * 6732 * Return Code: 6733 * 0 - Success 6734 * ENXIO - Invalid buffer passed in 6735 * EFAULT - ddi_copyin of data failed 6736 */ 6737 static int 6738 vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 6739 { 6740 _NOTE(ARGUNUSED(vdc)) 6741 6742 void *tmp_mem = NULL, *uvtoc; 6743 struct vtoc vt; 6744 struct vtoc *vtp = &vt; 6745 vd_vtoc_t vtvd; 6746 int copy_len = 0; 6747 int i, rv = 0; 6748 6749 if ((from == NULL) || (to == NULL)) 6750 return (ENXIO); 6751 6752 if (dir == VD_COPYIN) 6753 uvtoc = from; 6754 else 6755 uvtoc = to; 6756 6757 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) 6758 copy_len = sizeof (struct vtoc32); 6759 else 6760 copy_len = sizeof (struct vtoc); 6761 6762 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 6763 6764 rv = ddi_copyin(uvtoc, tmp_mem, copy_len, mode); 6765 if (rv != 0) { 6766 kmem_free(tmp_mem, copy_len); 6767 return (EFAULT); 6768 } 6769 6770 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 6771 vtoc32tovtoc((*(struct vtoc32 *)tmp_mem), vt); 6772 } else { 6773 vtp = tmp_mem; 6774 } 6775 6776 if (dir == VD_COPYOUT) { 6777 /* 6778 * The disk label may have changed. Revalidate the disk 6779 * geometry. This will also update the device nodes and 6780 * properties. 6781 */ 6782 vdc_validate(vdc); 6783 6784 /* 6785 * We also need to keep track of the timestamp fields. 
6786 */ 6787 for (i = 0; i < V_NUMPAR; i++) { 6788 vdc->vtoc->timestamp[i] = vtp->timestamp[i]; 6789 } 6790 6791 return (0); 6792 } 6793 6794 VTOC2VD_VTOC(vtp, &vtvd); 6795 bcopy(&vtvd, to, sizeof (vd_vtoc_t)); 6796 kmem_free(tmp_mem, copy_len); 6797 6798 return (0); 6799 } 6800 6801 /* 6802 * Function: 6803 * vdc_get_geom_convert() 6804 * 6805 * Description: 6806 * This routine performs the necessary convertions from the DKIOCGGEOM, 6807 * DKIOCG_PHYSGEOM and DKIOG_VIRTGEOM Solaris structures to the format 6808 * defined in FWARC 2006/195 6809 * 6810 * Arguments: 6811 * vdc - the vDisk client 6812 * from - Buffer with data 6813 * to - Buffer where data is to be copied to 6814 * mode - flags passed to ioctl 6815 * dir - direction of copy (in or out) 6816 * 6817 * Return Code: 6818 * 0 - Success 6819 * ENXIO - Invalid buffer passed in 6820 * EFAULT - ddi_copyout of data failed 6821 */ 6822 static int 6823 vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 6824 { 6825 _NOTE(ARGUNUSED(vdc)) 6826 6827 struct dk_geom geom; 6828 int copy_len = sizeof (struct dk_geom); 6829 int rv = 0; 6830 6831 if (dir != VD_COPYOUT) 6832 return (0); /* nothing to do */ 6833 6834 if ((from == NULL) || (to == NULL)) 6835 return (ENXIO); 6836 6837 VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom); 6838 rv = ddi_copyout(&geom, to, copy_len, mode); 6839 if (rv != 0) 6840 rv = EFAULT; 6841 6842 return (rv); 6843 } 6844 6845 /* 6846 * Function: 6847 * vdc_set_geom_convert() 6848 * 6849 * Description: 6850 * This routine performs the necessary convertions from the DKIOCSGEOM 6851 * Solaris structure to the format defined in FWARC 2006/195. 
6852 * 6853 * Arguments: 6854 * vdc - the vDisk client 6855 * from - Buffer with data 6856 * to - Buffer where data is to be copied to 6857 * mode - flags passed to ioctl 6858 * dir - direction of copy (in or out) 6859 * 6860 * Return Code: 6861 * 0 - Success 6862 * ENXIO - Invalid buffer passed in 6863 * EFAULT - ddi_copyin of data failed 6864 */ 6865 static int 6866 vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 6867 { 6868 _NOTE(ARGUNUSED(vdc)) 6869 6870 vd_geom_t vdgeom; 6871 void *tmp_mem = NULL; 6872 int copy_len = sizeof (struct dk_geom); 6873 int rv = 0; 6874 6875 if (dir != VD_COPYIN) 6876 return (0); /* nothing to do */ 6877 6878 if ((from == NULL) || (to == NULL)) 6879 return (ENXIO); 6880 6881 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 6882 6883 rv = ddi_copyin(from, tmp_mem, copy_len, mode); 6884 if (rv != 0) { 6885 kmem_free(tmp_mem, copy_len); 6886 return (EFAULT); 6887 } 6888 DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom); 6889 bcopy(&vdgeom, to, sizeof (vdgeom)); 6890 kmem_free(tmp_mem, copy_len); 6891 6892 return (0); 6893 } 6894 6895 static int 6896 vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 6897 { 6898 _NOTE(ARGUNUSED(vdc)) 6899 6900 vd_efi_t *vd_efi; 6901 dk_efi_t dk_efi; 6902 int rv = 0; 6903 void *uaddr; 6904 6905 if ((from == NULL) || (to == NULL)) 6906 return (ENXIO); 6907 6908 if (dir == VD_COPYIN) { 6909 6910 vd_efi = (vd_efi_t *)to; 6911 6912 rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode); 6913 if (rv != 0) 6914 return (EFAULT); 6915 6916 vd_efi->lba = dk_efi.dki_lba; 6917 vd_efi->length = dk_efi.dki_length; 6918 bzero(vd_efi->data, vd_efi->length); 6919 6920 } else { 6921 6922 rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode); 6923 if (rv != 0) 6924 return (EFAULT); 6925 6926 uaddr = dk_efi.dki_data; 6927 6928 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 6929 6930 VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi); 6931 6932 rv = ddi_copyout(dk_efi.dki_data, 
uaddr, dk_efi.dki_length, 6933 mode); 6934 if (rv != 0) 6935 return (EFAULT); 6936 6937 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 6938 } 6939 6940 return (0); 6941 } 6942 6943 static int 6944 vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 6945 { 6946 _NOTE(ARGUNUSED(vdc)) 6947 6948 dk_efi_t dk_efi; 6949 void *uaddr; 6950 6951 if (dir == VD_COPYOUT) { 6952 /* 6953 * The disk label may have changed. Revalidate the disk 6954 * geometry. This will also update the device nodes and 6955 * properties. 6956 */ 6957 vdc_validate(vdc); 6958 return (0); 6959 } 6960 6961 if ((from == NULL) || (to == NULL)) 6962 return (ENXIO); 6963 6964 if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0) 6965 return (EFAULT); 6966 6967 uaddr = dk_efi.dki_data; 6968 6969 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 6970 6971 if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) 6972 return (EFAULT); 6973 6974 DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to); 6975 6976 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 6977 6978 return (0); 6979 } 6980 6981 6982 /* -------------------------------------------------------------------------- */ 6983 6984 /* 6985 * Function: 6986 * vdc_create_fake_geometry() 6987 * 6988 * Description: 6989 * This routine fakes up the disk info needed for some DKIO ioctls such 6990 * as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do] 6991 * 6992 * Note: This function must not be called until the vDisk attributes have 6993 * been exchanged as part of the handshake with the vDisk server. 6994 * 6995 * Arguments: 6996 * vdc - soft state pointer for this instance of the device driver. 6997 * 6998 * Return Code: 6999 * none. 
7000 */ 7001 static void 7002 vdc_create_fake_geometry(vdc_t *vdc) 7003 { 7004 ASSERT(vdc != NULL); 7005 ASSERT(vdc->max_xfer_sz != 0); 7006 7007 /* 7008 * DKIOCINFO support 7009 */ 7010 if (vdc->cinfo == NULL) 7011 vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); 7012 7013 (void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME); 7014 (void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME); 7015 /* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */ 7016 vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz; 7017 7018 /* 7019 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD 7020 * operation is supported, otherwise the controller type is DKC_DIRECT. 7021 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the 7022 * controller type is always DKC_DIRECT in that case. 7023 * 7024 * If the virtual disk is backed by a physical CD/DVD device or 7025 * an ISO image, modify the controller type to indicate this 7026 */ 7027 switch (vdc->vdisk_media) { 7028 case VD_MEDIA_CD: 7029 case VD_MEDIA_DVD: 7030 vdc->cinfo->dki_ctype = DKC_CDROM; 7031 break; 7032 case VD_MEDIA_FIXED: 7033 if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD)) 7034 vdc->cinfo->dki_ctype = DKC_SCSI_CCS; 7035 else 7036 vdc->cinfo->dki_ctype = DKC_DIRECT; 7037 break; 7038 default: 7039 /* in the case of v1.0 we default to a fixed disk */ 7040 vdc->cinfo->dki_ctype = DKC_DIRECT; 7041 break; 7042 } 7043 vdc->cinfo->dki_flags = DKI_FMTVOL; 7044 vdc->cinfo->dki_cnum = 0; 7045 vdc->cinfo->dki_addr = 0; 7046 vdc->cinfo->dki_space = 0; 7047 vdc->cinfo->dki_prio = 0; 7048 vdc->cinfo->dki_vec = 0; 7049 vdc->cinfo->dki_unit = vdc->instance; 7050 vdc->cinfo->dki_slave = 0; 7051 /* 7052 * The partition number will be created on the fly depending on the 7053 * actual slice (i.e. minor node) that is used to request the data. 
7054 */ 7055 vdc->cinfo->dki_partition = 0; 7056 7057 /* 7058 * DKIOCGMEDIAINFO support 7059 */ 7060 if (vdc->minfo == NULL) 7061 vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); 7062 7063 if (vio_ver_is_supported(vdc->ver, 1, 1)) { 7064 vdc->minfo->dki_media_type = 7065 VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media); 7066 } else { 7067 vdc->minfo->dki_media_type = DK_FIXED_DISK; 7068 } 7069 7070 vdc->minfo->dki_capacity = vdc->vdisk_size; 7071 vdc->minfo->dki_lbsize = vdc->block_size; 7072 } 7073 7074 static ushort_t 7075 vdc_lbl2cksum(struct dk_label *label) 7076 { 7077 int count; 7078 ushort_t sum, *sp; 7079 7080 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 7081 sp = (ushort_t *)label; 7082 sum = 0; 7083 while (count--) { 7084 sum ^= *sp++; 7085 } 7086 7087 return (sum); 7088 } 7089 7090 /* 7091 * Function: 7092 * vdc_validate_geometry 7093 * 7094 * Description: 7095 * This routine discovers the label and geometry of the disk. It stores 7096 * the disk label and related information in the vdc structure. If it 7097 * fails to validate the geometry or to discover the disk label then 7098 * the label is marked as unknown (VD_DISK_LABEL_UNK). 7099 * 7100 * Arguments: 7101 * vdc - soft state pointer for this instance of the device driver. 7102 * 7103 * Return Code: 7104 * 0 - success. 7105 * EINVAL - unknown disk label. 7106 * ENOTSUP - geometry not applicable (EFI label). 7107 * EIO - error accessing the disk. 
7108 */ 7109 static int 7110 vdc_validate_geometry(vdc_t *vdc) 7111 { 7112 buf_t *buf; /* BREAD requests need to be in a buf_t structure */ 7113 dev_t dev; 7114 int rv, rval; 7115 struct dk_label label; 7116 struct dk_geom geom; 7117 struct vtoc vtoc; 7118 efi_gpt_t *gpt; 7119 efi_gpe_t *gpe; 7120 vd_efi_dev_t edev; 7121 7122 ASSERT(vdc != NULL); 7123 ASSERT(vdc->vtoc != NULL && vdc->geom != NULL); 7124 ASSERT(MUTEX_HELD(&vdc->lock)); 7125 7126 mutex_exit(&vdc->lock); 7127 7128 dev = makedevice(ddi_driver_major(vdc->dip), 7129 VD_MAKE_DEV(vdc->instance, 0)); 7130 7131 rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval); 7132 if (rv == 0) 7133 rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc, 7134 FKIOCTL, &rval); 7135 7136 if (rv == ENOTSUP) { 7137 /* 7138 * If the device does not support VTOC then we try 7139 * to read an EFI label. 7140 * 7141 * We need to know the block size and the disk size to 7142 * be able to read an EFI label. 7143 */ 7144 if (vdc->vdisk_size == 0) { 7145 if ((rv = vdc_check_capacity(vdc)) != 0) { 7146 mutex_enter(&vdc->lock); 7147 vdc_store_label_unk(vdc); 7148 return (rv); 7149 } 7150 } 7151 7152 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 7153 7154 rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe); 7155 7156 if (rv) { 7157 DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)", 7158 vdc->instance, rv); 7159 mutex_enter(&vdc->lock); 7160 vdc_store_label_unk(vdc); 7161 return (EIO); 7162 } 7163 7164 mutex_enter(&vdc->lock); 7165 vdc_store_label_efi(vdc, gpt, gpe); 7166 vd_efi_free(&edev, gpt, gpe); 7167 return (ENOTSUP); 7168 } 7169 7170 if (rv != 0) { 7171 DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)", 7172 vdc->instance, rv); 7173 mutex_enter(&vdc->lock); 7174 vdc_store_label_unk(vdc); 7175 if (rv != EINVAL) 7176 rv = EIO; 7177 return (rv); 7178 } 7179 7180 /* check that geometry and vtoc are valid */ 7181 if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 || 7182 vtoc.v_sanity != VTOC_SANE) { 7183 mutex_enter(&vdc->lock); 
7184 vdc_store_label_unk(vdc); 7185 return (EINVAL); 7186 } 7187 7188 /* 7189 * We have a disk and a valid VTOC. However this does not mean 7190 * that the disk currently have a VTOC label. The returned VTOC may 7191 * be a default VTOC to be used for configuring the disk (this is 7192 * what is done for disk image). So we read the label from the 7193 * beginning of the disk to ensure we really have a VTOC label. 7194 * 7195 * FUTURE: This could be the default way for reading the VTOC 7196 * from the disk as opposed to sending the VD_OP_GET_VTOC 7197 * to the server. This will be the default if vdc is implemented 7198 * ontop of cmlb. 7199 */ 7200 7201 /* 7202 * Single slice disk does not support read using an absolute disk 7203 * offset so we just rely on the DKIOCGVTOC ioctl in that case. 7204 */ 7205 if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) { 7206 mutex_enter(&vdc->lock); 7207 if (vtoc.v_nparts != 1) { 7208 vdc_store_label_unk(vdc); 7209 return (EINVAL); 7210 } 7211 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7212 return (0); 7213 } 7214 7215 if (vtoc.v_nparts != V_NUMPAR) { 7216 mutex_enter(&vdc->lock); 7217 vdc_store_label_unk(vdc); 7218 return (EINVAL); 7219 } 7220 7221 /* 7222 * Read disk label from start of disk 7223 */ 7224 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 7225 bioinit(buf); 7226 buf->b_un.b_addr = (caddr_t)&label; 7227 buf->b_bcount = DK_LABEL_SIZE; 7228 buf->b_flags = B_BUSY | B_READ; 7229 buf->b_dev = cmpdev(dev); 7230 rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label, 7231 DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir); 7232 if (rv) { 7233 DMSG(vdc, 1, "[%d] Failed to read disk block 0\n", 7234 vdc->instance); 7235 } else { 7236 rv = biowait(buf); 7237 biofini(buf); 7238 } 7239 kmem_free(buf, sizeof (buf_t)); 7240 7241 if (rv != 0 || label.dkl_magic != DKL_MAGIC || 7242 label.dkl_cksum != vdc_lbl2cksum(&label)) { 7243 DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n", 7244 vdc->instance); 7245 mutex_enter(&vdc->lock); 
7246 vdc_store_label_unk(vdc); 7247 return (EINVAL); 7248 } 7249 7250 mutex_enter(&vdc->lock); 7251 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7252 return (0); 7253 } 7254 7255 /* 7256 * Function: 7257 * vdc_validate 7258 * 7259 * Description: 7260 * This routine discovers the label of the disk and create the 7261 * appropriate device nodes if the label has changed. 7262 * 7263 * Arguments: 7264 * vdc - soft state pointer for this instance of the device driver. 7265 * 7266 * Return Code: 7267 * none. 7268 */ 7269 static void 7270 vdc_validate(vdc_t *vdc) 7271 { 7272 vd_disk_label_t old_label; 7273 vd_slice_t old_slice[V_NUMPAR]; 7274 int rv; 7275 7276 ASSERT(!MUTEX_HELD(&vdc->lock)); 7277 7278 mutex_enter(&vdc->lock); 7279 7280 /* save the current label and vtoc */ 7281 old_label = vdc->vdisk_label; 7282 bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR); 7283 7284 /* check the geometry */ 7285 (void) vdc_validate_geometry(vdc); 7286 7287 /* if the disk label has changed, update device nodes */ 7288 if (vdc->vdisk_label != old_label) { 7289 7290 if (vdc->vdisk_label == VD_DISK_LABEL_EFI) 7291 rv = vdc_create_device_nodes_efi(vdc); 7292 else 7293 rv = vdc_create_device_nodes_vtoc(vdc); 7294 7295 if (rv != 0) { 7296 DMSG(vdc, 0, "![%d] Failed to update device nodes", 7297 vdc->instance); 7298 } 7299 } 7300 7301 /* if the vtoc has changed, update device nodes properties */ 7302 if (bcmp(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR) != 0) { 7303 7304 if (vdc_create_device_nodes_props(vdc) != 0) { 7305 DMSG(vdc, 0, "![%d] Failed to update device nodes" 7306 " properties", vdc->instance); 7307 } 7308 } 7309 7310 mutex_exit(&vdc->lock); 7311 } 7312 7313 static void 7314 vdc_validate_task(void *arg) 7315 { 7316 vdc_t *vdc = (vdc_t *)arg; 7317 7318 vdc_validate(vdc); 7319 7320 mutex_enter(&vdc->lock); 7321 ASSERT(vdc->validate_pending > 0); 7322 vdc->validate_pending--; 7323 mutex_exit(&vdc->lock); 7324 } 7325 7326 /* 7327 * Function: 7328 * 
vdc_setup_devid() 7329 * 7330 * Description: 7331 * This routine discovers the devid of a vDisk. It requests the devid of 7332 * the underlying device from the vDisk server, builds an encapsulated 7333 * devid based on the retrieved devid and registers that new devid to 7334 * the vDisk. 7335 * 7336 * Arguments: 7337 * vdc - soft state pointer for this instance of the device driver. 7338 * 7339 * Return Code: 7340 * 0 - A devid was succesfully registered for the vDisk 7341 */ 7342 static int 7343 vdc_setup_devid(vdc_t *vdc) 7344 { 7345 int rv; 7346 vd_devid_t *vd_devid; 7347 size_t bufsize, bufid_len; 7348 7349 /* 7350 * At first sight, we don't know the size of the devid that the 7351 * server will return but this size will be encoded into the 7352 * reply. So we do a first request using a default size then we 7353 * check if this size was large enough. If not then we do a second 7354 * request with the correct size returned by the server. Note that 7355 * ldc requires size to be 8-byte aligned. 7356 */ 7357 bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN), 7358 sizeof (uint64_t)); 7359 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7360 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7361 7362 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, 7363 bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 7364 7365 DMSG(vdc, 2, "sync_op returned %d\n", rv); 7366 7367 if (rv) { 7368 kmem_free(vd_devid, bufsize); 7369 return (rv); 7370 } 7371 7372 if (vd_devid->length > bufid_len) { 7373 /* 7374 * The returned devid is larger than the buffer used. Try again 7375 * with a buffer with the right size. 
7376 */ 7377 kmem_free(vd_devid, bufsize); 7378 bufsize = P2ROUNDUP(VD_DEVID_SIZE(vd_devid->length), 7379 sizeof (uint64_t)); 7380 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7381 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7382 7383 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, 7384 (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0, 7385 VIO_both_dir, B_TRUE); 7386 7387 if (rv) { 7388 kmem_free(vd_devid, bufsize); 7389 return (rv); 7390 } 7391 } 7392 7393 /* 7394 * The virtual disk should have the same device id as the one associated 7395 * with the physical disk it is mapped on, otherwise sharing a disk 7396 * between a LDom and a non-LDom may not work (for example for a shared 7397 * SVM disk set). 7398 * 7399 * The DDI framework does not allow creating a device id with any 7400 * type so we first create a device id of type DEVID_ENCAP and then 7401 * we restore the orignal type of the physical device. 7402 */ 7403 7404 DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length); 7405 7406 /* build an encapsulated devid based on the returned devid */ 7407 if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length, 7408 vd_devid->id, &vdc->devid) != DDI_SUCCESS) { 7409 DMSG(vdc, 1, "[%d] Fail to created devid\n", vdc->instance); 7410 kmem_free(vd_devid, bufsize); 7411 return (1); 7412 } 7413 7414 DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type); 7415 7416 ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS); 7417 7418 kmem_free(vd_devid, bufsize); 7419 7420 if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) { 7421 DMSG(vdc, 1, "[%d] Fail to register devid\n", vdc->instance); 7422 return (1); 7423 } 7424 7425 return (0); 7426 } 7427 7428 static void 7429 vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe) 7430 { 7431 int i, nparts; 7432 7433 ASSERT(MUTEX_HELD(&vdc->lock)); 7434 7435 vdc->vdisk_label = VD_DISK_LABEL_EFI; 7436 bzero(vdc->vtoc, sizeof (struct vtoc)); 7437 bzero(vdc->geom, sizeof (struct dk_geom)); 7438 bzero(vdc->slice, sizeof 
(vd_slice_t) * V_NUMPAR); 7439 7440 nparts = gpt->efi_gpt_NumberOfPartitionEntries; 7441 7442 for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) { 7443 7444 if (gpe[i].efi_gpe_StartingLBA == 0 || 7445 gpe[i].efi_gpe_EndingLBA == 0) { 7446 continue; 7447 } 7448 7449 vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA; 7450 vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA - 7451 gpe[i].efi_gpe_StartingLBA + 1; 7452 } 7453 7454 ASSERT(vdc->vdisk_size != 0); 7455 vdc->slice[VD_EFI_WD_SLICE].start = 0; 7456 vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size; 7457 7458 } 7459 7460 static void 7461 vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct vtoc *vtoc) 7462 { 7463 int i; 7464 7465 ASSERT(MUTEX_HELD(&vdc->lock)); 7466 ASSERT(vdc->block_size == vtoc->v_sectorsz); 7467 7468 vdc->vdisk_label = VD_DISK_LABEL_VTOC; 7469 bcopy(vtoc, vdc->vtoc, sizeof (struct vtoc)); 7470 bcopy(geom, vdc->geom, sizeof (struct dk_geom)); 7471 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7472 7473 for (i = 0; i < vtoc->v_nparts; i++) { 7474 vdc->slice[i].start = vtoc->v_part[i].p_start; 7475 vdc->slice[i].nblocks = vtoc->v_part[i].p_size; 7476 } 7477 } 7478 7479 static void 7480 vdc_store_label_unk(vdc_t *vdc) 7481 { 7482 ASSERT(MUTEX_HELD(&vdc->lock)); 7483 7484 vdc->vdisk_label = VD_DISK_LABEL_UNK; 7485 bzero(vdc->vtoc, sizeof (struct vtoc)); 7486 bzero(vdc->geom, sizeof (struct dk_geom)); 7487 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7488 } 7489