/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * LDoms virtual disk client (vdc) device driver
 *
 * This driver runs on a guest logical domain and communicates with the virtual
 * disk server (vds) driver running on the service domain which is exporting
 * virtualized "disks" to the guest logical domain.
 *
 * The driver can be divided into four sections:
 *
 * 1) generic device driver housekeeping
 *	_init, _fini, attach, detach, ops structures, etc.
 *
 * 2) communication channel setup
 *	Setup the communications link over the LDC channel that vdc uses to
 *	talk to the vDisk server. Initialise the descriptor ring which
 *	allows the LDC clients to transfer data via memory mappings.
 *
 * 3) Support exported to upper layers (filesystems, etc)
 *	The upper layers call into vdc via strategy(9E) and DKIO(7I)
 *	ioctl calls. vdc copies the data to be written into the descriptor
 *	ring, or maps the buffer that will store the data read by the vDisk
 *	server into the descriptor ring. It then sends a message to the
 *	vDisk server requesting it to complete the operation.
 *
 * 4) Handling responses from the vDisk server.
 *	The vDisk server will ACK some or all of the messages vdc sends to it
 *	(this is configured during the handshake). Upon receipt of an ACK
 *	vdc will check the descriptor ring and signal to the upper layer
 *	code waiting on the I/O.
 */
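/*
 * Illustrative sketch (not driver code) of how sections 2-4 cooperate on a
 * single write issued through strategy(9E), assuming the usual descriptor
 * ring transfer mode negotiated during the handshake:
 *
 *	vdc_strategy(buf)
 *	  -> vdc_send_request(vdc, VD_OP_BWRITE, ...)	build the request
 *	  -> vdc_populate_descriptor() /
 *	     vdc_map_to_shared_dring()			bind buf into the ring
 *	  -> vdc_send()					notify vds over LDC
 *	  ... vds performs the I/O on the backing device ...
 *	  vdc_handle_cb() -> vdc_process_data_msg()	ACK arrives from vds
 *	  -> bioerror()/biodone(buf)			complete the buf
 */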

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mach_descrip.h>
#include <sys/modctl.h>
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/vtoc.h>
#include <sys/archsystm.h>
#include <sys/sysmacros.h>

#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/scsi/impl/services.h>
#include <sys/scsi/targets/sddef.h>

#include <sys/ldoms.h>
#include <sys/ldc.h>
#include <sys/vio_common.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vdsk_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdc.h>

/*
 * function prototypes
 */

/* standard driver functions */
static int vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
static int vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int vdc_strategy(struct buf *buf);
static int vdc_print(dev_t dev, char *str);
static int vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
static int vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
static int vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
static int vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
	cred_t *credp, int *rvalp);
static int vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
static int vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);

static int vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
	void *arg, void **resultp);
static int vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);

/* setup */
static void vdc_min(struct buf *bufp);
static int vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
static int vdc_do_ldc_init(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_node);
static int vdc_start_ldc_connection(vdc_t *vdc);
static int vdc_create_device_nodes(vdc_t *vdc);
static int vdc_create_device_nodes_efi(vdc_t *vdc);
static int vdc_create_device_nodes_vtoc(vdc_t *vdc);
static int vdc_create_device_nodes_props(vdc_t *vdc);
static int vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
	mde_cookie_t *vd_nodep, mde_cookie_t *vd_portp);
static int vdc_get_ldc_id(md_t *, mde_cookie_t, uint64_t *);
static int vdc_do_ldc_up(vdc_t *vdc);
static void vdc_terminate_ldc(vdc_t *vdc);
static int vdc_init_descriptor_ring(vdc_t *vdc);
static void vdc_destroy_descriptor_ring(vdc_t *vdc);
static int vdc_setup_devid(vdc_t *vdc);
static void vdc_store_label_efi(vdc_t *vdc, struct dk_gpt *efi);
static void vdc_store_label_vtoc(vdc_t *, struct dk_geom *, struct vtoc *);
static void vdc_store_label_unk(vdc_t *vdc);
static boolean_t vdc_is_opened(vdc_t *vdc);

/* handshake with vds */
static int vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
static int vdc_ver_negotiation(vdc_t *vdcp);
static int vdc_init_attr_negotiation(vdc_t *vdc);
static int vdc_attr_negotiation(vdc_t *vdcp);
static int vdc_init_dring_negotiate(vdc_t *vdc);
static int vdc_dring_negotiation(vdc_t *vdcp);
static int vdc_send_rdx(vdc_t *vdcp);
static int vdc_rdx_exchange(vdc_t *vdcp);
static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg);

/* processing incoming messages from vDisk server */
static void vdc_process_msg_thread(vdc_t *vdc);
static int vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp);

static uint_t vdc_handle_cb(uint64_t event, caddr_t arg);
static int vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg);
static int vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg);
static int vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
static int vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
static int vdc_send_request(vdc_t *vdcp, int operation,
	caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
	int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
static int vdc_populate_descriptor(vdc_t *vdcp, int operation,
	caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
	int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
	size_t nbytes, int slice, diskaddr_t offset, int cb_type,
	void *cb_arg, vio_desc_direction_t dir, boolean_t);

static int vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
static int vdc_drain_response(vdc_t *vdcp);
static int vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
static int vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
static int vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);

/* dkio */
static int vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode,
	int *rvalp);
static int vd_process_efi_ioctl(dev_t dev, int cmd, caddr_t arg, int mode);
static void vdc_create_fake_geometry(vdc_t *vdc);
static int vdc_validate_geometry(vdc_t *vdc);
static void vdc_validate(vdc_t *vdc);
static void vdc_validate_task(void *arg);
static int vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
	int mode, int dir);
static int vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
	int mode, int dir);
static int vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
	int mode, int dir);
static int vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
	int mode, int dir);
static int vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
	int mode, int dir);
static int vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
	int mode, int dir);
static int vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
	int mode, int dir);
static int vdc_get_efi_convert(vdc_t *vdc, void *from, void *to,
	int mode, int dir);
static int vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
	int mode, int dir);

static void vdc_ownership_update(vdc_t *vdc, int ownership_flags);
static int vdc_access_set(vdc_t *vdc, uint64_t flags, int mode);
static vdc_io_t *vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf);
static int vdc_failfast_check_resv(vdc_t *vdc);

/*
 * Module variables
 */

/*
 * Tunable variables to control how long vdc waits before timing out on
 * various operations
 */
static int	vdc_hshake_retries = 3;

static int	vdc_timeout = 0; /* units: seconds */
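/*
 * Illustrative note (not driver code): the *_hz_* variables below are
 * derived at attach time from the usec tunables via drv_usectohz(). A
 * minimal worked example, assuming a 100Hz system clock (10ms per tick):
 *
 *	vdc_min_timeout_ldc  = 1 * MILLISEC		(1000 usec)
 *	vdc_hz_min_ldc_delay = drv_usectohz(1000)	(rounds up to 1 tick)
 *
 * vdc_send() starts its retry backoff at vdc_hz_min_ldc_delay and doubles
 * it on each EWOULDBLOCK, capped at vdc_hz_max_ldc_delay; vdc_recv() does
 * the same with vdc_ldc_read_init_delay/vdc_ldc_read_max_delay.
 */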
static uint64_t vdc_hz_min_ldc_delay;
static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC;
static uint64_t vdc_hz_max_ldc_delay;
static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC;

static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC;
static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC;

/* values for dumping - need to run in a tighter loop */
static uint64_t vdc_usec_timeout_dump = 100 * MILLISEC;	/* 0.1s units: usec */
static int	vdc_dump_retries = 100;

static uint16_t	vdc_scsi_timeout = 60;	/* 60s units: seconds */

static uint64_t vdc_ownership_delay = 6 * MICROSEC;	/* 6s units: usec */

/* Count of the number of vdc instances attached */
static volatile uint32_t	vdc_instance_count = 0;

/* Tunable to log all SCSI errors */
static boolean_t vdc_scsi_log_error = B_FALSE;

/* Soft state pointer */
static void	*vdc_state;

/*
 * Controlling the verbosity of the error/debug messages
 *
 * vdc_msglevel - controls level of messages
 * vdc_matchinst - 64-bit variable where each bit corresponds
 * to the vdc instance to which vdc_msglevel applies.
 */
int		vdc_msglevel = 0x0;
uint64_t	vdc_matchinst = 0ull;

/*
 * Supported vDisk protocol version pairs.
 *
 * The first array entry is the latest and preferred version.
 */
static const vio_ver_t	vdc_version[] = {{1, 1}};

static struct cb_ops vdc_cb_ops = {
	vdc_open,	/* cb_open */
	vdc_close,	/* cb_close */
	vdc_strategy,	/* cb_strategy */
	vdc_print,	/* cb_print */
	vdc_dump,	/* cb_dump */
	vdc_read,	/* cb_read */
	vdc_write,	/* cb_write */
	vdc_ioctl,	/* cb_ioctl */
	nodev,		/* cb_devmap */
	nodev,		/* cb_mmap */
	nodev,		/* cb_segmap */
	nochpoll,	/* cb_chpoll */
	ddi_prop_op,	/* cb_prop_op */
	NULL,		/* cb_str */
	D_MP | D_64BIT,	/* cb_flag */
	CB_REV,		/* cb_rev */
	vdc_aread,	/* cb_aread */
	vdc_awrite	/* cb_awrite */
};

static struct dev_ops vdc_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	vdc_getinfo,	/* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	vdc_attach,	/* devo_attach */
	vdc_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	&vdc_cb_ops,	/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	nulldev		/* devo_power */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"virtual disk client",
	&vdc_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

/* -------------------------------------------------------------------------- */

/*
 * Device Driver housekeeping and setup
 */

int
_init(void)
{
	int status;

	if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
		return (status);
	if ((status = mod_install(&modlinkage)) != 0)
		ddi_soft_state_fini(&vdc_state);
	vdc_efi_init(vd_process_efi_ioctl);
	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int status;

	if ((status = mod_remove(&modlinkage)) != 0)
		return (status);
	vdc_efi_fini();
	ddi_soft_state_fini(&vdc_state);
	return (0);
}
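/*
 * Illustrative note (not driver code): getinfo/open/strategy all decode the
 * minor number with the VDCUNIT() and VDCPART() macros from vdc.h. Assuming
 * the usual 8-slices-per-disk encoding (V_NUMPAR == 8, so the low 3 bits
 * carry the slice), a minor number of 0x13 would decode as:
 *
 *	VDCUNIT()	-> instance 2	(0x13 >> 3)
 *	VDCPART()	-> slice 3	(0x13 & 7)
 *
 * which VD_MAKE_DEV(2, 3) reconstructs when the minor nodes are created.
 */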
static int
vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
	_NOTE(ARGUNUSED(dip))

	int	instance = VDCUNIT((dev_t)arg);
	vdc_t	*vdc = NULL;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
			*resultp = NULL;
			return (DDI_FAILURE);
		}
		*resultp = vdc->dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);
	default:
		*resultp = NULL;
		return (DDI_FAILURE);
	}
}

static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	kt_did_t failfast_tid, ownership_tid;
	int	instance;
	int	rv;
	vdc_t	*vdc = NULL;

	switch (cmd) {
	case DDI_DETACH:
		/* the real work happens below */
		break;
	case DDI_SUSPEND:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	ASSERT(cmd == DDI_DETACH);
	instance = ddi_get_instance(dip);
	DMSGX(1, "[%d] Entered\n", instance);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * This function is called when vdc is detached or if it has failed to
	 * attach. In the latter case, the attach may have failed before the
	 * vdisk type was set, so we cannot call vdc_is_opened(). However, as
	 * the attach has failed, we know that the vdisk was never opened and
	 * we can safely detach.
	 */
	if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
		DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
		return (DDI_FAILURE);
	}

	if (vdc->dkio_flush_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
		    instance, vdc->dkio_flush_pending);
		return (DDI_FAILURE);
	}

	if (vdc->validate_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding validate request\n",
		    instance, vdc->validate_pending);
		return (DDI_FAILURE);
	}

	DMSG(vdc, 0, "[%d] proceeding...\n", instance);

	/* If we took ownership, release ownership */
	mutex_enter(&vdc->ownership_lock);
	if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
	}
	mutex_exit(&vdc->ownership_lock);

	/* mark instance as detaching */
	vdc->lifecycle = VDC_LC_DETACHING;

	/*
	 * try and disable callbacks to prevent another handshake
	 */
	rv = ldc_set_cb_mode(vdc->ldc_handle, LDC_CB_DISABLE);
	DMSG(vdc, 0, "callback disabled (rv=%d)\n", rv);

	if (vdc->initialized & VDC_THREAD) {
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET)) {
			vdc->read_state = VDC_READ_RESET;
			cv_signal(&vdc->read_cv);
		}

		mutex_exit(&vdc->read_lock);

		/* wake up any thread waiting for connection to come online */
		mutex_enter(&vdc->lock);
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			DMSG(vdc, 0,
			    "[%d] write reset - move to resetting state...\n",
			    instance);
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		}
		mutex_exit(&vdc->lock);

		/* now wait until state transitions to VDC_STATE_DETACH */
		thread_join(vdc->msg_proc_thr->t_did);
		ASSERT(vdc->state == VDC_STATE_DETACH);
		DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
		    vdc->instance);
	}

	mutex_enter(&vdc->lock);

	if (vdc->initialized & VDC_DRING)
		vdc_destroy_descriptor_ring(vdc);

	if (vdc->initialized & VDC_LDC)
		vdc_terminate_ldc(vdc);

	if (vdc->failfast_thread) {
		failfast_tid = vdc->failfast_thread->t_did;
		vdc->failfast_interval = 0;
		cv_signal(&vdc->failfast_cv);
	} else {
		failfast_tid = 0;
	}

	if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
		ownership_tid = vdc->ownership_thread->t_did;
		vdc->ownership = VDC_OWNERSHIP_NONE;
		cv_signal(&vdc->ownership_cv);
	} else {
		ownership_tid = 0;
	}

	mutex_exit(&vdc->lock);

	if (failfast_tid != 0)
		thread_join(failfast_tid);

	if (ownership_tid != 0)
		thread_join(ownership_tid);

	if (vdc->initialized & VDC_MINOR) {
		ddi_prop_remove_all(dip);
		ddi_remove_minor_node(dip, NULL);
	}

	if (vdc->initialized & VDC_LOCKS) {
		mutex_destroy(&vdc->lock);
		mutex_destroy(&vdc->read_lock);
		mutex_destroy(&vdc->ownership_lock);
		cv_destroy(&vdc->initwait_cv);
		cv_destroy(&vdc->dring_free_cv);
		cv_destroy(&vdc->membind_cv);
		cv_destroy(&vdc->sync_pending_cv);
		cv_destroy(&vdc->sync_blocked_cv);
		cv_destroy(&vdc->read_cv);
		cv_destroy(&vdc->running_cv);
		cv_destroy(&vdc->ownership_cv);
		cv_destroy(&vdc->failfast_cv);
		cv_destroy(&vdc->failfast_io_cv);
	}

	if (vdc->minfo)
		kmem_free(vdc->minfo, sizeof (struct dk_minfo));

	if (vdc->cinfo)
		kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));

	if (vdc->vtoc)
		kmem_free(vdc->vtoc, sizeof (struct vtoc));

	if (vdc->geom)
		kmem_free(vdc->geom, sizeof (struct dk_geom));

	if (vdc->devid) {
		ddi_devid_unregister(dip);
		ddi_devid_free(vdc->devid);
	}

	if (vdc->initialized & VDC_SOFT_STATE)
		ddi_soft_state_free(vdc_state, instance);

	DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);

	return (DDI_SUCCESS);
}


static int
vdc_do_attach(dev_info_t *dip)
{
	int		instance;
	vdc_t		*vdc = NULL;
	int		status;
	md_t		*mdp;
	mde_cookie_t	vd_node, vd_port;

	ASSERT(dip != NULL);

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
		    instance);
		return (DDI_FAILURE);
	}

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * Assigning (rather than OR-ing) this value to initialized zeroes
	 * out the variable; bits are then set in it to record what has
	 * been done so far.
	 */
	vdc->initialized = VDC_SOFT_STATE;

	vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
	vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);

	vdc->dip	= dip;
	vdc->instance	= instance;
	vdc->vdisk_type	= VD_DISK_TYPE_UNK;
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->state	= VDC_STATE_INIT;
	vdc->lifecycle	= VDC_LC_ATTACHING;
	vdc->ldc_state	= 0;
	vdc->session_id = 0;
	vdc->block_size = DEV_BSIZE;
	vdc->max_xfer_sz = maxphys / DEV_BSIZE;

	/*
	 * We assume, for now, that the vDisk server will export 'read'
	 * operations to us at a minimum (this is needed because of checks
	 * in vdc for supported operations early in the handshake process).
	 * The vDisk server will return ENOTSUP if this is not the case.
	 * The value will be overwritten during the attribute exchange with
	 * the bitmask of operations exported by the server.
	 */
	vdc->operations = VD_OP_MASK_READ;
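	/*
	 * Illustrative note (not driver code): once the attribute exchange
	 * completes, "operations" holds the server's bitmask and can be
	 * tested before issuing a request; a hypothetical check such as
	 *
	 *	if ((vdc->operations & VD_OP_MASK_READ) == 0)
	 *		return (ENOTSUP);
	 *
	 * is the kind of early validation the handshake code performs.
	 */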

	vdc->vtoc = NULL;
	vdc->geom = NULL;
	vdc->cinfo = NULL;
	vdc->minfo = NULL;

	mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);

	vdc->threads_pending = 0;
	vdc->sync_op_pending = B_FALSE;
	vdc->sync_op_blocked = B_FALSE;
	cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);

	/* init blocking msg read functionality */
	mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
	vdc->read_state = VDC_READ_IDLE;

	vdc->initialized |= VDC_LOCKS;

	/* get device and port MD node for this disk instance */
	if (vdc_get_md_node(dip, &mdp, &vd_node, &vd_port) != 0) {
		cmn_err(CE_NOTE, "[%d] Could not get machine description node",
		    instance);
		return (DDI_FAILURE);
	}

	/* set the connection timeout */
	if (vd_port == NULL || (md_get_prop_val(mdp, vd_port,
	    VDC_MD_TIMEOUT, &vdc->ctimeout) != 0)) {
		vdc->ctimeout = 0;
	}

	/* initialise LDC channel which will be used to communicate with vds */
	status = vdc_do_ldc_init(vdc, mdp, vd_node);

	(void) md_fini_handle(mdp);

	if (status != 0) {
		cmn_err(CE_NOTE, "[%d] Couldn't initialize LDC", instance);
		goto return_status;
	}

	/* initialize the thread responsible for managing state with server */
	vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
	    vdc, 0, &p0, TS_RUN, minclsyspri);
	if (vdc->msg_proc_thr == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
		    instance);
		return (DDI_FAILURE);
	}

	vdc->initialized |= VDC_THREAD;

	atomic_inc_32(&vdc_instance_count);

	/*
	 * Check the disk label. This will send requests and do the handshake.
	 * We don't really care about the disk label now. What we really need
	 * is for the handshake to be done so that we know the type of the
	 * disk (slice or full disk) and the appropriate device nodes can be
	 * created.
	 */
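	/*
	 * Illustrative example (not driver code): for instance 0 of a full
	 * disk (VD_DISK_TYPE_DISK), vdc_create_device_nodes() below ends up
	 * creating minor nodes "a".."h" plus "a,raw".."h,raw" (with
	 * "wd"/"wd,raw" in place of "h"/"h,raw" once an EFI label is
	 * detected), whereas a single exported slice (VD_DISK_TYPE_SLICE)
	 * gets only "a" and "a,raw".
	 */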
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP);
	vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
	vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	mutex_exit(&vdc->lock);

	/*
	 * Now that we have the device info we can create the
	 * device nodes and properties
	 */
	status = vdc_create_device_nodes(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device nodes",
		    instance);
		goto return_status;
	}
	status = vdc_create_device_nodes_props(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device nodes"
		    " properties (%d)", instance, status);
		goto return_status;
	}

	/*
	 * Setup devid
	 */
	if (vdc_setup_devid(vdc)) {
		DMSG(vdc, 0, "[%d] No device id available\n", instance);
	}

	ddi_report_dev(dip);
	vdc->lifecycle = VDC_LC_ONLINE;
	DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);

return_status:
	DMSG(vdc, 0, "[%d] Attach completed\n", instance);
	return (status);
}

static int
vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	status;

	switch (cmd) {
	case DDI_ATTACH:
		if ((status = vdc_do_attach(dip)) != 0)
			(void) vdc_detach(dip, DDI_DETACH);
		return (status);
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}

static int
vdc_do_ldc_init(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_node)
{
	int		status = 0;
	ldc_status_t	ldc_state;
	ldc_attr_t	ldc_attr;
	uint64_t	ldc_id = 0;

	ASSERT(vdc != NULL);

	vdc->initialized |= VDC_LDC;

	if ((status = vdc_get_ldc_id(mdp, vd_node, &ldc_id)) != 0) {
		DMSG(vdc, 0, "[%d] Failed to get LDC channel ID property",
		    vdc->instance);
		return (EIO);
	}

	DMSGX(0, "[%d] LDC id is 0x%lx\n", vdc->instance, ldc_id);

	vdc->ldc_id = ldc_id;

	ldc_attr.devclass = LDC_DEV_BLK;
	ldc_attr.instance = vdc->instance;
	ldc_attr.mode = LDC_MODE_UNRELIABLE;	/* unreliable transport */
	ldc_attr.mtu = VD_LDC_MTU;

	if ((vdc->initialized & VDC_LDC_INIT) == 0) {
		status = ldc_init(ldc_id, &ldc_attr, &vdc->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
			    vdc->instance, ldc_id, status);
			return (status);
		}
		vdc->initialized |= VDC_LDC_INIT;
	}
	status = ldc_status(vdc->ldc_handle, &ldc_state);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
		    vdc->instance, status);
		return (status);
	}
	vdc->ldc_state = ldc_state;

	if ((vdc->initialized & VDC_LDC_CB) == 0) {
		status = ldc_reg_callback(vdc->ldc_handle, vdc_handle_cb,
		    (caddr_t)vdc);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
			    vdc->instance, status);
			return (status);
		}
		vdc->initialized |= VDC_LDC_CB;
	}

	vdc->initialized |= VDC_LDC;

	/*
	 * At this stage we have initialised LDC, we will now try and open
	 * the connection.
	 */
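	/*
	 * Illustrative note (not driver code): the channel moves through the
	 * sequence ldc_init() -> ldc_open() -> ldc_up() (the last performed
	 * by vdc_do_ldc_up()), and vdc_stop_ldc_connection() later uses
	 * ldc_down() to reset it.
	 */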
	if (vdc->ldc_state == LDC_INIT) {
		status = ldc_open(vdc->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
			    vdc->instance, vdc->ldc_id, status);
			return (status);
		}
		vdc->initialized |= VDC_LDC_OPEN;
	}

	return (status);
}

static int
vdc_start_ldc_connection(vdc_t *vdc)
{
	int	status = 0;

	ASSERT(vdc != NULL);

	ASSERT(MUTEX_HELD(&vdc->lock));

	status = vdc_do_ldc_up(vdc);

	DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);

	return (status);
}

static int
vdc_stop_ldc_connection(vdc_t *vdcp)
{
	int	status;

	DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
	    vdcp->state);

	status = ldc_down(vdcp->ldc_handle);
	DMSG(vdcp, 0, "ldc_down() = %d\n", status);

	vdcp->initialized &= ~VDC_HANDSHAKE;
	DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);

	return (status);
}

static int
vdc_create_device_nodes_efi(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "h");
	ddi_remove_minor_node(vdc->dip, "h,raw");

	if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

static int
vdc_create_device_nodes_vtoc(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "wd");
	ddi_remove_minor_node(vdc->dip, "wd,raw");

	if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}
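/*
 * Illustrative note (not driver code): the two helpers above are inverses
 * used on relabel. Moving from a VTOC to an EFI label removes the "h" and
 * "h,raw" nodes and creates "wd"/"wd,raw" for the same minor number
 * (VD_EFI_WD_SLICE); moving back swaps them again, so at most one naming
 * scheme for slice 7 exists at any time.
 */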
/*
 * Function:
 *	vdc_create_device_nodes
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices along with the node properties. It is called as part of
 *	the attach(9E) of the instance during the handshake with vds after
 *	vds has sent the attributes to vdc.
 *
 *	If the device is of type VD_DISK_TYPE_SLICE then minor node 2 is
 *	used in keeping with the Solaris convention that slice 2 refers to
 *	a whole disk. Slices start at 'a'.
 *
 * Parameters:
 *	vdc	- soft state pointer
 *
 * Return Values
 *	0	- Success
 *	EIO	- Failed to create node
 *	EINVAL	- Unknown type of disk exported
 */
static int
vdc_create_device_nodes(vdc_t *vdc)
{
	char		name[sizeof ("s,raw")];
	dev_info_t	*dip = NULL;
	int		instance, status;
	int		num_slices = 1;
	int		i;

	ASSERT(vdc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		return (EINVAL);
	}

	/*
	 * Minor nodes are different for EFI disks: EFI disks do not have
	 * a minor node 'h' for the minor number corresponding to slice
	 * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
	 * representing the whole disk.
	 */
	for (i = 0; i < num_slices; i++) {

		if (i == VD_EFI_WD_SLICE) {
			if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
				status = vdc_create_device_nodes_efi(vdc);
			else
				status = vdc_create_device_nodes_vtoc(vdc);
			if (status != 0)
				return (status);
			continue;
		}

		(void) snprintf(name, sizeof (name), "%c", 'a' + i);
		if (ddi_create_minor_node(dip, name, S_IFBLK,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
			    instance, name);
			return (EIO);
		}

		/* if any device node is created we set this flag */
		vdc->initialized |= VDC_MINOR;

		(void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");

		if (ddi_create_minor_node(dip, name, S_IFCHR,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
			    instance, name);
			return (EIO);
		}
	}

	return (0);
}
/*
 * Function:
 *	vdc_create_device_nodes_props
 *
 * Description:
 *	This function sets up the VDC_SIZE_PROP_NAME and
 *	VDC_NBLOCKS_PROP_NAME properties on the block and character device
 *	nodes created under /devices. It is called as part of the attach(9E)
 *	of the instance during the handshake with vds after vds has sent
 *	the attributes to vdc.
 *
 * Parameters:
 *	vdc	- soft state pointer
 *
 * Return Values
 *	0	- Success
 *	EIO	- Failed to create device node property
 *	EINVAL	- Unknown type of disk exported
 */
static int
vdc_create_device_nodes_props(vdc_t *vdc)
{
	dev_info_t	*dip = NULL;
	int		instance;
	int		num_slices = 1;
	int64_t		size = 0;
	dev_t		dev;
	int		rv;
	int		i;

	ASSERT(vdc != NULL);
	ASSERT(vdc->vtoc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		return (EINVAL);
	}

	if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
		/* remove all properties */
		for (i = 0; i < num_slices; i++) {
			dev = makedevice(ddi_driver_major(dip),
			    VD_MAKE_DEV(instance, i));
			(void) ddi_prop_remove(dev, dip, VDC_SIZE_PROP_NAME);
			(void) ddi_prop_remove(dev, dip, VDC_NBLOCKS_PROP_NAME);
		}
		return (0);
	}

	for (i = 0; i < num_slices; i++) {
		dev = makedevice(ddi_driver_major(dip),
		    VD_MAKE_DEV(instance, i));

		size = vdc->vtoc->v_part[i].p_size * vdc->vtoc->v_sectorsz;
		DMSG(vdc, 0, "[%d] sz %ld (%ld Mb)  p_size %lx\n",
		    instance, size, size / (1024 * 1024),
		    vdc->vtoc->v_part[i].p_size);

		rv = ddi_prop_update_int64(dev, dip, VDC_SIZE_PROP_NAME, size);
		if (rv != DDI_PROP_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop of [%ld]",
			    instance, VDC_SIZE_PROP_NAME, size);
			return (EIO);
		}

		rv = ddi_prop_update_int64(dev, dip, VDC_NBLOCKS_PROP_NAME,
		    lbtodb(size));
		if (rv != DDI_PROP_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop [%llu]",
			    instance, VDC_NBLOCKS_PROP_NAME, lbtodb(size));
			return (EIO);
		}
	}

	return (0);
}
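/*
 * Illustrative worked example (not driver code) of the property values set
 * above, for a slice of 4194304 512-byte sectors:
 *
 *	size    = p_size * v_sectorsz = 4194304 * 512   = 2147483648 (2 GB)
 *	Nblocks = lbtodb(size)        = 2147483648 >> 9 = 4194304
 */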
/*
 * Function:
 *	vdc_is_opened
 *
 * Description:
 *	This function checks if any slice of a given virtual disk is
 *	currently opened.
 *
 * Parameters:
 *	vdc	- soft state pointer
 *
 * Return Values
 *	B_TRUE	- at least one slice is opened.
 *	B_FALSE	- no slice is opened.
 */
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
	int i, nslices;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		nslices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		nslices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		ASSERT(0);
	}

	/* check if there's any layered open */
	for (i = 0; i < nslices; i++) {
		if (vdc->open_lyr[i] > 0)
			return (B_TRUE);
	}

	/* check if there is any other kind of open */
	for (i = 0; i < OTYPCNT; i++) {
		if (vdc->open[i] != 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}

static int
vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;
	int i;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	/* check if slice is already exclusively opened */
	if (vdc->open_excl & slicemask)
		return (EBUSY);

	/* if open exclusive, check if slice is already opened */
	if (flag & FEXCL) {
		if (vdc->open_lyr[slice] > 0)
			return (EBUSY);
		for (i = 0; i < OTYPCNT; i++) {
			if (vdc->open[i] & slicemask)
				return (EBUSY);
		}
		vdc->open_excl |= slicemask;
	}

	/* mark slice as opened */
	if (otyp == OTYP_LYR) {
		vdc->open_lyr[slice]++;
	} else {
		vdc->open[otyp] |= slicemask;
	}

	return (0);
}

static void
vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	if (otyp == OTYP_LYR) {
		ASSERT(vdc->open_lyr[slice] > 0);
		vdc->open_lyr[slice]--;
	} else {
		vdc->open[otyp] &= ~slicemask;
	}

	if (flag & FEXCL)
		vdc->open_excl &= ~slicemask;
}
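/*
 * Illustrative note (not driver code): open-state tracking above uses one
 * bit per slice. For example, an exclusive open of slice 2 (FEXCL) sets
 * bit 0x04 in open_excl after confirming there is no layered open
 * (open_lyr[2]) and no 0x04 bit in any open[otyp] mask; any later open of
 * slice 2 then fails with EBUSY until vdc_mark_closed() clears the bit.
 */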
static int
vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance;
	int	slice, status = 0;
	vdc_t	*vdc;

	ASSERT(dev != NULL);
	instance = VDCUNIT(*dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
	    getminor(*dev), flag, otyp);

	slice = VDCPART(*dev);

	mutex_enter(&vdc->lock);

	status = vdc_mark_opened(vdc, slice, flag, otyp);

	if (status != 0) {
		mutex_exit(&vdc->lock);
		return (status);
	}

	if (flag & (FNDELAY | FNONBLOCK)) {

		/* don't resubmit a validate request if there's already one */
		if (vdc->validate_pending > 0) {
			mutex_exit(&vdc->lock);
			return (0);
		}

		/* call vdc_validate() asynchronously to avoid blocking */
		if (taskq_dispatch(system_taskq, vdc_validate_task,
		    (void *)vdc, TQ_NOSLEEP) == NULL) {
			vdc_mark_closed(vdc, slice, flag, otyp);
			mutex_exit(&vdc->lock);
			return (ENXIO);
		}

		vdc->validate_pending++;
		mutex_exit(&vdc->lock);
		return (0);
	}

	mutex_exit(&vdc->lock);

	vdc_validate(vdc);

	mutex_enter(&vdc->lock);

	if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
	    vdc->vtoc->v_part[slice].p_size == 0) {
		vdc_mark_closed(vdc, slice, flag, otyp);
		status = EIO;
	}

	mutex_exit(&vdc->lock);

	return (status);
}

static int
vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance;
	int	slice;
	int	rv, rval;
	vdc_t	*vdc;

	instance = VDCUNIT(dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);

	slice = VDCPART(dev);

	/*
	 * Attempt to flush the W$ on a close operation. If this is
	 * not a supported IOCTL command or the backing device is read-only
	 * do not fail the close operation.
	 */
	rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);

	if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
		DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
		    instance, rv);
		return (EIO);
	}

	mutex_enter(&vdc->lock);
	vdc_mark_closed(vdc, slice, flag, otyp);
	mutex_exit(&vdc->lock);

	return (0);
}

static int
vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	_NOTE(ARGUNUSED(credp))

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
}

static int
vdc_print(dev_t dev, char *str)
{
	cmn_err(CE_NOTE, "vdc%d: %s", VDCUNIT(dev), str);
	return (0);
}

static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	int	rv;
	size_t	nbytes = nblk * DEV_BSIZE;
	int	instance = VDCUNIT(dev);
	vdc_t	*vdc = NULL;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
	    instance, nbytes, blkno, (void *)addr);
	rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
	    VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir);
	if (rv) {
		DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
		return (rv);
	}

	if (ddi_in_panic())
		(void) vdc_drain_response(vdc);

	DMSG(vdc, 0, "[%d] End\n", instance);

	return (0);
}

/* -------------------------------------------------------------------------- */

/*
 * Disk access routines
 */
"Read" : "Write", 1381 buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr); 1382 DTRACE_IO2(vstart, buf_t *, buf, vdc_t *, vdc); 1383 1384 bp_mapin(buf); 1385 1386 if ((long)buf->b_private == VD_SLICE_NONE) { 1387 /* I/O using an absolute disk offset */ 1388 slice = VD_SLICE_NONE; 1389 } else { 1390 slice = VDCPART(buf->b_edev); 1391 } 1392 1393 rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr, 1394 buf->b_bcount, slice, buf->b_lblkno, 1395 CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir : 1396 VIO_write_dir); 1397 1398 /* 1399 * If the request was successfully sent, the strategy call returns and 1400 * the ACK handler calls the bioxxx functions when the vDisk server is 1401 * done. 1402 */ 1403 if (rv) { 1404 DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv); 1405 bioerror(buf, rv); 1406 biodone(buf); 1407 } 1408 1409 return (0); 1410 } 1411 1412 /* 1413 * Function: 1414 * vdc_min 1415 * 1416 * Description: 1417 * Routine to limit the size of a data transfer. Used in 1418 * conjunction with physio(9F). 1419 * 1420 * Arguments: 1421 * bp - pointer to the indicated buf(9S) struct. 1422 * 1423 */ 1424 static void 1425 vdc_min(struct buf *bufp) 1426 { 1427 vdc_t *vdc = NULL; 1428 int instance = VDCUNIT(bufp->b_edev); 1429 1430 vdc = ddi_get_soft_state(vdc_state, instance); 1431 VERIFY(vdc != NULL); 1432 1433 if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) { 1434 bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size; 1435 } 1436 } 1437 1438 static int 1439 vdc_read(dev_t dev, struct uio *uio, cred_t *cred) 1440 { 1441 _NOTE(ARGUNUSED(cred)) 1442 1443 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1444 return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio)); 1445 } 1446 1447 static int 1448 vdc_write(dev_t dev, struct uio *uio, cred_t *cred) 1449 { 1450 _NOTE(ARGUNUSED(cred)) 1451 1452 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1453 return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio)); 1454 } 1455 1456 static int 1457 vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred) 1458 { 1459 _NOTE(ARGUNUSED(cred)) 1460 1461 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1462 return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio)); 1463 } 1464 1465 static int 1466 vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred) 1467 { 1468 _NOTE(ARGUNUSED(cred)) 1469 1470 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1471 return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio)); 1472 } 1473 1474 1475 /* -------------------------------------------------------------------------- */ 1476 1477 /* 1478 * Handshake support 1479 */ 1480 1481 1482 /* 1483 * Function: 1484 * vdc_init_ver_negotiation() 1485 * 1486 * Description: 1487 * 1488 * Arguments: 1489 * vdc - soft state pointer for this instance of the device driver. 
/*
 * Function:
 *	vdc_init_ver_negotiation()
 *
 * Description:
 *	Build and send a version negotiation (VIO_VER_INFO) message to the
 *	vDisk server, proposing the given protocol version.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	ver	- version pair to propose to the server.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
{
	vio_ver_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status = -1;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);

	/*
	 * set the Session ID to a unique value
	 * (the lower 32 bits of the clock tick)
	 */
	vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
	DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id);

	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_VER_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	pkt.dev_class = VDEV_DISK;
	pkt.ver_major = ver.major;
	pkt.ver_minor = ver.minor;

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
	    vdc->instance, status);
	if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance, vdc->ldc_handle,
		    status, msglen);
		if (msglen != sizeof (vio_ver_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_ver_negotiation()
 *
 * Description:
 *	Run the version negotiation step of the handshake: send our proposal
 *	and process the server's response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_ver_negotiation(vdc_t *vdcp)
{
	vio_msg_t vio_msg;
	int status;

	if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0]))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Ver negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
}
/*
 * Function:
 *	vdc_init_attr_negotiation()
 *
 * Description:
 *	Build and send an attribute (VIO_ATTR_INFO) message describing this
 *	client to the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_attr_negotiation(vdc_t *vdc)
{
	vd_attr_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] entered\n", vdc->instance);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.max_xfer_sz = vdc->max_xfer_sz;
	pkt.vdisk_block_size = vdc->block_size;
	pkt.xfer_mode = VIO_DRING_MODE;
	pkt.operations = 0;	/* server will set bits of valid operations */
	pkt.vdisk_type = 0;	/* server will set to valid device type */
	pkt.vdisk_media = 0;	/* server will set to valid media type */
	pkt.vdisk_size = 0;	/* server will set to valid size */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);

	if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance, vdc->ldc_handle,
		    status, msglen);
		if (msglen != sizeof (vd_attr_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_attr_negotiation()
 *
 * Description:
 *	Run the attribute exchange step of the handshake: send our attributes
 *	and process the server's response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_attr_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_attr_negotiation(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Attr negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
}
/*
 * Function:
 *	vdc_init_dring_negotiate()
 *
 * Description:
 *	Allocate the local descriptor ring and send a registration
 *	(VIO_DRING_REG) message for it to the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_dring_negotiate(vdc_t *vdc)
{
	vio_dring_reg_msg_t	pkt;
	size_t			msglen = sizeof (pkt);
	int			status = -1;
	int			retry;
	int			nretries = 10;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	for (retry = 0; retry < nretries; retry++) {
		status = vdc_init_descriptor_ring(vdc);
		if (status != EAGAIN)
			break;
		drv_usecwait(vdc_min_timeout_ldc);
	}

	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
		    vdc->instance, status);
		return (status);
	}

	DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
	    vdc->instance, status);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_DRING_REG;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.dring_ident = 0;
	pkt.num_descriptors = vdc->dring_len;
	pkt.descriptor_size = vdc->dring_entry_size;
	pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
	pkt.ncookies = vdc->dring_cookie_count;
	pkt.cookie[0] = vdc->dring_cookie[0];	/* for now just one cookie */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
		    vdc->instance, status);
	}

	return (status);
}


/*
 * Function:
 *	vdc_dring_negotiation()
 *
 * Description:
 *	Run the descriptor ring registration step of the handshake and
 *	process the server's response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_dring_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_dring_negotiate(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Dring negotiation response,"
		    " rv(%d)", vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_dring_reg_msg(vdcp,
	    (vio_dring_reg_msg_t *)&vio_msg));
}
/*
 * Function:
 *	vdc_send_rdx()
 *
 * Description:
 *	Send an RDX (VIO_RDX) message to the vDisk server to indicate that
 *	we are ready to transfer data.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_send_rdx(vdc_t *vdcp)
{
	vio_msg_t	msg;
	size_t		msglen = sizeof (vio_msg_t);
	int		status;

	/*
	 * Send an RDX message to vds to indicate we are ready
	 * to send data
	 */
	msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	msg.tag.vio_subtype_env = VIO_RDX;
	msg.tag.vio_sid = vdcp->session_id;
	status = vdc_send(vdcp, (caddr_t)&msg, &msglen);
	if (status != 0) {
		DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)",
		    vdcp->instance, status);
	}

	return (status);
}

/*
 * Function:
 *	vdc_handle_rdx()
 *
 * Description:
 *	Process the server's acknowledgement of our RDX message.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *	msgp	- received msg
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp)
{
	_NOTE(ARGUNUSED(vdcp))
	_NOTE(ARGUNUSED(msgp))

	ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL);
	ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK);
	ASSERT(msgp->tag.vio_subtype_env == VIO_RDX);

	DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance);

	return (0);
}

/*
 * Function:
 *	vdc_rdx_exchange()
 *
 * Description:
 *	Run the final RDX step of the handshake.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_rdx_exchange(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_send_rdx(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) {
		DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg));
}


/* -------------------------------------------------------------------------- */

/*
 * LDC helper routines
 */

static int
vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp)
{
	int		status;
	boolean_t	q_has_pkts = B_FALSE;
	uint64_t	delay_time;
	size_t		len;

	mutex_enter(&vdc->read_lock);

	if (vdc->read_state == VDC_READ_IDLE)
		vdc->read_state = VDC_READ_WAITING;

	while (vdc->read_state != VDC_READ_PENDING) {

		/* detect if the connection has been reset */
		if (vdc->read_state == VDC_READ_RESET) {
			status = ECONNRESET;
			goto done;
		}

		cv_wait(&vdc->read_cv, &vdc->read_lock);
	}

	/*
	 * Until we get a blocking ldc read we have to retry
	 * until the entire LDC message has arrived before
	 * ldc_read() will succeed. Note we also bail out if
	 * the channel is reset or goes away.
	 */
	delay_time = vdc_ldc_read_init_delay;
loop:
	len = *nbytesp;
	status = ldc_read(vdc->ldc_handle, (caddr_t)msgp, &len);
	switch (status) {
	case EAGAIN:
		delay_time *= 2;
		if (delay_time >= vdc_ldc_read_max_delay)
			delay_time = vdc_ldc_read_max_delay;
		delay(delay_time);
		goto loop;

	case 0:
		if (len == 0) {
			DMSG(vdc, 1, "[%d] ldc_read returned 0 bytes with "
			    "no error!\n", vdc->instance);
			goto loop;
		}

		*nbytesp = len;

		/*
		 * If there are pending messages, leave the
		 * read state as pending. Otherwise, set the state
		 * back to idle.
		 */
		status = ldc_chkq(vdc->ldc_handle, &q_has_pkts);
		if (status == 0 && !q_has_pkts)
			vdc->read_state = VDC_READ_IDLE;

		break;
	default:
		DMSG(vdc, 0, "ldc_read returned %d\n", status);
		break;
	}

done:
	mutex_exit(&vdc->read_lock);

	return (status);
}


#ifdef DEBUG
void
vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg)
{
	char *ms, *ss, *ses;

	switch (msg->tag.vio_msgtype) {
#define	Q(_s)	case _s : ms = #_s; break;
	Q(VIO_TYPE_CTRL)
	Q(VIO_TYPE_DATA)
	Q(VIO_TYPE_ERR)
#undef Q
	default: ms = "unknown"; break;
	}

	switch (msg->tag.vio_subtype) {
#define	Q(_s)	case _s : ss = #_s; break;
	Q(VIO_SUBTYPE_INFO)
	Q(VIO_SUBTYPE_ACK)
	Q(VIO_SUBTYPE_NACK)
#undef Q
	default: ss = "unknown"; break;
	}

	switch (msg->tag.vio_subtype_env) {
#define	Q(_s)	case _s : ses = #_s; break;
	Q(VIO_VER_INFO)
	Q(VIO_ATTR_INFO)
	Q(VIO_DRING_REG)
	Q(VIO_DRING_UNREG)
	Q(VIO_RDX)
	Q(VIO_PKT_DATA)
	Q(VIO_DESC_DATA)
	Q(VIO_DRING_DATA)
#undef Q
	default: ses = "unknown"; break;
	}

	DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, ms, ss, ses);
}
#endif

/*
 * Function:
 *	vdc_send()
 *
 * Description:
 *	The function encapsulates the call to write a message using LDC.
 *	If LDC indicates that the call failed due to the queue being full,
 *	we retry the ldc_write(), otherwise we return the error returned
 *	by LDC.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver,
 *		  which holds the LDC handle for the channel this vdc uses.
 *	pkt	- address of LDC message to be sent
 *	msglen	- the size of the message being sent. When the function
 *		  returns, this contains the number of bytes written.
 *
 * Return Code:
 *	0		- Success.
 *	EINVAL		- pkt or msglen were NULL
 *	ECONNRESET	- The connection was not up.
 *	EWOULDBLOCK	- LDC queue is full
 *	xxx		- other error codes returned by ldc_write
 */
static int
vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen)
{
	size_t	size = 0;
	int	status = 0;
	clock_t delay_ticks;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));
	ASSERT(msglen != NULL);
	ASSERT(*msglen != 0);

#ifdef DEBUG
	vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt);
#endif
	/*
	 * Wait indefinitely to send if channel
	 * is busy, but bail out if we succeed or
	 * if the channel closes or is reset.
	 */
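	/*
	 * Illustrative walkthrough (not driver code): with the default
	 * tunables, the EWOULDBLOCK backoff below waits roughly 1ms, 2ms,
	 * 4ms, ... between ldc_write() attempts, clamped at ~100ms per
	 * retry (vdc_hz_max_ldc_delay), and keeps retrying until the write
	 * completes with something other than EWOULDBLOCK (e.g. success,
	 * or EIO/ECONNRESET on a channel reset).
	 */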
/*
 * Function:
 *    vdc_send()
 *
 * Description:
 *    The function encapsulates the call to write a message using LDC.
 *    If LDC indicates that the call failed due to the queue being full,
 *    we retry the ldc_write(), otherwise we return the error returned
 *    by LDC.
 *
 * Arguments:
 *    vdc    - soft state pointer for this instance of the device driver.
 *    pkt    - address of LDC message to be sent
 *    msglen    - the size of the message being sent. When the function
 *          returns, this contains the number of bytes written.
 *
 * Return Code:
 *    0        - Success.
 *    EINVAL        - pkt or msglen were NULL
 *    ECONNRESET    - The connection was not up.
 *    EWOULDBLOCK    - LDC queue is full
 *    xxx        - other error codes returned by ldc_write
 */
static int
vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen)
{
    size_t    size = 0;
    int    status = 0;
    clock_t    delay_ticks;

    ASSERT(vdc != NULL);
    ASSERT(mutex_owned(&vdc->lock));
    ASSERT(msglen != NULL);
    ASSERT(*msglen != 0);

#ifdef DEBUG
    vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt);
#endif
    /*
     * Wait indefinitely to send if channel
     * is busy, but bail out if we succeed or
     * if the channel closes or is reset.
     */
    delay_ticks = vdc_hz_min_ldc_delay;
    do {
        size = *msglen;
        status = ldc_write(vdc->ldc_handle, pkt, &size);
        if (status == EWOULDBLOCK) {
            delay(delay_ticks);
            /* geometric backoff */
            delay_ticks *= 2;
            if (delay_ticks > vdc_hz_max_ldc_delay)
                delay_ticks = vdc_hz_max_ldc_delay;
        }
    } while (status == EWOULDBLOCK);

    /* if LDC had serious issues --- reset vdc state */
    if (status == EIO || status == ECONNRESET) {
        /* wake up any waiting readers */
        mutex_enter(&vdc->read_lock);
        if ((vdc->read_state == VDC_READ_WAITING) ||
            (vdc->read_state == VDC_READ_RESET))
            cv_signal(&vdc->read_cv);
        vdc->read_state = VDC_READ_RESET;
        mutex_exit(&vdc->read_lock);

        /* wake up any waiters in the reset thread */
        if (vdc->state == VDC_STATE_INIT_WAITING) {
            DMSG(vdc, 0, "[%d] write reset - "
                "vdc is resetting ..\n", vdc->instance);
            vdc->state = VDC_STATE_RESETTING;
            cv_signal(&vdc->initwait_cv);
        }

        return (ECONNRESET);
    }

    /* return the last size written */
    *msglen = size;

    return (status);
}
/*
 * Function:
 *    vdc_get_md_node
 *
 * Description:
 *    Get the MD, the device node and the port node for the given
 *    disk instance. The caller is responsible for cleaning up the
 *    reference to the returned MD (mdpp) by calling md_fini_handle().
 *
 * Arguments:
 *    dip    - dev info pointer for this instance of the device driver.
 *    mdpp    - the returned MD.
 *    vd_nodep - the returned device node.
 *    vd_portp - the returned port node. The returned port node is NULL
 *           if no port node is found.
 *
 * Return Code:
 *    0    - Success.
 *    ENOENT    - Expected node or property did not exist.
 *    ENXIO    - Unexpected error communicating with MD framework
 */
static int
vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep,
    mde_cookie_t *vd_portp)
{
    int        status = ENOENT;
    char        *node_name = NULL;
    md_t        *mdp = NULL;
    int        num_nodes;
    int        num_vdevs;
    int        num_vports;
    mde_cookie_t    rootnode;
    mde_cookie_t    *listp = NULL;
    boolean_t    found_inst = B_FALSE;
    int        listsz;
    int        idx;
    uint64_t    md_inst;
    int        obp_inst;
    int        instance = ddi_get_instance(dip);

    /*
     * Get the OBP instance number for comparison with the MD instance
     *
     * The "cfg-handle" property of a vdc node in an MD contains the MD's
     * notion of "instance", or unique identifier, for that node; OBP
     * stores the value of the "cfg-handle" MD property as the value of
     * the "reg" property on the node in the device tree it builds from
     * the MD and passes to Solaris. Thus, we look up the devinfo node's
     * "reg" property value to uniquely identify this device instance.
     * If the "reg" property cannot be found, the device tree state is
     * presumably so broken that there is no point in continuing.
     */
    if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) {
        cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG);
        return (ENOENT);
    }
    obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
        OBP_REG, -1);
    DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst);

    /*
     * We now walk the MD nodes to find the node for this vdisk.
     */
    if ((mdp = md_get_handle()) == NULL) {
        cmn_err(CE_WARN, "unable to init machine description");
        return (ENXIO);
    }

    num_nodes = md_node_count(mdp);
    ASSERT(num_nodes > 0);

    listsz = num_nodes * sizeof (mde_cookie_t);

    /* allocate memory for nodes */
    listp = kmem_zalloc(listsz, KM_SLEEP);

    rootnode = md_root_node(mdp);
    ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);

    /*
     * Search for all the virtual devices; we will then check to see
     * which ones are disk nodes.
     */
    num_vdevs = md_scan_dag(mdp, rootnode,
        md_find_name(mdp, VDC_MD_VDEV_NAME),
        md_find_name(mdp, "fwd"), listp);

    if (num_vdevs <= 0) {
        cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME);
        status = ENOENT;
        goto done;
    }

    DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs);
    for (idx = 0; idx < num_vdevs; idx++) {
        status = md_get_prop_str(mdp, listp[idx], "name", &node_name);
        if ((status != 0) || (node_name == NULL)) {
            cmn_err(CE_NOTE, "Unable to get name of node type '%s'"
                ": err %d", VDC_MD_VDEV_NAME, status);
            continue;
        }

        DMSGX(1, "[%d] Found node '%s'\n", instance, node_name);
        if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) {
            status = md_get_prop_val(mdp, listp[idx],
                VDC_MD_CFG_HDL, &md_inst);
            DMSGX(1, "[%d] vdc inst in MD=%lx\n",
                instance, md_inst);
            if ((status == 0) && (md_inst == obp_inst)) {
                found_inst = B_TRUE;
                break;
            }
        }
    }

    if (!found_inst) {
        DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME);
        status = ENOENT;
        goto done;
    }
    DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst);

    *vd_nodep = listp[idx];
    *mdpp = mdp;

    num_vports = md_scan_dag(mdp, *vd_nodep,
        md_find_name(mdp, VDC_MD_PORT_NAME),
        md_find_name(mdp, "fwd"), listp);

    if (num_vports != 1) {
        DMSGX(0, "Expected 1 '%s' node for '%s' port, found %d\n",
            VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME, num_vports);
    }

    *vd_portp = (num_vports == 0)? NULL: listp[0];

done:
    kmem_free(listp, listsz);
    return (status);
}
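/*
 * The MD lookups above and below share a common shape: get a handle,
 * size a cookie list from md_node_count(), scan the DAG for nodes of a
 * given name, then read properties off the matches. A condensed,
 * illustrative sketch of that pattern (error handling elided; "NAME"
 * and "prop" are placeholders):
 *
 *    md_t *mdp = md_get_handle();
 *    int i, listsz = md_node_count(mdp) * sizeof (mde_cookie_t);
 *    mde_cookie_t *listp = kmem_zalloc(listsz, KM_SLEEP);
 *    int n = md_scan_dag(mdp, md_root_node(mdp),
 *        md_find_name(mdp, "NAME"), md_find_name(mdp, "fwd"), listp);
 *    for (i = 0; i < n; i++)
 *        (void) md_get_prop_val(mdp, listp[i], "prop", &val);
 *    kmem_free(listp, listsz);
 *    (void) md_fini_handle(mdp);
 */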
/*
 * Function:
 *    vdc_get_ldc_id()
 *
 * Description:
 *    This function gets the 'ldc-id' for this particular instance of
 *    vdc. The id returned is the guest domain channel endpoint LDC uses
 *    for communication with vds.
 *
 * Arguments:
 *    mdp    - pointer to the machine description.
 *    vd_node    - the vdisk element from the MD.
 *    ldc_id    - pointer to variable used to return the 'ldc-id' found.
 *
 * Return Code:
 *    0    - Success.
 *    ENOENT    - Expected node or property did not exist.
 */
static int
vdc_get_ldc_id(md_t *mdp, mde_cookie_t vd_node, uint64_t *ldc_id)
{
    mde_cookie_t    *chanp = NULL;
    int        listsz;
    int        num_chans;
    int        num_nodes;
    int        status = 0;

    num_nodes = md_node_count(mdp);
    ASSERT(num_nodes > 0);

    listsz = num_nodes * sizeof (mde_cookie_t);

    /* allocate memory for nodes */
    chanp = kmem_zalloc(listsz, KM_SLEEP);

    /* get the channels for this node */
    num_chans = md_scan_dag(mdp, vd_node,
        md_find_name(mdp, VDC_MD_CHAN_NAME),
        md_find_name(mdp, "fwd"), chanp);

    /* expecting at least one channel */
    if (num_chans <= 0) {
        cmn_err(CE_NOTE, "No '%s' node for '%s' port",
            VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME);
        status = ENOENT;
        goto done;

    } else if (num_chans != 1) {
        DMSGX(0, "Expected 1 '%s' node for '%s' port, found %d\n",
            VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, num_chans);
    }

    /*
     * We use the first channel found (index 0), irrespective of how
     * many there are in total.
     */
    if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, ldc_id) != 0) {
        cmn_err(CE_NOTE, "Channel '%s' property not found", VDC_MD_ID);
        status = ENOENT;
    }

done:
    kmem_free(chanp, listsz);
    return (status);
}

static int
vdc_do_ldc_up(vdc_t *vdc)
{
    int        status;
    ldc_status_t    ldc_state;

    DMSG(vdc, 0, "[%d] Bringing up channel %lx\n",
        vdc->instance, vdc->ldc_id);

    if (vdc->lifecycle == VDC_LC_DETACHING)
        return (EINVAL);

    if ((status = ldc_up(vdc->ldc_handle)) != 0) {
        switch (status) {
        case ECONNREFUSED:    /* listener not ready at other end */
            DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n",
                vdc->instance, vdc->ldc_id, status);
            status = 0;
            break;
        default:
            DMSG(vdc, 0, "[%d] Failed to bring up LDC: "
                "channel=%ld, err=%d", vdc->instance, vdc->ldc_id,
                status);
            break;
        }
    }

    if (ldc_status(vdc->ldc_handle, &ldc_state) == 0) {
        vdc->ldc_state = ldc_state;
        if (ldc_state == LDC_UP) {
            DMSG(vdc, 0, "[%d] LDC channel already up\n",
                vdc->instance);
            vdc->seq_num = 1;
            vdc->seq_num_reply = 0;
        }
    }

    return (status);
}
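/*
 * vdc_terminate_ldc() below relies on the 'initialized' bitmask that
 * the setup code maintains: each setup step sets a flag (VDC_LDC,
 * VDC_LDC_CB, VDC_LDC_OPEN, ...) once it succeeds, and teardown only
 * undoes the steps whose flags are set. A minimal sketch of the idiom,
 * with a hypothetical step_init()/step_fini() pair standing in for the
 * LDC calls:
 *
 *    if (step_init() == 0)
 *        vdc->initialized |= VDC_SOME_FLAG;   // hypothetical flag
 *    ...
 *    if (vdc->initialized & VDC_SOME_FLAG) {
 *        (void) step_fini();
 *        vdc->initialized &= ~VDC_SOME_FLAG;
 *    }
 *
 * This makes teardown safe to call from any partially-initialized
 * state.
 */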
/*
 * Function:
 *    vdc_terminate_ldc()
 *
 * Description:
 *    Tear down the LDC connection for this instance: close the channel,
 *    unregister the callback and free the LDC resources, depending on
 *    how far initialization got.
 *
 * Arguments:
 *    vdc    - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *    None
 */
static void
vdc_terminate_ldc(vdc_t *vdc)
{
    int    instance = ddi_get_instance(vdc->dip);

    ASSERT(vdc != NULL);
    ASSERT(mutex_owned(&vdc->lock));

    DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized);

    if (vdc->initialized & VDC_LDC_OPEN) {
        DMSG(vdc, 0, "[%d] ldc_close()\n", instance);
        (void) ldc_close(vdc->ldc_handle);
    }
    if (vdc->initialized & VDC_LDC_CB) {
        DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance);
        (void) ldc_unreg_callback(vdc->ldc_handle);
    }
    if (vdc->initialized & VDC_LDC) {
        DMSG(vdc, 0, "[%d] ldc_fini()\n", instance);
        (void) ldc_fini(vdc->ldc_handle);
        vdc->ldc_handle = NULL;
    }

    vdc->initialized &= ~(VDC_LDC | VDC_LDC_CB | VDC_LDC_OPEN);
}

/* -------------------------------------------------------------------------- */

/*
 * Descriptor Ring helper routines
 */
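/*
 * The sizing logic in vdc_init_descriptor_ring() below computes how
 * many LDC memory cookies a single descriptor must be able to carry.
 * As a worked example with illustrative (not mandated) values: if the
 * server advertises max_xfer_sz = 16 blocks of block_size = 512 bytes
 * on a system with 8K pages, then
 *
 *    max_xfer_sz * block_size = 16 * 512 = 8192 bytes
 *
 * which is less than a typical 1M maxphys, so the minimum applies and
 *
 *    dring_max_cookies = maxphys / PAGESIZE = 1048576 / 8192 = 128
 *
 * and each ring entry is sized to hold the base vd_dring_entry_t plus
 * 127 additional ldc_mem_cookie_t structures.
 */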
/*
 * Function:
 *    vdc_init_descriptor_ring()
 *
 * Description:
 *    Create and bind the descriptor ring used to exchange requests with
 *    the vDisk server, allocate the local copy of the ring and
 *    initialize the memory handles of all its entries. Each step is
 *    skipped if it has already been done (as recorded in
 *    vdc->initialized).
 *
 * Arguments:
 *    vdc    - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *    0 - Success
 */
static int
vdc_init_descriptor_ring(vdc_t *vdc)
{
    vd_dring_entry_t    *dep = NULL;    /* DRing Entry pointer */
    int    status = 0;
    int    i;

    DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized);

    ASSERT(vdc != NULL);
    ASSERT(mutex_owned(&vdc->lock));
    ASSERT(vdc->ldc_handle != NULL);

    /* ensure we have enough room to store max sized block */
    ASSERT(maxphys <= VD_MAX_BLOCK_SIZE);

    if ((vdc->initialized & VDC_DRING_INIT) == 0) {
        DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance);
        /*
         * Calculate the maximum block size we can transmit using one
         * Descriptor Ring entry from the attributes returned by the
         * vDisk server. This is subject to a minimum of 'maxphys'
         * as we do not have the capability to split requests over
         * multiple DRing entries.
         */
        if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) {
            DMSG(vdc, 0, "[%d] using minimum DRing size\n",
                vdc->instance);
            vdc->dring_max_cookies = maxphys / PAGESIZE;
        } else {
            vdc->dring_max_cookies =
                (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE;
        }
        vdc->dring_entry_size = (sizeof (vd_dring_entry_t) +
            (sizeof (ldc_mem_cookie_t) *
            (vdc->dring_max_cookies - 1)));
        vdc->dring_len = VD_DRING_LEN;

        status = ldc_mem_dring_create(vdc->dring_len,
            vdc->dring_entry_size, &vdc->ldc_dring_hdl);
        if ((vdc->ldc_dring_hdl == NULL) || (status != 0)) {
            DMSG(vdc, 0, "[%d] Descriptor ring creation failed",
                vdc->instance);
            return (status);
        }
        vdc->initialized |= VDC_DRING_INIT;
    }

    if ((vdc->initialized & VDC_DRING_BOUND) == 0) {
        DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance);
        vdc->dring_cookie =
            kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP);

        status = ldc_mem_dring_bind(vdc->ldc_handle, vdc->ldc_dring_hdl,
            LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW,
            &vdc->dring_cookie[0],
            &vdc->dring_cookie_count);
        if (status != 0) {
            DMSG(vdc, 0, "[%d] Failed to bind descriptor ring "
                "(%lx) to channel (%lx) status=%d\n",
                vdc->instance, vdc->ldc_dring_hdl,
                vdc->ldc_handle, status);
            return (status);
        }
        ASSERT(vdc->dring_cookie_count == 1);
        vdc->initialized |= VDC_DRING_BOUND;
    }

    status = ldc_mem_dring_info(vdc->ldc_dring_hdl, &vdc->dring_mem_info);
    if (status != 0) {
        DMSG(vdc, 0,
            "[%d] Failed to get info for descriptor ring (%lx)\n",
            vdc->instance, vdc->ldc_dring_hdl);
        return (status);
    }

    if ((vdc->initialized & VDC_DRING_LOCAL) == 0) {
        DMSG(vdc, 0, "[%d] local dring\n", vdc->instance);

        /* Allocate the local copy of this dring */
        vdc->local_dring =
            kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t),
            KM_SLEEP);
        vdc->initialized |= VDC_DRING_LOCAL;
    }

    /*
     * Mark all DRing entries as free and initialize the private
     * descriptor's memory handles. If any entry is initialized,
     * we need to free it later so we set the bit in 'initialized'
     * at the start.
     */
    vdc->initialized |= VDC_DRING_ENTRY;
    for (i = 0; i < vdc->dring_len; i++) {
        dep = VDC_GET_DRING_ENTRY_PTR(vdc, i);
        dep->hdr.dstate = VIO_DESC_FREE;

        status = ldc_mem_alloc_handle(vdc->ldc_handle,
            &vdc->local_dring[i].desc_mhdl);
        if (status != 0) {
            DMSG(vdc, 0, "![%d] Failed to alloc mem handle for"
                " descriptor %d", vdc->instance, i);
            return (status);
        }
        vdc->local_dring[i].is_free = B_TRUE;
        vdc->local_dring[i].dep = dep;
    }

    /* Initialize the starting index */
    vdc->dring_curr_idx = 0;

    return (status);
}
/*
 * Function:
 *    vdc_destroy_descriptor_ring()
 *
 * Description:
 *    Undo the work of vdc_init_descriptor_ring(): free the memory
 *    handles of all local entries, free the local copy of the ring,
 *    and unbind and destroy the shared descriptor ring.
 *
 * Arguments:
 *    vdc    - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *    None
 */
static void
vdc_destroy_descriptor_ring(vdc_t *vdc)
{
    vdc_local_desc_t    *ldep = NULL;    /* Local Dring Entry Pointer */
    ldc_mem_handle_t    mhdl = NULL;
    ldc_mem_info_t        minfo;
    int            status = -1;
    int            i;    /* loop */

    ASSERT(vdc != NULL);
    ASSERT(mutex_owned(&vdc->lock));

    DMSG(vdc, 0, "[%d] Entered\n", vdc->instance);

    if (vdc->initialized & VDC_DRING_ENTRY) {
        DMSG(vdc, 0,
            "[%d] Removing Local DRing entries\n", vdc->instance);
        for (i = 0; i < vdc->dring_len; i++) {
            ldep = &vdc->local_dring[i];
            mhdl = ldep->desc_mhdl;

            if (mhdl == NULL)
                continue;

            if ((status = ldc_mem_info(mhdl, &minfo)) != 0) {
                DMSG(vdc, 0,
                    "ldc_mem_info returned an error: %d\n",
                    status);

                /*
                 * This must mean that the mem handle
                 * is not valid. Clear it out so that
                 * no one tries to use it.
                 */
                ldep->desc_mhdl = NULL;
                continue;
            }

            if (minfo.status == LDC_BOUND) {
                (void) ldc_mem_unbind_handle(mhdl);
            }

            (void) ldc_mem_free_handle(mhdl);

            ldep->desc_mhdl = NULL;
        }
        vdc->initialized &= ~VDC_DRING_ENTRY;
    }

    if (vdc->initialized & VDC_DRING_LOCAL) {
        DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance);
        kmem_free(vdc->local_dring,
            vdc->dring_len * sizeof (vdc_local_desc_t));
        vdc->initialized &= ~VDC_DRING_LOCAL;
    }

    if (vdc->initialized & VDC_DRING_BOUND) {
        DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance);
        status = ldc_mem_dring_unbind(vdc->ldc_dring_hdl);
        if (status == 0) {
            vdc->initialized &= ~VDC_DRING_BOUND;
        } else {
            DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx",
                vdc->instance, status, vdc->ldc_dring_hdl);
        }
        kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t));
    }

    if (vdc->initialized & VDC_DRING_INIT) {
        DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance);
        status = ldc_mem_dring_destroy(vdc->ldc_dring_hdl);
        if (status == 0) {
            vdc->ldc_dring_hdl = NULL;
            bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t));
            vdc->initialized &= ~VDC_DRING_INIT;
        } else {
            DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)",
                vdc->instance, status, vdc->ldc_dring_hdl);
        }
    }
}
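/*
 * Each shared descriptor moves through a simple lifecycle that the
 * routines below drive (a sketch of the intended flow; the state names
 * are the VIO ones used in this file):
 *
 *    VIO_DESC_FREE        entry available, is_free == B_TRUE
 *        | vdc_map_to_shared_dring()
 *        v
 *    VIO_DESC_READY       request posted to the server
 *        | server processes the request and ACKs
 *        v
 *    VIO_DESC_DONE        payload.status holds the result
 *        | vdc_depopulate_descriptor()
 *        v
 *    VIO_DESC_FREE        handles unbound, entry recycled
 */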
/*
 * Function:
 *    vdc_map_to_shared_dring()
 *
 * Description:
 *    Copy contents of the local descriptor to the shared
 *    memory descriptor.
 *
 * Arguments:
 *    vdcp    - soft state pointer for this instance of the device driver.
 *    idx    - descriptor ring index
 *
 * Return Code:
 *    0    - Success
 *    errno    - error returned by vdc_populate_mem_hdl()
 */
static int
vdc_map_to_shared_dring(vdc_t *vdcp, int idx)
{
    vdc_local_desc_t    *ldep;
    vd_dring_entry_t    *dep;
    int            rv;

    ldep = &(vdcp->local_dring[idx]);

    /* for now leave in the old pop_mem_hdl stuff */
    if (ldep->nbytes > 0) {
        rv = vdc_populate_mem_hdl(vdcp, ldep);
        if (rv) {
            DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n",
                vdcp->instance);
            return (rv);
        }
    }

    /*
     * fill in the data details into the DRing
     */
    dep = ldep->dep;
    ASSERT(dep != NULL);

    dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp);
    dep->payload.operation = ldep->operation;
    dep->payload.addr = ldep->offset;
    dep->payload.nbytes = ldep->nbytes;
    dep->payload.status = (uint32_t)-1;    /* vds will set valid value */
    dep->payload.slice = ldep->slice;
    dep->hdr.dstate = VIO_DESC_READY;
    dep->hdr.ack = 1;    /* request an ACK for every message */

    return (0);
}

/*
 * Function:
 *    vdc_send_request
 *
 * Description:
 *    This routine waits for the driver to reach the RUNNING state and
 *    then hands the request off to vdc_populate_descriptor(), which
 *    writes it into the descriptor ring and notifies vds. The request
 *    is retried if the connection is reset in the meantime.
 *
 * Arguments:
 *    vdcp     - the soft state pointer
 *    operation - operation we want vds to perform (VD_OP_XXX)
 *    addr     - address of data buf to be read/written.
 *    nbytes     - number of bytes to read/write
 *    slice     - the disk slice this request is for
 *    offset     - relative disk offset
 *    cb_type     - type of call - STRATEGY or SYNC
 *    cb_arg     - parameter to be sent to server (depends on VD_OP_XXX type)
 *            . mode for ioctl(9e)
 *            . LP64 diskaddr_t (block I/O)
 *    dir     - direction of operation (READ/WRITE/BOTH)
 *
 * Return Codes:
 *    0
 *    EIO
 *    ENXIO
 */
static int
vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
    void *cb_arg, vio_desc_direction_t dir)
{
    ASSERT(vdcp != NULL);
    ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR);

    mutex_enter(&vdcp->lock);

    do {
        while (vdcp->state != VDC_STATE_RUNNING) {

            /* return error if detaching */
            if (vdcp->state == VDC_STATE_DETACH) {
                mutex_exit(&vdcp->lock);
                return (ENXIO);
            }

            /* fail request if connection timeout is reached */
            if (vdcp->ctimeout_reached) {
                mutex_exit(&vdcp->lock);
                return (EIO);
            }

            /*
             * If we are panicking and the disk is not ready then
             * we can't send any request because we can't complete
             * the handshake now.
             */
            if (ddi_in_panic()) {
                mutex_exit(&vdcp->lock);
                return (EIO);
            }

            cv_wait(&vdcp->running_cv, &vdcp->lock);
        }

    } while (vdc_populate_descriptor(vdcp, operation, addr,
        nbytes, slice, offset, cb_type, cb_arg, dir));

    mutex_exit(&vdcp->lock);
    return (0);
}
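/*
 * As an illustrative (not verbatim) sketch of how the strategy(9E)
 * path uses this interface, a buf(9S) based read could be submitted
 * roughly as follows, assuming the VD_OP_BREAD opcode from
 * <sys/vdsk_common.h> and a slice-extraction macro such as VDCPART()
 * (hypothetical here); the exact call site lives in vdc_strategy():
 *
 *    rv = vdc_send_request(vdc, VD_OP_BREAD,
 *        bufp->b_un.b_addr,        // data buffer
 *        bufp->b_bcount,           // transfer size
 *        VDCPART(bufp->b_edev),    // slice
 *        bufp->b_lblkno,           // disk offset in blocks
 *        CB_STRATEGY, bufp,        // completion via biodone()
 *        VIO_read_dir);
 *
 * The cb_type/cb_arg pair determines how completion is reported: a
 * CB_STRATEGY request completes asynchronously through biodone(),
 * while CB_SYNC requests are waited on in vdc_do_sync_op() below.
 */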
/*
 * Function:
 *    vdc_populate_descriptor
 *
 * Description:
 *    This routine writes the data to be transmitted to vds into the
 *    descriptor, notifies vds that the ring has been updated and
 *    then waits for the request to be processed.
 *
 * Arguments:
 *    vdcp     - the soft state pointer
 *    operation - operation we want vds to perform (VD_OP_XXX)
 *    addr     - address of data buf to be read/written.
 *    nbytes     - number of bytes to read/write
 *    slice     - the disk slice this request is for
 *    offset     - relative disk offset
 *    cb_type     - type of call - STRATEGY or SYNC
 *    cb_arg     - parameter to be sent to server (depends on VD_OP_XXX type)
 *            . mode for ioctl(9e)
 *            . LP64 diskaddr_t (block I/O)
 *    dir     - direction of operation (READ/WRITE/BOTH)
 *
 * Return Codes:
 *    0
 *    EAGAIN
 *    ECONNRESET
 *    ENXIO
 */
static int
vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr,
    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
    void *cb_arg, vio_desc_direction_t dir)
{
    vdc_local_desc_t    *local_dep = NULL;    /* Local Dring Pointer */
    int            idx;        /* Index of DRing entry used */
    int            next_idx;
    vio_dring_msg_t        dmsg;
    size_t            msglen;
    int            rv;

    ASSERT(MUTEX_HELD(&vdcp->lock));
    vdcp->threads_pending++;
loop:
    DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx);

    /* Get next available D-Ring entry */
    idx = vdcp->dring_curr_idx;
    local_dep = &(vdcp->local_dring[idx]);

    if (!local_dep->is_free) {
        DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n",
            vdcp->instance);
        cv_wait(&vdcp->dring_free_cv, &vdcp->lock);
        if (vdcp->state == VDC_STATE_RUNNING ||
            vdcp->state == VDC_STATE_HANDLE_PENDING) {
            goto loop;
        }
        vdcp->threads_pending--;
        return (ECONNRESET);
    }

    next_idx = idx + 1;
    if (next_idx >= vdcp->dring_len)
        next_idx = 0;
    vdcp->dring_curr_idx = next_idx;

    ASSERT(local_dep->is_free);

    local_dep->operation = operation;
    local_dep->addr = addr;
    local_dep->nbytes = nbytes;
    local_dep->slice = slice;
    local_dep->offset = offset;
    local_dep->cb_type = cb_type;
    local_dep->cb_arg = cb_arg;
    local_dep->dir = dir;

    local_dep->is_free = B_FALSE;

    rv = vdc_map_to_shared_dring(vdcp, idx);
    if (rv) {
        DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n",
            vdcp->instance);
        /* free the descriptor */
        local_dep->is_free = B_TRUE;
        vdcp->dring_curr_idx = idx;
        cv_wait(&vdcp->membind_cv, &vdcp->lock);
        if (vdcp->state == VDC_STATE_RUNNING ||
            vdcp->state == VDC_STATE_HANDLE_PENDING) {
            goto loop;
        }
        vdcp->threads_pending--;
        return (ECONNRESET);
    }

    /*
     * Send a msg with the DRing details to vds
     */
    VIO_INIT_DRING_DATA_TAG(dmsg);
    VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp);
    dmsg.dring_ident = vdcp->dring_ident;
    dmsg.start_idx = idx;
    dmsg.end_idx = idx;
    vdcp->seq_num++;

    DTRACE_IO2(send, vio_dring_msg_t *, &dmsg, vdc_t *, vdcp);

    DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n",
        vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num);

    /*
     * note we're still holding the lock here to
     * make sure the message goes out in order !!!...
     */
    msglen = sizeof (dmsg);
    rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen);
    switch (rv) {
    case ECONNRESET:
        /*
         * vdc_send initiates the reset on failure.
         * Since the transaction has already been put
         * on the local dring, it will automatically get
         * retried when the channel is reset. Given that,
         * it is ok to just return success even though the
         * send failed.
         */
        rv = 0;
        break;

    case 0:        /* EOK */
        DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv);
        break;

    default:
        goto cleanup_and_exit;
    }

    vdcp->threads_pending--;
    return (rv);

cleanup_and_exit:
    DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv);
    return (ENXIO);
}

/*
 * Function:
 *    vdc_do_sync_op
 *
 * Description:
 *    Wrapper around vdc_populate_descriptor that blocks until the
 *    response to the message is available.
 *
 * Arguments:
 *    vdcp     - the soft state pointer
 *    operation - operation we want vds to perform (VD_OP_XXX)
 *    addr     - address of data buf to be read/written.
 *    nbytes     - number of bytes to read/write
 *    slice     - the disk slice this request is for
 *    offset     - relative disk offset
 *    cb_type     - type of call - STRATEGY or SYNC
 *    cb_arg     - parameter to be sent to server (depends on VD_OP_XXX type)
 *            . mode for ioctl(9e)
 *            . LP64 diskaddr_t (block I/O)
 *    dir     - direction of operation (READ/WRITE/BOTH)
 *    rconflict - check for reservation conflict in case of failure
 *
 * rconflict should be set to B_TRUE by most callers. Callers invoking the
 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the
 * result of a successful operation with vd_scsi_status().
 *
 * Return Codes:
 *    0
 *    EAGAIN
 *    EFAULT
 *    ENXIO
 *    EIO
 */
static int
vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes,
    int slice, diskaddr_t offset, int cb_type, void *cb_arg,
    vio_desc_direction_t dir, boolean_t rconflict)
{
    int        status;
    vdc_io_t    *vio;
    boolean_t    check_resv_conflict = B_FALSE;

    ASSERT(cb_type == CB_SYNC);

    /*
     * Grab the lock; if blocked, wait until the server
     * response causes us to wake up again.
     */
    mutex_enter(&vdcp->lock);
    vdcp->sync_op_cnt++;
    while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH)
        cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock);

    if (vdcp->state == VDC_STATE_DETACH) {
        cv_broadcast(&vdcp->sync_blocked_cv);
        vdcp->sync_op_cnt--;
        mutex_exit(&vdcp->lock);
        return (ENXIO);
    }

    /* now block any other thread entering after us */
    vdcp->sync_op_blocked = B_TRUE;
    vdcp->sync_op_pending = B_TRUE;
    mutex_exit(&vdcp->lock);

    status = vdc_send_request(vdcp, operation, addr,
        nbytes, slice, offset, cb_type, cb_arg, dir);

    mutex_enter(&vdcp->lock);

    if (status != 0) {
        vdcp->sync_op_pending = B_FALSE;
    } else {
        /*
         * Block until our transaction completes.
         * Anyone else waiting then gets to go next.
         */
        while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH)
            cv_wait(&vdcp->sync_pending_cv, &vdcp->lock);

        DMSG(vdcp, 2, ": operation returned %d\n",
            vdcp->sync_op_status);
        if (vdcp->state == VDC_STATE_DETACH) {
            vdcp->sync_op_pending = B_FALSE;
            status = ENXIO;
        } else {
            status = vdcp->sync_op_status;
            if (status != 0 && vdcp->failfast_interval != 0) {
                /*
                 * Operation has failed and failfast is enabled.
                 * We need to check if the failure is due to a
                 * reservation conflict if this was requested.
                 */
                check_resv_conflict = rconflict;
            }

        }
    }

    vdcp->sync_op_status = 0;
    vdcp->sync_op_blocked = B_FALSE;
    vdcp->sync_op_cnt--;

    /* signal the next waiting thread */
    cv_signal(&vdcp->sync_blocked_cv);

    /*
     * We have to check for reservation conflict after unblocking sync
     * operations because some sync operations will be used to do this
     * check.
     */
    if (check_resv_conflict) {
        vio = vdc_failfast_io_queue(vdcp, NULL);
        while (vio->vio_qtime != 0)
            cv_wait(&vdcp->failfast_io_cv, &vdcp->lock);
        kmem_free(vio, sizeof (vdc_io_t));
    }

    mutex_exit(&vdcp->lock);

    return (status);
}
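/*
 * Synchronous operations back the ioctl(9E) paths. As a rough,
 * hypothetical sketch (the opcode, buffer and mode handling are
 * simplified here; see the actual ioctl plumbing for the real call
 * sites), a request such as fetching the VTOC might be issued as:
 *
 *    status = vdc_do_sync_op(vdc, VD_OP_GET_VTOC, (caddr_t)buf,
 *        buflen, 0, 0, CB_SYNC, (void *)(uint64_t)mode,
 *        VIO_both_dir, B_TRUE);
 *
 * The call returns only after the server has ACKed the request and the
 * descriptor has been depopulated, so 'buf' is valid on success.
 */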
/*
 * Function:
 *    vdc_drain_response()
 *
 * Description:
 *    When a guest is panicking, the completion of requests needs to be
 *    handled differently because interrupts are disabled and vdc
 *    will not get messages. We have to poll for the messages instead.
 *
 * Arguments:
 *    vdc    - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *    0 - Success
 */
static int
vdc_drain_response(vdc_t *vdc)
{
    int            rv, idx, retries;
    size_t            msglen;
    vdc_local_desc_t    *ldep = NULL;    /* Local Dring Entry Pointer */
    vio_dring_msg_t        dmsg;

    mutex_enter(&vdc->lock);

    retries = 0;
    for (;;) {
        msglen = sizeof (dmsg);
        rv = ldc_read(vdc->ldc_handle, (caddr_t)&dmsg, &msglen);
        if (rv) {
            rv = EINVAL;
            break;
        }

        /*
         * if there are no packets wait and check again
         */
        if ((rv == 0) && (msglen == 0)) {
            if (retries++ > vdc_dump_retries) {
                rv = EAGAIN;
                break;
            }

            drv_usecwait(vdc_usec_timeout_dump);
            continue;
        }

        /*
         * Ignore all messages that are not ACKs/NACKs to
         * DRing requests.
         */
        if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) ||
            (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) {
            DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n",
                dmsg.tag.vio_msgtype,
                dmsg.tag.vio_subtype,
                dmsg.tag.vio_subtype_env);
            continue;
        }

        /*
         * set the appropriate return value for the current request.
         */
        switch (dmsg.tag.vio_subtype) {
        case VIO_SUBTYPE_ACK:
            rv = 0;
            break;
        case VIO_SUBTYPE_NACK:
            rv = EAGAIN;
            break;
        default:
            continue;
        }

        idx = dmsg.start_idx;
        if (idx >= vdc->dring_len) {
            DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n",
                vdc->instance, idx);
            continue;
        }
        ldep = &vdc->local_dring[idx];
        if (ldep->dep->hdr.dstate != VIO_DESC_DONE) {
            DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n",
                vdc->instance, idx, ldep->dep->hdr.dstate);
            continue;
        }

        DMSG(vdc, 1, "[%d] Depopulating idx=%d state=%d\n",
            vdc->instance, idx, ldep->dep->hdr.dstate);
        rv = vdc_depopulate_descriptor(vdc, idx);
        if (rv) {
            DMSG(vdc, 0,
                "[%d] Entry @ %d - depopulate failed ..\n",
                vdc->instance, idx);
        }

        /* if this is the last descriptor - break out of loop */
        if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx)
            break;
    }

    mutex_exit(&vdc->lock);
    DMSG(vdc, 0, "End idx=%d\n", idx);

    return (rv);
}


/*
 * Function:
 *    vdc_depopulate_descriptor()
 *
 * Description:
 *    Free up the descriptor ring entry at the given index once the
 *    request has completed: unbind any memory handle, copy data back
 *    from the aligned bounce buffer if one was used, and mark the
 *    entry free again.
 *
 * Arguments:
 *    vdc    - soft state pointer for this instance of the device driver.
 *    idx    - Index of the Descriptor Ring entry being modified
 *
 * Return Code:
 *    0 - Success
 */
static int
vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx)
{
    vd_dring_entry_t    *dep = NULL;    /* Dring Entry Pointer */
    vdc_local_desc_t    *ldep = NULL;    /* Local Dring Entry Pointer */
    int        status = ENXIO;
    int        rv = 0;

    ASSERT(vdc != NULL);
    ASSERT(idx < vdc->dring_len);
    ldep = &vdc->local_dring[idx];
    ASSERT(ldep != NULL);
    ASSERT(MUTEX_HELD(&vdc->lock));

    DMSG(vdc, 2, ": idx = %d\n", idx);
    dep = ldep->dep;
    ASSERT(dep != NULL);
    ASSERT((dep->hdr.dstate == VIO_DESC_DONE) ||
        (dep->payload.status == ECANCELED));

    VDC_MARK_DRING_ENTRY_FREE(vdc, idx);

    ldep->is_free = B_TRUE;
    status = dep->payload.status;
    DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status);

    /*
     * If no buffers were used to transfer information to the server when
     * populating the descriptor then no memory handles need to be unbound
     * and we can return now.
     */
    if (ldep->nbytes == 0) {
        cv_signal(&vdc->dring_free_cv);
        return (status);
    }

    /*
     * If the upper layer passed in a misaligned address we copied the
     * data into an aligned buffer before sending it to LDC - we now
     * copy it back to the original buffer.
     */
    if (ldep->align_addr) {
        ASSERT(ldep->addr != NULL);

        if (dep->payload.nbytes > 0)
            bcopy(ldep->align_addr, ldep->addr,
                dep->payload.nbytes);
        kmem_free(ldep->align_addr,
            sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8));
        ldep->align_addr = NULL;
    }

    rv = ldc_mem_unbind_handle(ldep->desc_mhdl);
    if (rv != 0) {
        DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)",
            vdc->instance, ldep->desc_mhdl, idx, rv);
        /*
         * The error returned by the vDisk server is more informative
         * and thus has a higher priority, but if it isn't set we
         * ensure that this function returns an error.
         */
        if (status == 0)
            status = EINVAL;
    }

    cv_signal(&vdc->membind_cv);
    cv_signal(&vdc->dring_free_cv);

    return (status);
}

/*
 * Function:
 *    vdc_populate_mem_hdl()
 *
 * Description:
 *    Bind the data buffer of the given local descriptor ring entry to
 *    its LDC memory handle so that the vDisk server can map it in.
 *    Misaligned buffers are first copied into an 8-byte aligned bounce
 *    buffer which is bound instead.
 *
 * Arguments:
 *    vdcp    - soft state pointer for this instance of the device driver.
 *    ldep    - local descriptor ring entry whose buffer is to be bound.
 *
 * Return Code:
 *    0 - Success
 */
static int
vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep)
{
    vd_dring_entry_t    *dep = NULL;
    ldc_mem_handle_t    mhdl;
    caddr_t            vaddr;
    size_t            nbytes;
    uint8_t            perm = LDC_MEM_RW;
    uint8_t            maptype;
    int            rv = 0;
    int            i;

    ASSERT(vdcp != NULL);

    dep = ldep->dep;
    mhdl = ldep->desc_mhdl;

    switch (ldep->dir) {
    case VIO_read_dir:
        perm = LDC_MEM_W;
        break;

    case VIO_write_dir:
        perm = LDC_MEM_R;
        break;

    case VIO_both_dir:
        perm = LDC_MEM_RW;
        break;

    default:
        ASSERT(0);    /* catch bad programming in vdc */
    }

    /*
     * LDC expects any addresses passed in to be 8-byte aligned. We need
     * to copy the contents of any misaligned buffers to a newly allocated
     * buffer and bind it instead (and copy the contents back to the
     * original buffer passed in when depopulating the descriptor)
     */
    vaddr = ldep->addr;
    nbytes = ldep->nbytes;
    if (((uint64_t)vaddr & 0x7) != 0) {
        ASSERT(ldep->align_addr == NULL);
        ldep->align_addr =
            kmem_alloc(sizeof (caddr_t) *
            P2ROUNDUP(nbytes, 8), KM_SLEEP);
        DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating "
            "(buf=%p nb=%ld op=%d)\n",
            vdcp->instance, (void *)vaddr, (void *)ldep->align_addr,
            nbytes, ldep->operation);
        if (perm != LDC_MEM_W)
            bcopy(vaddr, ldep->align_addr, nbytes);
        vaddr = ldep->align_addr;
    }

    maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP;
    rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
        maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies);
    DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n",
        vdcp->instance, dep->payload.ncookies);
    if (rv != 0) {
        DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle "
            "(mhdl=%p, buf=%p, err=%d)\n",
            vdcp->instance, (void *)mhdl, (void *)vaddr, rv);
        if (ldep->align_addr) {
            kmem_free(ldep->align_addr,
                sizeof (caddr_t) * P2ROUNDUP(nbytes, 8));
            ldep->align_addr = NULL;
        }
        return (EAGAIN);
    }
    /*
     * Get the other cookies (if any).
     */
    for (i = 1; i < dep->payload.ncookies; i++) {
        rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]);
        if (rv != 0) {
            (void) ldc_mem_unbind_handle(mhdl);
            DMSG(vdcp, 0, "?[%d] Failed to get next cookie "
                "(mhdl=%lx cnum=%d), err=%d",
                vdcp->instance, mhdl, i, rv);
            if (ldep->align_addr) {
                /*
                 * Free with the same size used for the
                 * allocation above.
                 */
                kmem_free(ldep->align_addr,
                    sizeof (caddr_t) *
                    P2ROUNDUP(ldep->nbytes, 8));
                ldep->align_addr = NULL;
            }
            return (EAGAIN);
        }
    }

    return (rv);
}

/*
 * Interrupt handlers for messages from LDC
 */

/*
 * Function:
 *    vdc_handle_cb()
 *
 * Description:
 *    LDC callback invoked for channel events (up, read, reset, down).
 *    Depending on the event we update the handshake state, wake up the
 *    reader, or initiate a reset.
 *
 * Arguments:
 *    event    - Type of event (LDC_EVT_xxx) that triggered the callback
 *    arg    - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *    0 - Success
 */
static uint_t
vdc_handle_cb(uint64_t event, caddr_t arg)
{
    ldc_status_t    ldc_state;
    int        rv = 0;

    vdc_t    *vdc = (vdc_t *)(void *)arg;

    ASSERT(vdc != NULL);

    DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num);

    /*
     * Depending on the type of event that triggered this callback,
     * we modify the handshake state or read the data.
     *
     * NOTE: not done as a switch() as event could be triggered by
     * a state change and a read request. Also the ordering of the
     * check for the event types is deliberate.
     */
    if (event & LDC_EVT_UP) {
        DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance);

        mutex_enter(&vdc->lock);

        /* get LDC state */
        rv = ldc_status(vdc->ldc_handle, &ldc_state);
        if (rv != 0) {
            DMSG(vdc, 0, "[%d] Couldn't get LDC status %d",
                vdc->instance, rv);
            /* don't leak the lock on the error path */
            mutex_exit(&vdc->lock);
            return (LDC_SUCCESS);
        }
        if (vdc->ldc_state != LDC_UP && ldc_state == LDC_UP) {
            /*
             * Reset the transaction sequence numbers when
             * LDC comes up. We then kick off the handshake
             * negotiation with the vDisk server.
             */
            vdc->seq_num = 1;
            vdc->seq_num_reply = 0;
            vdc->ldc_state = ldc_state;
            cv_signal(&vdc->initwait_cv);
        }

        mutex_exit(&vdc->lock);
    }

    if (event & LDC_EVT_READ) {
        DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance);
        mutex_enter(&vdc->read_lock);
        cv_signal(&vdc->read_cv);
        vdc->read_state = VDC_READ_PENDING;
        mutex_exit(&vdc->read_lock);

        /* that's all we have to do - no need to handle DOWN/RESET */
        return (LDC_SUCCESS);
    }

    if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) {

        DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance);

        mutex_enter(&vdc->lock);
        /*
         * Need to wake up any readers so they will
         * detect that a reset has occurred.
         */
        mutex_enter(&vdc->read_lock);
        if ((vdc->read_state == VDC_READ_WAITING) ||
            (vdc->read_state == VDC_READ_RESET))
            cv_signal(&vdc->read_cv);
        vdc->read_state = VDC_READ_RESET;
        mutex_exit(&vdc->read_lock);

        /* wake up any threads waiting for connection to come up */
        if (vdc->state == VDC_STATE_INIT_WAITING) {
            vdc->state = VDC_STATE_RESETTING;
            cv_signal(&vdc->initwait_cv);
        }

        mutex_exit(&vdc->lock);
    }

    if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ))
        DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received",
            vdc->instance, event);

    return (LDC_SUCCESS);
}

/*
 * Function:
 *    vdc_wait_for_response()
 *
 * Description:
 *    Block waiting for a response from the server. If there is no data
 *    available, the thread blocks on the read_cv, which is signalled by
 *    the callback when an LDC_EVT_READ event occurs.
 *
 * Arguments:
 *    vdcp    - soft state pointer for this instance of the device driver.
 *    msgp    - buffer in which the received message is returned.
 *
 * Return Code:
 *    0 - Success
 */
static int
vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp)
{
    size_t        nbytes = sizeof (*msgp);
    int        status;

    ASSERT(vdcp != NULL);

    DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance);

    status = vdc_recv(vdcp, msgp, &nbytes);
    DMSG(vdcp, 3, "vdc_read() done.. status=0x%x size=0x%x\n",
        status, (int)nbytes);
    if (status) {
        DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n",
            vdcp->instance, status);
        return (status);
    }

    if (nbytes < sizeof (vio_msg_tag_t)) {
        DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n",
            vdcp->instance, sizeof (vio_msg_tag_t), nbytes);
        return (ENOMSG);
    }

    DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance,
        msgp->tag.vio_msgtype,
        msgp->tag.vio_subtype,
        msgp->tag.vio_subtype_env);

    /*
     * Verify the Session ID of the message
     *
     * Every message after the Version has been negotiated should
     * have the correct session ID set.
     */
    if ((msgp->tag.vio_sid != vdcp->session_id) &&
        (msgp->tag.vio_subtype_env != VIO_VER_INFO)) {
        DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, "
            "expected 0x%lx [seq num %lx @ %d]",
            vdcp->instance, msgp->tag.vio_sid,
            vdcp->session_id,
            ((vio_dring_msg_t *)msgp)->seq_num,
            ((vio_dring_msg_t *)msgp)->start_idx);
        return (ENOMSG);
    }
    return (0);
}
/*
 * Function:
 *    vdc_resubmit_backup_dring()
 *
 * Description:
 *    Resubmit each descriptor in the backed up dring to
 *    the vDisk server. The dring was backed up during connection
 *    reset.
 *
 * Arguments:
 *    vdcp    - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *    0 - Success
 */
static int
vdc_resubmit_backup_dring(vdc_t *vdcp)
{
    int        count;
    int        b_idx;
    int        rv;
    int        dring_size;
    int        status;
    vio_msg_t    vio_msg;
    vdc_local_desc_t    *curr_ldep;

    ASSERT(MUTEX_NOT_HELD(&vdcp->lock));
    ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING);

    if (vdcp->local_dring_backup == NULL) {
        /* the pending requests have already been processed */
        return (0);
    }

    DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n",
        vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail);

    /*
     * Walk the backup copy of the local descriptor ring and
     * resubmit all the outstanding transactions.
     */
    b_idx = vdcp->local_dring_backup_tail;
    for (count = 0; count < vdcp->local_dring_backup_len; count++) {

        curr_ldep = &(vdcp->local_dring_backup[b_idx]);

        /* only resubmit outstanding transactions */
        if (!curr_ldep->is_free) {

            DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx);
            mutex_enter(&vdcp->lock);
            rv = vdc_populate_descriptor(vdcp, curr_ldep->operation,
                curr_ldep->addr, curr_ldep->nbytes,
                curr_ldep->slice, curr_ldep->offset,
                curr_ldep->cb_type, curr_ldep->cb_arg,
                curr_ldep->dir);
            mutex_exit(&vdcp->lock);
            if (rv) {
                DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n",
                    vdcp->instance, b_idx);
                return (rv);
            }

            /* Wait for the response message. */
            DMSG(vdcp, 1, "waiting for response to idx=%x\n",
                b_idx);
            status = vdc_wait_for_response(vdcp, &vio_msg);
            if (status) {
                DMSG(vdcp, 1, "[%d] wait_for_response "
                    "returned err=%d\n", vdcp->instance,
                    status);
                return (status);
            }

            DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx);
            status = vdc_process_data_msg(vdcp, &vio_msg);
            if (status) {
                DMSG(vdcp, 1, "[%d] process_data_msg "
                    "returned err=%d\n", vdcp->instance,
                    status);
                return (status);
            }
        }

        /* get the next element to submit */
        if (++b_idx >= vdcp->local_dring_backup_len)
            b_idx = 0;
    }

    /* all done - now clear up pending dring copy */
    dring_size = vdcp->local_dring_backup_len *
        sizeof (vdcp->local_dring_backup[0]);

    kmem_free(vdcp->local_dring_backup, dring_size);

    vdcp->local_dring_backup = NULL;

    return (0);
}
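/*
 * Together with vdc_backup_local_dring() and vdc_cancel_backup_ring()
 * below, this implements a simple replay scheme across connection
 * resets. In outline (a sketch of the flow, not literal code):
 *
 *    on reset:
 *        vdc_backup_local_dring(vdcp);    // snapshot outstanding I/O
 *        vdc_destroy_descriptor_ring(vdcp);
 *    on reconnect (VDC_STATE_HANDLE_PENDING):
 *        vdc_resubmit_backup_dring(vdcp); // replay each entry in order
 *    on timeout or detach:
 *        vdc_cancel_backup_ring(vdcp);    // fail each entry with EIO
 */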
/*
 * Function:
 *    vdc_cancel_backup_ring
 *
 * Description:
 *    Cancel each descriptor in the backed up dring to the vDisk server.
 *    The dring was backed up during connection reset.
 *
 * Arguments:
 *    vdcp    - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *    None
 */
void
vdc_cancel_backup_ring(vdc_t *vdcp)
{
    vdc_local_desc_t    *ldep;
    struct buf        *bufp;
    int        count;
    int        b_idx;
    int        dring_size;

    ASSERT(MUTEX_HELD(&vdcp->lock));
    ASSERT(vdcp->state == VDC_STATE_INIT ||
        vdcp->state == VDC_STATE_INIT_WAITING ||
        vdcp->state == VDC_STATE_NEGOTIATE ||
        vdcp->state == VDC_STATE_RESETTING);

    if (vdcp->local_dring_backup == NULL) {
        /* the pending requests have already been processed */
        return;
    }

    DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n",
        vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail);

    /*
     * Walk the backup copy of the local descriptor ring and
     * cancel all the outstanding transactions.
     */
    b_idx = vdcp->local_dring_backup_tail;
    for (count = 0; count < vdcp->local_dring_backup_len; count++) {

        ldep = &(vdcp->local_dring_backup[b_idx]);

        /* only cancel outstanding transactions */
        if (!ldep->is_free) {

            DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx);

            /*
             * All requests have already been cleared from the
             * local descriptor ring and the LDC channel has been
             * reset so we will never get any reply for these
             * requests. Now we just have to notify threads waiting
             * for replies that the request has failed.
             */
            switch (ldep->cb_type) {
            case CB_SYNC:
                ASSERT(vdcp->sync_op_pending);
                vdcp->sync_op_status = EIO;
                vdcp->sync_op_pending = B_FALSE;
                cv_signal(&vdcp->sync_pending_cv);
                break;

            case CB_STRATEGY:
                bufp = ldep->cb_arg;
                ASSERT(bufp != NULL);
                bufp->b_resid = bufp->b_bcount;
                bioerror(bufp, EIO);
                biodone(bufp);
                break;

            default:
                ASSERT(0);
            }

        }

        /* get the next element to cancel */
        if (++b_idx >= vdcp->local_dring_backup_len)
            b_idx = 0;
    }

    /* all done - now clear up pending dring copy */
    dring_size = vdcp->local_dring_backup_len *
        sizeof (vdcp->local_dring_backup[0]);

    kmem_free(vdcp->local_dring_backup, dring_size);

    vdcp->local_dring_backup = NULL;

    DTRACE_IO2(processed, int, count, vdc_t *, vdcp);
}

/*
 * Function:
 *    vdc_connection_timeout
 *
 * Description:
 *    This function is invoked if the timeout set to establish the
 *    connection with vds expires. This will happen if we spend too much
 *    time in the VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states.
 *    We will then cancel any pending requests and mark them as failed.
 *
 *    If the timeout does not expire, it will be cancelled when we reach
 *    the VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This
 *    function can be invoked while we are in the VDC_STATE_HANDLE_PENDING
 *    or VDC_STATE_RESETTING state, in which case we do nothing because
 *    the timeout is being cancelled.
 *
 * Arguments:
 *    arg    - argument of the timeout function, actually a soft state
 *          pointer for the instance of the device driver.
 *
 * Return Code:
 *    None
 */
void
vdc_connection_timeout(void *arg)
{
    vdc_t    *vdcp = (vdc_t *)arg;

    mutex_enter(&vdcp->lock);

    if (vdcp->state == VDC_STATE_HANDLE_PENDING ||
        vdcp->state == VDC_STATE_DETACH) {
        /*
         * The connection has just been re-established or
         * we are detaching.
         */
        vdcp->ctimeout_reached = B_FALSE;
        mutex_exit(&vdcp->lock);
        return;
    }

    vdcp->ctimeout_reached = B_TRUE;

    /* notify requests waiting for sending */
    cv_broadcast(&vdcp->running_cv);

    /* cancel requests waiting for a result */
    vdc_cancel_backup_ring(vdcp);

    mutex_exit(&vdcp->lock);

    cmn_err(CE_NOTE, "[%d] connection to service domain timeout",
        vdcp->instance);
}
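/*
 * The timeout above is armed and disarmed by the message processing
 * thread using the standard timeout(9F)/untimeout(9F) pairing; a
 * condensed sketch of how it is used there ('ctimeout' is in seconds):
 *
 *    tmid = timeout(vdc_connection_timeout, vdcp,
 *        ctimeout * drv_usectohz(1000000));
 *    ...
 *    (void) untimeout(tmid);
 *    tmid = 0;
 *
 * Note that untimeout() must not be called while holding a lock that
 * the timeout handler itself acquires, which is why the thread below
 * drops vdcp->lock before cancelling.
 */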
/*
 * Function:
 *    vdc_backup_local_dring()
 *
 * Description:
 *    Backup the current dring in the event of a reset. The dring
 *    transactions will be resubmitted to the server when the
 *    connection is restored.
 *
 * Arguments:
 *    vdcp    - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *    None
 */
static void
vdc_backup_local_dring(vdc_t *vdcp)
{
    int    dring_size;

    ASSERT(MUTEX_HELD(&vdcp->lock));
    ASSERT(vdcp->state == VDC_STATE_RESETTING);

    /*
     * If the backup dring is still around, it means
     * that the last restore did not complete. However,
     * since we never got back into the running state,
     * the backup copy we have is still valid.
     */
    if (vdcp->local_dring_backup != NULL) {
        DMSG(vdcp, 1, "reusing local descriptor ring backup "
            "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len,
            vdcp->local_dring_backup_tail);
        return;
    }

    /*
     * The backup dring can be NULL and the local dring may not be
     * initialized. This can happen if we had a reset while establishing
     * a new connection but after the connection has timed out. In that
     * case the backup dring is NULL because the requests have been
     * cancelled and the reset occurred before the local dring was
     * initialized.
     */
    if (!(vdcp->initialized & VDC_DRING_LOCAL))
        return;

    DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, "
        "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx);

    dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]);

    vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP);
    bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size);

    vdcp->local_dring_backup_tail = vdcp->dring_curr_idx;
    vdcp->local_dring_backup_len = vdcp->dring_len;
}

/* -------------------------------------------------------------------------- */

/*
 * The following functions process the incoming messages from vds
 */
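/*
 * The message processing thread below is a small state machine. An
 * overview of the transitions it drives (DETACH can be entered from
 * any state and terminates the thread):
 *
 *    INIT -----------> INIT_WAITING     bring up LDC channel
 *    INIT_WAITING ---> NEGOTIATE        channel is LDC_UP
 *    NEGOTIATE ------> HANDLE_PENDING   handshake completed
 *    NEGOTIATE ------> RESETTING        handshake failed
 *    HANDLE_PENDING -> RUNNING          backed-up I/O resubmitted
 *    HANDLE_PENDING -> RESETTING        resubmission failed
 *    RUNNING --------> RESETTING        channel error / reset
 *    RESETTING ------> INIT             state saved, ring destroyed
 */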
/*
 * Function:
 *    vdc_process_msg_thread()
 *
 * Description:
 *    Main VDC message processing thread. Each vDisk instance runs a
 *    copy of this thread. This thread triggers all the handshakes and
 *    data exchange with the server. It also handles all channel resets.
 *
 * Arguments:
 *    vdc    - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *    None
 */
static void
vdc_process_msg_thread(vdc_t *vdcp)
{
    int        status;
    int        ctimeout;
    timeout_id_t    tmid = 0;

    mutex_enter(&vdcp->lock);

    for (;;) {

#define    Q(_s)    (vdcp->state == _s) ? #_s :
        DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state,
            Q(VDC_STATE_INIT)
            Q(VDC_STATE_INIT_WAITING)
            Q(VDC_STATE_NEGOTIATE)
            Q(VDC_STATE_HANDLE_PENDING)
            Q(VDC_STATE_RUNNING)
            Q(VDC_STATE_RESETTING)
            Q(VDC_STATE_DETACH)
            "UNKNOWN");
#undef Q

        switch (vdcp->state) {
        case VDC_STATE_INIT:

            /*
             * If requested, start a timeout to check if the
             * connection with vds is established in the
             * specified delay. If the timeout expires, we
             * will cancel any pending request.
             *
             * If a reset has occurred while establishing
             * the connection, we already have a timeout armed
             * and in that case we don't need to arm a new one.
             */
            ctimeout = (vdc_timeout != 0)?
                vdc_timeout : vdcp->ctimeout;

            if (ctimeout != 0 && tmid == 0) {
                tmid = timeout(vdc_connection_timeout, vdcp,
                    ctimeout * drv_usectohz(1000000));
            }

            /* Check if we have been re-initializing repeatedly */
            if (vdcp->hshake_cnt++ > vdc_hshake_retries &&
                vdcp->lifecycle != VDC_LC_ONLINE) {
                cmn_err(CE_NOTE, "[%d] disk access failed.\n",
                    vdcp->instance);
                vdcp->state = VDC_STATE_DETACH;
                break;
            }

            /* Bring up connection with vds via LDC */
            status = vdc_start_ldc_connection(vdcp);
            if (status == EINVAL) {
                DMSG(vdcp, 0, "[%d] Could not start LDC",
                    vdcp->instance);
                vdcp->state = VDC_STATE_DETACH;
            } else {
                vdcp->state = VDC_STATE_INIT_WAITING;
            }
            break;

        case VDC_STATE_INIT_WAITING:

            /*
             * Let the callback event move us on
             * when the channel is open to the server
             */
            while (vdcp->ldc_state != LDC_UP) {
                cv_wait(&vdcp->initwait_cv, &vdcp->lock);
                if (vdcp->state != VDC_STATE_INIT_WAITING) {
                    DMSG(vdcp, 0,
                        "state moved to %d out from under us...\n",
                        vdcp->state);

                    break;
                }
            }
            if (vdcp->state == VDC_STATE_INIT_WAITING &&
                vdcp->ldc_state == LDC_UP) {
                vdcp->state = VDC_STATE_NEGOTIATE;
            }
            break;

        case VDC_STATE_NEGOTIATE:
            switch (status = vdc_ver_negotiation(vdcp)) {
            case 0:
                break;
            default:
                DMSG(vdcp, 0, "ver negotiate failed (%d)..\n",
                    status);
                goto reset;
            }

            switch (status = vdc_attr_negotiation(vdcp)) {
            case 0:
                break;
            default:
                DMSG(vdcp, 0, "attr negotiate failed (%d)..\n",
                    status);
                goto reset;
            }

            switch (status = vdc_dring_negotiation(vdcp)) {
            case 0:
                break;
            default:
                DMSG(vdcp, 0, "dring negotiate failed (%d)..\n",
                    status);
                goto reset;
            }

            switch (status = vdc_rdx_exchange(vdcp)) {
            case 0:
                vdcp->state = VDC_STATE_HANDLE_PENDING;
                goto done;
            default:
                DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n",
                    status);
                goto reset;
            }
reset:
            DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n",
                status);
            vdcp->state = VDC_STATE_RESETTING;
            vdcp->self_reset = B_TRUE;
done:
            DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n",
                vdcp->state);
            break;
        case VDC_STATE_HANDLE_PENDING:

            if (vdcp->ctimeout_reached) {
                /*
                 * The connection timeout had been reached so
                 * pending requests have been cancelled. Now
                 * that the connection is back we can reset
                 * the timeout.
                 */
                ASSERT(vdcp->local_dring_backup == NULL);
                ASSERT(tmid != 0);
                tmid = 0;
                vdcp->ctimeout_reached = B_FALSE;
                vdcp->state = VDC_STATE_RUNNING;
                DMSG(vdcp, 0, "[%d] connection to service "
                    "domain is up", vdcp->instance);
                break;
            }

            mutex_exit(&vdcp->lock);
            if (tmid != 0) {
                (void) untimeout(tmid);
                tmid = 0;
            }
            status = vdc_resubmit_backup_dring(vdcp);
            mutex_enter(&vdcp->lock);

            if (status)
                vdcp->state = VDC_STATE_RESETTING;
            else
                vdcp->state = VDC_STATE_RUNNING;

            break;

        /* enter running state */
        case VDC_STATE_RUNNING:
            /*
             * Signal anyone waiting for the connection
             * to come on line.
             */
            vdcp->hshake_cnt = 0;
            cv_broadcast(&vdcp->running_cv);

            /* failfast has to be checked after a reset */
            cv_signal(&vdcp->failfast_cv);

            /* ownership is lost during reset */
            if (vdcp->ownership & VDC_OWNERSHIP_WANTED)
                vdcp->ownership |= VDC_OWNERSHIP_RESET;
            cv_signal(&vdcp->ownership_cv);

            mutex_exit(&vdcp->lock);

            for (;;) {
                vio_msg_t msg;

                status = vdc_wait_for_response(vdcp, &msg);
                if (status)
                    break;

                DMSG(vdcp, 1, "[%d] new pkt(s) available\n",
                    vdcp->instance);
                status = vdc_process_data_msg(vdcp, &msg);
                if (status) {
                    DMSG(vdcp, 1, "[%d] process_data_msg "
                        "returned err=%d\n", vdcp->instance,
                        status);
                    break;
                }

            }

            mutex_enter(&vdcp->lock);

            vdcp->state = VDC_STATE_RESETTING;
            vdcp->self_reset = B_TRUE;
            break;

        case VDC_STATE_RESETTING:
            /*
             * When we reach this state, we either come from the
             * VDC_STATE_RUNNING state and we can have pending
             * requests but no timeout is armed; or we come from
             * the VDC_STATE_INIT_WAITING, VDC_NEGOTIATE or
             * VDC_HANDLE_PENDING state and there are no pending
             * requests, or pending requests have already been
             * copied into the backup dring. So we can safely
             * keep the connection timeout armed while we are in
             * this state.
             */

            DMSG(vdcp, 0, "Initiating channel reset "
                "(pending = %d)\n", (int)vdcp->threads_pending);

            if (vdcp->self_reset) {
                DMSG(vdcp, 0,
                    "[%d] calling stop_ldc_connection.\n",
                    vdcp->instance);
                status = vdc_stop_ldc_connection(vdcp);
                vdcp->self_reset = B_FALSE;
            }
            /*
             * Wait for all threads currently waiting
             * for a free dring entry.
             */
            while (vdcp->threads_pending) {
                cv_broadcast(&vdcp->membind_cv);
                cv_broadcast(&vdcp->dring_free_cv);
                mutex_exit(&vdcp->lock);
                /* give the waiters enough time to wake up */
                delay(vdc_hz_min_ldc_delay);
                mutex_enter(&vdcp->lock);
            }

            ASSERT(vdcp->threads_pending == 0);

            /* Sanity check that no thread is receiving */
            ASSERT(vdcp->read_state != VDC_READ_WAITING);

            vdcp->read_state = VDC_READ_IDLE;

            vdc_backup_local_dring(vdcp);

            /* cleanup the old d-ring */
            vdc_destroy_descriptor_ring(vdcp);

            /* go and start again */
            vdcp->state = VDC_STATE_INIT;

            break;

        case VDC_STATE_DETACH:
            DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n",
                vdcp->instance);

            /* cancel any pending timeout */
            mutex_exit(&vdcp->lock);
            if (tmid != 0) {
                (void) untimeout(tmid);
                tmid = 0;
            }
            mutex_enter(&vdcp->lock);

            /*
             * Signal anyone waiting for connection
             * to come online
             */
            cv_broadcast(&vdcp->running_cv);

            while (vdcp->sync_op_pending) {
                cv_signal(&vdcp->sync_pending_cv);
                cv_signal(&vdcp->sync_blocked_cv);
                mutex_exit(&vdcp->lock);
                /* give the waiters enough time to wake up */
                delay(vdc_hz_min_ldc_delay);
                mutex_enter(&vdcp->lock);
            }

            mutex_exit(&vdcp->lock);

            DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n",
                vdcp->instance);
            thread_exit();
            break;
        }
    }
}


/*
 * Function:
 *    vdc_process_data_msg()
 *
 * Description:
 *    This function is called by the message processing thread each time
 *    a message with a msgtype of VIO_TYPE_DATA is received. It will
 *    either be an ACK or NACK from vds[1] which vdc handles as follows.
 *        ACK - wake up the waiting thread
 *        NACK - resend any messages necessary
 *
 *    [1] Although the message format allows it, vds should not send a
 *        VIO_SUBTYPE_INFO message to vdc asking it to read data; if for
 *        some bizarre reason it does, vdc will reset the connection.
 *
 * Arguments:
 *    vdc    - soft state pointer for this instance of the device driver.
 *    msg    - the LDC message sent by vds
 *
 * Return Code:
 *    0    - Success.
 *    > 0    - error value (EINVAL, EIO, EPROTO or ENXIO)
 */
static int
vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg)
{
    int        status = 0;
    vio_dring_msg_t    *dring_msg;
    vdc_local_desc_t    *ldep = NULL;
    int        start, end;
    int        idx;

    dring_msg = (vio_dring_msg_t *)msg;

    ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA);
    ASSERT(vdcp != NULL);

    mutex_enter(&vdcp->lock);

    /*
     * Check to see if the message has bogus data
     */
    idx = start = dring_msg->start_idx;
    end = dring_msg->end_idx;
    if ((start >= vdcp->dring_len) ||
        (end >= vdcp->dring_len) || (end < -1)) {
        DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n",
            vdcp->instance, start, end);
        mutex_exit(&vdcp->lock);
        return (EINVAL);
    }
4159 */
4160 switch (vdc_verify_seq_num(vdcp, dring_msg)) {
4161 case VDC_SEQ_NUM_TODO:
4162 break; /* keep processing this message */
4163 case VDC_SEQ_NUM_SKIP:
4164 mutex_exit(&vdcp->lock);
4165 return (0);
4166 case VDC_SEQ_NUM_INVALID:
4167 mutex_exit(&vdcp->lock);
4168 DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance);
4169 return (ENXIO);
4170 }
4171
4172 if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) {
4173 DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance);
4174 VDC_DUMP_DRING_MSG(dring_msg);
4175 mutex_exit(&vdcp->lock);
4176 return (EIO);
4177
4178 } else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) {
4179 mutex_exit(&vdcp->lock);
4180 return (EPROTO);
4181 }
4182
4183 DTRACE_IO2(recv, vio_dring_msg_t, dring_msg, vdc_t *, vdcp);
4184 DMSG(vdcp, 1, ": start %d end %d\n", start, end);
4185 ASSERT(start == end);
4186
4187 ldep = &vdcp->local_dring[idx];
4188
4189 DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n",
4190 ldep->dep->hdr.dstate, ldep->cb_type);
4191
4192 if (ldep->dep->hdr.dstate == VIO_DESC_DONE) {
4193 struct buf *bufp;
4194
4195 switch (ldep->cb_type) {
4196 case CB_SYNC:
4197 ASSERT(vdcp->sync_op_pending);
4198
4199 status = vdc_depopulate_descriptor(vdcp, idx);
4200 vdcp->sync_op_status = status;
4201 vdcp->sync_op_pending = B_FALSE;
4202 cv_signal(&vdcp->sync_pending_cv);
4203 break;
4204
4205 case CB_STRATEGY:
4206 bufp = ldep->cb_arg;
4207 ASSERT(bufp != NULL);
4208 bufp->b_resid =
4209 bufp->b_bcount - ldep->dep->payload.nbytes;
4210 status = ldep->dep->payload.status; /* Future:ntoh */
4211 if (status != 0) {
4212 DMSG(vdcp, 1, "strategy status=%d\n", status);
4213 bioerror(bufp, status);
4214 }
4215
4216 (void) vdc_depopulate_descriptor(vdcp, idx);
4217
4218 DMSG(vdcp, 1,
4219 "strategy complete req=%ld bytes resp=%ld bytes\n",
4220 bufp->b_bcount, ldep->dep->payload.nbytes);
4221
4222 if (status != 0 && vdcp->failfast_interval != 0) {
4223 /*
4224 * The I/O has failed and failfast is enabled.
4225 * We need the failfast thread to check if the
4226 * failure is due to a reservation conflict.
4227 */
4228 (void) vdc_failfast_io_queue(vdcp, bufp);
4229 } else {
4230 biodone(bufp);
4231 }
4232 break;
4233
4234 default:
4235 ASSERT(0);
4236 }
4237 }
4238
4239 /* let the arrival signal propagate */
4240 mutex_exit(&vdcp->lock);
4241
4242 /* probe gives the count of how many entries were processed */
4243 DTRACE_IO2(processed, int, 1, vdc_t *, vdcp);
4244
4245 return (0);
4246 }
4247
4248
4249 /*
4250 * Function:
4251 * vdc_handle_ver_msg()
4252 *
4253 * Description:
4254 * Handle a version negotiation message (VIO_VER_INFO) from vds.
4255 * Arguments:
4256 * vdc - soft state pointer for this instance of the device driver.
4257 * ver_msg - LDC message sent by vDisk server
4258 *
4259 * Return Code:
4260 * 0 - Success
4261 */
4262 static int
4263 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg)
4264 {
4265 int status = 0;
4266
4267 ASSERT(vdc != NULL);
4268 ASSERT(mutex_owned(&vdc->lock));
4269
4270 if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) {
4271 return (EPROTO);
4272 }
4273
4274 if (ver_msg->dev_class != VDEV_DISK_SERVER) {
4275 return (EINVAL);
4276 }
4277
4278 switch (ver_msg->tag.vio_subtype) {
4279 case VIO_SUBTYPE_ACK:
4280 /*
4281 * We check that the version returned is indeed supported
4282 * (the server may have adjusted the minor number downwards,
4283 * in which case 'ver_msg' contains the actual version agreed).
4284 */
4285 if (vdc_is_supported_version(ver_msg)) {
4286 vdc->ver.major = ver_msg->ver_major;
4287 vdc->ver.minor = ver_msg->ver_minor;
4288 ASSERT(vdc->ver.major > 0);
4289 } else {
4290 status = EPROTO;
4291 }
4292 break;
4293
4294 case VIO_SUBTYPE_NACK:
4295 /*
4296 * Call vdc_is_supported_version() which will return the next
4297 * supported version (if any) in 'ver_msg'.
4298 */
4299 (void) vdc_is_supported_version(ver_msg);
4300 if (ver_msg->ver_major > 0) {
4301 size_t len = sizeof (*ver_msg);
4302
4303 ASSERT(vdc->ver.major > 0);
4304
4305 /* reset the necessary fields and resend */
4306 ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO;
4307 ver_msg->dev_class = VDEV_DISK;
4308
4309 status = vdc_send(vdc, (caddr_t)ver_msg, &len);
4310 DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n",
4311 vdc->instance, status);
4312 if (len != sizeof (*ver_msg))
4313 status = EBADMSG;
4314 } else {
4315 DMSG(vdc, 0, "[%d] No common version with vDisk server",
4316 vdc->instance);
4317 status = ENOTSUP;
4318 }
4319
4320 break;
4321 case VIO_SUBTYPE_INFO:
4322 /*
4323 * Handle the case where vds starts the handshake
4324 * (for now only vdc is the instigator)
4325 */
4326 status = ENOTSUP;
4327 break;
4328
4329 default:
4330 status = EINVAL;
4331 break;
4332 }
4333
4334 return (status);
4335 }
4336
4337 /*
4338 * Function:
4339 * vdc_handle_attr_msg()
4340 *
4341 * Description:
4342 * Handle an attribute negotiation message (VIO_ATTR_INFO) from vds.
4343 * Arguments:
4344 * vdc - soft state pointer for this instance of the device driver.
4345 * attr_msg - LDC message sent by vDisk server
4346 *
4347 * Return Code:
4348 * 0 - Success
4349 */
4350 static int
4351 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg)
4352 {
4353 int status = 0;
4354
4355 ASSERT(vdc != NULL);
4356 ASSERT(mutex_owned(&vdc->lock));
4357
4358 if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) {
4359 return (EPROTO);
4360 }
4361
4362 switch (attr_msg->tag.vio_subtype) {
4363 case VIO_SUBTYPE_ACK:
4364 /*
4365 * We now verify the attributes sent by vds.
4366 */
4367 if (attr_msg->vdisk_size == 0) {
4368 DMSG(vdc, 0, "[%d] Invalid disk size from vds",
4369 vdc->instance);
4370 status = EINVAL;
4371 break;
4372 }
4373
4374 if (attr_msg->max_xfer_sz == 0) {
4375 DMSG(vdc, 0, "[%d] Invalid transfer size from vds",
4376 vdc->instance);
4377 status = EINVAL;
4378 break;
4379 }
4380
4381 if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) {
4382 DMSG(vdc, 0, "[%d] Unknown disk size from vds",
4383 vdc->instance);
4384 attr_msg->vdisk_size = 0;
4385 }
4386
4387 /*
4388 * If the disk size is already set, check that it hasn't changed.
4389 */
4390 if ((vdc->vdisk_size != 0) && (attr_msg->vdisk_size != 0) &&
4391 (vdc->vdisk_size != attr_msg->vdisk_size)) {
4392 DMSG(vdc, 0, "[%d] Different disk size from vds "
4393 "(old=0x%lx - new=0x%lx)", vdc->instance,
4394 vdc->vdisk_size, attr_msg->vdisk_size);
4395 status = EINVAL;
4396 break;
4397 }
4398
4399 vdc->vdisk_size = attr_msg->vdisk_size;
4400 vdc->vdisk_type = attr_msg->vdisk_type;
4401 vdc->operations = attr_msg->operations;
4402 if (vio_ver_is_supported(vdc->ver, 1, 1))
4403 vdc->vdisk_media = attr_msg->vdisk_media;
4404 else
4405 vdc->vdisk_media = 0;
4406
4407 DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n",
4408 vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz);
4409 DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n",
4410 vdc->instance, vdc->block_size,
4411 attr_msg->vdisk_block_size);
4412
4413 /*
4414 * We don't know at compile time what the vDisk server will
4415 * think are good values but we apply a large (arbitrary)
4416 * upper bound to prevent memory exhaustion in vdc if it were
4417 * to allocate a DRing based on huge values sent by the server.
4418 * We will probably never exceed this unless the message
4419 * was garbage.
4420 */
4421 if ((attr_msg->max_xfer_sz * attr_msg->vdisk_block_size) <=
4422 (PAGESIZE * DEV_BSIZE)) {
4423 vdc->max_xfer_sz = attr_msg->max_xfer_sz;
4424 vdc->block_size = attr_msg->vdisk_block_size;
4425 } else {
4426 DMSG(vdc, 0, "[%d] vds block transfer size too big;"
4427 " using max supported by vdc", vdc->instance);
4428 }
4429
4430 if ((attr_msg->xfer_mode != VIO_DRING_MODE) ||
4431 (attr_msg->vdisk_size > INT64_MAX) ||
4432 (attr_msg->operations == 0) ||
4433 (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) {
4434 DMSG(vdc, 0, "[%d] Invalid attributes from vds",
4435 vdc->instance);
4436 status = EINVAL;
4437 break;
4438 }
4439
4440 /*
4441 * Now that we have received all attributes we can create a
4442 * fake geometry for the disk.
4443 */
4444 vdc_create_fake_geometry(vdc);
4445 break;
4446
4447 case VIO_SUBTYPE_NACK:
4448 /*
4449 * vds could not handle the attributes we sent so we
4450 * stop negotiating.
4451 */
4452 status = EPROTO;
4453 break;
4454
4455 case VIO_SUBTYPE_INFO:
4456 /*
4457 * Handle the case where vds starts the handshake
4458 * (for now, vdc is the only supported instigator)
4459 */
4460 status = ENOTSUP;
4461 break;
4462
4463 default:
4464 status = ENOTSUP;
4465 break;
4466 }
4467
4468 return (status);
4469 }
4470
4471 /*
4472 * Function:
4473 * vdc_handle_dring_reg_msg()
4474 *
4475 * Description:
4476 * Handle a descriptor ring registration message (VIO_DRING_REG) from vds.
4477 * Arguments:
4478 * vdc - soft state pointer for this instance of the driver.
4479 * dring_msg - LDC message sent by vDisk server
4480 *
4481 * Return Code:
4482 * 0 - Success
4483 */
4484 static int
4485 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg)
4486 {
4487 int status = 0;
4488
4489 ASSERT(vdc != NULL);
4490 ASSERT(mutex_owned(&vdc->lock));
4491
4492 if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) {
4493 return (EPROTO);
4494 }
4495
4496 switch (dring_msg->tag.vio_subtype) {
4497 case VIO_SUBTYPE_ACK:
4498 /* save the received dring_ident */
4499 vdc->dring_ident = dring_msg->dring_ident;
4500 DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n",
4501 vdc->instance, vdc->dring_ident);
4502 break;
4503
4504 case VIO_SUBTYPE_NACK:
4505 /*
4506 * vds could not handle the DRing info we sent so we
4507 * stop negotiating.
4508 */
4509 DMSG(vdc, 0, "[%d] server could not register DRing\n",
4510 vdc->instance);
4511 status = EPROTO;
4512 break;
4513
4514 case VIO_SUBTYPE_INFO:
4515 /*
4516 * Handle the case where vds starts the handshake
4517 * (for now only vdc is the instigator)
4518 */
4519 status = ENOTSUP;
4520 break;
4521 default:
4522 status = ENOTSUP;
4523 }
4524
4525 return (status);
4526 }
4527
4528 /*
4529 * Function:
4530 * vdc_verify_seq_num()
4531 *
4532 * Description:
4533 * This function verifies that the sequence number sent back by the vDisk
4534 * server with the latest message is what is expected (i.e. it is greater
4535 * than the last seq num received from the vDisk server and less than or
4536 * equal to the last seq num generated by vdc).
4537 *
4538 * It then checks the request ID to see if any requests need processing
4539 * in the DRing.
4540 *
4541 * Arguments:
4542 * vdc - soft state pointer for this instance of the driver.
4543 * dring_msg - pointer to the LDC message sent by vds
4544 *
4545 * Return Code:
4546 * VDC_SEQ_NUM_TODO - Message needs to be processed
4547 * VDC_SEQ_NUM_SKIP - Message has already been processed
4548 * VDC_SEQ_NUM_INVALID - The seq numbers are so far out of sync that
4549 * vdc cannot deal with them
4550 */
4551 static int
4552 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg)
4553 {
4554 ASSERT(vdc != NULL);
4555 ASSERT(dring_msg != NULL);
4556 ASSERT(mutex_owned(&vdc->lock));
4557
4558 /*
4559 * Check to see if the messages were responded to in the correct
4560 * order by vds.
4561 */
4562 if ((dring_msg->seq_num <= vdc->seq_num_reply) ||
4563 (dring_msg->seq_num > vdc->seq_num)) {
4564 DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: "
4565 "%lu > expected <= %lu (last proc req %lu sent %lu)\n",
4566 vdc->instance, dring_msg->seq_num,
4567 vdc->seq_num_reply, vdc->seq_num,
4568 vdc->req_id_proc, vdc->req_id);
4569 return (VDC_SEQ_NUM_INVALID);
4570 }
4571 vdc->seq_num_reply = dring_msg->seq_num;
4572
4573 if (vdc->req_id_proc < vdc->req_id)
4574 return (VDC_SEQ_NUM_TODO);
4575 else
4576 return (VDC_SEQ_NUM_SKIP);
4577 }
4578
4579
4580 /*
4581 * Function:
4582 * vdc_is_supported_version()
4583 *
4584 * Description:
4585 * This routine checks if the major/minor version numbers specified in
4586 * 'ver_msg' are supported. If not, it finds the next version that is
4587 * in the supported version list 'vdc_version[]' and sets the fields in
4588 * 'ver_msg' to those values
4589 *
4590 * Arguments:
4591 * ver_msg - LDC message sent by vDisk server
4592 *
4593 * Return Code:
4594 * B_TRUE - Success
4595 * B_FALSE - Version not supported
4596 */
4597 static boolean_t
4598 vdc_is_supported_version(vio_ver_msg_t *ver_msg)
4599 {
4600 int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]);
4601
4602 for (int i = 0; i < vdc_num_versions; i++) {
4603 ASSERT(vdc_version[i].major > 0);
4604 ASSERT((i == 0) ||
4605 (vdc_version[i].major < vdc_version[i-1].major));
4606
4607 /*
4608 * If the major versions match, adjust the minor version, if
4609 * necessary, down to the highest value supported by this
4610 * client. The server should support all minor versions lower
4611 * than the value it sent
4612 */
4613 if (ver_msg->ver_major == vdc_version[i].major) {
4614 if (ver_msg->ver_minor > vdc_version[i].minor) {
4615 DMSGX(0,
4616 "Adjusting minor version from %u to %u",
4617 ver_msg->ver_minor, vdc_version[i].minor);
4618 ver_msg->ver_minor = vdc_version[i].minor;
4619 }
4620 return (B_TRUE);
4621 }
4622
4623 /*
4624 * If the message contains a higher major version number, set
4625 * the message's major/minor versions to the current values
4626 * and return false, so this message will get resent with
4627 * these values, and the server will potentially try again
4628 * with the same or a lower version
4629 */
4630 if (ver_msg->ver_major > vdc_version[i].major) {
4631 ver_msg->ver_major = vdc_version[i].major;
4632 ver_msg->ver_minor = vdc_version[i].minor;
4633 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n",
4634 ver_msg->ver_major, ver_msg->ver_minor);
4635
4636 return (B_FALSE);
4637 }
4638
4639 /*
4640 * Otherwise, the message's major version is less than the
4641 * current major version, so continue the loop to the next
4642 * (lower) supported version
4643 */
4644 }
4645
4646 /*
4647 * No common version was found; "ground" the version pair in the
4648 * message to terminate negotiation
4649 */
4650 ver_msg->ver_major = 0;
4651 ver_msg->ver_minor = 0;
4652
4653 return (B_FALSE);
4654 }
4655 /* -------------------------------------------------------------------------- */
4656
4657 /*
4658 * DKIO(7I) support
4659 */
4660
4661 typedef struct vdc_dk_arg {
4662 struct dk_callback dkc;
4663 int mode;
4664 dev_t dev;
4665 vdc_t *vdc;
4666 } vdc_dk_arg_t;
4667
4668 /*
4669 * Function:
4670 * vdc_dkio_flush_cb()
4671 *
4672 * Description:
4673 * This routine is a callback for DKIOCFLUSHWRITECACHE which can be called
4674 * by kernel code.
4675 *
4676 * Arguments:
4677 * arg - a pointer to a vdc_dk_arg_t structure.
4678 */
4679 void
4680 vdc_dkio_flush_cb(void *arg)
4681 {
4682 struct vdc_dk_arg *dk_arg = (struct vdc_dk_arg *)arg;
4683 struct dk_callback *dkc = NULL;
4684 vdc_t *vdc = NULL;
4685 int rv;
4686
4687 if (dk_arg == NULL) {
4688 cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n");
4689 return;
4690 }
4691 dkc = &dk_arg->dkc;
4692 vdc = dk_arg->vdc;
4693 ASSERT(vdc != NULL);
4694
4695 rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0,
4696 VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);
4697 if (rv != 0) {
4698 DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n",
4699 vdc->instance, rv,
4700 ddi_model_convert_from(dk_arg->mode & FMODELS));
4701 }
4702
4703 /*
4704 * Trigger the callback to notify the caller that the ioctl call has
4705 * completed.
4706 */
4707 if ((dk_arg->mode & FKIOCTL) &&
4708 (dkc != NULL) &&
4709 (dkc->dkc_callback != NULL)) {
4710 ASSERT(dkc->dkc_cookie != NULL);
4711 (*dkc->dkc_callback)(dkc->dkc_cookie, rv);
4712 }
4713
4714 /* Indicate that one less DKIO write flush is outstanding */
4715 mutex_enter(&vdc->lock);
4716 vdc->dkio_flush_pending--;
4717 ASSERT(vdc->dkio_flush_pending >= 0);
4718 mutex_exit(&vdc->lock);
4719
4720 /* free the mem that was allocated when the callback was dispatched */
4721 kmem_free(arg, sizeof (vdc_dk_arg_t));
4722 }
4723
4724 /*
4725 * Function:
4726 * vdc_dkio_get_partition()
4727 *
4728 * Description:
4729 * This function implements the DKIOCGAPART ioctl.
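 * A minimal sketch of a hypothetical user-level caller (the
 * descriptor 'fd' and the use made of the result are assumptions;
 * error handling omitted). DKIOCGAPART fills in a dk_allmap with the
 * partition map that this function builds from the VTOC:
 *
 *	struct dk_allmap map;
 *
 *	if (ioctl(fd, DKIOCGAPART, &map) == 0)
 *		nblks = map.dka_map[0].dkl_nblk;
 *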
4730 * 4731 * Arguments: 4732 * vdc - soft state pointer 4733 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 4734 * flag - ioctl flags 4735 */ 4736 static int 4737 vdc_dkio_get_partition(vdc_t *vdc, caddr_t arg, int flag) 4738 { 4739 struct dk_geom *geom; 4740 struct vtoc *vtoc; 4741 union { 4742 struct dk_map map[NDKMAP]; 4743 struct dk_map32 map32[NDKMAP]; 4744 } data; 4745 int i, rv, size; 4746 4747 mutex_enter(&vdc->lock); 4748 4749 if ((rv = vdc_validate_geometry(vdc)) != 0) { 4750 mutex_exit(&vdc->lock); 4751 return (rv); 4752 } 4753 4754 vtoc = vdc->vtoc; 4755 geom = vdc->geom; 4756 4757 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 4758 4759 for (i = 0; i < vtoc->v_nparts; i++) { 4760 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 4761 (geom->dkg_nhead * geom->dkg_nsect); 4762 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 4763 } 4764 size = NDKMAP * sizeof (struct dk_map32); 4765 4766 } else { 4767 4768 for (i = 0; i < vtoc->v_nparts; i++) { 4769 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 4770 (geom->dkg_nhead * geom->dkg_nsect); 4771 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 4772 } 4773 size = NDKMAP * sizeof (struct dk_map); 4774 4775 } 4776 4777 mutex_exit(&vdc->lock); 4778 4779 if (ddi_copyout(&data, arg, size, flag) != 0) 4780 return (EFAULT); 4781 4782 return (0); 4783 } 4784 4785 /* 4786 * Function: 4787 * vdc_dioctl_rwcmd() 4788 * 4789 * Description: 4790 * This function implements the DIOCTL_RWCMD ioctl. This ioctl is used 4791 * for DKC_DIRECT disks to read or write at an absolute disk offset. 4792 * 4793 * Arguments: 4794 * dev - device 4795 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 4796 * flag - ioctl flags 4797 */ 4798 static int 4799 vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag) 4800 { 4801 struct dadkio_rwcmd32 rwcmd32; 4802 struct dadkio_rwcmd rwcmd; 4803 struct iovec aiov; 4804 struct uio auio; 4805 int rw, status; 4806 struct buf *buf; 4807 4808 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 4809 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 4810 sizeof (struct dadkio_rwcmd32), flag)) { 4811 return (EFAULT); 4812 } 4813 rwcmd.cmd = rwcmd32.cmd; 4814 rwcmd.flags = rwcmd32.flags; 4815 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 4816 rwcmd.buflen = rwcmd32.buflen; 4817 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 4818 } else { 4819 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 4820 sizeof (struct dadkio_rwcmd), flag)) { 4821 return (EFAULT); 4822 } 4823 } 4824 4825 switch (rwcmd.cmd) { 4826 case DADKIO_RWCMD_READ: 4827 rw = B_READ; 4828 break; 4829 case DADKIO_RWCMD_WRITE: 4830 rw = B_WRITE; 4831 break; 4832 default: 4833 return (EINVAL); 4834 } 4835 4836 bzero((caddr_t)&aiov, sizeof (struct iovec)); 4837 aiov.iov_base = rwcmd.bufaddr; 4838 aiov.iov_len = rwcmd.buflen; 4839 4840 bzero((caddr_t)&auio, sizeof (struct uio)); 4841 auio.uio_iov = &aiov; 4842 auio.uio_iovcnt = 1; 4843 auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE; 4844 auio.uio_resid = rwcmd.buflen; 4845 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 4846 4847 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 4848 bioinit(buf); 4849 /* 4850 * We use the private field of buf to specify that this is an 4851 * I/O using an absolute offset. 
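 * (vdc_strategy() interprets a b_private value of VD_SLICE_NONE as a
 * request addressed by absolute disk offset rather than relative to
 * the start of a slice.)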
4852 */ 4853 buf->b_private = (void *)VD_SLICE_NONE; 4854 4855 status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio); 4856 4857 biofini(buf); 4858 kmem_free(buf, sizeof (buf_t)); 4859 4860 return (status); 4861 } 4862 4863 /* 4864 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated 4865 * buffer is returned in alloc_len. 4866 */ 4867 static vd_scsi_t * 4868 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, 4869 int *alloc_len) 4870 { 4871 vd_scsi_t *vd_scsi; 4872 int vd_scsi_len = VD_SCSI_SIZE; 4873 4874 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); 4875 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); 4876 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); 4877 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); 4878 4879 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 4880 4881 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); 4882 4883 vd_scsi->cdb_len = cdb_len; 4884 vd_scsi->sense_len = sense_len; 4885 vd_scsi->datain_len = datain_len; 4886 vd_scsi->dataout_len = dataout_len; 4887 4888 *alloc_len = vd_scsi_len; 4889 4890 return (vd_scsi); 4891 } 4892 4893 /* 4894 * Convert the status of a SCSI command to a Solaris return code. 4895 * 4896 * Arguments: 4897 * vd_scsi - The SCSI operation buffer. 4898 * log_error - indicate if an error message should be logged. 4899 * 4900 * Note that our SCSI error messages are rather primitive for the moment 4901 * and could be improved by decoding some data like the SCSI command and 4902 * the sense key. 4903 * 4904 * Return value: 4905 * 0 - Status is good. 4906 * EACCES - Status reports a reservation conflict. 4907 * ENOTSUP - Status reports a check condition and sense key 4908 * reports an illegal request. 4909 * EIO - Any other status. 
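 *
 * For example, a CHECK CONDITION status carrying sense key 0x5
 * (ILLEGAL REQUEST) is mapped to ENOTSUP, while a RESERVATION
 * CONFLICT status is mapped to EACCES (and, with failfast enabled,
 * may panic the system for commands other than PERSISTENT
 * RESERVE IN/OUT).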
4910 */
4911 static int
4912 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error)
4913 {
4914 int rv;
4915 char path_str[MAXPATHLEN];
4916 char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN];
4917 union scsi_cdb *cdb;
4918 struct scsi_extended_sense *sense;
4919
4920 if (vd_scsi->cmd_status == STATUS_GOOD)
4921 /* no error */
4922 return (0);
4923
4924 /* when the tunable vdc_scsi_log_error is true we log all errors */
4925 if (vdc_scsi_log_error)
4926 log_error = B_TRUE;
4927
4928 if (log_error) {
4929 cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x\n",
4930 ddi_pathname(vdc->dip, path_str), vdc->instance,
4931 GETCMD(VD_SCSI_DATA_CDB(vd_scsi)));
4932 }
4933
4934 /* default returned value */
4935 rv = EIO;
4936
4937 switch (vd_scsi->cmd_status) {
4938
4939 case STATUS_CHECK:
4940 case STATUS_TERMINATED:
4941 if (log_error)
4942 cmn_err(CE_CONT, "\tCheck Condition Error\n");
4943
4944 /* check sense buffer */
4945 if (vd_scsi->sense_len == 0 ||
4946 vd_scsi->sense_status != STATUS_GOOD) {
4947 if (log_error)
4948 cmn_err(CE_CONT, "\tNo Sense Data Available\n");
4949 break;
4950 }
4951
4952 sense = VD_SCSI_DATA_SENSE(vd_scsi);
4953
4954 if (log_error) {
4955 cmn_err(CE_CONT, "\tSense Key: 0x%x\n"
4956 "\tASC: 0x%x, ASCQ: 0x%x\n",
4957 scsi_sense_key((uint8_t *)sense),
4958 scsi_sense_asc((uint8_t *)sense),
4959 scsi_sense_ascq((uint8_t *)sense));
4960 }
4961
4962 if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST)
4963 rv = ENOTSUP;
4964 break;
4965
4966 case STATUS_BUSY:
4967 if (log_error)
4968 cmn_err(CE_NOTE, "\tDevice Busy\n");
4969 break;
4970
4971 case STATUS_RESERVATION_CONFLICT:
4972 /*
4973 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then the
4974 * reservation conflict could be due to various reasons, like
4975 * incorrect keys, or being not registered or not reserved, so
4976 * we should not panic in that case.
4977 */
4978 cdb = VD_SCSI_DATA_CDB(vd_scsi);
4979 if (vdc->failfast_interval != 0 &&
4980 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN &&
4981 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) {
4982 /* failfast is enabled so we have to panic */
4983 (void) snprintf(panic_str, sizeof (panic_str),
4984 VDC_RESV_CONFLICT_FMT_STR "%s",
4985 ddi_pathname(vdc->dip, path_str));
4986 panic(panic_str);
4987 }
4988 if (log_error)
4989 cmn_err(CE_NOTE, "\tReservation Conflict\n");
4990 rv = EACCES;
4991 break;
4992
4993 case STATUS_QFULL:
4994 if (log_error)
4995 cmn_err(CE_NOTE, "\tQueue Full\n");
4996 break;
4997
4998 case STATUS_MET:
4999 case STATUS_INTERMEDIATE:
5000 case STATUS_SCSI2:
5001 case STATUS_INTERMEDIATE_MET:
5002 case STATUS_ACA_ACTIVE:
5003 if (log_error)
5004 cmn_err(CE_CONT,
5005 "\tUnexpected SCSI status received: 0x%x\n",
5006 vd_scsi->cmd_status);
5007 break;
5008
5009 default:
5010 if (log_error)
5011 cmn_err(CE_CONT,
5012 "\tInvalid SCSI status received: 0x%x\n",
5013 vd_scsi->cmd_status);
5014 break;
5015 }
5016
5017 return (rv);
5018 }
5019
5020 /*
5021 * Implement the USCSICMD uscsi(7I) ioctl. This ioctl is converted to
5022 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI
5023 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is
5024 * converted to a VD_OP_RESET operation.
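 *
 * A minimal sketch of a hypothetical user-level caller issuing a
 * TEST UNIT READY through this path (assumes <sys/scsi/impl/uscsi.h>;
 * descriptor setup and error handling omitted):
 *
 *	struct uscsi_cmd ucmd;
 *	union scsi_cdb cdb;
 *
 *	bzero(&ucmd, sizeof (ucmd));
 *	bzero(&cdb, sizeof (cdb));
 *	cdb.scc_cmd = SCMD_TEST_UNIT_READY;
 *	ucmd.uscsi_cdb = (caddr_t)&cdb;
 *	ucmd.uscsi_cdblen = CDB_GROUP0;
 *	ucmd.uscsi_flags = USCSI_SILENT;
 *	ucmd.uscsi_timeout = 30;
 *	(void) ioctl(fd, USCSICMD, &ucmd);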
5025 */ 5026 static int 5027 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) 5028 { 5029 struct uscsi_cmd uscsi; 5030 struct uscsi_cmd32 uscsi32; 5031 vd_scsi_t *vd_scsi; 5032 int vd_scsi_len; 5033 union scsi_cdb *cdb; 5034 struct scsi_extended_sense *sense; 5035 char *datain, *dataout; 5036 size_t cdb_len, datain_len, dataout_len, sense_len; 5037 int rv; 5038 5039 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5040 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), 5041 mode) != 0) 5042 return (EFAULT); 5043 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); 5044 } else { 5045 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), 5046 mode) != 0) 5047 return (EFAULT); 5048 } 5049 5050 /* a uscsi reset is converted to a VD_OP_RESET operation */ 5051 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | 5052 USCSI_RESET_ALL)) { 5053 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, 5054 (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); 5055 return (rv); 5056 } 5057 5058 /* cdb buffer length */ 5059 cdb_len = uscsi.uscsi_cdblen; 5060 5061 /* data in and out buffers length */ 5062 if (uscsi.uscsi_flags & USCSI_READ) { 5063 datain_len = uscsi.uscsi_buflen; 5064 dataout_len = 0; 5065 } else { 5066 datain_len = 0; 5067 dataout_len = uscsi.uscsi_buflen; 5068 } 5069 5070 /* sense buffer length */ 5071 if (uscsi.uscsi_flags & USCSI_RQENABLE) 5072 sense_len = uscsi.uscsi_rqlen; 5073 else 5074 sense_len = 0; 5075 5076 /* allocate buffer for the VD_SCSICMD_OP operation */ 5077 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5078 &vd_scsi_len); 5079 5080 /* 5081 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, 5082 * but basically they prevent a SCSI command from being retried in case 5083 * of an error. 
5084 */ 5085 if ((uscsi.uscsi_flags & USCSI_ISOLATE) || 5086 (uscsi.uscsi_flags & USCSI_DIAGNOSE)) 5087 vd_scsi->options |= VD_SCSI_OPT_NORETRY; 5088 5089 /* set task attribute */ 5090 if (uscsi.uscsi_flags & USCSI_NOTAG) { 5091 vd_scsi->task_attribute = 0; 5092 } else { 5093 if (uscsi.uscsi_flags & USCSI_HEAD) 5094 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5095 else if (uscsi.uscsi_flags & USCSI_HTAG) 5096 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; 5097 else if (uscsi.uscsi_flags & USCSI_OTAG) 5098 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; 5099 else 5100 vd_scsi->task_attribute = 0; 5101 } 5102 5103 /* set timeout */ 5104 vd_scsi->timeout = uscsi.uscsi_timeout; 5105 5106 /* copy-in cdb data */ 5107 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5108 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { 5109 rv = EFAULT; 5110 goto done; 5111 } 5112 5113 /* keep a pointer to the sense buffer */ 5114 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5115 5116 /* keep a pointer to the data-in buffer */ 5117 datain = (char *)VD_SCSI_DATA_IN(vd_scsi); 5118 5119 /* copy-in request data to the data-out buffer */ 5120 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); 5121 if (!(uscsi.uscsi_flags & USCSI_READ)) { 5122 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, 5123 mode)) { 5124 rv = EFAULT; 5125 goto done; 5126 } 5127 } 5128 5129 /* submit the request */ 5130 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5131 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5132 5133 if (rv != 0) 5134 goto done; 5135 5136 /* update scsi status */ 5137 uscsi.uscsi_status = vd_scsi->cmd_status; 5138 5139 /* update sense data */ 5140 if ((uscsi.uscsi_flags & USCSI_RQENABLE) && 5141 (uscsi.uscsi_status == STATUS_CHECK || 5142 uscsi.uscsi_status == STATUS_TERMINATED)) { 5143 5144 uscsi.uscsi_rqstatus = vd_scsi->sense_status; 5145 5146 if (uscsi.uscsi_rqstatus == STATUS_GOOD) { 5147 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - 5148 vd_scsi->sense_len; 5149 if (ddi_copyout(sense, uscsi.uscsi_rqbuf, 5150 vd_scsi->sense_len, mode) != 0) { 5151 rv = EFAULT; 5152 goto done; 5153 } 5154 } 5155 } 5156 5157 /* update request data */ 5158 if (uscsi.uscsi_status == STATUS_GOOD) { 5159 if (uscsi.uscsi_flags & USCSI_READ) { 5160 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5161 vd_scsi->datain_len; 5162 if (ddi_copyout(datain, uscsi.uscsi_bufaddr, 5163 vd_scsi->datain_len, mode) != 0) { 5164 rv = EFAULT; 5165 goto done; 5166 } 5167 } else { 5168 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5169 vd_scsi->dataout_len; 5170 } 5171 } 5172 5173 /* copy-out result */ 5174 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5175 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); 5176 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), 5177 mode) != 0) { 5178 rv = EFAULT; 5179 goto done; 5180 } 5181 } else { 5182 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), 5183 mode) != 0) { 5184 rv = EFAULT; 5185 goto done; 5186 } 5187 } 5188 5189 /* get the return code from the SCSI command status */ 5190 rv = vdc_scsi_status(vdc, vd_scsi, 5191 !(uscsi.uscsi_flags & USCSI_SILENT)); 5192 5193 done: 5194 kmem_free(vd_scsi, vd_scsi_len); 5195 return (rv); 5196 } 5197 5198 /* 5199 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. 5200 * 5201 * Arguments: 5202 * cmd - SCSI PERSISTENT IN command 5203 * len - length of the SCSI input buffer 5204 * vd_scsi_len - return the length of the allocated buffer 5205 * 5206 * Returned Value: 5207 * a pointer to the allocated VD_OP_SCSICMD buffer. 
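 *
 * For instance, vdc_mhd_inkeys() below calls this with
 * cmd == SD_READ_KEYS and len sized for the requested number of
 * mhioc_resv_key_t entries; the resulting CDB is a 10-byte
 * (CDB_GROUP1) PERSISTENT RESERVE IN with its allocation length set
 * to len.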
5208 */ 5209 static vd_scsi_t * 5210 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len) 5211 { 5212 int cdb_len, sense_len, datain_len, dataout_len; 5213 vd_scsi_t *vd_scsi; 5214 union scsi_cdb *cdb; 5215 5216 cdb_len = CDB_GROUP1; 5217 sense_len = sizeof (struct scsi_extended_sense); 5218 datain_len = len; 5219 dataout_len = 0; 5220 5221 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5222 vd_scsi_len); 5223 5224 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5225 5226 /* set cdb */ 5227 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN; 5228 cdb->cdb_opaque[1] = cmd; 5229 FORMG1COUNT(cdb, datain_len); 5230 5231 vd_scsi->timeout = vdc_scsi_timeout; 5232 5233 return (vd_scsi); 5234 } 5235 5236 /* 5237 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command. 5238 * 5239 * Arguments: 5240 * cmd - SCSI PERSISTENT OUT command 5241 * len - length of the SCSI output buffer 5242 * vd_scsi_len - return the length of the allocated buffer 5243 * 5244 * Returned Code: 5245 * a pointer to the allocated VD_OP_SCSICMD buffer. 5246 */ 5247 static vd_scsi_t * 5248 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len) 5249 { 5250 int cdb_len, sense_len, datain_len, dataout_len; 5251 vd_scsi_t *vd_scsi; 5252 union scsi_cdb *cdb; 5253 5254 cdb_len = CDB_GROUP1; 5255 sense_len = sizeof (struct scsi_extended_sense); 5256 datain_len = 0; 5257 dataout_len = len; 5258 5259 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5260 vd_scsi_len); 5261 5262 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5263 5264 /* set cdb */ 5265 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT; 5266 cdb->cdb_opaque[1] = cmd; 5267 FORMG1COUNT(cdb, dataout_len); 5268 5269 vd_scsi->timeout = vdc_scsi_timeout; 5270 5271 return (vd_scsi); 5272 } 5273 5274 /* 5275 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted 5276 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk 5277 * server with a VD_OP_SCSICMD operation. 
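 *
 * A hypothetical user-level caller might look like this (assumes
 * <sys/mhd.h>; error handling omitted):
 *
 *	mhioc_resv_key_t keys[4];
 *	mhioc_key_list_t klist;
 *	mhioc_inkeys_t inkeys;
 *
 *	klist.listsize = 4;
 *	klist.list = keys;
 *	inkeys.li = &klist;
 *	if (ioctl(fd, MHIOCGRP_INKEYS, &inkeys) == 0)
 *		nkeys = klist.listlen;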
5278 */ 5279 static int 5280 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5281 { 5282 vd_scsi_t *vd_scsi; 5283 mhioc_inkeys_t inkeys; 5284 mhioc_key_list_t klist; 5285 struct mhioc_inkeys32 inkeys32; 5286 struct mhioc_key_list32 klist32; 5287 sd_prin_readkeys_t *scsi_keys; 5288 void *user_keys; 5289 int vd_scsi_len; 5290 int listsize, listlen, rv; 5291 5292 /* copyin arguments */ 5293 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5294 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5295 if (rv != 0) 5296 return (EFAULT); 5297 5298 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5299 sizeof (klist32), mode); 5300 if (rv != 0) 5301 return (EFAULT); 5302 5303 listsize = klist32.listsize; 5304 } else { 5305 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5306 if (rv != 0) 5307 return (EFAULT); 5308 5309 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5310 if (rv != 0) 5311 return (EFAULT); 5312 5313 listsize = klist.listsize; 5314 } 5315 5316 /* build SCSI VD_OP request */ 5317 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5318 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5319 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5320 5321 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5322 5323 /* submit the request */ 5324 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5325 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5326 5327 if (rv != 0) 5328 goto done; 5329 5330 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5331 5332 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5333 inkeys32.generation = scsi_keys->generation; 5334 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5335 if (rv != 0) { 5336 rv = EFAULT; 5337 goto done; 5338 } 5339 5340 klist32.listlen = listlen; 5341 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5342 sizeof (klist32), mode); 5343 if (rv != 0) { 5344 rv = EFAULT; 5345 goto done; 5346 } 5347 5348 user_keys = (caddr_t)(uintptr_t)klist32.list; 5349 } else { 5350 inkeys.generation = scsi_keys->generation; 5351 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5352 if (rv != 0) { 5353 rv = EFAULT; 5354 goto done; 5355 } 5356 5357 klist.listlen = listlen; 5358 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5359 if (rv != 0) { 5360 rv = EFAULT; 5361 goto done; 5362 } 5363 5364 user_keys = klist.list; 5365 } 5366 5367 /* copy out keys */ 5368 if (listlen > 0 && listsize > 0) { 5369 if (listsize < listlen) 5370 listlen = listsize; 5371 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5372 listlen * MHIOC_RESV_KEY_SIZE, mode); 5373 if (rv != 0) 5374 rv = EFAULT; 5375 } 5376 5377 if (rv == 0) 5378 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5379 5380 done: 5381 kmem_free(vd_scsi, vd_scsi_len); 5382 5383 return (rv); 5384 } 5385 5386 /* 5387 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5388 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5389 * the vdisk server with a VD_OP_SCSICMD operation. 
5390 */ 5391 static int 5392 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5393 { 5394 vd_scsi_t *vd_scsi; 5395 mhioc_inresvs_t inresv; 5396 mhioc_resv_desc_list_t rlist; 5397 struct mhioc_inresvs32 inresv32; 5398 struct mhioc_resv_desc_list32 rlist32; 5399 mhioc_resv_desc_t mhd_resv; 5400 sd_prin_readresv_t *scsi_resv; 5401 sd_readresv_desc_t *resv; 5402 mhioc_resv_desc_t *user_resv; 5403 int vd_scsi_len; 5404 int listsize, listlen, i, rv; 5405 5406 /* copyin arguments */ 5407 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5408 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5409 if (rv != 0) 5410 return (EFAULT); 5411 5412 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5413 sizeof (rlist32), mode); 5414 if (rv != 0) 5415 return (EFAULT); 5416 5417 listsize = rlist32.listsize; 5418 } else { 5419 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5420 if (rv != 0) 5421 return (EFAULT); 5422 5423 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5424 if (rv != 0) 5425 return (EFAULT); 5426 5427 listsize = rlist.listsize; 5428 } 5429 5430 /* build SCSI VD_OP request */ 5431 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5432 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5433 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5434 5435 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5436 5437 /* submit the request */ 5438 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5439 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5440 5441 if (rv != 0) 5442 goto done; 5443 5444 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5445 5446 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5447 inresv32.generation = scsi_resv->generation; 5448 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5449 if (rv != 0) { 5450 rv = EFAULT; 5451 goto done; 5452 } 5453 5454 rlist32.listlen = listlen; 5455 rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, 5456 sizeof (rlist32), mode); 5457 if (rv != 0) { 5458 rv = EFAULT; 5459 goto done; 5460 } 5461 5462 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 5463 } else { 5464 inresv.generation = scsi_resv->generation; 5465 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 5466 if (rv != 0) { 5467 rv = EFAULT; 5468 goto done; 5469 } 5470 5471 rlist.listlen = listlen; 5472 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 5473 if (rv != 0) { 5474 rv = EFAULT; 5475 goto done; 5476 } 5477 5478 user_resv = rlist.list; 5479 } 5480 5481 /* copy out reservations */ 5482 if (listsize > 0 && listlen > 0) { 5483 if (listsize < listlen) 5484 listlen = listsize; 5485 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 5486 5487 for (i = 0; i < listlen; i++) { 5488 mhd_resv.type = resv->type; 5489 mhd_resv.scope = resv->scope; 5490 mhd_resv.scope_specific_addr = 5491 BE_32(resv->scope_specific_addr); 5492 bcopy(&resv->resvkey, &mhd_resv.key, 5493 MHIOC_RESV_KEY_SIZE); 5494 5495 rv = ddi_copyout(&mhd_resv, user_resv, 5496 sizeof (mhd_resv), mode); 5497 if (rv != 0) { 5498 rv = EFAULT; 5499 goto done; 5500 } 5501 resv++; 5502 user_resv++; 5503 } 5504 } 5505 5506 if (rv == 0) 5507 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5508 5509 done: 5510 kmem_free(vd_scsi, vd_scsi_len); 5511 return (rv); 5512 } 5513 5514 /* 5515 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 5516 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 5517 * server with a VD_OP_SCSICMD operation. 
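 * The caller passes a mhioc_register_t; its oldkey/newkey pair is
 * copied below into the res_key/service_key fields of the PROUT
 * parameter list, so that, for example, a zeroed oldkey with a
 * non-zero newkey registers a new key for this host.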
5518 */ 5519 static int 5520 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 5521 { 5522 vd_scsi_t *vd_scsi; 5523 sd_prout_t *scsi_prout; 5524 mhioc_register_t mhd_reg; 5525 int vd_scsi_len, rv; 5526 5527 /* copyin arguments */ 5528 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 5529 if (rv != 0) 5530 return (EFAULT); 5531 5532 /* build SCSI VD_OP request */ 5533 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 5534 sizeof (sd_prout_t), &vd_scsi_len); 5535 5536 /* set parameters */ 5537 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5538 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 5539 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 5540 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 5541 5542 /* submit the request */ 5543 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5544 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5545 5546 if (rv == 0) 5547 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5548 5549 kmem_free(vd_scsi, vd_scsi_len); 5550 return (rv); 5551 } 5552 5553 /* 5554 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 5555 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 5556 * server with a VD_OP_SCSICMD operation. 5557 */ 5558 static int 5559 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 5560 { 5561 union scsi_cdb *cdb; 5562 vd_scsi_t *vd_scsi; 5563 sd_prout_t *scsi_prout; 5564 mhioc_resv_desc_t mhd_resv; 5565 int vd_scsi_len, rv; 5566 5567 /* copyin arguments */ 5568 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 5569 if (rv != 0) 5570 return (EFAULT); 5571 5572 /* build SCSI VD_OP request */ 5573 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 5574 sizeof (sd_prout_t), &vd_scsi_len); 5575 5576 /* set parameters */ 5577 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5578 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5579 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 5580 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 5581 cdb->cdb_opaque[2] = mhd_resv.type; 5582 5583 /* submit the request */ 5584 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5585 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5586 5587 if (rv == 0) 5588 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5589 5590 kmem_free(vd_scsi, vd_scsi_len); 5591 return (rv); 5592 } 5593 5594 /* 5595 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 5596 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 5597 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
5598 */ 5599 static int 5600 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode) 5601 { 5602 union scsi_cdb *cdb; 5603 vd_scsi_t *vd_scsi; 5604 sd_prout_t *scsi_prout; 5605 mhioc_preemptandabort_t mhd_preempt; 5606 int vd_scsi_len, rv; 5607 5608 /* copyin arguments */ 5609 rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode); 5610 if (rv != 0) 5611 return (EFAULT); 5612 5613 /* build SCSI VD_OP request */ 5614 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT, 5615 sizeof (sd_prout_t), &vd_scsi_len); 5616 5617 /* set parameters */ 5618 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5619 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5620 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5621 bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key, 5622 MHIOC_RESV_KEY_SIZE); 5623 bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key, 5624 MHIOC_RESV_KEY_SIZE); 5625 scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr; 5626 cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type; 5627 5628 /* submit the request */ 5629 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5630 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5631 5632 if (rv == 0) 5633 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5634 5635 kmem_free(vd_scsi, vd_scsi_len); 5636 return (rv); 5637 } 5638 5639 /* 5640 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl 5641 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY 5642 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation. 5643 */ 5644 static int 5645 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode) 5646 { 5647 vd_scsi_t *vd_scsi; 5648 sd_prout_t *scsi_prout; 5649 mhioc_registerandignorekey_t mhd_regi; 5650 int vd_scsi_len, rv; 5651 5652 /* copyin arguments */ 5653 rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode); 5654 if (rv != 0) 5655 return (EFAULT); 5656 5657 /* build SCSI VD_OP request */ 5658 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY, 5659 sizeof (sd_prout_t), &vd_scsi_len); 5660 5661 /* set parameters */ 5662 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5663 bcopy(mhd_regi.newkey.key, scsi_prout->service_key, 5664 MHIOC_RESV_KEY_SIZE); 5665 scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl; 5666 5667 /* submit the request */ 5668 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5669 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5670 5671 if (rv == 0) 5672 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5673 5674 kmem_free(vd_scsi, vd_scsi_len); 5675 return (rv); 5676 } 5677 5678 /* 5679 * This function is used by the failfast mechanism to send a SCSI command 5680 * to check for reservation conflict. 5681 */ 5682 static int 5683 vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) 5684 { 5685 int cdb_len, sense_len, vd_scsi_len; 5686 vd_scsi_t *vd_scsi; 5687 union scsi_cdb *cdb; 5688 int rv; 5689 5690 ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1); 5691 5692 if (scmd == SCMD_WRITE_G1) 5693 cdb_len = CDB_GROUP1; 5694 else 5695 cdb_len = CDB_GROUP0; 5696 5697 sense_len = sizeof (struct scsi_extended_sense); 5698 5699 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len); 5700 5701 /* set cdb */ 5702 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5703 cdb->scc_cmd = scmd; 5704 5705 vd_scsi->timeout = vdc_scsi_timeout; 5706 5707 /* 5708 * Submit the request. 
The last argument has to be B_FALSE so that
5709 * vdc_do_sync_op does not loop checking for reservation conflict if
5710 * the operation returns an error.
5711 */
5712 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
5713 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE);
5714
5715 if (rv == 0)
5716 (void) vdc_scsi_status(vdc, vd_scsi, B_FALSE);
5717
5718 kmem_free(vd_scsi, vd_scsi_len);
5719 return (rv);
5720 }
5721
5722 /*
5723 * This function is used by the failfast mechanism to check for reservation
5724 * conflict. It sends some SCSI commands which will fail with a reservation
5725 * conflict error if the system does not have access to the disk and this
5726 * will panic the system.
5727 *
5728 * Returned Code:
5729 * 0 - disk is accessible without reservation conflict error
5730 * != 0 - unable to check if disk is accessible
5731 */
5732 int
5733 vdc_failfast_check_resv(vdc_t *vdc)
5734 {
5735 int failure = 0;
5736
5737 /*
5738 * Send a TEST UNIT READY command. The command will panic
5739 * the system if it fails with a reservation conflict.
5740 */
5741 if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0)
5742 failure++;
5743
5744 /*
5745 * With SPC-3 compliant devices TEST UNIT READY will succeed on
5746 * a reserved device, so we also do a WRITE(10) of zero bytes in
5747 * order to provoke a Reservation Conflict status on those newer
5748 * devices.
5749 */
5750 if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0)
5751 failure++;
5752
5753 return (failure);
5754 }
5755
5756 /*
5757 * Add a pending I/O to the failfast I/O queue. An I/O is added to this
5758 * queue when it has failed and failfast is enabled. Then we have to check
5759 * if it has failed because of a reservation conflict, in which case we have
5760 * to panic the system.
5761 *
5762 * Async I/O should be queued with their block I/O data transfer structure
5763 * (buf). Sync I/O should be queued with buf = NULL.
5764 */
5765 static vdc_io_t *
5766 vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf)
5767 {
5768 vdc_io_t *vio;
5769
5770 ASSERT(MUTEX_HELD(&vdc->lock));
5771
5772 vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP);
5773 vio->vio_next = vdc->failfast_io_queue;
5774 vio->vio_buf = buf;
5775 vio->vio_qtime = ddi_get_lbolt();
5776
5777 vdc->failfast_io_queue = vio;
5778
5779 /* notify the failfast thread that a new I/O is queued */
5780 cv_signal(&vdc->failfast_cv);
5781
5782 return (vio);
5783 }
5784
5785 /*
5786 * Remove and complete I/O in the failfast I/O queue which have been
5787 * added before the indicated deadline. A deadline of 0 means that all
5788 * I/O have to be unqueued and marked as completed.
5789 */
5790 static void
5791 vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline)
5792 {
5793 vdc_io_t *vio, *vio_tmp;
5794
5795 ASSERT(MUTEX_HELD(&vdc->lock));
5796
5797 vio_tmp = NULL;
5798 vio = vdc->failfast_io_queue;
5799
5800 if (deadline != 0) {
5801 /*
5802 * Skip any I/O queued after the deadline. The failfast
5803 * I/O queue is ordered starting with the last I/O added
5804 * to the queue.
5805 */
5806 while (vio != NULL && vio->vio_qtime > deadline) {
5807 vio_tmp = vio;
5808 vio = vio->vio_next;
5809 }
5810 }
5811
5812 if (vio == NULL)
5813 /* nothing to unqueue */
5814 return;
5815
5816 /* update the queue */
5817 if (vio_tmp == NULL)
5818 vdc->failfast_io_queue = NULL;
5819 else
5820 vio_tmp->vio_next = NULL;
5821
5822 /*
5823 * Complete unqueued I/O. Async I/O have a block I/O data transfer
5824 * structure (buf) and they are completed by calling biodone(). Sync
5825 * I/O do not have a buf and they are completed by setting the
5826 * vio_qtime to zero and signaling failfast_io_cv. In that case, the
5827 * thread waiting for the I/O to complete is responsible for freeing
5828 * the vio structure.
5829 */
5830 while (vio != NULL) {
5831 vio_tmp = vio->vio_next;
5832 if (vio->vio_buf != NULL) {
5833 biodone(vio->vio_buf);
5834 kmem_free(vio, sizeof (vdc_io_t));
5835 } else {
5836 vio->vio_qtime = 0;
5837 }
5838 vio = vio_tmp;
5839 }
5840
5841 cv_broadcast(&vdc->failfast_io_cv);
5842 }
5843
5844 /*
5845 * Failfast Thread.
5846 *
5847 * While failfast is enabled, the failfast thread sends TEST UNIT READY
5848 * and zero-size WRITE(10) SCSI commands on a regular basis to check that
5849 * we still have access to the disk. If a command fails with a RESERVATION
5850 * CONFLICT error then the system will immediately panic.
5851 *
5852 * The failfast thread is also woken up when an I/O has failed. It then checks
5853 * the access to the disk to ensure that the I/O failure was not due to a
5854 * reservation conflict.
5855 *
5856 * There is one failfast thread for each virtual disk for which failfast is
5857 * enabled. We could have only one thread sending requests for all disks but
5858 * this would need vdc to send asynchronous requests and to have callbacks to
5859 * process replies.
5860 */
5861 static void
5862 vdc_failfast_thread(void *arg)
5863 {
5864 int status;
5865 vdc_t *vdc = (vdc_t *)arg;
5866 clock_t timeout, starttime;
5867
5868 mutex_enter(&vdc->lock);
5869
5870 while (vdc->failfast_interval != 0) {
5871
5872 starttime = ddi_get_lbolt();
5873
5874 mutex_exit(&vdc->lock);
5875
5876 /* check for reservation conflict */
5877 status = vdc_failfast_check_resv(vdc);
5878
5879 mutex_enter(&vdc->lock);
5880 /*
5881 * We have dropped the lock to send the SCSI command so we have
5882 * to check that failfast is still enabled.
5883 */
5884 if (vdc->failfast_interval == 0)
5885 break;
5886
5887 /*
5888 * If we have successfully checked the disk access and there was
5889 * no reservation conflict then we can complete any I/O queued
5890 * before the last check.
5891 */
5892 if (status == 0)
5893 vdc_failfast_io_unqueue(vdc, starttime);
5894
5895 /* proceed again if some I/O is still in the queue */
5896 if (vdc->failfast_io_queue != NULL)
5897 continue;
5898
5899 timeout = ddi_get_lbolt() +
5900 drv_usectohz(vdc->failfast_interval);
5901 (void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout);
5902 }
5903
5904 /*
5905 * Failfast is being stopped so we can complete any queued I/O.
5906 */
5907 vdc_failfast_io_unqueue(vdc, 0);
5908 vdc->failfast_thread = NULL;
5909 mutex_exit(&vdc->lock);
5910 thread_exit();
5911 }
5912
5913 /*
5914 * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
5915 */
5916 static int
5917 vdc_failfast(vdc_t *vdc, caddr_t arg, int mode)
5918 {
5919 unsigned int mh_time;
5920
5921 if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode))
5922 return (EFAULT);
5923
5924 mutex_enter(&vdc->lock);
5925 if (mh_time != 0 && vdc->failfast_thread == NULL) {
5926 vdc->failfast_thread = thread_create(NULL, 0,
5927 vdc_failfast_thread, vdc, 0, &p0, TS_RUN,
5928 v.v_maxsyspri - 2);
5929 }
5930
5931 vdc->failfast_interval = mh_time * 1000;
5932 cv_signal(&vdc->failfast_cv);
5933 mutex_exit(&vdc->lock);
5934
5935 return (0);
5936 }
5937
5938 /*
5939 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are
5940 * converted to VD_OP_SET_ACCESS operations.
5941 */
5942 static int
5943 vdc_access_set(vdc_t *vdc, uint64_t flags, int mode)
5944 {
5945 int rv;
5946
5947 /* submit ownership command request */
5948 rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags,
5949 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
5950 VIO_both_dir, B_TRUE);
5951
5952 return (rv);
5953 }
5954
5955 /*
5956 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a
5957 * VD_OP_GET_ACCESS operation.
5958 */
5959 static int
5960 vdc_access_get(vdc_t *vdc, uint64_t *status, int mode)
5961 {
5962 int rv;
5963
5964 /* submit ownership command request */
5965 rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status,
5966 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
5967 VIO_both_dir, B_TRUE);
5968
5969 return (rv);
5970 }
5971
5972 /*
5973 * Disk Ownership Thread.
5974 *
5975 * When we have taken the ownership of a disk, this thread waits to be
5976 * notified when the LDC channel is reset so that it can recover the
5977 * ownership.
5978 *
5979 * Note that the thread handling the LDC reset (vdc_process_msg_thread())
5980 * cannot be used to do the ownership recovery because it has to be
5981 * running to handle the reply message to the ownership operation.
5982 */
5983 static void
5984 vdc_ownership_thread(void *arg)
5985 {
5986 vdc_t *vdc = (vdc_t *)arg;
5987 clock_t timeout;
5988 uint64_t status;
5989
5990 mutex_enter(&vdc->ownership_lock);
5991 mutex_enter(&vdc->lock);
5992
5993 while (vdc->ownership & VDC_OWNERSHIP_WANTED) {
5994
5995 if ((vdc->ownership & VDC_OWNERSHIP_RESET) ||
5996 !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) {
5997 /*
5998 * There was a reset so the ownership has been lost;
5999 * try to recover it. We do this without the preempt
6000 * option so that we don't steal the ownership from
6001 * someone who has preempted us.
6002 */
6003 DMSG(vdc, 0, "[%d] Ownership lost, recovering",
6004 vdc->instance);
6005
6006 vdc->ownership &= ~(VDC_OWNERSHIP_RESET |
6007 VDC_OWNERSHIP_GRANTED);
6008
6009 mutex_exit(&vdc->lock);
6010
6011 status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
6012 VD_ACCESS_SET_PRESERVE, FKIOCTL);
6013
6014 mutex_enter(&vdc->lock);
6015
6016 if (status == 0) {
6017 DMSG(vdc, 0, "[%d] Ownership recovered",
6018 vdc->instance);
6019 vdc->ownership |= VDC_OWNERSHIP_GRANTED;
6020 } else {
6021 DMSG(vdc, 0, "[%d] Failed to recover ownership",
6022 vdc->instance);
6023 }
6024
6025 }
6026
6027 /*
6028 * If we have the ownership then we just wait for an event
6029 * to happen (LDC reset), otherwise we retry the recovery
6030 * after a delay.
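 * (A timeout of zero below means block in cv_wait() until an LDC
 * reset signals ownership_cv; a non-zero timeout bounds the wait in
 * cv_timedwait() so that the recovery is retried after
 * vdc_ownership_delay microseconds.)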
6031 */ 6032 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) 6033 timeout = 0; 6034 else 6035 timeout = ddi_get_lbolt() + 6036 drv_usectohz(vdc_ownership_delay); 6037 6038 /* Release the ownership_lock and wait on the vdc lock */ 6039 mutex_exit(&vdc->ownership_lock); 6040 6041 if (timeout == 0) 6042 (void) cv_wait(&vdc->ownership_cv, &vdc->lock); 6043 else 6044 (void) cv_timedwait(&vdc->ownership_cv, 6045 &vdc->lock, timeout); 6046 6047 mutex_exit(&vdc->lock); 6048 6049 mutex_enter(&vdc->ownership_lock); 6050 mutex_enter(&vdc->lock); 6051 } 6052 6053 vdc->ownership_thread = NULL; 6054 mutex_exit(&vdc->lock); 6055 mutex_exit(&vdc->ownership_lock); 6056 6057 thread_exit(); 6058 } 6059 6060 static void 6061 vdc_ownership_update(vdc_t *vdc, int ownership_flags) 6062 { 6063 ASSERT(MUTEX_HELD(&vdc->ownership_lock)); 6064 6065 mutex_enter(&vdc->lock); 6066 vdc->ownership = ownership_flags; 6067 if ((vdc->ownership & VDC_OWNERSHIP_WANTED) && 6068 vdc->ownership_thread == NULL) { 6069 /* start ownership thread */ 6070 vdc->ownership_thread = thread_create(NULL, 0, 6071 vdc_ownership_thread, vdc, 0, &p0, TS_RUN, 6072 v.v_maxsyspri - 2); 6073 } else { 6074 /* notify the ownership thread */ 6075 cv_signal(&vdc->ownership_cv); 6076 } 6077 mutex_exit(&vdc->lock); 6078 } 6079 6080 /* 6081 * Get the size and the block size of a virtual disk from the vdisk server. 6082 * We need to use this operation when the vdisk_size attribute was not 6083 * available during the handshake with the vdisk server. 6084 */ 6085 static int 6086 vdc_check_capacity(vdc_t *vdc) 6087 { 6088 int rv = 0; 6089 size_t alloc_len; 6090 vd_capacity_t *vd_cap; 6091 6092 if (vdc->vdisk_size != 0) 6093 return (0); 6094 6095 alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t)); 6096 6097 vd_cap = kmem_zalloc(alloc_len, KM_SLEEP); 6098 6099 rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len, 6100 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE); 6101 6102 if (rv == 0) { 6103 if (vd_cap->vdisk_block_size != vdc->block_size || 6104 vd_cap->vdisk_size == VD_SIZE_UNKNOWN || 6105 vd_cap->vdisk_size == 0) 6106 rv = EINVAL; 6107 else 6108 vdc->vdisk_size = vd_cap->vdisk_size; 6109 } 6110 6111 kmem_free(vd_cap, alloc_len); 6112 return (rv); 6113 } 6114 6115 /* 6116 * This structure is used in the DKIO(7I) array below. 
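 *
 * For example, the DKIOCGVTOC entry in the array pairs that ioctl
 * with VD_OP_GET_VTOC and vdc_get_vtoc_convert(), which translates
 * between the wire-format vd_vtoc_t and the struct vtoc seen by the
 * caller.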
6117 */ 6118 typedef struct vdc_dk_ioctl { 6119 uint8_t op; /* VD_OP_XXX value */ 6120 int cmd; /* Solaris ioctl operation number */ 6121 size_t nbytes; /* size of structure to be copied */ 6122 6123 /* function to convert between vDisk and Solaris structure formats */ 6124 int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg, 6125 int mode, int dir); 6126 } vdc_dk_ioctl_t; 6127 6128 /* 6129 * Subset of DKIO(7I) operations currently supported 6130 */ 6131 static vdc_dk_ioctl_t dk_ioctl[] = { 6132 {VD_OP_FLUSH, DKIOCFLUSHWRITECACHE, 0, 6133 vdc_null_copy_func}, 6134 {VD_OP_GET_WCE, DKIOCGETWCE, sizeof (int), 6135 vdc_get_wce_convert}, 6136 {VD_OP_SET_WCE, DKIOCSETWCE, sizeof (int), 6137 vdc_set_wce_convert}, 6138 {VD_OP_GET_VTOC, DKIOCGVTOC, sizeof (vd_vtoc_t), 6139 vdc_get_vtoc_convert}, 6140 {VD_OP_SET_VTOC, DKIOCSVTOC, sizeof (vd_vtoc_t), 6141 vdc_set_vtoc_convert}, 6142 {VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t), 6143 vdc_get_geom_convert}, 6144 {VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, sizeof (vd_geom_t), 6145 vdc_get_geom_convert}, 6146 {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t), 6147 vdc_get_geom_convert}, 6148 {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t), 6149 vdc_set_geom_convert}, 6150 {VD_OP_GET_EFI, DKIOCGETEFI, 0, 6151 vdc_get_efi_convert}, 6152 {VD_OP_SET_EFI, DKIOCSETEFI, 0, 6153 vdc_set_efi_convert}, 6154 6155 /* DIOCTL_RWCMD is converted to a read or a write */ 6156 {0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL}, 6157 6158 /* mhd(7I) non-shared multihost disks ioctls */ 6159 {0, MHIOCTKOWN, 0, vdc_null_copy_func}, 6160 {0, MHIOCRELEASE, 0, vdc_null_copy_func}, 6161 {0, MHIOCSTATUS, 0, vdc_null_copy_func}, 6162 {0, MHIOCQRESERVE, 0, vdc_null_copy_func}, 6163 6164 /* mhd(7I) shared multihost disks ioctls */ 6165 {0, MHIOCGRP_INKEYS, 0, vdc_null_copy_func}, 6166 {0, MHIOCGRP_INRESV, 0, vdc_null_copy_func}, 6167 {0, MHIOCGRP_REGISTER, 0, vdc_null_copy_func}, 6168 {0, MHIOCGRP_RESERVE, 0, vdc_null_copy_func}, 6169 {0, MHIOCGRP_PREEMPTANDABORT, 0, vdc_null_copy_func}, 6170 {0, MHIOCGRP_REGISTERANDIGNOREKEY, 0, vdc_null_copy_func}, 6171 6172 /* mhd(7I) failfast ioctl */ 6173 {0, MHIOCENFAILFAST, 0, vdc_null_copy_func}, 6174 6175 /* 6176 * These particular ioctls are not sent to the server - vdc fakes up 6177 * the necessary info. 6178 */ 6179 {0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func}, 6180 {0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func}, 6181 {0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func}, 6182 {0, DKIOCGAPART, 0, vdc_null_copy_func }, 6183 {0, DKIOCREMOVABLE, 0, vdc_null_copy_func}, 6184 {0, CDROMREADOFFSET, 0, vdc_null_copy_func} 6185 }; 6186 6187 /* 6188 * The signature of vd_process_ioctl() has changed to include the return value 6189 * pointer. However we don't want to change vd_efi_* functions now so we add 6190 * this wrapper function so that we can use it with vdc_efi_init(). 6191 * 6192 * vd_efi_* functions need some changes to fix 6528974 and so we will eventually 6193 * remove this function when fixing that bug. 
/*
 * The signature of vd_process_ioctl() has changed to include the return
 * value pointer. However, we don't want to change the vd_efi_* functions
 * at this point, so we add this wrapper so that vd_process_ioctl() can
 * still be used with vdc_efi_init().
 *
 * The vd_efi_* functions need some changes to fix 6528974, so we will
 * eventually remove this wrapper when fixing that bug.
 */
static int
vd_process_efi_ioctl(dev_t dev, int cmd, caddr_t arg, int mode)
{
	int rval;
	return (vd_process_ioctl(dev, cmd, arg, mode, &rval));
}

/*
 * Function:
 *	vd_process_ioctl()
 *
 * Description:
 *	This routine processes disk-specific ioctl calls.
 *
 * Arguments:
 *	dev	- the device number
 *	cmd	- the operation [dkio(7I)] to be processed
 *	arg	- pointer to user-provided structure
 *		  (contains data to be set or reference parameter for get)
 *	mode	- bit flag, indicating open settings, 32/64 bit type, etc
 *	rvalp	- pointer to return value for calling process.
 *
 * Return Code:
 *	0
 *	EFAULT
 *	ENXIO
 *	EIO
 *	ENOTSUP
 */
static int
vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp)
{
	int		instance = VDCUNIT(dev);
	vdc_t		*vdc = NULL;
	int		rv = -1;
	int		idx = 0;	/* index into dk_ioctl[] */
	size_t		len = 0;	/* #bytes to send to vds */
	size_t		alloc_len = 0;	/* #bytes to allocate mem for */
	caddr_t		mem_p = NULL;
	size_t		nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0]));
	vdc_dk_ioctl_t	*iop;

	vdc = ddi_get_soft_state(vdc_state, instance);
	if (vdc == NULL) {
		cmn_err(CE_NOTE, "![%d] Could not get soft state structure",
		    instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n",
	    instance, cmd, dev, ddi_model_convert_from(mode & FMODELS));

	if (rvalp != NULL) {
		/* the return value of the ioctl is 0 by default */
		*rvalp = 0;
	}
	/*
	 * Validate the ioctl operation to be performed.
	 *
	 * If we have looped through the array without finding a match
	 * then we don't support this ioctl.
	 */
	for (idx = 0; idx < nioctls; idx++) {
		if (cmd == dk_ioctl[idx].cmd)
			break;
	}

	if (idx >= nioctls) {
		DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n",
		    vdc->instance, cmd);
		return (ENOTSUP);
	}

	iop = &(dk_ioctl[idx]);

	if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) {
		/* size is not fixed for EFI ioctls, it depends on ioctl arg */
		dk_efi_t	dk_efi;

		rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length;
	} else {
		len = iop->nbytes;
	}

	/* check if the ioctl is applicable */
	switch (cmd) {
	case CDROMREADOFFSET:
	case DKIOCREMOVABLE:
		return (ENOTTY);

	case USCSICMD:
	case MHIOCTKOWN:
	case MHIOCSTATUS:
	case MHIOCQRESERVE:
	case MHIOCRELEASE:
	case MHIOCGRP_INKEYS:
	case MHIOCGRP_INRESV:
	case MHIOCGRP_REGISTER:
	case MHIOCGRP_RESERVE:
	case MHIOCGRP_PREEMPTANDABORT:
	case MHIOCGRP_REGISTERANDIGNOREKEY:
	case MHIOCENFAILFAST:
		if (vdc->cinfo == NULL)
			return (ENXIO);
		if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS)
			return (ENOTTY);
		break;

	case DIOCTL_RWCMD:
		if (vdc->cinfo == NULL)
			return (ENXIO);
		if (vdc->cinfo->dki_ctype != DKC_DIRECT)
			return (ENOTTY);
		break;

	case DKIOCINFO:
		if (vdc->cinfo == NULL)
			return (ENXIO);
		break;

	case DKIOCGMEDIAINFO:
		if (vdc->minfo == NULL)
			return (ENXIO);
		if (vdc_check_capacity(vdc) != 0)
			/* disk capacity is not available */
			return (EIO);
		break;
	}

	/*
	 * Deal with ioctls which require processing different from
	 * converting ioctl arguments and sending a corresponding
	 * VD operation.
	 */
	switch (cmd) {

	case USCSICMD:
	{
		return (vdc_uscsi_cmd(vdc, arg, mode));
	}

	case MHIOCTKOWN:
	{
		mutex_enter(&vdc->ownership_lock);
		/*
		 * We have to set VDC_OWNERSHIP_WANTED now so that the
		 * ownership can be flagged with VDC_OWNERSHIP_RESET if the
		 * LDC is reset while we are processing the ioctl.
		 */
		vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED);

		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
		    VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED |
			    VDC_OWNERSHIP_GRANTED);
		} else {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
		mutex_exit(&vdc->ownership_lock);
		return (rv);
	}

	case MHIOCRELEASE:
	{
		mutex_enter(&vdc->ownership_lock);
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
		mutex_exit(&vdc->ownership_lock);
		return (rv);
	}
	case MHIOCSTATUS:
	{
		uint64_t status;

		rv = vdc_access_get(vdc, &status, mode);
		if (rv == 0 && rvalp != NULL)
			*rvalp = (status & VD_ACCESS_ALLOWED)? 0 : 1;
		return (rv);
	}

	case MHIOCQRESERVE:
	{
		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode);
		return (rv);
	}

	case MHIOCGRP_INKEYS:
	{
		return (vdc_mhd_inkeys(vdc, arg, mode));
	}

	case MHIOCGRP_INRESV:
	{
		return (vdc_mhd_inresv(vdc, arg, mode));
	}

	case MHIOCGRP_REGISTER:
	{
		return (vdc_mhd_register(vdc, arg, mode));
	}

	case MHIOCGRP_RESERVE:
	{
		return (vdc_mhd_reserve(vdc, arg, mode));
	}

	case MHIOCGRP_PREEMPTANDABORT:
	{
		return (vdc_mhd_preemptabort(vdc, arg, mode));
	}

	case MHIOCGRP_REGISTERANDIGNOREKEY:
	{
		return (vdc_mhd_registerignore(vdc, arg, mode));
	}

	case MHIOCENFAILFAST:
	{
		rv = vdc_failfast(vdc, arg, mode);
		return (rv);
	}

	case DIOCTL_RWCMD:
	{
		return (vdc_dioctl_rwcmd(dev, arg, mode));
	}

	case DKIOCGAPART:
	{
		return (vdc_dkio_get_partition(vdc, arg, mode));
	}

	case DKIOCINFO:
	{
		struct dk_cinfo	cinfo;

		bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo));
		cinfo.dki_partition = VDCPART(dev);

		rv = ddi_copyout(&cinfo, (void *)arg,
		    sizeof (struct dk_cinfo), mode);
		if (rv != 0)
			return (EFAULT);

		return (0);
	}

	case DKIOCGMEDIAINFO:
	{
		ASSERT(vdc->vdisk_size != 0);
		if (vdc->minfo->dki_capacity == 0)
			vdc->minfo->dki_capacity = vdc->vdisk_size;
		rv = ddi_copyout(vdc->minfo, (void *)arg,
		    sizeof (struct dk_minfo), mode);
		if (rv != 0)
			return (EFAULT);

		return (0);
	}
	case DKIOCFLUSHWRITECACHE:
	{
		struct dk_callback *dkc =
		    (struct dk_callback *)(uintptr_t)arg;
		vdc_dk_arg_t	*dkarg = NULL;

		DMSG(vdc, 1, "[%d] Flush W$: mode %x\n",
		    instance, mode);

		/*
		 * If arg is NULL, then there is no callback function
		 * registered and the call operates synchronously; we
		 * break and continue with the rest of the function and
		 * wait for vds to return (i.e. after the request to
		 * vds returns successfully, all writes completed prior
		 * to the ioctl will have been flushed from the disk
		 * write cache to persistent media).
		 *
		 * If a callback function is registered, we dispatch
		 * the request on a task queue and return immediately.
		 * The callback will deal with informing the calling
		 * thread that the flush request is completed.
		 */
		if (dkc == NULL)
			break;

		/*
		 * the asynchronous callback is only supported if
		 * invoked from within the kernel
		 */
		if ((mode & FKIOCTL) == 0)
			return (ENOTSUP);

		dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP);

		dkarg->mode = mode;
		dkarg->dev = dev;
		bcopy(dkc, &dkarg->dkc, sizeof (*dkc));

		mutex_enter(&vdc->lock);
		vdc->dkio_flush_pending++;
		dkarg->vdc = vdc;
		mutex_exit(&vdc->lock);

		/* put the request on a task queue */
		rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb,
		    (void *)dkarg, DDI_SLEEP);
		if (rv == NULL) {
			/* clean up if dispatch fails */
			mutex_enter(&vdc->lock);
			vdc->dkio_flush_pending--;
			mutex_exit(&vdc->lock);
			kmem_free(dkarg, sizeof (vdc_dk_arg_t));
		}

		return (rv == NULL ? ENOMEM : 0);
	}
	}

	/* catch programming error in vdc - should be a VD_OP_XXX ioctl */
	ASSERT(iop->op != 0);

	/* check if the vDisk server handles the operation for this vDisk */
	if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) {
		DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n",
		    vdc->instance, iop->op);
		return (ENOTSUP);
	}

	/* LDC requires that the memory being mapped is 8-byte aligned */
	alloc_len = P2ROUNDUP(len, sizeof (uint64_t));
	DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n",
	    instance, len, alloc_len);

	if (alloc_len > 0)
		mem_p = kmem_zalloc(alloc_len, KM_SLEEP);

	/*
	 * Call the conversion function for this ioctl which, if necessary,
	 * converts from the Solaris format to the format ARC'ed
	 * as part of the vDisk protocol (FWARC 2006/195)
	 */
	ASSERT(iop->convert != NULL);
	rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN);
	if (rv != 0) {
		DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
		    instance, rv, cmd);
		if (mem_p != NULL)
			kmem_free(mem_p, alloc_len);
		return (rv);
	}

	/*
	 * send request to vds to service the ioctl.
	 */
	rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len,
	    VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode,
	    VIO_both_dir, B_TRUE);

	if (rv != 0) {
		/*
		 * This is not necessarily an error. The ioctl could
		 * be returning a value such as ENOTTY to indicate
		 * that the ioctl is not applicable.
		 */
		DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n",
		    instance, rv, cmd);
		if (mem_p != NULL)
			kmem_free(mem_p, alloc_len);

		return (rv);
	}
	/*
	 * Call the conversion function (if it exists) for this ioctl
	 * which converts from the format ARC'ed as part of the vDisk
	 * protocol (FWARC 2006/195) back to a format understood by
	 * the rest of Solaris.
	 */
	rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT);
	if (rv != 0) {
		DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
		    instance, rv, cmd);
		if (mem_p != NULL)
			kmem_free(mem_p, alloc_len);
		return (rv);
	}

	if (mem_p != NULL)
		kmem_free(mem_p, alloc_len);

	return (rv);
}

/*
 * Function:
 *	vdc_null_copy_func()
 *
 * Description:
 *	This is an empty conversion function used by ioctl calls which
 *	do not need to convert the data being passed in/out to userland
 */
static int
vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))
	_NOTE(ARGUNUSED(from))
	_NOTE(ARGUNUSED(to))
	_NOTE(ARGUNUSED(mode))
	_NOTE(ARGUNUSED(dir))

	return (0);
}

/*
 * Conversion function for DKIOCGETWCE: copy the write cache enable state
 * returned by the server back out to the caller.
 */
static int
vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	if (dir == VD_COPYIN)
		return (0);		/* nothing to do */

	if (ddi_copyout(from, to, sizeof (int), mode) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Conversion function for DKIOCSETWCE: copy the caller's write cache
 * enable setting into the request sent to the server.
 */
static int
vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	if (dir == VD_COPYOUT)
		return (0);		/* nothing to do */

	if (ddi_copyin(from, to, sizeof (int), mode) != 0)
		return (EFAULT);

	return (0);
}
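/*
 * The two functions above illustrate the general convert callback
 * contract: each callback is invoked twice per ioctl, once with
 * dir == VD_COPYIN before the request is sent to the server and once
 * with dir == VD_COPYOUT after the reply, and each direction is a no-op
 * for callbacks that only care about the other one. A minimal sketch of
 * a hypothetical callback following that pattern (vdc_example_convert is
 * illustrative only and not part of this driver):
 *
 *	static int
 *	vdc_example_convert(vdc_t *vdc, void *from, void *to,
 *	    int mode, int dir)
 *	{
 *		if (dir == VD_COPYIN)
 *			return (0);	nothing to send to the server
 *		if (ddi_copyout(from, to, sizeof (int), mode) != 0)
 *			return (EFAULT);
 *		return (0);
 *	}
 */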
/*
 * Function:
 *	vdc_get_vtoc_convert()
 *
 * Description:
 *	This routine performs the necessary conversions between the format
 *	defined in FWARC 2006/195 and the DKIOCGVTOC Solaris structure.
 *
 *	In the struct vtoc definition, the timestamp field is marked as not
 *	supported so it is not part of the vDisk protocol (FWARC 2006/195).
 *	However, SVM uses that field to check that it can write into the
 *	VTOC, so we fake up that field's contents.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- the buffer containing the data to be copied from
 *	to	- the buffer to be copied to
 *	mode	- flags passed to ioctl() call
 *	dir	- the "direction" of the copy - VD_COPYIN or VD_COPYOUT
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- incorrect buffer passed in.
 *	EFAULT	- ddi_copyout routine encountered an error.
 */
static int
vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	int		i;
	void		*tmp_memp;
	struct vtoc	vt;
	struct vtoc32	vt32;
	int		copy_len = 0;
	int		rv = 0;

	if (dir != VD_COPYOUT)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32)
		copy_len = sizeof (struct vtoc32);
	else
		copy_len = sizeof (struct vtoc);

	VD_VTOC2VTOC((vd_vtoc_t *)from, &vt);

	/* fake the VTOC timestamp field */
	for (i = 0; i < V_NUMPAR; i++) {
		vt.timestamp[i] = vdc->vtoc->timestamp[i];
	}

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		/* LINTED E_ASSIGN_NARROW_CONV */
		vtoctovtoc32(vt, vt32);
		tmp_memp = &vt32;
	} else {
		tmp_memp = &vt;
	}
	rv = ddi_copyout(tmp_memp, to, copy_len, mode);
	if (rv != 0)
		rv = EFAULT;

	return (rv);
}
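/*
 * Since a 32-bit application can issue DKIOCGVTOC against a 64-bit
 * kernel, the copyout above must match the caller's data model: for an
 * ILP32 caller the label is converted to a struct vtoc32 and
 * sizeof (struct vtoc32) bytes are copied out. The standard pattern, as
 * used above and again in vdc_set_vtoc_convert() below:
 *
 *	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32)
 *		copy_len = sizeof (struct vtoc32);
 *	else
 *		copy_len = sizeof (struct vtoc);
 */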
/*
 * Function:
 *	vdc_set_vtoc_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCSVTOC
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyin of data failed
 */
static int
vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	void		*tmp_mem = NULL, *uvtoc;
	struct vtoc	vt;
	struct vtoc	*vtp = &vt;
	vd_vtoc_t	vtvd;
	int		copy_len = 0;
	int		i, rv = 0;

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (dir == VD_COPYIN)
		uvtoc = from;
	else
		uvtoc = to;

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32)
		copy_len = sizeof (struct vtoc32);
	else
		copy_len = sizeof (struct vtoc);

	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);

	rv = ddi_copyin(uvtoc, tmp_mem, copy_len, mode);
	if (rv != 0) {
		kmem_free(tmp_mem, copy_len);
		return (EFAULT);
	}

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		vtoc32tovtoc((*(struct vtoc32 *)tmp_mem), vt);
	} else {
		vtp = tmp_mem;
	}

	if (dir == VD_COPYOUT) {
		/*
		 * The disk label may have changed. Revalidate the disk
		 * geometry. This will also update the device nodes and
		 * properties.
		 */
		vdc_validate(vdc);

		/*
		 * We also need to keep track of the timestamp fields.
		 */
		for (i = 0; i < V_NUMPAR; i++) {
			vdc->vtoc->timestamp[i] = vtp->timestamp[i];
		}

		kmem_free(tmp_mem, copy_len);
		return (0);
	}

	VTOC2VD_VTOC(vtp, &vtvd);
	bcopy(&vtvd, to, sizeof (vd_vtoc_t));
	kmem_free(tmp_mem, copy_len);

	return (0);
}

/*
 * Function:
 *	vdc_get_geom_convert()
 *
 * Description:
 *	This routine performs the necessary conversions between the format
 *	defined in FWARC 2006/195 and the DKIOCGGEOM, DKIOCG_PHYGEOM and
 *	DKIOCG_VIRTGEOM Solaris structures.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyout of data failed
 */
static int
vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	struct dk_geom	geom;
	int	copy_len = sizeof (struct dk_geom);
	int	rv = 0;

	if (dir != VD_COPYOUT)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom);
	rv = ddi_copyout(&geom, to, copy_len, mode);
	if (rv != 0)
		rv = EFAULT;

	return (rv);
}

/*
 * Function:
 *	vdc_set_geom_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCSGEOM
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyin of data failed
 */
static int
vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	vd_geom_t	vdgeom;
	void		*tmp_mem = NULL;
	int		copy_len = sizeof (struct dk_geom);
	int		rv = 0;

	if (dir != VD_COPYIN)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);

	rv = ddi_copyin(from, tmp_mem, copy_len, mode);
	if (rv != 0) {
		kmem_free(tmp_mem, copy_len);
		return (EFAULT);
	}
	DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom);
	bcopy(&vdgeom, to, sizeof (vdgeom));
	kmem_free(tmp_mem, copy_len);

	return (0);
}
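/*
 * The EFI conversion functions below deal with a variable-length
 * payload: the Solaris dk_efi_t carries a pointer (dki_data) to a
 * separate buffer, whereas the on-the-wire vd_efi_t stores the data
 * inline after its header, which is why the ioctl path earlier computes
 * len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length. Conceptually (this
 * is a sketch; the authoritative definition is in vdsk_common.h):
 *
 *	typedef struct vd_efi {
 *		uint64_t lba;		block offset of the request
 *		uint64_t length;	length of the data in bytes
 *		char	 data[1];	'length' bytes of inline data
 *	} vd_efi_t;
 */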
/*
 * Conversion function for the DKIOCGETEFI ioctl. On VD_COPYIN, the
 * caller's dk_efi_t request header is converted to the inline vd_efi_t
 * wire format; on VD_COPYOUT, the EFI data returned by the server is
 * copied back out to the caller's dki_data buffer.
 */
static int
vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	vd_efi_t	*vd_efi;
	dk_efi_t	dk_efi;
	int		rv = 0;
	void		*uaddr;

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (dir == VD_COPYIN) {

		vd_efi = (vd_efi_t *)to;

		rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		vd_efi->lba = dk_efi.dki_lba;
		vd_efi->length = dk_efi.dki_length;
		bzero(vd_efi->data, vd_efi->length);

	} else {

		rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		uaddr = dk_efi.dki_data;

		dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);

		VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi);

		rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length,
		    mode);
		kmem_free(dk_efi.dki_data, dk_efi.dki_length);
		if (rv != 0)
			return (EFAULT);
	}

	return (0);
}

/*
 * Conversion function for the DKIOCSETEFI ioctl. On VD_COPYIN, the
 * caller's EFI data is copied in and converted to the inline vd_efi_t
 * wire format; on VD_COPYOUT there is no data to copy back, but the disk
 * label may have changed so the disk geometry is revalidated.
 */
static int
vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	dk_efi_t	dk_efi;
	void		*uaddr;

	if (dir == VD_COPYOUT) {
		/*
		 * The disk label may have changed. Revalidate the disk
		 * geometry. This will also update the device nodes and
		 * properties.
		 */
		vdc_validate(vdc);
		return (0);
	}

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0)
		return (EFAULT);

	uaddr = dk_efi.dki_data;

	dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);

	if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) {
		kmem_free(dk_efi.dki_data, dk_efi.dki_length);
		return (EFAULT);
	}

	DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to);

	kmem_free(dk_efi.dki_data, dk_efi.dki_length);

	return (0);
}


/* -------------------------------------------------------------------------- */
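/*
 * vdc_create_fake_geometry() below synthesizes the data returned by
 * DKIOCINFO and DKIOCGMEDIAINFO. From a consumer's point of view the
 * faked-up data is indistinguishable from a real controller's; e.g. a
 * hypothetical user-level caller (the device path is illustrative only):
 *
 *	struct dk_cinfo cinfo;
 *	int fd = open("/dev/rdsk/c0d0s2", O_RDONLY);
 *
 *	if (fd >= 0 && ioctl(fd, DKIOCINFO, &cinfo) == 0)
 *		(void) printf("ctype=%d maxxfer=%u\n",
 *		    cinfo.dki_ctype, cinfo.dki_maxtransfer);
 */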
/*
 * Function:
 *	vdc_create_fake_geometry()
 *
 * Description:
 *	This routine fakes up the disk info needed for some DKIO ioctls such
 *	as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do]
 *
 *	Note: This function must not be called until the vDisk attributes have
 *	been exchanged as part of the handshake with the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	none.
 */
static void
vdc_create_fake_geometry(vdc_t *vdc)
{
	ASSERT(vdc != NULL);
	ASSERT(vdc->max_xfer_sz != 0);

	/*
	 * DKIOCINFO support
	 */
	if (vdc->cinfo == NULL)
		vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP);

	(void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME);
	(void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME);
	/* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */
	vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz;

	/*
	 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD
	 * operation is supported, otherwise the controller type is DKC_DIRECT.
	 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the
	 * controller type is always DKC_DIRECT in that case.
	 *
	 * If the virtual disk is backed by a physical CD/DVD device or
	 * an ISO image, modify the controller type to indicate this
	 */
	switch (vdc->vdisk_media) {
	case VD_MEDIA_CD:
	case VD_MEDIA_DVD:
		vdc->cinfo->dki_ctype = DKC_CDROM;
		break;
	case VD_MEDIA_FIXED:
		if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD))
			vdc->cinfo->dki_ctype = DKC_SCSI_CCS;
		else
			vdc->cinfo->dki_ctype = DKC_DIRECT;
		break;
	default:
		/* in the case of v1.0 we default to a fixed disk */
		vdc->cinfo->dki_ctype = DKC_DIRECT;
		break;
	}
	vdc->cinfo->dki_flags = DKI_FMTVOL;
	vdc->cinfo->dki_cnum = 0;
	vdc->cinfo->dki_addr = 0;
	vdc->cinfo->dki_space = 0;
	vdc->cinfo->dki_prio = 0;
	vdc->cinfo->dki_vec = 0;
	vdc->cinfo->dki_unit = vdc->instance;
	vdc->cinfo->dki_slave = 0;
	/*
	 * The partition number will be created on the fly depending on the
	 * actual slice (i.e. minor node) that is used to request the data.
	 */
	vdc->cinfo->dki_partition = 0;

	/*
	 * DKIOCGMEDIAINFO support
	 */
	if (vdc->minfo == NULL)
		vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	if (vio_ver_is_supported(vdc->ver, 1, 1)) {
		vdc->minfo->dki_media_type =
		    VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media);
	} else {
		vdc->minfo->dki_media_type = DK_FIXED_DISK;
	}

	vdc->minfo->dki_capacity = vdc->vdisk_size;
	vdc->minfo->dki_lbsize = vdc->block_size;
}

/*
 * Compute the checksum of a VTOC disk label, defined as the XOR of all
 * the 16-bit words of the label except the final checksum word itself.
 */
static ushort_t
vdc_lbl2cksum(struct dk_label *label)
{
	int	count;
	ushort_t sum, *sp;

	count = (sizeof (struct dk_label)) / (sizeof (short)) - 1;
	sp = (ushort_t *)label;
	sum = 0;
	while (count--) {
		sum ^= *sp++;
	}

	return (sum);
}
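/*
 * Because vdc_lbl2cksum() above XORs every 16-bit word of the label
 * except the trailing dkl_cksum field itself, a label is valid when the
 * stored checksum equals that computed value. This is exactly the check
 * applied by vdc_validate_geometry() below:
 *
 *	if (label.dkl_magic != DKL_MAGIC ||
 *	    label.dkl_cksum != vdc_lbl2cksum(&label))
 *		then the block read from the disk is not a valid VTOC label
 */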
/*
 * Function:
 *	vdc_validate_geometry
 *
 * Description:
 *	This routine discovers the label and geometry of the disk. It stores
 *	the disk label and related information in the vdc structure. If it
 *	fails to validate the geometry or to discover the disk label then
 *	the label is marked as unknown (VD_DISK_LABEL_UNK).
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- success.
 *	EINVAL	- unknown disk label.
 *	ENOTSUP	- geometry not applicable (EFI label).
 *	EIO	- error accessing the disk.
 */
static int
vdc_validate_geometry(vdc_t *vdc)
{
	buf_t	*buf;	/* BREAD requests need to be in a buf_t structure */
	dev_t	dev;
	int	rv, rval;
	struct dk_label label;
	struct dk_geom geom;
	struct vtoc vtoc;

	ASSERT(vdc != NULL);
	ASSERT(vdc->vtoc != NULL && vdc->geom != NULL);
	ASSERT(MUTEX_HELD(&vdc->lock));

	mutex_exit(&vdc->lock);

	dev = makedevice(ddi_driver_major(vdc->dip),
	    VD_MAKE_DEV(vdc->instance, 0));

	rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval);
	if (rv == 0)
		rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc,
		    FKIOCTL, &rval);

	if (rv == ENOTSUP) {
		/*
		 * If the device does not support VTOC then we try
		 * to read an EFI label.
		 */
		struct dk_gpt *efi;
		size_t efi_len;

		rv = vdc_efi_alloc_and_read(dev, &efi, &efi_len);

		if (rv) {
			DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)",
			    vdc->instance, rv);
			mutex_enter(&vdc->lock);
			vdc_store_label_unk(vdc);
			return (EIO);
		}

		mutex_enter(&vdc->lock);
		vdc_store_label_efi(vdc, efi);
		vd_efi_free(efi, efi_len);
		return (ENOTSUP);
	}

	if (rv != 0) {
		DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)",
		    vdc->instance, rv);
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		if (rv != EINVAL)
			rv = EIO;
		return (rv);
	}

	/* check that geometry and vtoc are valid */
	if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 ||
	    vtoc.v_sanity != VTOC_SANE) {
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	/*
	 * We have a disk and a valid VTOC. However, this does not mean
	 * that the disk currently has a VTOC label. The returned VTOC may
	 * be a default VTOC to be used for configuring the disk (this is
	 * what is done for disk images). So we read the label from the
	 * beginning of the disk to ensure we really have a VTOC label.
	 *
	 * FUTURE: This could be the default way for reading the VTOC
	 * from the disk as opposed to sending the VD_OP_GET_VTOC
	 * to the server. This will be the default if vdc is implemented
	 * on top of cmlb.
	 */

	/*
	 * A single-slice disk does not support reads using an absolute disk
	 * offset, so we just rely on the DKIOCGVTOC ioctl in that case.
	 */
	if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) {
		mutex_enter(&vdc->lock);
		if (vtoc.v_nparts != 1) {
			vdc_store_label_unk(vdc);
			return (EINVAL);
		}
		vdc_store_label_vtoc(vdc, &geom, &vtoc);
		return (0);
	}

	if (vtoc.v_nparts != V_NUMPAR) {
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	/*
	 * Read disk label from start of disk
	 */
	buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
	bioinit(buf);
	buf->b_un.b_addr = (caddr_t)&label;
	buf->b_bcount = DK_LABEL_SIZE;
	buf->b_flags = B_BUSY | B_READ;
	buf->b_dev = cmpdev(dev);
	rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label,
	    DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir);
	if (rv) {
		DMSG(vdc, 1, "[%d] Failed to read disk block 0\n",
		    vdc->instance);
	} else {
		rv = biowait(buf);
	}
	biofini(buf);
	kmem_free(buf, sizeof (buf_t));

	if (rv != 0 || label.dkl_magic != DKL_MAGIC ||
	    label.dkl_cksum != vdc_lbl2cksum(&label)) {
		DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n",
		    vdc->instance);
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	mutex_enter(&vdc->lock);
	vdc_store_label_vtoc(vdc, &geom, &vtoc);
	return (0);
}
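/*
 * vdc_validate() below can block and must be called without vdc->lock
 * held, so contexts that cannot block dispatch vdc_validate_task() on a
 * task queue instead, accounting for the pending request first. A hedged
 * sketch of such a dispatch site (the actual dispatch lives elsewhere in
 * this driver):
 *
 *	mutex_enter(&vdc->lock);
 *	vdc->validate_pending++;
 *	mutex_exit(&vdc->lock);
 *	(void) taskq_dispatch(system_taskq, vdc_validate_task,
 *	    (void *)vdc, DDI_SLEEP);
 */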
/*
 * Function:
 *	vdc_validate
 *
 * Description:
 *	This routine discovers the label of the disk and creates the
 *	appropriate device nodes if the label has changed.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	none.
 */
static void
vdc_validate(vdc_t *vdc)
{
	vd_disk_label_t old_label;
	struct vtoc old_vtoc;
	int rv;

	ASSERT(!MUTEX_HELD(&vdc->lock));

	mutex_enter(&vdc->lock);

	/* save the current label and vtoc */
	old_label = vdc->vdisk_label;
	bcopy(vdc->vtoc, &old_vtoc, sizeof (struct vtoc));

	/* check the geometry */
	(void) vdc_validate_geometry(vdc);

	/* if the disk label has changed, update device nodes */
	if (vdc->vdisk_label != old_label) {

		if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
			rv = vdc_create_device_nodes_efi(vdc);
		else
			rv = vdc_create_device_nodes_vtoc(vdc);

		if (rv != 0) {
			DMSG(vdc, 0, "![%d] Failed to update device nodes",
			    vdc->instance);
		}
	}

	/* if the vtoc has changed, update device nodes properties */
	if (bcmp(vdc->vtoc, &old_vtoc, sizeof (struct vtoc)) != 0) {

		if (vdc_create_device_nodes_props(vdc) != 0) {
			DMSG(vdc, 0, "![%d] Failed to update device nodes"
			    " properties", vdc->instance);
		}
	}

	mutex_exit(&vdc->lock);
}

/*
 * Task queue wrapper around vdc_validate(): revalidate the disk and then
 * decrement the count of pending validation requests.
 */
static void
vdc_validate_task(void *arg)
{
	vdc_t *vdc = (vdc_t *)arg;

	vdc_validate(vdc);

	mutex_enter(&vdc->lock);
	ASSERT(vdc->validate_pending > 0);
	vdc->validate_pending--;
	mutex_exit(&vdc->lock);
}
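/*
 * vdc_setup_devid() below uses a common two-pass pattern for replies
 * whose size is not known in advance: issue the request with a
 * default-sized buffer and, if the server reports a larger devid,
 * reallocate and retry with the exact size. In outline (pseudo-code;
 * do_sync_op and roundup_for are placeholders, error handling omitted):
 *
 *	buf = kmem_zalloc(bufsize, KM_SLEEP);
 *	rv = do_sync_op(VD_OP_GET_DEVID, buf, bufsize);
 *	if (rv == 0 && buf->length > avail_len) {
 *		len = buf->length;	save before freeing the buffer
 *		kmem_free(buf, bufsize);
 *		bufsize = roundup_for(len);
 *		buf = kmem_zalloc(bufsize, KM_SLEEP);
 *		rv = do_sync_op(VD_OP_GET_DEVID, buf, bufsize);
 *	}
 */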
/*
 * Function:
 *	vdc_setup_devid()
 *
 * Description:
 *	This routine discovers the devid of a vDisk. It requests the devid of
 *	the underlying device from the vDisk server, builds an encapsulated
 *	devid based on the retrieved devid and registers that new devid to
 *	the vDisk.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- A devid was successfully registered for the vDisk
 */
static int
vdc_setup_devid(vdc_t *vdc)
{
	int rv;
	vd_devid_t *vd_devid;
	size_t bufsize, bufid_len;

	/*
	 * Initially, we don't know the size of the devid that the server
	 * will return, but that size will be encoded into the reply. So
	 * we do a first request using a default size, then we check if
	 * this size was large enough. If not, we do a second request with
	 * the correct size returned by the server. Note that ldc requires
	 * size to be 8-byte aligned.
	 */
	bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN),
	    sizeof (uint64_t));
	vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
	bufid_len = bufsize - sizeof (vd_efi_t) - 1;

	rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
	    bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);

	DMSG(vdc, 2, "sync_op returned %d\n", rv);

	if (rv) {
		kmem_free(vd_devid, bufsize);
		return (rv);
	}

	if (vd_devid->length > bufid_len) {
		size_t len = vd_devid->length;

		/*
		 * The returned devid is larger than the buffer used. Try
		 * again with a buffer of the right size.
		 */
		kmem_free(vd_devid, bufsize);
		bufsize = P2ROUNDUP(VD_DEVID_SIZE(len),
		    sizeof (uint64_t));
		vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
		bufid_len = bufsize - sizeof (vd_efi_t) - 1;

		rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID,
		    (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0,
		    VIO_both_dir, B_TRUE);

		if (rv) {
			kmem_free(vd_devid, bufsize);
			return (rv);
		}
	}

	/*
	 * The virtual disk should have the same device id as the one
	 * associated with the physical disk it is mapped on, otherwise
	 * sharing a disk between an LDom and a non-LDom may not work (for
	 * example for a shared SVM disk set).
	 *
	 * The DDI framework does not allow creating a device id with any
	 * type so we first create a device id of type DEVID_ENCAP and then
	 * we restore the original type of the physical device.
	 */

	DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length);

	/* build an encapsulated devid based on the returned devid */
	if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length,
	    vd_devid->id, &vdc->devid) != DDI_SUCCESS) {
		DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance);
		kmem_free(vd_devid, bufsize);
		return (1);
	}

	DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type);

	ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS);

	kmem_free(vd_devid, bufsize);

	if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) {
		DMSG(vdc, 1, "[%d] Failed to register devid\n",
		    vdc->instance);
		return (1);
	}

	return (0);
}

static void
vdc_store_label_efi(vdc_t *vdc, struct dk_gpt *efi)
{
	struct vtoc *vtoc = vdc->vtoc;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vdc->vdisk_label = VD_DISK_LABEL_EFI;
	bzero(vdc->geom, sizeof (struct dk_geom));
	vd_efi_to_vtoc(efi, vtoc);
	if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) {
		/*
		 * vd_efi_to_vtoc() will store information about the EFI Sun
		 * reserved partition (representing the entire disk) into
		 * partition 7. However, a single-slice device will only have
		 * that single partition and the vdc driver expects to find
		 * information about that partition in slice 0. So we need
		 * to copy information from slice 7 to slice 0.
		 */
		vtoc->v_part[0].p_tag = vtoc->v_part[VD_EFI_WD_SLICE].p_tag;
		vtoc->v_part[0].p_flag = vtoc->v_part[VD_EFI_WD_SLICE].p_flag;
		vtoc->v_part[0].p_start = vtoc->v_part[VD_EFI_WD_SLICE].p_start;
		vtoc->v_part[0].p_size = vtoc->v_part[VD_EFI_WD_SLICE].p_size;
	}
}

static void
vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct vtoc *vtoc)
{
	ASSERT(MUTEX_HELD(&vdc->lock));

	vdc->vdisk_label = VD_DISK_LABEL_VTOC;
	bcopy(vtoc, vdc->vtoc, sizeof (struct vtoc));
	bcopy(geom, vdc->geom, sizeof (struct dk_geom));
}

static void
vdc_store_label_unk(vdc_t *vdc)
{
	ASSERT(MUTEX_HELD(&vdc->lock));

	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	bzero(vdc->vtoc, sizeof (struct vtoc));
	bzero(vdc->geom, sizeof (struct dk_geom));
}
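/*
 * Note that all three vdc_store_label_*() helpers above assert that the
 * caller holds vdc->lock. vdc_validate_geometry(), which drops the lock
 * while issuing ioctls, therefore re-acquires it before storing the
 * label, e.g.:
 *
 *	mutex_enter(&vdc->lock);
 *	vdc_store_label_unk(vdc);
 *	return (EINVAL);
 */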