/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * LDoms virtual disk client (vdc) device driver
 *
 * This driver runs on a guest logical domain and communicates with the virtual
 * disk server (vds) driver running on the service domain which is exporting
 * virtualized "disks" to the guest logical domain.
 *
 * The driver can be divided into four sections:
 *
 * 1) generic device driver housekeeping
 *	_init, _fini, attach, detach, ops structures, etc.
 *
 * 2) communication channel setup
 *	Set up the communications link over the LDC channel that vdc uses to
 *	talk to the vDisk server. Initialise the descriptor ring which
 *	allows the LDC clients to transfer data via memory mappings.
 *
 * 3) Support exported to upper layers (filesystems, etc)
 *	The upper layers call into vdc via strategy(9E) and DKIO(7I)
 *	ioctl calls. vdc copies the data to be written to the descriptor
 *	ring or maps the buffer to store the data read by the vDisk
 *	server into the descriptor ring. It then sends a message to the
 *	vDisk server requesting it to complete the operation.
 *
 * 4) Handling responses from the vDisk server.
 *	The vDisk server will ACK some or all of the messages vdc sends to it
 *	(this is configured during the handshake). Upon receipt of an ACK
 *	vdc will check the descriptor ring and signal to the upper layer
 *	code waiting on the I/O.
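 *
 * A read request thus flows roughly as follows:
 *
 *	strategy(9E) -> vdc_send_request() -> descriptor ring -> LDC message
 *	  ... vds completes the I/O ... ACK -> vdc_handle_cb() -> biodone(9F)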
 */

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kstat.h>
#include <sys/mach_descrip.h>
#include <sys/modctl.h>
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/vtoc.h>
#include <sys/archsystm.h>
#include <sys/sysmacros.h>

#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/scsi/impl/services.h>
#include <sys/scsi/targets/sddef.h>

#include <sys/ldoms.h>
#include <sys/ldc.h>
#include <sys/vio_common.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vdsk_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdc.h>

/*
 * function prototypes
 */

/* standard driver functions */
static int	vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
static int	vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int	vdc_strategy(struct buf *buf);
static int	vdc_print(dev_t dev, char *str);
static int	vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
static int	vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
		    cred_t *credp, int *rvalp);
static int	vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
static int	vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);

static int	vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
		    void *arg, void **resultp);
static int	vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int	vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);

/* setup */
static void	vdc_min(struct buf *bufp);
static int	vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
static int	vdc_do_ldc_init(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_node);
static int	vdc_start_ldc_connection(vdc_t *vdc);
static int	vdc_create_device_nodes(vdc_t *vdc);
static int	vdc_create_device_nodes_efi(vdc_t *vdc);
static int	vdc_create_device_nodes_vtoc(vdc_t *vdc);
static int	vdc_create_device_nodes_props(vdc_t *vdc);
static void	vdc_create_io_kstats(vdc_t *vdc);
static void	vdc_create_err_kstats(vdc_t *vdc);
static void	vdc_set_err_kstats(vdc_t *vdc);
static int	vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
		    mde_cookie_t *vd_nodep, mde_cookie_t *vd_portp);
static int	vdc_get_ldc_id(md_t *, mde_cookie_t, uint64_t *);
static int	vdc_do_ldc_up(vdc_t *vdc);
static void	vdc_terminate_ldc(vdc_t *vdc);
static int	vdc_init_descriptor_ring(vdc_t *vdc);
static void	vdc_destroy_descriptor_ring(vdc_t *vdc);
static int	vdc_setup_devid(vdc_t *vdc);
static void	vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *);
static void	vdc_store_label_vtoc(vdc_t *, struct dk_geom *, struct vtoc *);
static void	vdc_store_label_unk(vdc_t *vdc);
static boolean_t vdc_is_opened(vdc_t *vdc);

/* handshake with vds */
static int	vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
static int
vdc_ver_negotiation(vdc_t *vdcp);
static int	vdc_init_attr_negotiation(vdc_t *vdc);
static int	vdc_attr_negotiation(vdc_t *vdcp);
static int	vdc_init_dring_negotiate(vdc_t *vdc);
static int	vdc_dring_negotiation(vdc_t *vdcp);
static int	vdc_send_rdx(vdc_t *vdcp);
static int	vdc_rdx_exchange(vdc_t *vdcp);
static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg);

/* processing incoming messages from vDisk server */
static void	vdc_process_msg_thread(vdc_t *vdc);
static int	vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp);

static uint_t	vdc_handle_cb(uint64_t event, caddr_t arg);
static int	vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg);
static int	vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg);
static int	vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
static int	vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
static int	vdc_send_request(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
static int	vdc_populate_descriptor(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
		    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
		    void *cb_arg, vio_desc_direction_t dir, boolean_t);

static int	vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
static int	vdc_drain_response(vdc_t *vdcp);
static int	vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
static int	vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
static int	vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);

/* dkio */
static int	vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode,
		    int *rvalp);
static int	vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg);
static void	vdc_create_fake_geometry(vdc_t *vdc);
static int	vdc_validate_geometry(vdc_t *vdc);
static void	vdc_validate(vdc_t *vdc);
static void	vdc_validate_task(void *arg);
static int	vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);

static void	vdc_ownership_update(vdc_t *vdc, int ownership_flags);
static int	vdc_access_set(vdc_t *vdc, uint64_t flags, int mode);
static vdc_io_t	*vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf);
static int	vdc_failfast_check_resv(vdc_t *vdc);

/*
 * Module variables
 */

/*
 * Tunable variables to control how long vdc waits before timing out on
 * various operations
 */
static int
vdc_hshake_retries = 3;

static int	vdc_timeout = 0; /* units: seconds */

static uint64_t vdc_hz_min_ldc_delay;
static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC;
static uint64_t vdc_hz_max_ldc_delay;
static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC;

static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC;
static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC;

/* values for dumping - need to run in a tighter loop */
static uint64_t vdc_usec_timeout_dump = 100 * MILLISEC;	/* 0.1s units: usec */
static int	vdc_dump_retries = 100;

static uint16_t	vdc_scsi_timeout = 60;		/* 60s units: seconds */

static uint64_t vdc_ownership_delay = 6 * MICROSEC;	/* 6s units: usec */

/* Count of the number of vdc instances attached */
static volatile uint32_t vdc_instance_count = 0;

/* Tunable to log all SCSI errors */
static boolean_t vdc_scsi_log_error = B_FALSE;

/* Soft state pointer */
static void	*vdc_state;

/*
 * Controlling the verbosity of the error/debug messages
 *
 * vdc_msglevel - controls level of messages
 * vdc_matchinst - 64-bit variable where each bit corresponds
 *                 to the vdc instance to which vdc_msglevel applies.
 */
int		vdc_msglevel = 0x0;
uint64_t	vdc_matchinst = 0ull;

/*
 * Supported vDisk protocol version pairs.
 *
 * The first array entry is the latest and preferred version.
 */
static const vio_ver_t	vdc_version[] = {{1, 1}};

static struct cb_ops vdc_cb_ops = {
	vdc_open,	/* cb_open */
	vdc_close,	/* cb_close */
	vdc_strategy,	/* cb_strategy */
	vdc_print,	/* cb_print */
	vdc_dump,	/* cb_dump */
	vdc_read,	/* cb_read */
	vdc_write,	/* cb_write */
	vdc_ioctl,	/* cb_ioctl */
	nodev,		/* cb_devmap */
	nodev,		/* cb_mmap */
	nodev,		/* cb_segmap */
	nochpoll,	/* cb_chpoll */
	ddi_prop_op,	/* cb_prop_op */
	NULL,		/* cb_str */
	D_MP | D_64BIT,	/* cb_flag */
	CB_REV,		/* cb_rev */
	vdc_aread,	/* cb_aread */
	vdc_awrite	/* cb_awrite */
};

static struct dev_ops vdc_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	vdc_getinfo,	/* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	vdc_attach,	/* devo_attach */
	vdc_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	&vdc_cb_ops,	/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	nulldev		/* devo_power */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"virtual disk client",
	&vdc_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

/* -------------------------------------------------------------------------- */

/*
 * Device Driver housekeeping and setup
 */

int
_init(void)
{
	int	status;

	if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
		return (status);
	if ((status = mod_install(&modlinkage)) != 0)
		ddi_soft_state_fini(&vdc_state);
	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int	status;

	if ((status = mod_remove(&modlinkage)) != 0)
		return (status);
	ddi_soft_state_fini(&vdc_state);
	return (0);
}

static int
vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
	_NOTE(ARGUNUSED(dip))

	int	instance =
	    VDCUNIT((dev_t)arg);
	vdc_t	*vdc = NULL;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
			*resultp = NULL;
			return (DDI_FAILURE);
		}
		*resultp = vdc->dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);
	default:
		*resultp = NULL;
		return (DDI_FAILURE);
	}
}

static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	kt_did_t failfast_tid, ownership_tid;
	int	instance;
	int	rv;
	vdc_t	*vdc = NULL;

	switch (cmd) {
	case DDI_DETACH:
		/* the real work happens below */
		break;
	case DDI_SUSPEND:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	ASSERT(cmd == DDI_DETACH);
	instance = ddi_get_instance(dip);
	DMSGX(1, "[%d] Entered\n", instance);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * This function is called when vdc is detached or if it has failed to
	 * attach. In the latter case, the attach may have failed before the
	 * vdisk type was set, so we can't call vdc_is_opened(). However, as
	 * the attach has failed, we know that the vdisk is not opened and we
	 * can safely detach.
	 */
	if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
		DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
		return (DDI_FAILURE);
	}

	if (vdc->dkio_flush_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
		    instance, vdc->dkio_flush_pending);
		return (DDI_FAILURE);
	}

	if (vdc->validate_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding validate request\n",
		    instance, vdc->validate_pending);
		return (DDI_FAILURE);
	}

	DMSG(vdc, 0, "[%d] proceeding...\n", instance);

	/* If we took ownership, release ownership */
	mutex_enter(&vdc->ownership_lock);
	if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
	}
	mutex_exit(&vdc->ownership_lock);

	/* mark instance as detaching */
	vdc->lifecycle	= VDC_LC_DETACHING;

	/*
	 * try and disable callbacks to prevent another handshake
	 */
	rv = ldc_set_cb_mode(vdc->ldc_handle, LDC_CB_DISABLE);
	DMSG(vdc, 0, "callback disabled (rv=%d)\n", rv);

	if (vdc->initialized & VDC_THREAD) {
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET)) {
			vdc->read_state = VDC_READ_RESET;
			cv_signal(&vdc->read_cv);
		}

		mutex_exit(&vdc->read_lock);

		/* wake up any thread waiting for connection to come online */
		mutex_enter(&vdc->lock);
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			DMSG(vdc, 0,
			    "[%d] write reset - move to resetting state...\n",
			    instance);
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		}
		mutex_exit(&vdc->lock);

		/* now wait until state transitions to VDC_STATE_DETACH */
		thread_join(vdc->msg_proc_thr->t_did);
		ASSERT(vdc->state == VDC_STATE_DETACH);
		DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
		    vdc->instance);
	}

	mutex_enter(&vdc->lock);

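	/*
	 * Free resources in roughly the reverse order of their allocation;
	 * each step is guarded by the 'initialized' flags so that a
	 * partially completed attach can be unwound safely.
	 */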
	if (vdc->initialized & VDC_DRING)
		vdc_destroy_descriptor_ring(vdc);

	if (vdc->initialized & VDC_LDC)
		vdc_terminate_ldc(vdc);

	if (vdc->failfast_thread) {
		failfast_tid = vdc->failfast_thread->t_did;
		vdc->failfast_interval = 0;
		cv_signal(&vdc->failfast_cv);
	} else {
		failfast_tid = 0;
	}

	if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
		ownership_tid = vdc->ownership_thread->t_did;
		vdc->ownership = VDC_OWNERSHIP_NONE;
		cv_signal(&vdc->ownership_cv);
	} else {
		ownership_tid = 0;
	}

	mutex_exit(&vdc->lock);

	if (failfast_tid != 0)
		thread_join(failfast_tid);

	if (ownership_tid != 0)
		thread_join(ownership_tid);

	if (vdc->initialized & VDC_MINOR) {
		ddi_prop_remove_all(dip);
		ddi_remove_minor_node(dip, NULL);
	}

	if (vdc->io_stats) {
		kstat_delete(vdc->io_stats);
		vdc->io_stats = NULL;
	}

	if (vdc->err_stats) {
		kstat_delete(vdc->err_stats);
		vdc->err_stats = NULL;
	}

	if (vdc->initialized & VDC_LOCKS) {
		mutex_destroy(&vdc->lock);
		mutex_destroy(&vdc->read_lock);
		mutex_destroy(&vdc->ownership_lock);
		cv_destroy(&vdc->initwait_cv);
		cv_destroy(&vdc->dring_free_cv);
		cv_destroy(&vdc->membind_cv);
		cv_destroy(&vdc->sync_pending_cv);
		cv_destroy(&vdc->sync_blocked_cv);
		cv_destroy(&vdc->read_cv);
		cv_destroy(&vdc->running_cv);
		cv_destroy(&vdc->ownership_cv);
		cv_destroy(&vdc->failfast_cv);
		cv_destroy(&vdc->failfast_io_cv);
	}

	if (vdc->minfo)
		kmem_free(vdc->minfo, sizeof (struct dk_minfo));

	if (vdc->cinfo)
		kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));

	if (vdc->vtoc)
		kmem_free(vdc->vtoc, sizeof (struct vtoc));

	if (vdc->geom)
		kmem_free(vdc->geom, sizeof (struct dk_geom));

	if (vdc->devid) {
		ddi_devid_unregister(dip);
		ddi_devid_free(vdc->devid);
	}

	if (vdc->initialized & VDC_SOFT_STATE)
		ddi_soft_state_free(vdc_state, instance);

	DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);

	return (DDI_SUCCESS);
}


static int
vdc_do_attach(dev_info_t *dip)
{
	int		instance;
	vdc_t		*vdc = NULL;
	int		status;
	md_t		*mdp;
	mde_cookie_t	vd_node, vd_port;

	ASSERT(dip != NULL);

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
		    instance);
		return (DDI_FAILURE);
	}

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * We assign (rather than OR) the value to 'initialized' here to
	 * zero out the variable; we then set bits in it to indicate what
	 * has been done.
	 */
	vdc->initialized = VDC_SOFT_STATE;

	vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
	vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);

	vdc->dip	= dip;
	vdc->instance	= instance;
	vdc->vdisk_type	= VD_DISK_TYPE_UNK;
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->state	= VDC_STATE_INIT;
	vdc->lifecycle	= VDC_LC_ATTACHING;
	vdc->ldc_state	= 0;
	vdc->session_id = 0;
	vdc->block_size = DEV_BSIZE;
	vdc->max_xfer_sz = maxphys / DEV_BSIZE;

	/*
	 * We assume, for now, that the vDisk server will export 'read'
	 * operations to us at a minimum (this is needed because of checks
	 * in vdc for supported operations
	 * early in the handshake process).
	 * The vDisk server will return ENOTSUP if this is not the case.
	 * The value will be overwritten during the attribute exchange with
	 * the bitmask of operations exported by the server.
	 */
	vdc->operations = VD_OP_MASK_READ;

	vdc->vtoc	= NULL;
	vdc->geom	= NULL;
	vdc->cinfo	= NULL;
	vdc->minfo	= NULL;

	mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);

	vdc->threads_pending = 0;
	vdc->sync_op_pending = B_FALSE;
	vdc->sync_op_blocked = B_FALSE;
	cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);

	/* init blocking msg read functionality */
	mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
	vdc->read_state = VDC_READ_IDLE;

	vdc->initialized |= VDC_LOCKS;

	/* get device and port MD node for this disk instance */
	if (vdc_get_md_node(dip, &mdp, &vd_node, &vd_port) != 0) {
		cmn_err(CE_NOTE, "[%d] Could not get machine description node",
		    instance);
		return (DDI_FAILURE);
	}

	/* set the connection timeout */
	if (vd_port == NULL || (md_get_prop_val(mdp, vd_port,
	    VDC_MD_TIMEOUT, &vdc->ctimeout) != 0)) {
		vdc->ctimeout = 0;
	}

	/* initialise LDC channel which will be used to communicate with vds */
	status = vdc_do_ldc_init(vdc, mdp, vd_node);

	(void) md_fini_handle(mdp);

	if (status != 0) {
		cmn_err(CE_NOTE, "[%d] Couldn't initialize LDC", instance);
		goto return_status;
	}

	/* initialize the thread responsible for managing state with server */
	vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
	    vdc, 0, &p0, TS_RUN, minclsyspri);
	if (vdc->msg_proc_thr == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
		    instance);
		return (DDI_FAILURE);
	}

	vdc->initialized |= VDC_THREAD;

	/* Create the kstats for saving the I/O statistics used by iostat(1M) */
	vdc_create_io_kstats(vdc);
	vdc_create_err_kstats(vdc);

	atomic_inc_32(&vdc_instance_count);

	/*
	 * Check the disk label. This will send requests and do the handshake.
	 * We don't really care about the disk label now. What we really need
	 * is the handshake to be done so that we know the type of the disk
	 * (slice or full disk) and the appropriate device nodes can be
	 * created.
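	 *
	 * Note that the handshake happens as a side effect of the
	 * vdc_validate_geometry() call below.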
	 */
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP);
	vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
	vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	mutex_exit(&vdc->lock);

	/*
	 * Now that we have the device info we can create the
	 * device nodes and properties
	 */
	status = vdc_create_device_nodes(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device nodes",
		    instance);
		goto return_status;
	}
	status = vdc_create_device_nodes_props(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device nodes"
		    " properties (%d)", instance, status);
		goto return_status;
	}

	/*
	 * Setup devid
	 */
	if (vdc_setup_devid(vdc)) {
		DMSG(vdc, 0, "[%d] No device id available\n", instance);
	}

	/*
	 * Fill in the fields of the error statistics kstat that were not
	 * available when creating the kstat
	 */
	vdc_set_err_kstats(vdc);

	ddi_report_dev(dip);
	vdc->lifecycle	= VDC_LC_ONLINE;
	DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);

return_status:
	DMSG(vdc, 0, "[%d] Attach completed\n", instance);
	return (status);
}

static int
vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	status;

	switch (cmd) {
	case DDI_ATTACH:
		if ((status = vdc_do_attach(dip)) != 0)
			(void) vdc_detach(dip, DDI_DETACH);
		return (status);
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}

static int
vdc_do_ldc_init(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_node)
{
	int		status = 0;
	ldc_status_t	ldc_state;
	ldc_attr_t	ldc_attr;
	uint64_t	ldc_id = 0;

	ASSERT(vdc != NULL);

	vdc->initialized |= VDC_LDC;

	if ((status = vdc_get_ldc_id(mdp, vd_node, &ldc_id)) != 0) {
		DMSG(vdc, 0, "[%d] Failed to get LDC channel ID property",
		    vdc->instance);
		return (EIO);
	}

	DMSGX(0, "[%d] LDC id is 0x%lx\n", vdc->instance, ldc_id);

	vdc->ldc_id = ldc_id;

	ldc_attr.devclass = LDC_DEV_BLK;
	ldc_attr.instance = vdc->instance;
	ldc_attr.mode = LDC_MODE_UNRELIABLE;	/* unreliable transport */
	ldc_attr.mtu = VD_LDC_MTU;

	if ((vdc->initialized & VDC_LDC_INIT) == 0) {
		status = ldc_init(ldc_id, &ldc_attr, &vdc->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
			    vdc->instance, ldc_id, status);
			return (status);
		}
		vdc->initialized |= VDC_LDC_INIT;
	}
	status = ldc_status(vdc->ldc_handle, &ldc_state);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
		    vdc->instance, status);
		return (status);
	}
	vdc->ldc_state = ldc_state;

	if ((vdc->initialized & VDC_LDC_CB) == 0) {
		status = ldc_reg_callback(vdc->ldc_handle, vdc_handle_cb,
		    (caddr_t)vdc);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
			    vdc->instance, status);
			return (status);
		}
		vdc->initialized |= VDC_LDC_CB;
	}

	vdc->initialized |= VDC_LDC;

	/*
	 * At this stage we have initialised LDC; we will now try to open
	 * the connection.
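	 * (ldc_open() only opens the channel; it is brought up separately
	 * via vdc_do_ldc_up().)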
	 */
	if (vdc->ldc_state == LDC_INIT) {
		status = ldc_open(vdc->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
			    vdc->instance, vdc->ldc_id, status);
			return (status);
		}
		vdc->initialized |= VDC_LDC_OPEN;
	}

	return (status);
}

static int
vdc_start_ldc_connection(vdc_t *vdc)
{
	int	status = 0;

	ASSERT(vdc != NULL);

	ASSERT(MUTEX_HELD(&vdc->lock));

	status = vdc_do_ldc_up(vdc);

	DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);

	return (status);
}

static int
vdc_stop_ldc_connection(vdc_t *vdcp)
{
	int	status;

	DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
	    vdcp->state);

	status = ldc_down(vdcp->ldc_handle);
	DMSG(vdcp, 0, "ldc_down() = %d\n", status);

	vdcp->initialized &= ~VDC_HANDSHAKE;
	DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);

	return (status);
}

static void
vdc_create_io_kstats(vdc_t *vdc)
{
	if (vdc->io_stats != NULL) {
		DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance);
		return;
	}

	vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL,
	    "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (vdc->io_stats != NULL) {
		vdc->io_stats->ks_lock = &vdc->lock;
		kstat_install(vdc->io_stats);
	} else {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics"
		    " will not be gathered", vdc->instance);
	}
}

static void
vdc_create_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;
	char	kstatmodule_err[KSTAT_STRLEN];
	char	kstatname[KSTAT_STRLEN];
	int	ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t));
	int	instance = vdc->instance;

	if (vdc->err_stats != NULL) {
		DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance);
		return;
	}

	(void) snprintf(kstatmodule_err, sizeof (kstatmodule_err),
	    "%serr", VDC_DRIVER_NAME);
	(void) snprintf(kstatname, sizeof (kstatname),
	    "%s%d,err", VDC_DRIVER_NAME, instance);

	vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname,
	    "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

	if (vdc->err_stats == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics"
		    " will not be gathered", instance);
		return;
	}

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	kstat_named_init(&stp->vd_softerrs, "Soft Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_transerrs, "Transport Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_protoerrs, "Protocol Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_vid, "Vendor",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_pid, "Product",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_capacity, "Size",
	    KSTAT_DATA_ULONGLONG);

	vdc->err_stats->ks_update = nulldev;

	kstat_install(vdc->err_stats);
}

static void
vdc_set_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;

	if (vdc->err_stats == NULL)
		return;

	mutex_enter(&vdc->lock);

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	ASSERT(stp != NULL);

	stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->block_size;
	(void) strcpy(stp->vd_vid.value.c, "SUN");
	(void) strcpy(stp->vd_pid.value.c, "VDSK");

	mutex_exit(&vdc->lock);
}

static int
vdc_create_device_nodes_efi(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "h");
	ddi_remove_minor_node(vdc->dip, "h,raw");

	if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

static int
vdc_create_device_nodes_vtoc(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "wd");
	ddi_remove_minor_node(vdc->dip, "wd,raw");

	if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices along with the node properties. It is called as part of
 *	the attach(9E) of the instance during the handshake with vds after
 *	vds has sent the attributes to vdc.
 *
 *	If the device is of type VD_DISK_TYPE_SLICE then the minor node
 *	of 2 is used in keeping with the Solaris convention that slice 2
 *	refers to a whole disk. Slices start at 'a'.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	0		- Success
 *	EIO		- Failed to create node
 *	EINVAL		- Unknown type of disk exported
 */
static int
vdc_create_device_nodes(vdc_t *vdc)
{
	char		name[sizeof ("s,raw")];
	dev_info_t	*dip = NULL;
	int		instance, status;
	int		num_slices = 1;
	int		i;

	ASSERT(vdc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		return (EINVAL);
	}

	/*
	 * Minor nodes are different for EFI disks: EFI disks do not have
	 * a minor node 'h' for the minor number corresponding to slice
	 * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
	 * representing the whole disk.
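	 *
	 * For example, an EFI-labeled full disk ends up with minor nodes
	 * 'a' through 'g' plus 'wd' and 'wd,raw', while a VTOC-labeled
	 * disk gets 'a' through 'h' and their ',raw' counterparts.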
	 */
	for (i = 0; i < num_slices; i++) {

		if (i == VD_EFI_WD_SLICE) {
			if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
				status = vdc_create_device_nodes_efi(vdc);
			else
				status = vdc_create_device_nodes_vtoc(vdc);
			if (status != 0)
				return (status);
			continue;
		}

		(void) snprintf(name, sizeof (name), "%c", 'a' + i);
		if (ddi_create_minor_node(dip, name, S_IFBLK,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
			    instance, name);
			return (EIO);
		}

		/* if any device node is created we set this flag */
		vdc->initialized |= VDC_MINOR;

		(void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");

		if (ddi_create_minor_node(dip, name, S_IFCHR,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
			    instance, name);
			return (EIO);
		}
	}

	return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes_props
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices along with the node properties. It is called as part of
 *	the attach(9E) of the instance during the handshake with vds after
 *	vds has sent the attributes to vdc.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	0		- Success
 *	EIO		- Failed to create device node property
 *	EINVAL		- Unknown type of disk exported
 */
static int
vdc_create_device_nodes_props(vdc_t *vdc)
{
	dev_info_t	*dip = NULL;
	int		instance;
	int		num_slices = 1;
	int64_t		size = 0;
	dev_t		dev;
	int		rv;
	int		i;

	ASSERT(vdc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		return (EINVAL);
	}

	if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
		/* remove all properties */
		for (i = 0; i < num_slices; i++) {
			dev = makedevice(ddi_driver_major(dip),
			    VD_MAKE_DEV(instance, i));
			(void) ddi_prop_remove(dev, dip, VDC_SIZE_PROP_NAME);
			(void) ddi_prop_remove(dev, dip, VDC_NBLOCKS_PROP_NAME);
		}
		return (0);
	}

	for (i = 0; i < num_slices; i++) {
		dev = makedevice(ddi_driver_major(dip),
		    VD_MAKE_DEV(instance, i));

		size = vdc->slice[i].nblocks * vdc->block_size;
		DMSG(vdc, 0, "[%d] sz %ld (%ld Mb)  p_size %lx\n",
		    instance, size, size / (1024 * 1024),
		    vdc->slice[i].nblocks);

		rv = ddi_prop_update_int64(dev, dip, VDC_SIZE_PROP_NAME, size);
		if (rv != DDI_PROP_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop of [%ld]",
			    instance, VDC_SIZE_PROP_NAME, size);
			return (EIO);
		}

		rv = ddi_prop_update_int64(dev, dip, VDC_NBLOCKS_PROP_NAME,
		    lbtodb(size));
		if (rv != DDI_PROP_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop [%llu]",
			    instance, VDC_NBLOCKS_PROP_NAME, lbtodb(size));
			return (EIO);
		}
	}

	return (0);
}

/*
 * Function:
 *	vdc_is_opened
 *
 * Description:
 *	This function checks if any slice of a given virtual disk is
 *	currently opened.
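 *	Both layered opens (open_lyr) and regular opens of each type
 *	(open[otyp]) are taken into account.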
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	B_TRUE		- at least one slice is opened.
 *	B_FALSE		- no slice is opened.
 */
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
	int i, nslices;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		nslices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		nslices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		ASSERT(0);
	}

	/* check if there's any layered open */
	for (i = 0; i < nslices; i++) {
		if (vdc->open_lyr[i] > 0)
			return (B_TRUE);
	}

	/* check if there is any other kind of open */
	for (i = 0; i < OTYPCNT; i++) {
		if (vdc->open[i] != 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}

static int
vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;
	int i;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	/* check if slice is already exclusively opened */
	if (vdc->open_excl & slicemask)
		return (EBUSY);

	/* if open exclusive, check if slice is already opened */
	if (flag & FEXCL) {
		if (vdc->open_lyr[slice] > 0)
			return (EBUSY);
		for (i = 0; i < OTYPCNT; i++) {
			if (vdc->open[i] & slicemask)
				return (EBUSY);
		}
		vdc->open_excl |= slicemask;
	}

	/* mark slice as opened */
	if (otyp == OTYP_LYR) {
		vdc->open_lyr[slice]++;
	} else {
		vdc->open[otyp] |= slicemask;
	}

	return (0);
}

static void
vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	if (otyp == OTYP_LYR) {
		ASSERT(vdc->open_lyr[slice] > 0);
		vdc->open_lyr[slice]--;
	} else {
		vdc->open[otyp] &= ~slicemask;
	}

	if (flag & FEXCL)
		vdc->open_excl &= ~slicemask;
}

static int
vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance;
	int	slice, status = 0;
	vdc_t	*vdc;

	ASSERT(dev != NULL);
	instance = VDCUNIT(*dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
	    getminor(*dev), flag, otyp);

	slice = VDCPART(*dev);

	mutex_enter(&vdc->lock);

	status = vdc_mark_opened(vdc, slice, flag, otyp);

	if (status != 0) {
		mutex_exit(&vdc->lock);
		return (status);
	}

	if (flag & (FNDELAY | FNONBLOCK)) {

		/* don't resubmit a validate request if there's already one */
		if (vdc->validate_pending > 0) {
			mutex_exit(&vdc->lock);
			return (0);
		}

		/* call vdc_validate() asynchronously to avoid blocking */
		if (taskq_dispatch(system_taskq, vdc_validate_task,
		    (void *)vdc, TQ_NOSLEEP) == NULL) {
			vdc_mark_closed(vdc, slice, flag, otyp);
			mutex_exit(&vdc->lock);
			return (ENXIO);
		}

		vdc->validate_pending++;
		mutex_exit(&vdc->lock);
		return (0);
	}

	mutex_exit(&vdc->lock);

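	/*
	 * This is a blocking open: validate the disk label and geometry
	 * synchronously before deciding whether the open succeeds.
	 */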
	vdc_validate(vdc);

	mutex_enter(&vdc->lock);

	if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
	    vdc->slice[slice].nblocks == 0) {
		vdc_mark_closed(vdc, slice, flag, otyp);
		status = EIO;
	}

	mutex_exit(&vdc->lock);

	return (status);
}

static int
vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance;
	int	slice;
	int	rv, rval;
	vdc_t	*vdc;

	instance = VDCUNIT(dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);

	slice = VDCPART(dev);

	/*
	 * Attempt to flush the W$ on a close operation. If this is
	 * not a supported IOCTL command or the backing device is read-only,
	 * do not fail the close operation.
	 */
	rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);

	if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
		DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
		    instance, rv);
		return (EIO);
	}

	mutex_enter(&vdc->lock);
	vdc_mark_closed(vdc, slice, flag, otyp);
	mutex_exit(&vdc->lock);

	return (0);
}

static int
vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	_NOTE(ARGUNUSED(credp))

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
}

static int
vdc_print(dev_t dev, char *str)
{
	cmn_err(CE_NOTE, "vdc%d: %s", VDCUNIT(dev), str);
	return (0);
}

static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	int	rv;
	size_t	nbytes = nblk * DEV_BSIZE;
	int	instance = VDCUNIT(dev);
	vdc_t	*vdc = NULL;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
	    instance, nbytes, blkno, (void *)addr);
	rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
	    VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir);
	if (rv) {
		DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
		return (rv);
	}

	if (ddi_in_panic())
		(void) vdc_drain_response(vdc);

	DMSG(vdc, 0, "[%d] End\n", instance);

	return (0);
}

/* -------------------------------------------------------------------------- */

/*
 * Disk access routines
 *
 */

/*
 * vdc_strategy()
 *
 * Return Value:
 *	0:	As per strategy(9E), the strategy() function must return 0
 *		[ bioerror(9f) sets b_flags to the proper error code ]
 */
static int
vdc_strategy(struct buf *buf)
{
	int	rv = -1;
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(buf->b_edev);
	int	op = (buf->b_flags & B_READ) ?
	    VD_OP_BREAD : VD_OP_BWRITE;
	int	slice;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		bioerror(buf, ENXIO);
		biodone(buf);
		return (0);
	}

	DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
	    instance, (buf->b_flags & B_READ) ? "Read" : "Write",
	    buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr);

	bp_mapin(buf);

	if ((long)buf->b_private == VD_SLICE_NONE) {
		/* I/O using an absolute disk offset */
		slice = VD_SLICE_NONE;
	} else {
		slice = VDCPART(buf->b_edev);
	}

	rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr,
	    buf->b_bcount, slice, buf->b_lblkno,
	    CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir :
	    VIO_write_dir);

	/*
	 * If the request was successfully sent, the strategy call returns and
	 * the ACK handler calls the bioxxx functions when the vDisk server is
	 * done; otherwise we handle the error here.
	 */
	if (rv) {
		DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv);
		bioerror(buf, rv);
		biodone(buf);
	}

	return (0);
}

/*
 * Function:
 *	vdc_min
 *
 * Description:
 *	Routine to limit the size of a data transfer. Used in
 *	conjunction with physio(9F).
 *
 * Arguments:
 *	bp - pointer to the indicated buf(9S) struct.
 *
 */
static void
vdc_min(struct buf *bufp)
{
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(bufp->b_edev);

	vdc = ddi_get_soft_state(vdc_state, instance);
	VERIFY(vdc != NULL);

	if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) {
		bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size;
	}
}

static int
vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio));
}

static int
vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio));
}

static int
vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio));
}

static int
vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio));
}


/* -------------------------------------------------------------------------- */

/*
 * Handshake support
 */


/*
 * Function:
 *	vdc_init_ver_negotiation()
 *
 * Description:
 *	Set a new session ID and send a version negotiation (VER_INFO)
 *	message to the vDisk server proposing the given protocol version.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
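 *	ver	- protocol version pair to propose to the server.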
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
{
	vio_ver_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status = -1;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);

	/*
	 * set the Session ID to a unique value
	 * (the lower 32 bits of the clock tick)
	 */
	vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
	DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id);

	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_VER_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	pkt.dev_class = VDEV_DISK;
	pkt.ver_major = ver.major;
	pkt.ver_minor = ver.minor;

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
	    vdc->instance, status);
	if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance, vdc->ldc_handle,
		    status, msglen);
		if (msglen != sizeof (vio_ver_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_ver_negotiation()
 *
 * Description:
 *	Initiate the version negotiation with the vDisk server, then wait
 *	for and process its response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_ver_negotiation(vdc_t *vdcp)
{
	vio_msg_t vio_msg;
	int status;

	if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0]))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Ver negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
}

/*
 * Function:
 *	vdc_init_attr_negotiation()
 *
 * Description:
 *	Send an attribute exchange (ATTR_INFO) message to the vDisk server
 *	advertising this client's transfer parameters.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_attr_negotiation(vdc_t *vdc)
{
	vd_attr_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] entered\n", vdc->instance);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.max_xfer_sz = vdc->max_xfer_sz;
	pkt.vdisk_block_size = vdc->block_size;
	pkt.xfer_mode = VIO_DRING_MODE_V1_0;
	pkt.operations = 0;	/* server will set bits of valid operations */
	pkt.vdisk_type = 0;	/* server will set to valid device type */
	pkt.vdisk_media = 0;	/* server will set to valid media type */
	pkt.vdisk_size = 0;	/* server will set to valid size */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);

	if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance, vdc->ldc_handle,
		    status, msglen);
		if (msglen != sizeof (vd_attr_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_attr_negotiation()
 *
 * Description:
 *	Initiate the attribute exchange with the vDisk server, then wait
 *	for and process its response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_attr_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_attr_negotiation(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Attr negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_init_dring_negotiate()
 *
 * Description:
 *	Set up the local descriptor ring and send a ring registration
 *	(DRING_REG) message for it to the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_dring_negotiate(vdc_t *vdc)
{
	vio_dring_reg_msg_t	pkt;
	size_t			msglen = sizeof (pkt);
	int			status = -1;
	int			retry;
	int			nretries = 10;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	for (retry = 0; retry < nretries; retry++) {
		status = vdc_init_descriptor_ring(vdc);
		if (status != EAGAIN)
			break;
		drv_usecwait(vdc_min_timeout_ldc);
	}

	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
		    vdc->instance, status);
		return (status);
	}

	DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
	    vdc->instance, status);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_DRING_REG;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.dring_ident = 0;
	pkt.num_descriptors = vdc->dring_len;
	pkt.descriptor_size = vdc->dring_entry_size;
	pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
	pkt.ncookies = vdc->dring_cookie_count;
	pkt.cookie[0] = vdc->dring_cookie[0];	/* for now just one cookie */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
		    vdc->instance, status);
	}

	return (status);
}


/*
 * Function:
 *	vdc_dring_negotiation()
 *
 * Description:
 *	Initiate the descriptor ring registration with the vDisk server,
 *	then wait for and process its response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_dring_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_dring_negotiate(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Dring negotiation response,"
		    " rv(%d)", vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_dring_reg_msg(vdcp,
	    (vio_dring_reg_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_send_rdx()
 *
 * Description:
 *	Send an RDX message to the vDisk server to indicate that the
 *	handshake is complete and we are ready to exchange data.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_send_rdx(vdc_t *vdcp)
{
	vio_msg_t	msg;
	size_t		msglen = sizeof (vio_msg_t);
	int		status;

	/*
	 * Send an RDX message to vds to indicate we are ready
	 * to send data
	 */
	msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	msg.tag.vio_subtype_env = VIO_RDX;
	msg.tag.vio_sid = vdcp->session_id;
	status = vdc_send(vdcp, (caddr_t)&msg, &msglen);
	if (status != 0) {
		DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)",
		    vdcp->instance, status);
	}

	return (status);
}

/*
 * Function:
 *	vdc_handle_rdx()
 *
 * Description:
 *	Process an RDX acknowledgement received from the vDisk server.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *	msgp	- received msg
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp)
{
	_NOTE(ARGUNUSED(vdcp))
	_NOTE(ARGUNUSED(msgp))

	ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL);
	ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK);
	ASSERT(msgp->tag.vio_subtype_env == VIO_RDX);

	DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance);

	return (0);
}

/*
 * Function:
 *	vdc_rdx_exchange()
 *
 * Description:
 *	Send an RDX message to the vDisk server, then wait for and
 *	process its acknowledgement.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_rdx_exchange(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_send_rdx(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) {
		DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg));
}


/* -------------------------------------------------------------------------- */

/*
 * LDC helper routines
 */

static int
vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp)
{
	int		status;
	boolean_t	q_has_pkts = B_FALSE;
	uint64_t	delay_time;
	size_t		len;

	mutex_enter(&vdc->read_lock);

	if (vdc->read_state == VDC_READ_IDLE)
		vdc->read_state = VDC_READ_WAITING;

	while (vdc->read_state != VDC_READ_PENDING) {

		/* detect if the connection has been reset */
		if (vdc->read_state == VDC_READ_RESET) {
			status = ECONNRESET;
			goto done;
		}

		cv_wait(&vdc->read_cv, &vdc->read_lock);
	}

	/*
	 * Until we get a blocking ldc read we have to retry
	 * until the entire LDC message has arrived before
	 * ldc_read() will succeed. Note we also bail out if
	 * the channel is reset or goes away.
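	 * The wait between retries backs off geometrically, starting at
	 * vdc_ldc_read_init_delay and capped at vdc_ldc_read_max_delay.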
	 */
	delay_time = vdc_ldc_read_init_delay;
loop:
	len = *nbytesp;
	status = ldc_read(vdc->ldc_handle, (caddr_t)msgp, &len);
	switch (status) {
	case EAGAIN:
		delay_time *= 2;
		if (delay_time >= vdc_ldc_read_max_delay)
			delay_time = vdc_ldc_read_max_delay;
		delay(delay_time);
		goto loop;

	case 0:
		if (len == 0) {
			DMSG(vdc, 1, "[%d] ldc_read returned 0 bytes with "
			    "no error!\n", vdc->instance);
			goto loop;
		}

		*nbytesp = len;

		/*
		 * If there are pending messages, leave the
		 * read state as pending. Otherwise, set the state
		 * back to idle.
		 */
		status = ldc_chkq(vdc->ldc_handle, &q_has_pkts);
		if (status == 0 && !q_has_pkts)
			vdc->read_state = VDC_READ_IDLE;

		break;
	default:
		DMSG(vdc, 0, "ldc_read returned %d\n", status);
		break;
	}

done:
	mutex_exit(&vdc->read_lock);

	return (status);
}



#ifdef DEBUG
void
vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg)
{
	char *ms, *ss, *ses;
	switch (msg->tag.vio_msgtype) {
#define	Q(_s)	case _s : ms = #_s; break;
	Q(VIO_TYPE_CTRL)
	Q(VIO_TYPE_DATA)
	Q(VIO_TYPE_ERR)
#undef Q
	default: ms = "unknown"; break;
	}

	switch (msg->tag.vio_subtype) {
#define	Q(_s)	case _s : ss = #_s; break;
	Q(VIO_SUBTYPE_INFO)
	Q(VIO_SUBTYPE_ACK)
	Q(VIO_SUBTYPE_NACK)
#undef Q
	default: ss = "unknown"; break;
	}

	switch (msg->tag.vio_subtype_env) {
#define	Q(_s)	case _s : ses = #_s; break;
	Q(VIO_VER_INFO)
	Q(VIO_ATTR_INFO)
	Q(VIO_DRING_REG)
	Q(VIO_DRING_UNREG)
	Q(VIO_RDX)
	Q(VIO_PKT_DATA)
	Q(VIO_DESC_DATA)
	Q(VIO_DRING_DATA)
#undef Q
	default: ses = "unknown"; break;
	}

	DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, ms, ss, ses);
}
#endif

/*
 * Function:
 *	vdc_send()
 *
 * Description:
 *	The function encapsulates the call to write a message using LDC.
 *	If LDC indicates that the call failed due to the queue being full,
 *	we retry the ldc_write(), otherwise we return the error returned
 *	by LDC.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	pkt	- address of LDC message to be sent
 *	msglen	- the size of the message being sent. When the function
 *		  returns, this contains the number of bytes written.
 *
 * Return Code:
 *	0		- Success.
 *	EINVAL		- pkt or msglen were NULL
 *	ECONNRESET	- The connection was not up.
 *	EWOULDBLOCK	- LDC queue is full
 *	xxx		- other error codes returned by ldc_write
 */
static int
vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen)
{
	size_t	size = 0;
	int	status = 0;
	clock_t delay_ticks;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));
	ASSERT(msglen != NULL);
	ASSERT(*msglen != 0);

#ifdef DEBUG
	vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt);
#endif
	/*
	 * Wait indefinitely to send if channel
	 * is busy, but bail out if we succeed or
	 * if the channel closes or is reset.
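	 *
	 * As on the read side (vdc_recv()), the wait uses geometric
	 * backoff: delay_ticks doubles after each EWOULDBLOCK, starting
	 * at vdc_hz_min_ldc_delay and capped at vdc_hz_max_ldc_delay.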
	 */
	delay_ticks = vdc_hz_min_ldc_delay;
	do {
		size = *msglen;
		status = ldc_write(vdc->ldc_handle, pkt, &size);
		if (status == EWOULDBLOCK) {
			delay(delay_ticks);
			/* geometric backoff */
			delay_ticks *= 2;
			if (delay_ticks > vdc_hz_max_ldc_delay)
				delay_ticks = vdc_hz_max_ldc_delay;
		}
	} while (status == EWOULDBLOCK);

	/* if LDC had serious issues --- reset vdc state */
	if (status == EIO || status == ECONNRESET) {
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET))
			cv_signal(&vdc->read_cv);
		vdc->read_state = VDC_READ_RESET;
		mutex_exit(&vdc->read_lock);

		/* wake up any waiters in the reset thread */
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			DMSG(vdc, 0, "[%d] write reset - "
			    "vdc is resetting ..\n", vdc->instance);
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		}

		return (ECONNRESET);
	}

	/* return the last size written */
	*msglen = size;

	return (status);
}

/*
 * Function:
 *	vdc_get_md_node
 *
 * Description:
 *	Get the MD, the device node and the port node for the given
 *	disk instance. The caller is responsible for cleaning up the
 *	reference to the returned MD (mdpp) by calling md_fini_handle().
 *
 * Arguments:
 *	dip	 - dev info pointer for this instance of the device driver.
 *	mdpp	 - the returned MD.
 *	vd_nodep - the returned device node.
 *	vd_portp - the returned port node. The returned port node is NULL
 *		   if no port node is found.
 *
 * Return Code:
 *	0	- Success.
 *	ENOENT	- Expected node or property did not exist.
 *	ENXIO	- Unexpected error communicating with MD framework
 */
static int
vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep,
    mde_cookie_t *vd_portp)
{
	int		status = ENOENT;
	char		*node_name = NULL;
	md_t		*mdp = NULL;
	int		num_nodes;
	int		num_vdevs;
	int		num_vports;
	mde_cookie_t	rootnode;
	mde_cookie_t	*listp = NULL;
	boolean_t	found_inst = B_FALSE;
	int		listsz;
	int		idx;
	uint64_t	md_inst;
	int		obp_inst;
	int		instance = ddi_get_instance(dip);

	/*
	 * Get the OBP instance number for comparison with the MD instance
	 *
	 * The "cfg-handle" property of a vdc node in an MD contains the MD's
	 * notion of "instance", or unique identifier, for that node; OBP
	 * stores the value of the "cfg-handle" MD property as the value of
	 * the "reg" property on the node in the device tree it builds from
	 * the MD and passes to Solaris. Thus, we look up the devinfo node's
	 * "reg" property value to uniquely identify this device instance.
	 * If the "reg" property cannot be found, the device tree state is
	 * presumably so broken that there is no point in continuing.
	 */
	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) {
		cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG);
		return (ENOENT);
	}
	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    OBP_REG, -1);
	DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst);

	/*
	 * We now walk the MD nodes to find the node for this vdisk.
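	 *
	 * The walk scans the MD for VDC_MD_VDEV_NAME nodes whose "name"
	 * property is VDC_MD_DISK_NAME and compares each candidate's
	 * VDC_MD_CFG_HDL property against the OBP "reg" value obtained
	 * above.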
	 */
	if ((mdp = md_get_handle()) == NULL) {
		cmn_err(CE_WARN, "unable to init machine description");
		return (ENXIO);
	}

	num_nodes = md_node_count(mdp);
	ASSERT(num_nodes > 0);

	listsz = num_nodes * sizeof (mde_cookie_t);

	/* allocate memory for nodes */
	listp = kmem_zalloc(listsz, KM_SLEEP);

	rootnode = md_root_node(mdp);
	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);

	/*
	 * Search for all the virtual devices, we will then check to see which
	 * ones are disk nodes.
	 */
	num_vdevs = md_scan_dag(mdp, rootnode,
	    md_find_name(mdp, VDC_MD_VDEV_NAME),
	    md_find_name(mdp, "fwd"), listp);

	if (num_vdevs <= 0) {
		cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME);
		status = ENOENT;
		goto done;
	}

	DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs);
	for (idx = 0; idx < num_vdevs; idx++) {
		status = md_get_prop_str(mdp, listp[idx], "name", &node_name);
		if ((status != 0) || (node_name == NULL)) {
			cmn_err(CE_NOTE, "Unable to get name of node type '%s'"
			    ": err %d", VDC_MD_VDEV_NAME, status);
			continue;
		}

		DMSGX(1, "[%d] Found node '%s'\n", instance, node_name);
		if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) {
			status = md_get_prop_val(mdp, listp[idx],
			    VDC_MD_CFG_HDL, &md_inst);
			DMSGX(1, "[%d] vdc inst in MD=%lx\n",
			    instance, md_inst);
			if ((status == 0) && (md_inst == obp_inst)) {
				found_inst = B_TRUE;
				break;
			}
		}
	}

	if (!found_inst) {
		DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME);
		status = ENOENT;
		goto done;
	}
	DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst);

	*vd_nodep = listp[idx];
	*mdpp = mdp;

	num_vports = md_scan_dag(mdp, *vd_nodep,
	    md_find_name(mdp, VDC_MD_PORT_NAME),
	    md_find_name(mdp, "fwd"), listp);

	if (num_vports != 1) {
		DMSGX(0, "Expected 1 '%s' node for '%s' port, found %d\n",
		    VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME, num_vports);
	}

	*vd_portp = (num_vports == 0) ? NULL : listp[0];

done:
	kmem_free(listp, listsz);
	return (status);
}

/*
 * Function:
 *	vdc_get_ldc_id()
 *
 * Description:
 *	This function gets the 'ldc-id' for this particular instance of vdc.
 *	The id returned is the guest domain channel endpoint LDC uses for
 *	communication with vds.
 *
 * Arguments:
 *	mdp	- pointer to the machine description.
 *	vd_node	- the vdisk element from the MD.
 *	ldc_id	- pointer to variable used to return the 'ldc-id' found.
 *
 * Return Code:
 *	0	- Success.
 *	ENOENT	- Expected node or property did not exist.
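 *
 * Note: if the MD unexpectedly lists more than one channel-endpoint
 * node for this vdisk, only the first one found is used (see the
 * num_chans check below).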
 */
static int
vdc_get_ldc_id(md_t *mdp, mde_cookie_t vd_node, uint64_t *ldc_id)
{
	mde_cookie_t	*chanp = NULL;
	int		listsz;
	int		num_chans;
	int		num_nodes;
	int		status = 0;

	num_nodes = md_node_count(mdp);
	ASSERT(num_nodes > 0);

	listsz = num_nodes * sizeof (mde_cookie_t);

	/* allocate memory for nodes */
	chanp = kmem_zalloc(listsz, KM_SLEEP);

	/* get the channels for this node */
	num_chans = md_scan_dag(mdp, vd_node,
	    md_find_name(mdp, VDC_MD_CHAN_NAME),
	    md_find_name(mdp, "fwd"), chanp);

	/* expecting at least one channel */
	if (num_chans <= 0) {
		cmn_err(CE_NOTE, "No '%s' node for '%s' port",
		    VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME);
		status = ENOENT;
		goto done;

	} else if (num_chans != 1) {
		DMSGX(0, "Expected 1 '%s' node for '%s' port, found %d\n",
		    VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, num_chans);
	}

	/*
	 * We use the first channel found (index 0), irrespective of how
	 * many are there in total.
	 */
	if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, ldc_id) != 0) {
		cmn_err(CE_NOTE, "Channel '%s' property not found", VDC_MD_ID);
		status = ENOENT;
	}

done:
	kmem_free(chanp, listsz);
	return (status);
}

static int
vdc_do_ldc_up(vdc_t *vdc)
{
	int		status;
	ldc_status_t	ldc_state;

	DMSG(vdc, 0, "[%d] Bringing up channel %lx\n",
	    vdc->instance, vdc->ldc_id);

	if (vdc->lifecycle == VDC_LC_DETACHING)
		return (EINVAL);

	if ((status = ldc_up(vdc->ldc_handle)) != 0) {
		switch (status) {
		case ECONNREFUSED:	/* listener not ready at other end */
			DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n",
			    vdc->instance, vdc->ldc_id, status);
			status = 0;
			break;
		default:
			DMSG(vdc, 0, "[%d] Failed to bring up LDC: "
			    "channel=%ld, err=%d", vdc->instance, vdc->ldc_id,
			    status);
			break;
		}
	}

	if (ldc_status(vdc->ldc_handle, &ldc_state) == 0) {
		vdc->ldc_state = ldc_state;
		if (ldc_state == LDC_UP) {
			DMSG(vdc, 0, "[%d] LDC channel already up\n",
			    vdc->instance);
			vdc->seq_num = 1;
			vdc->seq_num_reply = 0;
		}
	}

	return (status);
}

/*
 * Function:
 *	vdc_terminate_ldc()
 *
 * Description:
 *	Tear down the LDC connection to the vDisk server: close the channel,
 *	unregister the callback and release the channel resources, depending
 *	on how far the initialization got (as recorded in vdc->initialized).
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	None
 */
static void
vdc_terminate_ldc(vdc_t *vdc)
{
	int	instance = ddi_get_instance(vdc->dip);

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized);

	if (vdc->initialized & VDC_LDC_OPEN) {
		DMSG(vdc, 0, "[%d] ldc_close()\n", instance);
		(void) ldc_close(vdc->ldc_handle);
	}
	if (vdc->initialized & VDC_LDC_CB) {
		DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance);
		(void) ldc_unreg_callback(vdc->ldc_handle);
	}
	if (vdc->initialized & VDC_LDC) {
		DMSG(vdc, 0, "[%d] ldc_fini()\n", instance);
		(void) ldc_fini(vdc->ldc_handle);
		vdc->ldc_handle = NULL;
	}

	vdc->initialized &= ~(VDC_LDC | VDC_LDC_CB | VDC_LDC_OPEN);
}

/* -------------------------------------------------------------------------- */

/*
 * Descriptor Ring helper routines
 */

/*
 * Function:
 *	vdc_init_descriptor_ring()
 *
 * Description:
 *	Create the descriptor ring, bind it to the LDC channel and allocate
 *	the local copy of the ring. Each step is recorded in
 *	vdc->initialized so that a partially set up ring can later be torn
 *	down by vdc_destroy_descriptor_ring().
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_descriptor_ring(vdc_t *vdc)
{
	vd_dring_entry_t	*dep = NULL;	/* DRing Entry pointer */
	int	status = 0;
	int	i;

	DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized);

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));
	ASSERT(vdc->ldc_handle != NULL);

	/* ensure we have enough room to store max sized block */
	ASSERT(maxphys <= VD_MAX_BLOCK_SIZE);

	if ((vdc->initialized & VDC_DRING_INIT) == 0) {
		DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance);
		/*
		 * Calculate the maximum block size we can transmit using one
		 * Descriptor Ring entry from the attributes returned by the
		 * vDisk server. This is subject to a minimum of 'maxphys'
		 * as we do not have the capability to split requests over
		 * multiple DRing entries.
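		 *
		 * Worked example (illustrative numbers only): with a
		 * max_xfer_sz of 256 blocks, a block_size of 512 bytes
		 * and an 8K PAGESIZE, max_xfer_sz * block_size is 128K,
		 * so dring_max_cookies is 128K / 8K = 16 and each ring
		 * entry gets 15 extra ldc_mem_cookie_t slots on top of
		 * the one embedded in vd_dring_entry_t.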
		 */
		if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) {
			DMSG(vdc, 0, "[%d] using minimum DRing size\n",
			    vdc->instance);
			vdc->dring_max_cookies = maxphys / PAGESIZE;
		} else {
			vdc->dring_max_cookies =
			    (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE;
		}
		vdc->dring_entry_size = (sizeof (vd_dring_entry_t) +
		    (sizeof (ldc_mem_cookie_t) *
		    (vdc->dring_max_cookies - 1)));
		vdc->dring_len = VD_DRING_LEN;

		status = ldc_mem_dring_create(vdc->dring_len,
		    vdc->dring_entry_size, &vdc->ldc_dring_hdl);
		if ((vdc->ldc_dring_hdl == NULL) || (status != 0)) {
			DMSG(vdc, 0, "[%d] Descriptor ring creation failed",
			    vdc->instance);
			return (status);
		}
		vdc->initialized |= VDC_DRING_INIT;
	}

	if ((vdc->initialized & VDC_DRING_BOUND) == 0) {
		DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance);
		vdc->dring_cookie =
		    kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP);

		status = ldc_mem_dring_bind(vdc->ldc_handle,
		    vdc->ldc_dring_hdl,
		    LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW,
		    &vdc->dring_cookie[0],
		    &vdc->dring_cookie_count);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] Failed to bind descriptor ring "
			    "(%lx) to channel (%lx) status=%d\n",
			    vdc->instance, vdc->ldc_dring_hdl,
			    vdc->ldc_handle, status);
			return (status);
		}
		ASSERT(vdc->dring_cookie_count == 1);
		vdc->initialized |= VDC_DRING_BOUND;
	}

	status = ldc_mem_dring_info(vdc->ldc_dring_hdl, &vdc->dring_mem_info);
	if (status != 0) {
		DMSG(vdc, 0,
		    "[%d] Failed to get info for descriptor ring (%lx)\n",
		    vdc->instance, vdc->ldc_dring_hdl);
		return (status);
	}

	if ((vdc->initialized & VDC_DRING_LOCAL) == 0) {
		DMSG(vdc, 0, "[%d] local dring\n", vdc->instance);

		/* Allocate the local copy of this dring */
		vdc->local_dring =
		    kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t),
		    KM_SLEEP);
		vdc->initialized |= VDC_DRING_LOCAL;
	}

	/*
	 * Mark all DRing entries as free and initialize the private
	 * descriptor's memory handles. If any entry is initialized,
	 * we need to free it later so we set the bit in 'initialized'
	 * at the start.
	 */
	vdc->initialized |= VDC_DRING_ENTRY;
	for (i = 0; i < vdc->dring_len; i++) {
		dep = VDC_GET_DRING_ENTRY_PTR(vdc, i);
		dep->hdr.dstate = VIO_DESC_FREE;

		status = ldc_mem_alloc_handle(vdc->ldc_handle,
		    &vdc->local_dring[i].desc_mhdl);
		if (status != 0) {
			DMSG(vdc, 0, "![%d] Failed to alloc mem handle for"
			    " descriptor %d", vdc->instance, i);
			return (status);
		}
		vdc->local_dring[i].is_free = B_TRUE;
		vdc->local_dring[i].dep = dep;
	}

	/* Initialize the starting index */
	vdc->dring_curr_idx = 0;

	return (status);
}

/*
 * Function:
 *	vdc_destroy_descriptor_ring()
 *
 * Description:
 *	Undo the work done by vdc_init_descriptor_ring(): free the memory
 *	handles of the ring entries, free the local copy of the ring, and
 *	unbind and destroy the shared ring. The vdc->initialized flags
 *	record which of these steps are actually needed.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	None
 */
static void
vdc_destroy_descriptor_ring(vdc_t *vdc)
{
	vdc_local_desc_t	*ldep = NULL;	/* Local Dring Entry Pointer */
	ldc_mem_handle_t	mhdl = NULL;
	ldc_mem_info_t		minfo;
	int			status = -1;
	int			i;	/* loop */

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] Entered\n", vdc->instance);

	if (vdc->initialized & VDC_DRING_ENTRY) {
		DMSG(vdc, 0,
		    "[%d] Removing Local DRing entries\n", vdc->instance);
		for (i = 0; i < vdc->dring_len; i++) {
			ldep = &vdc->local_dring[i];
			mhdl = ldep->desc_mhdl;

			if (mhdl == NULL)
				continue;

			if ((status = ldc_mem_info(mhdl, &minfo)) != 0) {
				DMSG(vdc, 0,
				    "ldc_mem_info returned an error: %d\n",
				    status);

				/*
				 * This must mean that the mem handle
				 * is not valid. Clear it out so that
				 * no one tries to use it.
				 */
				ldep->desc_mhdl = NULL;
				continue;
			}

			if (minfo.status == LDC_BOUND) {
				(void) ldc_mem_unbind_handle(mhdl);
			}

			(void) ldc_mem_free_handle(mhdl);

			ldep->desc_mhdl = NULL;
		}
		vdc->initialized &= ~VDC_DRING_ENTRY;
	}

	if (vdc->initialized & VDC_DRING_LOCAL) {
		DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance);
		kmem_free(vdc->local_dring,
		    vdc->dring_len * sizeof (vdc_local_desc_t));
		vdc->initialized &= ~VDC_DRING_LOCAL;
	}

	if (vdc->initialized & VDC_DRING_BOUND) {
		DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance);
		status = ldc_mem_dring_unbind(vdc->ldc_dring_hdl);
		if (status == 0) {
			vdc->initialized &= ~VDC_DRING_BOUND;
		} else {
			DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx",
			    vdc->instance, status, vdc->ldc_dring_hdl);
		}
		kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t));
	}

	if (vdc->initialized & VDC_DRING_INIT) {
		DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance);
		status = ldc_mem_dring_destroy(vdc->ldc_dring_hdl);
		if (status == 0) {
			vdc->ldc_dring_hdl = NULL;
			bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t));
			vdc->initialized &= ~VDC_DRING_INIT;
		} else {
			DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)",
			    vdc->instance, status, vdc->ldc_dring_hdl);
		}
	}
}

/*
 * Function:
 *	vdc_map_to_shared_dring()
 *
 * Description:
 *	Copy contents of the local descriptor to the shared
 *	memory descriptor.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *	idx	- descriptor ring index
 *
 * Return Code:
 *	0	- Success
 *	EAGAIN	- Failed to bind the memory handle for the request buffer
 */
static int
vdc_map_to_shared_dring(vdc_t *vdcp, int idx)
{
	vdc_local_desc_t	*ldep;
	vd_dring_entry_t	*dep;
	int			rv;

	ldep = &(vdcp->local_dring[idx]);

	/* for now leave in the old pop_mem_hdl stuff */
	if (ldep->nbytes > 0) {
		rv = vdc_populate_mem_hdl(vdcp, ldep);
		if (rv) {
			DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n",
			    vdcp->instance);
			return (rv);
		}
	}

	/*
	 * fill in the data details into the DRing
	 */
	dep = ldep->dep;
	ASSERT(dep != NULL);

	dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp);
	dep->payload.operation = ldep->operation;
	dep->payload.addr = ldep->offset;
	dep->payload.nbytes = ldep->nbytes;
	dep->payload.status = (uint32_t)-1;	/* vds will set valid value */
	dep->payload.slice = ldep->slice;
	dep->hdr.dstate = VIO_DESC_READY;
	dep->hdr.ack = 1;	/* request an ACK for every message */

	return (0);
}

/*
 * Function:
 *	vdc_send_request
 *
 * Description:
 *	This routine waits for the driver to reach the RUNNING state and
 *	then hands the request off to vdc_populate_descriptor(), which
 *	writes it into the descriptor ring and notifies vds.
 *
 * Arguments:
 *	vdcp	  - the soft state pointer
 *	operation - operation we want vds to perform (VD_OP_XXX)
 *	addr	  - address of data buf to be read/written.
 *	nbytes	  - number of bytes to read/write
 *	slice	  - the disk slice this request is for
 *	offset	  - relative disk offset
 *	cb_type   - type of call - STRATEGY or SYNC
 *	cb_arg	  - parameter to be sent to server (depends on VD_OP_XXX type)
 *			. mode for ioctl(9e)
 *			. LP64 diskaddr_t (block I/O)
 *	dir	  - direction of operation (READ/WRITE/BOTH)
 *
 * Return Codes:
 *	0
 *	EIO
 *	ENXIO
 */
static int
vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
    void *cb_arg, vio_desc_direction_t dir)
{
	int	rv = 0;

	ASSERT(vdcp != NULL);
	ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR);

	mutex_enter(&vdcp->lock);

	/*
	 * If this is a block read/write operation we update the I/O statistics
	 * to indicate that the request is being put on the waitq to be
	 * serviced.
	 *
	 * We do it here (a common routine for both synchronous and strategy
	 * calls) for performance reasons - we are already holding vdc->lock
	 * so there is no extra locking overhead. We would have to explicitly
	 * grab the 'lock' mutex to update the stats if we were to do this
	 * higher up the stack in vdc_strategy() et al.
	 */
	if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
		DTRACE_IO1(start, buf_t *, cb_arg);
		VD_KSTAT_WAITQ_ENTER(vdcp->io_stats);
	}

	do {
		while (vdcp->state != VDC_STATE_RUNNING) {

			/* return error if detaching */
			if (vdcp->state == VDC_STATE_DETACH) {
				rv = ENXIO;
				goto done;
			}

			/* fail request if connection timeout is reached */
			if (vdcp->ctimeout_reached) {
				rv = EIO;
				goto done;
			}

			/*
			 * If we are panicking and the disk is not ready then
			 * we can't send any request because we can't complete
			 * the handshake now.
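			 * (At panic time completions are handled differently
			 * too: vdc_drain_response() polls the channel for
			 * responses since interrupts are disabled.)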
			 */
			if (ddi_in_panic()) {
				rv = EIO;
				goto done;
			}

			cv_wait(&vdcp->running_cv, &vdcp->lock);
		}

	} while (vdc_populate_descriptor(vdcp, operation, addr,
	    nbytes, slice, offset, cb_type, cb_arg, dir));

done:
	/*
	 * If this is a block read/write we update the I/O statistics kstat
	 * to indicate that this request has been placed on the queue for
	 * processing (i.e. sent to the vDisk server) - iostat(1M) will
	 * report the time waiting for the vDisk server under the %b column.
	 * In the case of an error we simply take it off the wait queue.
	 */
	if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
		if (rv == 0) {
			VD_KSTAT_WAITQ_TO_RUNQ(vdcp->io_stats);
			DTRACE_PROBE1(send, buf_t *, cb_arg);
		} else {
			VD_UPDATE_ERR_STATS(vdcp, vd_transerrs);
			VD_KSTAT_WAITQ_EXIT(vdcp->io_stats);
			DTRACE_IO1(done, buf_t *, cb_arg);
		}
	}

	mutex_exit(&vdcp->lock);

	return (rv);
}


/*
 * Function:
 *	vdc_populate_descriptor
 *
 * Description:
 *	This routine writes the data to be transmitted to vds into the
 *	descriptor, notifies vds that the ring has been updated and
 *	then waits for the request to be processed.
 *
 * Arguments:
 *	vdcp	  - the soft state pointer
 *	operation - operation we want vds to perform (VD_OP_XXX)
 *	addr	  - address of data buf to be read/written.
 *	nbytes	  - number of bytes to read/write
 *	slice	  - the disk slice this request is for
 *	offset	  - relative disk offset
 *	cb_type   - type of call - STRATEGY or SYNC
 *	cb_arg	  - parameter to be sent to server (depends on VD_OP_XXX type)
 *			. mode for ioctl(9e)
 *			. LP64 diskaddr_t (block I/O)
 *	dir	  - direction of operation (READ/WRITE/BOTH)
 *
 * Return Codes:
 *	0
 *	EAGAIN
 *	ECONNRESET
 *	ENXIO
 */
static int
vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr,
    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
    void *cb_arg, vio_desc_direction_t dir)
{
	vdc_local_desc_t	*local_dep = NULL; /* Local Dring Pointer */
	int			idx;	/* Index of DRing entry used */
	int			next_idx;
	vio_dring_msg_t		dmsg;
	size_t			msglen;
	int			rv;

	ASSERT(MUTEX_HELD(&vdcp->lock));
	vdcp->threads_pending++;
loop:
	DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx);

	/* Get next available D-Ring entry */
	idx = vdcp->dring_curr_idx;
	local_dep = &(vdcp->local_dring[idx]);

	if (!local_dep->is_free) {
		DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n",
		    vdcp->instance);
		cv_wait(&vdcp->dring_free_cv, &vdcp->lock);
		if (vdcp->state == VDC_STATE_RUNNING ||
		    vdcp->state == VDC_STATE_HANDLE_PENDING) {
			goto loop;
		}
		vdcp->threads_pending--;
		return (ECONNRESET);
	}

	next_idx = idx + 1;
	if (next_idx >= vdcp->dring_len)
		next_idx = 0;
	vdcp->dring_curr_idx = next_idx;

	ASSERT(local_dep->is_free);

	local_dep->operation = operation;
	local_dep->addr = addr;
	local_dep->nbytes = nbytes;
	local_dep->slice = slice;
	local_dep->offset = offset;
	local_dep->cb_type = cb_type;
	local_dep->cb_arg = cb_arg;
	local_dep->dir = dir;

	local_dep->is_free = B_FALSE;

	rv = vdc_map_to_shared_dring(vdcp, idx);
	if (rv) {
		DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n",
		    vdcp->instance);
		/* free the descriptor */
		local_dep->is_free = B_TRUE;
		vdcp->dring_curr_idx = idx;
		cv_wait(&vdcp->membind_cv, &vdcp->lock);
		if (vdcp->state == VDC_STATE_RUNNING ||
		    vdcp->state == VDC_STATE_HANDLE_PENDING) {
			goto loop;
		}
		vdcp->threads_pending--;
		return (ECONNRESET);
	}

	/*
	 * Send a msg with the DRing details to vds
	 */
	VIO_INIT_DRING_DATA_TAG(dmsg);
	VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp);
	dmsg.dring_ident = vdcp->dring_ident;
	dmsg.start_idx = idx;
	dmsg.end_idx = idx;
	vdcp->seq_num++;

	DTRACE_PROBE2(populate, int, vdcp->instance,
	    vdc_local_desc_t *, local_dep);
	DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n",
	    vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num);

	/*
	 * note we're still holding the lock here to
	 * make sure the message goes out in order !!!...
	 */
	msglen = sizeof (dmsg);
	rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen);
	switch (rv) {
	case ECONNRESET:
		/*
		 * vdc_send initiates the reset on failure.
		 * Since the transaction has already been put
		 * on the local dring, it will automatically get
		 * retried when the channel is reset. Given that,
		 * it is ok to just return success even though the
		 * send failed.
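		 *
		 * (The retry happens in vdc_resubmit_backup_dring():
		 * the local dring is backed up while the reset is
		 * processed and every outstanding entry is resubmitted
		 * once the connection comes back; see the
		 * VDC_STATE_HANDLE_PENDING case in
		 * vdc_process_msg_thread().)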
		 */
		rv = 0;
		break;

	case 0: /* EOK */
		DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv);
		break;

	default:
		goto cleanup_and_exit;
	}

	vdcp->threads_pending--;
	return (rv);

cleanup_and_exit:
	DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv);
	return (ENXIO);
}

/*
 * Function:
 *	vdc_do_sync_op
 *
 * Description:
 *	Wrapper around vdc_populate_descriptor that blocks until the
 *	response to the message is available.
 *
 * Arguments:
 *	vdcp	  - the soft state pointer
 *	operation - operation we want vds to perform (VD_OP_XXX)
 *	addr	  - address of data buf to be read/written.
 *	nbytes	  - number of bytes to read/write
 *	slice	  - the disk slice this request is for
 *	offset	  - relative disk offset
 *	cb_type   - type of call - STRATEGY or SYNC
 *	cb_arg	  - parameter to be sent to server (depends on VD_OP_XXX type)
 *			. mode for ioctl(9e)
 *			. LP64 diskaddr_t (block I/O)
 *	dir	  - direction of operation (READ/WRITE/BOTH)
 *	rconflict - check for reservation conflict in case of failure
 *
 * rconflict should be set to B_TRUE by most callers. Callers invoking the
 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the
 * result of a successful operation with vd_scsi_status().
 *
 * Return Codes:
 *	0
 *	EAGAIN
 *	EFAULT
 *	ENXIO
 *	EIO
 */
static int
vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes,
    int slice, diskaddr_t offset, int cb_type, void *cb_arg,
    vio_desc_direction_t dir, boolean_t rconflict)
{
	int status;
	vdc_io_t *vio;
	boolean_t check_resv_conflict = B_FALSE;

	ASSERT(cb_type == CB_SYNC);

	/*
	 * Grab the lock, if blocked wait until the server
	 * response causes us to wake up again.
	 */
	mutex_enter(&vdcp->lock);
	vdcp->sync_op_cnt++;
	while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH)
		cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock);

	if (vdcp->state == VDC_STATE_DETACH) {
		cv_broadcast(&vdcp->sync_blocked_cv);
		vdcp->sync_op_cnt--;
		mutex_exit(&vdcp->lock);
		return (ENXIO);
	}

	/* now block any other thread entering after us */
	vdcp->sync_op_blocked = B_TRUE;
	vdcp->sync_op_pending = B_TRUE;
	mutex_exit(&vdcp->lock);

	status = vdc_send_request(vdcp, operation, addr,
	    nbytes, slice, offset, cb_type, cb_arg, dir);

	mutex_enter(&vdcp->lock);

	if (status != 0) {
		vdcp->sync_op_pending = B_FALSE;
	} else {
		/*
		 * Block until our transaction completes; anyone else
		 * waiting then gets to go next.
		 */
		while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH)
			cv_wait(&vdcp->sync_pending_cv, &vdcp->lock);

		DMSG(vdcp, 2, ": operation returned %d\n",
		    vdcp->sync_op_status);
		if (vdcp->state == VDC_STATE_DETACH) {
			vdcp->sync_op_pending = B_FALSE;
			status = ENXIO;
		} else {
			status = vdcp->sync_op_status;
			if (status != 0 && vdcp->failfast_interval != 0) {
				/*
				 * Operation has failed and failfast is enabled.
				 * We need to check if the failure is due to a
				 * reservation conflict if this was requested.
				 */
				check_resv_conflict = rconflict;
			}

		}
	}

	vdcp->sync_op_status = 0;
	vdcp->sync_op_blocked = B_FALSE;
	vdcp->sync_op_cnt--;

	/* signal the next waiting thread */
	cv_signal(&vdcp->sync_blocked_cv);

	/*
	 * We have to check for reservation conflict after unblocking sync
	 * operations because some sync operations will be used to do this
	 * check.
	 */
	if (check_resv_conflict) {
		vio = vdc_failfast_io_queue(vdcp, NULL);
		while (vio->vio_qtime != 0)
			cv_wait(&vdcp->failfast_io_cv, &vdcp->lock);
		kmem_free(vio, sizeof (vdc_io_t));
	}

	mutex_exit(&vdcp->lock);

	return (status);
}


/*
 * Function:
 *	vdc_drain_response()
 *
 * Description:
 *	When a guest is panicking, the completion of requests needs to be
 *	handled differently because interrupts are disabled and vdc
 *	will not get messages. We have to poll for the messages instead.
 *
 *	Note: since we don't have a buf_t available we cannot implement
 *	the io:::done DTrace probe in this specific case.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_drain_response(vdc_t *vdc)
{
	int		rv, idx, retries;
	size_t		msglen;
	vdc_local_desc_t *ldep = NULL;	/* Local Dring Entry Pointer */
	vio_dring_msg_t	dmsg;

	mutex_enter(&vdc->lock);

	retries = 0;
	for (;;) {
		msglen = sizeof (dmsg);
		rv = ldc_read(vdc->ldc_handle, (caddr_t)&dmsg, &msglen);
		if (rv) {
			rv = EINVAL;
			break;
		}

		/*
		 * if there are no packets wait and check again
		 */
		if ((rv == 0) && (msglen == 0)) {
			if (retries++ > vdc_dump_retries) {
				rv = EAGAIN;
				break;
			}

			drv_usecwait(vdc_usec_timeout_dump);
			continue;
		}

		/*
		 * Ignore all messages that are not ACKs/NACKs to
		 * DRing requests.
		 */
		if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) ||
		    (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) {
			DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n",
			    dmsg.tag.vio_msgtype,
			    dmsg.tag.vio_subtype,
			    dmsg.tag.vio_subtype_env);
			continue;
		}

		/*
		 * set the appropriate return value for the current request.
		 */
		switch (dmsg.tag.vio_subtype) {
		case VIO_SUBTYPE_ACK:
			rv = 0;
			break;
		case VIO_SUBTYPE_NACK:
			rv = EAGAIN;
			break;
		default:
			continue;
		}

		idx = dmsg.start_idx;
		if (idx >= vdc->dring_len) {
			DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n",
			    vdc->instance, idx);
			continue;
		}
		ldep = &vdc->local_dring[idx];
		if (ldep->dep->hdr.dstate != VIO_DESC_DONE) {
			DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n",
			    vdc->instance, idx, ldep->dep->hdr.dstate);
			continue;
		}

		DMSG(vdc, 1, "[%d] Depopulating idx=%d state=%d\n",
		    vdc->instance, idx, ldep->dep->hdr.dstate);

		rv = vdc_depopulate_descriptor(vdc, idx);
		if (rv) {
			DMSG(vdc, 0,
			    "[%d] Entry @ %d - depopulate failed ..\n",
			    vdc->instance, idx);
		}

		/* if this is the last descriptor - break out of loop */
		if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx)
			break;
	}

	mutex_exit(&vdc->lock);
	DMSG(vdc, 0, "End idx=%d\n", idx);

	return (rv);
}


/*
 * Function:
 *	vdc_depopulate_descriptor()
 *
 * Description:
 *	Mark the descriptor ring entry at the given index as free again,
 *	copy back and unbind any memory associated with the request and
 *	wake up threads waiting for a free entry.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	idx	- Index of the Descriptor Ring entry being modified
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx)
{
	vd_dring_entry_t *dep = NULL;	/* Dring Entry Pointer */
	vdc_local_desc_t *ldep = NULL;	/* Local Dring Entry Pointer */
	int		status = ENXIO;
	int		rv = 0;

	ASSERT(vdc != NULL);
	ASSERT(idx < vdc->dring_len);
	ldep = &vdc->local_dring[idx];
	ASSERT(ldep != NULL);
	ASSERT(MUTEX_HELD(&vdc->lock));

	DTRACE_PROBE2(depopulate, int, vdc->instance,
	    vdc_local_desc_t *, ldep);
	DMSG(vdc, 2, ": idx = %d\n", idx);

	dep = ldep->dep;
	ASSERT(dep != NULL);
	ASSERT((dep->hdr.dstate == VIO_DESC_DONE) ||
	    (dep->payload.status == ECANCELED));

	VDC_MARK_DRING_ENTRY_FREE(vdc, idx);

	ldep->is_free = B_TRUE;
	status = dep->payload.status;
	DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status);

	/*
	 * If no buffers were used to transfer information to the server when
	 * populating the descriptor then no memory handles need to be unbound
	 * and we can return now.
	 */
	if (ldep->nbytes == 0) {
		cv_signal(&vdc->dring_free_cv);
		return (status);
	}

	/*
	 * If the upper layer passed in a misaligned address we copied the
	 * data into an aligned buffer before sending it to LDC - we now
	 * copy it back to the original buffer.
	 */
	if (ldep->align_addr) {
		ASSERT(ldep->addr != NULL);

		if (dep->payload.nbytes > 0)
			bcopy(ldep->align_addr, ldep->addr,
			    dep->payload.nbytes);
		kmem_free(ldep->align_addr,
		    sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8));
		ldep->align_addr = NULL;
	}

	rv = ldc_mem_unbind_handle(ldep->desc_mhdl);
	if (rv != 0) {
		DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)",
		    vdc->instance, ldep->desc_mhdl, idx, rv);
		/*
		 * The error returned by the vDisk server is more informative
		 * and thus has a higher priority but if it isn't set we ensure
		 * that this function returns an error.
		 */
		if (status == 0)
			status = EINVAL;
	}

	cv_signal(&vdc->membind_cv);
	cv_signal(&vdc->dring_free_cv);

	return (status);
}

/*
 * Function:
 *	vdc_populate_mem_hdl()
 *
 * Description:
 *	Bind the requester's buffer to the LDC memory handle of the given
 *	local descriptor ring entry, copying misaligned buffers into a
 *	properly aligned bounce buffer first, and store the resulting
 *	memory cookies in the shared descriptor.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *	ldep	- pointer to the local descriptor ring entry describing
 *		  the buffer to bind.
 *
 * Return Code:
 *	0	- Success
 *	EAGAIN	- Failed to bind the buffer to the memory handle
 */
static int
vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep)
{
	vd_dring_entry_t	*dep = NULL;
	ldc_mem_handle_t	mhdl;
	caddr_t			vaddr;
	size_t			nbytes;
	uint8_t			perm = LDC_MEM_RW;
	uint8_t			maptype;
	int			rv = 0;
	int			i;

	ASSERT(vdcp != NULL);

	dep = ldep->dep;
	mhdl = ldep->desc_mhdl;

	switch (ldep->dir) {
	case VIO_read_dir:
		perm = LDC_MEM_W;
		break;

	case VIO_write_dir:
		perm = LDC_MEM_R;
		break;

	case VIO_both_dir:
		perm = LDC_MEM_RW;
		break;

	default:
		ASSERT(0);	/* catch bad programming in vdc */
	}

	/*
	 * LDC expects any addresses passed in to be 8-byte aligned. We need
	 * to copy the contents of any misaligned buffers to a newly allocated
	 * buffer and bind it instead (and copy the contents back to the
	 * original buffer passed in when depopulating the descriptor)
	 */
	vaddr = ldep->addr;
	nbytes = ldep->nbytes;
	if (((uint64_t)vaddr & 0x7) != 0) {
		ASSERT(ldep->align_addr == NULL);
		ldep->align_addr =
		    kmem_alloc(sizeof (caddr_t) *
		    P2ROUNDUP(nbytes, 8), KM_SLEEP);
		DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating "
		    "(buf=%p nb=%ld op=%d)\n",
		    vdcp->instance, (void *)vaddr, (void *)ldep->align_addr,
		    nbytes, ldep->operation);
		if (perm != LDC_MEM_W)
			bcopy(vaddr, ldep->align_addr, nbytes);
		vaddr = ldep->align_addr;
	}

	maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP;
	rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
	    maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies);
	DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n",
	    vdcp->instance, dep->payload.ncookies);
	if (rv != 0) {
		DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle "
		    "(mhdl=%p, buf=%p, err=%d)\n",
		    vdcp->instance, (void *)mhdl, (void *)vaddr, rv);
		if (ldep->align_addr) {
			kmem_free(ldep->align_addr,
			    sizeof (caddr_t) * P2ROUNDUP(nbytes, 8));
			ldep->align_addr = NULL;
		}
		return (EAGAIN);
	}

	/*
	 * Get the other cookies (if any).
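	 *
	 * ldc_mem_bind_handle() only returns the first cookie directly;
	 * when the buffer spans multiple pages the remaining cookies
	 * are fetched one at a time with ldc_mem_nextcookie().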
	 */
	for (i = 1; i < dep->payload.ncookies; i++) {
		rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]);
		if (rv != 0) {
			(void) ldc_mem_unbind_handle(mhdl);
			DMSG(vdcp, 0, "?[%d] Failed to get next cookie "
			    "(mhdl=%lx cnum=%d), err=%d",
			    vdcp->instance, mhdl, i, rv);
			if (ldep->align_addr) {
				/* free must match the allocation size above */
				kmem_free(ldep->align_addr,
				    sizeof (caddr_t) *
				    P2ROUNDUP(ldep->nbytes, 8));
				ldep->align_addr = NULL;
			}
			return (EAGAIN);
		}
	}

	return (rv);
}

/*
 * Interrupt handlers for messages from LDC
 */

/*
 * Function:
 *	vdc_handle_cb()
 *
 * Description:
 *	LDC channel callback. Depending on the event that triggered the
 *	callback this either kicks off the handshake (LDC_EVT_UP), wakes
 *	up the thread waiting to read a message (LDC_EVT_READ) or flags
 *	a connection reset (LDC_EVT_RESET/LDC_EVT_DOWN).
 *
 * Arguments:
 *	event	- Type of event (LDC_EVT_xxx) that triggered the callback
 *	arg	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static uint_t
vdc_handle_cb(uint64_t event, caddr_t arg)
{
	ldc_status_t ldc_state;
	int rv = 0;

	vdc_t	*vdc = (vdc_t *)(void *)arg;

	ASSERT(vdc != NULL);

	DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num);

	/*
	 * Depending on the type of event that triggered this callback,
	 * we modify the handshake state or read the data.
	 *
	 * NOTE: not done as a switch() as event could be triggered by
	 * a state change and a read request. Also the ordering of the
	 * check for the event types is deliberate.
	 */
	if (event & LDC_EVT_UP) {
		DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance);

		mutex_enter(&vdc->lock);

		/* get LDC state */
		rv = ldc_status(vdc->ldc_handle, &ldc_state);
		if (rv != 0) {
			DMSG(vdc, 0, "[%d] Couldn't get LDC status %d",
			    vdc->instance, rv);
			/* don't return from the callback holding the lock */
			mutex_exit(&vdc->lock);
			return (LDC_SUCCESS);
		}
		if (vdc->ldc_state != LDC_UP && ldc_state == LDC_UP) {
			/*
			 * Reset the transaction sequence numbers when
			 * LDC comes up. We then kick off the handshake
			 * negotiation with the vDisk server.
			 */
			vdc->seq_num = 1;
			vdc->seq_num_reply = 0;
			vdc->ldc_state = ldc_state;
			cv_signal(&vdc->initwait_cv);
		}

		mutex_exit(&vdc->lock);
	}

	if (event & LDC_EVT_READ) {
		DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance);
		mutex_enter(&vdc->read_lock);
		cv_signal(&vdc->read_cv);
		vdc->read_state = VDC_READ_PENDING;
		mutex_exit(&vdc->read_lock);

		/* that's all we have to do - no need to handle DOWN/RESET */
		return (LDC_SUCCESS);
	}

	if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) {

		DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance);

		mutex_enter(&vdc->lock);
		/*
		 * Need to wake up any readers so they will
		 * detect that a reset has occurred.
		 */
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET))
			cv_signal(&vdc->read_cv);
		vdc->read_state = VDC_READ_RESET;
		mutex_exit(&vdc->read_lock);

		/* wake up any threads waiting for connection to come up */
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		}

		mutex_exit(&vdc->lock);
	}

	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ))
		DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received",
		    vdc->instance, event);

	return (LDC_SUCCESS);
}

/*
 * Function:
 *	vdc_wait_for_response()
 *
 * Description:
 *	Block waiting for a response from the server. If there is
 *	no data, the thread blocks on the read_cv which is signalled
 *	by the callback when an LDC_EVT_READ occurs.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *	msgp	- pointer to a buffer in which the response is returned.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp)
{
	size_t		nbytes = sizeof (*msgp);
	int		status;

	ASSERT(vdcp != NULL);

	DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance);

	status = vdc_recv(vdcp, msgp, &nbytes);
	DMSG(vdcp, 3, "vdc_read() done.. status=0x%x size=0x%x\n",
	    status, (int)nbytes);
	if (status) {
		DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n",
		    vdcp->instance, status);
		return (status);
	}

	if (nbytes < sizeof (vio_msg_tag_t)) {
		DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n",
		    vdcp->instance, sizeof (vio_msg_tag_t), nbytes);
		return (ENOMSG);
	}

	DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance,
	    msgp->tag.vio_msgtype,
	    msgp->tag.vio_subtype,
	    msgp->tag.vio_subtype_env);

	/*
	 * Verify the Session ID of the message
	 *
	 * Every message after the Version has been negotiated should
	 * have the correct session ID set.
	 */
	if ((msgp->tag.vio_sid != vdcp->session_id) &&
	    (msgp->tag.vio_subtype_env != VIO_VER_INFO)) {
		DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, "
		    "expected 0x%lx [seq num %lx @ %d]",
		    vdcp->instance, msgp->tag.vio_sid,
		    vdcp->session_id,
		    ((vio_dring_msg_t *)msgp)->seq_num,
		    ((vio_dring_msg_t *)msgp)->start_idx);
		return (ENOMSG);
	}
	return (0);
}


/*
 * Function:
 *	vdc_resubmit_backup_dring()
 *
 * Description:
 *	Resubmit each descriptor in the backed up dring to
 *	vDisk server. The Dring was backed up during connection
 *	reset.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
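 *
 * Note: this is called with vdcp->lock *not* held; the lock is
 * acquired and dropped around the resubmission of each descriptor
 * (see the ASSERTs at the top of the function).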
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_resubmit_backup_dring(vdc_t *vdcp)
{
	int		count;
	int		b_idx;
	int		rv;
	int		dring_size;
	int		status;
	vio_msg_t	vio_msg;
	vdc_local_desc_t	*curr_ldep;

	ASSERT(MUTEX_NOT_HELD(&vdcp->lock));
	ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING);

	if (vdcp->local_dring_backup == NULL) {
		/* the pending requests have already been processed */
		return (0);
	}

	DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n",
	    vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail);

	/*
	 * Walk the backup copy of the local descriptor ring and
	 * resubmit all the outstanding transactions.
	 */
	b_idx = vdcp->local_dring_backup_tail;
	for (count = 0; count < vdcp->local_dring_backup_len; count++) {

		curr_ldep = &(vdcp->local_dring_backup[b_idx]);

		/* only resubmit outstanding transactions */
		if (!curr_ldep->is_free) {

			DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx);
			mutex_enter(&vdcp->lock);
			rv = vdc_populate_descriptor(vdcp, curr_ldep->operation,
			    curr_ldep->addr, curr_ldep->nbytes,
			    curr_ldep->slice, curr_ldep->offset,
			    curr_ldep->cb_type, curr_ldep->cb_arg,
			    curr_ldep->dir);
			mutex_exit(&vdcp->lock);
			if (rv) {
				DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n",
				    vdcp->instance, b_idx);
				return (rv);
			}

			/* Wait for the response message. */
			DMSG(vdcp, 1, "waiting for response to idx=%x\n",
			    b_idx);
			status = vdc_wait_for_response(vdcp, &vio_msg);
			if (status) {
				DMSG(vdcp, 1, "[%d] wait_for_response "
				    "returned err=%d\n", vdcp->instance,
				    status);
				return (status);
			}

			DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx);
			status = vdc_process_data_msg(vdcp, &vio_msg);
			if (status) {
				DMSG(vdcp, 1, "[%d] process_data_msg "
				    "returned err=%d\n", vdcp->instance,
				    status);
				return (status);
			}
		}

		/* get the next element to submit */
		if (++b_idx >= vdcp->local_dring_backup_len)
			b_idx = 0;
	}

	/* all done - now clear up pending dring copy */
	dring_size = vdcp->local_dring_backup_len *
	    sizeof (vdcp->local_dring_backup[0]);

	(void) kmem_free(vdcp->local_dring_backup, dring_size);

	vdcp->local_dring_backup = NULL;

	return (0);
}

/*
 * Function:
 *	vdc_cancel_backup_ring
 *
 * Description:
 *	Cancel each outstanding request in the backed up dring. The
 *	dring was backed up during connection reset.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
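 *
 * Note: unlike vdc_resubmit_backup_dring(), this runs with vdcp->lock
 * held and never touches the channel; it simply fails each pending
 * request back to its caller (biodone() with EIO for strategy
 * requests, sync_pending_cv for synchronous callers).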
 *
 * Return Code:
 *	None
 */
void
vdc_cancel_backup_ring(vdc_t *vdcp)
{
	vdc_local_desc_t *ldep;
	struct buf	*bufp;
	int		count;
	int		b_idx;
	int		dring_size;

	ASSERT(MUTEX_HELD(&vdcp->lock));
	ASSERT(vdcp->state == VDC_STATE_INIT ||
	    vdcp->state == VDC_STATE_INIT_WAITING ||
	    vdcp->state == VDC_STATE_NEGOTIATE ||
	    vdcp->state == VDC_STATE_RESETTING);

	if (vdcp->local_dring_backup == NULL) {
		/* the pending requests have already been processed */
		return;
	}

	DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n",
	    vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail);

	/*
	 * Walk the backup copy of the local descriptor ring and
	 * cancel all the outstanding transactions.
	 */
	b_idx = vdcp->local_dring_backup_tail;
	for (count = 0; count < vdcp->local_dring_backup_len; count++) {

		ldep = &(vdcp->local_dring_backup[b_idx]);

		/* only cancel outstanding transactions */
		if (!ldep->is_free) {

			DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx);

			/*
			 * All requests have already been cleared from the
			 * local descriptor ring and the LDC channel has been
			 * reset so we will never get any reply for these
			 * requests. Now we just have to notify threads waiting
			 * for replies that the request has failed.
			 */
			switch (ldep->cb_type) {
			case CB_SYNC:
				ASSERT(vdcp->sync_op_pending);
				vdcp->sync_op_status = EIO;
				vdcp->sync_op_pending = B_FALSE;
				cv_signal(&vdcp->sync_pending_cv);
				break;

			case CB_STRATEGY:
				bufp = ldep->cb_arg;
				ASSERT(bufp != NULL);
				bufp->b_resid = bufp->b_bcount;
				VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
				VD_KSTAT_RUNQ_EXIT(vdcp->io_stats);
				DTRACE_IO1(done, buf_t *, bufp);
				bioerror(bufp, EIO);
				biodone(bufp);
				break;

			default:
				ASSERT(0);
			}

		}

		/* get the next element to cancel */
		if (++b_idx >= vdcp->local_dring_backup_len)
			b_idx = 0;
	}

	/* all done - now clear up pending dring copy */
	dring_size = vdcp->local_dring_backup_len *
	    sizeof (vdcp->local_dring_backup[0]);

	(void) kmem_free(vdcp->local_dring_backup, dring_size);

	vdcp->local_dring_backup = NULL;

	DTRACE_PROBE2(processed, int, count, vdc_t *, vdcp);
}

/*
 * Function:
 *	vdc_connection_timeout
 *
 * Description:
 *	This function is invoked if the timeout set to establish the
 *	connection with vds expires. This will happen if we spend too much
 *	time in the VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states.
 *	In that case we cancel any pending requests and mark them as failed.
 *
 *	If the timeout does not expire, it will be cancelled when we reach
 *	the VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This
 *	function can be invoked while we are in the VDC_STATE_HANDLE_PENDING
 *	or VDC_STATE_RESETTING state in which case we do nothing because the
 *	timeout is being cancelled.
 *
 * Arguments:
 *	arg	- argument of the timeout function; actually a soft state
 *		  pointer for the instance of the device driver.
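 *
 * Note: the connection timeout is armed from the VDC_STATE_INIT state
 * of vdc_process_msg_thread() below, along the lines of:
 *
 *	tmid = timeout(vdc_connection_timeout, vdcp,
 *	    ctimeout * drv_usectohz(1000000));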
 *
 * Return Code:
 *	None
 */
void
vdc_connection_timeout(void *arg)
{
	vdc_t	*vdcp = (vdc_t *)arg;

	mutex_enter(&vdcp->lock);

	if (vdcp->state == VDC_STATE_HANDLE_PENDING ||
	    vdcp->state == VDC_STATE_DETACH) {
		/*
		 * The connection has just been re-established or
		 * we are detaching.
		 */
		vdcp->ctimeout_reached = B_FALSE;
		mutex_exit(&vdcp->lock);
		return;
	}

	vdcp->ctimeout_reached = B_TRUE;

	/* notify requests waiting for sending */
	cv_broadcast(&vdcp->running_cv);

	/* cancel requests waiting for a result */
	vdc_cancel_backup_ring(vdcp);

	mutex_exit(&vdcp->lock);

	cmn_err(CE_NOTE, "[%d] connection to service domain timeout",
	    vdcp->instance);
}

/*
 * Function:
 *	vdc_backup_local_dring()
 *
 * Description:
 *	Backup the current dring in the event of a reset. The Dring
 *	transactions will be resubmitted to the server when the
 *	connection is restored.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	None
 */
static void
vdc_backup_local_dring(vdc_t *vdcp)
{
	int dring_size;

	ASSERT(MUTEX_HELD(&vdcp->lock));
	ASSERT(vdcp->state == VDC_STATE_RESETTING);

	/*
	 * If the backup dring is still around, it means
	 * that the last restore did not complete. However,
	 * since we never got back into the running state,
	 * the backup copy we have is still valid.
	 */
	if (vdcp->local_dring_backup != NULL) {
		DMSG(vdcp, 1, "reusing local descriptor ring backup "
		    "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len,
		    vdcp->local_dring_backup_tail);
		return;
	}

	/*
	 * The backup dring can be NULL and the local dring may not be
	 * initialized. This can happen if we had a reset while establishing
	 * a new connection but after the connection has timed out. In that
	 * case the backup dring is NULL because the requests have been
	 * cancelled and the reset occurred before the local dring was
	 * initialized.
	 */
	if (!(vdcp->initialized & VDC_DRING_LOCAL))
		return;

	DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, "
	    "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx);

	dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]);

	vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP);
	bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size);

	vdcp->local_dring_backup_tail = vdcp->dring_curr_idx;
	vdcp->local_dring_backup_len = vdcp->dring_len;
}

/* -------------------------------------------------------------------------- */

/*
 * The following functions process the incoming messages from vds
 */

/*
 * Function:
 *	vdc_process_msg_thread()
 *
 * Description:
 *	Main VDC message processing thread. Each vDisk instance
 *	runs a copy of this thread. This thread triggers all the
 *	handshakes and data exchange with the server. It also
 *	handles all channel resets.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
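 *
 * Note: the thread implements a simple state machine which normally
 * moves through INIT -> INIT_WAITING -> NEGOTIATE -> HANDLE_PENDING
 * -> RUNNING, dropping into RESETTING on any channel error and into
 * DETACH on teardown.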
 *
 * Return Code:
 *	None
 */
static void
vdc_process_msg_thread(vdc_t *vdcp)
{
	int	status;
	int	ctimeout;
	timeout_id_t tmid = 0;

	mutex_enter(&vdcp->lock);

	for (;;) {

#define	Q(_s)	(vdcp->state == _s) ? #_s :
		DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state,
		    Q(VDC_STATE_INIT)
		    Q(VDC_STATE_INIT_WAITING)
		    Q(VDC_STATE_NEGOTIATE)
		    Q(VDC_STATE_HANDLE_PENDING)
		    Q(VDC_STATE_RUNNING)
		    Q(VDC_STATE_RESETTING)
		    Q(VDC_STATE_DETACH)
		    "UNKNOWN");

		switch (vdcp->state) {
		case VDC_STATE_INIT:

			/*
			 * If requested, start a timeout to check if the
			 * connection with vds is established in the
			 * specified delay. If the timeout expires, we
			 * will cancel any pending request.
			 *
			 * If a reset has occurred while establishing
			 * the connection, we already have a timeout armed
			 * and in that case we don't need to arm a new one.
			 */
			ctimeout = (vdc_timeout != 0) ?
			    vdc_timeout : vdcp->ctimeout;

			if (ctimeout != 0 && tmid == 0) {
				tmid = timeout(vdc_connection_timeout, vdcp,
				    ctimeout * drv_usectohz(1000000));
			}

			/* Check if we have been re-initializing repeatedly */
			if (vdcp->hshake_cnt++ > vdc_hshake_retries &&
			    vdcp->lifecycle != VDC_LC_ONLINE) {
				cmn_err(CE_NOTE, "[%d] disk access failed.\n",
				    vdcp->instance);
				vdcp->state = VDC_STATE_DETACH;
				break;
			}

			/* Bring up connection with vds via LDC */
			status = vdc_start_ldc_connection(vdcp);
			if (status == EINVAL) {
				DMSG(vdcp, 0, "[%d] Could not start LDC",
				    vdcp->instance);
				vdcp->state = VDC_STATE_DETACH;
			} else {
				vdcp->state = VDC_STATE_INIT_WAITING;
			}
			break;

		case VDC_STATE_INIT_WAITING:

			/*
			 * Let the callback event move us on
			 * when channel is open to server
			 */
			while (vdcp->ldc_state != LDC_UP) {
				cv_wait(&vdcp->initwait_cv, &vdcp->lock);
				if (vdcp->state != VDC_STATE_INIT_WAITING) {
					DMSG(vdcp, 0,
					    "state moved to %d out from under us...\n",
					    vdcp->state);

					break;
				}
			}
			if (vdcp->state == VDC_STATE_INIT_WAITING &&
			    vdcp->ldc_state == LDC_UP) {
				vdcp->state = VDC_STATE_NEGOTIATE;
			}
			break;

		case VDC_STATE_NEGOTIATE:
			switch (status = vdc_ver_negotiation(vdcp)) {
			case 0:
				break;
			default:
				DMSG(vdcp, 0, "ver negotiate failed (%d)..\n",
				    status);
				goto reset;
			}

			switch (status = vdc_attr_negotiation(vdcp)) {
			case 0:
				break;
			default:
				DMSG(vdcp, 0, "attr negotiate failed (%d)..\n",
				    status);
				goto reset;
			}

			switch (status = vdc_dring_negotiation(vdcp)) {
			case 0:
				break;
			default:
				DMSG(vdcp, 0, "dring negotiate failed (%d)..\n",
				    status);
				goto reset;
			}

			switch (status = vdc_rdx_exchange(vdcp)) {
			case 0:
				vdcp->state = VDC_STATE_HANDLE_PENDING;
				goto done;
			default:
				DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n",
				    status);
				goto reset;
			}
reset:
			DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n",
			    status);
			vdcp->state = VDC_STATE_RESETTING;
			vdcp->self_reset = B_TRUE;
done:
			DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n",
			    vdcp->state);
			break;

		case VDC_STATE_HANDLE_PENDING:

			if (vdcp->ctimeout_reached) {
				/*
				 * The connection timeout had been reached so
4096 * pending requests have been cancelled. Now 4097 * that the connection is back we can reset 4098 * the timeout. 4099 */ 4100 ASSERT(vdcp->local_dring_backup == NULL); 4101 ASSERT(tmid != 0); 4102 tmid = 0; 4103 vdcp->ctimeout_reached = B_FALSE; 4104 vdcp->state = VDC_STATE_RUNNING; 4105 DMSG(vdcp, 0, "[%d] connection to service " 4106 "domain is up", vdcp->instance); 4107 break; 4108 } 4109 4110 mutex_exit(&vdcp->lock); 4111 if (tmid != 0) { 4112 (void) untimeout(tmid); 4113 tmid = 0; 4114 } 4115 status = vdc_resubmit_backup_dring(vdcp); 4116 mutex_enter(&vdcp->lock); 4117 4118 if (status) 4119 vdcp->state = VDC_STATE_RESETTING; 4120 else 4121 vdcp->state = VDC_STATE_RUNNING; 4122 4123 break; 4124 4125 /* enter running state */ 4126 case VDC_STATE_RUNNING: 4127 /* 4128 * Signal anyone waiting for the connection 4129 * to come online. 4130 */ 4131 vdcp->hshake_cnt = 0; 4132 cv_broadcast(&vdcp->running_cv); 4133 4134 /* failfast has to be checked after a reset */ 4135 cv_signal(&vdcp->failfast_cv); 4136 4137 /* ownership is lost during reset */ 4138 if (vdcp->ownership & VDC_OWNERSHIP_WANTED) 4139 vdcp->ownership |= VDC_OWNERSHIP_RESET; 4140 cv_signal(&vdcp->ownership_cv); 4141 4142 mutex_exit(&vdcp->lock); 4143 4144 for (;;) { 4145 vio_msg_t msg; 4146 status = vdc_wait_for_response(vdcp, &msg); 4147 if (status) break; 4148 4149 DMSG(vdcp, 1, "[%d] new pkt(s) available\n", 4150 vdcp->instance); 4151 status = vdc_process_data_msg(vdcp, &msg); 4152 if (status) { 4153 DMSG(vdcp, 1, "[%d] process_data_msg " 4154 "returned err=%d\n", vdcp->instance, 4155 status); 4156 break; 4157 } 4158 4159 } 4160 4161 mutex_enter(&vdcp->lock); 4162 4163 vdcp->state = VDC_STATE_RESETTING; 4164 vdcp->self_reset = B_TRUE; 4165 break; 4166 4167 case VDC_STATE_RESETTING: 4168 /* 4169 * When we reach this state, we either come from the 4170 * VDC_STATE_RUNNING state and we can have pending 4171 * requests but no timeout armed; or we come from 4172 * the VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE or 4173 * VDC_STATE_HANDLE_PENDING state and there is no pending 4174 * request or pending requests have already been copied 4175 * into the backup dring. So we can safely keep the 4176 * connection timeout armed while we are in this state. 4177 */ 4178 4179 DMSG(vdcp, 0, "Initiating channel reset " 4180 "(pending = %d)\n", (int)vdcp->threads_pending); 4181 4182 if (vdcp->self_reset) { 4183 DMSG(vdcp, 0, 4184 "[%d] calling stop_ldc_connection.\n", 4185 vdcp->instance); 4186 status = vdc_stop_ldc_connection(vdcp); 4187 vdcp->self_reset = B_FALSE; 4188 } 4189 4190 /* 4191 * Wait for all threads currently waiting 4192 * for a free dring entry to use. 
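 * The loop below cannot simply cv_wait() for those threads to go
 * away: each waiter must reacquire vdcp->lock to observe the reset,
 * so we broadcast both condition variables, drop the lock, sleep
 * for one LDC tick and then recheck threads_pending.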
4193 */ 4194 while (vdcp->threads_pending) { 4195 cv_broadcast(&vdcp->membind_cv); 4196 cv_broadcast(&vdcp->dring_free_cv); 4197 mutex_exit(&vdcp->lock); 4198 /* give the waiters enough time to wake up */ 4199 delay(vdc_hz_min_ldc_delay); 4200 mutex_enter(&vdcp->lock); 4201 } 4202 4203 ASSERT(vdcp->threads_pending == 0); 4204 4205 /* Sanity check that no thread is receiving */ 4206 ASSERT(vdcp->read_state != VDC_READ_WAITING); 4207 4208 vdcp->read_state = VDC_READ_IDLE; 4209 4210 vdc_backup_local_dring(vdcp); 4211 4212 /* cleanup the old d-ring */ 4213 vdc_destroy_descriptor_ring(vdcp); 4214 4215 /* go and start again */ 4216 vdcp->state = VDC_STATE_INIT; 4217 4218 break; 4219 4220 case VDC_STATE_DETACH: 4221 DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n", 4222 vdcp->instance); 4223 4224 /* cancel any pending timeout */ 4225 mutex_exit(&vdcp->lock); 4226 if (tmid != 0) { 4227 (void) untimeout(tmid); 4228 tmid = 0; 4229 } 4230 mutex_enter(&vdcp->lock); 4231 4232 /* 4233 * Signal anyone waiting for connection 4234 * to come online 4235 */ 4236 cv_broadcast(&vdcp->running_cv); 4237 4238 while (vdcp->sync_op_pending) { 4239 cv_signal(&vdcp->sync_pending_cv); 4240 cv_signal(&vdcp->sync_blocked_cv); 4241 mutex_exit(&vdcp->lock); 4242 /* give the waiters enough time to wake up */ 4243 delay(vdc_hz_min_ldc_delay); 4244 mutex_enter(&vdcp->lock); 4245 } 4246 4247 mutex_exit(&vdcp->lock); 4248 4249 DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n", 4250 vdcp->instance); 4251 thread_exit(); 4252 break; 4253 } 4254 } 4255 } 4256 4257 4258 /* 4259 * Function: 4260 * vdc_process_data_msg() 4261 * 4262 * Description: 4263 * This function is called by the message processing thread each time 4264 * a message with a msgtype of VIO_TYPE_DATA is received. It will either 4265 * be an ACK or NACK from vds[1] which vdc handles as follows. 4266 * ACK - wake up the waiting thread 4267 * NACK - resend any messages necessary 4268 * 4269 * [1] Although the message format allows it, vds should not send a 4270 * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for 4271 * some bizarre reason it does, vdc will reset the connection. 4272 * 4273 * Arguments: 4274 * vdc - soft state pointer for this instance of the device driver. 4275 * msg - the LDC message sent by vds 4276 * 4277 * Return Code: 4278 * 0 - Success. 4279 * > 0 - error value returned by LDC 4280 */ 4281 static int 4282 vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg) 4283 { 4284 int status = 0; 4285 vio_dring_msg_t *dring_msg; 4286 vdc_local_desc_t *ldep = NULL; 4287 int start, end; 4288 int idx; 4289 4290 dring_msg = (vio_dring_msg_t *)msg; 4291 4292 ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA); 4293 ASSERT(vdcp != NULL); 4294 4295 mutex_enter(&vdcp->lock); 4296 4297 /* 4298 * Check to see if the message has bogus data 4299 */ 4300 idx = start = dring_msg->start_idx; 4301 end = dring_msg->end_idx; 4302 if ((start >= vdcp->dring_len) || 4303 (end >= vdcp->dring_len) || (end < -1)) { 4304 DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n", 4305 vdcp->instance, start, end); 4306 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4307 mutex_exit(&vdcp->lock); 4308 return (EINVAL); 4309 } 4310 4311 /* 4312 * Verify that the sequence number is what vdc expects. 
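 * In outline (the check is performed by vdc_verify_seq_num() below),
 * a reply is only processed when
 *
 *	seq_num_reply < dring_msg->seq_num <= seq_num
 *
 * i.e. it is newer than the last reply handled and no newer than the
 * last request vdc generated; anything else is rejected.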
4313 */ 4314 switch (vdc_verify_seq_num(vdcp, dring_msg)) { 4315 case VDC_SEQ_NUM_TODO: 4316 break; /* keep processing this message */ 4317 case VDC_SEQ_NUM_SKIP: 4318 mutex_exit(&vdcp->lock); 4319 return (0); 4320 case VDC_SEQ_NUM_INVALID: 4321 DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance); 4322 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4323 mutex_exit(&vdcp->lock); 4324 return (ENXIO); 4325 } 4326 4327 if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) { 4328 DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance); 4329 VDC_DUMP_DRING_MSG(dring_msg); 4330 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4331 mutex_exit(&vdcp->lock); 4332 return (EIO); 4333 4334 } else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) { 4335 VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs); 4336 mutex_exit(&vdcp->lock); 4337 return (EPROTO); 4338 } 4339 4340 DMSG(vdcp, 1, ": start %d end %d\n", start, end); 4341 ASSERT(start == end); 4342 4343 ldep = &vdcp->local_dring[idx]; 4344 4345 DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n", 4346 ldep->dep->hdr.dstate, ldep->cb_type); 4347 4348 if (ldep->dep->hdr.dstate == VIO_DESC_DONE) { 4349 struct buf *bufp; 4350 4351 switch (ldep->cb_type) { 4352 case CB_SYNC: 4353 ASSERT(vdcp->sync_op_pending); 4354 4355 status = vdc_depopulate_descriptor(vdcp, idx); 4356 vdcp->sync_op_status = status; 4357 vdcp->sync_op_pending = B_FALSE; 4358 cv_signal(&vdcp->sync_pending_cv); 4359 break; 4360 4361 case CB_STRATEGY: 4362 bufp = ldep->cb_arg; 4363 ASSERT(bufp != NULL); 4364 bufp->b_resid = 4365 bufp->b_bcount - ldep->dep->payload.nbytes; 4366 status = ldep->dep->payload.status; /* Future:ntoh */ 4367 if (status != 0) { 4368 DMSG(vdcp, 1, "strategy status=%d\n", status); 4369 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4370 bioerror(bufp, status); 4371 } 4372 4373 (void) vdc_depopulate_descriptor(vdcp, idx); 4374 4375 DMSG(vdcp, 1, 4376 "strategy complete req=%ld bytes resp=%ld bytes\n", 4377 bufp->b_bcount, ldep->dep->payload.nbytes); 4378 4379 if (status != 0 && vdcp->failfast_interval != 0) { 4380 /* 4381 * The I/O has failed and failfast is enabled. 4382 * We need the failfast thread to check if the 4383 * failure is due to a reservation conflict. 4384 */ 4385 (void) vdc_failfast_io_queue(vdcp, bufp); 4386 } else { 4387 if (status == 0) { 4388 int op = (bufp->b_flags & B_READ) ? 4389 VD_OP_BREAD : VD_OP_BWRITE; 4390 VD_UPDATE_IO_STATS(vdcp, op, 4391 ldep->dep->payload.nbytes); 4392 } 4393 VD_KSTAT_RUNQ_EXIT(vdcp->io_stats); 4394 DTRACE_IO1(done, buf_t *, bufp); 4395 biodone(bufp); 4396 } 4397 break; 4398 4399 default: 4400 ASSERT(0); 4401 } 4402 } 4403 4404 /* let the arrival signal propagate */ 4405 mutex_exit(&vdcp->lock); 4406 4407 /* probe gives the count of how many entries were processed */ 4408 DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp); 4409 4410 return (0); 4411 } 4412 4413 4414 /* 4415 * Function: 4416 * vdc_handle_ver_msg() 4417 * 4418 * Description: 4419 * Handle a version negotiation (VIO_VER_INFO) message from the vDisk server. 4420 * Arguments: 4421 * vdc - soft state pointer for this instance of the device driver. 
4422 * ver_msg - LDC message sent by vDisk server 4423 * 4424 * Return Code: 4425 * 0 - Success 4426 */ 4427 static int 4428 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg) 4429 { 4430 int status = 0; 4431 4432 ASSERT(vdc != NULL); 4433 ASSERT(mutex_owned(&vdc->lock)); 4434 4435 if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) { 4436 return (EPROTO); 4437 } 4438 4439 if (ver_msg->dev_class != VDEV_DISK_SERVER) { 4440 return (EINVAL); 4441 } 4442 4443 switch (ver_msg->tag.vio_subtype) { 4444 case VIO_SUBTYPE_ACK: 4445 /* 4446 * We check to see if the version returned is indeed supported 4447 * (the server may have also adjusted the minor number downwards 4448 * and if so 'ver_msg' will contain the actual version agreed). 4449 */ 4450 if (vdc_is_supported_version(ver_msg)) { 4451 vdc->ver.major = ver_msg->ver_major; 4452 vdc->ver.minor = ver_msg->ver_minor; 4453 ASSERT(vdc->ver.major > 0); 4454 } else { 4455 status = EPROTO; 4456 } 4457 break; 4458 4459 case VIO_SUBTYPE_NACK: 4460 /* 4461 * call vdc_is_supported_version() which will return the next 4462 * supported version (if any) in 'ver_msg'. 4463 */ 4464 (void) vdc_is_supported_version(ver_msg); 4465 if (ver_msg->ver_major > 0) { 4466 size_t len = sizeof (*ver_msg); 4467 4468 ASSERT(vdc->ver.major > 0); 4469 4470 /* reset the necessary fields and resend */ 4471 ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO; 4472 ver_msg->dev_class = VDEV_DISK; 4473 4474 status = vdc_send(vdc, (caddr_t)ver_msg, &len); 4475 DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n", 4476 vdc->instance, status); 4477 if (len != sizeof (*ver_msg)) 4478 status = EBADMSG; 4479 } else { 4480 DMSG(vdc, 0, "[%d] No common version with vDisk server", 4481 vdc->instance); 4482 status = ENOTSUP; 4483 } 4484 4485 break; 4486 case VIO_SUBTYPE_INFO: 4487 /* 4488 * Handle the case where vds starts the handshake 4489 * (for now only vdc is the instigator) 4490 */ 4491 status = ENOTSUP; 4492 break; 4493 4494 default: 4495 status = EINVAL; 4496 break; 4497 } 4498 4499 return (status); 4500 } 4501 4502 /* 4503 * Function: 4504 * vdc_handle_attr_msg() 4505 * 4506 * Description: 4507 * Handle an attribute negotiation (VIO_ATTR_INFO) message from the vDisk server. 4508 * Arguments: 4509 * vdc - soft state pointer for this instance of the device driver. 4510 * attr_msg - LDC message sent by vDisk server 4511 * 4512 * Return Code: 4513 * 0 - Success 4514 */ 4515 static int 4516 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg) 4517 { 4518 int status = 0; 4519 4520 ASSERT(vdc != NULL); 4521 ASSERT(mutex_owned(&vdc->lock)); 4522 4523 if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) { 4524 return (EPROTO); 4525 } 4526 4527 switch (attr_msg->tag.vio_subtype) { 4528 case VIO_SUBTYPE_ACK: 4529 /* 4530 * We now verify the attributes sent by vds. 4531 */ 4532 if (attr_msg->vdisk_size == 0) { 4533 DMSG(vdc, 0, "[%d] Invalid disk size from vds", 4534 vdc->instance); 4535 status = EINVAL; 4536 break; 4537 } 4538 4539 if (attr_msg->max_xfer_sz == 0) { 4540 DMSG(vdc, 0, "[%d] Invalid transfer size from vds", 4541 vdc->instance); 4542 status = EINVAL; 4543 break; 4544 } 4545 4546 if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) { 4547 DMSG(vdc, 0, "[%d] Unknown disk size from vds", 4548 vdc->instance); 4549 attr_msg->vdisk_size = 0; 4550 } 4551 4552 /* 4553 * If the disk size is already set, check that it hasn't changed. 
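 * For example, if the first handshake advertised a 0x1000-block disk
 * and a post-reset handshake advertises 0x2000 blocks, the handshake
 * is failed with EINVAL rather than silently resizing the device.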
4554 */ 4555 if ((vdc->vdisk_size != 0) && (attr_msg->vdisk_size != 0) && 4556 (vdc->vdisk_size != attr_msg->vdisk_size)) { 4557 DMSG(vdc, 0, "[%d] Different disk size from vds " 4558 "(old=0x%lx - new=0x%lx)", vdc->instance, 4559 vdc->vdisk_size, attr_msg->vdisk_size); 4560 status = EINVAL; 4561 break; 4562 } 4563 4564 vdc->vdisk_size = attr_msg->vdisk_size; 4565 vdc->vdisk_type = attr_msg->vdisk_type; 4566 vdc->operations = attr_msg->operations; 4567 if (vio_ver_is_supported(vdc->ver, 1, 1)) 4568 vdc->vdisk_media = attr_msg->vdisk_media; 4569 else 4570 vdc->vdisk_media = 0; 4571 4572 DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n", 4573 vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz); 4574 DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n", 4575 vdc->instance, vdc->block_size, 4576 attr_msg->vdisk_block_size); 4577 4578 /* 4579 * We don't know at compile time what the vDisk server will 4580 * think are good values but we apply a large (arbitrary) 4581 * upper bound to prevent memory exhaustion in vdc if it was 4582 * allocating a DRing based on huge values sent by the server. 4583 * We probably will never exceed this except if the message 4584 * was garbage. 4585 */ 4586 if ((attr_msg->max_xfer_sz * attr_msg->vdisk_block_size) <= 4587 (PAGESIZE * DEV_BSIZE)) { 4588 vdc->max_xfer_sz = attr_msg->max_xfer_sz; 4589 vdc->block_size = attr_msg->vdisk_block_size; 4590 } else { 4591 DMSG(vdc, 0, "[%d] vds block transfer size too big;" 4592 " using max supported by vdc", vdc->instance); 4593 } 4594 4595 if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) || 4596 (attr_msg->vdisk_size > INT64_MAX) || 4597 (attr_msg->operations == 0) || 4598 (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) { 4599 DMSG(vdc, 0, "[%d] Invalid attributes from vds", 4600 vdc->instance); 4601 status = EINVAL; 4602 break; 4603 } 4604 4605 /* 4606 * Now that we have received all attributes we can create a 4607 * fake geometry for the disk. 4608 */ 4609 vdc_create_fake_geometry(vdc); 4610 break; 4611 4612 case VIO_SUBTYPE_NACK: 4613 /* 4614 * vds could not handle the attributes we sent so we 4615 * stop negotiating. 4616 */ 4617 status = EPROTO; 4618 break; 4619 4620 case VIO_SUBTYPE_INFO: 4621 /* 4622 * Handle the case where vds starts the handshake 4623 * (for now, vdc is the only supported instigator) 4624 */ 4625 status = ENOTSUP; 4626 break; 4627 4628 default: 4629 status = ENOTSUP; 4630 break; 4631 } 4632 4633 return (status); 4634 } 4635 4636 /* 4637 * Function: 4638 * vdc_handle_dring_reg_msg() 4639 * 4640 * Description: 4641 * Handle a descriptor ring registration (VIO_DRING_REG) message from the vDisk server. 4642 * Arguments: 4643 * vdc - soft state pointer for this instance of the driver. 4644 * dring_msg - LDC message sent by vDisk server 4645 * 4646 * Return Code: 4647 * 0 - Success 4648 */ 4649 static int 4650 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg) 4651 { 4652 int status = 0; 4653 4654 ASSERT(vdc != NULL); 4655 ASSERT(mutex_owned(&vdc->lock)); 4656 4657 if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) { 4658 return (EPROTO); 4659 } 4660 4661 switch (dring_msg->tag.vio_subtype) { 4662 case VIO_SUBTYPE_ACK: 4663 /* save the received dring_ident */ 4664 vdc->dring_ident = dring_msg->dring_ident; 4665 DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n", 4666 vdc->instance, vdc->dring_ident); 4667 break; 4668 4669 case VIO_SUBTYPE_NACK: 4670 /* 4671 * vds could not handle the DRing info we sent so we 4672 * stop negotiating. 
4673 */ 4674 DMSG(vdc, 0, "[%d] server could not register DRing\n", 4675 vdc->instance); 4676 status = EPROTO; 4677 break; 4678 4679 case VIO_SUBTYPE_INFO: 4680 /* 4681 * Handle the case where vds starts the handshake 4682 * (for now only vdc is the instigator) 4683 */ 4684 status = ENOTSUP; 4685 break; 4686 default: 4687 status = ENOTSUP; 4688 } 4689 4690 return (status); 4691 } 4692 4693 /* 4694 * Function: 4695 * vdc_verify_seq_num() 4696 * 4697 * Description: 4698 * This function verifies that the sequence number sent back by the vDisk 4699 * server with the latest message is what is expected (i.e. it is greater 4700 * than the last seq num sent by the vDisk server and less than or equal 4701 * to the last seq num generated by vdc). 4702 * 4703 * It then checks the request ID to see if any requests need processing 4704 * in the DRing. 4705 * 4706 * Arguments: 4707 * vdc - soft state pointer for this instance of the driver. 4708 * dring_msg - pointer to the LDC message sent by vds 4709 * 4710 * Return Code: 4711 * VDC_SEQ_NUM_TODO - Message needs to be processed 4712 * VDC_SEQ_NUM_SKIP - Message has already been processed 4713 * VDC_SEQ_NUM_INVALID - The seq numbers are so out of sync that 4714 * vdc cannot deal with them 4715 */ 4716 static int 4717 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg) 4718 { 4719 ASSERT(vdc != NULL); 4720 ASSERT(dring_msg != NULL); 4721 ASSERT(mutex_owned(&vdc->lock)); 4722 4723 /* 4724 * Check to see if the messages were responded to in the correct 4725 * order by vds. 4726 */ 4727 if ((dring_msg->seq_num <= vdc->seq_num_reply) || 4728 (dring_msg->seq_num > vdc->seq_num)) { 4729 DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: " 4730 "%lu > expected <= %lu (last proc req %lu sent %lu)\n", 4731 vdc->instance, dring_msg->seq_num, 4732 vdc->seq_num_reply, vdc->seq_num, 4733 vdc->req_id_proc, vdc->req_id); 4734 return (VDC_SEQ_NUM_INVALID); 4735 } 4736 vdc->seq_num_reply = dring_msg->seq_num; 4737 4738 if (vdc->req_id_proc < vdc->req_id) 4739 return (VDC_SEQ_NUM_TODO); 4740 else 4741 return (VDC_SEQ_NUM_SKIP); 4742 } 4743 4744 4745 /* 4746 * Function: 4747 * vdc_is_supported_version() 4748 * 4749 * Description: 4750 * This routine checks if the major/minor version numbers specified in 4751 * 'ver_msg' are supported. If not it finds the next version that is 4752 * in the supported version list 'vdc_version[]' and sets the fields in 4753 * 'ver_msg' to those values 4754 * 4755 * Arguments: 4756 * ver_msg - LDC message sent by vDisk server 4757 * 4758 * Return Code: 4759 * B_TRUE - Success 4760 * B_FALSE - Version not supported 4761 */ 4762 static boolean_t 4763 vdc_is_supported_version(vio_ver_msg_t *ver_msg) 4764 { 4765 int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]); 4766 4767 for (int i = 0; i < vdc_num_versions; i++) { 4768 ASSERT(vdc_version[i].major > 0); 4769 ASSERT((i == 0) || 4770 (vdc_version[i].major < vdc_version[i-1].major)); 4771 4772 /* 4773 * If the major versions match, adjust the minor version, if 4774 * necessary, down to the highest value supported by this 4775 * client. 
The server should support all minor versions lower 4776 * than the value it sent. 4777 */ 4778 if (ver_msg->ver_major == vdc_version[i].major) { 4779 if (ver_msg->ver_minor > vdc_version[i].minor) { 4780 DMSGX(0, 4781 "Adjusting minor version from %u to %u", 4782 ver_msg->ver_minor, vdc_version[i].minor); 4783 ver_msg->ver_minor = vdc_version[i].minor; 4784 } 4785 return (B_TRUE); 4786 } 4787 4788 /* 4789 * If the message contains a higher major version number, set 4790 * the message's major/minor versions to the current values 4791 * and return false, so this message will get resent with 4792 * these values, and the server will potentially try again 4793 * with the same or a lower version 4794 */ 4795 if (ver_msg->ver_major > vdc_version[i].major) { 4796 ver_msg->ver_major = vdc_version[i].major; 4797 ver_msg->ver_minor = vdc_version[i].minor; 4798 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n", 4799 ver_msg->ver_major, ver_msg->ver_minor); 4800 4801 return (B_FALSE); 4802 } 4803 4804 /* 4805 * Otherwise, the message's major version is less than the 4806 * current major version, so continue the loop to the next 4807 * (lower) supported version 4808 */ 4809 } 4810 4811 /* 4812 * No common version was found; "ground" the version pair in the 4813 * message to terminate negotiation 4814 */ 4815 ver_msg->ver_major = 0; 4816 ver_msg->ver_minor = 0; 4817 4818 return (B_FALSE); 4819 } 4820 /* -------------------------------------------------------------------------- */ 4821 4822 /* 4823 * DKIO(7I) support 4824 */ 4825 4826 typedef struct vdc_dk_arg { 4827 struct dk_callback dkc; 4828 int mode; 4829 dev_t dev; 4830 vdc_t *vdc; 4831 } vdc_dk_arg_t; 4832 4833 /* 4834 * Function: 4835 * vdc_dkio_flush_cb() 4836 * 4837 * Description: 4838 * This routine is a callback for DKIOCFLUSHWRITECACHE which can be called 4839 * by kernel code. 4840 * 4841 * Arguments: 4842 * arg - a pointer to a vdc_dk_arg_t structure. 4843 */ 4844 void 4845 vdc_dkio_flush_cb(void *arg) 4846 { 4847 struct vdc_dk_arg *dk_arg = (struct vdc_dk_arg *)arg; 4848 struct dk_callback *dkc = NULL; 4849 vdc_t *vdc = NULL; 4850 int rv; 4851 4852 if (dk_arg == NULL) { 4853 cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n"); 4854 return; 4855 } 4856 dkc = &dk_arg->dkc; 4857 vdc = dk_arg->vdc; 4858 ASSERT(vdc != NULL); 4859 4860 rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, 4861 VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 4862 if (rv != 0) { 4863 DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n", 4864 vdc->instance, rv, 4865 ddi_model_convert_from(dk_arg->mode & FMODELS)); 4866 } 4867 4868 /* 4869 * Trigger the callback to notify the caller that the ioctl call has 4870 * been completed. 4871 */ 4872 if ((dk_arg->mode & FKIOCTL) && 4873 (dkc != NULL) && 4874 (dkc->dkc_callback != NULL)) { 4875 ASSERT(dkc->dkc_cookie != NULL); 4876 (*dkc->dkc_callback)(dkc->dkc_cookie, rv); 4877 } 4878 4879 /* Indicate that one less DKIO write flush is outstanding */ 4880 mutex_enter(&vdc->lock); 4881 vdc->dkio_flush_pending--; 4882 ASSERT(vdc->dkio_flush_pending >= 0); 4883 mutex_exit(&vdc->lock); 4884 4885 /* free the mem that was allocated when the callback was dispatched */ 4886 kmem_free(arg, sizeof (vdc_dk_arg_t)); 4887 } 4888 4889 /* 4890 * Function: 4891 * vdc_dkio_gapart() 4892 * 4893 * Description: 4894 * This function implements the DKIOCGAPART ioctl. 
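 * An illustrative userland invocation (a sketch, not part of the
 * driver; assumes fd is open on the block device node):
 *
 *	struct dk_allmap allmap;
 *
 *	if (ioctl(fd, DKIOCGAPART, &allmap) == 0)
 *		(void) printf("slice 0: cyl %ld, %ld blocks\n",
 *		    (long)allmap.dka_map[0].dkl_cylno,
 *		    (long)allmap.dka_map[0].dkl_nblk);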
4895 * 4896 * Arguments: 4897 * vdc - soft state pointer 4898 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 4899 * flag - ioctl flags 4900 */ 4901 static int 4902 vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag) 4903 { 4904 struct dk_geom *geom; 4905 struct vtoc *vtoc; 4906 union { 4907 struct dk_map map[NDKMAP]; 4908 struct dk_map32 map32[NDKMAP]; 4909 } data; 4910 int i, rv, size; 4911 4912 mutex_enter(&vdc->lock); 4913 4914 if ((rv = vdc_validate_geometry(vdc)) != 0) { 4915 mutex_exit(&vdc->lock); 4916 return (rv); 4917 } 4918 4919 vtoc = vdc->vtoc; 4920 geom = vdc->geom; 4921 4922 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 4923 4924 for (i = 0; i < vtoc->v_nparts; i++) { 4925 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 4926 (geom->dkg_nhead * geom->dkg_nsect); 4927 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 4928 } 4929 size = NDKMAP * sizeof (struct dk_map32); 4930 4931 } else { 4932 4933 for (i = 0; i < vtoc->v_nparts; i++) { 4934 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 4935 (geom->dkg_nhead * geom->dkg_nsect); 4936 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 4937 } 4938 size = NDKMAP * sizeof (struct dk_map); 4939 4940 } 4941 4942 mutex_exit(&vdc->lock); 4943 4944 if (ddi_copyout(&data, arg, size, flag) != 0) 4945 return (EFAULT); 4946 4947 return (0); 4948 } 4949 4950 /* 4951 * Function: 4952 * vdc_dkio_partition() 4953 * 4954 * Description: 4955 * This function implements the DKIOCPARTITION ioctl. 4956 * 4957 * Arguments: 4958 * vdc - soft state pointer 4959 * arg - a pointer to a struct partition64 structure 4960 * flag - ioctl flags 4961 */ 4962 static int 4963 vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag) 4964 { 4965 struct partition64 p64; 4966 efi_gpt_t *gpt; 4967 efi_gpe_t *gpe; 4968 vd_efi_dev_t edev; 4969 uint_t partno; 4970 int rv; 4971 4972 if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) { 4973 return (EFAULT); 4974 } 4975 4976 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 4977 4978 if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) { 4979 return (rv); 4980 } 4981 4982 partno = p64.p_partno; 4983 4984 if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) { 4985 vd_efi_free(&edev, gpt, gpe); 4986 return (ESRCH); 4987 } 4988 4989 bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type, 4990 sizeof (struct uuid)); 4991 p64.p_start = gpe[partno].efi_gpe_StartingLBA; 4992 p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1; 4993 4994 if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) { 4995 vd_efi_free(&edev, gpt, gpe); 4996 return (EFAULT); 4997 } 4998 4999 vd_efi_free(&edev, gpt, gpe); 5000 return (0); 5001 } 5002 5003 /* 5004 * Function: 5005 * vdc_dioctl_rwcmd() 5006 * 5007 * Description: 5008 * This function implements the DIOCTL_RWCMD ioctl. This ioctl is used 5009 * for DKC_DIRECT disks to read or write at an absolute disk offset. 
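 * An illustrative caller (a sketch, not part of the driver) reading
 * the first block of the disk regardless of partitioning:
 *
 *	struct dadkio_rwcmd rwcmd;
 *	char buf[DEV_BSIZE];
 *
 *	bzero(&rwcmd, sizeof (rwcmd));
 *	rwcmd.cmd = DADKIO_RWCMD_READ;
 *	rwcmd.blkaddr = 0;
 *	rwcmd.buflen = sizeof (buf);
 *	rwcmd.bufaddr = buf;
 *	(void) ioctl(fd, DIOCTL_RWCMD, &rwcmd);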
5010 * 5011 * Arguments: 5012 * dev - device 5013 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 5014 * flag - ioctl flags 5015 */ 5016 static int 5017 vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag) 5018 { 5019 struct dadkio_rwcmd32 rwcmd32; 5020 struct dadkio_rwcmd rwcmd; 5021 struct iovec aiov; 5022 struct uio auio; 5023 int rw, status; 5024 struct buf *buf; 5025 5026 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5027 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 5028 sizeof (struct dadkio_rwcmd32), flag)) { 5029 return (EFAULT); 5030 } 5031 rwcmd.cmd = rwcmd32.cmd; 5032 rwcmd.flags = rwcmd32.flags; 5033 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 5034 rwcmd.buflen = rwcmd32.buflen; 5035 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 5036 } else { 5037 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 5038 sizeof (struct dadkio_rwcmd), flag)) { 5039 return (EFAULT); 5040 } 5041 } 5042 5043 switch (rwcmd.cmd) { 5044 case DADKIO_RWCMD_READ: 5045 rw = B_READ; 5046 break; 5047 case DADKIO_RWCMD_WRITE: 5048 rw = B_WRITE; 5049 break; 5050 default: 5051 return (EINVAL); 5052 } 5053 5054 bzero((caddr_t)&aiov, sizeof (struct iovec)); 5055 aiov.iov_base = rwcmd.bufaddr; 5056 aiov.iov_len = rwcmd.buflen; 5057 5058 bzero((caddr_t)&auio, sizeof (struct uio)); 5059 auio.uio_iov = &aiov; 5060 auio.uio_iovcnt = 1; 5061 auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE; 5062 auio.uio_resid = rwcmd.buflen; 5063 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 5064 5065 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 5066 bioinit(buf); 5067 /* 5068 * We use the private field of buf to specify that this is an 5069 * I/O using an absolute offset. 5070 */ 5071 buf->b_private = (void *)VD_SLICE_NONE; 5072 5073 status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio); 5074 5075 biofini(buf); 5076 kmem_free(buf, sizeof (buf_t)); 5077 5078 return (status); 5079 } 5080 5081 /* 5082 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated 5083 * buffer is returned in alloc_len. 5084 */ 5085 static vd_scsi_t * 5086 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, 5087 int *alloc_len) 5088 { 5089 vd_scsi_t *vd_scsi; 5090 int vd_scsi_len = VD_SCSI_SIZE; 5091 5092 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); 5093 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); 5094 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); 5095 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); 5096 5097 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 5098 5099 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); 5100 5101 vd_scsi->cdb_len = cdb_len; 5102 vd_scsi->sense_len = sense_len; 5103 vd_scsi->datain_len = datain_len; 5104 vd_scsi->dataout_len = dataout_len; 5105 5106 *alloc_len = vd_scsi_len; 5107 5108 return (vd_scsi); 5109 } 5110 5111 /* 5112 * Convert the status of a SCSI command to a Solaris return code. 5113 * 5114 * Arguments: 5115 * vd_scsi - The SCSI operation buffer. 5116 * log_error - indicate if an error message should be logged. 5117 * 5118 * Note that our SCSI error messages are rather primitive for the moment 5119 * and could be improved by decoding some data like the SCSI command and 5120 * the sense key. 5121 * 5122 * Return value: 5123 * 0 - Status is good. 5124 * EACCES - Status reports a reservation conflict. 5125 * ENOTSUP - Status reports a check condition and sense key 5126 * reports an illegal request. 5127 * EIO - Any other status. 
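 * The expected caller pattern, as used by the ioctl handlers below
 * (a sketch):
 *
 *	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, ...);
 *	if (rv == 0)
 *		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);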
5128 */ 5129 static int 5130 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error) 5131 { 5132 int rv; 5133 char path_str[MAXPATHLEN]; 5134 char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN]; 5135 union scsi_cdb *cdb; 5136 struct scsi_extended_sense *sense; 5137 5138 if (vd_scsi->cmd_status == STATUS_GOOD) 5139 /* no error */ 5140 return (0); 5141 5142 /* when the tunable vdc_scsi_log_error is true we log all errors */ 5143 if (vdc_scsi_log_error) 5144 log_error = B_TRUE; 5145 5146 if (log_error) { 5147 cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x\n", 5148 ddi_pathname(vdc->dip, path_str), vdc->instance, 5149 GETCMD(VD_SCSI_DATA_CDB(vd_scsi))); 5150 } 5151 5152 /* default returned value */ 5153 rv = EIO; 5154 5155 switch (vd_scsi->cmd_status) { 5156 5157 case STATUS_CHECK: 5158 case STATUS_TERMINATED: 5159 if (log_error) 5160 cmn_err(CE_CONT, "\tCheck Condition Error\n"); 5161 5162 /* check sense buffer */ 5163 if (vd_scsi->sense_len == 0 || 5164 vd_scsi->sense_status != STATUS_GOOD) { 5165 if (log_error) 5166 cmn_err(CE_CONT, "\tNo Sense Data Available\n"); 5167 break; 5168 } 5169 5170 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5171 5172 if (log_error) { 5173 cmn_err(CE_CONT, "\tSense Key: 0x%x\n" 5174 "\tASC: 0x%x, ASCQ: 0x%x\n", 5175 scsi_sense_key((uint8_t *)sense), 5176 scsi_sense_asc((uint8_t *)sense), 5177 scsi_sense_ascq((uint8_t *)sense)); 5178 } 5179 5180 if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST) 5181 rv = ENOTSUP; 5182 break; 5183 5184 case STATUS_BUSY: 5185 if (log_error) 5186 cmn_err(CE_NOTE, "\tDevice Busy\n"); 5187 break; 5188 5189 case STATUS_RESERVATION_CONFLICT: 5190 /* 5191 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then 5192 * the reservation conflict could be due to various reasons 5193 * like incorrect keys, not registered, not reserved, etc. 5194 * So we should not panic in that case. 5195 */ 5196 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5197 if (vdc->failfast_interval != 0 && 5198 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN && 5199 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) { 5200 /* failfast is enabled so we have to panic */ 5201 (void) snprintf(panic_str, sizeof (panic_str), 5202 VDC_RESV_CONFLICT_FMT_STR "%s", 5203 ddi_pathname(vdc->dip, path_str)); 5204 panic(panic_str); 5205 } 5206 if (log_error) 5207 cmn_err(CE_NOTE, "\tReservation Conflict\n"); 5208 rv = EACCES; 5209 break; 5210 5211 case STATUS_QFULL: 5212 if (log_error) 5213 cmn_err(CE_NOTE, "\tQueue Full\n"); 5214 break; 5215 5216 case STATUS_MET: 5217 case STATUS_INTERMEDIATE: 5218 case STATUS_SCSI2: 5219 case STATUS_INTERMEDIATE_MET: 5220 case STATUS_ACA_ACTIVE: 5221 if (log_error) 5222 cmn_err(CE_CONT, 5223 "\tUnexpected SCSI status received: 0x%x\n", 5224 vd_scsi->cmd_status); 5225 break; 5226 5227 default: 5228 if (log_error) 5229 cmn_err(CE_CONT, 5230 "\tInvalid SCSI status received: 0x%x\n", 5231 vd_scsi->cmd_status); 5232 break; 5233 } 5234 5235 return (rv); 5236 } 5237 5238 /* 5239 * Implement the USCSICMD uscsi(7I) ioctl. This ioctl is converted to 5240 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI 5241 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is 5242 * converted to a VD_OP_RESET operation. 
5243 */ 5244 static int 5245 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) 5246 { 5247 struct uscsi_cmd uscsi; 5248 struct uscsi_cmd32 uscsi32; 5249 vd_scsi_t *vd_scsi; 5250 int vd_scsi_len; 5251 union scsi_cdb *cdb; 5252 struct scsi_extended_sense *sense; 5253 char *datain, *dataout; 5254 size_t cdb_len, datain_len, dataout_len, sense_len; 5255 int rv; 5256 5257 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5258 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), 5259 mode) != 0) 5260 return (EFAULT); 5261 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); 5262 } else { 5263 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), 5264 mode) != 0) 5265 return (EFAULT); 5266 } 5267 5268 /* a uscsi reset is converted to a VD_OP_RESET operation */ 5269 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | 5270 USCSI_RESET_ALL)) { 5271 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, 5272 (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); 5273 return (rv); 5274 } 5275 5276 /* cdb buffer length */ 5277 cdb_len = uscsi.uscsi_cdblen; 5278 5279 /* data in and out buffers length */ 5280 if (uscsi.uscsi_flags & USCSI_READ) { 5281 datain_len = uscsi.uscsi_buflen; 5282 dataout_len = 0; 5283 } else { 5284 datain_len = 0; 5285 dataout_len = uscsi.uscsi_buflen; 5286 } 5287 5288 /* sense buffer length */ 5289 if (uscsi.uscsi_flags & USCSI_RQENABLE) 5290 sense_len = uscsi.uscsi_rqlen; 5291 else 5292 sense_len = 0; 5293 5294 /* allocate buffer for the VD_SCSICMD_OP operation */ 5295 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5296 &vd_scsi_len); 5297 5298 /* 5299 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, 5300 * but basically they prevent a SCSI command from being retried in case 5301 * of an error. 
5302 */ 5303 if ((uscsi.uscsi_flags & USCSI_ISOLATE) || 5304 (uscsi.uscsi_flags & USCSI_DIAGNOSE)) 5305 vd_scsi->options |= VD_SCSI_OPT_NORETRY; 5306 5307 /* set task attribute */ 5308 if (uscsi.uscsi_flags & USCSI_NOTAG) { 5309 vd_scsi->task_attribute = 0; 5310 } else { 5311 if (uscsi.uscsi_flags & USCSI_HEAD) 5312 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5313 else if (uscsi.uscsi_flags & USCSI_HTAG) 5314 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; 5315 else if (uscsi.uscsi_flags & USCSI_OTAG) 5316 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; 5317 else 5318 vd_scsi->task_attribute = 0; 5319 } 5320 5321 /* set timeout */ 5322 vd_scsi->timeout = uscsi.uscsi_timeout; 5323 5324 /* copy-in cdb data */ 5325 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5326 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { 5327 rv = EFAULT; 5328 goto done; 5329 } 5330 5331 /* keep a pointer to the sense buffer */ 5332 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5333 5334 /* keep a pointer to the data-in buffer */ 5335 datain = (char *)VD_SCSI_DATA_IN(vd_scsi); 5336 5337 /* copy-in request data to the data-out buffer */ 5338 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); 5339 if (!(uscsi.uscsi_flags & USCSI_READ)) { 5340 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, 5341 mode)) { 5342 rv = EFAULT; 5343 goto done; 5344 } 5345 } 5346 5347 /* submit the request */ 5348 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5349 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5350 5351 if (rv != 0) 5352 goto done; 5353 5354 /* update scsi status */ 5355 uscsi.uscsi_status = vd_scsi->cmd_status; 5356 5357 /* update sense data */ 5358 if ((uscsi.uscsi_flags & USCSI_RQENABLE) && 5359 (uscsi.uscsi_status == STATUS_CHECK || 5360 uscsi.uscsi_status == STATUS_TERMINATED)) { 5361 5362 uscsi.uscsi_rqstatus = vd_scsi->sense_status; 5363 5364 if (uscsi.uscsi_rqstatus == STATUS_GOOD) { 5365 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - 5366 vd_scsi->sense_len; 5367 if (ddi_copyout(sense, uscsi.uscsi_rqbuf, 5368 vd_scsi->sense_len, mode) != 0) { 5369 rv = EFAULT; 5370 goto done; 5371 } 5372 } 5373 } 5374 5375 /* update request data */ 5376 if (uscsi.uscsi_status == STATUS_GOOD) { 5377 if (uscsi.uscsi_flags & USCSI_READ) { 5378 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5379 vd_scsi->datain_len; 5380 if (ddi_copyout(datain, uscsi.uscsi_bufaddr, 5381 vd_scsi->datain_len, mode) != 0) { 5382 rv = EFAULT; 5383 goto done; 5384 } 5385 } else { 5386 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5387 vd_scsi->dataout_len; 5388 } 5389 } 5390 5391 /* copy-out result */ 5392 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5393 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); 5394 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), 5395 mode) != 0) { 5396 rv = EFAULT; 5397 goto done; 5398 } 5399 } else { 5400 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), 5401 mode) != 0) { 5402 rv = EFAULT; 5403 goto done; 5404 } 5405 } 5406 5407 /* get the return code from the SCSI command status */ 5408 rv = vdc_scsi_status(vdc, vd_scsi, 5409 !(uscsi.uscsi_flags & USCSI_SILENT)); 5410 5411 done: 5412 kmem_free(vd_scsi, vd_scsi_len); 5413 return (rv); 5414 } 5415 5416 /* 5417 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. 5418 * 5419 * Arguments: 5420 * cmd - SCSI PERSISTENT IN command 5421 * len - length of the SCSI input buffer 5422 * vd_scsi_len - return the length of the allocated buffer 5423 * 5424 * Returned Value: 5425 * a pointer to the allocated VD_OP_SCSICMD buffer. 
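 * The caller owns the returned buffer and must free it with
 * kmem_free() once the operation completes; a typical sketch:
 *
 *	int len;
 *	vd_scsi_t *vd_scsi;
 *
 *	vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, size, &len);
 *	(submit VD_OP_SCSICMD, then consume VD_SCSI_DATA_IN(vd_scsi))
 *	kmem_free(vd_scsi, len);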
5426 */ 5427 static vd_scsi_t * 5428 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len) 5429 { 5430 int cdb_len, sense_len, datain_len, dataout_len; 5431 vd_scsi_t *vd_scsi; 5432 union scsi_cdb *cdb; 5433 5434 cdb_len = CDB_GROUP1; 5435 sense_len = sizeof (struct scsi_extended_sense); 5436 datain_len = len; 5437 dataout_len = 0; 5438 5439 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5440 vd_scsi_len); 5441 5442 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5443 5444 /* set cdb */ 5445 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN; 5446 cdb->cdb_opaque[1] = cmd; 5447 FORMG1COUNT(cdb, datain_len); 5448 5449 vd_scsi->timeout = vdc_scsi_timeout; 5450 5451 return (vd_scsi); 5452 } 5453 5454 /* 5455 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command. 5456 * 5457 * Arguments: 5458 * cmd - SCSI PERSISTENT OUT command 5459 * len - length of the SCSI output buffer 5460 * vd_scsi_len - return the length of the allocated buffer 5461 * 5462 * Returned Value: 5463 * a pointer to the allocated VD_OP_SCSICMD buffer. 5464 */ 5465 static vd_scsi_t * 5466 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len) 5467 { 5468 int cdb_len, sense_len, datain_len, dataout_len; 5469 vd_scsi_t *vd_scsi; 5470 union scsi_cdb *cdb; 5471 5472 cdb_len = CDB_GROUP1; 5473 sense_len = sizeof (struct scsi_extended_sense); 5474 datain_len = 0; 5475 dataout_len = len; 5476 5477 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5478 vd_scsi_len); 5479 5480 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5481 5482 /* set cdb */ 5483 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT; 5484 cdb->cdb_opaque[1] = cmd; 5485 FORMG1COUNT(cdb, dataout_len); 5486 5487 vd_scsi->timeout = vdc_scsi_timeout; 5488 5489 return (vd_scsi); 5490 } 5491 5492 /* 5493 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted 5494 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk 5495 * server with a VD_OP_SCSICMD operation. 
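 * Illustrative userland usage (a sketch; the structures are described
 * in mhd(7i)):
 *
 *	mhioc_resv_key_t keys[16];
 *	mhioc_key_list_t klist;
 *	mhioc_inkeys_t inkeys;
 *
 *	klist.listsize = 16;
 *	klist.list = keys;
 *	inkeys.li = &klist;
 *	if (ioctl(fd, MHIOCGRP_INKEYS, &inkeys) == 0)
 *		(void) printf("%d keys, generation %u\n",
 *		    klist.listlen, inkeys.generation);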
5496 */ 5497 static int 5498 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5499 { 5500 vd_scsi_t *vd_scsi; 5501 mhioc_inkeys_t inkeys; 5502 mhioc_key_list_t klist; 5503 struct mhioc_inkeys32 inkeys32; 5504 struct mhioc_key_list32 klist32; 5505 sd_prin_readkeys_t *scsi_keys; 5506 void *user_keys; 5507 int vd_scsi_len; 5508 int listsize, listlen, rv; 5509 5510 /* copyin arguments */ 5511 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5512 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5513 if (rv != 0) 5514 return (EFAULT); 5515 5516 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5517 sizeof (klist32), mode); 5518 if (rv != 0) 5519 return (EFAULT); 5520 5521 listsize = klist32.listsize; 5522 } else { 5523 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5524 if (rv != 0) 5525 return (EFAULT); 5526 5527 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5528 if (rv != 0) 5529 return (EFAULT); 5530 5531 listsize = klist.listsize; 5532 } 5533 5534 /* build SCSI VD_OP request */ 5535 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5536 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5537 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5538 5539 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5540 5541 /* submit the request */ 5542 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5543 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5544 5545 if (rv != 0) 5546 goto done; 5547 5548 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5549 5550 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5551 inkeys32.generation = scsi_keys->generation; 5552 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5553 if (rv != 0) { 5554 rv = EFAULT; 5555 goto done; 5556 } 5557 5558 klist32.listlen = listlen; 5559 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5560 sizeof (klist32), mode); 5561 if (rv != 0) { 5562 rv = EFAULT; 5563 goto done; 5564 } 5565 5566 user_keys = (caddr_t)(uintptr_t)klist32.list; 5567 } else { 5568 inkeys.generation = scsi_keys->generation; 5569 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5570 if (rv != 0) { 5571 rv = EFAULT; 5572 goto done; 5573 } 5574 5575 klist.listlen = listlen; 5576 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5577 if (rv != 0) { 5578 rv = EFAULT; 5579 goto done; 5580 } 5581 5582 user_keys = klist.list; 5583 } 5584 5585 /* copy out keys */ 5586 if (listlen > 0 && listsize > 0) { 5587 if (listsize < listlen) 5588 listlen = listsize; 5589 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5590 listlen * MHIOC_RESV_KEY_SIZE, mode); 5591 if (rv != 0) 5592 rv = EFAULT; 5593 } 5594 5595 if (rv == 0) 5596 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5597 5598 done: 5599 kmem_free(vd_scsi, vd_scsi_len); 5600 5601 return (rv); 5602 } 5603 5604 /* 5605 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5606 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5607 * the vdisk server with a VD_OP_SCSICMD operation. 
5608 */ 5609 static int 5610 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5611 { 5612 vd_scsi_t *vd_scsi; 5613 mhioc_inresvs_t inresv; 5614 mhioc_resv_desc_list_t rlist; 5615 struct mhioc_inresvs32 inresv32; 5616 struct mhioc_resv_desc_list32 rlist32; 5617 mhioc_resv_desc_t mhd_resv; 5618 sd_prin_readresv_t *scsi_resv; 5619 sd_readresv_desc_t *resv; 5620 mhioc_resv_desc_t *user_resv; 5621 int vd_scsi_len; 5622 int listsize, listlen, i, rv; 5623 5624 /* copyin arguments */ 5625 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5626 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5627 if (rv != 0) 5628 return (EFAULT); 5629 5630 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5631 sizeof (rlist32), mode); 5632 if (rv != 0) 5633 return (EFAULT); 5634 5635 listsize = rlist32.listsize; 5636 } else { 5637 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5638 if (rv != 0) 5639 return (EFAULT); 5640 5641 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5642 if (rv != 0) 5643 return (EFAULT); 5644 5645 listsize = rlist.listsize; 5646 } 5647 5648 /* build SCSI VD_OP request */ 5649 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5650 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5651 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5652 5653 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5654 5655 /* submit the request */ 5656 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5657 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5658 5659 if (rv != 0) 5660 goto done; 5661 5662 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5663 5664 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5665 inresv32.generation = scsi_resv->generation; 5666 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5667 if (rv != 0) { 5668 rv = EFAULT; 5669 goto done; 5670 } 5671 5672 rlist32.listlen = listlen; 5673 rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, 5674 sizeof (rlist32), mode); 5675 if (rv != 0) { 5676 rv = EFAULT; 5677 goto done; 5678 } 5679 5680 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 5681 } else { 5682 inresv.generation = scsi_resv->generation; 5683 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 5684 if (rv != 0) { 5685 rv = EFAULT; 5686 goto done; 5687 } 5688 5689 rlist.listlen = listlen; 5690 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 5691 if (rv != 0) { 5692 rv = EFAULT; 5693 goto done; 5694 } 5695 5696 user_resv = rlist.list; 5697 } 5698 5699 /* copy out reservations */ 5700 if (listsize > 0 && listlen > 0) { 5701 if (listsize < listlen) 5702 listlen = listsize; 5703 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 5704 5705 for (i = 0; i < listlen; i++) { 5706 mhd_resv.type = resv->type; 5707 mhd_resv.scope = resv->scope; 5708 mhd_resv.scope_specific_addr = 5709 BE_32(resv->scope_specific_addr); 5710 bcopy(&resv->resvkey, &mhd_resv.key, 5711 MHIOC_RESV_KEY_SIZE); 5712 5713 rv = ddi_copyout(&mhd_resv, user_resv, 5714 sizeof (mhd_resv), mode); 5715 if (rv != 0) { 5716 rv = EFAULT; 5717 goto done; 5718 } 5719 resv++; 5720 user_resv++; 5721 } 5722 } 5723 5724 if (rv == 0) 5725 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5726 5727 done: 5728 kmem_free(vd_scsi, vd_scsi_len); 5729 return (rv); 5730 } 5731 5732 /* 5733 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 5734 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 5735 * server with a VD_OP_SCSICMD operation. 
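 * Illustrative registration of a new key (a sketch; mykey stands for
 * a caller-supplied 8-byte key, and an all-zero oldkey registers a
 * key where none was held before):
 *
 *	mhioc_register_t reg;
 *
 *	bzero(&reg, sizeof (reg));
 *	bcopy(mykey, reg.newkey.key, MHIOC_RESV_KEY_SIZE);
 *	(void) ioctl(fd, MHIOCGRP_REGISTER, &reg);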
5736 */ 5737 static int 5738 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 5739 { 5740 vd_scsi_t *vd_scsi; 5741 sd_prout_t *scsi_prout; 5742 mhioc_register_t mhd_reg; 5743 int vd_scsi_len, rv; 5744 5745 /* copyin arguments */ 5746 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 5747 if (rv != 0) 5748 return (EFAULT); 5749 5750 /* build SCSI VD_OP request */ 5751 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 5752 sizeof (sd_prout_t), &vd_scsi_len); 5753 5754 /* set parameters */ 5755 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5756 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 5757 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 5758 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 5759 5760 /* submit the request */ 5761 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5762 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5763 5764 if (rv == 0) 5765 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5766 5767 kmem_free(vd_scsi, vd_scsi_len); 5768 return (rv); 5769 } 5770 5771 /* 5772 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 5773 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 5774 * server with a VD_OP_SCSICMD operation. 5775 */ 5776 static int 5777 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 5778 { 5779 union scsi_cdb *cdb; 5780 vd_scsi_t *vd_scsi; 5781 sd_prout_t *scsi_prout; 5782 mhioc_resv_desc_t mhd_resv; 5783 int vd_scsi_len, rv; 5784 5785 /* copyin arguments */ 5786 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 5787 if (rv != 0) 5788 return (EFAULT); 5789 5790 /* build SCSI VD_OP request */ 5791 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 5792 sizeof (sd_prout_t), &vd_scsi_len); 5793 5794 /* set parameters */ 5795 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5796 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5797 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 5798 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 5799 cdb->cdb_opaque[2] = mhd_resv.type; 5800 5801 /* submit the request */ 5802 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5803 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5804 5805 if (rv == 0) 5806 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5807 5808 kmem_free(vd_scsi, vd_scsi_len); 5809 return (rv); 5810 } 5811 5812 /* 5813 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 5814 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 5815 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
5816 */ 5817 static int 5818 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode) 5819 { 5820 union scsi_cdb *cdb; 5821 vd_scsi_t *vd_scsi; 5822 sd_prout_t *scsi_prout; 5823 mhioc_preemptandabort_t mhd_preempt; 5824 int vd_scsi_len, rv; 5825 5826 /* copyin arguments */ 5827 rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode); 5828 if (rv != 0) 5829 return (EFAULT); 5830 5831 /* build SCSI VD_OP request */ 5832 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT, 5833 sizeof (sd_prout_t), &vd_scsi_len); 5834 5835 /* set parameters */ 5836 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5837 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5838 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5839 bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key, 5840 MHIOC_RESV_KEY_SIZE); 5841 bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key, 5842 MHIOC_RESV_KEY_SIZE); 5843 scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr; 5844 cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type; 5845 5846 /* submit the request */ 5847 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5848 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5849 5850 if (rv == 0) 5851 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5852 5853 kmem_free(vd_scsi, vd_scsi_len); 5854 return (rv); 5855 } 5856 5857 /* 5858 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl 5859 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY 5860 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation. 5861 */ 5862 static int 5863 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode) 5864 { 5865 vd_scsi_t *vd_scsi; 5866 sd_prout_t *scsi_prout; 5867 mhioc_registerandignorekey_t mhd_regi; 5868 int vd_scsi_len, rv; 5869 5870 /* copyin arguments */ 5871 rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode); 5872 if (rv != 0) 5873 return (EFAULT); 5874 5875 /* build SCSI VD_OP request */ 5876 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY, 5877 sizeof (sd_prout_t), &vd_scsi_len); 5878 5879 /* set parameters */ 5880 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5881 bcopy(mhd_regi.newkey.key, scsi_prout->service_key, 5882 MHIOC_RESV_KEY_SIZE); 5883 scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl; 5884 5885 /* submit the request */ 5886 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5887 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5888 5889 if (rv == 0) 5890 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5891 5892 kmem_free(vd_scsi, vd_scsi_len); 5893 return (rv); 5894 } 5895 5896 /* 5897 * This function is used by the failfast mechanism to send a SCSI command 5898 * to check for reservation conflict. 5899 */ 5900 static int 5901 vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) 5902 { 5903 int cdb_len, sense_len, vd_scsi_len; 5904 vd_scsi_t *vd_scsi; 5905 union scsi_cdb *cdb; 5906 int rv; 5907 5908 ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1); 5909 5910 if (scmd == SCMD_WRITE_G1) 5911 cdb_len = CDB_GROUP1; 5912 else 5913 cdb_len = CDB_GROUP0; 5914 5915 sense_len = sizeof (struct scsi_extended_sense); 5916 5917 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len); 5918 5919 /* set cdb */ 5920 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5921 cdb->scc_cmd = scmd; 5922 5923 vd_scsi->timeout = vdc_scsi_timeout; 5924 5925 /* 5926 * Submit the request. 
The last argument has to be B_FALSE so that 5927 * vdc_do_sync_op does not loop checking for reservation conflict if 5928 * the operation returns an error. 5929 */ 5930 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5931 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE); 5932 5933 if (rv == 0) 5934 (void) vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5935 5936 kmem_free(vd_scsi, vd_scsi_len); 5937 return (rv); 5938 } 5939 5940 /* 5941 * This function is used by the failfast mechanism to check for reservation 5942 * conflict. It sends some SCSI commands which will fail with a reservation 5943 * conflict error if the system does not have access to the disk and this 5944 * will panic the system. 5945 * 5946 * Returned Code: 5947 * 0 - disk is accessible without reservation conflict error 5948 * != 0 - unable to check if disk is accessible 5949 */ 5950 int 5951 vdc_failfast_check_resv(vdc_t *vdc) 5952 { 5953 int failure = 0; 5954 5955 /* 5956 * Send a TEST UNIT READY command. The command will panic 5957 * the system if it fails with a reservation conflict. 5958 */ 5959 if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0) 5960 failure++; 5961 5962 /* 5963 * With SPC-3 compliant devices TEST UNIT READY will succeed on 5964 * a reserved device, so we also do a WRITE(10) of zero byte in 5965 * order to provoke a Reservation Conflict status on those newer 5966 * devices. 5967 */ 5968 if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0) 5969 failure++; 5970 5971 return (failure); 5972 } 5973 5974 /* 5975 * Add a pending I/O to the failfast I/O queue. An I/O is added to this 5976 * queue when it has failed and failfast is enabled. Then we have to check 5977 * if it has failed because of a reservation conflict in which case we have 5978 * to panic the system. 5979 * 5980 * Async I/O should be queued with their block I/O data transfer structure 5981 * (buf). Sync I/O should be queued with buf = NULL. 5982 */ 5983 static vdc_io_t * 5984 vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf) 5985 { 5986 vdc_io_t *vio; 5987 5988 ASSERT(MUTEX_HELD(&vdc->lock)); 5989 5990 vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP); 5991 vio->vio_next = vdc->failfast_io_queue; 5992 vio->vio_buf = buf; 5993 vio->vio_qtime = ddi_get_lbolt(); 5994 5995 vdc->failfast_io_queue = vio; 5996 5997 /* notify the failfast thread that a new I/O is queued */ 5998 cv_signal(&vdc->failfast_cv); 5999 6000 return (vio); 6001 } 6002 6003 /* 6004 * Remove and complete I/O in the failfast I/O queue which have been 6005 * added after the indicated deadline. A deadline of 0 means that all 6006 * I/O have to be unqueued and marked as completed. 6007 */ 6008 static void 6009 vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline) 6010 { 6011 vdc_io_t *vio, *vio_tmp; 6012 6013 ASSERT(MUTEX_HELD(&vdc->lock)); 6014 6015 vio_tmp = NULL; 6016 vio = vdc->failfast_io_queue; 6017 6018 if (deadline != 0) { 6019 /* 6020 * Skip any io queued after the deadline. The failfast 6021 * I/O queue is ordered starting with the last I/O added 6022 * to the queue. 6023 */ 6024 while (vio != NULL && vio->vio_qtime > deadline) { 6025 vio_tmp = vio; 6026 vio = vio->vio_next; 6027 } 6028 } 6029 6030 if (vio == NULL) 6031 /* nothing to unqueue */ 6032 return; 6033 6034 /* update the queue */ 6035 if (vio_tmp == NULL) 6036 vdc->failfast_io_queue = NULL; 6037 else 6038 vio_tmp->vio_next = NULL; 6039 6040 /* 6041 * Complete unqueued I/O. 
Async I/O have a block I/O data transfer 6042 * structure (buf) and they are completed by calling biodone(). Sync 6043 * I/O do not have a buf and they are completed by setting the 6044 * vio_qtime to zero and signaling failfast_io_cv. In that case, the 6045 * thread waiting for the I/O to complete is responsible for freeing 6046 * the vio structure. 6047 */ 6048 while (vio != NULL) { 6049 vio_tmp = vio->vio_next; 6050 if (vio->vio_buf != NULL) { 6051 VD_KSTAT_RUNQ_EXIT(vdc->io_stats); 6052 DTRACE_IO1(done, buf_t *, vio->vio_buf); 6053 biodone(vio->vio_buf); 6054 kmem_free(vio, sizeof (vdc_io_t)); 6055 } else { 6056 vio->vio_qtime = 0; 6057 } 6058 vio = vio_tmp; 6059 } 6060 6061 cv_broadcast(&vdc->failfast_io_cv); 6062 } 6063 6064 /* 6065 * Failfast Thread. 6066 * 6067 * While failfast is enabled, the failfast thread sends TEST UNIT READY 6068 * and zero-size WRITE(10) SCSI commands on a regular basis to check that 6069 * we still have access to the disk. If a command fails with a RESERVATION 6070 * CONFLICT error then the system will immediately panic. 6071 * 6072 * The failfast thread is also woken up when an I/O has failed. It then checks 6073 * the access to the disk to ensure that the I/O failure was not due to a 6074 * reservation conflict. 6075 * 6076 * There is one failfast thread for each virtual disk for which failfast is 6077 * enabled. We could have only one thread sending requests for all disks but 6078 * this would need vdc to send asynchronous requests and to have callbacks to 6079 * process replies. 6080 */ 6081 static void 6082 vdc_failfast_thread(void *arg) 6083 { 6084 int status; 6085 vdc_t *vdc = (vdc_t *)arg; 6086 clock_t timeout, starttime; 6087 6088 mutex_enter(&vdc->lock); 6089 6090 while (vdc->failfast_interval != 0) { 6091 6092 starttime = ddi_get_lbolt(); 6093 6094 mutex_exit(&vdc->lock); 6095 6096 /* check for reservation conflict */ 6097 status = vdc_failfast_check_resv(vdc); 6098 6099 mutex_enter(&vdc->lock); 6100 /* 6101 * We have dropped the lock to send the SCSI command so we have 6102 * to check that failfast is still enabled. 6103 */ 6104 if (vdc->failfast_interval == 0) 6105 break; 6106 6107 /* 6108 * If we have successfully checked the disk access and there was 6109 * no reservation conflict then we can complete any I/O queued 6110 * before the last check. 6111 */ 6112 if (status == 0) 6113 vdc_failfast_io_unqueue(vdc, starttime); 6114 6115 /* proceed again if some I/O are still in the queue */ 6116 if (vdc->failfast_io_queue != NULL) 6117 continue; 6118 6119 timeout = ddi_get_lbolt() + 6120 drv_usectohz(vdc->failfast_interval); 6121 (void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout); 6122 } 6123 6124 /* 6125 * Failfast is being stopped, so we can complete any queued I/O. 6126 */ 6127 vdc_failfast_io_unqueue(vdc, 0); 6128 vdc->failfast_thread = NULL; 6129 mutex_exit(&vdc->lock); 6130 thread_exit(); 6131 } 6132 6133 /* 6134 * Implement the MHIOCENFAILFAST mhd(7i) ioctl. 
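 * The argument is the probing interval in milliseconds, 0 disabling
 * failfast; an illustrative sketch:
 *
 *	unsigned int mh_time = 1000;
 *
 *	(void) ioctl(fd, MHIOCENFAILFAST, &mh_time);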
/*
 * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
 */
static int
vdc_failfast(vdc_t *vdc, caddr_t arg, int mode)
{
	unsigned int mh_time;

	if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode))
		return (EFAULT);

	mutex_enter(&vdc->lock);
	if (mh_time != 0 && vdc->failfast_thread == NULL) {
		vdc->failfast_thread = thread_create(NULL, 0,
		    vdc_failfast_thread, vdc, 0, &p0, TS_RUN,
		    v.v_maxsyspri - 2);
	}

	vdc->failfast_interval = mh_time * 1000;
	cv_signal(&vdc->failfast_cv);
	mutex_exit(&vdc->lock);

	return (0);
}

/*
 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are
 * converted to VD_OP_SET_ACCESS operations.
 */
static int
vdc_access_set(vdc_t *vdc, uint64_t flags, int mode)
{
	int rv;

	/* submit ownership command request */
	rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags,
	    sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
	    VIO_both_dir, B_TRUE);

	return (rv);
}

/*
 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a
 * VD_OP_GET_ACCESS operation.
 */
static int
vdc_access_get(vdc_t *vdc, uint64_t *status, int mode)
{
	int rv;

	/* submit ownership command request */
	rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status,
	    sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
	    VIO_both_dir, B_TRUE);

	return (rv);
}
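/*
 * Illustrative sketch (excluded from the build): how an application would
 * drive the mhd(7i) ioctls implemented above. The device path is a
 * placeholder; MHIOCSTATUS reports "access allowed" through the ioctl
 * return value (0 = allowed, 1 = conflict), per the *rvalp convention in
 * vd_process_ioctl() below.
 */
#if 0
#include <sys/mhd.h>
#include <fcntl.h>
#include <unistd.h>

static int
take_ownership_example(void)
{
	int fd, status;
	int mh_time = 1000;	/* failfast probe interval, in milliseconds */

	fd = open("/dev/rdsk/c0d0s2", O_RDWR);	/* placeholder path */
	if (fd < 0)
		return (-1);

	/* take exclusive access; converted to VD_OP_SET_ACCESS by vdc */
	if (ioctl(fd, MHIOCTKOWN, NULL) != 0)
		return (-1);

	/* 0 means access is currently allowed */
	status = ioctl(fd, MHIOCSTATUS, NULL);

	/* enable failfast polling; an mh_time of 0 would disable it */
	(void) ioctl(fd, MHIOCENFAILFAST, &mh_time);

	(void) close(fd);
	return (status);
}
#endif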
/*
 * Disk Ownership Thread.
 *
 * When we have taken the ownership of a disk, this thread waits to be
 * notified when the LDC channel is reset so that it can recover the
 * ownership.
 *
 * Note that the thread handling the LDC reset (vdc_process_msg_thread())
 * cannot be used to do the ownership recovery because it has to be
 * running to handle the reply message to the ownership operation.
 */
static void
vdc_ownership_thread(void *arg)
{
	vdc_t *vdc = (vdc_t *)arg;
	clock_t timeout;
	uint64_t status;

	mutex_enter(&vdc->ownership_lock);
	mutex_enter(&vdc->lock);

	while (vdc->ownership & VDC_OWNERSHIP_WANTED) {

		if ((vdc->ownership & VDC_OWNERSHIP_RESET) ||
		    !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) {
			/*
			 * There was a reset so the ownership has been lost,
			 * try to recover. We do this without using the preempt
			 * option so that we don't steal the ownership from
			 * someone who has preempted us.
			 */
			DMSG(vdc, 0, "[%d] Ownership lost, recovering",
			    vdc->instance);

			vdc->ownership &= ~(VDC_OWNERSHIP_RESET |
			    VDC_OWNERSHIP_GRANTED);

			mutex_exit(&vdc->lock);

			status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
			    VD_ACCESS_SET_PRESERVE, FKIOCTL);

			mutex_enter(&vdc->lock);

			if (status == 0) {
				DMSG(vdc, 0, "[%d] Ownership recovered",
				    vdc->instance);
				vdc->ownership |= VDC_OWNERSHIP_GRANTED;
			} else {
				DMSG(vdc, 0,
				    "[%d] Failed to recover ownership",
				    vdc->instance);
			}
		}

		/*
		 * If we have the ownership then we just wait for an event
		 * to happen (LDC reset), otherwise we will retry the
		 * recovery after a delay.
		 */
		if (vdc->ownership & VDC_OWNERSHIP_GRANTED)
			timeout = 0;
		else
			timeout = ddi_get_lbolt() +
			    drv_usectohz(vdc_ownership_delay);

		/* Release the ownership_lock and wait on the vdc lock */
		mutex_exit(&vdc->ownership_lock);

		if (timeout == 0)
			(void) cv_wait(&vdc->ownership_cv, &vdc->lock);
		else
			(void) cv_timedwait(&vdc->ownership_cv,
			    &vdc->lock, timeout);

		mutex_exit(&vdc->lock);

		mutex_enter(&vdc->ownership_lock);
		mutex_enter(&vdc->lock);
	}

	vdc->ownership_thread = NULL;
	mutex_exit(&vdc->lock);
	mutex_exit(&vdc->ownership_lock);

	thread_exit();
}

static void
vdc_ownership_update(vdc_t *vdc, int ownership_flags)
{
	ASSERT(MUTEX_HELD(&vdc->ownership_lock));

	mutex_enter(&vdc->lock);
	vdc->ownership = ownership_flags;
	if ((vdc->ownership & VDC_OWNERSHIP_WANTED) &&
	    vdc->ownership_thread == NULL) {
		/* start ownership thread */
		vdc->ownership_thread = thread_create(NULL, 0,
		    vdc_ownership_thread, vdc, 0, &p0, TS_RUN,
		    v.v_maxsyspri - 2);
	} else {
		/* notify the ownership thread */
		cv_signal(&vdc->ownership_cv);
	}
	mutex_exit(&vdc->lock);
}

/*
 * Get the size and the block size of a virtual disk from the vdisk server.
 * We need to use this operation when the vdisk_size attribute was not
 * available during the handshake with the vdisk server.
 */
static int
vdc_check_capacity(vdc_t *vdc)
{
	int rv = 0;
	size_t alloc_len;
	vd_capacity_t *vd_cap;

	if (vdc->vdisk_size != 0)
		return (0);

	alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t));

	vd_cap = kmem_zalloc(alloc_len, KM_SLEEP);

	rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE);

	if (rv == 0) {
		if (vd_cap->vdisk_block_size != vdc->block_size ||
		    vd_cap->vdisk_size == VD_SIZE_UNKNOWN ||
		    vd_cap->vdisk_size == 0)
			rv = EINVAL;
		else
			vdc->vdisk_size = vd_cap->vdisk_size;
	}

	kmem_free(vd_cap, alloc_len);
	return (rv);
}
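/*
 * Illustrative sketch (excluded from the build): the 8-byte alignment rule
 * applied by vdc_check_capacity() above and by the other synchronous
 * operations in this file. P2ROUNDUP(len, align) rounds len up to the next
 * multiple of align, e.g.:
 *
 *	P2ROUNDUP(1, 8)  == 8
 *	P2ROUNDUP(8, 8)  == 8
 *	P2ROUNDUP(13, 8) == 16
 *
 * Any message buffer handed to vdc_do_sync_op() is sized this way because
 * LDC requires the memory being mapped to be 8-byte aligned.
 */
#if 0
static size_t
vdc_ldc_buf_len_example(size_t payload_len)
{
	return (P2ROUNDUP(payload_len, sizeof (uint64_t)));
}
#endif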
/*
 * This structure is used in the DKIO(7I) array below.
 */
typedef struct vdc_dk_ioctl {
	uint8_t		op;		/* VD_OP_XXX value */
	int		cmd;		/* Solaris ioctl operation number */
	size_t		nbytes;		/* size of structure to be copied */

	/* function to convert between vDisk and Solaris structure formats */
	int	(*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg,
	    int mode, int dir);
} vdc_dk_ioctl_t;

/*
 * Subset of DKIO(7I) operations currently supported
 */
static vdc_dk_ioctl_t	dk_ioctl[] = {
	{VD_OP_FLUSH,		DKIOCFLUSHWRITECACHE,	0,
		vdc_null_copy_func},
	{VD_OP_GET_WCE,		DKIOCGETWCE,		sizeof (int),
		vdc_get_wce_convert},
	{VD_OP_SET_WCE,		DKIOCSETWCE,		sizeof (int),
		vdc_set_wce_convert},
	{VD_OP_GET_VTOC,	DKIOCGVTOC,		sizeof (vd_vtoc_t),
		vdc_get_vtoc_convert},
	{VD_OP_SET_VTOC,	DKIOCSVTOC,		sizeof (vd_vtoc_t),
		vdc_set_vtoc_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCGGEOM,		sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCG_PHYGEOM,		sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCG_VIRTGEOM,	sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_SET_DISKGEOM,	DKIOCSGEOM,		sizeof (vd_geom_t),
		vdc_set_geom_convert},
	{VD_OP_GET_EFI,		DKIOCGETEFI,		0,
		vdc_get_efi_convert},
	{VD_OP_SET_EFI,		DKIOCSETEFI,		0,
		vdc_set_efi_convert},

	/* DIOCTL_RWCMD is converted to a read or a write */
	{0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL},

	/* mhd(7I) non-shared multihost disks ioctls */
	{0, MHIOCTKOWN,				0, vdc_null_copy_func},
	{0, MHIOCRELEASE,			0, vdc_null_copy_func},
	{0, MHIOCSTATUS,			0, vdc_null_copy_func},
	{0, MHIOCQRESERVE,			0, vdc_null_copy_func},

	/* mhd(7I) shared multihost disks ioctls */
	{0, MHIOCGRP_INKEYS,			0, vdc_null_copy_func},
	{0, MHIOCGRP_INRESV,			0, vdc_null_copy_func},
	{0, MHIOCGRP_REGISTER,			0, vdc_null_copy_func},
	{0, MHIOCGRP_RESERVE,			0, vdc_null_copy_func},
	{0, MHIOCGRP_PREEMPTANDABORT,		0, vdc_null_copy_func},
	{0, MHIOCGRP_REGISTERANDIGNOREKEY,	0, vdc_null_copy_func},

	/* mhd(7I) failfast ioctl */
	{0, MHIOCENFAILFAST,			0, vdc_null_copy_func},

	/*
	 * These particular ioctls are not sent to the server - vdc fakes up
	 * the necessary info.
	 */
	{0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func},
	{0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func},
	{0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func},
	{0, DKIOCPARTITION, 0, vdc_null_copy_func},
	{0, DKIOCGAPART, 0, vdc_null_copy_func},
	{0, DKIOCREMOVABLE, 0, vdc_null_copy_func},
	{0, CDROMREADOFFSET, 0, vdc_null_copy_func}
};
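/*
 * Illustrative sketch (excluded from the build): how the dk_ioctl[] table
 * above is consumed. vd_process_ioctl() below does a linear scan keyed on
 * the Solaris ioctl number; the matching entry supplies the vDisk operation,
 * the transfer size and the conversion callback. A hypothetical lookup
 * helper would look like this:
 */
#if 0
static vdc_dk_ioctl_t *
vdc_dk_ioctl_lookup_example(int cmd)
{
	size_t nioctls = sizeof (dk_ioctl) / sizeof (dk_ioctl[0]);
	size_t idx;

	for (idx = 0; idx < nioctls; idx++) {
		if (cmd == dk_ioctl[idx].cmd)
			return (&dk_ioctl[idx]);
	}

	return (NULL);	/* ioctl not supported */
}
#endif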
/*
 * This function handles ioctl requests from the vd_efi_alloc_and_read()
 * function and forwards them to the vdisk.
 */
static int
vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg)
{
	vdc_t *vdc = (vdc_t *)vdisk;
	dev_t dev;
	int rval;

	dev = makedevice(ddi_driver_major(vdc->dip),
	    VD_MAKE_DEV(vdc->instance, 0));

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval));
}

/*
 * Function:
 *	vd_process_ioctl()
 *
 * Description:
 *	This routine processes disk-specific ioctl calls
 *
 * Arguments:
 *	dev	- the device number
 *	cmd	- the operation [dkio(7I)] to be processed
 *	arg	- pointer to user provided structure
 *		  (contains data to be set or reference parameter for get)
 *	mode	- bit flag, indicating open settings, 32/64 bit type, etc
 *	rvalp	- pointer to return value for calling process.
 *
 * Return Code:
 *	0
 *	EFAULT
 *	ENXIO
 *	EIO
 *	ENOTSUP
 */
static int
vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp)
{
	int		instance = VDCUNIT(dev);
	vdc_t		*vdc = NULL;
	int		rv = -1;
	int		idx = 0;	/* index into dk_ioctl[] */
	size_t		len = 0;	/* #bytes to send to vds */
	size_t		alloc_len = 0;	/* #bytes to allocate mem for */
	caddr_t		mem_p = NULL;
	size_t		nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0]));
	vdc_dk_ioctl_t	*iop;

	vdc = ddi_get_soft_state(vdc_state, instance);
	if (vdc == NULL) {
		cmn_err(CE_NOTE, "![%d] Could not get soft state structure",
		    instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n",
	    instance, cmd, dev, ddi_model_convert_from(mode & FMODELS));

	if (rvalp != NULL) {
		/* the return value of the ioctl is 0 by default */
		*rvalp = 0;
	}

	/*
	 * Validate the ioctl operation to be performed.
	 *
	 * If we have looped through the array without finding a match then we
	 * don't support this ioctl.
	 */
	for (idx = 0; idx < nioctls; idx++) {
		if (cmd == dk_ioctl[idx].cmd)
			break;
	}

	if (idx >= nioctls) {
		DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n",
		    vdc->instance, cmd);
		return (ENOTSUP);
	}

	iop = &(dk_ioctl[idx]);

	if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) {
		/* size is not fixed for EFI ioctls, it depends on ioctl arg */
		dk_efi_t	dk_efi;

		rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length;
	} else {
		len = iop->nbytes;
	}

	/* check if the ioctl is applicable */
	switch (cmd) {
	case CDROMREADOFFSET:
	case DKIOCREMOVABLE:
		return (ENOTTY);

	case USCSICMD:
	case MHIOCTKOWN:
	case MHIOCSTATUS:
	case MHIOCQRESERVE:
	case MHIOCRELEASE:
	case MHIOCGRP_INKEYS:
	case MHIOCGRP_INRESV:
	case MHIOCGRP_REGISTER:
	case MHIOCGRP_RESERVE:
	case MHIOCGRP_PREEMPTANDABORT:
	case MHIOCGRP_REGISTERANDIGNOREKEY:
	case MHIOCENFAILFAST:
		if (vdc->cinfo == NULL)
			return (ENXIO);
		if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS)
			return (ENOTTY);
		break;

	case DIOCTL_RWCMD:
		if (vdc->cinfo == NULL)
			return (ENXIO);
		if (vdc->cinfo->dki_ctype != DKC_DIRECT)
			return (ENOTTY);
		break;

	case DKIOCINFO:
		if (vdc->cinfo == NULL)
			return (ENXIO);
		break;

	case DKIOCGMEDIAINFO:
		if (vdc->minfo == NULL)
			return (ENXIO);
		if (vdc_check_capacity(vdc) != 0)
			/* disk capacity is not available */
			return (EIO);
		break;
	}

	/*
	 * Deal with ioctls which require processing different from
	 * converting ioctl arguments and sending a corresponding
	 * VD operation.
	 */
	switch (cmd) {

	case USCSICMD:
	{
		return (vdc_uscsi_cmd(vdc, arg, mode));
	}

	case MHIOCTKOWN:
	{
		mutex_enter(&vdc->ownership_lock);
		/*
		 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership
		 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset
		 * while we are processing the ioctl.
		 */
		vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED);

		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
		    VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED |
			    VDC_OWNERSHIP_GRANTED);
		} else {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
		mutex_exit(&vdc->ownership_lock);
		return (rv);
	}

	case MHIOCRELEASE:
	{
		mutex_enter(&vdc->ownership_lock);
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
		mutex_exit(&vdc->ownership_lock);
		return (rv);
	}

	case MHIOCSTATUS:
	{
		uint64_t status;

		rv = vdc_access_get(vdc, &status, mode);
		if (rv == 0 && rvalp != NULL)
			*rvalp = (status & VD_ACCESS_ALLOWED) ? 0 : 1;
		return (rv);
	}
	case MHIOCQRESERVE:
	{
		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode);
		return (rv);
	}

	case MHIOCGRP_INKEYS:
	{
		return (vdc_mhd_inkeys(vdc, arg, mode));
	}

	case MHIOCGRP_INRESV:
	{
		return (vdc_mhd_inresv(vdc, arg, mode));
	}

	case MHIOCGRP_REGISTER:
	{
		return (vdc_mhd_register(vdc, arg, mode));
	}

	case MHIOCGRP_RESERVE:
	{
		return (vdc_mhd_reserve(vdc, arg, mode));
	}

	case MHIOCGRP_PREEMPTANDABORT:
	{
		return (vdc_mhd_preemptabort(vdc, arg, mode));
	}

	case MHIOCGRP_REGISTERANDIGNOREKEY:
	{
		return (vdc_mhd_registerignore(vdc, arg, mode));
	}

	case MHIOCENFAILFAST:
	{
		rv = vdc_failfast(vdc, arg, mode);
		return (rv);
	}

	case DIOCTL_RWCMD:
	{
		return (vdc_dioctl_rwcmd(dev, arg, mode));
	}

	case DKIOCGAPART:
	{
		return (vdc_dkio_gapart(vdc, arg, mode));
	}

	case DKIOCPARTITION:
	{
		return (vdc_dkio_partition(vdc, arg, mode));
	}

	case DKIOCINFO:
	{
		struct dk_cinfo cinfo;

		bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo));
		cinfo.dki_partition = VDCPART(dev);

		rv = ddi_copyout(&cinfo, (void *)arg,
		    sizeof (struct dk_cinfo), mode);
		if (rv != 0)
			return (EFAULT);

		return (0);
	}

	case DKIOCGMEDIAINFO:
	{
		ASSERT(vdc->vdisk_size != 0);
		if (vdc->minfo->dki_capacity == 0)
			vdc->minfo->dki_capacity = vdc->vdisk_size;
		rv = ddi_copyout(vdc->minfo, (void *)arg,
		    sizeof (struct dk_minfo), mode);
		if (rv != 0)
			return (EFAULT);

		return (0);
	}

	case DKIOCFLUSHWRITECACHE:
	{
		struct dk_callback *dkc =
		    (struct dk_callback *)(uintptr_t)arg;
		vdc_dk_arg_t	*dkarg = NULL;

		DMSG(vdc, 1, "[%d] Flush W$: mode %x\n",
		    instance, mode);

		/*
		 * If arg is NULL, then there is no callback function
		 * registered and the call operates synchronously; we
		 * break and continue with the rest of the function and
		 * wait for vds to return (i.e. after the request to
		 * vds returns successfully, all writes completed prior
		 * to the ioctl will have been flushed from the disk
		 * write cache to persistent media).
		 *
		 * If a callback function is registered, we dispatch
		 * the request on a task queue and return immediately.
		 * The callback will deal with informing the calling
		 * thread that the flush request is completed.
		 */
		if (dkc == NULL)
			break;

		/*
		 * the asynchronous callback is only supported if
		 * invoked from within the kernel
		 */
		if ((mode & FKIOCTL) == 0)
			return (ENOTSUP);

		dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP);

		dkarg->mode = mode;
		dkarg->dev = dev;
		bcopy(dkc, &dkarg->dkc, sizeof (*dkc));

		mutex_enter(&vdc->lock);
		vdc->dkio_flush_pending++;
		dkarg->vdc = vdc;
		mutex_exit(&vdc->lock);

		/* put the request on a task queue */
		rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb,
		    (void *)dkarg, DDI_SLEEP);
		if (rv == NULL) {
			/* clean up if dispatch fails */
			mutex_enter(&vdc->lock);
			vdc->dkio_flush_pending--;
			mutex_exit(&vdc->lock);
			kmem_free(dkarg, sizeof (vdc_dk_arg_t));
		}

		return (rv == NULL ? ENOMEM : 0);
	}
	}
	/* catch programming error in vdc - should be a VD_OP_XXX ioctl */
	ASSERT(iop->op != 0);

	/* check if the vDisk server handles the operation for this vDisk */
	if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) {
		DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n",
		    vdc->instance, iop->op);
		return (ENOTSUP);
	}

	/* LDC requires that the memory being mapped is 8-byte aligned */
	alloc_len = P2ROUNDUP(len, sizeof (uint64_t));
	DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n",
	    instance, len, alloc_len);

	if (alloc_len > 0)
		mem_p = kmem_zalloc(alloc_len, KM_SLEEP);

	/*
	 * Call the conversion function for this ioctl which, if necessary,
	 * converts from the Solaris format to the format ARC'ed
	 * as part of the vDisk protocol (FWARC 2006/195)
	 */
	ASSERT(iop->convert != NULL);
	rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN);
	if (rv != 0) {
		DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
		    instance, rv, cmd);
		if (mem_p != NULL)
			kmem_free(mem_p, alloc_len);
		return (rv);
	}

	/*
	 * send request to vds to service the ioctl.
	 */
	rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len,
	    VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode,
	    VIO_both_dir, B_TRUE);

	if (rv != 0) {
		/*
		 * This is not necessarily an error. The ioctl could
		 * be returning a value such as ENOTTY to indicate
		 * that the ioctl is not applicable.
		 */
		DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n",
		    instance, rv, cmd);
		if (mem_p != NULL)
			kmem_free(mem_p, alloc_len);

		return (rv);
	}
	/*
	 * Call the conversion function (if it exists) for this ioctl
	 * which converts from the format ARC'ed as part of the vDisk
	 * protocol (FWARC 2006/195) back to a format understood by
	 * the rest of Solaris.
	 */
	rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT);
	if (rv != 0) {
		DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
		    instance, rv, cmd);
		if (mem_p != NULL)
			kmem_free(mem_p, alloc_len);
		return (rv);
	}

	if (mem_p != NULL)
		kmem_free(mem_p, alloc_len);

	return (rv);
}

/*
 * Function:
 *	vdc_null_copy_func()
 *
 * Description:
 *	This is an empty conversion function used by ioctl calls which
 *	do not need to convert the data being passed in/out to userland
 */
static int
vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))
	_NOTE(ARGUNUSED(from))
	_NOTE(ARGUNUSED(to))
	_NOTE(ARGUNUSED(mode))
	_NOTE(ARGUNUSED(dir))

	return (0);
}

static int
vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	if (dir == VD_COPYIN)
		return (0);	/* nothing to do */

	if (ddi_copyout(from, to, sizeof (int), mode) != 0)
		return (EFAULT);

	return (0);
}

static int
vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	if (dir == VD_COPYOUT)
		return (0);	/* nothing to do */

	if (ddi_copyin(from, to, sizeof (int), mode) != 0)
		return (EFAULT);

	return (0);
}
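/*
 * Illustrative sketch (excluded from the build): a kernel caller using the
 * asynchronous flavor of DKIOCFLUSHWRITECACHE handled in vd_process_ioctl()
 * above. With a struct dk_callback and FKIOCTL, the ioctl returns
 * immediately and the callback fires from the task queue once vds has
 * flushed the write cache. Both function names are placeholders.
 */
#if 0
static void
flush_done_example(void *cookie, int error)
{
	/* invoked from vdc_dkio_flush_cb() once the flush completes */
}

static int
flush_wcache_async_example(dev_t dev)
{
	struct dk_callback dkc;

	dkc.dkc_callback = flush_done_example;
	dkc.dkc_cookie = NULL;

	/* FKIOCTL: asynchronous callbacks are only allowed from the kernel */
	return (vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, (caddr_t)&dkc,
	    FKIOCTL, NULL));
}
#endif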
/*
 * Function:
 *	vdc_get_vtoc_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCGVTOC
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 *	In the struct vtoc definition, the timestamp field is marked as not
 *	supported so it is not part of the vDisk protocol (FWARC 2006/195).
 *	However SVM uses that field to check that it can write into the VTOC,
 *	so we fake up the info of that field.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- the buffer containing the data to be copied from
 *	to	- the buffer to be copied to
 *	mode	- flags passed to ioctl() call
 *	dir	- the "direction" of the copy - VD_COPYIN or VD_COPYOUT
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- incorrect buffer passed in.
 *	EFAULT	- ddi_copyout routine encountered an error.
 */
static int
vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	int		i;
	void		*tmp_mem = NULL;
	void		*tmp_memp;
	struct vtoc	vt;
	struct vtoc32	vt32;
	int		copy_len = 0;
	int		rv = 0;

	if (dir != VD_COPYOUT)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32)
		copy_len = sizeof (struct vtoc32);
	else
		copy_len = sizeof (struct vtoc);

	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);

	VD_VTOC2VTOC((vd_vtoc_t *)from, &vt);

	/* fake the VTOC timestamp field */
	for (i = 0; i < V_NUMPAR; i++) {
		vt.timestamp[i] = vdc->vtoc->timestamp[i];
	}

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		/* LINTED E_ASSIGN_NARROW_CONV */
		vtoctovtoc32(vt, vt32);
		tmp_memp = &vt32;
	} else {
		tmp_memp = &vt;
	}
	rv = ddi_copyout(tmp_memp, to, copy_len, mode);
	if (rv != 0)
		rv = EFAULT;

	kmem_free(tmp_mem, copy_len);
	return (rv);
}

/*
 * Function:
 *	vdc_set_vtoc_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCSVTOC
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyin of data failed
 */
static int
vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	void		*tmp_mem = NULL, *uvtoc;
	struct vtoc	vt;
	struct vtoc	*vtp = &vt;
	vd_vtoc_t	vtvd;
	int		copy_len = 0;
	int		i, rv = 0;

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (dir == VD_COPYIN)
		uvtoc = from;
	else
		uvtoc = to;

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32)
		copy_len = sizeof (struct vtoc32);
	else
		copy_len = sizeof (struct vtoc);

	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);

	rv = ddi_copyin(uvtoc, tmp_mem, copy_len, mode);
	if (rv != 0) {
		kmem_free(tmp_mem, copy_len);
		return (EFAULT);
	}

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		vtoc32tovtoc((*(struct vtoc32 *)tmp_mem), vt);
	} else {
		vtp = tmp_mem;
	}

	if (dir == VD_COPYOUT) {
		/*
		 * The disk label may have changed. Revalidate the disk
		 * geometry. This will also update the device nodes and
		 * properties.
		 */
		vdc_validate(vdc);

		/*
		 * We also need to keep track of the timestamp fields.
		 */
		for (i = 0; i < V_NUMPAR; i++) {
			vdc->vtoc->timestamp[i] = vtp->timestamp[i];
		}

		kmem_free(tmp_mem, copy_len);
		return (0);
	}

	VTOC2VD_VTOC(vtp, &vtvd);
	bcopy(&vtvd, to, sizeof (vd_vtoc_t));
	kmem_free(tmp_mem, copy_len);

	return (0);
}
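/*
 * Illustrative sketch (userland, excluded from the build): exercising the
 * DKIOCGVTOC ioctl converted above. vdc_get_vtoc_convert() copies out a
 * struct vtoc32 to 32-bit callers and a struct vtoc to 64-bit callers, so
 * an application simply uses its native structure. The device path is a
 * placeholder.
 */
#if 0
#include <sys/vtoc.h>
#include <sys/dkio.h>
#include <fcntl.h>
#include <unistd.h>

static int
read_vtoc_example(void)
{
	struct vtoc vtoc;
	int fd;

	fd = open("/dev/rdsk/c0d0s2", O_RDONLY);	/* placeholder path */
	if (fd < 0)
		return (-1);

	if (ioctl(fd, DKIOCGVTOC, &vtoc) != 0)
		return (-1);

	/* v_sanity is VTOC_SANE for a valid label */
	return (vtoc.v_sanity == VTOC_SANE ? 0 : -1);
}
#endif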
/*
 * Function:
 *	vdc_get_geom_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCGGEOM,
 *	DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format
 *	defined in FWARC 2006/195
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyout of data failed
 */
static int
vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	struct dk_geom	geom;
	int	copy_len = sizeof (struct dk_geom);
	int	rv = 0;

	if (dir != VD_COPYOUT)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom);
	rv = ddi_copyout(&geom, to, copy_len, mode);
	if (rv != 0)
		rv = EFAULT;

	return (rv);
}

/*
 * Function:
 *	vdc_set_geom_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCSGEOM
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyin of data failed
 */
static int
vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	vd_geom_t	vdgeom;
	void		*tmp_mem = NULL;
	int		copy_len = sizeof (struct dk_geom);
	int		rv = 0;

	if (dir != VD_COPYIN)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);

	rv = ddi_copyin(from, tmp_mem, copy_len, mode);
	if (rv != 0) {
		kmem_free(tmp_mem, copy_len);
		return (EFAULT);
	}
	DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom);
	bcopy(&vdgeom, to, sizeof (vdgeom));
	kmem_free(tmp_mem, copy_len);

	return (0);
}

static int
vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	vd_efi_t	*vd_efi;
	dk_efi_t	dk_efi;
	int		rv = 0;
	void		*uaddr;

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (dir == VD_COPYIN) {

		vd_efi = (vd_efi_t *)to;

		rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		vd_efi->lba = dk_efi.dki_lba;
		vd_efi->length = dk_efi.dki_length;
		bzero(vd_efi->data, vd_efi->length);

	} else {

		rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		uaddr = dk_efi.dki_data;

		dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);

		VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi);

		rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length,
		    mode);
		if (rv != 0) {
			kmem_free(dk_efi.dki_data, dk_efi.dki_length);
			return (EFAULT);
		}

		kmem_free(dk_efi.dki_data, dk_efi.dki_length);
	}

	return (0);
}
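/*
 * Illustrative sketch (excluded from the build): the variable-length sizing
 * rule for the EFI ioctls converted above. Unlike the fixed nbytes entries
 * in dk_ioctl[], DKIOCGETEFI and DKIOCSETEFI carry a caller-chosen payload,
 * so vd_process_ioctl() computes the message length from the dk_efi request
 * itself, as this helper restates:
 */
#if 0
static size_t
vd_efi_msg_len_example(dk_efi_t *dk_efi)
{
	/* vd_efi_t ends with a 1-byte placeholder for the data array */
	return (sizeof (vd_efi_t) - 1 + dk_efi->dki_length);
}
#endif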
static int
vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	dk_efi_t	dk_efi;
	void		*uaddr;

	if (dir == VD_COPYOUT) {
		/*
		 * The disk label may have changed. Revalidate the disk
		 * geometry. This will also update the device nodes and
		 * properties.
		 */
		vdc_validate(vdc);
		return (0);
	}

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0)
		return (EFAULT);

	uaddr = dk_efi.dki_data;

	dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);

	if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) {
		kmem_free(dk_efi.dki_data, dk_efi.dki_length);
		return (EFAULT);
	}

	DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to);

	kmem_free(dk_efi.dki_data, dk_efi.dki_length);

	return (0);
}


/* -------------------------------------------------------------------------- */
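/*
 * Illustrative sketch (userland, excluded from the build): a DKIOCGETEFI
 * request as converted by vdc_get_efi_convert() above; the caller supplies
 * the LBA, the length and a data buffer inside a dk_efi_t. Reading one
 * logical block from LBA 1 (where the GPT header lives) could look like
 * this; the lbsize argument and device descriptor are assumptions of the
 * sketch.
 */
#if 0
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <stdlib.h>
#include <unistd.h>

static int
read_gpt_header_example(int fd, size_t lbsize)
{
	dk_efi_t dk_efi;
	int rv;

	dk_efi.dki_lba = 1;		/* GPT header LBA */
	dk_efi.dki_length = lbsize;
	dk_efi.dki_data = malloc(lbsize);
	if (dk_efi.dki_data == NULL)
		return (-1);

	rv = ioctl(fd, DKIOCGETEFI, &dk_efi);

	free(dk_efi.dki_data);
	return (rv);
}
#endif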
/*
 * Function:
 *	vdc_create_fake_geometry()
 *
 * Description:
 *	This routine fakes up the disk info needed for some DKIO ioctls such
 *	as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do]
 *
 *	Note: This function must not be called until the vDisk attributes have
 *	been exchanged as part of the handshake with the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	none.
 */
static void
vdc_create_fake_geometry(vdc_t *vdc)
{
	ASSERT(vdc != NULL);
	ASSERT(vdc->max_xfer_sz != 0);

	/*
	 * DKIOCINFO support
	 */
	if (vdc->cinfo == NULL)
		vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP);

	(void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME);
	(void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME);
	/* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */
	vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz;

	/*
	 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD
	 * operation is supported, otherwise the controller type is DKC_DIRECT.
	 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the
	 * controller type is always DKC_DIRECT in that case.
	 *
	 * If the virtual disk is backed by a physical CD/DVD device or
	 * an ISO image, modify the controller type to indicate this
	 */
	switch (vdc->vdisk_media) {
	case VD_MEDIA_CD:
	case VD_MEDIA_DVD:
		vdc->cinfo->dki_ctype = DKC_CDROM;
		break;
	case VD_MEDIA_FIXED:
		if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD))
			vdc->cinfo->dki_ctype = DKC_SCSI_CCS;
		else
			vdc->cinfo->dki_ctype = DKC_DIRECT;
		break;
	default:
		/* in the case of v1.0 we default to a fixed disk */
		vdc->cinfo->dki_ctype = DKC_DIRECT;
		break;
	}
	vdc->cinfo->dki_flags = DKI_FMTVOL;
	vdc->cinfo->dki_cnum = 0;
	vdc->cinfo->dki_addr = 0;
	vdc->cinfo->dki_space = 0;
	vdc->cinfo->dki_prio = 0;
	vdc->cinfo->dki_vec = 0;
	vdc->cinfo->dki_unit = vdc->instance;
	vdc->cinfo->dki_slave = 0;
	/*
	 * The partition number will be created on the fly depending on the
	 * actual slice (i.e. minor node) that is used to request the data.
	 */
	vdc->cinfo->dki_partition = 0;

	/*
	 * DKIOCGMEDIAINFO support
	 */
	if (vdc->minfo == NULL)
		vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	if (vio_ver_is_supported(vdc->ver, 1, 1)) {
		vdc->minfo->dki_media_type =
		    VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media);
	} else {
		vdc->minfo->dki_media_type = DK_FIXED_DISK;
	}

	vdc->minfo->dki_capacity = vdc->vdisk_size;
	vdc->minfo->dki_lbsize = vdc->block_size;
}

static ushort_t
vdc_lbl2cksum(struct dk_label *label)
{
	int	count;
	ushort_t sum, *sp;

	count = (sizeof (struct dk_label)) / (sizeof (short)) - 1;
	sp = (ushort_t *)label;
	sum = 0;
	while (count--) {
		sum ^= *sp++;
	}

	return (sum);
}
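/*
 * Illustrative sketch (excluded from the build): how vdc_lbl2cksum() above
 * is used. The checksum is the XOR of every 16-bit word in the label except
 * the last one, which holds dkl_cksum itself; vdc_validate_geometry() below
 * accepts a label only if the magic number and this checksum both match.
 */
#if 0
static boolean_t
vdc_label_valid_example(struct dk_label *label)
{
	return (label->dkl_magic == DKL_MAGIC &&
	    label->dkl_cksum == vdc_lbl2cksum(label));
}
#endif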
/*
 * Function:
 *	vdc_validate_geometry
 *
 * Description:
 *	This routine discovers the label and geometry of the disk. It stores
 *	the disk label and related information in the vdc structure. If it
 *	fails to validate the geometry or to discover the disk label then
 *	the label is marked as unknown (VD_DISK_LABEL_UNK).
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- success.
 *	EINVAL	- unknown disk label.
 *	ENOTSUP	- geometry not applicable (EFI label).
 *	EIO	- error accessing the disk.
 */
static int
vdc_validate_geometry(vdc_t *vdc)
{
	buf_t	*buf;	/* BREAD requests need to be in a buf_t structure */
	dev_t	dev;
	int	rv, rval;
	struct dk_label label;
	struct dk_geom geom;
	struct vtoc vtoc;
	efi_gpt_t *gpt;
	efi_gpe_t *gpe;
	vd_efi_dev_t edev;

	ASSERT(vdc != NULL);
	ASSERT(vdc->vtoc != NULL && vdc->geom != NULL);
	ASSERT(MUTEX_HELD(&vdc->lock));

	mutex_exit(&vdc->lock);

	dev = makedevice(ddi_driver_major(vdc->dip),
	    VD_MAKE_DEV(vdc->instance, 0));

	rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval);
	if (rv == 0)
		rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc,
		    FKIOCTL, &rval);

	if (rv == ENOTSUP) {
		/*
		 * If the device does not support VTOC then we try
		 * to read an EFI label.
		 *
		 * We need to know the block size and the disk size to
		 * be able to read an EFI label.
		 */
		if (vdc->vdisk_size == 0) {
			if ((rv = vdc_check_capacity(vdc)) != 0) {
				mutex_enter(&vdc->lock);
				vdc_store_label_unk(vdc);
				return (rv);
			}
		}

		VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl);

		rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe);

		if (rv) {
			DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)",
			    vdc->instance, rv);
			mutex_enter(&vdc->lock);
			vdc_store_label_unk(vdc);
			return (EIO);
		}

		mutex_enter(&vdc->lock);
		vdc_store_label_efi(vdc, gpt, gpe);
		vd_efi_free(&edev, gpt, gpe);
		return (ENOTSUP);
	}

	if (rv != 0) {
		DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)",
		    vdc->instance, rv);
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		if (rv != EINVAL)
			rv = EIO;
		return (rv);
	}

	/* check that geometry and vtoc are valid */
	if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 ||
	    vtoc.v_sanity != VTOC_SANE) {
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	/*
	 * We have a disk and a valid VTOC. However this does not mean
	 * that the disk currently has a VTOC label. The returned VTOC may
	 * be a default VTOC to be used for configuring the disk (this is
	 * what is done for disk images). So we read the label from the
	 * beginning of the disk to ensure we really have a VTOC label.
	 *
	 * FUTURE: This could be the default way for reading the VTOC
	 * from the disk as opposed to sending the VD_OP_GET_VTOC
	 * to the server. This will be the default if vdc is implemented
	 * on top of cmlb.
	 */

	/*
	 * A single-slice disk does not support reads using an absolute disk
	 * offset so we just rely on the DKIOCGVTOC ioctl in that case.
	 */
	if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) {
		mutex_enter(&vdc->lock);
		if (vtoc.v_nparts != 1) {
			vdc_store_label_unk(vdc);
			return (EINVAL);
		}
		vdc_store_label_vtoc(vdc, &geom, &vtoc);
		return (0);
	}

	if (vtoc.v_nparts != V_NUMPAR) {
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	/*
	 * Read disk label from start of disk
	 */
	buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
	bioinit(buf);
	buf->b_un.b_addr = (caddr_t)&label;
	buf->b_bcount = DK_LABEL_SIZE;
	buf->b_flags = B_BUSY | B_READ;
	buf->b_dev = cmpdev(dev);
	rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label,
	    DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir);
	if (rv) {
		DMSG(vdc, 1, "[%d] Failed to read disk block 0\n",
		    vdc->instance);
	} else {
		rv = biowait(buf);
		biofini(buf);
	}
	kmem_free(buf, sizeof (buf_t));

	if (rv != 0 || label.dkl_magic != DKL_MAGIC ||
	    label.dkl_cksum != vdc_lbl2cksum(&label)) {
		DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n",
		    vdc->instance);
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	mutex_enter(&vdc->lock);
	vdc_store_label_vtoc(vdc, &geom, &vtoc);
	return (0);
}
/*
 * Function:
 *	vdc_validate
 *
 * Description:
 *	This routine discovers the label of the disk and creates the
 *	appropriate device nodes if the label has changed.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	none.
 */
static void
vdc_validate(vdc_t *vdc)
{
	vd_disk_label_t old_label;
	vd_slice_t old_slice[V_NUMPAR];
	int rv;

	ASSERT(!MUTEX_HELD(&vdc->lock));

	mutex_enter(&vdc->lock);

	/* save the current label and vtoc */
	old_label = vdc->vdisk_label;
	bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR);

	/* check the geometry */
	(void) vdc_validate_geometry(vdc);

	/* if the disk label has changed, update device nodes */
	if (vdc->vdisk_label != old_label) {

		if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
			rv = vdc_create_device_nodes_efi(vdc);
		else
			rv = vdc_create_device_nodes_vtoc(vdc);

		if (rv != 0) {
			DMSG(vdc, 0, "![%d] Failed to update device nodes",
			    vdc->instance);
		}
	}

	/* if the vtoc has changed, update device nodes properties */
	if (bcmp(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR) != 0) {

		if (vdc_create_device_nodes_props(vdc) != 0) {
			DMSG(vdc, 0, "![%d] Failed to update device nodes"
			    " properties", vdc->instance);
		}
	}

	mutex_exit(&vdc->lock);
}

static void
vdc_validate_task(void *arg)
{
	vdc_t *vdc = (vdc_t *)arg;

	vdc_validate(vdc);

	mutex_enter(&vdc->lock);
	ASSERT(vdc->validate_pending > 0);
	vdc->validate_pending--;
	mutex_exit(&vdc->lock);
}
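/*
 * Illustrative sketch (excluded from the build): how vdc_validate_task()
 * above is expected to be dispatched. A caller that notices a possible
 * label change but cannot block bumps validate_pending under vdc->lock and
 * hands the work to a task queue; vdc_validate_task() decrements the
 * counter when done. The actual dispatch sites are elsewhere in this
 * driver, so this helper is an assumption of the sketch.
 */
#if 0
static void
vdc_dispatch_validate_example(vdc_t *vdc)
{
	mutex_enter(&vdc->lock);
	vdc->validate_pending++;
	mutex_exit(&vdc->lock);

	(void) taskq_dispatch(system_taskq, vdc_validate_task,
	    (void *)vdc, TQ_SLEEP);
}
#endif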
/*
 * Function:
 *	vdc_setup_devid()
 *
 * Description:
 *	This routine discovers the devid of a vDisk. It requests the devid of
 *	the underlying device from the vDisk server, builds an encapsulated
 *	devid based on the retrieved devid and registers that new devid to
 *	the vDisk.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- A devid was successfully registered for the vDisk
 */
static int
vdc_setup_devid(vdc_t *vdc)
{
	int rv;
	vd_devid_t *vd_devid;
	size_t bufsize, bufid_len;

	/*
	 * Initially, we don't know the size of the devid that the server
	 * will return but this size will be encoded into the reply. So we
	 * do a first request using a default size then we check if this
	 * size was large enough. If not then we do a second request with
	 * the correct size returned by the server. Note that ldc requires
	 * size to be 8-byte aligned.
	 */
	bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN),
	    sizeof (uint64_t));
	vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
	bufid_len = bufsize - sizeof (vd_devid_t) - 1;

	rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
	    bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);

	DMSG(vdc, 2, "sync_op returned %d\n", rv);

	if (rv) {
		kmem_free(vd_devid, bufsize);
		return (rv);
	}

	if (vd_devid->length > bufid_len) {
		size_t len = vd_devid->length;

		/*
		 * The returned devid is larger than the buffer used. Try
		 * again with a buffer of the right size.
		 */
		kmem_free(vd_devid, bufsize);
		bufsize = P2ROUNDUP(VD_DEVID_SIZE(len), sizeof (uint64_t));
		vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
		bufid_len = bufsize - sizeof (vd_devid_t) - 1;

		rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID,
		    (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0,
		    VIO_both_dir, B_TRUE);

		if (rv) {
			kmem_free(vd_devid, bufsize);
			return (rv);
		}
	}

	/*
	 * The virtual disk should have the same device id as the one
	 * associated with the physical disk it is mapped on, otherwise
	 * sharing a disk between an LDom and a non-LDom may not work (for
	 * example for a shared SVM disk set).
	 *
	 * The DDI framework does not allow creating a device id with an
	 * arbitrary type so we first create a device id of type DEVID_ENCAP
	 * and then we restore the original type of the physical device.
	 */

	DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length);

	/* build an encapsulated devid based on the returned devid */
	if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length,
	    vd_devid->id, &vdc->devid) != DDI_SUCCESS) {
		DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance);
		kmem_free(vd_devid, bufsize);
		return (1);
	}

	DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type);

	ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS);

	kmem_free(vd_devid, bufsize);

	if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) {
		DMSG(vdc, 1, "[%d] Failed to register devid\n", vdc->instance);
		return (1);
	}

	return (0);
}

static void
vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe)
{
	int i, nparts;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vdc->vdisk_label = VD_DISK_LABEL_EFI;
	bzero(vdc->vtoc, sizeof (struct vtoc));
	bzero(vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);

	nparts = gpt->efi_gpt_NumberOfPartitionEntries;

	for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) {

		if (gpe[i].efi_gpe_StartingLBA == 0 ||
		    gpe[i].efi_gpe_EndingLBA == 0) {
			continue;
		}

		vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA;
		vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA -
		    gpe[i].efi_gpe_StartingLBA + 1;
	}

	ASSERT(vdc->vdisk_size != 0);
	vdc->slice[VD_EFI_WD_SLICE].start = 0;
	vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size;
}

static void
vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct vtoc *vtoc)
{
	int i;

	ASSERT(MUTEX_HELD(&vdc->lock));
	ASSERT(vdc->block_size == vtoc->v_sectorsz);

	vdc->vdisk_label = VD_DISK_LABEL_VTOC;
	bcopy(vtoc, vdc->vtoc, sizeof (struct vtoc));
	bcopy(geom, vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);

	for (i = 0; i < vtoc->v_nparts; i++) {
		vdc->slice[i].start = vtoc->v_part[i].p_start;
		vdc->slice[i].nblocks = vtoc->v_part[i].p_size;
	}
}

static void
vdc_store_label_unk(vdc_t *vdc)
{
	ASSERT(MUTEX_HELD(&vdc->lock));

	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	bzero(vdc->vtoc, sizeof (struct vtoc));
	bzero(vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);
}
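/*
 * Illustrative sketch (excluded from the build): how the vdc->slice[] table
 * populated by the vdc_store_label_*() functions above is meant to be used.
 * Each entry gives the start block and size of a slice, so translating a
 * slice-relative block number into an absolute disk block, with a bounds
 * check, could look like this. The helper name and diskaddr_t types are
 * assumptions of the sketch.
 */
#if 0
static int
vdc_slice_to_abs_example(vdc_t *vdc, int slice, diskaddr_t blkno,
    diskaddr_t *abs_blkno)
{
	ASSERT(MUTEX_HELD(&vdc->lock));

	/* a zeroed entry (unknown label or unused slice) has nblocks == 0 */
	if (slice >= V_NUMPAR || blkno >= vdc->slice[slice].nblocks)
		return (EINVAL);

	*abs_blkno = vdc->slice[slice].start + blkno;
	return (0);
}
#endif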