/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * LDoms virtual disk client (vdc) device driver
 *
 * This driver runs on a guest logical domain and communicates with the virtual
 * disk server (vds) driver running on the service domain which is exporting
 * virtualized "disks" to the guest logical domain.
 *
 * The driver can be divided into four sections:
 *
 * 1) generic device driver housekeeping
 *    _init, _fini, attach, detach, ops structures, etc.
 *
 * 2) communication channel setup
 *    Setup the communications link over the LDC channel that vdc uses to
 *    talk to the vDisk server. Initialise the descriptor ring which
 *    allows the LDC clients to transfer data via memory mappings.
 *
 * 3) Support exported to upper layers (filesystems, etc)
 *    The upper layers call into vdc via strategy(9E) and DKIO(7I)
 *    ioctl calls. vdc will copy the data to be written to the descriptor
 *    ring or map the buffer to store the data read by the vDisk
 *    server into the descriptor ring. It then sends a message to the
 *    vDisk server requesting it to complete the operation.
 *
 * 4) Handling responses from the vDisk server.
 *    The vDisk server will ACK some or all of the messages vdc sends to it
 *    (this is configured during the handshake). Upon receipt of an ACK
 *    vdc will check the descriptor ring and signal to the upper layer
 *    code waiting on the I/O.
 */
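/*
 * For illustration, the DKIO(7I) path described in section 3 above can
 * be exercised from user level with a plain ioctl(2). This is a minimal
 * sketch, not part of the driver; the device path is a hypothetical
 * example.
 *
 *    #include <sys/dkio.h>
 *    #include <fcntl.h>
 *    #include <stdio.h>
 *    #include <unistd.h>
 *
 *    int
 *    main(void)
 *    {
 *        struct dk_geom geom;
 *        int fd = open("/dev/rdsk/c0d0s2", O_RDONLY);
 *
 *        if (fd == -1)
 *            return (1);
 *        if (ioctl(fd, DKIOCGGEOM, &geom) == 0)
 *            (void) printf("cylinders: %u\n", geom.dkg_ncyl);
 *        (void) close(fd);
 *        return (0);
 *    }
 */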
#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kstat.h>
#include <sys/mach_descrip.h>
#include <sys/modctl.h>
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/vtoc.h>
#include <sys/archsystm.h>
#include <sys/sysmacros.h>

#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/scsi/impl/services.h>
#include <sys/scsi/targets/sddef.h>

#include <sys/ldoms.h>
#include <sys/ldc.h>
#include <sys/vio_common.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vdsk_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdc.h>

/*
 * function prototypes
 */

/* standard driver functions */
static int vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
static int vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int vdc_strategy(struct buf *buf);
static int vdc_print(dev_t dev, char *str);
static int vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
static int vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
static int vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
static int vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
    cred_t *credp, int *rvalp);
static int vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
static int vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);

static int vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
    void *arg, void **resultp);
static int vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);

/* setup */
static void vdc_min(struct buf *bufp);
static int vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
static int vdc_do_ldc_init(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_node);
static int vdc_start_ldc_connection(vdc_t *vdc);
static int vdc_create_device_nodes(vdc_t *vdc);
static int vdc_create_device_nodes_efi(vdc_t *vdc);
static int vdc_create_device_nodes_vtoc(vdc_t *vdc);
static int vdc_create_device_nodes_props(vdc_t *vdc);
static void vdc_create_io_kstats(vdc_t *vdc);
static void vdc_create_err_kstats(vdc_t *vdc);
static void vdc_set_err_kstats(vdc_t *vdc);
static int vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
    mde_cookie_t *vd_nodep, mde_cookie_t *vd_portp);
static int vdc_get_ldc_id(md_t *, mde_cookie_t, uint64_t *);
static int vdc_do_ldc_up(vdc_t *vdc);
static void vdc_terminate_ldc(vdc_t *vdc);
static int vdc_init_descriptor_ring(vdc_t *vdc);
static void vdc_destroy_descriptor_ring(vdc_t *vdc);
static int vdc_setup_devid(vdc_t *vdc);
static void vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *);
static void vdc_store_label_vtoc(vdc_t *, struct dk_geom *, struct vtoc *);
static void vdc_store_label_unk(vdc_t *vdc);
static boolean_t vdc_is_opened(vdc_t *vdc);

/* handshake with vds */
static int vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
static int vdc_ver_negotiation(vdc_t *vdcp);
vdc_ver_negotiation(vdc_t *vdcp); 152 static int vdc_init_attr_negotiation(vdc_t *vdc); 153 static int vdc_attr_negotiation(vdc_t *vdcp); 154 static int vdc_init_dring_negotiate(vdc_t *vdc); 155 static int vdc_dring_negotiation(vdc_t *vdcp); 156 static int vdc_send_rdx(vdc_t *vdcp); 157 static int vdc_rdx_exchange(vdc_t *vdcp); 158 static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg); 159 160 /* processing incoming messages from vDisk server */ 161 static void vdc_process_msg_thread(vdc_t *vdc); 162 static int vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp); 163 164 static uint_t vdc_handle_cb(uint64_t event, caddr_t arg); 165 static int vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg); 166 static int vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg); 167 static int vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg); 168 static int vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg); 169 static int vdc_send_request(vdc_t *vdcp, int operation, 170 caddr_t addr, size_t nbytes, int slice, diskaddr_t offset, 171 int cb_type, void *cb_arg, vio_desc_direction_t dir); 172 static int vdc_map_to_shared_dring(vdc_t *vdcp, int idx); 173 static int vdc_populate_descriptor(vdc_t *vdcp, int operation, 174 caddr_t addr, size_t nbytes, int slice, diskaddr_t offset, 175 int cb_type, void *cb_arg, vio_desc_direction_t dir); 176 static int vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, 177 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 178 void *cb_arg, vio_desc_direction_t dir, boolean_t); 179 180 static int vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp); 181 static int vdc_drain_response(vdc_t *vdcp); 182 static int vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx); 183 static int vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep); 184 static int vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg); 185 186 /* dkio */ 187 static int vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, 188 int *rvalp); 189 static int vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg); 190 static void vdc_create_fake_geometry(vdc_t *vdc); 191 static int vdc_validate_geometry(vdc_t *vdc); 192 static void vdc_validate(vdc_t *vdc); 193 static void vdc_validate_task(void *arg); 194 static int vdc_null_copy_func(vdc_t *vdc, void *from, void *to, 195 int mode, int dir); 196 static int vdc_get_wce_convert(vdc_t *vdc, void *from, void *to, 197 int mode, int dir); 198 static int vdc_set_wce_convert(vdc_t *vdc, void *from, void *to, 199 int mode, int dir); 200 static int vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, 201 int mode, int dir); 202 static int vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, 203 int mode, int dir); 204 static int vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, 205 int mode, int dir); 206 static int vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, 207 int mode, int dir); 208 static int vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, 209 int mode, int dir); 210 static int vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, 211 int mode, int dir); 212 213 static void vdc_ownership_update(vdc_t *vdc, int ownership_flags); 214 static int vdc_access_set(vdc_t *vdc, uint64_t flags, int mode); 215 static vdc_io_t *vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf); 216 static int vdc_failfast_check_resv(vdc_t *vdc); 217 218 /* 219 * Module variables 220 */ 221 222 /* 223 * Tunable variables to control how long vdc waits before timing out on 224 * various operations 225 */ 226 static int 
/*
 * Supported vDisk protocol version pairs.
 *
 * The first array entry is the latest and preferred version.
 */
static const vio_ver_t vdc_version[] = {{1, 1}};

static struct cb_ops vdc_cb_ops = {
    vdc_open,       /* cb_open */
    vdc_close,      /* cb_close */
    vdc_strategy,   /* cb_strategy */
    vdc_print,      /* cb_print */
    vdc_dump,       /* cb_dump */
    vdc_read,       /* cb_read */
    vdc_write,      /* cb_write */
    vdc_ioctl,      /* cb_ioctl */
    nodev,          /* cb_devmap */
    nodev,          /* cb_mmap */
    nodev,          /* cb_segmap */
    nochpoll,       /* cb_chpoll */
    ddi_prop_op,    /* cb_prop_op */
    NULL,           /* cb_str */
    D_MP | D_64BIT, /* cb_flag */
    CB_REV,         /* cb_rev */
    vdc_aread,      /* cb_aread */
    vdc_awrite      /* cb_awrite */
};

static struct dev_ops vdc_ops = {
    DEVO_REV,       /* devo_rev */
    0,              /* devo_refcnt */
    vdc_getinfo,    /* devo_getinfo */
    nulldev,        /* devo_identify */
    nulldev,        /* devo_probe */
    vdc_attach,     /* devo_attach */
    vdc_detach,     /* devo_detach */
    nodev,          /* devo_reset */
    &vdc_cb_ops,    /* devo_cb_ops */
    NULL,           /* devo_bus_ops */
    nulldev         /* devo_power */
};

static struct modldrv modldrv = {
    &mod_driverops,
    "virtual disk client",
    &vdc_ops,
};

static struct modlinkage modlinkage = {
    MODREV_1,
    &modldrv,
    NULL
};

/* -------------------------------------------------------------------------- */

/*
 * Device Driver housekeeping and setup
 */

int
_init(void)
{
    int status;

    if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
        return (status);
    if ((status = mod_install(&modlinkage)) != 0)
        ddi_soft_state_fini(&vdc_state);
    return (status);
}

int
_info(struct modinfo *modinfop)
{
    return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
    int status;

    if ((status = mod_remove(&modlinkage)) != 0)
        return (status);
    ddi_soft_state_fini(&vdc_state);
    return (0);
}
static int
vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
    _NOTE(ARGUNUSED(dip))

    int instance = VDCUNIT((dev_t)arg);
    vdc_t *vdc = NULL;

    switch (cmd) {
    case DDI_INFO_DEVT2DEVINFO:
        if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
            *resultp = NULL;
            return (DDI_FAILURE);
        }
        *resultp = vdc->dip;
        return (DDI_SUCCESS);
    case DDI_INFO_DEVT2INSTANCE:
        *resultp = (void *)(uintptr_t)instance;
        return (DDI_SUCCESS);
    default:
        *resultp = NULL;
        return (DDI_FAILURE);
    }
}

static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
    kt_did_t failfast_tid, ownership_tid;
    int instance;
    int rv;
    vdc_t *vdc = NULL;

    switch (cmd) {
    case DDI_DETACH:
        /* the real work happens below */
        break;
    case DDI_SUSPEND:
        /* nothing to do for this non-device */
        return (DDI_SUCCESS);
    default:
        return (DDI_FAILURE);
    }

    ASSERT(cmd == DDI_DETACH);
    instance = ddi_get_instance(dip);
    DMSGX(1, "[%d] Entered\n", instance);

    if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
        cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
        return (DDI_FAILURE);
    }

    /*
     * This function is called when vdc is detached or if it has failed to
     * attach. In the latter case, the attach may have failed before the
     * vdisk type was set, so we can't call vdc_is_opened(). However, as
     * the attach has failed, we know that the vdisk is not opened and we
     * can safely detach.
     */
    if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
        DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
        return (DDI_FAILURE);
    }

    if (vdc->dkio_flush_pending) {
        DMSG(vdc, 0,
            "[%d] Cannot detach: %d outstanding DKIO flushes\n",
            instance, vdc->dkio_flush_pending);
        return (DDI_FAILURE);
    }

    if (vdc->validate_pending) {
        DMSG(vdc, 0,
            "[%d] Cannot detach: %d outstanding validate request\n",
            instance, vdc->validate_pending);
        return (DDI_FAILURE);
    }

    DMSG(vdc, 0, "[%d] proceeding...\n", instance);

    /* If we took ownership, release ownership */
    mutex_enter(&vdc->ownership_lock);
    if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
        rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
        if (rv == 0) {
            vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
        }
    }
    mutex_exit(&vdc->ownership_lock);

    /* mark instance as detaching */
    vdc->lifecycle = VDC_LC_DETACHING;

    /*
     * try and disable callbacks to prevent another handshake
     */
    rv = ldc_set_cb_mode(vdc->ldc_handle, LDC_CB_DISABLE);
    DMSG(vdc, 0, "callback disabled (rv=%d)\n", rv);

    if (vdc->initialized & VDC_THREAD) {
        mutex_enter(&vdc->read_lock);
        if ((vdc->read_state == VDC_READ_WAITING) ||
            (vdc->read_state == VDC_READ_RESET)) {
            vdc->read_state = VDC_READ_RESET;
            cv_signal(&vdc->read_cv);
        }

        mutex_exit(&vdc->read_lock);

        /* wake up any thread waiting for connection to come online */
        mutex_enter(&vdc->lock);
        if (vdc->state == VDC_STATE_INIT_WAITING) {
            DMSG(vdc, 0,
                "[%d] write reset - move to resetting state...\n",
                instance);
            vdc->state = VDC_STATE_RESETTING;
            cv_signal(&vdc->initwait_cv);
        }
        mutex_exit(&vdc->lock);

        /* now wait until state transitions to VDC_STATE_DETACH */
        thread_join(vdc->msg_proc_thr->t_did);
        ASSERT(vdc->state == VDC_STATE_DETACH);
        DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
            vdc->instance);
    }

    mutex_enter(&vdc->lock);

    if (vdc->initialized & VDC_DRING)
        vdc_destroy_descriptor_ring(vdc);

    if (vdc->initialized & VDC_LDC)
        vdc_terminate_ldc(vdc);

    if (vdc->failfast_thread) {
        failfast_tid = vdc->failfast_thread->t_did;
        vdc->failfast_interval = 0;
        cv_signal(&vdc->failfast_cv);
    } else {
        failfast_tid = 0;
    }

    if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
        ownership_tid = vdc->ownership_thread->t_did;
        vdc->ownership = VDC_OWNERSHIP_NONE;
        cv_signal(&vdc->ownership_cv);
    } else {
        ownership_tid = 0;
    }

    mutex_exit(&vdc->lock);

    if (failfast_tid != 0)
        thread_join(failfast_tid);

    if (ownership_tid != 0)
        thread_join(ownership_tid);

    if (vdc->initialized & VDC_MINOR) {
        ddi_prop_remove_all(dip);
        ddi_remove_minor_node(dip, NULL);
    }

    if (vdc->io_stats) {
        kstat_delete(vdc->io_stats);
        vdc->io_stats = NULL;
    }

    if (vdc->err_stats) {
        kstat_delete(vdc->err_stats);
        vdc->err_stats = NULL;
    }

    if (vdc->initialized & VDC_LOCKS) {
        mutex_destroy(&vdc->lock);
        mutex_destroy(&vdc->read_lock);
        mutex_destroy(&vdc->ownership_lock);
        cv_destroy(&vdc->initwait_cv);
        cv_destroy(&vdc->dring_free_cv);
        cv_destroy(&vdc->membind_cv);
        cv_destroy(&vdc->sync_pending_cv);
        cv_destroy(&vdc->sync_blocked_cv);
        cv_destroy(&vdc->read_cv);
        cv_destroy(&vdc->running_cv);
        cv_destroy(&vdc->ownership_cv);
        cv_destroy(&vdc->failfast_cv);
        cv_destroy(&vdc->failfast_io_cv);
    }

    if (vdc->minfo)
        kmem_free(vdc->minfo, sizeof (struct dk_minfo));

    if (vdc->cinfo)
        kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));

    if (vdc->vtoc)
        kmem_free(vdc->vtoc, sizeof (struct vtoc));

    if (vdc->geom)
        kmem_free(vdc->geom, sizeof (struct dk_geom));

    if (vdc->devid) {
        ddi_devid_unregister(dip);
        ddi_devid_free(vdc->devid);
    }

    if (vdc->initialized & VDC_SOFT_STATE)
        ddi_soft_state_free(vdc_state, instance);

    DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);

    return (DDI_SUCCESS);
}
static int
vdc_do_attach(dev_info_t *dip)
{
    int instance;
    vdc_t *vdc = NULL;
    int status;
    md_t *mdp;
    mde_cookie_t vd_node, vd_port;

    ASSERT(dip != NULL);

    instance = ddi_get_instance(dip);
    if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
        cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
            instance);
        return (DDI_FAILURE);
    }

    if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
        cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
        return (DDI_FAILURE);
    }

    /*
     * We assign (rather than OR) the value to initialized here to zero
     * out the variable; bits are then set in it to indicate what has
     * been done
     */
    vdc->initialized = VDC_SOFT_STATE;

    vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
    vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);

    vdc->dip = dip;
    vdc->instance = instance;
    vdc->vdisk_type = VD_DISK_TYPE_UNK;
    vdc->vdisk_label = VD_DISK_LABEL_UNK;
    vdc->state = VDC_STATE_INIT;
    vdc->lifecycle = VDC_LC_ATTACHING;
    vdc->ldc_state = 0;
    vdc->session_id = 0;
    vdc->block_size = DEV_BSIZE;
    vdc->max_xfer_sz = maxphys / DEV_BSIZE;

    /*
     * We assume, for now, that the vDisk server will export 'read'
     * operations to us at a minimum (this is needed because of checks
     * in vdc for supported operations early in the handshake process).
     * The vDisk server will return ENOTSUP if this is not the case.
     * The value will be overwritten during the attribute exchange with
     * the bitmask of operations exported by the server.
     */
    vdc->operations = VD_OP_MASK_READ;

    vdc->vtoc = NULL;
    vdc->geom = NULL;
    vdc->cinfo = NULL;
    vdc->minfo = NULL;

    mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
    cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
    cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
    cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
    cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);

    vdc->threads_pending = 0;
    vdc->sync_op_pending = B_FALSE;
    vdc->sync_op_blocked = B_FALSE;
    cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
    cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

    mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
    cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
    cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
    cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);

    /* init blocking msg read functionality */
    mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
    cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
    vdc->read_state = VDC_READ_IDLE;

    vdc->initialized |= VDC_LOCKS;

    /* get device and port MD node for this disk instance */
    if (vdc_get_md_node(dip, &mdp, &vd_node, &vd_port) != 0) {
        cmn_err(CE_NOTE, "[%d] Could not get machine description node",
            instance);
        return (DDI_FAILURE);
    }
    /* set the connection timeout */
    if (vd_port == NULL || (md_get_prop_val(mdp, vd_port,
        VDC_MD_TIMEOUT, &vdc->ctimeout) != 0)) {
        vdc->ctimeout = 0;
    }

    /* initialise LDC channel which will be used to communicate with vds */
    status = vdc_do_ldc_init(vdc, mdp, vd_node);

    (void) md_fini_handle(mdp);

    if (status != 0) {
        cmn_err(CE_NOTE, "[%d] Couldn't initialize LDC", instance);
        goto return_status;
    }

    /* initialize the thread responsible for managing state with server */
    vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
        vdc, 0, &p0, TS_RUN, minclsyspri);
    if (vdc->msg_proc_thr == NULL) {
        cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
            instance);
        return (DDI_FAILURE);
    }

    vdc->initialized |= VDC_THREAD;

    /* Create the kstats for saving the I/O statistics used by iostat(1M) */
    vdc_create_io_kstats(vdc);
    vdc_create_err_kstats(vdc);

    atomic_inc_32(&vdc_instance_count);

    /*
     * Check the disk label. This will send requests and do the handshake.
     * We don't really care about the disk label now; what we really need
     * is for the handshake to be done so that we know the type of the
     * disk (slice or full disk) and the appropriate device nodes can be
     * created.
     */
    vdc->vdisk_label = VD_DISK_LABEL_UNK;
    vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP);
    vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
    vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

    mutex_enter(&vdc->lock);
    (void) vdc_validate_geometry(vdc);
    mutex_exit(&vdc->lock);

    /*
     * Now that we have the device info we can create the
     * device nodes and properties
     */
    status = vdc_create_device_nodes(vdc);
    if (status) {
        DMSG(vdc, 0, "[%d] Failed to create device nodes",
            instance);
        goto return_status;
    }
    status = vdc_create_device_nodes_props(vdc);
    if (status) {
        DMSG(vdc, 0, "[%d] Failed to create device nodes"
            " properties (%d)", instance, status);
        goto return_status;
    }

    /*
     * Setup devid
     */
    if (vdc_setup_devid(vdc)) {
        DMSG(vdc, 0, "[%d] No device id available\n", instance);
    }

    /*
     * Fill in the fields of the error statistics kstat that were not
     * available when creating the kstat
     */
    vdc_set_err_kstats(vdc);

    ddi_report_dev(dip);
    vdc->lifecycle = VDC_LC_ONLINE;
    DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);

return_status:
    DMSG(vdc, 0, "[%d] Attach completed\n", instance);
    return (status);
}
static int
vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
    int status;

    switch (cmd) {
    case DDI_ATTACH:
        if ((status = vdc_do_attach(dip)) != 0)
            (void) vdc_detach(dip, DDI_DETACH);
        return (status);
    case DDI_RESUME:
        /* nothing to do for this non-device */
        return (DDI_SUCCESS);
    default:
        return (DDI_FAILURE);
    }
}
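/*
 * LDC channel bring-up, as implemented below, is a fixed sequence:
 * ldc_init() to create the channel, ldc_status() to query its state,
 * ldc_reg_callback() to register vdc_handle_cb(), and finally
 * ldc_open() once the channel is in the LDC_INIT state.
 */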
static int
vdc_do_ldc_init(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_node)
{
    int status = 0;
    ldc_status_t ldc_state;
    ldc_attr_t ldc_attr;
    uint64_t ldc_id = 0;

    ASSERT(vdc != NULL);

    vdc->initialized |= VDC_LDC;

    if ((status = vdc_get_ldc_id(mdp, vd_node, &ldc_id)) != 0) {
        DMSG(vdc, 0, "[%d] Failed to get LDC channel ID property",
            vdc->instance);
        return (EIO);
    }

    DMSGX(0, "[%d] LDC id is 0x%lx\n", vdc->instance, ldc_id);

    vdc->ldc_id = ldc_id;

    ldc_attr.devclass = LDC_DEV_BLK;
    ldc_attr.instance = vdc->instance;
    ldc_attr.mode = LDC_MODE_UNRELIABLE; /* unreliable transport */
    ldc_attr.mtu = VD_LDC_MTU;

    if ((vdc->initialized & VDC_LDC_INIT) == 0) {
        status = ldc_init(ldc_id, &ldc_attr, &vdc->ldc_handle);
        if (status != 0) {
            DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
                vdc->instance, ldc_id, status);
            return (status);
        }
        vdc->initialized |= VDC_LDC_INIT;
    }
    status = ldc_status(vdc->ldc_handle, &ldc_state);
    if (status != 0) {
        DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
            vdc->instance, status);
        return (status);
    }
    vdc->ldc_state = ldc_state;

    if ((vdc->initialized & VDC_LDC_CB) == 0) {
        status = ldc_reg_callback(vdc->ldc_handle, vdc_handle_cb,
            (caddr_t)vdc);
        if (status != 0) {
            DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
                vdc->instance, status);
            return (status);
        }
        vdc->initialized |= VDC_LDC_CB;
    }

    vdc->initialized |= VDC_LDC;

    /*
     * At this stage we have initialised LDC; we will now try to open
     * the connection.
     */
    if (vdc->ldc_state == LDC_INIT) {
        status = ldc_open(vdc->ldc_handle);
        if (status != 0) {
            DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
                vdc->instance, vdc->ldc_id, status);
            return (status);
        }
        vdc->initialized |= VDC_LDC_OPEN;
    }

    return (status);
}

static int
vdc_start_ldc_connection(vdc_t *vdc)
{
    int status = 0;

    ASSERT(vdc != NULL);

    ASSERT(MUTEX_HELD(&vdc->lock));

    status = vdc_do_ldc_up(vdc);

    DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);

    return (status);
}

static int
vdc_stop_ldc_connection(vdc_t *vdcp)
{
    int status;

    DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
        vdcp->state);

    status = ldc_down(vdcp->ldc_handle);
    DMSG(vdcp, 0, "ldc_down() = %d\n", status);

    vdcp->initialized &= ~VDC_HANDSHAKE;
    DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);

    return (status);
}

static void
vdc_create_io_kstats(vdc_t *vdc)
{
    if (vdc->io_stats != NULL) {
        DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance);
        return;
    }

    vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL,
        "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
    if (vdc->io_stats != NULL) {
        vdc->io_stats->ks_lock = &vdc->lock;
        kstat_install(vdc->io_stats);
    } else {
        cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics"
            " will not be gathered", vdc->instance);
    }
}

static void
vdc_create_err_kstats(vdc_t *vdc)
{
    vd_err_stats_t *stp;
    char kstatmodule_err[KSTAT_STRLEN];
    char kstatname[KSTAT_STRLEN];
    int ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t));
    int instance = vdc->instance;

    if (vdc->err_stats != NULL) {
        DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance);
        return;
    }

    (void) snprintf(kstatmodule_err, sizeof (kstatmodule_err),
        "%serr", VDC_DRIVER_NAME);
    (void) snprintf(kstatname, sizeof (kstatname),
        "%s%d,err", VDC_DRIVER_NAME, instance);

    vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname,
        "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

    if (vdc->err_stats == NULL) {
        cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics"
            " will not be gathered", instance);
        return;
    }

    stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
    kstat_named_init(&stp->vd_softerrs, "Soft Errors",
        KSTAT_DATA_UINT32);
    kstat_named_init(&stp->vd_transerrs, "Transport Errors",
        KSTAT_DATA_UINT32);
    kstat_named_init(&stp->vd_protoerrs, "Protocol Errors",
        KSTAT_DATA_UINT32);
    kstat_named_init(&stp->vd_vid, "Vendor",
        KSTAT_DATA_CHAR);
    kstat_named_init(&stp->vd_pid, "Product",
        KSTAT_DATA_CHAR);
    kstat_named_init(&stp->vd_capacity, "Size",
        KSTAT_DATA_ULONGLONG);

    vdc->err_stats->ks_update = nulldev;

    kstat_install(vdc->err_stats);
}

static void
vdc_set_err_kstats(vdc_t *vdc)
{
    vd_err_stats_t *stp;

    if (vdc->err_stats == NULL)
        return;

    mutex_enter(&vdc->lock);

    stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
    ASSERT(stp != NULL);

    stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->block_size;
    (void) strcpy(stp->vd_vid.value.c, "SUN");
    (void) strcpy(stp->vd_pid.value.c, "VDSK");

    mutex_exit(&vdc->lock);
}
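/*
 * The error kstat created above can be read from user level with
 * libkstat(3LIB). A minimal sketch for instance 0 (assuming
 * VDC_DRIVER_NAME is "vdc", so the module is "vdcerr" and the kstat
 * name is "vdc0,err"; compile with -lkstat):
 *
 *    #include <kstat.h>
 *    #include <stdio.h>
 *
 *    int
 *    main(void)
 *    {
 *        kstat_ctl_t *kc = kstat_open();
 *        kstat_t *ksp;
 *        kstat_named_t *kn;
 *
 *        if (kc == NULL)
 *            return (1);
 *        ksp = kstat_lookup(kc, "vdcerr", 0, "vdc0,err");
 *        if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1 &&
 *            (kn = kstat_data_lookup(ksp, "Soft Errors")) != NULL)
 *            (void) printf("soft errors: %u\n", kn->value.ui32);
 *        (void) kstat_close(kc);
 *        return (0);
 *    }
 */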
static int
vdc_create_device_nodes_efi(vdc_t *vdc)
{
    ddi_remove_minor_node(vdc->dip, "h");
    ddi_remove_minor_node(vdc->dip, "h,raw");

    if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
        VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
        DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
        cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
            vdc->instance);
        return (EIO);
    }

    /* if any device node is created we set this flag */
    vdc->initialized |= VDC_MINOR;

    if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
        VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
        DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
        cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'",
            vdc->instance);
        return (EIO);
    }

    return (0);
}

static int
vdc_create_device_nodes_vtoc(vdc_t *vdc)
{
    ddi_remove_minor_node(vdc->dip, "wd");
    ddi_remove_minor_node(vdc->dip, "wd,raw");

    if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
        VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
        DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
        cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
            vdc->instance);
        return (EIO);
    }

    /* if any device node is created we set this flag */
    vdc->initialized |= VDC_MINOR;

    if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
        VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
        DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
        cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'",
            vdc->instance);
        return (EIO);
    }

    return (0);
}
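/*
 * For example (assuming V_NUMPAR is 8), a VTOC-labeled full disk on
 * instance 2 ends up with minor nodes "a" .. "h" (block) and
 * "a,raw" .. "h,raw" (character) via the loop in
 * vdc_create_device_nodes() below; with an EFI label the slice-7 pair
 * is instead created above as "wd"/"wd,raw".
 */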
/*
 * Function:
 *	vdc_create_device_nodes
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices along with the node properties. It is called as part of
 *	the attach(9E) of the instance during the handshake with vds after
 *	vds has sent the attributes to vdc.
 *
 *	If the device is of type VD_DISK_TYPE_SLICE then minor node 2 is
 *	used, in keeping with the Solaris convention that slice 2 refers
 *	to the whole disk. Slices start at 'a'.
 *
 * Parameters:
 *	vdc - soft state pointer
 *
 * Return Values
 *	0 - Success
 *	EIO - Failed to create node
 *	EINVAL - Unknown type of disk exported
 */
static int
vdc_create_device_nodes(vdc_t *vdc)
{
    char name[sizeof ("s,raw")];
    dev_info_t *dip = NULL;
    int instance, status;
    int num_slices = 1;
    int i;

    ASSERT(vdc != NULL);

    instance = vdc->instance;
    dip = vdc->dip;

    switch (vdc->vdisk_type) {
    case VD_DISK_TYPE_DISK:
        num_slices = V_NUMPAR;
        break;
    case VD_DISK_TYPE_SLICE:
        num_slices = 1;
        break;
    case VD_DISK_TYPE_UNK:
    default:
        return (EINVAL);
    }

    /*
     * Minor nodes are different for EFI disks: EFI disks do not have
     * a minor node 'g' for the minor number corresponding to slice
     * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
     * representing the whole disk.
     */
    for (i = 0; i < num_slices; i++) {

        if (i == VD_EFI_WD_SLICE) {
            if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
                status = vdc_create_device_nodes_efi(vdc);
            else
                status = vdc_create_device_nodes_vtoc(vdc);
            if (status != 0)
                return (status);
            continue;
        }

        (void) snprintf(name, sizeof (name), "%c", 'a' + i);
        if (ddi_create_minor_node(dip, name, S_IFBLK,
            VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
            cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
                instance, name);
            return (EIO);
        }

        /* if any device node is created we set this flag */
        vdc->initialized |= VDC_MINOR;

        (void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");

        if (ddi_create_minor_node(dip, name, S_IFCHR,
            VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
            cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
                instance, name);
            return (EIO);
        }
    }

    return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes_props
 *
 * Description:
 *	This function creates the device node properties (size and number
 *	of blocks) for each slice of the virtual disk. It is called as
 *	part of the attach(9E) of the instance during the handshake with
 *	vds after vds has sent the attributes to vdc.
 *
 * Parameters:
 *	vdc - soft state pointer
 *
 * Return Values
 *	0 - Success
 *	EIO - Failed to create device node property
 *	EINVAL - Unknown type of disk exported
 */
static int
vdc_create_device_nodes_props(vdc_t *vdc)
{
    dev_info_t *dip = NULL;
    int instance;
    int num_slices = 1;
    int64_t size = 0;
    dev_t dev;
    int rv;
    int i;

    ASSERT(vdc != NULL);

    instance = vdc->instance;
    dip = vdc->dip;

    switch (vdc->vdisk_type) {
    case VD_DISK_TYPE_DISK:
        num_slices = V_NUMPAR;
        break;
    case VD_DISK_TYPE_SLICE:
        num_slices = 1;
        break;
    case VD_DISK_TYPE_UNK:
    default:
        return (EINVAL);
    }

    if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
        /* remove all properties */
        for (i = 0; i < num_slices; i++) {
            dev = makedevice(ddi_driver_major(dip),
                VD_MAKE_DEV(instance, i));
            (void) ddi_prop_remove(dev, dip, VDC_SIZE_PROP_NAME);
            (void) ddi_prop_remove(dev, dip, VDC_NBLOCKS_PROP_NAME);
        }
        return (0);
    }

    for (i = 0; i < num_slices; i++) {
        dev = makedevice(ddi_driver_major(dip),
            VD_MAKE_DEV(instance, i));

        size = vdc->slice[i].nblocks * vdc->block_size;
        DMSG(vdc, 0, "[%d] sz %ld (%ld Mb) p_size %lx\n",
            instance, size, size / (1024 * 1024),
            vdc->slice[i].nblocks);

        rv = ddi_prop_update_int64(dev, dip, VDC_SIZE_PROP_NAME, size);
        if (rv != DDI_PROP_SUCCESS) {
            cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop of [%ld]",
                instance, VDC_SIZE_PROP_NAME, size);
            return (EIO);
        }

        rv = ddi_prop_update_int64(dev, dip, VDC_NBLOCKS_PROP_NAME,
            lbtodb(size));
        if (rv != DDI_PROP_SUCCESS) {
            cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop [%llu]",
                instance, VDC_NBLOCKS_PROP_NAME, lbtodb(size));
            return (EIO);
        }
    }

    return (0);
}
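/*
 * As a worked example of the properties set above: a slice of 2097152
 * blocks with a 512-byte block size exports Size = 2097152 * 512 =
 * 1073741824 bytes and Nblocks = lbtodb(1073741824) = 2097152.
 */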
/*
 * Function:
 *	vdc_is_opened
 *
 * Description:
 *	This function checks if any slice of a given virtual disk is
 *	currently opened.
 *
 * Parameters:
 *	vdc - soft state pointer
 *
 * Return Values
 *	B_TRUE - at least one slice is opened.
 *	B_FALSE - no slice is opened.
 */
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
    int i, nslices;

    switch (vdc->vdisk_type) {
    case VD_DISK_TYPE_DISK:
        nslices = V_NUMPAR;
        break;
    case VD_DISK_TYPE_SLICE:
        nslices = 1;
        break;
    case VD_DISK_TYPE_UNK:
    default:
        ASSERT(0);
    }

    /* check if there's any layered open */
    for (i = 0; i < nslices; i++) {
        if (vdc->open_lyr[i] > 0)
            return (B_TRUE);
    }

    /* check if there is any other kind of open */
    for (i = 0; i < OTYPCNT; i++) {
        if (vdc->open[i] != 0)
            return (B_TRUE);
    }

    return (B_FALSE);
}

static int
vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
{
    uint8_t slicemask;
    int i;

    ASSERT(otyp < OTYPCNT);
    ASSERT(slice < V_NUMPAR);
    ASSERT(MUTEX_HELD(&vdc->lock));

    slicemask = 1 << slice;

    /* check if slice is already exclusively opened */
    if (vdc->open_excl & slicemask)
        return (EBUSY);

    /* if open exclusive, check if slice is already opened */
    if (flag & FEXCL) {
        if (vdc->open_lyr[slice] > 0)
            return (EBUSY);
        for (i = 0; i < OTYPCNT; i++) {
            if (vdc->open[i] & slicemask)
                return (EBUSY);
        }
        vdc->open_excl |= slicemask;
    }

    /* mark slice as opened */
    if (otyp == OTYP_LYR) {
        vdc->open_lyr[slice]++;
    } else {
        vdc->open[otyp] |= slicemask;
    }

    return (0);
}

static void
vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
{
    uint8_t slicemask;

    ASSERT(otyp < OTYPCNT);
    ASSERT(slice < V_NUMPAR);
    ASSERT(MUTEX_HELD(&vdc->lock));

    slicemask = 1 << slice;

    if (otyp == OTYP_LYR) {
        ASSERT(vdc->open_lyr[slice] > 0);
        vdc->open_lyr[slice]--;
    } else {
        vdc->open[otyp] &= ~slicemask;
    }

    if (flag & FEXCL)
        vdc->open_excl &= ~slicemask;
}
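/*
 * For example, an open of slice 3 with FEXCL sets bit 0x08 in
 * open_excl; until vdc_mark_closed() clears that bit, any further
 * open of slice 3 fails vdc_mark_opened() with EBUSY.
 */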
static int
vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
{
    _NOTE(ARGUNUSED(cred))

    int instance;
    int slice, status = 0;
    vdc_t *vdc;

    ASSERT(dev != NULL);
    instance = VDCUNIT(*dev);

    if (otyp >= OTYPCNT)
        return (EINVAL);

    if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
        cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
        return (ENXIO);
    }

    DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
        getminor(*dev), flag, otyp);

    slice = VDCPART(*dev);

    mutex_enter(&vdc->lock);

    status = vdc_mark_opened(vdc, slice, flag, otyp);

    if (status != 0) {
        mutex_exit(&vdc->lock);
        return (status);
    }

    if (flag & (FNDELAY | FNONBLOCK)) {

        /* don't resubmit a validate request if there's already one */
        if (vdc->validate_pending > 0) {
            mutex_exit(&vdc->lock);
            return (0);
        }

        /* call vdc_validate() asynchronously to avoid blocking */
        if (taskq_dispatch(system_taskq, vdc_validate_task,
            (void *)vdc, TQ_NOSLEEP) == NULL) {
            vdc_mark_closed(vdc, slice, flag, otyp);
            mutex_exit(&vdc->lock);
            return (ENXIO);
        }

        vdc->validate_pending++;
        mutex_exit(&vdc->lock);
        return (0);
    }

    mutex_exit(&vdc->lock);

    vdc_validate(vdc);

    mutex_enter(&vdc->lock);

    if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
        vdc->slice[slice].nblocks == 0) {
        vdc_mark_closed(vdc, slice, flag, otyp);
        status = EIO;
    }

    mutex_exit(&vdc->lock);

    return (status);
}

static int
vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
    _NOTE(ARGUNUSED(cred))

    int instance;
    int slice;
    int rv, rval;
    vdc_t *vdc;

    instance = VDCUNIT(dev);

    if (otyp >= OTYPCNT)
        return (EINVAL);

    if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
        cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
        return (ENXIO);
    }

    DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);

    slice = VDCPART(dev);

    /*
     * Attempt to flush the write cache (W$) on a close operation. If
     * this is not a supported IOCTL command or the backing device is
     * read-only, do not fail the close operation.
     */
    rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);

    if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
        DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
            instance, rv);
        return (EIO);
    }

    mutex_enter(&vdc->lock);
    vdc_mark_closed(vdc, slice, flag, otyp);
    mutex_exit(&vdc->lock);

    return (0);
}

static int
vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
    _NOTE(ARGUNUSED(credp))

    return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
}

static int
vdc_print(dev_t dev, char *str)
{
    cmn_err(CE_NOTE, "vdc%d: %s", VDCUNIT(dev), str);
    return (0);
}

static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
    int rv;
    size_t nbytes = nblk * DEV_BSIZE;
    int instance = VDCUNIT(dev);
    vdc_t *vdc = NULL;

    if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
        cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
        return (ENXIO);
    }

    DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
        instance, nbytes, blkno, (void *)addr);
    rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
        VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir);
    if (rv) {
        DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
        return (rv);
    }

    if (ddi_in_panic())
        (void) vdc_drain_response(vdc);

    DMSG(vdc, 0, "[%d] End\n", instance);

    return (0);
}

/* -------------------------------------------------------------------------- */

/*
 * Disk access routines
 *
 */
/*
 * vdc_strategy()
 *
 * Return Value:
 *	0: As per strategy(9E), the strategy() function must return 0
 *	   [ bioerror(9f) sets b_flags to the proper error code ]
 */
static int
vdc_strategy(struct buf *buf)
{
    int rv = -1;
    vdc_t *vdc = NULL;
    int instance = VDCUNIT(buf->b_edev);
    int op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE;
    int slice;

    if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
        cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
        bioerror(buf, ENXIO);
        biodone(buf);
        return (0);
    }

    DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
        instance, (buf->b_flags & B_READ) ? "Read" : "Write",
        buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr);

    bp_mapin(buf);

    if ((long)buf->b_private == VD_SLICE_NONE) {
        /* I/O using an absolute disk offset */
        slice = VD_SLICE_NONE;
    } else {
        slice = VDCPART(buf->b_edev);
    }

    rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr,
        buf->b_bcount, slice, buf->b_lblkno,
        CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir :
        VIO_write_dir);

    /*
     * If the request was successfully sent, the strategy call returns and
     * the ACK handler calls the bioxxx functions when the vDisk server is
     * done; otherwise we handle the error here.
     */
    if (rv) {
        DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv);
        bioerror(buf, rv);
        biodone(buf);
    }

    return (0);
}

/*
 * Function:
 *	vdc_min
 *
 * Description:
 *	Routine to limit the size of a data transfer. Used in
 *	conjunction with physio(9F).
 *
 * Arguments:
 *	bufp - pointer to the indicated buf(9S) struct.
 *
 */
static void
vdc_min(struct buf *bufp)
{
    vdc_t *vdc = NULL;
    int instance = VDCUNIT(bufp->b_edev);

    vdc = ddi_get_soft_state(vdc_state, instance);
    VERIFY(vdc != NULL);

    if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) {
        bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size;
    }
}

static int
vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
{
    _NOTE(ARGUNUSED(cred))

    DMSGX(1, "[%d] Entered", VDCUNIT(dev));
    return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio));
}

static int
vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
{
    _NOTE(ARGUNUSED(cred))

    DMSGX(1, "[%d] Entered", VDCUNIT(dev));
    return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio));
}

static int
vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
{
    _NOTE(ARGUNUSED(cred))

    DMSGX(1, "[%d] Entered", VDCUNIT(dev));
    return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio));
}

static int
vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
{
    _NOTE(ARGUNUSED(cred))

    DMSGX(1, "[%d] Entered", VDCUNIT(dev));
    return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio));
}
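/*
 * The four entry points above funnel through physio(9F)/aphysio(9F),
 * so a user-level read of the raw device ends up in vdc_strategy().
 * A minimal sketch (the device path is a hypothetical example;
 * transfers must be a multiple of DEV_BSIZE):
 *
 *    #include <sys/param.h>
 *    #include <fcntl.h>
 *    #include <unistd.h>
 *
 *    int
 *    main(void)
 *    {
 *        char buf[DEV_BSIZE];
 *        int fd = open("/dev/rdsk/c0d0s0", O_RDONLY);
 *
 *        if (fd == -1)
 *            return (1);
 *        if (pread(fd, buf, sizeof (buf), 0) != sizeof (buf))
 *            return (1);
 *        (void) close(fd);
 *        return (0);
 *    }
 */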
/* -------------------------------------------------------------------------- */

/*
 * Handshake support
 */


/*
 * Function:
 *	vdc_init_ver_negotiation()
 *
 * Description:
 *	Send a version negotiation (VIO_VER_INFO) message to the vDisk
 *	server proposing the given protocol version.
 *
 * Arguments:
 *	vdc - soft state pointer for this instance of the device driver.
 *	ver - version to propose to the server.
 *
 * Return Code:
 *	0 - Success
 */
static int
vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
{
    vio_ver_msg_t pkt;
    size_t msglen = sizeof (pkt);
    int status = -1;

    ASSERT(vdc != NULL);
    ASSERT(mutex_owned(&vdc->lock));

    DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);

    /*
     * set the Session ID to a unique value
     * (the lower 32 bits of the clock tick)
     */
    vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
    DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id);

    pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
    pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
    pkt.tag.vio_subtype_env = VIO_VER_INFO;
    pkt.tag.vio_sid = vdc->session_id;
    pkt.dev_class = VDEV_DISK;
    pkt.ver_major = ver.major;
    pkt.ver_minor = ver.minor;

    status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
    DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
        vdc->instance, status);
    if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
        DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
            "id(%lx) rv(%d) size(%ld)", vdc->instance, vdc->ldc_handle,
            status, msglen);
        if (msglen != sizeof (vio_ver_msg_t))
            status = ENOMSG;
    }

    return (status);
}

/*
 * Function:
 *	vdc_ver_negotiation()
 *
 * Description:
 *	Run the version negotiation step of the handshake: send our
 *	preferred version and process the server's response.
 *
 * Arguments:
 *	vdcp - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0 - Success
 */
static int
vdc_ver_negotiation(vdc_t *vdcp)
{
    vio_msg_t vio_msg;
    int status;

    if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0]))
        return (status);

    /* release lock and wait for response */
    mutex_exit(&vdcp->lock);
    status = vdc_wait_for_response(vdcp, &vio_msg);
    mutex_enter(&vdcp->lock);
    if (status) {
        DMSG(vdcp, 0,
            "[%d] Failed waiting for Ver negotiation response, rv(%d)",
            vdcp->instance, status);
        return (status);
    }

    /* check type and sub_type ... */
    if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
        vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
        DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
            vdcp->instance);
        return (EPROTO);
    }

    return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
}
/*
 * Function:
 *	vdc_init_attr_negotiation()
 *
 * Description:
 *	Send an attribute negotiation (VIO_ATTR_INFO) message describing
 *	this client (transfer size, block size, transfer mode) to the
 *	vDisk server.
 *
 * Arguments:
 *	vdc - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0 - Success
 */
static int
vdc_init_attr_negotiation(vdc_t *vdc)
{
    vd_attr_msg_t pkt;
    size_t msglen = sizeof (pkt);
    int status;

    ASSERT(vdc != NULL);
    ASSERT(mutex_owned(&vdc->lock));

    DMSG(vdc, 0, "[%d] entered\n", vdc->instance);

    /* fill in tag */
    pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
    pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
    pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
    pkt.tag.vio_sid = vdc->session_id;
    /* fill in payload */
    pkt.max_xfer_sz = vdc->max_xfer_sz;
    pkt.vdisk_block_size = vdc->block_size;
    pkt.xfer_mode = VIO_DRING_MODE_V1_0;
    pkt.operations = 0; /* server will set bits of valid operations */
    pkt.vdisk_type = 0; /* server will set to valid device type */
    pkt.vdisk_media = 0; /* server will set to valid media type */
    pkt.vdisk_size = 0; /* server will set to valid size */

    status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
    DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);

    if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
        DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
            "id(%lx) rv(%d) size(%ld)", vdc->instance, vdc->ldc_handle,
            status, msglen);
        if (msglen != sizeof (vd_attr_msg_t))
            status = ENOMSG;
    }

    return (status);
}

/*
 * Function:
 *	vdc_attr_negotiation()
 *
 * Description:
 *	Run the attribute exchange step of the handshake and process the
 *	server's response.
 *
 * Arguments:
 *	vdcp - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0 - Success
 */
static int
vdc_attr_negotiation(vdc_t *vdcp)
{
    int status;
    vio_msg_t vio_msg;

    if (status = vdc_init_attr_negotiation(vdcp))
        return (status);

    /* release lock and wait for response */
    mutex_exit(&vdcp->lock);
    status = vdc_wait_for_response(vdcp, &vio_msg);
    mutex_enter(&vdcp->lock);
    if (status) {
        DMSG(vdcp, 0,
            "[%d] Failed waiting for Attr negotiation response, rv(%d)",
            vdcp->instance, status);
        return (status);
    }

    /* check type and sub_type ... */
    if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
        vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
        DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
            vdcp->instance);
        return (EPROTO);
    }

    return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
}
/*
 * Function:
 *	vdc_init_dring_negotiate()
 *
 * Description:
 *	Initialise the descriptor ring and send a registration
 *	(VIO_DRING_REG) message for it to the vDisk server.
 *
 * Arguments:
 *	vdc - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0 - Success
 */
static int
vdc_init_dring_negotiate(vdc_t *vdc)
{
    vio_dring_reg_msg_t pkt;
    size_t msglen = sizeof (pkt);
    int status = -1;
    int retry;
    int nretries = 10;

    ASSERT(vdc != NULL);
    ASSERT(mutex_owned(&vdc->lock));

    for (retry = 0; retry < nretries; retry++) {
        status = vdc_init_descriptor_ring(vdc);
        if (status != EAGAIN)
            break;
        drv_usecwait(vdc_min_timeout_ldc);
    }

    if (status != 0) {
        DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
            vdc->instance, status);
        return (status);
    }

    DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
        vdc->instance, status);

    /* fill in tag */
    pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
    pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
    pkt.tag.vio_subtype_env = VIO_DRING_REG;
    pkt.tag.vio_sid = vdc->session_id;
    /* fill in payload */
    pkt.dring_ident = 0;
    pkt.num_descriptors = vdc->dring_len;
    pkt.descriptor_size = vdc->dring_entry_size;
    pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
    pkt.ncookies = vdc->dring_cookie_count;
    pkt.cookie[0] = vdc->dring_cookie[0]; /* for now just one cookie */

    status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
    if (status != 0) {
        DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
            vdc->instance, status);
    }

    return (status);
}


/*
 * Function:
 *	vdc_dring_negotiation()
 *
 * Description:
 *	Run the descriptor ring registration step of the handshake and
 *	process the server's response.
 *
 * Arguments:
 *	vdcp - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0 - Success
 */
static int
vdc_dring_negotiation(vdc_t *vdcp)
{
    int status;
    vio_msg_t vio_msg;

    if (status = vdc_init_dring_negotiate(vdcp))
        return (status);

    /* release lock and wait for response */
    mutex_exit(&vdcp->lock);
    status = vdc_wait_for_response(vdcp, &vio_msg);
    mutex_enter(&vdcp->lock);
    if (status) {
        DMSG(vdcp, 0,
            "[%d] Failed waiting for Dring negotiation response,"
            " rv(%d)", vdcp->instance, status);
        return (status);
    }

    /* check type and sub_type ... */
    if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
        vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
        DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
            vdcp->instance);
        return (EPROTO);
    }

    return (vdc_handle_dring_reg_msg(vdcp,
        (vio_dring_reg_msg_t *)&vio_msg));
}
/*
 * Function:
 *	vdc_send_rdx()
 *
 * Description:
 *	Send an RDX message to the vDisk server to indicate that the
 *	handshake is complete and we are ready to exchange data.
 *
 * Arguments:
 *	vdcp - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0 - Success
 */
static int
vdc_send_rdx(vdc_t *vdcp)
{
    vio_msg_t msg;
    size_t msglen = sizeof (vio_msg_t);
    int status;

    /*
     * Send an RDX message to vds to indicate we are ready
     * to send data
     */
    msg.tag.vio_msgtype = VIO_TYPE_CTRL;
    msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
    msg.tag.vio_subtype_env = VIO_RDX;
    msg.tag.vio_sid = vdcp->session_id;
    status = vdc_send(vdcp, (caddr_t)&msg, &msglen);
    if (status != 0) {
        DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)",
            vdcp->instance, status);
    }

    return (status);
}

/*
 * Function:
 *	vdc_handle_rdx()
 *
 * Description:
 *	Process the server's response to our RDX message; the response
 *	is expected to be an ACK.
 *
 * Arguments:
 *	vdcp - soft state pointer for this instance of the device driver.
 *	msgp - received msg
 *
 * Return Code:
 *	0 - Success
 */
static int
vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp)
{
    _NOTE(ARGUNUSED(vdcp))
    _NOTE(ARGUNUSED(msgp))

    ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL);
    ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK);
    ASSERT(msgp->tag.vio_subtype_env == VIO_RDX);

    DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance);

    return (0);
}

/*
 * Function:
 *	vdc_rdx_exchange()
 *
 * Description:
 *	Run the final (RDX) step of the handshake.
 *
 * Arguments:
 *	vdcp - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0 - Success
 */
static int
vdc_rdx_exchange(vdc_t *vdcp)
{
    int status;
    vio_msg_t vio_msg;

    if (status = vdc_send_rdx(vdcp))
        return (status);

    /* release lock and wait for response */
    mutex_exit(&vdcp->lock);
    status = vdc_wait_for_response(vdcp, &vio_msg);
    mutex_enter(&vdcp->lock);
    if (status) {
        DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)",
            vdcp->instance, status);
        return (status);
    }

    /* check type and sub_type ... */
    if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
        vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) {
        DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance);
        return (EPROTO);
    }

    return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg));
}
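/*
 * In summary, the handshake functions above run in sequence: version
 * negotiation, attribute negotiation, descriptor ring registration and
 * finally the RDX exchange; once RDX is ACKed both ends are ready to
 * move data.
 */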
2029 */ 2030 delay_time = vdc_ldc_read_init_delay; 2031 loop: 2032 len = *nbytesp; 2033 status = ldc_read(vdc->ldc_handle, (caddr_t)msgp, &len); 2034 switch (status) { 2035 case EAGAIN: 2036 delay_time *= 2; 2037 if (delay_time >= vdc_ldc_read_max_delay) 2038 delay_time = vdc_ldc_read_max_delay; 2039 delay(delay_time); 2040 goto loop; 2041 2042 case 0: 2043 if (len == 0) { 2044 DMSG(vdc, 1, "[%d] ldc_read returned 0 bytes with " 2045 "no error!\n", vdc->instance); 2046 goto loop; 2047 } 2048 2049 *nbytesp = len; 2050 2051 /* 2052 * If there are pending messages, leave the 2053 * read state as pending. Otherwise, set the state 2054 * back to idle. 2055 */ 2056 status = ldc_chkq(vdc->ldc_handle, &q_has_pkts); 2057 if (status == 0 && !q_has_pkts) 2058 vdc->read_state = VDC_READ_IDLE; 2059 2060 break; 2061 default: 2062 DMSG(vdc, 0, "ldc_read returned %d\n", status); 2063 break; 2064 } 2065 2066 done: 2067 mutex_exit(&vdc->read_lock); 2068 2069 return (status); 2070 } 2071 2072 2073 2074 #ifdef DEBUG 2075 void 2076 vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg) 2077 { 2078 char *ms, *ss, *ses; 2079 switch (msg->tag.vio_msgtype) { 2080 #define Q(_s) case _s : ms = #_s; break; 2081 Q(VIO_TYPE_CTRL) 2082 Q(VIO_TYPE_DATA) 2083 Q(VIO_TYPE_ERR) 2084 #undef Q 2085 default: ms = "unknown"; break; 2086 } 2087 2088 switch (msg->tag.vio_subtype) { 2089 #define Q(_s) case _s : ss = #_s; break; 2090 Q(VIO_SUBTYPE_INFO) 2091 Q(VIO_SUBTYPE_ACK) 2092 Q(VIO_SUBTYPE_NACK) 2093 #undef Q 2094 default: ss = "unknown"; break; 2095 } 2096 2097 switch (msg->tag.vio_subtype_env) { 2098 #define Q(_s) case _s : ses = #_s; break; 2099 Q(VIO_VER_INFO) 2100 Q(VIO_ATTR_INFO) 2101 Q(VIO_DRING_REG) 2102 Q(VIO_DRING_UNREG) 2103 Q(VIO_RDX) 2104 Q(VIO_PKT_DATA) 2105 Q(VIO_DESC_DATA) 2106 Q(VIO_DRING_DATA) 2107 #undef Q 2108 default: ses = "unknown"; break; 2109 } 2110 2111 DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n", 2112 msg->tag.vio_msgtype, msg->tag.vio_subtype, 2113 msg->tag.vio_subtype_env, ms, ss, ses); 2114 } 2115 #endif 2116 2117 /* 2118 * Function: 2119 * vdc_send() 2120 * 2121 * Description: 2122 * The function encapsulates the call to write a message using LDC. 2123 * If LDC indicates that the call failed due to the queue being full, 2124 * we retry the ldc_write(), otherwise we return the error returned by LDC. 2125 * 2126 * Arguments: 2127 * vdc - soft state pointer; identifies the LDC channel this instance of vdc uses 2128 * pkt - address of LDC message to be sent 2129 * msglen - the size of the message being sent. When the function 2130 * returns, this contains the number of bytes written. 2131 * 2132 * Return Code: 2133 * 0 - Success. 2134 * EINVAL - pkt or msglen were NULL 2135 * ECONNRESET - The connection was not up. 2136 * EWOULDBLOCK - LDC queue is full 2137 * xxx - other error codes returned by ldc_write 2138 */ 2139 static int 2140 vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen) 2141 { 2142 size_t size = 0; 2143 int status = 0; 2144 clock_t delay_ticks; 2145 2146 ASSERT(vdc != NULL); 2147 ASSERT(mutex_owned(&vdc->lock)); 2148 ASSERT(msglen != NULL); 2149 ASSERT(*msglen != 0); 2150 2151 #ifdef DEBUG 2152 vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt); 2153 #endif 2154 /* 2155 * Wait indefinitely to send if channel 2156 * is busy, but bail out if we succeed or 2157 * if the channel closes or is reset.
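 *
 * For example (illustrative values only): if vdc_hz_min_ldc_delay
 * were 1 tick and vdc_hz_max_ldc_delay 16 ticks, successive waits
 * on EWOULDBLOCK would be 1, 2, 4, 8, 16, 16, ... ticks.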
2158 */ 2159 delay_ticks = vdc_hz_min_ldc_delay; 2160 do { 2161 size = *msglen; 2162 status = ldc_write(vdc->ldc_handle, pkt, &size); 2163 if (status == EWOULDBLOCK) { 2164 delay(delay_ticks); 2165 /* geometric backoff */ 2166 delay_ticks *= 2; 2167 if (delay_ticks > vdc_hz_max_ldc_delay) 2168 delay_ticks = vdc_hz_max_ldc_delay; 2169 } 2170 } while (status == EWOULDBLOCK); 2171 2172 /* if LDC had serious issues --- reset vdc state */ 2173 if (status == EIO || status == ECONNRESET) { 2174 /* wake up any readers so they will detect the reset */ 2175 mutex_enter(&vdc->read_lock); 2176 if ((vdc->read_state == VDC_READ_WAITING) || 2177 (vdc->read_state == VDC_READ_RESET)) 2178 cv_signal(&vdc->read_cv); 2179 vdc->read_state = VDC_READ_RESET; 2180 mutex_exit(&vdc->read_lock); 2181 2182 /* wake up any waiters in the reset thread */ 2183 if (vdc->state == VDC_STATE_INIT_WAITING) { 2184 DMSG(vdc, 0, "[%d] write reset - " 2185 "vdc is resetting ..\n", vdc->instance); 2186 vdc->state = VDC_STATE_RESETTING; 2187 cv_signal(&vdc->initwait_cv); 2188 } 2189 2190 return (ECONNRESET); 2191 } 2192 2193 /* return the last size written */ 2194 *msglen = size; 2195 2196 return (status); 2197 } 2198 2199 /* 2200 * Function: 2201 * vdc_get_md_node 2202 * 2203 * Description: 2204 * Get the MD, the device node and the port node for the given 2205 * disk instance. The caller is responsible for cleaning up the 2206 * reference to the returned MD (mdpp) by calling md_fini_handle(). 2207 * 2208 * Arguments: 2209 * dip - dev info pointer for this instance of the device driver. 2210 * mdpp - the returned MD. 2211 * vd_nodep - the returned device node. 2212 * vd_portp - the returned port node. The returned port node is NULL 2213 * if no port node is found. 2214 * 2215 * Return Code: 2216 * 0 - Success. 2217 * ENOENT - Expected node or property did not exist. 2218 * ENXIO - Unexpected error communicating with MD framework 2219 */ 2220 static int 2221 vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep, 2222 mde_cookie_t *vd_portp) 2223 { 2224 int status = ENOENT; 2225 char *node_name = NULL; 2226 md_t *mdp = NULL; 2227 int num_nodes; 2228 int num_vdevs; 2229 int num_vports; 2230 mde_cookie_t rootnode; 2231 mde_cookie_t *listp = NULL; 2232 boolean_t found_inst = B_FALSE; 2233 int listsz; 2234 int idx; 2235 uint64_t md_inst; 2236 int obp_inst; 2237 int instance = ddi_get_instance(dip); 2238 2239 /* 2240 * Get the OBP instance number for comparison with the MD instance 2241 * 2242 * The "cfg-handle" property of a vdc node in an MD contains the MD's 2243 * notion of "instance", or unique identifier, for that node; OBP 2244 * stores the value of the "cfg-handle" MD property as the value of 2245 * the "reg" property on the node in the device tree it builds from 2246 * the MD and passes to Solaris. Thus, we look up the devinfo node's 2247 * "reg" property value to uniquely identify this device instance. 2248 * If the "reg" property cannot be found, the device tree state is 2249 * presumably so broken that there is no point in continuing. 2250 */ 2251 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) { 2252 cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG); 2253 return (ENOENT); 2254 } 2255 obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2256 OBP_REG, -1); 2257 DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst); 2258 2259 /* 2260 * We now walk the MD nodes to find the node for this vdisk.
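 *
 * The walk follows the usual MD idiom (a sketch, with error
 * handling elided; the code below is authoritative):
 *
 *	mdp = md_get_handle();
 *	listp = kmem_zalloc(md_node_count(mdp) *
 *	    sizeof (mde_cookie_t), KM_SLEEP);
 *	num_vdevs = md_scan_dag(mdp, md_root_node(mdp),
 *	    md_find_name(mdp, VDC_MD_VDEV_NAME),
 *	    md_find_name(mdp, "fwd"), listp);
 *	(then compare each disk node's VDC_MD_CFG_HDL value
 *	against the OBP "reg" value obtained above)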
2261 */ 2262 if ((mdp = md_get_handle()) == NULL) { 2263 cmn_err(CE_WARN, "unable to init machine description"); 2264 return (ENXIO); 2265 } 2266 2267 num_nodes = md_node_count(mdp); 2268 ASSERT(num_nodes > 0); 2269 2270 listsz = num_nodes * sizeof (mde_cookie_t); 2271 2272 /* allocate memory for nodes */ 2273 listp = kmem_zalloc(listsz, KM_SLEEP); 2274 2275 rootnode = md_root_node(mdp); 2276 ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); 2277 2278 /* 2279 * Search for all the virtual devices, we will then check to see which 2280 * ones are disk nodes. 2281 */ 2282 num_vdevs = md_scan_dag(mdp, rootnode, 2283 md_find_name(mdp, VDC_MD_VDEV_NAME), 2284 md_find_name(mdp, "fwd"), listp); 2285 2286 if (num_vdevs <= 0) { 2287 cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); 2288 status = ENOENT; 2289 goto done; 2290 } 2291 2292 DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs); 2293 for (idx = 0; idx < num_vdevs; idx++) { 2294 status = md_get_prop_str(mdp, listp[idx], "name", &node_name); 2295 if ((status != 0) || (node_name == NULL)) { 2296 cmn_err(CE_NOTE, "Unable to get name of node type '%s'" 2297 ": err %d", VDC_MD_VDEV_NAME, status); 2298 continue; 2299 } 2300 2301 DMSGX(1, "[%d] Found node '%s'\n", instance, node_name); 2302 if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { 2303 status = md_get_prop_val(mdp, listp[idx], 2304 VDC_MD_CFG_HDL, &md_inst); 2305 DMSGX(1, "[%d] vdc inst in MD=%lx\n", 2306 instance, md_inst); 2307 if ((status == 0) && (md_inst == obp_inst)) { 2308 found_inst = B_TRUE; 2309 break; 2310 } 2311 } 2312 } 2313 2314 if (!found_inst) { 2315 DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME); 2316 status = ENOENT; 2317 goto done; 2318 } 2319 DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst); 2320 2321 *vd_nodep = listp[idx]; 2322 *mdpp = mdp; 2323 2324 num_vports = md_scan_dag(mdp, *vd_nodep, 2325 md_find_name(mdp, VDC_MD_PORT_NAME), 2326 md_find_name(mdp, "fwd"), listp); 2327 2328 if (num_vports != 1) { 2329 DMSGX(0, "Expected 1 '%s' node for '%s' port, found %d\n", 2330 VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME, num_vports); 2331 } 2332 2333 *vd_portp = (num_vports == 0)? NULL: listp[0]; 2334 2335 done: 2336 kmem_free(listp, listsz); 2337 return (status); 2338 } 2339 2340 /* 2341 * Function: 2342 * vdc_get_ldc_id() 2343 * 2344 * Description: 2345 * This function gets the 'ldc-id' for this particular instance of vdc. 2346 * The id returned is the guest domain channel endpoint LDC uses for 2347 * communication with vds. 2348 * 2349 * Arguments: 2350 * mdp - pointer to the machine description. 2351 * vd_node - the vdisk element from the MD. 2352 * ldc_id - pointer to variable used to return the 'ldc-id' found. 2353 * 2354 * Return Code: 2355 * 0 - Success. 2356 * ENOENT - Expected node or property did not exist. 
2357 */ 2358 static int 2359 vdc_get_ldc_id(md_t *mdp, mde_cookie_t vd_node, uint64_t *ldc_id) 2360 { 2361 mde_cookie_t *chanp = NULL; 2362 int listsz; 2363 int num_chans; 2364 int num_nodes; 2365 int status = 0; 2366 2367 num_nodes = md_node_count(mdp); 2368 ASSERT(num_nodes > 0); 2369 2370 listsz = num_nodes * sizeof (mde_cookie_t); 2371 2372 /* allocate memory for nodes */ 2373 chanp = kmem_zalloc(listsz, KM_SLEEP); 2374 2375 /* get the channels for this node */ 2376 num_chans = md_scan_dag(mdp, vd_node, 2377 md_find_name(mdp, VDC_MD_CHAN_NAME), 2378 md_find_name(mdp, "fwd"), chanp); 2379 2380 /* expecting at least one channel */ 2381 if (num_chans <= 0) { 2382 cmn_err(CE_NOTE, "No '%s' node for '%s' port", 2383 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); 2384 status = ENOENT; 2385 goto done; 2386 2387 } else if (num_chans != 1) { 2388 DMSGX(0, "Expected 1 '%s' node for '%s' port, found %d\n", 2389 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, num_chans); 2390 } 2391 2392 /* 2393 * We use the first channel found (index 0), irrespective of how 2394 * many are there in total. 2395 */ 2396 if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, ldc_id) != 0) { 2397 cmn_err(CE_NOTE, "Channel '%s' property not found", VDC_MD_ID); 2398 status = ENOENT; 2399 } 2400 2401 done: 2402 kmem_free(chanp, listsz); 2403 return (status); 2404 } 2405 2406 static int 2407 vdc_do_ldc_up(vdc_t *vdc) 2408 { 2409 int status; 2410 ldc_status_t ldc_state; 2411 2412 DMSG(vdc, 0, "[%d] Bringing up channel %lx\n", 2413 vdc->instance, vdc->ldc_id); 2414 2415 if (vdc->lifecycle == VDC_LC_DETACHING) 2416 return (EINVAL); 2417 2418 if ((status = ldc_up(vdc->ldc_handle)) != 0) { 2419 switch (status) { 2420 case ECONNREFUSED: /* listener not ready at other end */ 2421 DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n", 2422 vdc->instance, vdc->ldc_id, status); 2423 status = 0; 2424 break; 2425 default: 2426 DMSG(vdc, 0, "[%d] Failed to bring up LDC: " 2427 "channel=%ld, err=%d", vdc->instance, vdc->ldc_id, 2428 status); 2429 break; 2430 } 2431 } 2432 2433 if (ldc_status(vdc->ldc_handle, &ldc_state) == 0) { 2434 vdc->ldc_state = ldc_state; 2435 if (ldc_state == LDC_UP) { 2436 DMSG(vdc, 0, "[%d] LDC channel already up\n", 2437 vdc->instance); 2438 vdc->seq_num = 1; 2439 vdc->seq_num_reply = 0; 2440 } 2441 } 2442 2443 return (status); 2444 } 2445 2446 /* 2447 * Function: 2448 * vdc_terminate_ldc() 2449 * 2450 * Description: 2451 * 2452 * Arguments: 2453 * vdc - soft state pointer for this instance of the device driver. 
2454 * 2455 * Return Code: 2456 * None 2457 */ 2458 static void 2459 vdc_terminate_ldc(vdc_t *vdc) 2460 { 2461 int instance = ddi_get_instance(vdc->dip); 2462 2463 ASSERT(vdc != NULL); 2464 ASSERT(mutex_owned(&vdc->lock)); 2465 2466 DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized); 2467 2468 if (vdc->initialized & VDC_LDC_OPEN) { 2469 DMSG(vdc, 0, "[%d] ldc_close()\n", instance); 2470 (void) ldc_close(vdc->ldc_handle); 2471 } 2472 if (vdc->initialized & VDC_LDC_CB) { 2473 DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); 2474 (void) ldc_unreg_callback(vdc->ldc_handle); 2475 } 2476 if (vdc->initialized & VDC_LDC) { 2477 DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); 2478 (void) ldc_fini(vdc->ldc_handle); 2479 vdc->ldc_handle = NULL; 2480 } 2481 2482 vdc->initialized &= ~(VDC_LDC | VDC_LDC_CB | VDC_LDC_OPEN); 2483 } 2484 2485 /* -------------------------------------------------------------------------- */ 2486 2487 /* 2488 * Descriptor Ring helper routines 2489 */ 2490 2491 /* 2492 * Function: 2493 * vdc_init_descriptor_ring() 2494 * 2495 * Description: 2496 * 2497 * Arguments: 2498 * vdc - soft state pointer for this instance of the device driver. 2499 * 2500 * Return Code: 2501 * 0 - Success 2502 */ 2503 static int 2504 vdc_init_descriptor_ring(vdc_t *vdc) 2505 { 2506 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2507 int status = 0; 2508 int i; 2509 2510 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2511 2512 ASSERT(vdc != NULL); 2513 ASSERT(mutex_owned(&vdc->lock)); 2514 ASSERT(vdc->ldc_handle != NULL); 2515 2516 /* ensure we have enough room to store max sized block */ 2517 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2518 2519 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2520 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2521 /* 2522 * Calculate the maximum block size we can transmit using one 2523 * Descriptor Ring entry from the attributes returned by the 2524 * vDisk server. This is subject to a minimum of 'maxphys' 2525 * as we do not have the capability to split requests over 2526 * multiple DRing entries. 
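 *
 * Worked example (illustrative values only): with a block_size of
 * 8192 bytes and a max_xfer_sz of 256 blocks, one entry describes
 * up to 256 * 8192 bytes = 2 MB, i.e. (2 MB / PAGESIZE) cookies;
 * if that product were smaller than maxphys we would fall back to
 * maxphys / PAGESIZE cookies instead.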
2527 */ 2528 if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) { 2529 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2530 vdc->instance); 2531 vdc->dring_max_cookies = maxphys / PAGESIZE; 2532 } else { 2533 vdc->dring_max_cookies = 2534 (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE; 2535 } 2536 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2537 (sizeof (ldc_mem_cookie_t) * 2538 (vdc->dring_max_cookies - 1))); 2539 vdc->dring_len = VD_DRING_LEN; 2540 2541 status = ldc_mem_dring_create(vdc->dring_len, 2542 vdc->dring_entry_size, &vdc->ldc_dring_hdl); 2543 if ((vdc->ldc_dring_hdl == NULL) || (status != 0)) { 2544 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2545 vdc->instance); 2546 return (status); 2547 } 2548 vdc->initialized |= VDC_DRING_INIT; 2549 } 2550 2551 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2552 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2553 vdc->dring_cookie = 2554 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2555 2556 status = ldc_mem_dring_bind(vdc->ldc_handle, vdc->ldc_dring_hdl, 2557 LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2558 &vdc->dring_cookie[0], 2559 &vdc->dring_cookie_count); 2560 if (status != 0) { 2561 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2562 "(%lx) to channel (%lx) status=%d\n", 2563 vdc->instance, vdc->ldc_dring_hdl, 2564 vdc->ldc_handle, status); 2565 return (status); 2566 } 2567 ASSERT(vdc->dring_cookie_count == 1); 2568 vdc->initialized |= VDC_DRING_BOUND; 2569 } 2570 2571 status = ldc_mem_dring_info(vdc->ldc_dring_hdl, &vdc->dring_mem_info); 2572 if (status != 0) { 2573 DMSG(vdc, 0, 2574 "[%d] Failed to get info for descriptor ring (%lx)\n", 2575 vdc->instance, vdc->ldc_dring_hdl); 2576 return (status); 2577 } 2578 2579 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2580 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2581 2582 /* Allocate the local copy of this dring */ 2583 vdc->local_dring = 2584 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 2585 KM_SLEEP); 2586 vdc->initialized |= VDC_DRING_LOCAL; 2587 } 2588 2589 /* 2590 * Mark all DRing entries as free and initialize the private 2591 * descriptor's memory handles. If any entry is initialized, 2592 * we need to free it later so we set the bit in 'initialized' 2593 * at the start. 2594 */ 2595 vdc->initialized |= VDC_DRING_ENTRY; 2596 for (i = 0; i < vdc->dring_len; i++) { 2597 dep = VDC_GET_DRING_ENTRY_PTR(vdc, i); 2598 dep->hdr.dstate = VIO_DESC_FREE; 2599 2600 status = ldc_mem_alloc_handle(vdc->ldc_handle, 2601 &vdc->local_dring[i].desc_mhdl); 2602 if (status != 0) { 2603 DMSG(vdc, 0, "![%d] Failed to alloc mem handle for" 2604 " descriptor %d", vdc->instance, i); 2605 return (status); 2606 } 2607 vdc->local_dring[i].is_free = B_TRUE; 2608 vdc->local_dring[i].dep = dep; 2609 } 2610 2611 /* Initialize the starting index */ 2612 vdc->dring_curr_idx = 0; 2613 2614 return (status); 2615 } 2616 2617 /* 2618 * Function: 2619 * vdc_destroy_descriptor_ring() 2620 * 2621 * Description: 2622 * 2623 * Arguments: 2624 * vdc - soft state pointer for this instance of the device driver. 
2625 * 2626 * Return Code: 2627 * None 2628 */ 2629 static void 2630 vdc_destroy_descriptor_ring(vdc_t *vdc) 2631 { 2632 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 2633 ldc_mem_handle_t mhdl = NULL; 2634 ldc_mem_info_t minfo; 2635 int status = -1; 2636 int i; /* loop */ 2637 2638 ASSERT(vdc != NULL); 2639 ASSERT(mutex_owned(&vdc->lock)); 2640 2641 DMSG(vdc, 0, "[%d] Entered\n", vdc->instance); 2642 2643 if (vdc->initialized & VDC_DRING_ENTRY) { 2644 DMSG(vdc, 0, 2645 "[%d] Removing Local DRing entries\n", vdc->instance); 2646 for (i = 0; i < vdc->dring_len; i++) { 2647 ldep = &vdc->local_dring[i]; 2648 mhdl = ldep->desc_mhdl; 2649 2650 if (mhdl == NULL) 2651 continue; 2652 2653 if ((status = ldc_mem_info(mhdl, &minfo)) != 0) { 2654 DMSG(vdc, 0, 2655 "ldc_mem_info returned an error: %d\n", 2656 status); 2657 2658 /* 2659 * This must mean that the mem handle 2660 * is not valid. Clear it out so that 2661 * no one tries to use it. 2662 */ 2663 ldep->desc_mhdl = NULL; 2664 continue; 2665 } 2666 2667 if (minfo.status == LDC_BOUND) { 2668 (void) ldc_mem_unbind_handle(mhdl); 2669 } 2670 2671 (void) ldc_mem_free_handle(mhdl); 2672 2673 ldep->desc_mhdl = NULL; 2674 } 2675 vdc->initialized &= ~VDC_DRING_ENTRY; 2676 } 2677 2678 if (vdc->initialized & VDC_DRING_LOCAL) { 2679 DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance); 2680 kmem_free(vdc->local_dring, 2681 vdc->dring_len * sizeof (vdc_local_desc_t)); 2682 vdc->initialized &= ~VDC_DRING_LOCAL; 2683 } 2684 2685 if (vdc->initialized & VDC_DRING_BOUND) { 2686 DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance); 2687 status = ldc_mem_dring_unbind(vdc->ldc_dring_hdl); 2688 if (status == 0) { 2689 vdc->initialized &= ~VDC_DRING_BOUND; 2690 } else { 2691 DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx", 2692 vdc->instance, status, vdc->ldc_dring_hdl); 2693 } 2694 kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t)); 2695 } 2696 2697 if (vdc->initialized & VDC_DRING_INIT) { 2698 DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance); 2699 status = ldc_mem_dring_destroy(vdc->ldc_dring_hdl); 2700 if (status == 0) { 2701 vdc->ldc_dring_hdl = NULL; 2702 bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t)); 2703 vdc->initialized &= ~VDC_DRING_INIT; 2704 } else { 2705 DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)", 2706 vdc->instance, status, vdc->ldc_dring_hdl); 2707 } 2708 } 2709 } 2710 2711 /* 2712 * Function: 2713 * vdc_map_to_shared_dring() 2714 * 2715 * Description: 2716 * Copy contents of the local descriptor to the shared 2717 * memory descriptor. 2718 * 2719 * Arguments: 2720 * vdcp - soft state pointer for this instance of the device driver.
2721 * idx - descriptor ring index 2722 * 2723 * Return Code: 2724 * 0 - Success, or EAGAIN if the entry's buffer could not be bound 2725 */ 2726 static int 2727 vdc_map_to_shared_dring(vdc_t *vdcp, int idx) 2728 { 2729 vdc_local_desc_t *ldep; 2730 vd_dring_entry_t *dep; 2731 int rv; 2732 2733 ldep = &(vdcp->local_dring[idx]); 2734 2735 /* if the request carries data, bind its buffer to a mem handle */ 2736 if (ldep->nbytes > 0) { 2737 rv = vdc_populate_mem_hdl(vdcp, ldep); 2738 if (rv) { 2739 DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n", 2740 vdcp->instance); 2741 return (rv); 2742 } 2743 } 2744 2745 /* 2746 * fill in the data details into the DRing 2747 */ 2748 dep = ldep->dep; 2749 ASSERT(dep != NULL); 2750 2751 dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp); 2752 dep->payload.operation = ldep->operation; 2753 dep->payload.addr = ldep->offset; 2754 dep->payload.nbytes = ldep->nbytes; 2755 dep->payload.status = (uint32_t)-1; /* vds will set valid value */ 2756 dep->payload.slice = ldep->slice; 2757 dep->hdr.dstate = VIO_DESC_READY; 2758 dep->hdr.ack = 1; /* request an ACK for every message */ 2759 2760 return (0); 2761 } 2762 2763 /* 2764 * Function: 2765 * vdc_send_request 2766 * 2767 * Description: 2768 * This routine writes the data to be transmitted to vds into the 2769 * descriptor, notifies vds that the ring has been updated and 2770 * then waits for the request to be processed. 2771 * 2772 * Arguments: 2773 * vdcp - the soft state pointer 2774 * operation - operation we want vds to perform (VD_OP_XXX) 2775 * addr - address of data buf to be read/written. 2776 * nbytes - number of bytes to read/write 2777 * slice - the disk slice this request is for 2778 * offset - relative disk offset 2779 * cb_type - type of call - STRATEGY or SYNC 2780 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2781 * . mode for ioctl(9e) 2782 * . LP64 diskaddr_t (block I/O) 2783 * dir - direction of operation (READ/WRITE/BOTH) 2784 * 2785 * Return Codes: 2786 * 0 2787 * EIO, ENXIO 2788 */ 2789 static int 2790 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr, 2791 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2792 void *cb_arg, vio_desc_direction_t dir) 2793 { 2794 int rv = 0; 2795 2796 ASSERT(vdcp != NULL); 2797 ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR); 2798 2799 mutex_enter(&vdcp->lock); 2800 2801 /* 2802 * If this is a block read/write operation we update the I/O statistics 2803 * to indicate that the request is being put on the waitq to be 2804 * serviced. 2805 * 2806 * We do it here (a common routine for both synchronous and strategy 2807 * calls) for performance reasons - we are already holding vdc->lock 2808 * so there is no extra locking overhead. We would have to explicitly 2809 * grab the 'lock' mutex to update the stats if we were to do this 2810 * higher up the stack in vdc_strategy() et al. 2811 */ 2812 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2813 DTRACE_IO1(start, buf_t *, cb_arg); 2814 VD_KSTAT_WAITQ_ENTER(vdcp->io_stats); 2815 } 2816 2817 do { 2818 while (vdcp->state != VDC_STATE_RUNNING) { 2819 2820 /* return error if detaching */ 2821 if (vdcp->state == VDC_STATE_DETACH) { 2822 rv = ENXIO; 2823 goto done; 2824 } 2825 2826 /* fail request if connection timeout is reached */ 2827 if (vdcp->ctimeout_reached) { 2828 rv = EIO; 2829 goto done; 2830 } 2831 2832 /* 2833 * If we are panicking and the disk is not ready then 2834 * we can't send any request because we can't complete 2835 * the handshake now.
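 *
 * (At panic time, request completions are instead collected by
 * polling; see vdc_drain_response() below.)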
2836 */ 2837 if (ddi_in_panic()) { 2838 rv = EIO; 2839 goto done; 2840 } 2841 2842 cv_wait(&vdcp->running_cv, &vdcp->lock); 2843 } 2844 2845 } while (vdc_populate_descriptor(vdcp, operation, addr, 2846 nbytes, slice, offset, cb_type, cb_arg, dir)); 2847 2848 done: 2849 /* 2850 * If this is a block read/write we update the I/O statistics kstat 2851 * to indicate that this request has been placed on the queue for 2852 * processing (i.e. sent to the vDisk server) - iostat(1M) will 2853 * report the time waiting for the vDisk server under the %b column. 2854 * In the case of an error we simply take it off the wait queue. 2855 */ 2856 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2857 if (rv == 0) { 2858 VD_KSTAT_WAITQ_TO_RUNQ(vdcp->io_stats); 2859 DTRACE_PROBE1(send, buf_t *, cb_arg); 2860 } else { 2861 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 2862 VD_KSTAT_WAITQ_EXIT(vdcp->io_stats); 2863 DTRACE_IO1(done, buf_t *, cb_arg); 2864 } 2865 } 2866 2867 mutex_exit(&vdcp->lock); 2868 2869 return (rv); 2870 } 2871 2872 2873 /* 2874 * Function: 2875 * vdc_populate_descriptor 2876 * 2877 * Description: 2878 * This routine fills in the next free entry of the descriptor ring 2879 * with the request, binds the data buffer and sends a dring data 2880 * message to notify vds that the ring has been updated. 2881 * 2882 * Arguments: 2883 * vdcp - the soft state pointer 2884 * operation - operation we want vds to perform (VD_OP_XXX) 2885 * addr - address of data buf to be read/written. 2886 * nbytes - number of bytes to read/write 2887 * slice - the disk slice this request is for 2888 * offset - relative disk offset 2889 * cb_type - type of call - STRATEGY or SYNC 2890 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2891 * . mode for ioctl(9e) 2892 * .
LP64 diskaddr_t (block I/O) 2893 * dir - direction of operation (READ/WRITE/BOTH) 2894 * 2895 * Return Codes: 2896 * 0 2897 * EAGAIN 2898 * ECONNRESET 2899 * ENXIO 2900 */ 2901 static int 2902 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 2903 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2904 void *cb_arg, vio_desc_direction_t dir) 2905 { 2906 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 2907 int idx; /* Index of DRing entry used */ 2908 int next_idx; 2909 vio_dring_msg_t dmsg; 2910 size_t msglen; 2911 int rv; 2912 2913 ASSERT(MUTEX_HELD(&vdcp->lock)); 2914 vdcp->threads_pending++; 2915 loop: 2916 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 2917 2918 /* Get next available D-Ring entry */ 2919 idx = vdcp->dring_curr_idx; 2920 local_dep = &(vdcp->local_dring[idx]); 2921 2922 if (!local_dep->is_free) { 2923 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 2924 vdcp->instance); 2925 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 2926 if (vdcp->state == VDC_STATE_RUNNING || 2927 vdcp->state == VDC_STATE_HANDLE_PENDING) { 2928 goto loop; 2929 } 2930 vdcp->threads_pending--; 2931 return (ECONNRESET); 2932 } 2933 2934 next_idx = idx + 1; 2935 if (next_idx >= vdcp->dring_len) 2936 next_idx = 0; 2937 vdcp->dring_curr_idx = next_idx; 2938 2939 ASSERT(local_dep->is_free); 2940 2941 local_dep->operation = operation; 2942 local_dep->addr = addr; 2943 local_dep->nbytes = nbytes; 2944 local_dep->slice = slice; 2945 local_dep->offset = offset; 2946 local_dep->cb_type = cb_type; 2947 local_dep->cb_arg = cb_arg; 2948 local_dep->dir = dir; 2949 2950 local_dep->is_free = B_FALSE; 2951 2952 rv = vdc_map_to_shared_dring(vdcp, idx); 2953 if (rv) { 2954 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 2955 vdcp->instance); 2956 /* free the descriptor */ 2957 local_dep->is_free = B_TRUE; 2958 vdcp->dring_curr_idx = idx; 2959 cv_wait(&vdcp->membind_cv, &vdcp->lock); 2960 if (vdcp->state == VDC_STATE_RUNNING || 2961 vdcp->state == VDC_STATE_HANDLE_PENDING) { 2962 goto loop; 2963 } 2964 vdcp->threads_pending--; 2965 return (ECONNRESET); 2966 } 2967 2968 /* 2969 * Send a msg with the DRing details to vds 2970 */ 2971 VIO_INIT_DRING_DATA_TAG(dmsg); 2972 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 2973 dmsg.dring_ident = vdcp->dring_ident; 2974 dmsg.start_idx = idx; 2975 dmsg.end_idx = idx; 2976 vdcp->seq_num++; 2977 2978 DTRACE_PROBE2(populate, int, vdcp->instance, 2979 vdc_local_desc_t *, local_dep); 2980 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 2981 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 2982 2983 /* 2984 * note we're still holding the lock here to 2985 * make sure the message goes out in order !!!... 2986 */ 2987 msglen = sizeof (dmsg); 2988 rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen); 2989 switch (rv) { 2990 case ECONNRESET: 2991 /* 2992 * vdc_send initiates the reset on failure. 2993 * Since the transaction has already been put 2994 * on the local dring, it will automatically get 2995 * retried when the channel is reset. Given that, 2996 * it is ok to just return success even though the 2997 * send failed. 
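 *
 * (The retry happens via vdc_backup_local_dring() and
 * vdc_resubmit_backup_dring() once the connection has been
 * re-established.)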
2998 */ 2999 rv = 0; 3000 break; 3001 3002 case 0: /* EOK */ 3003 DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv); 3004 break; 3005 3006 default: 3007 goto cleanup_and_exit; 3008 } 3009 3010 vdcp->threads_pending--; 3011 return (rv); 3012 3013 cleanup_and_exit: 3014 DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv); 3015 return (ENXIO); 3016 } 3017 3018 /* 3019 * Function: 3020 * vdc_do_sync_op 3021 * 3022 * Description: 3023 * Wrapper around vdc_populate_descriptor that blocks until the 3024 * response to the message is available. 3025 * 3026 * Arguments: 3027 * vdcp - the soft state pointer 3028 * operation - operation we want vds to perform (VD_OP_XXX) 3029 * addr - address of data buf to be read/written. 3030 * nbytes - number of bytes to read/write 3031 * slice - the disk slice this request is for 3032 * offset - relative disk offset 3033 * cb_type - type of call - STRATEGY or SYNC 3034 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3035 * . mode for ioctl(9e) 3036 * . LP64 diskaddr_t (block I/O) 3037 * dir - direction of operation (READ/WRITE/BOTH) 3038 * rconflict - check for reservation conflict in case of failure 3039 * 3040 * rconflict should be set to B_TRUE by most callers. Callers invoking the 3041 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 3042 * result of a successful operation with vd_scsi_status(). 3043 * 3044 * Return Codes: 3045 * 0 3046 * EAGAIN 3047 * EFAULT 3048 * ENXIO 3049 * EIO 3050 */ 3051 static int 3052 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 3053 int slice, diskaddr_t offset, int cb_type, void *cb_arg, 3054 vio_desc_direction_t dir, boolean_t rconflict) 3055 { 3056 int status; 3057 vdc_io_t *vio; 3058 boolean_t check_resv_conflict = B_FALSE; 3059 3060 ASSERT(cb_type == CB_SYNC); 3061 3062 /* 3063 * Grab the lock; if blocked, wait until the server 3064 * response causes us to wake up again. 3065 */ 3066 mutex_enter(&vdcp->lock); 3067 vdcp->sync_op_cnt++; 3068 while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH) 3069 cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock); 3070 3071 if (vdcp->state == VDC_STATE_DETACH) { 3072 cv_broadcast(&vdcp->sync_blocked_cv); 3073 vdcp->sync_op_cnt--; 3074 mutex_exit(&vdcp->lock); 3075 return (ENXIO); 3076 } 3077 3078 /* now block any other thread entering after us */ 3079 vdcp->sync_op_blocked = B_TRUE; 3080 vdcp->sync_op_pending = B_TRUE; 3081 mutex_exit(&vdcp->lock); 3082 3083 status = vdc_send_request(vdcp, operation, addr, 3084 nbytes, slice, offset, cb_type, cb_arg, dir); 3085 3086 mutex_enter(&vdcp->lock); 3087 3088 if (status != 0) { 3089 vdcp->sync_op_pending = B_FALSE; 3090 } else { 3091 /* 3092 * block until our transaction completes; 3093 * anyone else waiting then gets to go next. 3094 */ 3095 while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH) 3096 cv_wait(&vdcp->sync_pending_cv, &vdcp->lock); 3097 3098 DMSG(vdcp, 2, ": operation returned %d\n", 3099 vdcp->sync_op_status); 3100 if (vdcp->state == VDC_STATE_DETACH) { 3101 vdcp->sync_op_pending = B_FALSE; 3102 status = ENXIO; 3103 } else { 3104 status = vdcp->sync_op_status; 3105 if (status != 0 && vdcp->failfast_interval != 0) { 3106 /* 3107 * Operation has failed and failfast is enabled. 3108 * We need to check if the failure is due to a 3109 * reservation conflict if this was requested.
3110 */ 3111 check_resv_conflict = rconflict; 3112 } 3113 3114 } 3115 } 3116 3117 vdcp->sync_op_status = 0; 3118 vdcp->sync_op_blocked = B_FALSE; 3119 vdcp->sync_op_cnt--; 3120 3121 /* signal the next waiting thread */ 3122 cv_signal(&vdcp->sync_blocked_cv); 3123 3124 /* 3125 * We have to check for reservation conflict after unblocking sync 3126 * operations because some sync operations will be used to do this 3127 * check. 3128 */ 3129 if (check_resv_conflict) { 3130 vio = vdc_failfast_io_queue(vdcp, NULL); 3131 while (vio->vio_qtime != 0) 3132 cv_wait(&vdcp->failfast_io_cv, &vdcp->lock); 3133 kmem_free(vio, sizeof (vdc_io_t)); 3134 } 3135 3136 mutex_exit(&vdcp->lock); 3137 3138 return (status); 3139 } 3140 3141 3142 /* 3143 * Function: 3144 * vdc_drain_response() 3145 * 3146 * Description: 3147 * When a guest is panicking, the completion of requests needs to be 3148 * handled differently because interrupts are disabled and vdc 3149 * will not get messages. We have to poll for the messages instead. 3150 * 3151 * Note: since we don't have a buf_t available we cannot implement 3152 * the io:::done DTrace probe in this specific case. 3153 * 3154 * Arguments: 3155 * vdc - soft state pointer for this instance of the device driver. 3156 * 3157 * Return Code: 3158 * 0 - Success 3159 */ 3160 static int 3161 vdc_drain_response(vdc_t *vdc) 3162 { 3163 int rv, idx, retries; 3164 size_t msglen; 3165 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3166 vio_dring_msg_t dmsg; 3167 3168 mutex_enter(&vdc->lock); 3169 3170 retries = 0; 3171 for (;;) { 3172 msglen = sizeof (dmsg); 3173 rv = ldc_read(vdc->ldc_handle, (caddr_t)&dmsg, &msglen); 3174 if (rv) { 3175 rv = EINVAL; 3176 break; 3177 } 3178 3179 /* 3180 * if there are no packets wait and check again 3181 */ 3182 if ((rv == 0) && (msglen == 0)) { 3183 if (retries++ > vdc_dump_retries) { 3184 rv = EAGAIN; 3185 break; 3186 } 3187 3188 drv_usecwait(vdc_usec_timeout_dump); 3189 continue; 3190 } 3191 3192 /* 3193 * Ignore all messages that are not ACKs/NACKs to 3194 * DRing requests. 3195 */ 3196 if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) || 3197 (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) { 3198 DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n", 3199 dmsg.tag.vio_msgtype, 3200 dmsg.tag.vio_subtype, 3201 dmsg.tag.vio_subtype_env); 3202 continue; 3203 } 3204 3205 /* 3206 * set the appropriate return value for the current request. 
3207 */ 3208 switch (dmsg.tag.vio_subtype) { 3209 case VIO_SUBTYPE_ACK: 3210 rv = 0; 3211 break; 3212 case VIO_SUBTYPE_NACK: 3213 rv = EAGAIN; 3214 break; 3215 default: 3216 continue; 3217 } 3218 3219 idx = dmsg.start_idx; 3220 if (idx >= vdc->dring_len) { 3221 DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n", 3222 vdc->instance, idx); 3223 continue; 3224 } 3225 ldep = &vdc->local_dring[idx]; 3226 if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { 3227 DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n", 3228 vdc->instance, idx, ldep->dep->hdr.dstate); 3229 continue; 3230 } 3231 3232 DMSG(vdc, 1, "[%d] Depopulating idx=%d state=%d\n", 3233 vdc->instance, idx, ldep->dep->hdr.dstate); 3234 3235 rv = vdc_depopulate_descriptor(vdc, idx); 3236 if (rv) { 3237 DMSG(vdc, 0, 3238 "[%d] Entry @ %d - depopulate failed ..\n", 3239 vdc->instance, idx); 3240 } 3241 3242 /* if this is the last descriptor - break out of loop */ 3243 if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) 3244 break; 3245 } 3246 3247 mutex_exit(&vdc->lock); 3248 DMSG(vdc, 0, "End idx=%d\n", idx); 3249 3250 return (rv); 3251 } 3252 3253 3254 /* 3255 * Function: 3256 * vdc_depopulate_descriptor() 3257 * 3258 * Description: 3259 * Free the DRing entry at 'idx', unbind any memory bound for the 3260 * request and return the status reported by the vDisk server. 3261 * 3262 * Arguments: 3263 * vdc - soft state pointer for this instance of the device driver. 3264 * idx - Index of the Descriptor Ring entry being modified 3265 * 3266 * Return Code: 3267 * 0 - Success 3268 */ 3269 static int 3270 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) 3271 { 3272 vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ 3273 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3274 int status = ENXIO; 3275 int rv = 0; 3276 3277 ASSERT(vdc != NULL); 3278 ASSERT(idx < vdc->dring_len); 3279 ldep = &vdc->local_dring[idx]; 3280 ASSERT(ldep != NULL); 3281 ASSERT(MUTEX_HELD(&vdc->lock)); 3282 3283 DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep); 3284 DMSG(vdc, 2, ": idx = %d\n", idx); 3285 3286 dep = ldep->dep; 3287 ASSERT(dep != NULL); 3288 ASSERT((dep->hdr.dstate == VIO_DESC_DONE) || 3289 (dep->payload.status == ECANCELED)); 3290 3291 VDC_MARK_DRING_ENTRY_FREE(vdc, idx); 3292 3293 ldep->is_free = B_TRUE; 3294 status = dep->payload.status; 3295 DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status); 3296 3297 /* 3298 * If no buffers were used to transfer information to the server when 3299 * populating the descriptor then no memory handles need to be unbound 3300 * and we can return now. 3301 */ 3302 if (ldep->nbytes == 0) { 3303 cv_signal(&vdc->dring_free_cv); 3304 return (status); 3305 } 3306 3307 /* 3308 * If the upper layer passed in a misaligned address we copied the 3309 * data into an aligned buffer before sending it to LDC - we now 3310 * copy it back to the original buffer. 3311 */ 3312 if (ldep->align_addr) { 3313 ASSERT(ldep->addr != NULL); 3314 3315 if (dep->payload.nbytes > 0) 3316 bcopy(ldep->align_addr, ldep->addr, 3317 dep->payload.nbytes); 3318 kmem_free(ldep->align_addr, 3319 sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8)); 3320 ldep->align_addr = NULL; 3321 } 3322 3323 rv = ldc_mem_unbind_handle(ldep->desc_mhdl); 3324 if (rv != 0) { 3325 DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)", 3326 vdc->instance, ldep->desc_mhdl, idx, rv); 3327 /* 3328 * The error returned by the vDisk server is more informative, 3329 * and thus has higher priority, but if it isn't set we ensure 3330 * that this function still returns an error.
3331 */ 3332 if (status == 0) 3333 status = EINVAL; 3334 } 3335 3336 cv_signal(&vdc->membind_cv); 3337 cv_signal(&vdc->dring_free_cv); 3338 3339 return (status); 3340 } 3341 3342 /* 3343 * Function: 3344 * vdc_populate_mem_hdl() 3345 * 3346 * Description: 3347 * Bind the buffer of a local dring entry to an LDC memory handle so 3348 * that vds can access the data, first copying any misaligned buffer 3349 * into an aligned bounce buffer. 3350 * 3351 * Arguments: 3352 * vdcp - soft state pointer for this instance of the device driver. 3353 * ldep - local dring entry describing the request; its 'addr' is the 3354 * virtual address being mapped in, 'nbytes' the number of bytes 3355 * in that buffer, 'dir' the direction of the transfer (which 3356 * determines the mapping permissions) and 'operation' the vDisk 3357 * operation being performed (VD_OP_xxx) 3358 * 3359 * Return Code: 3360 * 0 - Success, or EAGAIN if the buffer could not be bound 3361 */ 3362 static int 3363 vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep) 3364 { 3365 vd_dring_entry_t *dep = NULL; 3366 ldc_mem_handle_t mhdl; 3367 caddr_t vaddr; 3368 size_t nbytes; 3369 uint8_t perm = LDC_MEM_RW; 3370 uint8_t maptype; 3371 int rv = 0; 3372 int i; 3373 3374 ASSERT(vdcp != NULL); 3375 3376 dep = ldep->dep; 3377 mhdl = ldep->desc_mhdl; 3378 3379 switch (ldep->dir) { 3380 case VIO_read_dir: 3381 perm = LDC_MEM_W; 3382 break; 3383 3384 case VIO_write_dir: 3385 perm = LDC_MEM_R; 3386 break; 3387 3388 case VIO_both_dir: 3389 perm = LDC_MEM_RW; 3390 break; 3391 3392 default: 3393 ASSERT(0); /* catch bad programming in vdc */ 3394 } 3395 3396 /* 3397 * LDC expects any addresses passed in to be 8-byte aligned. We need 3398 * to copy the contents of any misaligned buffers to a newly allocated 3399 * buffer and bind it instead (and copy the contents back to the 3400 * original buffer passed in when depopulating the descriptor) 3401 */ 3402 vaddr = ldep->addr; 3403 nbytes = ldep->nbytes; 3404 if (((uint64_t)vaddr & 0x7) != 0) { 3405 ASSERT(ldep->align_addr == NULL); 3406 ldep->align_addr = 3407 kmem_alloc(sizeof (caddr_t) * 3408 P2ROUNDUP(nbytes, 8), KM_SLEEP); 3409 DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating " 3410 "(buf=%p nb=%ld op=%d)\n", 3411 vdcp->instance, (void *)vaddr, (void *)ldep->align_addr, 3412 nbytes, ldep->operation); 3413 if (perm != LDC_MEM_W) 3414 bcopy(vaddr, ldep->align_addr, nbytes); 3415 vaddr = ldep->align_addr; 3416 } 3417 3418 maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP; 3419 rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8), 3420 maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies); 3421 DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n", 3422 vdcp->instance, dep->payload.ncookies); 3423 if (rv != 0) { 3424 DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle " 3425 "(mhdl=%p, buf=%p, err=%d)\n", 3426 vdcp->instance, (void *)mhdl, (void *)vaddr, rv); 3427 if (ldep->align_addr) { 3428 kmem_free(ldep->align_addr, 3429 sizeof (caddr_t) * P2ROUNDUP(nbytes, 8)); 3430 ldep->align_addr = NULL; 3431 } 3432 return (EAGAIN); 3433 } 3434 3435 /* 3436 * Get the other cookies (if any).
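 *
 * ldc_mem_bind_handle() above returned the first cookie in
 * payload.cookie[0]; the remaining ncookies - 1 cookies are
 * fetched one at a time with ldc_mem_nextcookie().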
3437 */ 3438 for (i = 1; i < dep->payload.ncookies; i++) { 3439 rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]); 3440 if (rv != 0) { 3441 (void) ldc_mem_unbind_handle(mhdl); 3442 DMSG(vdcp, 0, "?[%d] Failed to get next cookie " 3443 "(mhdl=%lx cnum=%d), err=%d", 3444 vdcp->instance, mhdl, i, rv); 3445 if (ldep->align_addr) { 3446 kmem_free(ldep->align_addr, 3447 sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8)); 3448 ldep->align_addr = NULL; 3449 } 3450 return (EAGAIN); 3451 } 3452 } 3453 3454 return (rv); 3455 } 3456 3457 /* 3458 * Interrupt handlers for messages from LDC 3459 */ 3460 3461 /* 3462 * Function: 3463 * vdc_handle_cb() 3464 * 3465 * Description: 3466 * LDC event callback: on channel-up it kicks off the handshake, on 3467 * a read event it wakes the reader thread, and on reset/down it 3468 * flags the connection as reset. 3469 * 3470 * Arguments: 3471 * event - Type of event (LDC_EVT_xxx) that triggered the callback 3472 * arg - soft state pointer for this instance of the device driver. 3473 * 3474 * Return Code: 3475 * 0 - Success 3476 */ 3477 static uint_t 3478 vdc_handle_cb(uint64_t event, caddr_t arg) 3479 { 3480 ldc_status_t ldc_state; 3481 int rv = 0; 3482 3483 vdc_t *vdc = (vdc_t *)(void *)arg; 3484 3485 ASSERT(vdc != NULL); 3486 3487 DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num); 3488 3489 /* 3490 * Depending on the type of event that triggered this callback, 3491 * we modify the handshake state or read the data. 3492 * 3493 * NOTE: not done as a switch() as event could be triggered by 3494 * a state change and a read request. Also the ordering of the 3495 * check for the event types is deliberate. 3496 */ 3497 if (event & LDC_EVT_UP) { 3498 DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance); 3499 3500 mutex_enter(&vdc->lock); 3501 3502 /* get LDC state */ 3503 rv = ldc_status(vdc->ldc_handle, &ldc_state); 3504 if (rv != 0) { 3505 DMSG(vdc, 0, "[%d] Couldn't get LDC status %d", 3506 vdc->instance, rv); 3507 mutex_exit(&vdc->lock); return (LDC_SUCCESS); 3508 } 3509 if (vdc->ldc_state != LDC_UP && ldc_state == LDC_UP) { 3510 /* 3511 * Reset the transaction sequence numbers when 3512 * LDC comes up. We then kick off the handshake 3513 * negotiation with the vDisk server. 3514 */ 3515 vdc->seq_num = 1; 3516 vdc->seq_num_reply = 0; 3517 vdc->ldc_state = ldc_state; 3518 cv_signal(&vdc->initwait_cv); 3519 } 3520 3521 mutex_exit(&vdc->lock); 3522 } 3523 3524 if (event & LDC_EVT_READ) { 3525 DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance); 3526 mutex_enter(&vdc->read_lock); 3527 cv_signal(&vdc->read_cv); 3528 vdc->read_state = VDC_READ_PENDING; 3529 mutex_exit(&vdc->read_lock); 3530 3531 /* that's all we have to do - no need to handle DOWN/RESET */ 3532 return (LDC_SUCCESS); 3533 } 3534 3535 if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) { 3536 3537 DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance); 3538 3539 mutex_enter(&vdc->lock); 3540 /* 3541 * Need to wake up any readers so they will 3542 * detect that a reset has occurred.
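 *
 * For reference, the read_state transitions are (sketch):
 *
 *	VDC_READ_IDLE    --(vdc_recv)--------> VDC_READ_WAITING
 *	VDC_READ_WAITING --(LDC_EVT_READ)----> VDC_READ_PENDING
 *	any state        --(reset or down)---> VDC_READ_RESET,
 *	after which blocked readers fail with ECONNRESET.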
3534 */ 3535 mutex_enter(&vdc->read_lock); 3536 if ((vdc->read_state == VDC_READ_WAITING) || 3537 (vdc->read_state == VDC_READ_RESET)) 3538 cv_signal(&vdc->read_cv); 3539 vdc->read_state = VDC_READ_RESET; 3540 mutex_exit(&vdc->read_lock); 3541 3542 /* wake up any threads waiting for connection to come up */ 3543 if (vdc->state == VDC_STATE_INIT_WAITING) { 3544 vdc->state = VDC_STATE_RESETTING; 3545 cv_signal(&vdc->initwait_cv); 3546 } 3547 3548 mutex_exit(&vdc->lock); 3549 } 3550 3551 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) 3552 DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received", 3553 vdc->instance, event); 3554 3555 return (LDC_SUCCESS); 3556 } 3557 3558 /* 3559 * Function: 3560 * vdc_wait_for_response() 3561 * 3562 * Description: 3563 * Block waiting for a response from the server. If there is 3564 * no data, the thread blocks on the read_cv, which is signalled 3565 * by the callback when an LDC_EVT_READ event occurs. 3566 * 3567 * Arguments: 3568 * vdcp - soft state pointer for this instance of the device driver. 3569 * msgp - address where the received message is returned. 3570 * 3571 * Return Code: 3572 * 0 - Success 3573 */ 3574 static int 3575 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp) 3576 { 3577 size_t nbytes = sizeof (*msgp); 3578 int status; 3579 3580 ASSERT(vdcp != NULL); 3581 3582 DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance); 3583 3584 status = vdc_recv(vdcp, msgp, &nbytes); 3585 DMSG(vdcp, 3, "vdc_recv() done.. status=0x%x size=0x%x\n", 3586 status, (int)nbytes); 3587 if (status) { 3588 DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n", 3589 vdcp->instance, status); 3590 return (status); 3591 } 3592 3593 if (nbytes < sizeof (vio_msg_tag_t)) { 3594 DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n", 3595 vdcp->instance, sizeof (vio_msg_tag_t), nbytes); 3596 return (ENOMSG); 3597 } 3598 3599 DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance, 3600 msgp->tag.vio_msgtype, 3601 msgp->tag.vio_subtype, 3602 msgp->tag.vio_subtype_env); 3603 3604 /* 3605 * Verify the Session ID of the message 3606 * 3607 * Every message after the Version has been negotiated should 3608 * have the correct session ID set. 3609 */ 3610 if ((msgp->tag.vio_sid != vdcp->session_id) && 3611 (msgp->tag.vio_subtype_env != VIO_VER_INFO)) { 3612 DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, " 3613 "expected 0x%lx [seq num %lx @ %d]", 3614 vdcp->instance, msgp->tag.vio_sid, 3615 vdcp->session_id, 3616 ((vio_dring_msg_t *)msgp)->seq_num, 3617 ((vio_dring_msg_t *)msgp)->start_idx); 3618 return (ENOMSG); 3619 } 3620 return (0); 3621 } 3622 3623 3624 /* 3625 * Function: 3626 * vdc_resubmit_backup_dring() 3627 * 3628 * Description: 3629 * Resubmit each outstanding descriptor in the backed-up dring 3630 * to the vDisk server. The dring was backed up during 3631 * connection reset. 3632 * 3633 * Arguments: 3634 * vdcp - soft state pointer for this instance of the device driver.
3634 * 3635 * Return Code: 3636 * 0 - Success 3637 */ 3638 static int 3639 vdc_resubmit_backup_dring(vdc_t *vdcp) 3640 { 3641 int count; 3642 int b_idx; 3643 int rv; 3644 int dring_size; 3645 int status; 3646 vio_msg_t vio_msg; 3647 vdc_local_desc_t *curr_ldep; 3648 3649 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3650 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3651 3652 if (vdcp->local_dring_backup == NULL) { 3653 /* the pending requests have already been processed */ 3654 return (0); 3655 } 3656 3657 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3658 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3659 3660 /* 3661 * Walk the backup copy of the local descriptor ring and 3662 * resubmit all the outstanding transactions. 3663 */ 3664 b_idx = vdcp->local_dring_backup_tail; 3665 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3666 3667 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3668 3669 /* only resubmit outstanding transactions */ 3670 if (!curr_ldep->is_free) { 3671 3672 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3673 mutex_enter(&vdcp->lock); 3674 rv = vdc_populate_descriptor(vdcp, curr_ldep->operation, 3675 curr_ldep->addr, curr_ldep->nbytes, 3676 curr_ldep->slice, curr_ldep->offset, 3677 curr_ldep->cb_type, curr_ldep->cb_arg, 3678 curr_ldep->dir); 3679 mutex_exit(&vdcp->lock); 3680 if (rv) { 3681 DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", 3682 vdcp->instance, b_idx); 3683 return (rv); 3684 } 3685 3686 /* Wait for the response message. */ 3687 DMSG(vdcp, 1, "waiting for response to idx=%x\n", 3688 b_idx); 3689 status = vdc_wait_for_response(vdcp, &vio_msg); 3690 if (status) { 3691 DMSG(vdcp, 1, "[%d] wait_for_response " 3692 "returned err=%d\n", vdcp->instance, 3693 status); 3694 return (status); 3695 } 3696 3697 DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx); 3698 status = vdc_process_data_msg(vdcp, &vio_msg); 3699 if (status) { 3700 DMSG(vdcp, 1, "[%d] process_data_msg " 3701 "returned err=%d\n", vdcp->instance, 3702 status); 3703 return (status); 3704 } 3705 } 3706 3707 /* get the next element to submit */ 3708 if (++b_idx >= vdcp->local_dring_backup_len) 3709 b_idx = 0; 3710 } 3711 3712 /* all done - now clean up pending dring copy */ 3713 dring_size = vdcp->local_dring_backup_len * 3714 sizeof (vdcp->local_dring_backup[0]); 3715 3716 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3717 3718 vdcp->local_dring_backup = NULL; 3719 3720 return (0); 3721 } 3722 3723 /* 3724 * Function: 3725 * vdc_cancel_backup_ring 3726 * 3727 * Description: 3728 * Cancel each outstanding descriptor in the backed-up dring. 3729 * The dring was backed up during connection reset. 3730 * 3731 * Arguments: 3732 * vdcp - soft state pointer for this instance of the device driver.
3733 * 3734 * Return Code: 3735 * None 3736 */ 3737 void 3738 vdc_cancel_backup_ring(vdc_t *vdcp) 3739 { 3740 vdc_local_desc_t *ldep; 3741 struct buf *bufp; 3742 int count; 3743 int b_idx; 3744 int dring_size; 3745 3746 ASSERT(MUTEX_HELD(&vdcp->lock)); 3747 ASSERT(vdcp->state == VDC_STATE_INIT || 3748 vdcp->state == VDC_STATE_INIT_WAITING || 3749 vdcp->state == VDC_STATE_NEGOTIATE || 3750 vdcp->state == VDC_STATE_RESETTING); 3751 3752 if (vdcp->local_dring_backup == NULL) { 3753 /* the pending requests have already been processed */ 3754 return; 3755 } 3756 3757 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n", 3758 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3759 3760 /* 3761 * Walk the backup copy of the local descriptor ring and 3762 * cancel all the outstanding transactions. 3763 */ 3764 b_idx = vdcp->local_dring_backup_tail; 3765 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3766 3767 ldep = &(vdcp->local_dring_backup[b_idx]); 3768 3769 /* only cancel outstanding transactions */ 3770 if (!ldep->is_free) { 3771 3772 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx); 3773 3774 /* 3775 * All requests have already been cleared from the 3776 * local descriptor ring and the LDC channel has been 3777 * reset so we will never get any reply for these 3778 * requests. Now we just have to notify threads waiting 3779 * for replies that the request has failed. 3780 */ 3781 switch (ldep->cb_type) { 3782 case CB_SYNC: 3783 ASSERT(vdcp->sync_op_pending); 3784 vdcp->sync_op_status = EIO; 3785 vdcp->sync_op_pending = B_FALSE; 3786 cv_signal(&vdcp->sync_pending_cv); 3787 break; 3788 3789 case CB_STRATEGY: 3790 bufp = ldep->cb_arg; 3791 ASSERT(bufp != NULL); 3792 bufp->b_resid = bufp->b_bcount; 3793 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 3794 VD_KSTAT_RUNQ_EXIT(vdcp->io_stats); 3795 DTRACE_IO1(done, buf_t *, bufp); 3796 bioerror(bufp, EIO); 3797 biodone(bufp); 3798 break; 3799 3800 default: 3801 ASSERT(0); 3802 } 3803 3804 } 3805 3806 /* get the next element to cancel */ 3807 if (++b_idx >= vdcp->local_dring_backup_len) 3808 b_idx = 0; 3809 } 3810 3811 /* all done - now clean up pending dring copy */ 3812 dring_size = vdcp->local_dring_backup_len * 3813 sizeof (vdcp->local_dring_backup[0]); 3814 3815 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3816 3817 vdcp->local_dring_backup = NULL; 3818 3819 DTRACE_PROBE2(processed, int, count, vdc_t *, vdcp); 3820 } 3821 3822 /* 3823 * Function: 3824 * vdc_connection_timeout 3825 * 3826 * Description: 3827 * This function is invoked if the timeout set to establish the connection 3828 * with vds expires. This will happen if we spend too much time in the 3829 * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. In that case we 3830 * cancel any pending requests and mark them as failed. 3831 * 3832 * If the timeout does not expire, it will be cancelled when we reach the 3833 * VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can 3834 * be invoked while we are in the VDC_STATE_HANDLE_PENDING or 3835 * VDC_STATE_RESETTING state, in which case we do nothing because the 3836 * timeout is being cancelled. 3837 * 3838 * Arguments: 3839 * arg - argument of the timeout function; actually a soft state 3840 * pointer for the instance of the device driver.
3841 * 3842 * Return Code: 3843 * None 3844 */ 3845 void 3846 vdc_connection_timeout(void *arg) 3847 { 3848 vdc_t *vdcp = (vdc_t *)arg; 3849 3850 mutex_enter(&vdcp->lock); 3851 3852 if (vdcp->state == VDC_STATE_HANDLE_PENDING || 3853 vdcp->state == VDC_STATE_DETACH) { 3854 /* 3855 * The connection has just been re-established or 3856 * we are detaching. 3857 */ 3858 vdcp->ctimeout_reached = B_FALSE; 3859 mutex_exit(&vdcp->lock); 3860 return; 3861 } 3862 3863 vdcp->ctimeout_reached = B_TRUE; 3864 3865 /* notify requests waiting to be sent */ 3866 cv_broadcast(&vdcp->running_cv); 3867 3868 /* cancel requests waiting for a result */ 3869 vdc_cancel_backup_ring(vdcp); 3870 3871 mutex_exit(&vdcp->lock); 3872 3873 cmn_err(CE_NOTE, "[%d] connection to service domain timed out", 3874 vdcp->instance); 3875 } 3876 3877 /* 3878 * Function: 3879 * vdc_backup_local_dring() 3880 * 3881 * Description: 3882 * Backup the current dring in the event of a reset. The Dring 3883 * transactions will be resubmitted to the server when the 3884 * connection is restored. 3885 * 3886 * Arguments: 3887 * vdcp - soft state pointer for this instance of the device driver. 3888 * 3889 * Return Code: 3890 * NONE 3891 */ 3892 static void 3893 vdc_backup_local_dring(vdc_t *vdcp) 3894 { 3895 int dring_size; 3896 3897 ASSERT(MUTEX_HELD(&vdcp->lock)); 3898 ASSERT(vdcp->state == VDC_STATE_RESETTING); 3899 3900 /* 3901 * If the backup dring is still around, it means 3902 * that the last restore did not complete. However, 3903 * since we never got back into the running state, 3904 * the backup copy we have is still valid. 3905 */ 3906 if (vdcp->local_dring_backup != NULL) { 3907 DMSG(vdcp, 1, "reusing local descriptor ring backup " 3908 "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len, 3909 vdcp->local_dring_backup_tail); 3910 return; 3911 } 3912 3913 /* 3914 * The backup dring can be NULL and the local dring may not be 3915 * initialized. This can happen if a reset occurred while 3916 * establishing a new connection, after the connection had timed 3917 * out. In that case the backup dring is NULL because the pending 3918 * requests have been cancelled, and the reset occurred before 3919 * the local dring was initialized. 3920 */ 3921 if (!(vdcp->initialized & VDC_DRING_LOCAL)) 3922 return; 3923 3924 DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, " 3925 "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx); 3926 3927 dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]); 3928 3929 vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP); 3930 bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size); 3931 3932 vdcp->local_dring_backup_tail = vdcp->dring_curr_idx; 3933 vdcp->local_dring_backup_len = vdcp->dring_len; 3934 } 3935 3936 /* -------------------------------------------------------------------------- */ 3937 3938 /* 3939 * The following functions process the incoming messages from vds 3940 */ 3941 3942 /* 3943 * Function: 3944 * vdc_process_msg_thread() 3945 * 3946 * Description: 3947 * 3948 * Main VDC message processing thread. Each vDisk instance 3949 * runs its own copy of this thread. This thread triggers 3950 * all the handshakes and data exchange with the server. It 3951 * also handles all channel resets. 3952 * 3953 * Arguments: 3954 * vdc - soft state pointer for this instance of the device driver.
3955 * 3956 * Return Code: 3957 * None 3958 */ 3959 static void 3960 vdc_process_msg_thread(vdc_t *vdcp) 3961 { 3962 int status; 3963 int ctimeout; 3964 timeout_id_t tmid = 0; 3965 3966 mutex_enter(&vdcp->lock); 3967 3968 for (;;) { 3969 3970 #define Q(_s) (vdcp->state == _s) ? #_s : 3971 DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state, 3972 Q(VDC_STATE_INIT) 3973 Q(VDC_STATE_INIT_WAITING) 3974 Q(VDC_STATE_NEGOTIATE) 3975 Q(VDC_STATE_HANDLE_PENDING) 3976 Q(VDC_STATE_RUNNING) 3977 Q(VDC_STATE_RESETTING) 3978 Q(VDC_STATE_DETACH) 3979 "UNKNOWN"); 3980 3981 switch (vdcp->state) { 3982 case VDC_STATE_INIT: 3983 3984 /* 3985 * If requested, start a timeout to check if the 3986 * connection with vds is established in the 3987 * specified delay. If the timeout expires, we 3988 * will cancel any pending request. 3989 * 3990 * If a reset occurred while establishing 3991 * the connection, we already have a timeout armed 3992 * and in that case we don't need to arm a new one. 3993 */ 3994 ctimeout = (vdc_timeout != 0)? 3995 vdc_timeout : vdcp->ctimeout; 3996 3997 if (ctimeout != 0 && tmid == 0) { 3998 tmid = timeout(vdc_connection_timeout, vdcp, 3999 ctimeout * drv_usectohz(1000000)); 4000 } 4001 4002 /* Check if we have been re-initializing repeatedly */ 4003 if (vdcp->hshake_cnt++ > vdc_hshake_retries && 4004 vdcp->lifecycle != VDC_LC_ONLINE) { 4005 cmn_err(CE_NOTE, "[%d] disk access failed.\n", 4006 vdcp->instance); 4007 vdcp->state = VDC_STATE_DETACH; 4008 break; 4009 } 4010 4011 /* Bring up connection with vds via LDC */ 4012 status = vdc_start_ldc_connection(vdcp); 4013 if (status == EINVAL) { 4014 DMSG(vdcp, 0, "[%d] Could not start LDC", 4015 vdcp->instance); 4016 vdcp->state = VDC_STATE_DETACH; 4017 } else { 4018 vdcp->state = VDC_STATE_INIT_WAITING; 4019 } 4020 break; 4021 4022 case VDC_STATE_INIT_WAITING: 4023 4024 /* 4025 * Let the callback event move us on 4026 * when the channel is open to the server 4027 */ 4028 while (vdcp->ldc_state != LDC_UP) { 4029 cv_wait(&vdcp->initwait_cv, &vdcp->lock); 4030 if (vdcp->state != VDC_STATE_INIT_WAITING) { 4031 DMSG(vdcp, 0, 4032 "state moved to %d out from under us...\n", 4033 vdcp->state); 4034 4035 break; 4036 } 4037 } 4038 if (vdcp->state == VDC_STATE_INIT_WAITING && 4039 vdcp->ldc_state == LDC_UP) { 4040 vdcp->state = VDC_STATE_NEGOTIATE; 4041 } 4042 break; 4043 4044 case VDC_STATE_NEGOTIATE: 4045 switch (status = vdc_ver_negotiation(vdcp)) { 4046 case 0: 4047 break; 4048 default: 4049 DMSG(vdcp, 0, "ver negotiate failed (%d)..\n", 4050 status); 4051 goto reset; 4052 } 4053 4054 switch (status = vdc_attr_negotiation(vdcp)) { 4055 case 0: 4056 break; 4057 default: 4058 DMSG(vdcp, 0, "attr negotiate failed (%d)..\n", 4059 status); 4060 goto reset; 4061 } 4062 4063 switch (status = vdc_dring_negotiation(vdcp)) { 4064 case 0: 4065 break; 4066 default: 4067 DMSG(vdcp, 0, "dring negotiate failed (%d)..\n", 4068 status); 4069 goto reset; 4070 } 4071 4072 switch (status = vdc_rdx_exchange(vdcp)) { 4073 case 0: 4074 vdcp->state = VDC_STATE_HANDLE_PENDING; 4075 goto done; 4076 default: 4077 DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n", 4078 status); 4079 goto reset; 4080 } 4081 reset: 4082 DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n", 4083 status); 4084 vdcp->state = VDC_STATE_RESETTING; 4085 vdcp->self_reset = B_TRUE; 4086 done: 4087 DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", 4088 vdcp->state); 4089 break; 4090 4091 case VDC_STATE_HANDLE_PENDING: 4092 4093 if (vdcp->ctimeout_reached) { 4094 /* 4095 * The connection timeout had been reached so
4096 				 * pending requests have been cancelled. Now
4097 				 * that the connection is back we can reset
4098 				 * the timeout.
4099 				 */
4100 				ASSERT(vdcp->local_dring_backup == NULL);
4101 				ASSERT(tmid != 0);
4102 				tmid = 0;
4103 				vdcp->ctimeout_reached = B_FALSE;
4104 				vdcp->state = VDC_STATE_RUNNING;
4105 				DMSG(vdcp, 0, "[%d] connection to service "
4106 				    "domain is up", vdcp->instance);
4107 				break;
4108 			}
4109 
4110 			mutex_exit(&vdcp->lock);
4111 			if (tmid != 0) {
4112 				(void) untimeout(tmid);
4113 				tmid = 0;
4114 			}
4115 			status = vdc_resubmit_backup_dring(vdcp);
4116 			mutex_enter(&vdcp->lock);
4117 
4118 			if (status)
4119 				vdcp->state = VDC_STATE_RESETTING;
4120 			else
4121 				vdcp->state = VDC_STATE_RUNNING;
4122 
4123 			break;
4124 
4125 		/* enter running state */
4126 		case VDC_STATE_RUNNING:
4127 			/*
4128 			 * Signal anyone waiting for the connection
4129 			 * to come on line.
4130 			 */
4131 			vdcp->hshake_cnt = 0;
4132 			cv_broadcast(&vdcp->running_cv);
4133 
4134 			/* failfast has to be checked after reset */
4135 			cv_signal(&vdcp->failfast_cv);
4136 
4137 			/* ownership is lost during reset */
4138 			if (vdcp->ownership & VDC_OWNERSHIP_WANTED)
4139 				vdcp->ownership |= VDC_OWNERSHIP_RESET;
4140 			cv_signal(&vdcp->ownership_cv);
4141 
4142 			mutex_exit(&vdcp->lock);
4143 
4144 			for (;;) {
4145 				vio_msg_t msg;
4146 				status = vdc_wait_for_response(vdcp, &msg);
4147 				if (status) break;
4148 
4149 				DMSG(vdcp, 1, "[%d] new pkt(s) available\n",
4150 				    vdcp->instance);
4151 				status = vdc_process_data_msg(vdcp, &msg);
4152 				if (status) {
4153 					DMSG(vdcp, 1, "[%d] process_data_msg "
4154 					    "returned err=%d\n", vdcp->instance,
4155 					    status);
4156 					break;
4157 				}
4158 
4159 			}
4160 
4161 			mutex_enter(&vdcp->lock);
4162 
4163 			vdcp->state = VDC_STATE_RESETTING;
4164 			vdcp->self_reset = B_TRUE;
4165 			break;
4166 
4167 		case VDC_STATE_RESETTING:
4168 			/*
4169 			 * When we reach this state, we either come from the
4170 			 * VDC_STATE_RUNNING state and we can have pending
4171 			 * requests but no timeout is armed; or we come from
4172 			 * the VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE or
4173 			 * VDC_STATE_HANDLE_PENDING state and there is no pending
4174 			 * request or pending requests have already been copied
4175 			 * into the backup dring. So we can safely keep the
4176 			 * connection timeout armed while we are in this state.
4177 			 */
4178 
4179 			DMSG(vdcp, 0, "Initiating channel reset "
4180 			    "(pending = %d)\n", (int)vdcp->threads_pending);
4181 
4182 			if (vdcp->self_reset) {
4183 				DMSG(vdcp, 0,
4184 				    "[%d] calling stop_ldc_connection.\n",
4185 				    vdcp->instance);
4186 				status = vdc_stop_ldc_connection(vdcp);
4187 				vdcp->self_reset = B_FALSE;
4188 			}
4189 
4190 			/*
4191 			 * Wait for all threads currently waiting
4192 			 * for a free dring entry to use.
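			 * (Descriptive note: broadcasting membind_cv and
			 * dring_free_cv below wakes those threads up; the
			 * lock is then dropped for vdc_hz_min_ldc_delay
			 * ticks on each pass so the waiters can actually
			 * run before threads_pending is re-checked.)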
4193 */ 4194 while (vdcp->threads_pending) { 4195 cv_broadcast(&vdcp->membind_cv); 4196 cv_broadcast(&vdcp->dring_free_cv); 4197 mutex_exit(&vdcp->lock); 4198 /* give the waiters enough time to wake up */ 4199 delay(vdc_hz_min_ldc_delay); 4200 mutex_enter(&vdcp->lock); 4201 } 4202 4203 ASSERT(vdcp->threads_pending == 0); 4204 4205 /* Sanity check that no thread is receiving */ 4206 ASSERT(vdcp->read_state != VDC_READ_WAITING); 4207 4208 vdcp->read_state = VDC_READ_IDLE; 4209 4210 vdc_backup_local_dring(vdcp); 4211 4212 /* cleanup the old d-ring */ 4213 vdc_destroy_descriptor_ring(vdcp); 4214 4215 /* go and start again */ 4216 vdcp->state = VDC_STATE_INIT; 4217 4218 break; 4219 4220 case VDC_STATE_DETACH: 4221 DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n", 4222 vdcp->instance); 4223 4224 /* cancel any pending timeout */ 4225 mutex_exit(&vdcp->lock); 4226 if (tmid != 0) { 4227 (void) untimeout(tmid); 4228 tmid = 0; 4229 } 4230 mutex_enter(&vdcp->lock); 4231 4232 /* 4233 * Signal anyone waiting for connection 4234 * to come online 4235 */ 4236 cv_broadcast(&vdcp->running_cv); 4237 4238 while (vdcp->sync_op_pending) { 4239 cv_signal(&vdcp->sync_pending_cv); 4240 cv_signal(&vdcp->sync_blocked_cv); 4241 mutex_exit(&vdcp->lock); 4242 /* give the waiters enough time to wake up */ 4243 delay(vdc_hz_min_ldc_delay); 4244 mutex_enter(&vdcp->lock); 4245 } 4246 4247 mutex_exit(&vdcp->lock); 4248 4249 DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n", 4250 vdcp->instance); 4251 thread_exit(); 4252 break; 4253 } 4254 } 4255 } 4256 4257 4258 /* 4259 * Function: 4260 * vdc_process_data_msg() 4261 * 4262 * Description: 4263 * This function is called by the message processing thread each time 4264 * a message with a msgtype of VIO_TYPE_DATA is received. It will either 4265 * be an ACK or NACK from vds[1] which vdc handles as follows. 4266 * ACK - wake up the waiting thread 4267 * NACK - resend any messages necessary 4268 * 4269 * [1] Although the message format allows it, vds should not send a 4270 * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for 4271 * some bizarre reason it does, vdc will reset the connection. 4272 * 4273 * Arguments: 4274 * vdc - soft state pointer for this instance of the device driver. 4275 * msg - the LDC message sent by vds 4276 * 4277 * Return Code: 4278 * 0 - Success. 4279 * > 0 - error value returned by LDC 4280 */ 4281 static int 4282 vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg) 4283 { 4284 int status = 0; 4285 vio_dring_msg_t *dring_msg; 4286 vdc_local_desc_t *ldep = NULL; 4287 int start, end; 4288 int idx; 4289 4290 dring_msg = (vio_dring_msg_t *)msg; 4291 4292 ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA); 4293 ASSERT(vdcp != NULL); 4294 4295 mutex_enter(&vdcp->lock); 4296 4297 /* 4298 * Check to see if the message has bogus data 4299 */ 4300 idx = start = dring_msg->start_idx; 4301 end = dring_msg->end_idx; 4302 if ((start >= vdcp->dring_len) || 4303 (end >= vdcp->dring_len) || (end < -1)) { 4304 DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n", 4305 vdcp->instance, start, end); 4306 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4307 mutex_exit(&vdcp->lock); 4308 return (EINVAL); 4309 } 4310 4311 /* 4312 * Verify that the sequence number is what vdc expects. 
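	 * As implemented by vdc_verify_seq_num(), a reply is only
	 * acceptable if its sequence number is greater than the last
	 * reply processed and no greater than the last request generated
	 * by vdc; for example, if vdc has sent requests up to seq_num 7
	 * and the last processed reply was 4, only replies 5 through 7
	 * are valid.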
4313 	 */
4314 	switch (vdc_verify_seq_num(vdcp, dring_msg)) {
4315 	case VDC_SEQ_NUM_TODO:
4316 		break;	/* keep processing this message */
4317 	case VDC_SEQ_NUM_SKIP:
4318 		mutex_exit(&vdcp->lock);
4319 		return (0);
4320 	case VDC_SEQ_NUM_INVALID:
4321 		DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance);
4322 		VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4323 		mutex_exit(&vdcp->lock);
4324 		return (ENXIO);
4325 	}
4326 
4327 	if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) {
4328 		DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance);
4329 		VDC_DUMP_DRING_MSG(dring_msg);
4330 		VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4331 		mutex_exit(&vdcp->lock);
4332 		return (EIO);
4333 
4334 	} else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) {
4335 		VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs);
4336 		mutex_exit(&vdcp->lock);
4337 		return (EPROTO);
4338 	}
4339 
4340 	DMSG(vdcp, 1, ": start %d end %d\n", start, end);
4341 	ASSERT(start == end);
4342 
4343 	ldep = &vdcp->local_dring[idx];
4344 
4345 	DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n",
4346 	    ldep->dep->hdr.dstate, ldep->cb_type);
4347 
4348 	if (ldep->dep->hdr.dstate == VIO_DESC_DONE) {
4349 		struct buf *bufp;
4350 
4351 		switch (ldep->cb_type) {
4352 		case CB_SYNC:
4353 			ASSERT(vdcp->sync_op_pending);
4354 
4355 			status = vdc_depopulate_descriptor(vdcp, idx);
4356 			vdcp->sync_op_status = status;
4357 			vdcp->sync_op_pending = B_FALSE;
4358 			cv_signal(&vdcp->sync_pending_cv);
4359 			break;
4360 
4361 		case CB_STRATEGY:
4362 			bufp = ldep->cb_arg;
4363 			ASSERT(bufp != NULL);
4364 			bufp->b_resid =
4365 			    bufp->b_bcount - ldep->dep->payload.nbytes;
4366 			status = ldep->dep->payload.status; /* Future:ntoh */
4367 			if (status != 0) {
4368 				DMSG(vdcp, 1, "strategy status=%d\n", status);
4369 				VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4370 				bioerror(bufp, status);
4371 			}
4372 
4373 			(void) vdc_depopulate_descriptor(vdcp, idx);
4374 
4375 			DMSG(vdcp, 1,
4376 			    "strategy complete req=%ld bytes resp=%ld bytes\n",
4377 			    bufp->b_bcount, ldep->dep->payload.nbytes);
4378 
4379 			if (status != 0 && vdcp->failfast_interval != 0) {
4380 				/*
4381 				 * The I/O has failed and failfast is enabled.
4382 				 * We need the failfast thread to check if the
4383 				 * failure is due to a reservation conflict.
4384 				 */
4385 				(void) vdc_failfast_io_queue(vdcp, bufp);
4386 			} else {
4387 				if (status == 0) {
4388 					int op = (bufp->b_flags & B_READ) ?
4389 					    VD_OP_BREAD : VD_OP_BWRITE;
4390 					VD_UPDATE_IO_STATS(vdcp, op,
4391 					    ldep->dep->payload.nbytes);
4392 				}
4393 				VD_KSTAT_RUNQ_EXIT(vdcp->io_stats);
4394 				DTRACE_IO1(done, buf_t *, bufp);
4395 				biodone(bufp);
4396 			}
4397 			break;
4398 
4399 		default:
4400 			ASSERT(0);
4401 		}
4402 	}
4403 
4404 	/* let the arrival signal propagate */
4405 	mutex_exit(&vdcp->lock);
4406 
4407 	/* probe gives the count of how many entries were processed */
4408 	DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp);
4409 
4410 	return (0);
4411 }
4412 
4413 
4414 /*
4415  * Function:
4416  *	vdc_handle_ver_msg()
4417  *
4418  * Description:
4419  *	Handle a version negotiation (VIO_VER_INFO) message from vds.
4420  * Arguments:
4421  *	vdc	- soft state pointer for this instance of the device driver.
4422  *	ver_msg	- LDC message sent by vDisk server
4423  *
4424  * Return Code:
4425  *	0	- Success
4426  */
4427 static int
4428 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg)
4429 {
4430 	int status = 0;
4431 
4432 	ASSERT(vdc != NULL);
4433 	ASSERT(mutex_owned(&vdc->lock));
4434 
4435 	if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) {
4436 		return (EPROTO);
4437 	}
4438 
4439 	if (ver_msg->dev_class != VDEV_DISK_SERVER) {
4440 		return (EINVAL);
4441 	}
4442 
4443 	switch (ver_msg->tag.vio_subtype) {
4444 	case VIO_SUBTYPE_ACK:
4445 		/*
4446 		 * We check to see if the version returned is indeed supported
4447 		 * (the server may have also adjusted the minor number downwards
4448 		 * and if so 'ver_msg' will contain the actual version agreed).
4449 		 */
4450 		if (vdc_is_supported_version(ver_msg)) {
4451 			vdc->ver.major = ver_msg->ver_major;
4452 			vdc->ver.minor = ver_msg->ver_minor;
4453 			ASSERT(vdc->ver.major > 0);
4454 		} else {
4455 			status = EPROTO;
4456 		}
4457 		break;
4458 
4459 	case VIO_SUBTYPE_NACK:
4460 		/*
4461 		 * call vdc_is_supported_version() which will return the next
4462 		 * supported version (if any) in 'ver_msg'
4463 		 */
4464 		(void) vdc_is_supported_version(ver_msg);
4465 		if (ver_msg->ver_major > 0) {
4466 			size_t len = sizeof (*ver_msg);
4467 
4468 			ASSERT(vdc->ver.major > 0);
4469 
4470 			/* reset the necessary fields and resend */
4471 			ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO;
4472 			ver_msg->dev_class = VDEV_DISK;
4473 
4474 			status = vdc_send(vdc, (caddr_t)ver_msg, &len);
4475 			DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n",
4476 			    vdc->instance, status);
4477 			if (len != sizeof (*ver_msg))
4478 				status = EBADMSG;
4479 		} else {
4480 			DMSG(vdc, 0, "[%d] No common version with vDisk server",
4481 			    vdc->instance);
4482 			status = ENOTSUP;
4483 		}
4484 
4485 		break;
4486 	case VIO_SUBTYPE_INFO:
4487 		/*
4488 		 * Handle the case where vds starts the handshake
4489 		 * (for now only vdc is the instigator)
4490 		 */
4491 		status = ENOTSUP;
4492 		break;
4493 
4494 	default:
4495 		status = EINVAL;
4496 		break;
4497 	}
4498 
4499 	return (status);
4500 }
4501 
4502 /*
4503  * Function:
4504  *	vdc_handle_attr_msg()
4505  *
4506  * Description:
4507  *	Handle a disk attribute (VIO_ATTR_INFO) message from vds.
4508  * Arguments:
4509  *	vdc	- soft state pointer for this instance of the device driver.
4510  *	attr_msg	- LDC message sent by vDisk server
4511  *
4512  * Return Code:
4513  *	0	- Success
4514  */
4515 static int
4516 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg)
4517 {
4518 	int status = 0;
4519 
4520 	ASSERT(vdc != NULL);
4521 	ASSERT(mutex_owned(&vdc->lock));
4522 
4523 	if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) {
4524 		return (EPROTO);
4525 	}
4526 
4527 	switch (attr_msg->tag.vio_subtype) {
4528 	case VIO_SUBTYPE_ACK:
4529 		/*
4530 		 * We now verify the attributes sent by vds.
4531 		 */
4532 		if (attr_msg->vdisk_size == 0) {
4533 			DMSG(vdc, 0, "[%d] Invalid disk size from vds",
4534 			    vdc->instance);
4535 			status = EINVAL;
4536 			break;
4537 		}
4538 
4539 		if (attr_msg->max_xfer_sz == 0) {
4540 			DMSG(vdc, 0, "[%d] Invalid transfer size from vds",
4541 			    vdc->instance);
4542 			status = EINVAL;
4543 			break;
4544 		}
4545 
4546 		if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) {
4547 			DMSG(vdc, 0, "[%d] Unknown disk size from vds",
4548 			    vdc->instance);
4549 			attr_msg->vdisk_size = 0;
4550 		}
4551 
4552 		/*
4553 		 * If the disk size is already set, check that it hasn't changed.
4554 		 */
4555 		if ((vdc->vdisk_size != 0) && (attr_msg->vdisk_size != 0) &&
4556 		    (vdc->vdisk_size != attr_msg->vdisk_size)) {
4557 			DMSG(vdc, 0, "[%d] Different disk size from vds "
4558 			    "(old=0x%lx - new=0x%lx)", vdc->instance,
4559 			    vdc->vdisk_size, attr_msg->vdisk_size);
4560 			status = EINVAL;
4561 			break;
4562 		}
4563 
4564 		vdc->vdisk_size = attr_msg->vdisk_size;
4565 		vdc->vdisk_type = attr_msg->vdisk_type;
4566 		vdc->operations = attr_msg->operations;
4567 		if (vio_ver_is_supported(vdc->ver, 1, 1))
4568 			vdc->vdisk_media = attr_msg->vdisk_media;
4569 		else
4570 			vdc->vdisk_media = 0;
4571 
4572 		DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n",
4573 		    vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz);
4574 		DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n",
4575 		    vdc->instance, vdc->block_size,
4576 		    attr_msg->vdisk_block_size);
4577 
4578 		/*
4579 		 * We don't know at compile time what the vDisk server will
4580 		 * think are good values but we apply a large (arbitrary)
4581 		 * upper bound to prevent memory exhaustion in vdc if it was
4582 		 * allocating a DRing based on huge values sent by the server.
4583 		 * We probably will never exceed this except if the message
4584 		 * was garbage.
4585 		 */
4586 		if ((attr_msg->max_xfer_sz * attr_msg->vdisk_block_size) <=
4587 		    (PAGESIZE * DEV_BSIZE)) {
4588 			vdc->max_xfer_sz = attr_msg->max_xfer_sz;
4589 			vdc->block_size = attr_msg->vdisk_block_size;
4590 		} else {
4591 			DMSG(vdc, 0, "[%d] vds block transfer size too big;"
4592 			    " using max supported by vdc", vdc->instance);
4593 		}
4594 
4595 		if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) ||
4596 		    (attr_msg->vdisk_size > INT64_MAX) ||
4597 		    (attr_msg->operations == 0) ||
4598 		    (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) {
4599 			DMSG(vdc, 0, "[%d] Invalid attributes from vds",
4600 			    vdc->instance);
4601 			status = EINVAL;
4602 			break;
4603 		}
4604 
4605 		/*
4606 		 * Now that we have received all attributes we can create a
4607 		 * fake geometry for the disk.
4608 		 */
4609 		vdc_create_fake_geometry(vdc);
4610 		break;
4611 
4612 	case VIO_SUBTYPE_NACK:
4613 		/*
4614 		 * vds could not handle the attributes we sent so we
4615 		 * stop negotiating.
4616 		 */
4617 		status = EPROTO;
4618 		break;
4619 
4620 	case VIO_SUBTYPE_INFO:
4621 		/*
4622 		 * Handle the case where vds starts the handshake
4623 		 * (for now, vdc is the only supported instigator)
4624 		 */
4625 		status = ENOTSUP;
4626 		break;
4627 
4628 	default:
4629 		status = ENOTSUP;
4630 		break;
4631 	}
4632 
4633 	return (status);
4634 }
4635 
4636 /*
4637  * Function:
4638  *	vdc_handle_dring_reg_msg()
4639  *
4640  * Description:
4641  *	Handle a DRing registration (VIO_DRING_REG) message from vds.
4642  * Arguments:
4643  *	vdc	- soft state pointer for this instance of the driver.
4644  *	dring_msg	- LDC message sent by vDisk server
4645  *
4646  * Return Code:
4647  *	0	- Success
4648  */
4649 static int
4650 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg)
4651 {
4652 	int status = 0;
4653 
4654 	ASSERT(vdc != NULL);
4655 	ASSERT(mutex_owned(&vdc->lock));
4656 
4657 	if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) {
4658 		return (EPROTO);
4659 	}
4660 
4661 	switch (dring_msg->tag.vio_subtype) {
4662 	case VIO_SUBTYPE_ACK:
4663 		/* save the received dring_ident */
4664 		vdc->dring_ident = dring_msg->dring_ident;
4665 		DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n",
4666 		    vdc->instance, vdc->dring_ident);
4667 		break;
4668 
4669 	case VIO_SUBTYPE_NACK:
4670 		/*
4671 		 * vds could not handle the DRing info we sent so we
4672 		 * stop negotiating.
4673 		 */
4674 		DMSG(vdc, 0, "[%d] server could not register DRing\n",
4675 		    vdc->instance);
4676 		status = EPROTO;
4677 		break;
4678 
4679 	case VIO_SUBTYPE_INFO:
4680 		/*
4681 		 * Handle the case where vds starts the handshake
4682 		 * (for now only vdc is the instigator)
4683 		 */
4684 		status = ENOTSUP;
4685 		break;
4686 	default:
4687 		status = ENOTSUP;
4688 	}
4689 
4690 	return (status);
4691 }
4692 
4693 /*
4694  * Function:
4695  *	vdc_verify_seq_num()
4696  *
4697  * Description:
4698  *	This function verifies that the sequence number sent back by the vDisk
4699  *	server with the latest message is what is expected (i.e. it is greater
4700  *	than the last seq num sent by the vDisk server and less than or equal
4701  *	to the last seq num generated by vdc).
4702  *
4703  *	It then checks the request ID to see if any requests need processing
4704  *	in the DRing.
4705  *
4706  * Arguments:
4707  *	vdc	- soft state pointer for this instance of the driver.
4708  *	dring_msg	- pointer to the LDC message sent by vds
4709  *
4710  * Return Code:
4711  *	VDC_SEQ_NUM_TODO	- Message needs to be processed
4712  *	VDC_SEQ_NUM_SKIP	- Message has already been processed
4713  *	VDC_SEQ_NUM_INVALID	- The seq numbers are so out of sync,
4714  *				  vdc cannot deal with them
4715  */
4716 static int
4717 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg)
4718 {
4719 	ASSERT(vdc != NULL);
4720 	ASSERT(dring_msg != NULL);
4721 	ASSERT(mutex_owned(&vdc->lock));
4722 
4723 	/*
4724 	 * Check to see if the messages were responded to in the correct
4725 	 * order by vds.
4726 	 */
4727 	if ((dring_msg->seq_num <= vdc->seq_num_reply) ||
4728 	    (dring_msg->seq_num > vdc->seq_num)) {
4729 		DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: "
4730 		    "%lu > expected <= %lu (last proc req %lu sent %lu)\n",
4731 		    vdc->instance, dring_msg->seq_num,
4732 		    vdc->seq_num_reply, vdc->seq_num,
4733 		    vdc->req_id_proc, vdc->req_id);
4734 		return (VDC_SEQ_NUM_INVALID);
4735 	}
4736 	vdc->seq_num_reply = dring_msg->seq_num;
4737 
4738 	if (vdc->req_id_proc < vdc->req_id)
4739 		return (VDC_SEQ_NUM_TODO);
4740 	else
4741 		return (VDC_SEQ_NUM_SKIP);
4742 }
4743 
4744 
4745 /*
4746  * Function:
4747  *	vdc_is_supported_version()
4748  *
4749  * Description:
4750  *	This routine checks if the major/minor version numbers specified in
4751  *	'ver_msg' are supported. If not it finds the next version that is
4752  *	in the supported version list 'vdc_version[]' and sets the fields in
4753  *	'ver_msg' to those values
4754  *
4755  * Arguments:
4756  *	ver_msg	- LDC message sent by vDisk server
4757  *
4758  * Return Code:
4759  *	B_TRUE	- Success
4760  *	B_FALSE	- Version not supported
4761  */
4762 static boolean_t
4763 vdc_is_supported_version(vio_ver_msg_t *ver_msg)
4764 {
4765 	int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]);
4766 
4767 	for (int i = 0; i < vdc_num_versions; i++) {
4768 		ASSERT(vdc_version[i].major > 0);
4769 		ASSERT((i == 0) ||
4770 		    (vdc_version[i].major < vdc_version[i-1].major));
4771 
4772 		/*
4773 		 * If the major versions match, adjust the minor version, if
4774 		 * necessary, down to the highest value supported by this
4775 		 * client. The server should support all minor versions lower
4776 		 * than the value it sent.
4777 		 */
4778 		if (ver_msg->ver_major == vdc_version[i].major) {
4779 			if (ver_msg->ver_minor > vdc_version[i].minor) {
4780 				DMSGX(0,
4781 				    "Adjusting minor version from %u to %u",
4782 				    ver_msg->ver_minor, vdc_version[i].minor);
4783 				ver_msg->ver_minor = vdc_version[i].minor;
4784 			}
4785 			return (B_TRUE);
4786 		}
4787 
4788 		/*
4789 		 * If the message contains a higher major version number, set
4790 		 * the message's major/minor versions to the current values
4791 		 * and return false, so this message will get resent with
4792 		 * these values, and the server will potentially try again
4793 		 * with the same or a lower version
4794 		 */
4795 		if (ver_msg->ver_major > vdc_version[i].major) {
4796 			ver_msg->ver_major = vdc_version[i].major;
4797 			ver_msg->ver_minor = vdc_version[i].minor;
4798 			DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n",
4799 			    ver_msg->ver_major, ver_msg->ver_minor);
4800 
4801 			return (B_FALSE);
4802 		}
4803 
4804 		/*
4805 		 * Otherwise, the message's major version is less than the
4806 		 * current major version, so continue the loop to the next
4807 		 * (lower) supported version
4808 		 */
4809 	}
4810 
4811 	/*
4812 	 * No common version was found; "ground" the version pair in the
4813 	 * message to terminate negotiation
4814 	 */
4815 	ver_msg->ver_major = 0;
4816 	ver_msg->ver_minor = 0;
4817 
4818 	return (B_FALSE);
4819 }
4820 /* -------------------------------------------------------------------------- */
4821 
4822 /*
4823  * DKIO(7I) support
4824  */
4825 
4826 typedef struct vdc_dk_arg {
4827 	struct dk_callback	dkc;
4828 	int			mode;
4829 	dev_t			dev;
4830 	vdc_t			*vdc;
4831 } vdc_dk_arg_t;
4832 
4833 /*
4834  * Function:
4835  * 	vdc_dkio_flush_cb()
4836  *
4837  * Description:
4838  *	This routine is a callback for DKIOCFLUSHWRITECACHE which can be called
4839  *	by kernel code.
4840  *
4841  * Arguments:
4842  *	arg	- a pointer to a vdc_dk_arg_t structure.
4843  */
4844 void
4845 vdc_dkio_flush_cb(void *arg)
4846 {
4847 	struct vdc_dk_arg	*dk_arg = (struct vdc_dk_arg *)arg;
4848 	struct dk_callback	*dkc = NULL;
4849 	vdc_t			*vdc = NULL;
4850 	int			rv;
4851 
4852 	if (dk_arg == NULL) {
4853 		cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n");
4854 		return;
4855 	}
4856 	dkc = &dk_arg->dkc;
4857 	vdc = dk_arg->vdc;
4858 	ASSERT(vdc != NULL);
4859 
4860 	rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0,
4861 	    VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);
4862 	if (rv != 0) {
4863 		DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n",
4864 		    vdc->instance, rv,
4865 		    ddi_model_convert_from(dk_arg->mode & FMODELS));
4866 	}
4867 
4868 	/*
4869 	 * Trigger the callback to notify the caller that the ioctl call has
4870 	 * completed.
4871 	 */
4872 	if ((dk_arg->mode & FKIOCTL) &&
4873 	    (dkc != NULL) &&
4874 	    (dkc->dkc_callback != NULL)) {
4875 		ASSERT(dkc->dkc_cookie != NULL);
4876 		(*dkc->dkc_callback)(dkc->dkc_cookie, rv);
4877 	}
4878 
4879 	/* Indicate that one less DKIO write flush is outstanding */
4880 	mutex_enter(&vdc->lock);
4881 	vdc->dkio_flush_pending--;
4882 	ASSERT(vdc->dkio_flush_pending >= 0);
4883 	mutex_exit(&vdc->lock);
4884 
4885 	/* free the mem that was allocated when the callback was dispatched */
4886 	kmem_free(arg, sizeof (vdc_dk_arg_t));
4887 }
4888 
4889 /*
4890  * Function:
4891  *	vdc_dkio_get_partition()
4892  *
4893  * Description:
4894  *	This function implements the DKIOCGAPART ioctl.
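 *	Each partition start is converted from an absolute block number
 *	to a starting cylinder by dividing by the number of blocks per
 *	cylinder (dkg_nhead * dkg_nsect); for example, with 16 heads and
 *	63 sectors per track, a partition starting at block 16128 is
 *	reported at cylinder 16128 / (16 * 63) = 16.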
4895 * 4896 * Arguments: 4897 * vdc - soft state pointer 4898 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 4899 * flag - ioctl flags 4900 */ 4901 static int 4902 vdc_dkio_get_partition(vdc_t *vdc, caddr_t arg, int flag) 4903 { 4904 struct dk_geom *geom; 4905 struct vtoc *vtoc; 4906 union { 4907 struct dk_map map[NDKMAP]; 4908 struct dk_map32 map32[NDKMAP]; 4909 } data; 4910 int i, rv, size; 4911 4912 mutex_enter(&vdc->lock); 4913 4914 if ((rv = vdc_validate_geometry(vdc)) != 0) { 4915 mutex_exit(&vdc->lock); 4916 return (rv); 4917 } 4918 4919 vtoc = vdc->vtoc; 4920 geom = vdc->geom; 4921 4922 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 4923 4924 for (i = 0; i < vtoc->v_nparts; i++) { 4925 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 4926 (geom->dkg_nhead * geom->dkg_nsect); 4927 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 4928 } 4929 size = NDKMAP * sizeof (struct dk_map32); 4930 4931 } else { 4932 4933 for (i = 0; i < vtoc->v_nparts; i++) { 4934 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 4935 (geom->dkg_nhead * geom->dkg_nsect); 4936 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 4937 } 4938 size = NDKMAP * sizeof (struct dk_map); 4939 4940 } 4941 4942 mutex_exit(&vdc->lock); 4943 4944 if (ddi_copyout(&data, arg, size, flag) != 0) 4945 return (EFAULT); 4946 4947 return (0); 4948 } 4949 4950 /* 4951 * Function: 4952 * vdc_dioctl_rwcmd() 4953 * 4954 * Description: 4955 * This function implements the DIOCTL_RWCMD ioctl. This ioctl is used 4956 * for DKC_DIRECT disks to read or write at an absolute disk offset. 4957 * 4958 * Arguments: 4959 * dev - device 4960 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 4961 * flag - ioctl flags 4962 */ 4963 static int 4964 vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag) 4965 { 4966 struct dadkio_rwcmd32 rwcmd32; 4967 struct dadkio_rwcmd rwcmd; 4968 struct iovec aiov; 4969 struct uio auio; 4970 int rw, status; 4971 struct buf *buf; 4972 4973 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 4974 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 4975 sizeof (struct dadkio_rwcmd32), flag)) { 4976 return (EFAULT); 4977 } 4978 rwcmd.cmd = rwcmd32.cmd; 4979 rwcmd.flags = rwcmd32.flags; 4980 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 4981 rwcmd.buflen = rwcmd32.buflen; 4982 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 4983 } else { 4984 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 4985 sizeof (struct dadkio_rwcmd), flag)) { 4986 return (EFAULT); 4987 } 4988 } 4989 4990 switch (rwcmd.cmd) { 4991 case DADKIO_RWCMD_READ: 4992 rw = B_READ; 4993 break; 4994 case DADKIO_RWCMD_WRITE: 4995 rw = B_WRITE; 4996 break; 4997 default: 4998 return (EINVAL); 4999 } 5000 5001 bzero((caddr_t)&aiov, sizeof (struct iovec)); 5002 aiov.iov_base = rwcmd.bufaddr; 5003 aiov.iov_len = rwcmd.buflen; 5004 5005 bzero((caddr_t)&auio, sizeof (struct uio)); 5006 auio.uio_iov = &aiov; 5007 auio.uio_iovcnt = 1; 5008 auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE; 5009 auio.uio_resid = rwcmd.buflen; 5010 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 5011 5012 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 5013 bioinit(buf); 5014 /* 5015 * We use the private field of buf to specify that this is an 5016 * I/O using an absolute offset. 
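	 * (Setting b_private to VD_SLICE_NONE marks the request as
	 * addressed from the start of the whole virtual disk rather
	 * than relative to a slice.)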
5017 */ 5018 buf->b_private = (void *)VD_SLICE_NONE; 5019 5020 status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio); 5021 5022 biofini(buf); 5023 kmem_free(buf, sizeof (buf_t)); 5024 5025 return (status); 5026 } 5027 5028 /* 5029 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated 5030 * buffer is returned in alloc_len. 5031 */ 5032 static vd_scsi_t * 5033 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, 5034 int *alloc_len) 5035 { 5036 vd_scsi_t *vd_scsi; 5037 int vd_scsi_len = VD_SCSI_SIZE; 5038 5039 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); 5040 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); 5041 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); 5042 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); 5043 5044 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 5045 5046 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); 5047 5048 vd_scsi->cdb_len = cdb_len; 5049 vd_scsi->sense_len = sense_len; 5050 vd_scsi->datain_len = datain_len; 5051 vd_scsi->dataout_len = dataout_len; 5052 5053 *alloc_len = vd_scsi_len; 5054 5055 return (vd_scsi); 5056 } 5057 5058 /* 5059 * Convert the status of a SCSI command to a Solaris return code. 5060 * 5061 * Arguments: 5062 * vd_scsi - The SCSI operation buffer. 5063 * log_error - indicate if an error message should be logged. 5064 * 5065 * Note that our SCSI error messages are rather primitive for the moment 5066 * and could be improved by decoding some data like the SCSI command and 5067 * the sense key. 5068 * 5069 * Return value: 5070 * 0 - Status is good. 5071 * EACCES - Status reports a reservation conflict. 5072 * ENOTSUP - Status reports a check condition and sense key 5073 * reports an illegal request. 5074 * EIO - Any other status. 
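 *
 * A typical caller submits the VD_OP_SCSICMD synchronously and then maps
 * the SCSI status, along the lines of the following (illustrative) sketch:
 *
 *	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi,
 *	    vd_scsi_len, 0, 0, CB_SYNC, (void *)(uint64_t)mode,
 *	    VIO_both_dir, B_FALSE);
 *	if (rv == 0)
 *		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);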
5075  */
5076 static int
5077 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error)
5078 {
5079 	int rv;
5080 	char path_str[MAXPATHLEN];
5081 	char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN];
5082 	union scsi_cdb *cdb;
5083 	struct scsi_extended_sense *sense;
5084 
5085 	if (vd_scsi->cmd_status == STATUS_GOOD)
5086 		/* no error */
5087 		return (0);
5088 
5089 	/* when the tunable vdc_scsi_log_error is true we log all errors */
5090 	if (vdc_scsi_log_error)
5091 		log_error = B_TRUE;
5092 
5093 	if (log_error) {
5094 		cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x\n",
5095 		    ddi_pathname(vdc->dip, path_str), vdc->instance,
5096 		    GETCMD(VD_SCSI_DATA_CDB(vd_scsi)));
5097 	}
5098 
5099 	/* default returned value */
5100 	rv = EIO;
5101 
5102 	switch (vd_scsi->cmd_status) {
5103 
5104 	case STATUS_CHECK:
5105 	case STATUS_TERMINATED:
5106 		if (log_error)
5107 			cmn_err(CE_CONT, "\tCheck Condition Error\n");
5108 
5109 		/* check sense buffer */
5110 		if (vd_scsi->sense_len == 0 ||
5111 		    vd_scsi->sense_status != STATUS_GOOD) {
5112 			if (log_error)
5113 				cmn_err(CE_CONT, "\tNo Sense Data Available\n");
5114 			break;
5115 		}
5116 
5117 		sense = VD_SCSI_DATA_SENSE(vd_scsi);
5118 
5119 		if (log_error) {
5120 			cmn_err(CE_CONT, "\tSense Key: 0x%x\n"
5121 			    "\tASC: 0x%x, ASCQ: 0x%x\n",
5122 			    scsi_sense_key((uint8_t *)sense),
5123 			    scsi_sense_asc((uint8_t *)sense),
5124 			    scsi_sense_ascq((uint8_t *)sense));
5125 		}
5126 
5127 		if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST)
5128 			rv = ENOTSUP;
5129 		break;
5130 
5131 	case STATUS_BUSY:
5132 		if (log_error)
5133 			cmn_err(CE_NOTE, "\tDevice Busy\n");
5134 		break;
5135 
5136 	case STATUS_RESERVATION_CONFLICT:
5137 		/*
5138 		 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then
5139 		 * the reservation conflict could be due to various reasons
5140 		 * (incorrect keys, not registered, not reserved, etc.), so
5141 		 * we should not panic in that case.
5142 		 */
5143 		cdb = VD_SCSI_DATA_CDB(vd_scsi);
5144 		if (vdc->failfast_interval != 0 &&
5145 		    cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN &&
5146 		    cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) {
5147 			/* failfast is enabled so we have to panic */
5148 			(void) snprintf(panic_str, sizeof (panic_str),
5149 			    VDC_RESV_CONFLICT_FMT_STR "%s",
5150 			    ddi_pathname(vdc->dip, path_str));
5151 			panic(panic_str);
5152 		}
5153 		if (log_error)
5154 			cmn_err(CE_NOTE, "\tReservation Conflict\n");
5155 		rv = EACCES;
5156 		break;
5157 
5158 	case STATUS_QFULL:
5159 		if (log_error)
5160 			cmn_err(CE_NOTE, "\tQueue Full\n");
5161 		break;
5162 
5163 	case STATUS_MET:
5164 	case STATUS_INTERMEDIATE:
5165 	case STATUS_SCSI2:
5166 	case STATUS_INTERMEDIATE_MET:
5167 	case STATUS_ACA_ACTIVE:
5168 		if (log_error)
5169 			cmn_err(CE_CONT,
5170 			    "\tUnexpected SCSI status received: 0x%x\n",
5171 			    vd_scsi->cmd_status);
5172 		break;
5173 
5174 	default:
5175 		if (log_error)
5176 			cmn_err(CE_CONT,
5177 			    "\tInvalid SCSI status received: 0x%x\n",
5178 			    vd_scsi->cmd_status);
5179 		break;
5180 	}
5181 
5182 	return (rv);
5183 }
5184 
5185 /*
5186  * Implement the USCSICMD uscsi(7I) ioctl. This ioctl is converted to
5187  * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI
5188  * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is
5189  * converted to a VD_OP_RESET operation.
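 *
 * The VD_OP_SCSICMD buffer built by vdc_scsi_alloc() is a single
 * allocation laid out as the vd_scsi_t header followed by the CDB,
 * sense, data-in and data-out areas, each rounded up to an 8-byte
 * boundary:
 *
 *	+-----------+-----+-------+---------+----------+
 *	| vd_scsi_t | cdb | sense | data-in | data-out |
 *	+-----------+-----+-------+---------+----------+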
5190 */ 5191 static int 5192 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) 5193 { 5194 struct uscsi_cmd uscsi; 5195 struct uscsi_cmd32 uscsi32; 5196 vd_scsi_t *vd_scsi; 5197 int vd_scsi_len; 5198 union scsi_cdb *cdb; 5199 struct scsi_extended_sense *sense; 5200 char *datain, *dataout; 5201 size_t cdb_len, datain_len, dataout_len, sense_len; 5202 int rv; 5203 5204 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5205 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), 5206 mode) != 0) 5207 return (EFAULT); 5208 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); 5209 } else { 5210 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), 5211 mode) != 0) 5212 return (EFAULT); 5213 } 5214 5215 /* a uscsi reset is converted to a VD_OP_RESET operation */ 5216 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | 5217 USCSI_RESET_ALL)) { 5218 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, 5219 (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); 5220 return (rv); 5221 } 5222 5223 /* cdb buffer length */ 5224 cdb_len = uscsi.uscsi_cdblen; 5225 5226 /* data in and out buffers length */ 5227 if (uscsi.uscsi_flags & USCSI_READ) { 5228 datain_len = uscsi.uscsi_buflen; 5229 dataout_len = 0; 5230 } else { 5231 datain_len = 0; 5232 dataout_len = uscsi.uscsi_buflen; 5233 } 5234 5235 /* sense buffer length */ 5236 if (uscsi.uscsi_flags & USCSI_RQENABLE) 5237 sense_len = uscsi.uscsi_rqlen; 5238 else 5239 sense_len = 0; 5240 5241 /* allocate buffer for the VD_SCSICMD_OP operation */ 5242 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5243 &vd_scsi_len); 5244 5245 /* 5246 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, 5247 * but basically they prevent a SCSI command from being retried in case 5248 * of an error. 
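	 * Both flags are therefore mapped to the single
	 * VD_SCSI_OPT_NORETRY option, so that the vdisk server fails
	 * the command back to us instead of retrying it.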
5249 */ 5250 if ((uscsi.uscsi_flags & USCSI_ISOLATE) || 5251 (uscsi.uscsi_flags & USCSI_DIAGNOSE)) 5252 vd_scsi->options |= VD_SCSI_OPT_NORETRY; 5253 5254 /* set task attribute */ 5255 if (uscsi.uscsi_flags & USCSI_NOTAG) { 5256 vd_scsi->task_attribute = 0; 5257 } else { 5258 if (uscsi.uscsi_flags & USCSI_HEAD) 5259 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5260 else if (uscsi.uscsi_flags & USCSI_HTAG) 5261 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; 5262 else if (uscsi.uscsi_flags & USCSI_OTAG) 5263 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; 5264 else 5265 vd_scsi->task_attribute = 0; 5266 } 5267 5268 /* set timeout */ 5269 vd_scsi->timeout = uscsi.uscsi_timeout; 5270 5271 /* copy-in cdb data */ 5272 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5273 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { 5274 rv = EFAULT; 5275 goto done; 5276 } 5277 5278 /* keep a pointer to the sense buffer */ 5279 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5280 5281 /* keep a pointer to the data-in buffer */ 5282 datain = (char *)VD_SCSI_DATA_IN(vd_scsi); 5283 5284 /* copy-in request data to the data-out buffer */ 5285 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); 5286 if (!(uscsi.uscsi_flags & USCSI_READ)) { 5287 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, 5288 mode)) { 5289 rv = EFAULT; 5290 goto done; 5291 } 5292 } 5293 5294 /* submit the request */ 5295 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5296 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5297 5298 if (rv != 0) 5299 goto done; 5300 5301 /* update scsi status */ 5302 uscsi.uscsi_status = vd_scsi->cmd_status; 5303 5304 /* update sense data */ 5305 if ((uscsi.uscsi_flags & USCSI_RQENABLE) && 5306 (uscsi.uscsi_status == STATUS_CHECK || 5307 uscsi.uscsi_status == STATUS_TERMINATED)) { 5308 5309 uscsi.uscsi_rqstatus = vd_scsi->sense_status; 5310 5311 if (uscsi.uscsi_rqstatus == STATUS_GOOD) { 5312 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - 5313 vd_scsi->sense_len; 5314 if (ddi_copyout(sense, uscsi.uscsi_rqbuf, 5315 vd_scsi->sense_len, mode) != 0) { 5316 rv = EFAULT; 5317 goto done; 5318 } 5319 } 5320 } 5321 5322 /* update request data */ 5323 if (uscsi.uscsi_status == STATUS_GOOD) { 5324 if (uscsi.uscsi_flags & USCSI_READ) { 5325 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5326 vd_scsi->datain_len; 5327 if (ddi_copyout(datain, uscsi.uscsi_bufaddr, 5328 vd_scsi->datain_len, mode) != 0) { 5329 rv = EFAULT; 5330 goto done; 5331 } 5332 } else { 5333 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5334 vd_scsi->dataout_len; 5335 } 5336 } 5337 5338 /* copy-out result */ 5339 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5340 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); 5341 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), 5342 mode) != 0) { 5343 rv = EFAULT; 5344 goto done; 5345 } 5346 } else { 5347 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), 5348 mode) != 0) { 5349 rv = EFAULT; 5350 goto done; 5351 } 5352 } 5353 5354 /* get the return code from the SCSI command status */ 5355 rv = vdc_scsi_status(vdc, vd_scsi, 5356 !(uscsi.uscsi_flags & USCSI_SILENT)); 5357 5358 done: 5359 kmem_free(vd_scsi, vd_scsi_len); 5360 return (rv); 5361 } 5362 5363 /* 5364 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. 5365 * 5366 * Arguments: 5367 * cmd - SCSI PERSISTENT IN command 5368 * len - length of the SCSI input buffer 5369 * vd_scsi_len - return the length of the allocated buffer 5370 * 5371 * Returned Value: 5372 * a pointer to the allocated VD_OP_SCSICMD buffer. 
5373  */
5374 static vd_scsi_t *
5375 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len)
5376 {
5377 	int cdb_len, sense_len, datain_len, dataout_len;
5378 	vd_scsi_t *vd_scsi;
5379 	union scsi_cdb *cdb;
5380 
5381 	cdb_len = CDB_GROUP1;
5382 	sense_len = sizeof (struct scsi_extended_sense);
5383 	datain_len = len;
5384 	dataout_len = 0;
5385 
5386 	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
5387 	    vd_scsi_len);
5388 
5389 	cdb = VD_SCSI_DATA_CDB(vd_scsi);
5390 
5391 	/* set cdb */
5392 	cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN;
5393 	cdb->cdb_opaque[1] = cmd;
5394 	FORMG1COUNT(cdb, datain_len);
5395 
5396 	vd_scsi->timeout = vdc_scsi_timeout;
5397 
5398 	return (vd_scsi);
5399 }
5400 
5401 /*
5402  * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command.
5403  *
5404  * Arguments:
5405  *	cmd		- SCSI PERSISTENT OUT command
5406  *	len		- length of the SCSI output buffer
5407  *	vd_scsi_len	- return the length of the allocated buffer
5408  *
5409  * Returned Value:
5410  *	a pointer to the allocated VD_OP_SCSICMD buffer.
5411  */
5412 static vd_scsi_t *
5413 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len)
5414 {
5415 	int cdb_len, sense_len, datain_len, dataout_len;
5416 	vd_scsi_t *vd_scsi;
5417 	union scsi_cdb *cdb;
5418 
5419 	cdb_len = CDB_GROUP1;
5420 	sense_len = sizeof (struct scsi_extended_sense);
5421 	datain_len = 0;
5422 	dataout_len = len;
5423 
5424 	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
5425 	    vd_scsi_len);
5426 
5427 	cdb = VD_SCSI_DATA_CDB(vd_scsi);
5428 
5429 	/* set cdb */
5430 	cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT;
5431 	cdb->cdb_opaque[1] = cmd;
5432 	FORMG1COUNT(cdb, dataout_len);
5433 
5434 	vd_scsi->timeout = vdc_scsi_timeout;
5435 
5436 	return (vd_scsi);
5437 }
5438 
5439 /*
5440  * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted
5441  * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk
5442  * server with a VD_OP_SCSICMD operation.
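 *
 * Note that mhioc_inkeys_t only embeds a pointer to the caller's
 * mhioc_key_list_t, so the result is copied out in two steps: first the
 * generation number and list length, then at most listsize reservation
 * keys into the caller's key list buffer.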
5443 */ 5444 static int 5445 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5446 { 5447 vd_scsi_t *vd_scsi; 5448 mhioc_inkeys_t inkeys; 5449 mhioc_key_list_t klist; 5450 struct mhioc_inkeys32 inkeys32; 5451 struct mhioc_key_list32 klist32; 5452 sd_prin_readkeys_t *scsi_keys; 5453 void *user_keys; 5454 int vd_scsi_len; 5455 int listsize, listlen, rv; 5456 5457 /* copyin arguments */ 5458 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5459 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5460 if (rv != 0) 5461 return (EFAULT); 5462 5463 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5464 sizeof (klist32), mode); 5465 if (rv != 0) 5466 return (EFAULT); 5467 5468 listsize = klist32.listsize; 5469 } else { 5470 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5471 if (rv != 0) 5472 return (EFAULT); 5473 5474 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5475 if (rv != 0) 5476 return (EFAULT); 5477 5478 listsize = klist.listsize; 5479 } 5480 5481 /* build SCSI VD_OP request */ 5482 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5483 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5484 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5485 5486 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5487 5488 /* submit the request */ 5489 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5490 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5491 5492 if (rv != 0) 5493 goto done; 5494 5495 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5496 5497 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5498 inkeys32.generation = scsi_keys->generation; 5499 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5500 if (rv != 0) { 5501 rv = EFAULT; 5502 goto done; 5503 } 5504 5505 klist32.listlen = listlen; 5506 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5507 sizeof (klist32), mode); 5508 if (rv != 0) { 5509 rv = EFAULT; 5510 goto done; 5511 } 5512 5513 user_keys = (caddr_t)(uintptr_t)klist32.list; 5514 } else { 5515 inkeys.generation = scsi_keys->generation; 5516 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5517 if (rv != 0) { 5518 rv = EFAULT; 5519 goto done; 5520 } 5521 5522 klist.listlen = listlen; 5523 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5524 if (rv != 0) { 5525 rv = EFAULT; 5526 goto done; 5527 } 5528 5529 user_keys = klist.list; 5530 } 5531 5532 /* copy out keys */ 5533 if (listlen > 0 && listsize > 0) { 5534 if (listsize < listlen) 5535 listlen = listsize; 5536 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5537 listlen * MHIOC_RESV_KEY_SIZE, mode); 5538 if (rv != 0) 5539 rv = EFAULT; 5540 } 5541 5542 if (rv == 0) 5543 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5544 5545 done: 5546 kmem_free(vd_scsi, vd_scsi_len); 5547 5548 return (rv); 5549 } 5550 5551 /* 5552 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5553 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5554 * the vdisk server with a VD_OP_SCSICMD operation. 
5555 */ 5556 static int 5557 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5558 { 5559 vd_scsi_t *vd_scsi; 5560 mhioc_inresvs_t inresv; 5561 mhioc_resv_desc_list_t rlist; 5562 struct mhioc_inresvs32 inresv32; 5563 struct mhioc_resv_desc_list32 rlist32; 5564 mhioc_resv_desc_t mhd_resv; 5565 sd_prin_readresv_t *scsi_resv; 5566 sd_readresv_desc_t *resv; 5567 mhioc_resv_desc_t *user_resv; 5568 int vd_scsi_len; 5569 int listsize, listlen, i, rv; 5570 5571 /* copyin arguments */ 5572 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5573 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5574 if (rv != 0) 5575 return (EFAULT); 5576 5577 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5578 sizeof (rlist32), mode); 5579 if (rv != 0) 5580 return (EFAULT); 5581 5582 listsize = rlist32.listsize; 5583 } else { 5584 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5585 if (rv != 0) 5586 return (EFAULT); 5587 5588 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5589 if (rv != 0) 5590 return (EFAULT); 5591 5592 listsize = rlist.listsize; 5593 } 5594 5595 /* build SCSI VD_OP request */ 5596 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5597 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5598 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5599 5600 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5601 5602 /* submit the request */ 5603 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5604 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5605 5606 if (rv != 0) 5607 goto done; 5608 5609 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5610 5611 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5612 inresv32.generation = scsi_resv->generation; 5613 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5614 if (rv != 0) { 5615 rv = EFAULT; 5616 goto done; 5617 } 5618 5619 rlist32.listlen = listlen; 5620 rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, 5621 sizeof (rlist32), mode); 5622 if (rv != 0) { 5623 rv = EFAULT; 5624 goto done; 5625 } 5626 5627 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 5628 } else { 5629 inresv.generation = scsi_resv->generation; 5630 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 5631 if (rv != 0) { 5632 rv = EFAULT; 5633 goto done; 5634 } 5635 5636 rlist.listlen = listlen; 5637 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 5638 if (rv != 0) { 5639 rv = EFAULT; 5640 goto done; 5641 } 5642 5643 user_resv = rlist.list; 5644 } 5645 5646 /* copy out reservations */ 5647 if (listsize > 0 && listlen > 0) { 5648 if (listsize < listlen) 5649 listlen = listsize; 5650 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 5651 5652 for (i = 0; i < listlen; i++) { 5653 mhd_resv.type = resv->type; 5654 mhd_resv.scope = resv->scope; 5655 mhd_resv.scope_specific_addr = 5656 BE_32(resv->scope_specific_addr); 5657 bcopy(&resv->resvkey, &mhd_resv.key, 5658 MHIOC_RESV_KEY_SIZE); 5659 5660 rv = ddi_copyout(&mhd_resv, user_resv, 5661 sizeof (mhd_resv), mode); 5662 if (rv != 0) { 5663 rv = EFAULT; 5664 goto done; 5665 } 5666 resv++; 5667 user_resv++; 5668 } 5669 } 5670 5671 if (rv == 0) 5672 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5673 5674 done: 5675 kmem_free(vd_scsi, vd_scsi_len); 5676 return (rv); 5677 } 5678 5679 /* 5680 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 5681 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 5682 * server with a VD_OP_SCSICMD operation. 
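 *
 * The mhioc_register_t fields map onto the PERSISTENT RESERVE OUT
 * parameter list: oldkey becomes the reservation key, newkey the service
 * action (registration) key, and aptpl is passed through to request that
 * the registration persist across power loss.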
5683 */ 5684 static int 5685 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 5686 { 5687 vd_scsi_t *vd_scsi; 5688 sd_prout_t *scsi_prout; 5689 mhioc_register_t mhd_reg; 5690 int vd_scsi_len, rv; 5691 5692 /* copyin arguments */ 5693 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 5694 if (rv != 0) 5695 return (EFAULT); 5696 5697 /* build SCSI VD_OP request */ 5698 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 5699 sizeof (sd_prout_t), &vd_scsi_len); 5700 5701 /* set parameters */ 5702 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5703 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 5704 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 5705 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 5706 5707 /* submit the request */ 5708 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5709 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5710 5711 if (rv == 0) 5712 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5713 5714 kmem_free(vd_scsi, vd_scsi_len); 5715 return (rv); 5716 } 5717 5718 /* 5719 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 5720 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 5721 * server with a VD_OP_SCSICMD operation. 5722 */ 5723 static int 5724 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 5725 { 5726 union scsi_cdb *cdb; 5727 vd_scsi_t *vd_scsi; 5728 sd_prout_t *scsi_prout; 5729 mhioc_resv_desc_t mhd_resv; 5730 int vd_scsi_len, rv; 5731 5732 /* copyin arguments */ 5733 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 5734 if (rv != 0) 5735 return (EFAULT); 5736 5737 /* build SCSI VD_OP request */ 5738 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 5739 sizeof (sd_prout_t), &vd_scsi_len); 5740 5741 /* set parameters */ 5742 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5743 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5744 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 5745 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 5746 cdb->cdb_opaque[2] = mhd_resv.type; 5747 5748 /* submit the request */ 5749 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5750 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5751 5752 if (rv == 0) 5753 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5754 5755 kmem_free(vd_scsi, vd_scsi_len); 5756 return (rv); 5757 } 5758 5759 /* 5760 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 5761 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 5762 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
5763 */ 5764 static int 5765 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode) 5766 { 5767 union scsi_cdb *cdb; 5768 vd_scsi_t *vd_scsi; 5769 sd_prout_t *scsi_prout; 5770 mhioc_preemptandabort_t mhd_preempt; 5771 int vd_scsi_len, rv; 5772 5773 /* copyin arguments */ 5774 rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode); 5775 if (rv != 0) 5776 return (EFAULT); 5777 5778 /* build SCSI VD_OP request */ 5779 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT, 5780 sizeof (sd_prout_t), &vd_scsi_len); 5781 5782 /* set parameters */ 5783 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5784 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5785 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5786 bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key, 5787 MHIOC_RESV_KEY_SIZE); 5788 bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key, 5789 MHIOC_RESV_KEY_SIZE); 5790 scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr; 5791 cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type; 5792 5793 /* submit the request */ 5794 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5795 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5796 5797 if (rv == 0) 5798 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5799 5800 kmem_free(vd_scsi, vd_scsi_len); 5801 return (rv); 5802 } 5803 5804 /* 5805 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl 5806 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY 5807 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation. 5808 */ 5809 static int 5810 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode) 5811 { 5812 vd_scsi_t *vd_scsi; 5813 sd_prout_t *scsi_prout; 5814 mhioc_registerandignorekey_t mhd_regi; 5815 int vd_scsi_len, rv; 5816 5817 /* copyin arguments */ 5818 rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode); 5819 if (rv != 0) 5820 return (EFAULT); 5821 5822 /* build SCSI VD_OP request */ 5823 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY, 5824 sizeof (sd_prout_t), &vd_scsi_len); 5825 5826 /* set parameters */ 5827 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5828 bcopy(mhd_regi.newkey.key, scsi_prout->service_key, 5829 MHIOC_RESV_KEY_SIZE); 5830 scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl; 5831 5832 /* submit the request */ 5833 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5834 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5835 5836 if (rv == 0) 5837 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5838 5839 kmem_free(vd_scsi, vd_scsi_len); 5840 return (rv); 5841 } 5842 5843 /* 5844 * This function is used by the failfast mechanism to send a SCSI command 5845 * to check for reservation conflict. 5846 */ 5847 static int 5848 vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) 5849 { 5850 int cdb_len, sense_len, vd_scsi_len; 5851 vd_scsi_t *vd_scsi; 5852 union scsi_cdb *cdb; 5853 int rv; 5854 5855 ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1); 5856 5857 if (scmd == SCMD_WRITE_G1) 5858 cdb_len = CDB_GROUP1; 5859 else 5860 cdb_len = CDB_GROUP0; 5861 5862 sense_len = sizeof (struct scsi_extended_sense); 5863 5864 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len); 5865 5866 /* set cdb */ 5867 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5868 cdb->scc_cmd = scmd; 5869 5870 vd_scsi->timeout = vdc_scsi_timeout; 5871 5872 /* 5873 * Submit the request. 
 * The last argument has to be B_FALSE so that
5874  * vdc_do_sync_op does not loop checking for a reservation conflict if
5875  * the operation returns an error.
5876  */
5877 	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
5878 	    0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE);
5879 
5880 	if (rv == 0)
5881 		(void) vdc_scsi_status(vdc, vd_scsi, B_FALSE);
5882 
5883 	kmem_free(vd_scsi, vd_scsi_len);
5884 	return (rv);
5885 }
5886 
5887 /*
5888  * This function is used by the failfast mechanism to check for a
5889  * reservation conflict. It sends SCSI commands that will fail with a
5890  * reservation conflict error if the system does not have access to the
5891  * disk, in which case the failfast mechanism will panic the system.
5892  *
5893  * Returned Code:
5894  *	0	- disk is accessible without reservation conflict error
5895  *	!= 0	- unable to check if disk is accessible
5896  */
5897 int
5898 vdc_failfast_check_resv(vdc_t *vdc)
5899 {
5900 	int failure = 0;
5901 
5902 	/*
5903 	 * Send a TEST UNIT READY command. The command will panic
5904 	 * the system if it fails with a reservation conflict.
5905 	 */
5906 	if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0)
5907 		failure++;
5908 
5909 	/*
5910 	 * With SPC-3 compliant devices TEST UNIT READY will succeed on
5911 	 * a reserved device, so we also do a zero-byte WRITE(10) in
5912 	 * order to provoke a Reservation Conflict status on those newer
5913 	 * devices.
5914 	 */
5915 	if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0)
5916 		failure++;
5917 
5918 	return (failure);
5919 }
5920 
5921 /*
5922  * Add a pending I/O to the failfast I/O queue. An I/O is added to this
5923  * queue when it has failed and failfast is enabled. Then we have to check
5924  * if it has failed because of a reservation conflict, in which case we
5925  * have to panic the system.
5926  *
5927  * Async I/O should be queued with their block I/O data transfer structure
5928  * (buf). Sync I/O should be queued with buf = NULL.
5929  */
5930 static vdc_io_t *
5931 vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf)
5932 {
5933 	vdc_io_t *vio;
5934 
5935 	ASSERT(MUTEX_HELD(&vdc->lock));
5936 
5937 	vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP);
5938 	vio->vio_next = vdc->failfast_io_queue;
5939 	vio->vio_buf = buf;
5940 	vio->vio_qtime = ddi_get_lbolt();
5941 
5942 	vdc->failfast_io_queue = vio;
5943 
5944 	/* notify the failfast thread that a new I/O is queued */
5945 	cv_signal(&vdc->failfast_cv);
5946 
5947 	return (vio);
5948 }
5949 
5950 /*
5951  * Remove and complete the I/Os in the failfast I/O queue that were
5952  * added prior to the indicated deadline. A deadline of 0 means that all
5953  * I/Os have to be unqueued and marked as completed.
5954  */
5955 static void
5956 vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline)
5957 {
5958 	vdc_io_t *vio, *vio_tmp;
5959 
5960 	ASSERT(MUTEX_HELD(&vdc->lock));
5961 
5962 	vio_tmp = NULL;
5963 	vio = vdc->failfast_io_queue;
5964 
5965 	if (deadline != 0) {
5966 		/*
5967 		 * Skip any io queued after the deadline. The failfast
5968 		 * I/O queue is ordered starting with the last I/O added
5969 		 * to the queue.
5970 		 */
5971 		while (vio != NULL && vio->vio_qtime > deadline) {
5972 			vio_tmp = vio;
5973 			vio = vio->vio_next;
5974 		}
5975 	}
5976 
5977 	if (vio == NULL)
5978 		/* nothing to unqueue */
5979 		return;
5980 
5981 	/* update the queue */
5982 	if (vio_tmp == NULL)
5983 		vdc->failfast_io_queue = NULL;
5984 	else
5985 		vio_tmp->vio_next = NULL;
5986 
5987 	/*
5988 	 * Complete unqueued I/O.
 * Async I/O have a block I/O data transfer
5989 	 * structure (buf) and they are completed by calling biodone(). Sync
5990 	 * I/O do not have a buf and they are completed by setting the
5991 	 * vio_qtime to zero and signaling failfast_io_cv. In that case, the
5992 	 * thread waiting for the I/O to complete is responsible for freeing
5993 	 * the vio structure.
5994 	 */
5995 	while (vio != NULL) {
5996 		vio_tmp = vio->vio_next;
5997 		if (vio->vio_buf != NULL) {
5998 			VD_KSTAT_RUNQ_EXIT(vdc->io_stats);
5999 			DTRACE_IO1(done, buf_t *, vio->vio_buf);
6000 			biodone(vio->vio_buf);
6001 			kmem_free(vio, sizeof (vdc_io_t));
6002 		} else {
6003 			vio->vio_qtime = 0;
6004 		}
6005 		vio = vio_tmp;
6006 	}
6007 
6008 	cv_broadcast(&vdc->failfast_io_cv);
6009 }
6010 
6011 /*
6012  * Failfast Thread.
6013  *
6014  * While failfast is enabled, the failfast thread sends TEST UNIT READY
6015  * and zero-size WRITE(10) SCSI commands on a regular basis to check that
6016  * we still have access to the disk. If a command fails with a RESERVATION
6017  * CONFLICT error then the system will immediately panic.
6018  *
6019  * The failfast thread is also woken up when an I/O has failed. It then checks
6020  * the access to the disk to ensure that the I/O failure was not due to a
6021  * reservation conflict.
6022  *
6023  * There is one failfast thread for each virtual disk for which failfast is
6024  * enabled. We could have only one thread sending requests for all disks but
6025  * this would need vdc to send asynchronous requests and to have callbacks to
6026  * process replies.
6027  */
6028 static void
6029 vdc_failfast_thread(void *arg)
6030 {
6031 	int status;
6032 	vdc_t *vdc = (vdc_t *)arg;
6033 	clock_t timeout, starttime;
6034 
6035 	mutex_enter(&vdc->lock);
6036 
6037 	while (vdc->failfast_interval != 0) {
6038 
6039 		starttime = ddi_get_lbolt();
6040 
6041 		mutex_exit(&vdc->lock);
6042 
6043 		/* check for reservation conflict */
6044 		status = vdc_failfast_check_resv(vdc);
6045 
6046 		mutex_enter(&vdc->lock);
6047 		/*
6048 		 * We have dropped the lock to send the SCSI command so we have
6049 		 * to check that failfast is still enabled.
6050 		 */
6051 		if (vdc->failfast_interval == 0)
6052 			break;
6053 
6054 		/*
6055 		 * If we have successfully checked the disk access and there
6056 		 * was no reservation conflict then we can complete any I/O
6057 		 * queued before the last check.
6058 		 */
6059 		if (status == 0)
6060 			vdc_failfast_io_unqueue(vdc, starttime);
6061 
6062 		/* proceed again if some I/O are still in the queue */
6063 		if (vdc->failfast_io_queue != NULL)
6064 			continue;
6065 
6066 		timeout = ddi_get_lbolt() +
6067 		    drv_usectohz(vdc->failfast_interval);
6068 		(void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout);
6069 	}
6070 
6071 	/*
6072 	 * Failfast is being stopped so we can complete any queued I/O.
6073 	 */
6074 	vdc_failfast_io_unqueue(vdc, 0);
6075 	vdc->failfast_thread = NULL;
6076 	mutex_exit(&vdc->lock);
6077 	thread_exit();
6078 }
6079 
6080 /*
6081  * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
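 *
 * The mh_time argument is in milliseconds while vdc->failfast_interval
 * is kept in microseconds, so, for example, mh_time = 1000 makes the
 * failfast thread poll roughly once a second, while mh_time = 0 disables
 * failfast and lets the failfast thread exit.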

/*
 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are
 * converted to VD_OP_SET_ACCESS operations.
 */
static int
vdc_access_set(vdc_t *vdc, uint64_t flags, int mode)
{
	int rv;

	/* submit ownership command request */
	rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags,
	    sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
	    VIO_both_dir, B_TRUE);

	return (rv);
}

/*
 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a
 * VD_OP_GET_ACCESS operation.
 */
static int
vdc_access_get(vdc_t *vdc, uint64_t *status, int mode)
{
	int rv;

	/* submit ownership command request */
	rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status,
	    sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
	    VIO_both_dir, B_TRUE);

	return (rv);
}

/*
 * Disk Ownership Thread.
 *
 * When we have taken the ownership of a disk, this thread waits to be
 * notified when the LDC channel is reset so that it can recover the
 * ownership.
 *
 * Note that the thread handling the LDC reset (vdc_process_msg_thread())
 * cannot be used to do the ownership recovery because it has to be
 * running to handle the reply message to the ownership operation.
 */
static void
vdc_ownership_thread(void *arg)
{
	vdc_t *vdc = (vdc_t *)arg;
	clock_t timeout;
	uint64_t status;

	mutex_enter(&vdc->ownership_lock);
	mutex_enter(&vdc->lock);

	while (vdc->ownership & VDC_OWNERSHIP_WANTED) {

		if ((vdc->ownership & VDC_OWNERSHIP_RESET) ||
		    !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) {
			/*
			 * There was a reset so the ownership has been lost,
			 * try to recover. We do this without using the preempt
			 * option so that we don't steal the ownership from
			 * someone who has preempted us.
			 */
			DMSG(vdc, 0, "[%d] Ownership lost, recovering",
			    vdc->instance);

			vdc->ownership &= ~(VDC_OWNERSHIP_RESET |
			    VDC_OWNERSHIP_GRANTED);

			mutex_exit(&vdc->lock);

			status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
			    VD_ACCESS_SET_PRESERVE, FKIOCTL);

			mutex_enter(&vdc->lock);

			if (status == 0) {
				DMSG(vdc, 0, "[%d] Ownership recovered",
				    vdc->instance);
				vdc->ownership |= VDC_OWNERSHIP_GRANTED;
			} else {
				DMSG(vdc, 0,
				    "[%d] Failed to recover ownership",
				    vdc->instance);
			}

		}

		/*
		 * If we have the ownership then we just wait for an event
		 * to happen (LDC reset), otherwise we will retry to recover
		 * it after a delay.
		 */
		if (vdc->ownership & VDC_OWNERSHIP_GRANTED)
			timeout = 0;
		else
			timeout = ddi_get_lbolt() +
			    drv_usectohz(vdc_ownership_delay);

		/* Release the ownership_lock and wait on the vdc lock */
		mutex_exit(&vdc->ownership_lock);

		if (timeout == 0)
			(void) cv_wait(&vdc->ownership_cv, &vdc->lock);
		else
			(void) cv_timedwait(&vdc->ownership_cv,
			    &vdc->lock, timeout);

		mutex_exit(&vdc->lock);

		mutex_enter(&vdc->ownership_lock);
		mutex_enter(&vdc->lock);
	}

	vdc->ownership_thread = NULL;
	mutex_exit(&vdc->lock);
	mutex_exit(&vdc->ownership_lock);

	thread_exit();
}
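
/*
 * Example (user-level, illustrative only; the device path is hypothetical):
 * exclusive access is requested and released through the mhd(7i) ioctls,
 * which are backed by vdc_access_set()/vdc_access_get() above. MHIOCSTATUS
 * returns 0 when this host still has access to the disk and 1 when the
 * disk is reserved by another host:
 *
 *	int fd = open("/dev/rdsk/c0d0s2", O_RDWR);
 *
 *	if (ioctl(fd, MHIOCTKOWN, NULL) == 0) {
 *		// ... exclusive use of the virtual disk ...
 *		if (ioctl(fd, MHIOCSTATUS, NULL) == 1)
 *			(void) printf("disk reserved by another host\n");
 *		(void) ioctl(fd, MHIOCRELEASE, NULL);
 *	}
 */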

static void
vdc_ownership_update(vdc_t *vdc, int ownership_flags)
{
	ASSERT(MUTEX_HELD(&vdc->ownership_lock));

	mutex_enter(&vdc->lock);
	vdc->ownership = ownership_flags;
	if ((vdc->ownership & VDC_OWNERSHIP_WANTED) &&
	    vdc->ownership_thread == NULL) {
		/* start ownership thread */
		vdc->ownership_thread = thread_create(NULL, 0,
		    vdc_ownership_thread, vdc, 0, &p0, TS_RUN,
		    v.v_maxsyspri - 2);
	} else {
		/* notify the ownership thread */
		cv_signal(&vdc->ownership_cv);
	}
	mutex_exit(&vdc->lock);
}

/*
 * Get the size and the block size of a virtual disk from the vdisk server.
 * We need to use this operation when the vdisk_size attribute was not
 * available during the handshake with the vdisk server.
 */
static int
vdc_check_capacity(vdc_t *vdc)
{
	int rv = 0;
	size_t alloc_len;
	vd_capacity_t *vd_cap;

	if (vdc->vdisk_size != 0)
		return (0);

	alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t));

	vd_cap = kmem_zalloc(alloc_len, KM_SLEEP);

	rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE);

	if (rv == 0) {
		if (vd_cap->vdisk_block_size != vdc->block_size ||
		    vd_cap->vdisk_size == VD_SIZE_UNKNOWN ||
		    vd_cap->vdisk_size == 0)
			rv = EINVAL;
		else
			vdc->vdisk_size = vd_cap->vdisk_size;
	}

	kmem_free(vd_cap, alloc_len);
	return (rv);
}
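
/*
 * Note: as in vdc_check_capacity() above, every buffer handed to
 * vdc_do_sync_op() is sized with P2ROUNDUP() because LDC requires the
 * memory being mapped to be 8-byte aligned. A minimal sketch of the
 * allocation idiom used throughout this file:
 *
 *	size_t alloc_len = P2ROUNDUP(len, sizeof (uint64_t));
 *	caddr_t mem_p = kmem_zalloc(alloc_len, KM_SLEEP);
 *
 *	// ... vdc_do_sync_op(vdc, op, mem_p, alloc_len, ...) ...
 *
 *	kmem_free(mem_p, alloc_len);
 */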

/*
 * This structure is used in the DKIO(7I) array below.
 */
typedef struct vdc_dk_ioctl {
	uint8_t		op;		/* VD_OP_XXX value */
	int		cmd;		/* Solaris ioctl operation number */
	size_t		nbytes;		/* size of structure to be copied */

	/* function to convert between vDisk and Solaris structure formats */
	int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg,
	    int mode, int dir);
} vdc_dk_ioctl_t;

/*
 * Subset of DKIO(7I) operations currently supported
 */
static vdc_dk_ioctl_t	dk_ioctl[] = {
	{VD_OP_FLUSH,		DKIOCFLUSHWRITECACHE,	0,
		vdc_null_copy_func},
	{VD_OP_GET_WCE,		DKIOCGETWCE,		sizeof (int),
		vdc_get_wce_convert},
	{VD_OP_SET_WCE,		DKIOCSETWCE,		sizeof (int),
		vdc_set_wce_convert},
	{VD_OP_GET_VTOC,	DKIOCGVTOC,		sizeof (vd_vtoc_t),
		vdc_get_vtoc_convert},
	{VD_OP_SET_VTOC,	DKIOCSVTOC,		sizeof (vd_vtoc_t),
		vdc_set_vtoc_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCGGEOM,		sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCG_PHYGEOM,		sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCG_VIRTGEOM,	sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_SET_DISKGEOM,	DKIOCSGEOM,		sizeof (vd_geom_t),
		vdc_set_geom_convert},
	{VD_OP_GET_EFI,		DKIOCGETEFI,		0,
		vdc_get_efi_convert},
	{VD_OP_SET_EFI,		DKIOCSETEFI,		0,
		vdc_set_efi_convert},

	/* DIOCTL_RWCMD is converted to a read or a write */
	{0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL},

	/* mhd(7I) non-shared multihost disks ioctls */
	{0, MHIOCTKOWN,				0, vdc_null_copy_func},
	{0, MHIOCRELEASE,			0, vdc_null_copy_func},
	{0, MHIOCSTATUS,			0, vdc_null_copy_func},
	{0, MHIOCQRESERVE,			0, vdc_null_copy_func},

	/* mhd(7I) shared multihost disks ioctls */
	{0, MHIOCGRP_INKEYS,			0, vdc_null_copy_func},
	{0, MHIOCGRP_INRESV,			0, vdc_null_copy_func},
	{0, MHIOCGRP_REGISTER,			0, vdc_null_copy_func},
	{0, MHIOCGRP_RESERVE,			0, vdc_null_copy_func},
	{0, MHIOCGRP_PREEMPTANDABORT,		0, vdc_null_copy_func},
	{0, MHIOCGRP_REGISTERANDIGNOREKEY,	0, vdc_null_copy_func},

	/* mhd(7I) failfast ioctl */
	{0, MHIOCENFAILFAST,			0, vdc_null_copy_func},

	/*
	 * These particular ioctls are not sent to the server - vdc fakes up
	 * the necessary info.
	 */
	{0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func},
	{0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func},
	{0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func},
	{0, DKIOCGAPART, 0, vdc_null_copy_func},
	{0, DKIOCREMOVABLE, 0, vdc_null_copy_func},
	{0, CDROMREADOFFSET, 0, vdc_null_copy_func}
};
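
/*
 * Example (user-level, illustrative only; the device path is hypothetical):
 * the dkio(7I) operations listed above reach this driver through the
 * regular ioctl path, e.g. reading the geometry and VTOC of a vdisk:
 *
 *	#include <sys/dkio.h>
 *	#include <sys/vtoc.h>
 *
 *	struct dk_geom geom;
 *	struct vtoc vtoc;
 *	int fd = open("/dev/rdsk/c0d0s2", O_RDONLY);
 *
 *	if (ioctl(fd, DKIOCGGEOM, &geom) == 0 &&
 *	    ioctl(fd, DKIOCGVTOC, &vtoc) == 0)
 *		(void) printf("%d cylinders, %d slices\n",
 *		    geom.dkg_ncyl, (int)vtoc.v_nparts);
 */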

/*
 * This function handles ioctl requests from the vd_efi_alloc_and_read()
 * function and forwards them to the vdisk.
 */
static int
vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg)
{
	vdc_t *vdc = (vdc_t *)vdisk;
	dev_t dev;
	int rval;

	dev = makedevice(ddi_driver_major(vdc->dip),
	    VD_MAKE_DEV(vdc->instance, 0));

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval));
}

/*
 * Function:
 *	vd_process_ioctl()
 *
 * Description:
 *	This routine processes disk-specific ioctl calls.
 *
 * Arguments:
 *	dev	- the device number
 *	cmd	- the operation [dkio(7I)] to be processed
 *	arg	- pointer to user provided structure
 *		  (contains data to be set or reference parameter for get)
 *	mode	- bit flag, indicating open settings, 32/64 bit type, etc
 *	rvalp	- pointer to return value for calling process.
 *
 * Return Code:
 *	0
 *	EFAULT
 *	ENXIO
 *	EIO
 *	ENOTSUP
 */
static int
vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp)
{
	int		instance = VDCUNIT(dev);
	vdc_t		*vdc = NULL;
	int		rv = -1;
	int		idx = 0;	/* index into dk_ioctl[] */
	size_t		len = 0;	/* #bytes to send to vds */
	size_t		alloc_len = 0;	/* #bytes to allocate mem for */
	caddr_t		mem_p = NULL;
	size_t		nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0]));
	vdc_dk_ioctl_t	*iop;

	vdc = ddi_get_soft_state(vdc_state, instance);
	if (vdc == NULL) {
		cmn_err(CE_NOTE, "![%d] Could not get soft state structure",
		    instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n",
	    instance, cmd, dev, ddi_model_convert_from(mode & FMODELS));

	if (rvalp != NULL) {
		/* the return value of the ioctl is 0 by default */
		*rvalp = 0;
	}

	/*
	 * Validate the ioctl operation to be performed.
	 *
	 * If we have looped through the array without finding a match then we
	 * don't support this ioctl.
	 */
	for (idx = 0; idx < nioctls; idx++) {
		if (cmd == dk_ioctl[idx].cmd)
			break;
	}

	if (idx >= nioctls) {
		DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n",
		    vdc->instance, cmd);
		return (ENOTSUP);
	}

	iop = &(dk_ioctl[idx]);

	if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) {
		/* size is not fixed for EFI ioctls, it depends on ioctl arg */
		dk_efi_t	dk_efi;

		rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length;
	} else {
		len = iop->nbytes;
	}

	/* check if the ioctl is applicable */
	switch (cmd) {
	case CDROMREADOFFSET:
	case DKIOCREMOVABLE:
		return (ENOTTY);

	case USCSICMD:
	case MHIOCTKOWN:
	case MHIOCSTATUS:
	case MHIOCQRESERVE:
	case MHIOCRELEASE:
	case MHIOCGRP_INKEYS:
	case MHIOCGRP_INRESV:
	case MHIOCGRP_REGISTER:
	case MHIOCGRP_RESERVE:
	case MHIOCGRP_PREEMPTANDABORT:
	case MHIOCGRP_REGISTERANDIGNOREKEY:
	case MHIOCENFAILFAST:
		if (vdc->cinfo == NULL)
			return (ENXIO);
		if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS)
			return (ENOTTY);
		break;

	case DIOCTL_RWCMD:
		if (vdc->cinfo == NULL)
			return (ENXIO);
		if (vdc->cinfo->dki_ctype != DKC_DIRECT)
			return (ENOTTY);
		break;

	case DKIOCINFO:
		if (vdc->cinfo == NULL)
			return (ENXIO);
		break;

	case DKIOCGMEDIAINFO:
		if (vdc->minfo == NULL)
			return (ENXIO);
		if (vdc_check_capacity(vdc) != 0)
			/* disk capacity is not available */
			return (EIO);
		break;
	}

	/*
	 * Deal with ioctls which require processing different from
	 * converting ioctl arguments and sending a corresponding
	 * VD operation.
	 */
	switch (cmd) {

	case USCSICMD:
	{
		return (vdc_uscsi_cmd(vdc, arg, mode));
	}

	case MHIOCTKOWN:
	{
		mutex_enter(&vdc->ownership_lock);
		/*
		 * We have to set VDC_OWNERSHIP_WANTED now so that the
		 * ownership can be flagged with VDC_OWNERSHIP_RESET if the
		 * LDC is reset while we are processing the ioctl.
		 */
		vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED);

		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
		    VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED |
			    VDC_OWNERSHIP_GRANTED);
		} else {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
		mutex_exit(&vdc->ownership_lock);
		return (rv);
	}

	case MHIOCRELEASE:
	{
		mutex_enter(&vdc->ownership_lock);
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
		mutex_exit(&vdc->ownership_lock);
		return (rv);
	}

	case MHIOCSTATUS:
	{
		uint64_t status;

		rv = vdc_access_get(vdc, &status, mode);
		if (rv == 0 && rvalp != NULL)
			*rvalp = (status & VD_ACCESS_ALLOWED) ? 0 : 1;
		return (rv);
	}

	case MHIOCQRESERVE:
	{
		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode);
		return (rv);
	}

	case MHIOCGRP_INKEYS:
	{
		return (vdc_mhd_inkeys(vdc, arg, mode));
	}

	case MHIOCGRP_INRESV:
	{
		return (vdc_mhd_inresv(vdc, arg, mode));
	}

	case MHIOCGRP_REGISTER:
	{
		return (vdc_mhd_register(vdc, arg, mode));
	}

	case MHIOCGRP_RESERVE:
	{
		return (vdc_mhd_reserve(vdc, arg, mode));
	}

	case MHIOCGRP_PREEMPTANDABORT:
	{
		return (vdc_mhd_preemptabort(vdc, arg, mode));
	}

	case MHIOCGRP_REGISTERANDIGNOREKEY:
	{
		return (vdc_mhd_registerignore(vdc, arg, mode));
	}

	case MHIOCENFAILFAST:
	{
		rv = vdc_failfast(vdc, arg, mode);
		return (rv);
	}

	case DIOCTL_RWCMD:
	{
		return (vdc_dioctl_rwcmd(dev, arg, mode));
	}

	case DKIOCGAPART:
	{
		return (vdc_dkio_get_partition(vdc, arg, mode));
	}

	case DKIOCINFO:
	{
		struct dk_cinfo cinfo;

		bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo));
		cinfo.dki_partition = VDCPART(dev);

		rv = ddi_copyout(&cinfo, (void *)arg,
		    sizeof (struct dk_cinfo), mode);
		if (rv != 0)
			return (EFAULT);

		return (0);
	}

	case DKIOCGMEDIAINFO:
	{
		ASSERT(vdc->vdisk_size != 0);
		if (vdc->minfo->dki_capacity == 0)
			vdc->minfo->dki_capacity = vdc->vdisk_size;
		rv = ddi_copyout(vdc->minfo, (void *)arg,
		    sizeof (struct dk_minfo), mode);
		if (rv != 0)
			return (EFAULT);

		return (0);
	}

	case DKIOCFLUSHWRITECACHE:
	{
		struct dk_callback *dkc =
		    (struct dk_callback *)(uintptr_t)arg;
		vdc_dk_arg_t	*dkarg = NULL;

		DMSG(vdc, 1, "[%d] Flush W$: mode %x\n",
		    instance, mode);

		/*
		 * If arg is NULL, then there is no callback function
		 * registered and the call operates synchronously; we
		 * break and continue with the rest of the function and
		 * wait for vds to return (i.e. after the request to
		 * vds returns successfully, all writes completed prior
		 * to the ioctl will have been flushed from the disk
		 * write cache to persistent media).
		 *
		 * If a callback function is registered, we dispatch
		 * the request on a task queue and return immediately.
		 * The callback will deal with informing the calling
		 * thread that the flush request is completed.
		 */
		if (dkc == NULL)
			break;

		/*
		 * the asynchronous callback is only supported if
		 * invoked from within the kernel
		 */
		if ((mode & FKIOCTL) == 0)
			return (ENOTSUP);

		dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP);

		dkarg->mode = mode;
		dkarg->dev = dev;
		bcopy(dkc, &dkarg->dkc, sizeof (*dkc));

		mutex_enter(&vdc->lock);
		vdc->dkio_flush_pending++;
		dkarg->vdc = vdc;
		mutex_exit(&vdc->lock);

		/* put the request on a task queue */
		rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb,
		    (void *)dkarg, DDI_SLEEP);
		if (rv == NULL) {
			/* clean up if dispatch fails */
			mutex_enter(&vdc->lock);
			vdc->dkio_flush_pending--;
			mutex_exit(&vdc->lock);
			kmem_free(dkarg, sizeof (vdc_dk_arg_t));
		}

		return (rv == NULL ? ENOMEM : 0);
	}
	}

	/* catch programming error in vdc - should be a VD_OP_XXX ioctl */
	ASSERT(iop->op != 0);

	/* check if the vDisk server handles the operation for this vDisk */
	if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) {
		DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n",
		    vdc->instance, iop->op);
		return (ENOTSUP);
	}

	/* LDC requires that the memory being mapped is 8-byte aligned */
	alloc_len = P2ROUNDUP(len, sizeof (uint64_t));
	DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n",
	    instance, len, alloc_len);

	if (alloc_len > 0)
		mem_p = kmem_zalloc(alloc_len, KM_SLEEP);

	/*
	 * Call the conversion function for this ioctl which, if necessary,
	 * converts from the Solaris format to the format ARC'ed
	 * as part of the vDisk protocol (FWARC 2006/195)
	 */
	ASSERT(iop->convert != NULL);
	rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN);
	if (rv != 0) {
		DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
		    instance, rv, cmd);
		if (mem_p != NULL)
			kmem_free(mem_p, alloc_len);
		return (rv);
	}

	/*
	 * send request to vds to service the ioctl.
	 */
	rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len,
	    VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode,
	    VIO_both_dir, B_TRUE);

	if (rv != 0) {
		/*
		 * This is not necessarily an error. The ioctl could
		 * be returning a value such as ENOTTY to indicate
		 * that the ioctl is not applicable.
		 */
		DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n",
		    instance, rv, cmd);
		if (mem_p != NULL)
			kmem_free(mem_p, alloc_len);

		return (rv);
	}

	/*
	 * Call the conversion function (if it exists) for this ioctl
	 * which converts from the format ARC'ed as part of the vDisk
	 * protocol (FWARC 2006/195) back to a format understood by
	 * the rest of Solaris.
	 */
	rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT);
	if (rv != 0) {
		DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
		    instance, rv, cmd);
		if (mem_p != NULL)
			kmem_free(mem_p, alloc_len);
		return (rv);
	}

	if (mem_p != NULL)
		kmem_free(mem_p, alloc_len);

	return (rv);
}

/*
 * Function:
 *	vdc_null_copy_func()
 *
 * Description:
 *	This is an empty conversion function used by ioctl calls which
 *	do not need to convert the data being passed in/out to userland
 */
static int
vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))
	_NOTE(ARGUNUSED(from))
	_NOTE(ARGUNUSED(to))
	_NOTE(ARGUNUSED(mode))
	_NOTE(ARGUNUSED(dir))

	return (0);
}

static int
vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	if (dir == VD_COPYIN)
		return (0);	/* nothing to do */

	if (ddi_copyout(from, to, sizeof (int), mode) != 0)
		return (EFAULT);

	return (0);
}

static int
vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	if (dir == VD_COPYOUT)
		return (0);	/* nothing to do */

	if (ddi_copyin(from, to, sizeof (int), mode) != 0)
		return (EFAULT);

	return (0);
}
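
/*
 * Example (user-level, illustrative only; the device path is hypothetical):
 * the two conversion functions above back the DKIOCGETWCE/DKIOCSETWCE
 * ioctls, which read and set the write cache enable bit of the vdisk:
 *
 *	int fd = open("/dev/rdsk/c0d0s2", O_RDWR);
 *	int wce;
 *
 *	if (ioctl(fd, DKIOCGETWCE, &wce) == 0 && wce == 0) {
 *		wce = 1;		// enable the write cache
 *		(void) ioctl(fd, DKIOCSETWCE, &wce);
 *	}
 */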

/*
 * Function:
 *	vdc_get_vtoc_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCGVTOC
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 *	In the struct vtoc definition, the timestamp field is marked as not
 *	supported so it is not part of vDisk protocol (FWARC 2006/195).
 *	However SVM uses that field to check that it can write into the VTOC,
 *	so we fake up the info of that field.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- the buffer containing the data to be copied from
 *	to	- the buffer to be copied to
 *	mode	- flags passed to ioctl() call
 *	dir	- the "direction" of the copy - VD_COPYIN or VD_COPYOUT
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- incorrect buffer passed in.
 *	EFAULT	- ddi_copyout routine encountered an error.
 */
static int
vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	int		i;
	void		*tmp_mem = NULL;
	void		*tmp_memp;
	struct vtoc	vt;
	struct vtoc32	vt32;
	int		copy_len = 0;
	int		rv = 0;

	if (dir != VD_COPYOUT)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32)
		copy_len = sizeof (struct vtoc32);
	else
		copy_len = sizeof (struct vtoc);

	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);

	VD_VTOC2VTOC((vd_vtoc_t *)from, &vt);

	/* fake the VTOC timestamp field */
	for (i = 0; i < V_NUMPAR; i++) {
		vt.timestamp[i] = vdc->vtoc->timestamp[i];
	}

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		/* LINTED E_ASSIGN_NARROW_CONV */
		vtoctovtoc32(vt, vt32);
		tmp_memp = &vt32;
	} else {
		tmp_memp = &vt;
	}
	rv = ddi_copyout(tmp_memp, to, copy_len, mode);
	if (rv != 0)
		rv = EFAULT;

	kmem_free(tmp_mem, copy_len);
	return (rv);
}

/*
 * Function:
 *	vdc_set_vtoc_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCSVTOC
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyin of data failed
 */
static int
vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	void		*tmp_mem = NULL, *uvtoc;
	struct vtoc	vt;
	struct vtoc	*vtp = &vt;
	vd_vtoc_t	vtvd;
	int		copy_len = 0;
	int		i, rv = 0;

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (dir == VD_COPYIN)
		uvtoc = from;
	else
		uvtoc = to;

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32)
		copy_len = sizeof (struct vtoc32);
	else
		copy_len = sizeof (struct vtoc);

	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);

	rv = ddi_copyin(uvtoc, tmp_mem, copy_len, mode);
	if (rv != 0) {
		kmem_free(tmp_mem, copy_len);
		return (EFAULT);
	}

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		vtoc32tovtoc((*(struct vtoc32 *)tmp_mem), vt);
	} else {
		vtp = tmp_mem;
	}

	if (dir == VD_COPYOUT) {
		/*
		 * The disk label may have changed. Revalidate the disk
		 * geometry. This will also update the device nodes and
		 * properties.
		 */
		vdc_validate(vdc);

		/*
		 * We also need to keep track of the timestamp fields.
		 */
		for (i = 0; i < V_NUMPAR; i++) {
			vdc->vtoc->timestamp[i] = vtp->timestamp[i];
		}

		kmem_free(tmp_mem, copy_len);
		return (0);
	}

	VTOC2VD_VTOC(vtp, &vtvd);
	bcopy(&vtvd, to, sizeof (vd_vtoc_t));
	kmem_free(tmp_mem, copy_len);

	return (0);
}

/*
 * Function:
 *	vdc_get_geom_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCGGEOM,
 *	DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format
 *	defined in FWARC 2006/195
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyout of data failed
 */
static int
vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	struct dk_geom	geom;
	int	copy_len = sizeof (struct dk_geom);
	int	rv = 0;

	if (dir != VD_COPYOUT)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom);
	rv = ddi_copyout(&geom, to, copy_len, mode);
	if (rv != 0)
		rv = EFAULT;

	return (rv);
}

/*
 * Function:
 *	vdc_set_geom_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCSGEOM
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyin of data failed
 */
static int
vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	vd_geom_t	vdgeom;
	void		*tmp_mem = NULL;
	int		copy_len = sizeof (struct dk_geom);
	int		rv = 0;

	if (dir != VD_COPYIN)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);

	rv = ddi_copyin(from, tmp_mem, copy_len, mode);
	if (rv != 0) {
		kmem_free(tmp_mem, copy_len);
		return (EFAULT);
	}
	DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom);
	bcopy(&vdgeom, to, sizeof (vdgeom));
	kmem_free(tmp_mem, copy_len);

	return (0);
}
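
/*
 * Example (user-level, illustrative only; the device path is hypothetical):
 * the EFI conversion functions below back DKIOCGETEFI/DKIOCSETEFI, where
 * the caller provides the starting LBA and the length of the label data
 * to transfer:
 *
 *	struct dk_efi efi;
 *	char buf[512];
 *	int fd = open("/dev/rdsk/c0d0s0", O_RDONLY);
 *
 *	efi.dki_lba = 1;		// LBA of the GPT header
 *	efi.dki_length = sizeof (buf);
 *	efi.dki_data = (efi_gpt_t *)buf;
 *
 *	if (ioctl(fd, DKIOCGETEFI, &efi) != 0)
 *		perror("DKIOCGETEFI");
 */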

static int
vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	vd_efi_t	*vd_efi;
	dk_efi_t	dk_efi;
	int		rv = 0;
	void		*uaddr;

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (dir == VD_COPYIN) {

		vd_efi = (vd_efi_t *)to;

		rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		vd_efi->lba = dk_efi.dki_lba;
		vd_efi->length = dk_efi.dki_length;
		bzero(vd_efi->data, vd_efi->length);

	} else {

		rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		uaddr = dk_efi.dki_data;

		dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);

		VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi);

		rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length,
		    mode);

		/* free the transfer buffer even if the copyout failed */
		kmem_free(dk_efi.dki_data, dk_efi.dki_length);

		if (rv != 0)
			return (EFAULT);
	}

	return (0);
}

static int
vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	dk_efi_t	dk_efi;
	void		*uaddr;

	if (dir == VD_COPYOUT) {
		/*
		 * The disk label may have changed. Revalidate the disk
		 * geometry. This will also update the device nodes and
		 * properties.
		 */
		vdc_validate(vdc);
		return (0);
	}

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0)
		return (EFAULT);

	uaddr = dk_efi.dki_data;

	dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);

	if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) {
		/* free the transfer buffer before failing the copyin */
		kmem_free(dk_efi.dki_data, dk_efi.dki_length);
		return (EFAULT);
	}

	DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to);

	kmem_free(dk_efi.dki_data, dk_efi.dki_length);

	return (0);
}


/* -------------------------------------------------------------------------- */

/*
 * Function:
 *	vdc_create_fake_geometry()
 *
 * Description:
 *	This routine fakes up the disk info needed for some DKIO ioctls such
 *	as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D)
 *	do]
 *
 *	Note: This function must not be called until the vDisk attributes have
 *	been exchanged as part of the handshake with the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	none.
 */
static void
vdc_create_fake_geometry(vdc_t *vdc)
{
	ASSERT(vdc != NULL);
	ASSERT(vdc->max_xfer_sz != 0);

	/*
	 * DKIOCINFO support
	 */
	if (vdc->cinfo == NULL)
		vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP);

	(void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME);
	(void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME);
	/* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */
	vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz;

	/*
	 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD
	 * operation is supported, otherwise the controller type is DKC_DIRECT.
	 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the
	 * controller type is always DKC_DIRECT in that case.
	 *
	 * If the virtual disk is backed by a physical CD/DVD device or
	 * an ISO image, modify the controller type to indicate this.
	 */
	switch (vdc->vdisk_media) {
	case VD_MEDIA_CD:
	case VD_MEDIA_DVD:
		vdc->cinfo->dki_ctype = DKC_CDROM;
		break;
	case VD_MEDIA_FIXED:
		if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD))
			vdc->cinfo->dki_ctype = DKC_SCSI_CCS;
		else
			vdc->cinfo->dki_ctype = DKC_DIRECT;
		break;
	default:
		/* in the case of v1.0 we default to a fixed disk */
		vdc->cinfo->dki_ctype = DKC_DIRECT;
		break;
	}
	vdc->cinfo->dki_flags = DKI_FMTVOL;
	vdc->cinfo->dki_cnum = 0;
	vdc->cinfo->dki_addr = 0;
	vdc->cinfo->dki_space = 0;
	vdc->cinfo->dki_prio = 0;
	vdc->cinfo->dki_vec = 0;
	vdc->cinfo->dki_unit = vdc->instance;
	vdc->cinfo->dki_slave = 0;
	/*
	 * The partition number will be created on the fly depending on the
	 * actual slice (i.e. minor node) that is used to request the data.
	 */
	vdc->cinfo->dki_partition = 0;

	/*
	 * DKIOCGMEDIAINFO support
	 */
	if (vdc->minfo == NULL)
		vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	if (vio_ver_is_supported(vdc->ver, 1, 1)) {
		vdc->minfo->dki_media_type =
		    VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media);
	} else {
		vdc->minfo->dki_media_type = DK_FIXED_DISK;
	}

	vdc->minfo->dki_capacity = vdc->vdisk_size;
	vdc->minfo->dki_lbsize = vdc->block_size;
}

static ushort_t
vdc_lbl2cksum(struct dk_label *label)
{
	int	count;
	ushort_t sum, *sp;

	count = (sizeof (struct dk_label)) / (sizeof (short)) - 1;
	sp = (ushort_t *)label;
	sum = 0;
	while (count--) {
		sum ^= *sp++;
	}

	return (sum);
}
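
/*
 * The VTOC label checksum is the XOR of all 16-bit words of the label
 * except the last one, which holds the checksum itself. An illustrative
 * use when writing a label (this driver only uses the function to verify
 * labels, see vdc_validate_geometry() below):
 *
 *	struct dk_label label;
 *
 *	// ... fill in the label fields ...
 *	label.dkl_cksum = vdc_lbl2cksum(&label);	// checksum last
 */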

/*
 * Function:
 *	vdc_validate_geometry
 *
 * Description:
 *	This routine discovers the label and geometry of the disk. It stores
 *	the disk label and related information in the vdc structure. If it
 *	fails to validate the geometry or to discover the disk label then
 *	the label is marked as unknown (VD_DISK_LABEL_UNK).
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- success.
 *	EINVAL	- unknown disk label.
 *	ENOTSUP	- geometry not applicable (EFI label).
 *	EIO	- error accessing the disk.
 */
static int
vdc_validate_geometry(vdc_t *vdc)
{
	buf_t	*buf;	/* BREAD requests need to be in a buf_t structure */
	dev_t	dev;
	int	rv, rval;
	struct dk_label label;
	struct dk_geom geom;
	struct vtoc vtoc;
	efi_gpt_t *gpt;
	efi_gpe_t *gpe;
	vd_efi_dev_t edev;

	ASSERT(vdc != NULL);
	ASSERT(vdc->vtoc != NULL && vdc->geom != NULL);
	ASSERT(MUTEX_HELD(&vdc->lock));

	mutex_exit(&vdc->lock);

	dev = makedevice(ddi_driver_major(vdc->dip),
	    VD_MAKE_DEV(vdc->instance, 0));

	rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval);
	if (rv == 0)
		rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc,
		    FKIOCTL, &rval);

	if (rv == ENOTSUP) {
		/*
		 * If the device does not support VTOC then we try
		 * to read an EFI label.
		 *
		 * We need to know the block size and the disk size to
		 * be able to read an EFI label.
		 */
		if (vdc->vdisk_size == 0) {
			if ((rv = vdc_check_capacity(vdc)) != 0) {
				mutex_enter(&vdc->lock);
				vdc_store_label_unk(vdc);
				return (rv);
			}
		}

		VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl);

		rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe);

		if (rv) {
			DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)",
			    vdc->instance, rv);
			mutex_enter(&vdc->lock);
			vdc_store_label_unk(vdc);
			return (EIO);
		}

		mutex_enter(&vdc->lock);
		vdc_store_label_efi(vdc, gpt, gpe);
		vd_efi_free(&edev, gpt, gpe);
		return (ENOTSUP);
	}

	if (rv != 0) {
		DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)",
		    vdc->instance, rv);
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		if (rv != EINVAL)
			rv = EIO;
		return (rv);
	}

	/* check that geometry and vtoc are valid */
	if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 ||
	    vtoc.v_sanity != VTOC_SANE) {
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	/*
	 * We have a disk and a valid VTOC. However this does not mean
	 * that the disk currently has a VTOC label. The returned VTOC may
	 * be a default VTOC to be used for configuring the disk (this is
	 * what is done for disk images). So we read the label from the
	 * beginning of the disk to ensure we really have a VTOC label.
	 *
	 * FUTURE: This could be the default way for reading the VTOC
	 * from the disk as opposed to sending the VD_OP_GET_VTOC
	 * to the server. This will be the default if vdc is implemented
	 * on top of cmlb.
	 */

	/*
	 * A single-slice disk does not support reads using an absolute disk
	 * offset so we just rely on the DKIOCGVTOC ioctl in that case.
	 */
	if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) {
		mutex_enter(&vdc->lock);
		if (vtoc.v_nparts != 1) {
			vdc_store_label_unk(vdc);
			return (EINVAL);
		}
		vdc_store_label_vtoc(vdc, &geom, &vtoc);
		return (0);
	}

	if (vtoc.v_nparts != V_NUMPAR) {
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	/*
	 * Read disk label from start of disk
	 */
	buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
	bioinit(buf);
	buf->b_un.b_addr = (caddr_t)&label;
	buf->b_bcount = DK_LABEL_SIZE;
	buf->b_flags = B_BUSY | B_READ;
	buf->b_dev = cmpdev(dev);
	rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label,
	    DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir);
	if (rv) {
		DMSG(vdc, 1, "[%d] Failed to read disk block 0\n",
		    vdc->instance);
	} else {
		rv = biowait(buf);
		biofini(buf);
	}
	kmem_free(buf, sizeof (buf_t));

	if (rv != 0 || label.dkl_magic != DKL_MAGIC ||
	    label.dkl_cksum != vdc_lbl2cksum(&label)) {
		DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n",
		    vdc->instance);
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	mutex_enter(&vdc->lock);
	vdc_store_label_vtoc(vdc, &geom, &vtoc);
	return (0);
}

/*
 * Function:
 *	vdc_validate
 *
 * Description:
 *	This routine discovers the label of the disk and creates the
 *	appropriate device nodes if the label has changed.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	none.
 */
static void
vdc_validate(vdc_t *vdc)
{
	vd_disk_label_t old_label;
	vd_slice_t old_slice[V_NUMPAR];
	int rv;

	ASSERT(!MUTEX_HELD(&vdc->lock));

	mutex_enter(&vdc->lock);

	/* save the current label and vtoc */
	old_label = vdc->vdisk_label;
	bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR);

	/* check the geometry */
	(void) vdc_validate_geometry(vdc);

	/* if the disk label has changed, update device nodes */
	if (vdc->vdisk_label != old_label) {

		if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
			rv = vdc_create_device_nodes_efi(vdc);
		else
			rv = vdc_create_device_nodes_vtoc(vdc);

		if (rv != 0) {
			DMSG(vdc, 0, "![%d] Failed to update device nodes",
			    vdc->instance);
		}
	}

	/* if the vtoc has changed, update device nodes properties */
	if (bcmp(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR) != 0) {

		if (vdc_create_device_nodes_props(vdc) != 0) {
			DMSG(vdc, 0, "![%d] Failed to update device nodes"
			    " properties", vdc->instance);
		}
	}

	mutex_exit(&vdc->lock);
}

static void
vdc_validate_task(void *arg)
{
	vdc_t	*vdc = (vdc_t *)arg;

	vdc_validate(vdc);

	mutex_enter(&vdc->lock);
	ASSERT(vdc->validate_pending > 0);
	vdc->validate_pending--;
	mutex_exit(&vdc->lock);
}
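
/*
 * Sketch (illustrative, assuming the task queue dispatch pattern used
 * elsewhere in this driver): vdc_validate_task() is meant to be run from
 * a task queue when the label must be revalidated from a context that
 * cannot block on the validation itself, with validate_pending accounting
 * for in-flight requests:
 *
 *	mutex_enter(&vdc->lock);
 *	vdc->validate_pending++;
 *	mutex_exit(&vdc->lock);
 *	(void) taskq_dispatch(system_taskq, vdc_validate_task,
 *	    (void *)vdc, TQ_SLEEP);
 */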

/*
 * Function:
 *	vdc_setup_devid()
 *
 * Description:
 *	This routine discovers the devid of a vDisk. It requests the devid of
 *	the underlying device from the vDisk server, builds an encapsulated
 *	devid based on the retrieved devid and registers that new devid to
 *	the vDisk.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- A devid was successfully registered for the vDisk
 */
static int
vdc_setup_devid(vdc_t *vdc)
{
	int rv;
	vd_devid_t *vd_devid;
	size_t bufsize, bufid_len;

	/*
	 * At first sight, we don't know the size of the devid that the
	 * server will return but this size will be encoded into the
	 * reply. So we do a first request using a default size then we
	 * check if this size was large enough. If not then we do a second
	 * request with the correct size returned by the server. Note that
	 * ldc requires size to be 8-byte aligned.
	 */
	bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN),
	    sizeof (uint64_t));
	vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
	bufid_len = bufsize - sizeof (vd_efi_t) - 1;

	rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
	    bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);

	DMSG(vdc, 2, "sync_op returned %d\n", rv);

	if (rv) {
		kmem_free(vd_devid, bufsize);
		return (rv);
	}

	if (vd_devid->length > bufid_len) {
		/*
		 * The returned devid is larger than the buffer used. Try again
		 * with a buffer with the right size.
		 */
		kmem_free(vd_devid, bufsize);
		bufsize = P2ROUNDUP(VD_DEVID_SIZE(vd_devid->length),
		    sizeof (uint64_t));
		vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
		bufid_len = bufsize - sizeof (vd_efi_t) - 1;

		rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID,
		    (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0,
		    VIO_both_dir, B_TRUE);

		if (rv) {
			kmem_free(vd_devid, bufsize);
			return (rv);
		}
	}

	/*
	 * The virtual disk should have the same device id as the one
	 * associated with the physical disk it is mapped on, otherwise
	 * sharing a disk between an LDom and a non-LDom may not work (for
	 * example for a shared SVM disk set).
	 *
	 * The DDI framework does not allow creating a device id with any
	 * type so we first create a device id of type DEVID_ENCAP and then
	 * we restore the original type of the physical device.
	 */

	DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length);

	/* build an encapsulated devid based on the returned devid */
	if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length,
	    vd_devid->id, &vdc->devid) != DDI_SUCCESS) {
		DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance);
		kmem_free(vd_devid, bufsize);
		return (1);
	}

	DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type);

	ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS);

	kmem_free(vd_devid, bufsize);

	if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) {
		DMSG(vdc, 1, "[%d] Failed to register devid\n", vdc->instance);
		return (1);
	}

	return (0);
}

static void
vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe)
{
	int i, nparts;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vdc->vdisk_label = VD_DISK_LABEL_EFI;
	bzero(vdc->vtoc, sizeof (struct vtoc));
	bzero(vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);

	nparts = gpt->efi_gpt_NumberOfPartitionEntries;

	for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) {

		if (gpe[i].efi_gpe_StartingLBA == 0 ||
		    gpe[i].efi_gpe_EndingLBA == 0) {
			continue;
		}

		vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA;
		vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA -
		    gpe[i].efi_gpe_StartingLBA + 1;
	}

	ASSERT(vdc->vdisk_size != 0);
	vdc->slice[VD_EFI_WD_SLICE].start = 0;
	vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size;
}

static void
vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct vtoc *vtoc)
{
	int i;

	ASSERT(MUTEX_HELD(&vdc->lock));
	ASSERT(vdc->block_size == vtoc->v_sectorsz);

	vdc->vdisk_label = VD_DISK_LABEL_VTOC;
	bcopy(vtoc, vdc->vtoc, sizeof (struct vtoc));
	bcopy(geom, vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);

	for (i = 0; i < vtoc->v_nparts; i++) {
		vdc->slice[i].start = vtoc->v_part[i].p_start;
		vdc->slice[i].nblocks = vtoc->v_part[i].p_size;
	}
}

static void
vdc_store_label_unk(vdc_t *vdc)
{
	ASSERT(MUTEX_HELD(&vdc->lock));

	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	bzero(vdc->vtoc, sizeof (struct vtoc));
	bzero(vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);
}