/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * LDoms virtual disk client (vdc) device driver
 *
 * This driver runs on a guest logical domain and communicates with the virtual
 * disk server (vds) driver running on the service domain which is exporting
 * virtualized "disks" to the guest logical domain.
 *
 * The driver can be divided into four sections:
 *
 * 1) generic device driver housekeeping
 *	_init, _fini, attach, detach, ops structures, etc.
 *
 * 2) communication channel setup
 *	Setup the communications link over the LDC channel that vdc uses to
 *	talk to the vDisk server. Initialise the descriptor ring which
 *	allows the LDC clients to transfer data via memory mappings.
 *
 * 3) Support exported to upper layers (filesystems, etc)
 *	The upper layers call into vdc via strategy(9E) and DKIO(7I)
 *	ioctl calls. vdc copies the data to be written to the descriptor
 *	ring or maps the buffer to store the data read by the vDisk
 *	server into the descriptor ring. It then sends a message to the
 *	vDisk server requesting it to complete the operation.
 *
 * 4) Handling responses from the vDisk server.
 *	The vDisk server will ACK some or all of the messages vdc sends to it
 *	(this is configured during the handshake). Upon receipt of an ACK
 *	vdc will check the descriptor ring and signal the upper layer
 *	code waiting on the I/O.
 */

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kstat.h>
#include <sys/mach_descrip.h>
#include <sys/modctl.h>
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/vtoc.h>
#include <sys/archsystm.h>
#include <sys/sysmacros.h>

#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/scsi/impl/services.h>
#include <sys/scsi/targets/sddef.h>

#include <sys/ldoms.h>
#include <sys/ldc.h>
#include <sys/vio_common.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vdsk_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdc.h>

/*
 * function prototypes
 */

/* standard driver functions */
static int	vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
static int	vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int	vdc_strategy(struct buf *buf);
static int	vdc_print(dev_t dev, char *str);
static int	vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
static int	vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
		    cred_t *credp, int *rvalp);
static int	vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
static int	vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);

static int	vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
		    void *arg, void **resultp);
static int	vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int	vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);

/* setup */
static void	vdc_min(struct buf *bufp);
static int	vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
static int	vdc_do_ldc_init(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_node);
static int	vdc_start_ldc_connection(vdc_t *vdc);
static int	vdc_create_device_nodes(vdc_t *vdc);
static int	vdc_create_device_nodes_efi(vdc_t *vdc);
static int	vdc_create_device_nodes_vtoc(vdc_t *vdc);
static int	vdc_create_device_nodes_props(vdc_t *vdc);
static void	vdc_create_io_kstats(vdc_t *vdc);
static void	vdc_create_err_kstats(vdc_t *vdc);
static void	vdc_set_err_kstats(vdc_t *vdc);
static int	vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
		    mde_cookie_t *vd_nodep, mde_cookie_t *vd_portp);
static int	vdc_get_ldc_id(md_t *, mde_cookie_t, uint64_t *);
static int	vdc_do_ldc_up(vdc_t *vdc);
static void	vdc_terminate_ldc(vdc_t *vdc);
static int	vdc_init_descriptor_ring(vdc_t *vdc);
static void	vdc_destroy_descriptor_ring(vdc_t *vdc);
static int	vdc_setup_devid(vdc_t *vdc);
static void	vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *);
static void	vdc_store_label_vtoc(vdc_t *, struct dk_geom *, struct vtoc *);
static void	vdc_store_label_unk(vdc_t *vdc);
static boolean_t vdc_is_opened(vdc_t *vdc);

/* handshake with vds */
static int	vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
static int	vdc_ver_negotiation(vdc_t *vdcp);
static int	vdc_init_attr_negotiation(vdc_t *vdc);
static int	vdc_attr_negotiation(vdc_t *vdcp);
static int	vdc_init_dring_negotiate(vdc_t *vdc);
static int	vdc_dring_negotiation(vdc_t *vdcp);
static int	vdc_send_rdx(vdc_t *vdcp);
static int	vdc_rdx_exchange(vdc_t *vdcp);
static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg);

/* processing incoming messages from vDisk server */
static void	vdc_process_msg_thread(vdc_t *vdc);
static int	vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp);

static uint_t	vdc_handle_cb(uint64_t event, caddr_t arg);
static int	vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg);
static int	vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg);
static int	vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
static int	vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
static int	vdc_send_request(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
static int	vdc_populate_descriptor(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
		    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
		    void *cb_arg, vio_desc_direction_t dir, boolean_t);

static int	vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
static int	vdc_drain_response(vdc_t *vdcp);
static int	vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
static int	vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
static int	vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);

/* dkio */
static int	vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode,
		    int *rvalp);
static int	vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg);
static void	vdc_create_fake_geometry(vdc_t *vdc);
static int	vdc_validate_geometry(vdc_t *vdc);
static void	vdc_validate(vdc_t *vdc);
static void	vdc_validate_task(void *arg);
static int	vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);

static void	vdc_ownership_update(vdc_t *vdc, int ownership_flags);
static int	vdc_access_set(vdc_t *vdc, uint64_t flags, int mode);
static vdc_io_t	*vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf);
static int	vdc_failfast_check_resv(vdc_t *vdc);

/*
 * Module variables
 */

/*
 * Tunable variables to control how long vdc waits before timing out on
 * various operations
 */
static int	vdc_hshake_retries = 3;

static int	vdc_timeout = 0; /* units: seconds */

static uint64_t vdc_hz_min_ldc_delay;
static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC;
static uint64_t vdc_hz_max_ldc_delay;
static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC;

static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC;
static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC;

/* values for dumping - need to run in a tighter loop */
static uint64_t	vdc_usec_timeout_dump = 100 * MILLISEC;	/* 0.1s, units: usec */
static int	vdc_dump_retries = 100;

static uint16_t	vdc_scsi_timeout = 60;	/* 60s, units: seconds  */

static uint64_t vdc_ownership_delay = 6 * MICROSEC;	/* 6s, units: usec */

/* Count of the number of vdc instances attached */
static volatile uint32_t	vdc_instance_count = 0;

/* Tunable to log all SCSI errors */
static boolean_t vdc_scsi_log_error = B_FALSE;

/* Soft state pointer */
static void	*vdc_state;

/*
 * Controlling the verbosity of the error/debug messages
 *
 * vdc_msglevel - controls the level of messages
 * vdc_matchinst - 64-bit variable where each bit corresponds
 *     to the vdc instance the vdc_msglevel applies to.
 */
int		vdc_msglevel = 0x0;
uint64_t	vdc_matchinst = 0ull;

/*
 * Supported vDisk protocol version pairs.
 *
 * The first array entry is the latest and preferred version.
 */
static const vio_ver_t	vdc_version[] = {{1, 1}};

static struct cb_ops vdc_cb_ops = {
	vdc_open,	/* cb_open */
	vdc_close,	/* cb_close */
	vdc_strategy,	/* cb_strategy */
	vdc_print,	/* cb_print */
	vdc_dump,	/* cb_dump */
	vdc_read,	/* cb_read */
	vdc_write,	/* cb_write */
	vdc_ioctl,	/* cb_ioctl */
	nodev,		/* cb_devmap */
	nodev,		/* cb_mmap */
	nodev,		/* cb_segmap */
	nochpoll,	/* cb_chpoll */
	ddi_prop_op,	/* cb_prop_op */
	NULL,		/* cb_str */
	D_MP | D_64BIT,	/* cb_flag */
	CB_REV,		/* cb_rev */
	vdc_aread,	/* cb_aread */
	vdc_awrite	/* cb_awrite */
};

static struct dev_ops vdc_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	vdc_getinfo,	/* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	vdc_attach,	/* devo_attach */
	vdc_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	&vdc_cb_ops,	/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	nulldev		/* devo_power */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"virtual disk client",
	&vdc_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

/* -------------------------------------------------------------------------- */

/*
 * Device Driver housekeeping and setup
 */

int
_init(void)
{
	int	status;

	if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
		return (status);
	if ((status = mod_install(&modlinkage)) != 0)
		ddi_soft_state_fini(&vdc_state);
	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int	status;

	if ((status = mod_remove(&modlinkage)) != 0)
		return (status);
	ddi_soft_state_fini(&vdc_state);
	return (0);
}
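
/*
 * Function:
 *	vdc_getinfo
 *
 * Description:
 *	getinfo(9E) entry point (summary added for clarity): maps the
 *	dev_t passed in 'arg' to either the devinfo node of the
 *	corresponding vdc instance (DDI_INFO_DEVT2DEVINFO) or to its
 *	instance number (DDI_INFO_DEVT2INSTANCE).
 */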
static int
vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
	_NOTE(ARGUNUSED(dip))

	int	instance = VDCUNIT((dev_t)arg);
	vdc_t	*vdc = NULL;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
			*resultp = NULL;
			return (DDI_FAILURE);
		}
		*resultp = vdc->dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);
	default:
		*resultp = NULL;
		return (DDI_FAILURE);
	}
}

static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	kt_did_t	failfast_tid, ownership_tid;
	int		instance;
	int		rv;
	vdc_t		*vdc = NULL;

	switch (cmd) {
	case DDI_DETACH:
		/* the real work happens below */
		break;
	case DDI_SUSPEND:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	ASSERT(cmd == DDI_DETACH);
	instance = ddi_get_instance(dip);
	DMSGX(1, "[%d] Entered\n", instance);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * This function is called when vdc is detached or when it has failed
	 * to attach. In the latter case, the attach may have failed before
	 * the vdisk type was set, so we can't call vdc_is_opened(). However,
	 * since the attach has failed, we know that the vdisk is not opened
	 * and we can safely detach.
	 */
	if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
		DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
		return (DDI_FAILURE);
	}

	if (vdc->dkio_flush_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
		    instance, vdc->dkio_flush_pending);
		return (DDI_FAILURE);
	}

	if (vdc->validate_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding validate requests\n",
		    instance, vdc->validate_pending);
		return (DDI_FAILURE);
	}

	DMSG(vdc, 0, "[%d] proceeding...\n", instance);

	/* If we took ownership, release ownership */
	mutex_enter(&vdc->ownership_lock);
	if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
	}
	mutex_exit(&vdc->ownership_lock);

	/* mark instance as detaching */
	vdc->lifecycle	= VDC_LC_DETACHING;

	/*
	 * Try to disable callbacks to prevent another handshake.
	 */
	rv = ldc_set_cb_mode(vdc->ldc_handle, LDC_CB_DISABLE);
	DMSG(vdc, 0, "callback disabled (rv=%d)\n", rv);

	if (vdc->initialized & VDC_THREAD) {
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET)) {
			vdc->read_state = VDC_READ_RESET;
			cv_signal(&vdc->read_cv);
		}

		mutex_exit(&vdc->read_lock);

		/* wake up any thread waiting for connection to come online */
		mutex_enter(&vdc->lock);
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			DMSG(vdc, 0,
			    "[%d] write reset - move to resetting state...\n",
			    instance);
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		}
		mutex_exit(&vdc->lock);

		/* now wait until state transitions to VDC_STATE_DETACH */
		thread_join(vdc->msg_proc_thr->t_did);
		ASSERT(vdc->state == VDC_STATE_DETACH);
		DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
		    vdc->instance);
	}

	mutex_enter(&vdc->lock);
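
	/*
	 * Release resources in roughly the reverse order of their setup:
	 * descriptor ring, LDC channel, helper threads, minor nodes,
	 * kstats, and finally the locks and condition variables.
	 */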
	if (vdc->initialized & VDC_DRING)
		vdc_destroy_descriptor_ring(vdc);

	if (vdc->initialized & VDC_LDC)
		vdc_terminate_ldc(vdc);

	if (vdc->failfast_thread) {
		failfast_tid = vdc->failfast_thread->t_did;
		vdc->failfast_interval = 0;
		cv_signal(&vdc->failfast_cv);
	} else {
		failfast_tid = 0;
	}

	if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
		ownership_tid = vdc->ownership_thread->t_did;
		vdc->ownership = VDC_OWNERSHIP_NONE;
		cv_signal(&vdc->ownership_cv);
	} else {
		ownership_tid = 0;
	}

	mutex_exit(&vdc->lock);

	if (failfast_tid != 0)
		thread_join(failfast_tid);

	if (ownership_tid != 0)
		thread_join(ownership_tid);

	if (vdc->initialized & VDC_MINOR) {
		ddi_prop_remove_all(dip);
		ddi_remove_minor_node(dip, NULL);
	}

	if (vdc->io_stats) {
		kstat_delete(vdc->io_stats);
		vdc->io_stats = NULL;
	}

	if (vdc->err_stats) {
		kstat_delete(vdc->err_stats);
		vdc->err_stats = NULL;
	}

	if (vdc->initialized & VDC_LOCKS) {
		mutex_destroy(&vdc->lock);
		mutex_destroy(&vdc->read_lock);
		mutex_destroy(&vdc->ownership_lock);
		cv_destroy(&vdc->initwait_cv);
		cv_destroy(&vdc->dring_free_cv);
		cv_destroy(&vdc->membind_cv);
		cv_destroy(&vdc->sync_pending_cv);
		cv_destroy(&vdc->sync_blocked_cv);
		cv_destroy(&vdc->read_cv);
		cv_destroy(&vdc->running_cv);
		cv_destroy(&vdc->ownership_cv);
		cv_destroy(&vdc->failfast_cv);
		cv_destroy(&vdc->failfast_io_cv);
	}

	if (vdc->minfo)
		kmem_free(vdc->minfo, sizeof (struct dk_minfo));

	if (vdc->cinfo)
		kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));

	if (vdc->vtoc)
		kmem_free(vdc->vtoc, sizeof (struct vtoc));

	if (vdc->geom)
		kmem_free(vdc->geom, sizeof (struct dk_geom));

	if (vdc->devid) {
		ddi_devid_unregister(dip);
		ddi_devid_free(vdc->devid);
	}

	if (vdc->initialized & VDC_SOFT_STATE)
		ddi_soft_state_free(vdc_state, instance);

	DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);

	return (DDI_SUCCESS);
}


static int
vdc_do_attach(dev_info_t *dip)
{
	int		instance;
	vdc_t		*vdc = NULL;
	int		status;
	md_t		*mdp;
	mde_cookie_t	vd_node, vd_port;

	ASSERT(dip != NULL);

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
		    instance);
		return (DDI_FAILURE);
	}

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * We assign (rather than OR bits into) initialized here to zero
	 * out the variable; bits are then set in it to indicate what has
	 * been done so far.
	 */
	vdc->initialized = VDC_SOFT_STATE;

	vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
	vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);

	vdc->dip	= dip;
	vdc->instance	= instance;
	vdc->vdisk_type	= VD_DISK_TYPE_UNK;
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->state	= VDC_STATE_INIT;
	vdc->lifecycle	= VDC_LC_ATTACHING;
	vdc->ldc_state	= 0;
	vdc->session_id = 0;
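	/* default block size and maximum transfer size (in blocks) */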
	vdc->block_size = DEV_BSIZE;
	vdc->max_xfer_sz = maxphys / DEV_BSIZE;

	/*
	 * We assume, for now, that the vDisk server will export 'read'
	 * operations to us at a minimum (this is needed because of checks
	 * in vdc for supported operations early in the handshake process).
	 * The vDisk server will return ENOTSUP if this is not the case.
	 * The value will be overwritten during the attribute exchange with
	 * the bitmask of operations exported by the server.
	 */
	vdc->operations = VD_OP_MASK_READ;

	vdc->vtoc = NULL;
	vdc->geom = NULL;
	vdc->cinfo = NULL;
	vdc->minfo = NULL;

	mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);

	vdc->threads_pending = 0;
	vdc->sync_op_pending = B_FALSE;
	vdc->sync_op_blocked = B_FALSE;
	cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);

	/* init blocking msg read functionality */
	mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
	vdc->read_state = VDC_READ_IDLE;

	vdc->initialized |= VDC_LOCKS;

	/* get device and port MD node for this disk instance */
	if (vdc_get_md_node(dip, &mdp, &vd_node, &vd_port) != 0) {
		cmn_err(CE_NOTE, "[%d] Could not get machine description node",
		    instance);
		return (DDI_FAILURE);
	}

	/* set the connection timeout */
	if (vd_port == NULL || (md_get_prop_val(mdp, vd_port,
	    VDC_MD_TIMEOUT, &vdc->ctimeout) != 0)) {
		vdc->ctimeout = 0;
	}

	/* initialise LDC channel which will be used to communicate with vds */
	status = vdc_do_ldc_init(vdc, mdp, vd_node);

	(void) md_fini_handle(mdp);

	if (status != 0) {
		cmn_err(CE_NOTE, "[%d] Couldn't initialize LDC", instance);
		goto return_status;
	}

	/* initialize the thread responsible for managing state with server */
	vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
	    vdc, 0, &p0, TS_RUN, minclsyspri);
	if (vdc->msg_proc_thr == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
		    instance);
		return (DDI_FAILURE);
	}

	vdc->initialized |= VDC_THREAD;

	/* Create the kstats for saving the I/O statistics used by iostat(1M) */
	vdc_create_io_kstats(vdc);
	vdc_create_err_kstats(vdc);

	atomic_inc_32(&vdc_instance_count);

	/*
	 * Check the disk label. This will send requests and do the handshake.
	 * We don't really care about the disk label now; what we really need
	 * is for the handshake to be done so that we know the type of the
	 * disk (slice or full disk) and the appropriate device nodes can be
	 * created.
	 */
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP);
	vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
	vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	mutex_exit(&vdc->lock);

	/*
	 * Now that we have the device info we can create the
	 * device nodes and properties
	 */
	status = vdc_create_device_nodes(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device nodes",
		    instance);
		goto return_status;
	}
	status = vdc_create_device_nodes_props(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device node"
		    " properties (%d)", instance, status);
		goto return_status;
	}

	/*
	 * Setup devid
	 */
	if (vdc_setup_devid(vdc)) {
		DMSG(vdc, 0, "[%d] No device id available\n", instance);
	}

	/*
	 * Fill in the fields of the error statistics kstat that were not
	 * available when creating the kstat
	 */
	vdc_set_err_kstats(vdc);

	ddi_report_dev(dip);
	vdc->lifecycle	= VDC_LC_ONLINE;
	DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);

return_status:
	DMSG(vdc, 0, "[%d] Attach completed\n", instance);
	return (status);
}

static int
vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	status;

	switch (cmd) {
	case DDI_ATTACH:
		if ((status = vdc_do_attach(dip)) != 0)
			(void) vdc_detach(dip, DDI_DETACH);
		return (status);
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}

static int
vdc_do_ldc_init(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_node)
{
	int		status = 0;
	ldc_status_t	ldc_state;
	ldc_attr_t	ldc_attr;
	uint64_t	ldc_id = 0;

	ASSERT(vdc != NULL);

	vdc->initialized |= VDC_LDC;

	if ((status = vdc_get_ldc_id(mdp, vd_node, &ldc_id)) != 0) {
		DMSG(vdc, 0, "[%d] Failed to get LDC channel ID property",
		    vdc->instance);
		return (EIO);
	}

	DMSGX(0, "[%d] LDC id is 0x%lx\n", vdc->instance, ldc_id);

	vdc->ldc_id = ldc_id;

	ldc_attr.devclass = LDC_DEV_BLK;
	ldc_attr.instance = vdc->instance;
	ldc_attr.mode = LDC_MODE_UNRELIABLE;	/* unreliable transport */
	ldc_attr.mtu = VD_LDC_MTU;

	if ((vdc->initialized & VDC_LDC_INIT) == 0) {
		status = ldc_init(ldc_id, &ldc_attr, &vdc->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
			    vdc->instance, ldc_id, status);
			return (status);
		}
		vdc->initialized |= VDC_LDC_INIT;
	}
	status = ldc_status(vdc->ldc_handle, &ldc_state);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
		    vdc->instance, status);
		return (status);
	}
	vdc->ldc_state = ldc_state;

	if ((vdc->initialized & VDC_LDC_CB) == 0) {
		status = ldc_reg_callback(vdc->ldc_handle, vdc_handle_cb,
		    (caddr_t)vdc);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
			    vdc->instance, status);
			return (status);
		}
		vdc->initialized |= VDC_LDC_CB;
	}

	vdc->initialized |= VDC_LDC;

	/*
	 * At this stage we have initialised LDC; we will now try to open
	 * the connection.
	 */
	if (vdc->ldc_state == LDC_INIT) {
		status = ldc_open(vdc->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
			    vdc->instance, vdc->ldc_id, status);
			return (status);
		}
		vdc->initialized |= VDC_LDC_OPEN;
	}

	return (status);
}

static int
vdc_start_ldc_connection(vdc_t *vdc)
{
	int	status = 0;

	ASSERT(vdc != NULL);

	ASSERT(MUTEX_HELD(&vdc->lock));

	status = vdc_do_ldc_up(vdc);

	DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);

	return (status);
}

static int
vdc_stop_ldc_connection(vdc_t *vdcp)
{
	int	status;

	DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
	    vdcp->state);

	status = ldc_down(vdcp->ldc_handle);
	DMSG(vdcp, 0, "ldc_down() = %d\n", status);

	vdcp->initialized &= ~VDC_HANDSHAKE;
	DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);

	return (status);
}

static void
vdc_create_io_kstats(vdc_t *vdc)
{
	if (vdc->io_stats != NULL) {
		DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance);
		return;
	}

	vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL,
	    "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (vdc->io_stats != NULL) {
		vdc->io_stats->ks_lock = &vdc->lock;
		kstat_install(vdc->io_stats);
	} else {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics"
		    " will not be gathered", vdc->instance);
	}
}

static void
vdc_create_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;
	char	kstatmodule_err[KSTAT_STRLEN];
	char	kstatname[KSTAT_STRLEN];
	int	ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t));
	int	instance = vdc->instance;

	if (vdc->err_stats != NULL) {
		DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance);
		return;
	}

	(void) snprintf(kstatmodule_err, sizeof (kstatmodule_err),
	    "%serr", VDC_DRIVER_NAME);
	(void) snprintf(kstatname, sizeof (kstatname),
	    "%s%d,err", VDC_DRIVER_NAME, instance);

	vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname,
	    "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

	if (vdc->err_stats == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics"
		    " will not be gathered", instance);
		return;
	}

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	kstat_named_init(&stp->vd_softerrs, "Soft Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_transerrs, "Transport Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_protoerrs, "Protocol Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_vid, "Vendor",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_pid, "Product",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_capacity, "Size",
	    KSTAT_DATA_ULONGLONG);

	vdc->err_stats->ks_update = nulldev;

	kstat_install(vdc->err_stats);
}

static void
vdc_set_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;

	if (vdc->err_stats == NULL)
		return;

	mutex_enter(&vdc->lock);

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	ASSERT(stp != NULL);

	stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->block_size;
	(void) strcpy(stp->vd_vid.value.c, "SUN");
	(void) strcpy(stp->vd_pid.value.c, "VDSK");

	mutex_exit(&vdc->lock);
}
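
/*
 * Function:
 *	vdc_create_device_nodes_efi
 *
 * Description:
 *	(Summary added for clarity.) Create the minor nodes used for an
 *	EFI-labeled disk: remove the VTOC-style 'h' and 'h,raw' nodes
 *	and create 'wd' and 'wd,raw' nodes representing the whole disk
 *	for slice VD_EFI_WD_SLICE.
 */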
static int
vdc_create_device_nodes_efi(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "h");
	ddi_remove_minor_node(vdc->dip, "h,raw");

	if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add raw node 'wd,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

static int
vdc_create_device_nodes_vtoc(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "wd");
	ddi_remove_minor_node(vdc->dip, "wd,raw");

	if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add raw node 'h,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices. It is called as part of the attach(9E) of the instance
 *	during the handshake with vds after vds has sent the attributes
 *	to vdc.
 *
 *	If the device is of type VD_DISK_TYPE_SLICE then the minor node
 *	of 2 is used in keeping with the Solaris convention that slice 2
 *	refers to a whole disk. Slices start at 'a'.
 *
 * Parameters:
 *	vdc - soft state pointer
 *
 * Return Values
 *	0	- Success
 *	EIO	- Failed to create node
 *	EINVAL	- Unknown type of disk exported
 */
static int
vdc_create_device_nodes(vdc_t *vdc)
{
	char		name[sizeof ("s,raw")];
	dev_info_t	*dip = NULL;
	int		instance, status;
	int		num_slices = 1;
	int		i;

	ASSERT(vdc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		return (EINVAL);
	}

	/*
	 * Minor nodes are different for EFI disks: EFI disks do not have
	 * a minor node 'h' for the minor number corresponding to slice
	 * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
	 * representing the whole disk.
	 */
	for (i = 0; i < num_slices; i++) {

		if (i == VD_EFI_WD_SLICE) {
			if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
				status = vdc_create_device_nodes_efi(vdc);
			else
				status = vdc_create_device_nodes_vtoc(vdc);
			if (status != 0)
				return (status);
			continue;
		}

		(void) snprintf(name, sizeof (name), "%c", 'a' + i);
		if (ddi_create_minor_node(dip, name, S_IFBLK,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
			    instance, name);
			return (EIO);
		}

		/* if any device node is created we set this flag */
		vdc->initialized |= VDC_MINOR;

		(void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");

		if (ddi_create_minor_node(dip, name, S_IFCHR,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
			    instance, name);
			return (EIO);
		}
	}

	return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes_props
 *
 * Description:
 *	This function creates the size and number-of-blocks properties
 *	(VDC_SIZE_PROP_NAME and VDC_NBLOCKS_PROP_NAME) on the device
 *	nodes created under /devices. It is called as part of the
 *	attach(9E) of the instance during the handshake with vds after
 *	vds has sent the attributes to vdc.
 *
 * Parameters:
 *	vdc - soft state pointer
 *
 * Return Values
 *	0	- Success
 *	EIO	- Failed to create device node property
 *	EINVAL	- Unknown type of disk exported
 */
static int
vdc_create_device_nodes_props(vdc_t *vdc)
{
	dev_info_t	*dip = NULL;
	int		instance;
	int		num_slices = 1;
	int64_t		size = 0;
	dev_t		dev;
	int		rv;
	int		i;

	ASSERT(vdc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		return (EINVAL);
	}

	if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
		/* remove all properties */
		for (i = 0; i < num_slices; i++) {
			dev = makedevice(ddi_driver_major(dip),
			    VD_MAKE_DEV(instance, i));
			(void) ddi_prop_remove(dev, dip, VDC_SIZE_PROP_NAME);
			(void) ddi_prop_remove(dev, dip, VDC_NBLOCKS_PROP_NAME);
		}
		return (0);
	}

	for (i = 0; i < num_slices; i++) {
		dev = makedevice(ddi_driver_major(dip),
		    VD_MAKE_DEV(instance, i));

		size = vdc->slice[i].nblocks * vdc->block_size;
		DMSG(vdc, 0, "[%d] sz %ld (%ld Mb)  p_size %lx\n",
		    instance, size, size / (1024 * 1024),
		    vdc->slice[i].nblocks);

		rv = ddi_prop_update_int64(dev, dip, VDC_SIZE_PROP_NAME, size);
		if (rv != DDI_PROP_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop of [%ld]",
			    instance, VDC_SIZE_PROP_NAME, size);
			return (EIO);
		}

		rv = ddi_prop_update_int64(dev, dip, VDC_NBLOCKS_PROP_NAME,
		    lbtodb(size));
		if (rv != DDI_PROP_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop [%llu]",
			    instance, VDC_NBLOCKS_PROP_NAME, lbtodb(size));
			return (EIO);
		}
	}

	return (0);
}

/*
 * Function:
 *	vdc_is_opened
 *
 * Description:
 *	This function checks if any slice of a given virtual disk is
 *	currently opened.
 *
 * Parameters:
 *	vdc - soft state pointer
 *
 * Return Values
 *	B_TRUE	- at least one slice is opened.
 *	B_FALSE	- no slice is opened.
 */
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
	int i, nslices;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		nslices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		nslices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		ASSERT(0);
	}

	/* check if there's any layered open */
	for (i = 0; i < nslices; i++) {
		if (vdc->open_lyr[i] > 0)
			return (B_TRUE);
	}

	/* check if there is any other kind of open */
	for (i = 0; i < OTYPCNT; i++) {
		if (vdc->open[i] != 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}

static int
vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;
	int i;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	/* check if slice is already exclusively opened */
	if (vdc->open_excl & slicemask)
		return (EBUSY);

	/* if open exclusive, check if slice is already opened */
	if (flag & FEXCL) {
		if (vdc->open_lyr[slice] > 0)
			return (EBUSY);
		for (i = 0; i < OTYPCNT; i++) {
			if (vdc->open[i] & slicemask)
				return (EBUSY);
		}
		vdc->open_excl |= slicemask;
	}

	/* mark slice as opened */
	if (otyp == OTYP_LYR) {
		vdc->open_lyr[slice]++;
	} else {
		vdc->open[otyp] |= slicemask;
	}

	return (0);
}

static void
vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	if (otyp == OTYP_LYR) {
		ASSERT(vdc->open_lyr[slice] > 0);
		vdc->open_lyr[slice]--;
	} else {
		vdc->open[otyp] &= ~slicemask;
	}

	if (flag & FEXCL)
		vdc->open_excl &= ~slicemask;
}
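
/*
 * Function:
 *	vdc_open
 *
 * Description:
 *	(Summary added for clarity.) open(9E) entry point. Marks the
 *	slice as opened and validates the disk label. For non-blocking
 *	opens (FNDELAY/FNONBLOCK) the validation is dispatched
 *	asynchronously to the system task queue; otherwise it is done
 *	before returning, and the open fails with EIO if the label is
 *	unknown or the slice has no blocks.
 */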
static int
vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance, nodelay;
	int	slice, status = 0;
	vdc_t	*vdc;

	ASSERT(dev != NULL);
	instance = VDCUNIT(*dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
	    getminor(*dev), flag, otyp);

	slice = VDCPART(*dev);

	nodelay = flag & (FNDELAY | FNONBLOCK);

	if ((flag & FWRITE) && (!nodelay) &&
	    !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE))) {
		return (EROFS);
	}

	mutex_enter(&vdc->lock);

	status = vdc_mark_opened(vdc, slice, flag, otyp);

	if (status != 0) {
		mutex_exit(&vdc->lock);
		return (status);
	}

	if (nodelay) {

		/* don't resubmit a validate request if there's already one */
		if (vdc->validate_pending > 0) {
			mutex_exit(&vdc->lock);
			return (0);
		}

		/* call vdc_validate() asynchronously to avoid blocking */
		if (taskq_dispatch(system_taskq, vdc_validate_task,
		    (void *)vdc, TQ_NOSLEEP) == NULL) {
			vdc_mark_closed(vdc, slice, flag, otyp);
			mutex_exit(&vdc->lock);
			return (ENXIO);
		}

		vdc->validate_pending++;
		mutex_exit(&vdc->lock);
		return (0);
	}

	mutex_exit(&vdc->lock);

	vdc_validate(vdc);

	mutex_enter(&vdc->lock);

	if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
	    vdc->slice[slice].nblocks == 0) {
		vdc_mark_closed(vdc, slice, flag, otyp);
		status = EIO;
	}

	mutex_exit(&vdc->lock);

	return (status);
}

static int
vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance;
	int	slice;
	int	rv, rval;
	vdc_t	*vdc;

	instance = VDCUNIT(dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);

	slice = VDCPART(dev);

	/*
	 * Attempt to flush the write cache on a close operation. If this
	 * is not a supported IOCTL command or the backing device is
	 * read-only, do not fail the close operation.
	 */
	rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);

	if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
		DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
		    instance, rv);
		return (EIO);
	}

	mutex_enter(&vdc->lock);
	vdc_mark_closed(vdc, slice, flag, otyp);
	mutex_exit(&vdc->lock);

	return (0);
}

static int
vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	_NOTE(ARGUNUSED(credp))

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
}

static int
vdc_print(dev_t dev, char *str)
{
	cmn_err(CE_NOTE, "vdc%d:  %s", VDCUNIT(dev), str);
	return (0);
}

static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	int	rv;
	size_t	nbytes = nblk * DEV_BSIZE;
	int	instance = VDCUNIT(dev);
	vdc_t	*vdc = NULL;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
	    instance, nbytes, blkno, (void *)addr);
	rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
	    VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir);
	if (rv) {
		DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
		return (rv);
	}

	if (ddi_in_panic())
		(void) vdc_drain_response(vdc);

	DMSG(vdc, 0, "[%d] End\n", instance);

	return (0);
}

/* -------------------------------------------------------------------------- */

/*
 * Disk access routines
 */

/*
 * vdc_strategy()
 *
 * Return Value:
 *	0:	As per strategy(9E), the strategy() function must return 0
 *		[ bioerror(9F) sets b_flags to the proper error code ]
 */
static int
vdc_strategy(struct buf *buf)
{
	int		rv = -1;
	vdc_t		*vdc = NULL;
	int		instance = VDCUNIT(buf->b_edev);
	int		op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE;
	int		slice;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		bioerror(buf, ENXIO);
		biodone(buf);
		return (0);
	}

	DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
	    instance, (buf->b_flags & B_READ) ? "Read" : "Write",
	    buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr);

	bp_mapin(buf);

	if ((long)buf->b_private == VD_SLICE_NONE) {
		/* I/O using an absolute disk offset */
		slice = VD_SLICE_NONE;
	} else {
		slice = VDCPART(buf->b_edev);
	}

	rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr,
	    buf->b_bcount, slice, buf->b_lblkno,
	    CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir :
	    VIO_write_dir);

	/*
	 * If the request was successfully sent, the strategy call returns
	 * and the ACK handler calls the bioxxx functions when the vDisk
	 * server is done; otherwise we handle the error here.
	 */
	if (rv) {
		DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv);
		bioerror(buf, rv);
		biodone(buf);
	}

	return (0);
}

/*
 * Function:
 *	vdc_min
 *
 * Description:
 *	Routine to limit the size of a data transfer. Used in
 *	conjunction with physio(9F).
 *
 * Arguments:
 *	bufp - pointer to the indicated buf(9S) struct.
 *
 */
static void
vdc_min(struct buf *bufp)
{
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(bufp->b_edev);

	vdc = ddi_get_soft_state(vdc_state, instance);
	VERIFY(vdc != NULL);

	if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) {
		bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size;
	}
}

static int
vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio));
}

static int
vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio));
}

static int
vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio));
}

static int
vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio));
}


/* -------------------------------------------------------------------------- */

/*
 * Handshake support
 */


/*
 * Function:
 *	vdc_init_ver_negotiation()
 *
 * Description:
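 *	(Summary added for clarity.) Set a new session ID (the lower 32
 *	bits of the current tick counter) and send a VIO_VER_INFO control
 *	message proposing the given protocol version to the vDisk server.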
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	ver	- protocol version to propose to the server.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
{
	vio_ver_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status = -1;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);

	/*
	 * set the Session ID to a unique value
	 * (the lower 32 bits of the clock tick)
	 */
	vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
	DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id);

	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_VER_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	pkt.dev_class = VDEV_DISK;
	pkt.ver_major = ver.major;
	pkt.ver_minor = ver.minor;

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
	    vdc->instance, status);
	if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance, vdc->ldc_handle,
		    status, msglen);
		if (msglen != sizeof (vio_ver_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_ver_negotiation()
 *
 * Description:
 *	Perform the version negotiation step of the handshake: send the
 *	version proposal, wait for the server's response and hand it to
 *	vdc_handle_ver_msg().
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_ver_negotiation(vdc_t *vdcp)
{
	vio_msg_t vio_msg;
	int status;

	if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0]))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Ver negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
}

/*
 * Function:
 *	vdc_init_attr_negotiation()
 *
 * Description:
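 *	(Summary added for clarity.) Send a VIO_ATTR_INFO control message
 *	advertising this client's maximum transfer size, block size and
 *	transfer mode; the server fills in the supported operations,
 *	vdisk type, media type and size in its response.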
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_attr_negotiation(vdc_t *vdc)
{
	vd_attr_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] entered\n", vdc->instance);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.max_xfer_sz = vdc->max_xfer_sz;
	pkt.vdisk_block_size = vdc->block_size;
	pkt.xfer_mode = VIO_DRING_MODE_V1_0;
	pkt.operations = 0;	/* server will set bits of valid operations */
	pkt.vdisk_type = 0;	/* server will set to valid device type */
	pkt.vdisk_media = 0;	/* server will set to valid media type */
	pkt.vdisk_size = 0;	/* server will set to valid size */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);

	/* note: the size check originally used vio_ver_msg_t by mistake */
	if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance, vdc->ldc_handle,
		    status, msglen);
		if (msglen != sizeof (vd_attr_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_attr_negotiation()
 *
 * Description:
 *	Perform the attribute exchange step of the handshake: send the
 *	attribute proposal, wait for the server's response and hand it
 *	to vdc_handle_attr_msg().
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_attr_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_attr_negotiation(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Attr negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_init_dring_negotiate()
 *
 * Description:
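 *	(Summary added for clarity.) Initialise the local descriptor
 *	ring, retrying while the LDC framework returns EAGAIN, and send
 *	a VIO_DRING_REG message to register the ring, its geometry and
 *	its exported cookie with the vDisk server.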
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_dring_negotiate(vdc_t *vdc)
{
	vio_dring_reg_msg_t	pkt;
	size_t			msglen = sizeof (pkt);
	int			status = -1;
	int			retry;
	int			nretries = 10;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	for (retry = 0; retry < nretries; retry++) {
		status = vdc_init_descriptor_ring(vdc);
		if (status != EAGAIN)
			break;
		drv_usecwait(vdc_min_timeout_ldc);
	}

	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
		    vdc->instance, status);
		return (status);
	}

	DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
	    vdc->instance, status);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_DRING_REG;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.dring_ident = 0;
	pkt.num_descriptors = vdc->dring_len;
	pkt.descriptor_size = vdc->dring_entry_size;
	pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
	pkt.ncookies = vdc->dring_cookie_count;
	pkt.cookie[0] = vdc->dring_cookie[0];	/* for now just one cookie */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
		    vdc->instance, status);
	}

	return (status);
}


/*
 * Function:
 *	vdc_dring_negotiation()
 *
 * Description:
 *	Perform the descriptor ring registration step of the handshake:
 *	send the registration request, wait for the server's response
 *	and hand it to vdc_handle_dring_reg_msg().
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_dring_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_dring_negotiate(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Dring negotiation response,"
		    " rv(%d)", vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_dring_reg_msg(vdcp,
	    (vio_dring_reg_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_send_rdx()
 *
 * Description:
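 *	(Summary added for clarity.) Send a VIO_RDX control message to
 *	the vDisk server to indicate that this client is ready to
 *	transfer data.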
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_send_rdx(vdc_t *vdcp)
{
	vio_msg_t	msg;
	size_t		msglen = sizeof (vio_msg_t);
	int		status;

	/*
	 * Send an RDX message to vds to indicate we are ready
	 * to send data
	 */
	msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	msg.tag.vio_subtype_env = VIO_RDX;
	msg.tag.vio_sid = vdcp->session_id;
	status = vdc_send(vdcp, (caddr_t)&msg, &msglen);
	if (status != 0) {
		DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)",
		    vdcp->instance, status);
	}

	return (status);
}

/*
 * Function:
 *	vdc_handle_rdx()
 *
 * Description:
 *	Validate the server's acknowledgement of the RDX message; this
 *	completes the handshake.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *	msgp	- received msg
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp)
{
	_NOTE(ARGUNUSED(vdcp))
	_NOTE(ARGUNUSED(msgp))

	ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL);
	ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK);
	ASSERT(msgp->tag.vio_subtype_env == VIO_RDX);

	DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance);

	return (0);
}

/*
 * Function:
 *	vdc_rdx_exchange()
 *
 * Description:
 *	Perform the final (RDX) step of the handshake: send the RDX
 *	message and wait for the server's ACK.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_rdx_exchange(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_send_rdx(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) {
		DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg));
}


/* -------------------------------------------------------------------------- */

/*
 * LDC helper routines
 */
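
/*
 * Function:
 *	vdc_recv
 *
 * Description:
 *	(Summary added for clarity.) Receive one complete message from
 *	the LDC channel. Blocks until the read state indicates data is
 *	pending, then polls ldc_read() with a geometric backoff until
 *	the full message has arrived. Returns ECONNRESET if the
 *	connection is reset while waiting.
 */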
2036 */ 2037 delay_time = vdc_ldc_read_init_delay; 2038 loop: 2039 len = *nbytesp; 2040 status = ldc_read(vdc->ldc_handle, (caddr_t)msgp, &len); 2041 switch (status) { 2042 case EAGAIN: 2043 delay_time *= 2; 2044 if (delay_time >= vdc_ldc_read_max_delay) 2045 delay_time = vdc_ldc_read_max_delay; 2046 delay(delay_time); 2047 goto loop; 2048 2049 case 0: 2050 if (len == 0) { 2051 DMSG(vdc, 1, "[%d] ldc_read returned 0 bytes with " 2052 "no error!\n", vdc->instance); 2053 goto loop; 2054 } 2055 2056 *nbytesp = len; 2057 2058 /* 2059 * If there are pending messages, leave the 2060 * read state as pending. Otherwise, set the state 2061 * back to idle. 2062 */ 2063 status = ldc_chkq(vdc->ldc_handle, &q_has_pkts); 2064 if (status == 0 && !q_has_pkts) 2065 vdc->read_state = VDC_READ_IDLE; 2066 2067 break; 2068 default: 2069 DMSG(vdc, 0, "ldc_read returned %d\n", status); 2070 break; 2071 } 2072 2073 done: 2074 mutex_exit(&vdc->read_lock); 2075 2076 return (status); 2077 } 2078 2079 2080 2081 #ifdef DEBUG 2082 void 2083 vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg) 2084 { 2085 char *ms, *ss, *ses; 2086 switch (msg->tag.vio_msgtype) { 2087 #define Q(_s) case _s : ms = #_s; break; 2088 Q(VIO_TYPE_CTRL) 2089 Q(VIO_TYPE_DATA) 2090 Q(VIO_TYPE_ERR) 2091 #undef Q 2092 default: ms = "unknown"; break; 2093 } 2094 2095 switch (msg->tag.vio_subtype) { 2096 #define Q(_s) case _s : ss = #_s; break; 2097 Q(VIO_SUBTYPE_INFO) 2098 Q(VIO_SUBTYPE_ACK) 2099 Q(VIO_SUBTYPE_NACK) 2100 #undef Q 2101 default: ss = "unknown"; break; 2102 } 2103 2104 switch (msg->tag.vio_subtype_env) { 2105 #define Q(_s) case _s : ses = #_s; break; 2106 Q(VIO_VER_INFO) 2107 Q(VIO_ATTR_INFO) 2108 Q(VIO_DRING_REG) 2109 Q(VIO_DRING_UNREG) 2110 Q(VIO_RDX) 2111 Q(VIO_PKT_DATA) 2112 Q(VIO_DESC_DATA) 2113 Q(VIO_DRING_DATA) 2114 #undef Q 2115 default: ses = "unknown"; break; 2116 } 2117 2118 DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n", 2119 msg->tag.vio_msgtype, msg->tag.vio_subtype, 2120 msg->tag.vio_subtype_env, ms, ss, ses); 2121 } 2122 #endif 2123 2124 /* 2125 * Function: 2126 * vdc_send() 2127 * 2128 * Description: 2129 * The function encapsulates the call to write a message using LDC. 2130 * If LDC indicates that the call failed due to the queue being full, 2131 * we retry the ldc_write(), otherwise we return the error returned by LDC. 2132 * 2133 * Arguments: 2134 * vdc - soft state pointer for this instance of the device driver 2135 * pkt - address of LDC message to be sent 2136 * msglen - the size of the message being sent. When the function 2137 * returns, this contains the number of bytes written. 2138 * 2139 * Return Code: 2140 * 0 - Success. 2141 * ECONNRESET - The channel reported a serious error (EIO or 2142 * ECONNRESET from ldc_write) and vdc has initiated a reset. 2143 * xxx - other error codes returned by ldc_write; EWOULDBLOCK 2144 * (queue full) is retried internally and is never returned 2145 */ 2146 static int 2147 vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen) 2148 { 2149 size_t size = 0; 2150 int status = 0; 2151 clock_t delay_ticks; 2152 2153 ASSERT(vdc != NULL); 2154 ASSERT(mutex_owned(&vdc->lock)); 2155 ASSERT(msglen != NULL); 2156 ASSERT(*msglen != 0); 2157 2158 #ifdef DEBUG 2159 vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt); 2160 #endif 2161 /* 2162 * Wait indefinitely to send if channel 2163 * is busy, but bail out if we succeed or 2164 * if the channel closes or is reset.
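* As an illustrative sketch only (the real bounds are the tunables vdc_hz_min_ldc_delay and vdc_hz_max_ldc_delay): with a 1-tick minimum and a 16-tick cap, successive waits while ldc_write() keeps returning EWOULDBLOCK would be 1, 2, 4, 8, 16, 16, ... ticks.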
2165 */ 2166 delay_ticks = vdc_hz_min_ldc_delay; 2167 do { 2168 size = *msglen; 2169 status = ldc_write(vdc->ldc_handle, pkt, &size); 2170 if (status == EWOULDBLOCK) { 2171 delay(delay_ticks); 2172 /* geometric backoff */ 2173 delay_ticks *= 2; 2174 if (delay_ticks > vdc_hz_max_ldc_delay) 2175 delay_ticks = vdc_hz_max_ldc_delay; 2176 } 2177 } while (status == EWOULDBLOCK); 2178 2179 /* if LDC had serious issues --- reset vdc state */ 2180 if (status == EIO || status == ECONNRESET) { 2181 /* wake up any blocked reader and mark the channel reset */ 2182 mutex_enter(&vdc->read_lock); 2183 if ((vdc->read_state == VDC_READ_WAITING) || 2184 (vdc->read_state == VDC_READ_RESET)) 2185 cv_signal(&vdc->read_cv); 2186 vdc->read_state = VDC_READ_RESET; 2187 mutex_exit(&vdc->read_lock); 2188 2189 /* wake up any waiters in the reset thread */ 2190 if (vdc->state == VDC_STATE_INIT_WAITING) { 2191 DMSG(vdc, 0, "[%d] write reset - " 2192 "vdc is resetting ..\n", vdc->instance); 2193 vdc->state = VDC_STATE_RESETTING; 2194 cv_signal(&vdc->initwait_cv); 2195 } 2196 2197 return (ECONNRESET); 2198 } 2199 2200 /* return the last size written */ 2201 *msglen = size; 2202 2203 return (status); 2204 } 2205 2206 /* 2207 * Function: 2208 * vdc_get_md_node 2209 * 2210 * Description: 2211 * Get the MD, the device node and the port node for the given 2212 * disk instance. The caller is responsible for cleaning up the 2213 * reference to the returned MD (mdpp) by calling md_fini_handle(). 2214 * 2215 * Arguments: 2216 * dip - dev info pointer for this instance of the device driver. 2217 * mdpp - the returned MD. 2218 * vd_nodep - the returned device node. 2219 * vd_portp - the returned port node. The returned port node is NULL 2220 * if no port node is found. 2221 * 2222 * Return Code: 2223 * 0 - Success. 2224 * ENOENT - Expected node or property did not exist. 2225 * ENXIO - Unexpected error communicating with MD framework 2226 */ 2227 static int 2228 vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep, 2229 mde_cookie_t *vd_portp) 2230 { 2231 int status = ENOENT; 2232 char *node_name = NULL; 2233 md_t *mdp = NULL; 2234 int num_nodes; 2235 int num_vdevs; 2236 int num_vports; 2237 mde_cookie_t rootnode; 2238 mde_cookie_t *listp = NULL; 2239 boolean_t found_inst = B_FALSE; 2240 int listsz; 2241 int idx; 2242 uint64_t md_inst; 2243 int obp_inst; 2244 int instance = ddi_get_instance(dip); 2245 2246 /* 2247 * Get the OBP instance number for comparison with the MD instance 2248 * 2249 * The "cfg-handle" property of a vdc node in an MD contains the MD's 2250 * notion of "instance", or unique identifier, for that node; OBP 2251 * stores the value of the "cfg-handle" MD property as the value of 2252 * the "reg" property on the node in the device tree it builds from 2253 * the MD and passes to Solaris. Thus, we look up the devinfo node's 2254 * "reg" property value to uniquely identify this device instance. 2255 * If the "reg" property cannot be found, the device tree state is 2256 * presumably so broken that there is no point in continuing. 2257 */ 2258 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) { 2259 cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG); 2260 return (ENOENT); 2261 } 2262 obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2263 OBP_REG, -1); 2264 DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst); 2265 2266 /* 2267 * We now walk the MD nodes to find the node for this vdisk.
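* For illustration (hypothetical values): if this devinfo node has reg = 0x1, the loop below looks for the VDC_MD_VDEV_NAME node named VDC_MD_DISK_NAME whose VDC_MD_CFG_HDL property is also 0x1; that node is then scanned for its port and channel children.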
2268 */ 2269 if ((mdp = md_get_handle()) == NULL) { 2270 cmn_err(CE_WARN, "unable to init machine description"); 2271 return (ENXIO); 2272 } 2273 2274 num_nodes = md_node_count(mdp); 2275 ASSERT(num_nodes > 0); 2276 2277 listsz = num_nodes * sizeof (mde_cookie_t); 2278 2279 /* allocate memory for nodes */ 2280 listp = kmem_zalloc(listsz, KM_SLEEP); 2281 2282 rootnode = md_root_node(mdp); 2283 ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); 2284 2285 /* 2286 * Search for all the virtual devices, we will then check to see which 2287 * ones are disk nodes. 2288 */ 2289 num_vdevs = md_scan_dag(mdp, rootnode, 2290 md_find_name(mdp, VDC_MD_VDEV_NAME), 2291 md_find_name(mdp, "fwd"), listp); 2292 2293 if (num_vdevs <= 0) { 2294 cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); 2295 status = ENOENT; 2296 goto done; 2297 } 2298 2299 DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs); 2300 for (idx = 0; idx < num_vdevs; idx++) { 2301 status = md_get_prop_str(mdp, listp[idx], "name", &node_name); 2302 if ((status != 0) || (node_name == NULL)) { 2303 cmn_err(CE_NOTE, "Unable to get name of node type '%s'" 2304 ": err %d", VDC_MD_VDEV_NAME, status); 2305 continue; 2306 } 2307 2308 DMSGX(1, "[%d] Found node '%s'\n", instance, node_name); 2309 if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { 2310 status = md_get_prop_val(mdp, listp[idx], 2311 VDC_MD_CFG_HDL, &md_inst); 2312 DMSGX(1, "[%d] vdc inst in MD=%lx\n", 2313 instance, md_inst); 2314 if ((status == 0) && (md_inst == obp_inst)) { 2315 found_inst = B_TRUE; 2316 break; 2317 } 2318 } 2319 } 2320 2321 if (!found_inst) { 2322 DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME); 2323 status = ENOENT; 2324 goto done; 2325 } 2326 DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst); 2327 2328 *vd_nodep = listp[idx]; 2329 *mdpp = mdp; 2330 2331 num_vports = md_scan_dag(mdp, *vd_nodep, 2332 md_find_name(mdp, VDC_MD_PORT_NAME), 2333 md_find_name(mdp, "fwd"), listp); 2334 2335 if (num_vports != 1) { 2336 DMSGX(0, "Expected 1 '%s' node for '%s' port, found %d\n", 2337 VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME, num_vports); 2338 } 2339 2340 *vd_portp = (num_vports == 0)? NULL: listp[0]; 2341 2342 done: 2343 kmem_free(listp, listsz); 2344 return (status); 2345 } 2346 2347 /* 2348 * Function: 2349 * vdc_get_ldc_id() 2350 * 2351 * Description: 2352 * This function gets the 'ldc-id' for this particular instance of vdc. 2353 * The id returned is the guest domain channel endpoint LDC uses for 2354 * communication with vds. 2355 * 2356 * Arguments: 2357 * mdp - pointer to the machine description. 2358 * vd_node - the vdisk element from the MD. 2359 * ldc_id - pointer to variable used to return the 'ldc-id' found. 2360 * 2361 * Return Code: 2362 * 0 - Success. 2363 * ENOENT - Expected node or property did not exist. 
2364 */ 2365 static int 2366 vdc_get_ldc_id(md_t *mdp, mde_cookie_t vd_node, uint64_t *ldc_id) 2367 { 2368 mde_cookie_t *chanp = NULL; 2369 int listsz; 2370 int num_chans; 2371 int num_nodes; 2372 int status = 0; 2373 2374 num_nodes = md_node_count(mdp); 2375 ASSERT(num_nodes > 0); 2376 2377 listsz = num_nodes * sizeof (mde_cookie_t); 2378 2379 /* allocate memory for nodes */ 2380 chanp = kmem_zalloc(listsz, KM_SLEEP); 2381 2382 /* get the channels for this node */ 2383 num_chans = md_scan_dag(mdp, vd_node, 2384 md_find_name(mdp, VDC_MD_CHAN_NAME), 2385 md_find_name(mdp, "fwd"), chanp); 2386 2387 /* expecting at least one channel */ 2388 if (num_chans <= 0) { 2389 cmn_err(CE_NOTE, "No '%s' node for '%s' port", 2390 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); 2391 status = ENOENT; 2392 goto done; 2393 2394 } else if (num_chans != 1) { 2395 DMSGX(0, "Expected 1 '%s' node for '%s' port, found %d\n", 2396 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, num_chans); 2397 } 2398 2399 /* 2400 * We use the first channel found (index 0), irrespective of how 2401 * many are there in total. 2402 */ 2403 if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, ldc_id) != 0) { 2404 cmn_err(CE_NOTE, "Channel '%s' property not found", VDC_MD_ID); 2405 status = ENOENT; 2406 } 2407 2408 done: 2409 kmem_free(chanp, listsz); 2410 return (status); 2411 } 2412 2413 static int 2414 vdc_do_ldc_up(vdc_t *vdc) 2415 { 2416 int status; 2417 ldc_status_t ldc_state; 2418 2419 DMSG(vdc, 0, "[%d] Bringing up channel %lx\n", 2420 vdc->instance, vdc->ldc_id); 2421 2422 if (vdc->lifecycle == VDC_LC_DETACHING) 2423 return (EINVAL); 2424 2425 if ((status = ldc_up(vdc->ldc_handle)) != 0) { 2426 switch (status) { 2427 case ECONNREFUSED: /* listener not ready at other end */ 2428 DMSG(vdc, 0, "[%d] ldc_up(%lx,...) returned %d\n", 2429 vdc->instance, vdc->ldc_id, status); 2430 status = 0; 2431 break; 2432 default: 2433 DMSG(vdc, 0, "[%d] Failed to bring up LDC: " 2434 "channel=%ld, err=%d", vdc->instance, vdc->ldc_id, 2435 status); 2436 break; 2437 } 2438 } 2439 2440 if (ldc_status(vdc->ldc_handle, &ldc_state) == 0) { 2441 vdc->ldc_state = ldc_state; 2442 if (ldc_state == LDC_UP) { 2443 DMSG(vdc, 0, "[%d] LDC channel already up\n", 2444 vdc->instance); 2445 vdc->seq_num = 1; 2446 vdc->seq_num_reply = 0; 2447 } 2448 } 2449 2450 return (status); 2451 } 2452 2453 /* 2454 * Function: 2455 * vdc_terminate_ldc() 2456 * 2457 * Description: 2458 * Close the LDC channel, unregister the event callback and free the LDC state held by this instance, guided by the stages recorded in vdc->initialized. 2459 * Arguments: 2460 * vdc - soft state pointer for this instance of the device driver.
2461 * 2462 * Return Code: 2463 * None 2464 */ 2465 static void 2466 vdc_terminate_ldc(vdc_t *vdc) 2467 { 2468 int instance = ddi_get_instance(vdc->dip); 2469 2470 ASSERT(vdc != NULL); 2471 ASSERT(mutex_owned(&vdc->lock)); 2472 2473 DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized); 2474 2475 if (vdc->initialized & VDC_LDC_OPEN) { 2476 DMSG(vdc, 0, "[%d] ldc_close()\n", instance); 2477 (void) ldc_close(vdc->ldc_handle); 2478 } 2479 if (vdc->initialized & VDC_LDC_CB) { 2480 DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); 2481 (void) ldc_unreg_callback(vdc->ldc_handle); 2482 } 2483 if (vdc->initialized & VDC_LDC) { 2484 DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); 2485 (void) ldc_fini(vdc->ldc_handle); 2486 vdc->ldc_handle = NULL; 2487 } 2488 2489 vdc->initialized &= ~(VDC_LDC | VDC_LDC_CB | VDC_LDC_OPEN); 2490 } 2491 2492 /* -------------------------------------------------------------------------- */ 2493 2494 /* 2495 * Descriptor Ring helper routines 2496 */ 2497 2498 /* 2499 * Function: 2500 * vdc_init_descriptor_ring() 2501 * 2502 * Description: 2503 * Create and bind the descriptor ring (if not already done), allocate the local shadow of the ring and mark every entry free, allocating an LDC memory handle for each entry. 2504 * Arguments: 2505 * vdc - soft state pointer for this instance of the device driver. 2506 * 2507 * Return Code: 2508 * 0 - Success 2509 */ 2510 static int 2511 vdc_init_descriptor_ring(vdc_t *vdc) 2512 { 2513 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2514 int status = 0; 2515 int i; 2516 2517 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2518 2519 ASSERT(vdc != NULL); 2520 ASSERT(mutex_owned(&vdc->lock)); 2521 ASSERT(vdc->ldc_handle != NULL); 2522 2523 /* ensure we have enough room to store max sized block */ 2524 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2525 2526 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2527 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2528 /* 2529 * Calculate the maximum block size we can transmit using one 2530 * Descriptor Ring entry from the attributes returned by the 2531 * vDisk server. This is subject to a minimum of 'maxphys' 2532 * as we do not have the capability to split requests over 2533 * multiple DRing entries.
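* For example (illustrative numbers only): a server advertising max_xfer_sz = 128 blocks of block_size = 512 bytes allows 64K per request; with maxphys = 1M and PAGESIZE = 8K that is below maxphys, so we size for maxphys itself and dring_max_cookies = 1M / 8K = 128.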
2534 */ 2535 if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) { 2536 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2537 vdc->instance); 2538 vdc->dring_max_cookies = maxphys / PAGESIZE; 2539 } else { 2540 vdc->dring_max_cookies = 2541 (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE; 2542 } 2543 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2544 (sizeof (ldc_mem_cookie_t) * 2545 (vdc->dring_max_cookies - 1))); 2546 vdc->dring_len = VD_DRING_LEN; 2547 2548 status = ldc_mem_dring_create(vdc->dring_len, 2549 vdc->dring_entry_size, &vdc->ldc_dring_hdl); 2550 if ((vdc->ldc_dring_hdl == NULL) || (status != 0)) { 2551 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2552 vdc->instance); 2553 return (status); 2554 } 2555 vdc->initialized |= VDC_DRING_INIT; 2556 } 2557 2558 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2559 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2560 vdc->dring_cookie = 2561 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2562 2563 status = ldc_mem_dring_bind(vdc->ldc_handle, vdc->ldc_dring_hdl, 2564 LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2565 &vdc->dring_cookie[0], 2566 &vdc->dring_cookie_count); 2567 if (status != 0) { 2568 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2569 "(%lx) to channel (%lx) status=%d\n", 2570 vdc->instance, vdc->ldc_dring_hdl, 2571 vdc->ldc_handle, status); 2572 return (status); 2573 } 2574 ASSERT(vdc->dring_cookie_count == 1); 2575 vdc->initialized |= VDC_DRING_BOUND; 2576 } 2577 2578 status = ldc_mem_dring_info(vdc->ldc_dring_hdl, &vdc->dring_mem_info); 2579 if (status != 0) { 2580 DMSG(vdc, 0, 2581 "[%d] Failed to get info for descriptor ring (%lx)\n", 2582 vdc->instance, vdc->ldc_dring_hdl); 2583 return (status); 2584 } 2585 2586 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2587 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2588 2589 /* Allocate the local copy of this dring */ 2590 vdc->local_dring = 2591 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 2592 KM_SLEEP); 2593 vdc->initialized |= VDC_DRING_LOCAL; 2594 } 2595 2596 /* 2597 * Mark all DRing entries as free and initialize the private 2598 * descriptor's memory handles. If any entry is initialized, 2599 * we need to free it later so we set the bit in 'initialized' 2600 * at the start. 2601 */ 2602 vdc->initialized |= VDC_DRING_ENTRY; 2603 for (i = 0; i < vdc->dring_len; i++) { 2604 dep = VDC_GET_DRING_ENTRY_PTR(vdc, i); 2605 dep->hdr.dstate = VIO_DESC_FREE; 2606 2607 status = ldc_mem_alloc_handle(vdc->ldc_handle, 2608 &vdc->local_dring[i].desc_mhdl); 2609 if (status != 0) { 2610 DMSG(vdc, 0, "![%d] Failed to alloc mem handle for" 2611 " descriptor %d", vdc->instance, i); 2612 return (status); 2613 } 2614 vdc->local_dring[i].is_free = B_TRUE; 2615 vdc->local_dring[i].dep = dep; 2616 } 2617 2618 /* Initialize the starting index */ 2619 vdc->dring_curr_idx = 0; 2620 2621 return (status); 2622 } 2623 2624 /* 2625 * Function: 2626 * vdc_destroy_descriptor_ring() 2627 * 2628 * Description: 2629 * Undo vdc_init_descriptor_ring(): free the per-entry memory handles, free the local shadow ring, then unbind and destroy the ring itself, guided by the flags in vdc->initialized. 2630 * Arguments: 2631 * vdc - soft state pointer for this instance of the device driver.
2632 * 2633 * Return Code: 2634 * None 2635 */ 2636 static void 2637 vdc_destroy_descriptor_ring(vdc_t *vdc) 2638 { 2639 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 2640 ldc_mem_handle_t mhdl = NULL; 2641 ldc_mem_info_t minfo; 2642 int status = -1; 2643 int i; /* loop */ 2644 2645 ASSERT(vdc != NULL); 2646 ASSERT(mutex_owned(&vdc->lock)); 2647 2648 DMSG(vdc, 0, "[%d] Entered\n", vdc->instance); 2649 2650 if (vdc->initialized & VDC_DRING_ENTRY) { 2651 DMSG(vdc, 0, 2652 "[%d] Removing Local DRing entries\n", vdc->instance); 2653 for (i = 0; i < vdc->dring_len; i++) { 2654 ldep = &vdc->local_dring[i]; 2655 mhdl = ldep->desc_mhdl; 2656 2657 if (mhdl == NULL) 2658 continue; 2659 2660 if ((status = ldc_mem_info(mhdl, &minfo)) != 0) { 2661 DMSG(vdc, 0, 2662 "ldc_mem_info returned an error: %d\n", 2663 status); 2664 2665 /* 2666 * This must mean that the mem handle 2667 * is not valid. Clear it out so that 2668 * no one tries to use it. 2669 */ 2670 ldep->desc_mhdl = NULL; 2671 continue; 2672 } 2673 2674 if (minfo.status == LDC_BOUND) { 2675 (void) ldc_mem_unbind_handle(mhdl); 2676 } 2677 2678 (void) ldc_mem_free_handle(mhdl); 2679 2680 ldep->desc_mhdl = NULL; 2681 } 2682 vdc->initialized &= ~VDC_DRING_ENTRY; 2683 } 2684 2685 if (vdc->initialized & VDC_DRING_LOCAL) { 2686 DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance); 2687 kmem_free(vdc->local_dring, 2688 vdc->dring_len * sizeof (vdc_local_desc_t)); 2689 vdc->initialized &= ~VDC_DRING_LOCAL; 2690 } 2691 2692 if (vdc->initialized & VDC_DRING_BOUND) { 2693 DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance); 2694 status = ldc_mem_dring_unbind(vdc->ldc_dring_hdl); 2695 if (status == 0) { 2696 vdc->initialized &= ~VDC_DRING_BOUND; 2697 } else { 2698 DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx", 2699 vdc->instance, status, vdc->ldc_dring_hdl); 2700 } 2701 kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t)); 2702 } 2703 2704 if (vdc->initialized & VDC_DRING_INIT) { 2705 DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance); 2706 status = ldc_mem_dring_destroy(vdc->ldc_dring_hdl); 2707 if (status == 0) { 2708 vdc->ldc_dring_hdl = NULL; 2709 bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t)); 2710 vdc->initialized &= ~VDC_DRING_INIT; 2711 } else { 2712 DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)", 2713 vdc->instance, status, vdc->ldc_dring_hdl); 2714 } 2715 } 2716 } 2717 2718 /* 2719 * Function: 2720 * vdc_map_to_shared_dring() 2721 * 2722 * Description: 2723 * Copy contents of the local descriptor to the shared 2724 * memory descriptor. 2725 * 2726 * Arguments: 2727 * vdcp - soft state pointer for this instance of the device driver.
2728 * idx - descriptor ring index 2729 * 2730 * Return Code: 2731 * 0 - Success; otherwise the error returned by vdc_populate_mem_hdl(). 2732 */ 2733 static int 2734 vdc_map_to_shared_dring(vdc_t *vdcp, int idx) 2735 { 2736 vdc_local_desc_t *ldep; 2737 vd_dring_entry_t *dep; 2738 int rv; 2739 2740 ldep = &(vdcp->local_dring[idx]); 2741 2742 /* for now leave in the old pop_mem_hdl stuff */ 2743 if (ldep->nbytes > 0) { 2744 rv = vdc_populate_mem_hdl(vdcp, ldep); 2745 if (rv) { 2746 DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n", 2747 vdcp->instance); 2748 return (rv); 2749 } 2750 } 2751 2752 /* 2753 * fill in the data details into the DRing 2754 */ 2755 dep = ldep->dep; 2756 ASSERT(dep != NULL); 2757 2758 dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp); 2759 dep->payload.operation = ldep->operation; 2760 dep->payload.addr = ldep->offset; 2761 dep->payload.nbytes = ldep->nbytes; 2762 dep->payload.status = (uint32_t)-1; /* vds will set valid value */ 2763 dep->payload.slice = ldep->slice; 2764 dep->hdr.dstate = VIO_DESC_READY; 2765 dep->hdr.ack = 1; /* request an ACK for every message */ 2766 2767 return (0); 2768 } 2769 2770 /* 2771 * Function: 2772 * vdc_send_request 2773 * 2774 * Description: 2775 * This routine writes the data to be transmitted to vds into the 2776 * descriptor, notifies vds that the ring has been updated and 2777 * then waits for the request to be processed. 2778 * 2779 * Arguments: 2780 * vdcp - the soft state pointer 2781 * operation - operation we want vds to perform (VD_OP_XXX) 2782 * addr - address of data buf to be read/written. 2783 * nbytes - number of bytes to read/write 2784 * slice - the disk slice this request is for 2785 * offset - relative disk offset 2786 * cb_type - type of call - STRATEGY or SYNC 2787 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2788 * . mode for ioctl(9e) 2789 * . LP64 diskaddr_t (block I/O) 2790 * dir - direction of operation (READ/WRITE/BOTH) 2791 * 2792 * Return Codes: 2793 * 0 2794 * ENXIO 2795 */ 2796 static int 2797 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr, 2798 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2799 void *cb_arg, vio_desc_direction_t dir) 2800 { 2801 int rv = 0; 2802 2803 ASSERT(vdcp != NULL); 2804 ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR); 2805 2806 mutex_enter(&vdcp->lock); 2807 2808 /* 2809 * If this is a block read/write operation we update the I/O statistics 2810 * to indicate that the request is being put on the waitq to be 2811 * serviced. 2812 * 2813 * We do it here (a common routine for both synchronous and strategy 2814 * calls) for performance reasons - we are already holding vdc->lock 2815 * so there is no extra locking overhead. We would have to explicitly 2816 * grab the 'lock' mutex to update the stats if we were to do this 2817 * higher up the stack in vdc_strategy() et al. 2818 */ 2819 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2820 DTRACE_IO1(start, buf_t *, cb_arg); 2821 VD_KSTAT_WAITQ_ENTER(vdcp->io_stats); 2822 } 2823 2824 do { 2825 while (vdcp->state != VDC_STATE_RUNNING) { 2826 2827 /* return error if detaching */ 2828 if (vdcp->state == VDC_STATE_DETACH) { 2829 rv = ENXIO; 2830 goto done; 2831 } 2832 2833 /* fail request if connection timeout is reached */ 2834 if (vdcp->ctimeout_reached) { 2835 rv = EIO; 2836 goto done; 2837 } 2838 2839 /* 2840 * If we are panicking and the disk is not ready then 2841 * we can't send any request because we can't complete 2842 * the handshake now.
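* (Once the handshake has completed, I/O issued while panicking does go out; its completions are then reaped by polling in vdc_drain_response(), since interrupts are disabled at that point.)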
2843 */ 2844 if (ddi_in_panic()) { 2845 rv = EIO; 2846 goto done; 2847 } 2848 2849 cv_wait(&vdcp->running_cv, &vdcp->lock); 2850 } 2851 2852 } while (vdc_populate_descriptor(vdcp, operation, addr, 2853 nbytes, slice, offset, cb_type, cb_arg, dir)); 2854 2855 done: 2856 /* 2857 * If this is a block read/write we update the I/O statistics kstat 2858 * to indicate that this request has been placed on the queue for 2859 * processing (i.e. sent to the vDisk server) - iostat(1M) will 2860 * report the time waiting for the vDisk server under the %b column. 2861 * In the case of an error we simply take it off the wait queue. 2862 */ 2863 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2864 if (rv == 0) { 2865 VD_KSTAT_WAITQ_TO_RUNQ(vdcp->io_stats); 2866 DTRACE_PROBE1(send, buf_t *, cb_arg); 2867 } else { 2868 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 2869 VD_KSTAT_WAITQ_EXIT(vdcp->io_stats); 2870 DTRACE_IO1(done, buf_t *, cb_arg); 2871 } 2872 } 2873 2874 mutex_exit(&vdcp->lock); 2875 2876 return (rv); 2877 } 2878 2879 2880 /* 2881 * Function: 2882 * vdc_populate_descriptor 2883 * 2884 * Description: 2885 * This routine fills in a free descriptor ring entry with the request, 2886 * binds the data buffer and sends a dring data message to notify vds 2887 * that the ring has been updated, waiting for an entry to free up if the ring is full. 2888 * 2889 * Arguments: 2890 * vdcp - the soft state pointer 2891 * operation - operation we want vds to perform (VD_OP_XXX) 2892 * addr - address of data buf to be read/written. 2893 * nbytes - number of bytes to read/write 2894 * slice - the disk slice this request is for 2895 * offset - relative disk offset 2896 * cb_type - type of call - STRATEGY or SYNC 2897 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2898 * . mode for ioctl(9e) 2899 * .
LP64 diskaddr_t (block I/O) 2900 * dir - direction of operation (READ/WRITE/BOTH) 2901 * 2902 * Return Codes: 2903 * 0 2904 * EAGAIN 2905 * ECONNRESET 2906 * ENXIO 2907 */ 2908 static int 2909 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 2910 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2911 void *cb_arg, vio_desc_direction_t dir) 2912 { 2913 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 2914 int idx; /* Index of DRing entry used */ 2915 int next_idx; 2916 vio_dring_msg_t dmsg; 2917 size_t msglen; 2918 int rv; 2919 2920 ASSERT(MUTEX_HELD(&vdcp->lock)); 2921 vdcp->threads_pending++; 2922 loop: 2923 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 2924 2925 /* Get next available D-Ring entry */ 2926 idx = vdcp->dring_curr_idx; 2927 local_dep = &(vdcp->local_dring[idx]); 2928 2929 if (!local_dep->is_free) { 2930 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 2931 vdcp->instance); 2932 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 2933 if (vdcp->state == VDC_STATE_RUNNING || 2934 vdcp->state == VDC_STATE_HANDLE_PENDING) { 2935 goto loop; 2936 } 2937 vdcp->threads_pending--; 2938 return (ECONNRESET); 2939 } 2940 2941 next_idx = idx + 1; 2942 if (next_idx >= vdcp->dring_len) 2943 next_idx = 0; 2944 vdcp->dring_curr_idx = next_idx; 2945 2946 ASSERT(local_dep->is_free); 2947 2948 local_dep->operation = operation; 2949 local_dep->addr = addr; 2950 local_dep->nbytes = nbytes; 2951 local_dep->slice = slice; 2952 local_dep->offset = offset; 2953 local_dep->cb_type = cb_type; 2954 local_dep->cb_arg = cb_arg; 2955 local_dep->dir = dir; 2956 2957 local_dep->is_free = B_FALSE; 2958 2959 rv = vdc_map_to_shared_dring(vdcp, idx); 2960 if (rv) { 2961 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 2962 vdcp->instance); 2963 /* free the descriptor */ 2964 local_dep->is_free = B_TRUE; 2965 vdcp->dring_curr_idx = idx; 2966 cv_wait(&vdcp->membind_cv, &vdcp->lock); 2967 if (vdcp->state == VDC_STATE_RUNNING || 2968 vdcp->state == VDC_STATE_HANDLE_PENDING) { 2969 goto loop; 2970 } 2971 vdcp->threads_pending--; 2972 return (ECONNRESET); 2973 } 2974 2975 /* 2976 * Send a msg with the DRing details to vds 2977 */ 2978 VIO_INIT_DRING_DATA_TAG(dmsg); 2979 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 2980 dmsg.dring_ident = vdcp->dring_ident; 2981 dmsg.start_idx = idx; 2982 dmsg.end_idx = idx; 2983 vdcp->seq_num++; 2984 2985 DTRACE_PROBE2(populate, int, vdcp->instance, 2986 vdc_local_desc_t *, local_dep); 2987 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 2988 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 2989 2990 /* 2991 * note we're still holding the lock here to 2992 * make sure the message goes out in order !!!... 2993 */ 2994 msglen = sizeof (dmsg); 2995 rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen); 2996 switch (rv) { 2997 case ECONNRESET: 2998 /* 2999 * vdc_send initiates the reset on failure. 3000 * Since the transaction has already been put 3001 * on the local dring, it will automatically get 3002 * retried when the channel is reset. Given that, 3003 * it is ok to just return success even though the 3004 * send failed. 
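* (The retry is driven by vdc_backup_local_dring(), which snapshots the local dring during the reset, and vdc_resubmit_backup_dring(), which replays the outstanding entries once the handshake completes.)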
3005 */ 3006 rv = 0; 3007 break; 3008 3009 case 0: /* EOK */ 3010 DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv); 3011 break; 3012 3013 default: 3014 goto cleanup_and_exit; 3015 } 3016 3017 vdcp->threads_pending--; 3018 return (rv); 3019 3020 cleanup_and_exit: 3021 DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv); 3022 return (ENXIO); 3023 } 3024 3025 /* 3026 * Function: 3027 * vdc_do_sync_op 3028 * 3029 * Description: 3030 * Wrapper around vdc_populate_descriptor that blocks until the 3031 * response to the message is available. 3032 * 3033 * Arguments: 3034 * vdcp - the soft state pointer 3035 * operation - operation we want vds to perform (VD_OP_XXX) 3036 * addr - address of data buf to be read/written. 3037 * nbytes - number of bytes to read/write 3038 * slice - the disk slice this request is for 3039 * offset - relative disk offset 3040 * cb_type - type of call - STRATEGY or SYNC 3041 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3042 * . mode for ioctl(9e) 3043 * . LP64 diskaddr_t (block I/O) 3044 * dir - direction of operation (READ/WRITE/BOTH) 3045 * rconflict - check for reservation conflict in case of failure 3046 * 3047 * rconflict should be set to B_TRUE by most callers. Callers invoking the 3048 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 3049 * result of a successful operation with vd_scsi_status(). 3050 * 3051 * Return Codes: 3052 * 0 3053 * EAGAIN 3054 * EFAULT 3055 * ENXIO 3056 * EIO 3057 */ 3058 static int 3059 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 3060 int slice, diskaddr_t offset, int cb_type, void *cb_arg, 3061 vio_desc_direction_t dir, boolean_t rconflict) 3062 { 3063 int status; 3064 vdc_io_t *vio; 3065 boolean_t check_resv_conflict = B_FALSE; 3066 3067 ASSERT(cb_type == CB_SYNC); 3068 3069 /* 3070 * Grab the lock; if another sync operation has us blocked, wait 3071 * until the server response to it causes us to wake up again. 3072 */ 3073 mutex_enter(&vdcp->lock); 3074 vdcp->sync_op_cnt++; 3075 while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH) 3076 cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock); 3077 3078 if (vdcp->state == VDC_STATE_DETACH) { 3079 cv_broadcast(&vdcp->sync_blocked_cv); 3080 vdcp->sync_op_cnt--; 3081 mutex_exit(&vdcp->lock); 3082 return (ENXIO); 3083 } 3084 3085 /* now block any other thread entering after us */ 3086 vdcp->sync_op_blocked = B_TRUE; 3087 vdcp->sync_op_pending = B_TRUE; 3088 mutex_exit(&vdcp->lock); 3089 3090 status = vdc_send_request(vdcp, operation, addr, 3091 nbytes, slice, offset, cb_type, cb_arg, dir); 3092 3093 mutex_enter(&vdcp->lock); 3094 3095 if (status != 0) { 3096 vdcp->sync_op_pending = B_FALSE; 3097 } else { 3098 /* 3099 * Block until our transaction completes; anyone else 3100 * waiting then gets to go next. 3101 */ 3102 while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH) 3103 cv_wait(&vdcp->sync_pending_cv, &vdcp->lock); 3104 3105 DMSG(vdcp, 2, ": operation returned %d\n", 3106 vdcp->sync_op_status); 3107 if (vdcp->state == VDC_STATE_DETACH) { 3108 vdcp->sync_op_pending = B_FALSE; 3109 status = ENXIO; 3110 } else { 3111 status = vdcp->sync_op_status; 3112 if (status != 0 && vdcp->failfast_interval != 0) { 3113 /* 3114 * Operation has failed and failfast is enabled. 3115 * We need to check if the failure is due to a 3116 * reservation conflict if this was requested.
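* (The check itself is issued further down via vdc_failfast_io_queue(), but only after sync operations have been unblocked, because the check is implemented with sync operations of its own.)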
3117 */ 3118 check_resv_conflict = rconflict; 3119 } 3120 3121 } 3122 } 3123 3124 vdcp->sync_op_status = 0; 3125 vdcp->sync_op_blocked = B_FALSE; 3126 vdcp->sync_op_cnt--; 3127 3128 /* signal the next waiting thread */ 3129 cv_signal(&vdcp->sync_blocked_cv); 3130 3131 /* 3132 * We have to check for reservation conflict after unblocking sync 3133 * operations because some sync operations will be used to do this 3134 * check. 3135 */ 3136 if (check_resv_conflict) { 3137 vio = vdc_failfast_io_queue(vdcp, NULL); 3138 while (vio->vio_qtime != 0) 3139 cv_wait(&vdcp->failfast_io_cv, &vdcp->lock); 3140 kmem_free(vio, sizeof (vdc_io_t)); 3141 } 3142 3143 mutex_exit(&vdcp->lock); 3144 3145 return (status); 3146 } 3147 3148 3149 /* 3150 * Function: 3151 * vdc_drain_response() 3152 * 3153 * Description: 3154 * When a guest is panicking, the completion of requests needs to be 3155 * handled differently because interrupts are disabled and vdc 3156 * will not get messages. We have to poll for the messages instead. 3157 * 3158 * Note: since we don't have a buf_t available we cannot implement 3159 * the io:::done DTrace probe in this specific case. 3160 * 3161 * Arguments: 3162 * vdc - soft state pointer for this instance of the device driver. 3163 * 3164 * Return Code: 3165 * 0 - Success 3166 */ 3167 static int 3168 vdc_drain_response(vdc_t *vdc) 3169 { 3170 int rv, idx, retries; 3171 size_t msglen; 3172 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3173 vio_dring_msg_t dmsg; 3174 3175 mutex_enter(&vdc->lock); 3176 3177 retries = 0; 3178 for (;;) { 3179 msglen = sizeof (dmsg); 3180 rv = ldc_read(vdc->ldc_handle, (caddr_t)&dmsg, &msglen); 3181 if (rv) { 3182 rv = EINVAL; 3183 break; 3184 } 3185 3186 /* 3187 * if there are no packets wait and check again 3188 */ 3189 if ((rv == 0) && (msglen == 0)) { 3190 if (retries++ > vdc_dump_retries) { 3191 rv = EAGAIN; 3192 break; 3193 } 3194 3195 drv_usecwait(vdc_usec_timeout_dump); 3196 continue; 3197 } 3198 3199 /* 3200 * Ignore all messages that are not ACKs/NACKs to 3201 * DRing requests. 3202 */ 3203 if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) || 3204 (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) { 3205 DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n", 3206 dmsg.tag.vio_msgtype, 3207 dmsg.tag.vio_subtype, 3208 dmsg.tag.vio_subtype_env); 3209 continue; 3210 } 3211 3212 /* 3213 * set the appropriate return value for the current request. 
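* (An ACK means the request completed, so rv becomes 0; a NACK maps to EAGAIN; any other subtype is skipped and we keep draining.)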
3214 */ 3215 switch (dmsg.tag.vio_subtype) { 3216 case VIO_SUBTYPE_ACK: 3217 rv = 0; 3218 break; 3219 case VIO_SUBTYPE_NACK: 3220 rv = EAGAIN; 3221 break; 3222 default: 3223 continue; 3224 } 3225 3226 idx = dmsg.start_idx; 3227 if (idx >= vdc->dring_len) { 3228 DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n", 3229 vdc->instance, idx); 3230 continue; 3231 } 3232 ldep = &vdc->local_dring[idx]; 3233 if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { 3234 DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n", 3235 vdc->instance, idx, ldep->dep->hdr.dstate); 3236 continue; 3237 } 3238 3239 DMSG(vdc, 1, "[%d] Depopulating idx=%d state=%d\n", 3240 vdc->instance, idx, ldep->dep->hdr.dstate); 3241 3242 rv = vdc_depopulate_descriptor(vdc, idx); 3243 if (rv) { 3244 DMSG(vdc, 0, 3245 "[%d] Entry @ %d - depopulate failed ..\n", 3246 vdc->instance, idx); 3247 } 3248 3249 /* if this is the last descriptor - break out of loop */ 3250 if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) 3251 break; 3252 } 3253 3254 mutex_exit(&vdc->lock); 3255 DMSG(vdc, 0, "End idx=%d\n", idx); 3256 3257 return (rv); 3258 } 3259 3260 3261 /* 3262 * Function: 3263 * vdc_depopulate_descriptor() 3264 * 3265 * Description: 3266 * Mark the indicated descriptor ring entry free once vds has completed it, unbind its memory handle and, for misaligned buffers, copy the data back into the caller's buffer. 3267 * Arguments: 3268 * vdc - soft state pointer for this instance of the device driver. 3269 * idx - Index of the Descriptor Ring entry being modified 3270 * 3271 * Return Code: 3272 * 0 - Success 3273 */ 3274 static int 3275 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) 3276 { 3277 vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ 3278 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3279 int status = ENXIO; 3280 int rv = 0; 3281 3282 ASSERT(vdc != NULL); 3283 ASSERT(idx < vdc->dring_len); 3284 ldep = &vdc->local_dring[idx]; 3285 ASSERT(ldep != NULL); 3286 ASSERT(MUTEX_HELD(&vdc->lock)); 3287 3288 DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep); 3289 DMSG(vdc, 2, ": idx = %d\n", idx); 3290 3291 dep = ldep->dep; 3292 ASSERT(dep != NULL); 3293 ASSERT((dep->hdr.dstate == VIO_DESC_DONE) || 3294 (dep->payload.status == ECANCELED)); 3295 3296 VDC_MARK_DRING_ENTRY_FREE(vdc, idx); 3297 3298 ldep->is_free = B_TRUE; 3299 status = dep->payload.status; 3300 DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status); 3301 3302 /* 3303 * If no buffers were used to transfer information to the server when 3304 * populating the descriptor then no memory handles need to be unbound 3305 * and we can return now. 3306 */ 3307 if (ldep->nbytes == 0) { 3308 cv_signal(&vdc->dring_free_cv); 3309 return (status); 3310 } 3311 3312 /* 3313 * If the upper layer passed in a misaligned address we copied the 3314 * data into an aligned buffer before sending it to LDC - we now 3315 * copy it back to the original buffer. 3316 */ 3317 if (ldep->align_addr) { 3318 ASSERT(ldep->addr != NULL); 3319 3320 if (dep->payload.nbytes > 0) 3321 bcopy(ldep->align_addr, ldep->addr, 3322 dep->payload.nbytes); 3323 kmem_free(ldep->align_addr, 3324 sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8)); 3325 ldep->align_addr = NULL; 3326 } 3327 3328 rv = ldc_mem_unbind_handle(ldep->desc_mhdl); 3329 if (rv != 0) { 3330 DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)", 3331 vdc->instance, ldep->desc_mhdl, idx, rv); 3332 /* 3333 * The error returned by the vDisk server is more informative 3334 * and thus has a higher priority but if it isn't set we ensure 3335 * that this function returns an error.
3336 */ 3337 if (status == 0) 3338 status = EINVAL; 3339 } 3340 3341 cv_signal(&vdc->membind_cv); 3342 cv_signal(&vdc->dring_free_cv); 3343 3344 return (status); 3345 } 3346 3347 /* 3348 * Function: 3349 * vdc_populate_mem_hdl() 3350 * 3351 * Description: 3352 * Bind the buffer described by a local descriptor ring entry to an LDC memory handle, first copying a misaligned buffer into an aligned bounce buffer. 3353 * Arguments: 3354 * vdcp - soft state pointer for this instance of the device driver. 3355 * ldep - local descriptor ring entry describing the transfer; its 3356 * addr/nbytes buffer is bound to the entry's memory handle 3357 * and its direction selects the map permissions 3358 * (LDC_MEM_R/W/RW). 3359 * 3360 * Return Code: 3361 * 0 - Success. EAGAIN - the buffer could not be bound. 3362 */ 3363 static int 3364 vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep) 3365 { 3366 vd_dring_entry_t *dep = NULL; 3367 ldc_mem_handle_t mhdl; 3368 caddr_t vaddr; 3369 size_t nbytes; 3370 uint8_t perm = LDC_MEM_RW; 3371 uint8_t maptype; 3372 int rv = 0; 3373 int i; 3374 3375 ASSERT(vdcp != NULL); 3376 3377 dep = ldep->dep; 3378 mhdl = ldep->desc_mhdl; 3379 3380 switch (ldep->dir) { 3381 case VIO_read_dir: 3382 perm = LDC_MEM_W; 3383 break; 3384 3385 case VIO_write_dir: 3386 perm = LDC_MEM_R; 3387 break; 3388 3389 case VIO_both_dir: 3390 perm = LDC_MEM_RW; 3391 break; 3392 3393 default: 3394 ASSERT(0); /* catch bad programming in vdc */ 3395 } 3396 3397 /* 3398 * LDC expects any addresses passed in to be 8-byte aligned. We need 3399 * to copy the contents of any misaligned buffers to a newly allocated 3400 * buffer and bind it instead (and copy the contents back to the 3401 * original buffer passed in when depopulating the descriptor) 3402 */ 3403 vaddr = ldep->addr; 3404 nbytes = ldep->nbytes; 3405 if (((uint64_t)vaddr & 0x7) != 0) { 3406 ASSERT(ldep->align_addr == NULL); 3407 ldep->align_addr = 3408 kmem_alloc(sizeof (caddr_t) * 3409 P2ROUNDUP(nbytes, 8), KM_SLEEP); 3410 DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating " 3411 "(buf=%p nb=%ld op=%d)\n", 3412 vdcp->instance, (void *)vaddr, (void *)ldep->align_addr, 3413 nbytes, ldep->operation); 3414 if (perm != LDC_MEM_W) 3415 bcopy(vaddr, ldep->align_addr, nbytes); 3416 vaddr = ldep->align_addr; 3417 } 3418 3419 maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP; 3420 rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8), 3421 maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies); 3422 DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n", 3423 vdcp->instance, dep->payload.ncookies); 3424 if (rv != 0) { 3425 DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle " 3426 "(mhdl=%p, buf=%p, err=%d)\n", 3427 vdcp->instance, (void *)mhdl, (void *)vaddr, rv); 3428 if (ldep->align_addr) { 3429 kmem_free(ldep->align_addr, 3430 sizeof (caddr_t) * P2ROUNDUP(nbytes, 8)); 3431 ldep->align_addr = NULL; 3432 } 3433 return (EAGAIN); 3434 } 3435 3436 /* 3437 * Get the other cookies (if any).
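* ldc_mem_bind_handle() above returned the first cookie in payload.cookie[0] and the total count in payload.ncookies; ldc_mem_nextcookie() fetches each remaining cookie in turn.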
3438 */ 3439 for (i = 1; i < dep->payload.ncookies; i++) { 3440 rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]); 3441 if (rv != 0) { 3442 (void) ldc_mem_unbind_handle(mhdl); 3443 DMSG(vdcp, 0, "?[%d] Failed to get next cookie " 3444 "(mhdl=%lx cnum=%d), err=%d", 3445 vdcp->instance, mhdl, i, rv); 3446 if (ldep->align_addr) { 3447 kmem_free(ldep->align_addr, 3448 sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8)); /* must match the kmem_alloc() size above */ 3449 ldep->align_addr = NULL; 3450 } 3451 return (EAGAIN); 3452 } 3453 } 3454 3455 return (rv); 3456 } 3457 3458 /* 3459 * Interrupt handlers for messages from LDC 3460 */ 3461 3462 /* 3463 * Function: 3464 * vdc_handle_cb() 3465 * 3466 * Description: 3467 * LDC event callback: on LDC_EVT_UP kick off the handshake, on LDC_EVT_READ wake the reader thread, and on LDC_EVT_RESET/DOWN flag a reset so waiting threads notice. 3468 * Arguments: 3469 * event - Type of event (LDC_EVT_xxx) that triggered the callback 3470 * arg - soft state pointer for this instance of the device driver. 3471 * 3472 * Return Code: 3473 * 0 - Success 3474 */ 3475 static uint_t 3476 vdc_handle_cb(uint64_t event, caddr_t arg) 3477 { 3478 ldc_status_t ldc_state; 3479 int rv = 0; 3480 3481 vdc_t *vdc = (vdc_t *)(void *)arg; 3482 3483 ASSERT(vdc != NULL); 3484 3485 DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num); 3486 3487 /* 3488 * Depending on the type of event that triggered this callback, 3489 * we modify the handshake state or read the data. 3490 * 3491 * NOTE: not done as a switch() as event could be triggered by 3492 * a state change and a read request. Also the ordering of the 3493 * check for the event types is deliberate. 3494 */ 3495 if (event & LDC_EVT_UP) { 3496 DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance); 3497 3498 mutex_enter(&vdc->lock); 3499 3500 /* get LDC state */ 3501 rv = ldc_status(vdc->ldc_handle, &ldc_state); 3502 if (rv != 0) { 3503 DMSG(vdc, 0, "[%d] Couldn't get LDC status %d", 3504 vdc->instance, rv); 3505 mutex_exit(&vdc->lock); return (LDC_SUCCESS); 3506 } 3507 if (vdc->ldc_state != LDC_UP && ldc_state == LDC_UP) { 3508 /* 3509 * Reset the transaction sequence numbers when 3510 * LDC comes up. We then kick off the handshake 3511 * negotiation with the vDisk server. 3512 */ 3513 vdc->seq_num = 1; 3514 vdc->seq_num_reply = 0; 3515 vdc->ldc_state = ldc_state; 3516 cv_signal(&vdc->initwait_cv); 3517 } 3518 3519 mutex_exit(&vdc->lock); 3520 } 3521 3522 if (event & LDC_EVT_READ) { 3523 DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance); 3524 mutex_enter(&vdc->read_lock); 3525 cv_signal(&vdc->read_cv); 3526 vdc->read_state = VDC_READ_PENDING; 3527 mutex_exit(&vdc->read_lock); 3528 3529 /* that's all we have to do - no need to handle DOWN/RESET */ 3530 return (LDC_SUCCESS); 3531 } 3532 3533 if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) { 3534 3535 DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance); 3536 3537 mutex_enter(&vdc->lock); 3538 /* 3539 * Need to wake up any readers so they will 3540 * detect that a reset has occurred.
3541 */ 3542 mutex_enter(&vdc->read_lock); 3543 if ((vdc->read_state == VDC_READ_WAITING) || 3544 (vdc->read_state == VDC_READ_RESET)) 3545 cv_signal(&vdc->read_cv); 3546 vdc->read_state = VDC_READ_RESET; 3547 mutex_exit(&vdc->read_lock); 3548 3549 /* wake up any threads waiting for connection to come up */ 3550 if (vdc->state == VDC_STATE_INIT_WAITING) { 3551 vdc->state = VDC_STATE_RESETTING; 3552 cv_signal(&vdc->initwait_cv); 3553 } 3554 3555 mutex_exit(&vdc->lock); 3556 } 3557 3558 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) 3559 DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received", 3560 vdc->instance, event); 3561 3562 return (LDC_SUCCESS); 3563 } 3564 3565 /* 3566 * Function: 3567 * vdc_wait_for_response() 3568 * 3569 * Description: 3570 * Block waiting for a response from the server. If there is 3571 * no data, the thread blocks on the read_cv, which is signalled 3572 * by the callback when an LDC_EVT_READ occurs. 3573 * 3574 * Arguments: 3575 * vdcp - soft state pointer for this instance of the device driver. 3576 * 3577 * Return Code: 3578 * 0 - Success 3579 */ 3580 static int 3581 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp) 3582 { 3583 size_t nbytes = sizeof (*msgp); 3584 int status; 3585 3586 ASSERT(vdcp != NULL); 3587 3588 DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance); 3589 3590 status = vdc_recv(vdcp, msgp, &nbytes); 3591 DMSG(vdcp, 3, "vdc_read() done.. status=0x%x size=0x%x\n", 3592 status, (int)nbytes); 3593 if (status) { 3594 DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n", 3595 vdcp->instance, status); 3596 return (status); 3597 } 3598 3599 if (nbytes < sizeof (vio_msg_tag_t)) { 3600 DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n", 3601 vdcp->instance, sizeof (vio_msg_tag_t), nbytes); 3602 return (ENOMSG); 3603 } 3604 3605 DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance, 3606 msgp->tag.vio_msgtype, 3607 msgp->tag.vio_subtype, 3608 msgp->tag.vio_subtype_env); 3609 3610 /* 3611 * Verify the Session ID of the message 3612 * 3613 * Every message after the Version has been negotiated should 3614 * have the correct session ID set. 3615 */ 3616 if ((msgp->tag.vio_sid != vdcp->session_id) && 3617 (msgp->tag.vio_subtype_env != VIO_VER_INFO)) { 3618 DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, " 3619 "expected 0x%lx [seq num %lx @ %d]", 3620 vdcp->instance, msgp->tag.vio_sid, 3621 vdcp->session_id, 3622 ((vio_dring_msg_t *)msgp)->seq_num, 3623 ((vio_dring_msg_t *)msgp)->start_idx); 3624 return (ENOMSG); 3625 } 3626 return (0); 3627 } 3628 3629 3630 /* 3631 * Function: 3632 * vdc_resubmit_backup_dring() 3633 * 3634 * Description: 3635 * Resubmit each descriptor in the backed up dring to 3636 * the vDisk server. The Dring was backed up during connection 3637 * reset. 3638 * 3639 * Arguments: 3640 * vdcp - soft state pointer for this instance of the device driver.
3641 * 3642 * Return Code: 3643 * 0 - Success 3644 */ 3645 static int 3646 vdc_resubmit_backup_dring(vdc_t *vdcp) 3647 { 3648 int count; 3649 int b_idx; 3650 int rv; 3651 int dring_size; 3652 int status; 3653 vio_msg_t vio_msg; 3654 vdc_local_desc_t *curr_ldep; 3655 3656 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3657 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3658 3659 if (vdcp->local_dring_backup == NULL) { 3660 /* the pending requests have already been processed */ 3661 return (0); 3662 } 3663 3664 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3665 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3666 3667 /* 3668 * Walk the backup copy of the local descriptor ring and 3669 * resubmit all the outstanding transactions. 3670 */ 3671 b_idx = vdcp->local_dring_backup_tail; 3672 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3673 3674 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3675 3676 /* only resubmit outstanding transactions */ 3677 if (!curr_ldep->is_free) { 3678 3679 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3680 mutex_enter(&vdcp->lock); 3681 rv = vdc_populate_descriptor(vdcp, curr_ldep->operation, 3682 curr_ldep->addr, curr_ldep->nbytes, 3683 curr_ldep->slice, curr_ldep->offset, 3684 curr_ldep->cb_type, curr_ldep->cb_arg, 3685 curr_ldep->dir); 3686 mutex_exit(&vdcp->lock); 3687 if (rv) { 3688 DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", 3689 vdcp->instance, b_idx); 3690 return (rv); 3691 } 3692 3693 /* Wait for the response message. */ 3694 DMSG(vdcp, 1, "waiting for response to idx=%x\n", 3695 b_idx); 3696 status = vdc_wait_for_response(vdcp, &vio_msg); 3697 if (status) { 3698 DMSG(vdcp, 1, "[%d] wait_for_response " 3699 "returned err=%d\n", vdcp->instance, 3700 status); 3701 return (status); 3702 } 3703 3704 DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx); 3705 status = vdc_process_data_msg(vdcp, &vio_msg); 3706 if (status) { 3707 DMSG(vdcp, 1, "[%d] process_data_msg " 3708 "returned err=%d\n", vdcp->instance, 3709 status); 3710 return (status); 3711 } 3712 } 3713 3714 /* get the next element to submit */ 3715 if (++b_idx >= vdcp->local_dring_backup_len) 3716 b_idx = 0; 3717 } 3718 3719 /* all done - now clear up pending dring copy */ 3720 dring_size = vdcp->local_dring_backup_len * 3721 sizeof (vdcp->local_dring_backup[0]); 3722 3723 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3724 3725 vdcp->local_dring_backup = NULL; 3726 3727 return (0); 3728 } 3729 3730 /* 3731 * Function: 3732 * vdc_cancel_backup_ring 3733 * 3734 * Description: 3735 * Cancel each outstanding descriptor in the backed-up dring and fail 3736 * the requests waiting on it. The Dring was backed up during connection reset. 3737 * 3738 * Arguments: 3739 * vdcp - soft state pointer for this instance of the device driver.
3740 * 3741 * Return Code: 3742 * None 3743 */ 3744 void 3745 vdc_cancel_backup_ring(vdc_t *vdcp) 3746 { 3747 vdc_local_desc_t *ldep; 3748 struct buf *bufp; 3749 int count; 3750 int b_idx; 3751 int dring_size; 3752 3753 ASSERT(MUTEX_HELD(&vdcp->lock)); 3754 ASSERT(vdcp->state == VDC_STATE_INIT || 3755 vdcp->state == VDC_STATE_INIT_WAITING || 3756 vdcp->state == VDC_STATE_NEGOTIATE || 3757 vdcp->state == VDC_STATE_RESETTING); 3758 3759 if (vdcp->local_dring_backup == NULL) { 3760 /* the pending requests have already been processed */ 3761 return; 3762 } 3763 3764 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n", 3765 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3766 3767 /* 3768 * Walk the backup copy of the local descriptor ring and 3769 * cancel all the outstanding transactions. 3770 */ 3771 b_idx = vdcp->local_dring_backup_tail; 3772 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3773 3774 ldep = &(vdcp->local_dring_backup[b_idx]); 3775 3776 /* only cancel outstanding transactions */ 3777 if (!ldep->is_free) { 3778 3779 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx); 3780 3781 /* 3782 * All requests have already been cleared from the 3783 * local descriptor ring and the LDC channel has been 3784 * reset so we will never get any reply for these 3785 * requests. Now we just have to notify threads waiting 3786 * for replies that the request has failed. 3787 */ 3788 switch (ldep->cb_type) { 3789 case CB_SYNC: 3790 ASSERT(vdcp->sync_op_pending); 3791 vdcp->sync_op_status = EIO; 3792 vdcp->sync_op_pending = B_FALSE; 3793 cv_signal(&vdcp->sync_pending_cv); 3794 break; 3795 3796 case CB_STRATEGY: 3797 bufp = ldep->cb_arg; 3798 ASSERT(bufp != NULL); 3799 bufp->b_resid = bufp->b_bcount; 3800 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 3801 VD_KSTAT_RUNQ_EXIT(vdcp->io_stats); 3802 DTRACE_IO1(done, buf_t *, bufp); 3803 bioerror(bufp, EIO); 3804 biodone(bufp); 3805 break; 3806 3807 default: 3808 ASSERT(0); 3809 } 3810 3811 } 3812 3813 /* get the next element to cancel */ 3814 if (++b_idx >= vdcp->local_dring_backup_len) 3815 b_idx = 0; 3816 } 3817 3818 /* all done - now clear up pending dring copy */ 3819 dring_size = vdcp->local_dring_backup_len * 3820 sizeof (vdcp->local_dring_backup[0]); 3821 3822 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3823 3824 vdcp->local_dring_backup = NULL; 3825 3826 DTRACE_PROBE2(processed, int, count, vdc_t *, vdcp); 3827 } 3828 3829 /* 3830 * Function: 3831 * vdc_connection_timeout 3832 * 3833 * Description: 3834 * This function is invoked if the timeout set to establish the connection 3835 * with vds expires. This will happen if we spend too much time in the 3836 * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. When that happens 3837 * we cancel any pending requests and mark them as failed. 3838 * 3839 * If the timeout does not expire, it will be cancelled when we reach the 3840 * VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can 3841 * be invoked while we are in the VDC_STATE_HANDLE_PENDING or 3842 * VDC_STATE_RESETTING state in which case we do nothing because the 3843 * timeout is being cancelled. 3844 * 3845 * Arguments: 3846 * arg - argument of the timeout function, actually a soft state 3847 * pointer for this instance of the device driver.
3848 * 3849 * Return Code: 3850 * None 3851 */ 3852 void 3853 vdc_connection_timeout(void *arg) 3854 { 3855 vdc_t *vdcp = (vdc_t *)arg; 3856 3857 mutex_enter(&vdcp->lock); 3858 3859 if (vdcp->state == VDC_STATE_HANDLE_PENDING || 3860 vdcp->state == VDC_STATE_DETACH) { 3861 /* 3862 * The connection has just been re-established or 3863 * we are detaching. 3864 */ 3865 vdcp->ctimeout_reached = B_FALSE; 3866 mutex_exit(&vdcp->lock); 3867 return; 3868 } 3869 3870 vdcp->ctimeout_reached = B_TRUE; 3871 3872 /* notify requests waiting for sending */ 3873 cv_broadcast(&vdcp->running_cv); 3874 3875 /* cancel requests waiting for a result */ 3876 vdc_cancel_backup_ring(vdcp); 3877 3878 mutex_exit(&vdcp->lock); 3879 3880 cmn_err(CE_NOTE, "[%d] connection to service domain timeout", 3881 vdcp->instance); 3882 } 3883 3884 /* 3885 * Function: 3886 * vdc_backup_local_dring() 3887 * 3888 * Description: 3889 * Backup the current dring in the event of a reset. The Dring 3890 * transactions will be resubmitted to the server when the 3891 * connection is restored. 3892 * 3893 * Arguments: 3894 * vdcp - soft state pointer for this instance of the device driver. 3895 * 3896 * Return Code: 3897 * NONE 3898 */ 3899 static void 3900 vdc_backup_local_dring(vdc_t *vdcp) 3901 { 3902 int dring_size; 3903 3904 ASSERT(MUTEX_HELD(&vdcp->lock)); 3905 ASSERT(vdcp->state == VDC_STATE_RESETTING); 3906 3907 /* 3908 * If the backup dring is still around, it means 3909 * that the last restore did not complete. However, 3910 * since we never got back into the running state, 3911 * the backup copy we have is still valid. 3912 */ 3913 if (vdcp->local_dring_backup != NULL) { 3914 DMSG(vdcp, 1, "reusing local descriptor ring backup " 3915 "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len, 3916 vdcp->local_dring_backup_tail); 3917 return; 3918 } 3919 3920 /* 3921 * The backup dring can be NULL and the local dring may not be 3922 * initialized. This can happen if we had a reset while establishing 3923 * a new connection but after the connection has timed out. In that 3924 * case the backup dring is NULL because the pending requests were 3925 * cancelled, and the reset occurred before the local dring was 3926 * initialized. 3927 */ 3928 if (!(vdcp->initialized & VDC_DRING_LOCAL)) 3929 return; 3930 3931 DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, " 3932 "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx); 3933 3934 dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]); 3935 3936 vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP); 3937 bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size); 3938 3939 vdcp->local_dring_backup_tail = vdcp->dring_curr_idx; 3940 vdcp->local_dring_backup_len = vdcp->dring_len; 3941 } 3942 3943 /* -------------------------------------------------------------------------- */ 3944 3945 /* 3946 * The following functions process the incoming messages from vds 3947 */ 3948 3949 /* 3950 * Function: 3951 * vdc_process_msg_thread() 3952 * 3953 * Description: 3954 * 3955 * Main VDC message processing thread. Each vDisk instance has 3956 * its own copy of this thread. This thread triggers 3957 * all the handshakes and data exchange with the server. It 3958 * also handles all channel resets. 3959 * 3960 * Arguments: 3961 * vdc - soft state pointer for this instance of the device driver.
3962 * 3963 * Return Code: 3964 * None 3965 */ 3966 static void 3967 vdc_process_msg_thread(vdc_t *vdcp) 3968 { 3969 int status; 3970 int ctimeout; 3971 timeout_id_t tmid = 0; 3972 3973 mutex_enter(&vdcp->lock); 3974 3975 for (;;) { 3976 3977 #define Q(_s) (vdcp->state == _s) ? #_s : 3978 DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state, 3979 Q(VDC_STATE_INIT) 3980 Q(VDC_STATE_INIT_WAITING) 3981 Q(VDC_STATE_NEGOTIATE) 3982 Q(VDC_STATE_HANDLE_PENDING) 3983 Q(VDC_STATE_RUNNING) 3984 Q(VDC_STATE_RESETTING) 3985 Q(VDC_STATE_DETACH) 3986 "UNKNOWN"); 3987 3988 switch (vdcp->state) { 3989 case VDC_STATE_INIT: 3990 3991 /* 3992 * If requested, start a timeout to check if the 3993 * connection with vds is established in the 3994 * specified delay. If the timeout expires, we 3995 * will cancel any pending request. 3996 * 3997 * If a reset occurred while establishing 3998 * the connection, we already have a timeout armed 3999 * and in that case we don't need to arm a new one. 4000 */ 4001 ctimeout = (vdc_timeout != 0)? 4002 vdc_timeout : vdcp->ctimeout; 4003 4004 if (ctimeout != 0 && tmid == 0) { 4005 tmid = timeout(vdc_connection_timeout, vdcp, 4006 ctimeout * drv_usectohz(1000000)); 4007 } 4008 4009 /* Check if we have been re-initializing repeatedly */ 4010 if (vdcp->hshake_cnt++ > vdc_hshake_retries && 4011 vdcp->lifecycle != VDC_LC_ONLINE) { 4012 cmn_err(CE_NOTE, "[%d] disk access failed.\n", 4013 vdcp->instance); 4014 vdcp->state = VDC_STATE_DETACH; 4015 break; 4016 } 4017 4018 /* Bring up connection with vds via LDC */ 4019 status = vdc_start_ldc_connection(vdcp); 4020 if (status == EINVAL) { 4021 DMSG(vdcp, 0, "[%d] Could not start LDC", 4022 vdcp->instance); 4023 vdcp->state = VDC_STATE_DETACH; 4024 } else { 4025 vdcp->state = VDC_STATE_INIT_WAITING; 4026 } 4027 break; 4028 4029 case VDC_STATE_INIT_WAITING: 4030 4031 /* 4032 * Let the callback event move us on 4033 * when channel is open to server 4034 */ 4035 while (vdcp->ldc_state != LDC_UP) { 4036 cv_wait(&vdcp->initwait_cv, &vdcp->lock); 4037 if (vdcp->state != VDC_STATE_INIT_WAITING) { 4038 DMSG(vdcp, 0, 4039 "state moved to %d out from under us...\n", 4040 vdcp->state); 4041 4042 break; 4043 } 4044 } 4045 if (vdcp->state == VDC_STATE_INIT_WAITING && 4046 vdcp->ldc_state == LDC_UP) { 4047 vdcp->state = VDC_STATE_NEGOTIATE; 4048 } 4049 break; 4050 4051 case VDC_STATE_NEGOTIATE: 4052 switch (status = vdc_ver_negotiation(vdcp)) { 4053 case 0: 4054 break; 4055 default: 4056 DMSG(vdcp, 0, "ver negotiate failed (%d)..\n", 4057 status); 4058 goto reset; 4059 } 4060 4061 switch (status = vdc_attr_negotiation(vdcp)) { 4062 case 0: 4063 break; 4064 default: 4065 DMSG(vdcp, 0, "attr negotiate failed (%d)..\n", 4066 status); 4067 goto reset; 4068 } 4069 4070 switch (status = vdc_dring_negotiation(vdcp)) { 4071 case 0: 4072 break; 4073 default: 4074 DMSG(vdcp, 0, "dring negotiate failed (%d)..\n", 4075 status); 4076 goto reset; 4077 } 4078 4079 switch (status = vdc_rdx_exchange(vdcp)) { 4080 case 0: 4081 vdcp->state = VDC_STATE_HANDLE_PENDING; 4082 goto done; 4083 default: 4084 DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n", 4085 status); 4086 goto reset; 4087 } 4088 reset: 4089 DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n", 4090 status); 4091 vdcp->state = VDC_STATE_RESETTING; 4092 vdcp->self_reset = B_TRUE; 4093 done: 4094 DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", 4095 vdcp->state); 4096 break; 4097 4098 case VDC_STATE_HANDLE_PENDING: 4099 4100 if (vdcp->ctimeout_reached) { 4101 /* 4102 * The connection timeout had been reached so
				ASSERT(vdcp->local_dring_backup == NULL);
				ASSERT(tmid != 0);
				tmid = 0;
				vdcp->ctimeout_reached = B_FALSE;
				vdcp->state = VDC_STATE_RUNNING;
				DMSG(vdcp, 0, "[%d] connection to service "
				    "domain is up", vdcp->instance);
				break;
			}

			mutex_exit(&vdcp->lock);
			if (tmid != 0) {
				(void) untimeout(tmid);
				tmid = 0;
			}
			status = vdc_resubmit_backup_dring(vdcp);
			mutex_enter(&vdcp->lock);

			if (status)
				vdcp->state = VDC_STATE_RESETTING;
			else
				vdcp->state = VDC_STATE_RUNNING;

			break;

		/* enter running state */
		case VDC_STATE_RUNNING:
			/*
			 * Signal anyone waiting for the connection
			 * to come on line.
			 */
			vdcp->hshake_cnt = 0;
			cv_broadcast(&vdcp->running_cv);

			/* failfast has to be checked after reset */
			cv_signal(&vdcp->failfast_cv);

			/* ownership is lost during reset */
			if (vdcp->ownership & VDC_OWNERSHIP_WANTED)
				vdcp->ownership |= VDC_OWNERSHIP_RESET;
			cv_signal(&vdcp->ownership_cv);

			mutex_exit(&vdcp->lock);

			for (;;) {
				vio_msg_t msg;
				status = vdc_wait_for_response(vdcp, &msg);
				if (status)
					break;

				DMSG(vdcp, 1, "[%d] new pkt(s) available\n",
				    vdcp->instance);
				status = vdc_process_data_msg(vdcp, &msg);
				if (status) {
					DMSG(vdcp, 1, "[%d] process_data_msg "
					    "returned err=%d\n", vdcp->instance,
					    status);
					break;
				}

			}

			mutex_enter(&vdcp->lock);

			vdcp->state = VDC_STATE_RESETTING;
			vdcp->self_reset = B_TRUE;
			break;

		case VDC_STATE_RESETTING:
			/*
			 * When we reach this state, we either come from the
			 * VDC_STATE_RUNNING state, in which case there can be
			 * pending requests but no timeout is armed; or we come
			 * from the VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE
			 * or VDC_STATE_HANDLE_PENDING state, in which case
			 * there is no pending request or pending requests have
			 * already been copied into the backup dring. So we can
			 * safely keep the connection timeout armed while we
			 * are in this state.
			 */

			DMSG(vdcp, 0, "Initiating channel reset "
			    "(pending = %d)\n", (int)vdcp->threads_pending);

			if (vdcp->self_reset) {
				DMSG(vdcp, 0,
				    "[%d] calling stop_ldc_connection.\n",
				    vdcp->instance);
				status = vdc_stop_ldc_connection(vdcp);
				vdcp->self_reset = B_FALSE;
			}

			/*
			 * Wait for all threads currently waiting
			 * for a free dring entry to use.
			 */
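			/*
			 * The broadcast-and-delay loop below drops the lock
			 * on each iteration so that threads blocked on
			 * membind_cv or dring_free_cv can wake up, take the
			 * lock and notice the reset before the descriptor
			 * ring is torn down.
			 */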
			while (vdcp->threads_pending) {
				cv_broadcast(&vdcp->membind_cv);
				cv_broadcast(&vdcp->dring_free_cv);
				mutex_exit(&vdcp->lock);
				/* give the waiters enough time to wake up */
				delay(vdc_hz_min_ldc_delay);
				mutex_enter(&vdcp->lock);
			}

			ASSERT(vdcp->threads_pending == 0);

			/* Sanity check that no thread is receiving */
			ASSERT(vdcp->read_state != VDC_READ_WAITING);

			vdcp->read_state = VDC_READ_IDLE;

			vdc_backup_local_dring(vdcp);

			/* cleanup the old d-ring */
			vdc_destroy_descriptor_ring(vdcp);

			/* go and start again */
			vdcp->state = VDC_STATE_INIT;

			break;

		case VDC_STATE_DETACH:
			DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n",
			    vdcp->instance);

			/* cancel any pending timeout */
			mutex_exit(&vdcp->lock);
			if (tmid != 0) {
				(void) untimeout(tmid);
				tmid = 0;
			}
			mutex_enter(&vdcp->lock);

			/*
			 * Signal anyone waiting for the connection
			 * to come online.
			 */
			cv_broadcast(&vdcp->running_cv);

			while (vdcp->sync_op_pending) {
				cv_signal(&vdcp->sync_pending_cv);
				cv_signal(&vdcp->sync_blocked_cv);
				mutex_exit(&vdcp->lock);
				/* give the waiters enough time to wake up */
				delay(vdc_hz_min_ldc_delay);
				mutex_enter(&vdcp->lock);
			}

			mutex_exit(&vdcp->lock);

			DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n",
			    vdcp->instance);
			thread_exit();
			break;
		}
	}
}


/*
 * Function:
 *	vdc_process_data_msg()
 *
 * Description:
 *	This function is called by the message processing thread each time
 *	a message with a msgtype of VIO_TYPE_DATA is received. It will either
 *	be an ACK or NACK from vds[1] which vdc handles as follows.
 *		ACK	- wake up the waiting thread
 *		NACK	- resend any messages necessary
 *
 *	[1] Although the message format allows it, vds should not send a
 *	    VIO_SUBTYPE_INFO message to vdc asking it to read data; if for
 *	    some bizarre reason it does, vdc will reset the connection.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	msg	- the LDC message sent by vds
 *
 * Return Code:
 *	0	- Success.
 *	> 0	- error value returned by LDC
 */
static int
vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg)
{
	int		status = 0;
	vio_dring_msg_t	*dring_msg;
	vdc_local_desc_t *ldep = NULL;
	int		start, end;
	int		idx;

	dring_msg = (vio_dring_msg_t *)msg;

	ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA);
	ASSERT(vdcp != NULL);

	mutex_enter(&vdcp->lock);

	/*
	 * Check to see if the message has bogus data
	 */
	idx = start = dring_msg->start_idx;
	end = dring_msg->end_idx;
	if ((start >= vdcp->dring_len) ||
	    (end >= vdcp->dring_len) || (end < -1)) {
		DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n",
		    vdcp->instance, start, end);
		VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
		mutex_exit(&vdcp->lock);
		return (EINVAL);
	}

	/*
	 * Verify that the sequence number is what vdc expects.
	 */
	switch (vdc_verify_seq_num(vdcp, dring_msg)) {
	case VDC_SEQ_NUM_TODO:
		break;	/* keep processing this message */
	case VDC_SEQ_NUM_SKIP:
		mutex_exit(&vdcp->lock);
		return (0);
	case VDC_SEQ_NUM_INVALID:
		DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance);
		VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
		mutex_exit(&vdcp->lock);
		return (ENXIO);
	}

	if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) {
		DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance);
		VDC_DUMP_DRING_MSG(dring_msg);
		VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
		mutex_exit(&vdcp->lock);
		return (EIO);

	} else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) {
		VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs);
		mutex_exit(&vdcp->lock);
		return (EPROTO);
	}

	DMSG(vdcp, 1, ": start %d end %d\n", start, end);
	ASSERT(start == end);

	ldep = &vdcp->local_dring[idx];

	DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n",
	    ldep->dep->hdr.dstate, ldep->cb_type);

	if (ldep->dep->hdr.dstate == VIO_DESC_DONE) {
		struct buf *bufp;

		switch (ldep->cb_type) {
		case CB_SYNC:
			ASSERT(vdcp->sync_op_pending);

			status = vdc_depopulate_descriptor(vdcp, idx);
			vdcp->sync_op_status = status;
			vdcp->sync_op_pending = B_FALSE;
			cv_signal(&vdcp->sync_pending_cv);
			break;

		case CB_STRATEGY:
			bufp = ldep->cb_arg;
			ASSERT(bufp != NULL);
			bufp->b_resid =
			    bufp->b_bcount - ldep->dep->payload.nbytes;
			status = ldep->dep->payload.status; /* Future:ntoh */
			if (status != 0) {
				DMSG(vdcp, 1, "strategy status=%d\n", status);
				VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
				bioerror(bufp, status);
			}

			(void) vdc_depopulate_descriptor(vdcp, idx);

			DMSG(vdcp, 1,
			    "strategy complete req=%ld bytes resp=%ld bytes\n",
			    bufp->b_bcount, ldep->dep->payload.nbytes);

			if (status != 0 && vdcp->failfast_interval != 0) {
				/*
				 * The I/O has failed and failfast is enabled.
				 * We need the failfast thread to check if the
				 * failure is due to a reservation conflict.
				 */
				(void) vdc_failfast_io_queue(vdcp, bufp);
			} else {
				if (status == 0) {
					int op = (bufp->b_flags & B_READ) ?
					    VD_OP_BREAD : VD_OP_BWRITE;
					VD_UPDATE_IO_STATS(vdcp, op,
					    ldep->dep->payload.nbytes);
				}
				VD_KSTAT_RUNQ_EXIT(vdcp->io_stats);
				DTRACE_IO1(done, buf_t *, bufp);
				biodone(bufp);
			}
			break;

		default:
			ASSERT(0);
		}
	}

	/* let the arrival signal propagate */
	mutex_exit(&vdcp->lock);

	/* probe gives the count of how many entries were processed */
	DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp);

	return (0);
}


/*
 * Function:
 *	vdc_handle_ver_msg()
 *
 * Description:
 *	Handle a version negotiation message (VIO_VER_INFO) from the
 *	vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	ver_msg	- LDC message sent by vDisk server
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg)
{
	int status = 0;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) {
		return (EPROTO);
	}

	if (ver_msg->dev_class != VDEV_DISK_SERVER) {
		return (EINVAL);
	}

	switch (ver_msg->tag.vio_subtype) {
	case VIO_SUBTYPE_ACK:
		/*
		 * We check to see if the version returned is indeed supported
		 * (the server may have also adjusted the minor number
		 * downwards and if so 'ver_msg' will contain the actual
		 * version agreed).
		 */
		if (vdc_is_supported_version(ver_msg)) {
			vdc->ver.major = ver_msg->ver_major;
			vdc->ver.minor = ver_msg->ver_minor;
			ASSERT(vdc->ver.major > 0);
		} else {
			status = EPROTO;
		}
		break;

	case VIO_SUBTYPE_NACK:
		/*
		 * Call vdc_is_supported_version() which will return the next
		 * supported version (if any) in 'ver_msg'.
		 */
		(void) vdc_is_supported_version(ver_msg);
		if (ver_msg->ver_major > 0) {
			size_t len = sizeof (*ver_msg);

			ASSERT(vdc->ver.major > 0);

			/* reset the necessary fields and resend */
			ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO;
			ver_msg->dev_class = VDEV_DISK;

			status = vdc_send(vdc, (caddr_t)ver_msg, &len);
			DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n",
			    vdc->instance, status);
			if (len != sizeof (*ver_msg))
				status = EBADMSG;
		} else {
			DMSG(vdc, 0, "[%d] No common version with vDisk server",
			    vdc->instance);
			status = ENOTSUP;
		}

		break;
	case VIO_SUBTYPE_INFO:
		/*
		 * Handle the case where vds starts the handshake
		 * (for now only vdc is the instigator).
		 */
		status = ENOTSUP;
		break;

	default:
		status = EINVAL;
		break;
	}

	return (status);
}

/*
 * Function:
 *	vdc_handle_attr_msg()
 *
 * Description:
 *	Handle an attribute negotiation message (VIO_ATTR_INFO) from
 *	the vDisk server.
 *
 * Arguments:
 *	vdc		- soft state pointer for this instance of the device
 *			  driver.
 *	attr_msg	- LDC message sent by vDisk server
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg)
{
	int status = 0;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) {
		return (EPROTO);
	}

	switch (attr_msg->tag.vio_subtype) {
	case VIO_SUBTYPE_ACK:
		/*
		 * We now verify the attributes sent by vds.
		 */
		if (attr_msg->vdisk_size == 0) {
			DMSG(vdc, 0, "[%d] Invalid disk size from vds",
			    vdc->instance);
			status = EINVAL;
			break;
		}

		if (attr_msg->max_xfer_sz == 0) {
			DMSG(vdc, 0, "[%d] Invalid transfer size from vds",
			    vdc->instance);
			status = EINVAL;
			break;
		}

		if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) {
			DMSG(vdc, 0, "[%d] Unknown disk size from vds",
			    vdc->instance);
			attr_msg->vdisk_size = 0;
		}

		/*
		 * If the disk size is already set, check that it hasn't
		 * changed.
		 */
		if ((vdc->vdisk_size != 0) && (attr_msg->vdisk_size != 0) &&
		    (vdc->vdisk_size != attr_msg->vdisk_size)) {
			DMSG(vdc, 0, "[%d] Different disk size from vds "
			    "(old=0x%lx - new=0x%lx)", vdc->instance,
			    vdc->vdisk_size, attr_msg->vdisk_size);
			status = EINVAL;
			break;
		}

		vdc->vdisk_size = attr_msg->vdisk_size;
		vdc->vdisk_type = attr_msg->vdisk_type;
		vdc->operations = attr_msg->operations;
		if (vio_ver_is_supported(vdc->ver, 1, 1))
			vdc->vdisk_media = attr_msg->vdisk_media;
		else
			vdc->vdisk_media = 0;

		DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n",
		    vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz);
		DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n",
		    vdc->instance, vdc->block_size,
		    attr_msg->vdisk_block_size);

		/*
		 * We don't know at compile time what the vDisk server will
		 * think are good values but we apply a large (arbitrary)
		 * upper bound to prevent memory exhaustion in vdc if it was
		 * allocating a DRing based on huge values sent by the server.
		 * We will probably never exceed this except if the message
		 * was garbage.
		 */
		if ((attr_msg->max_xfer_sz * attr_msg->vdisk_block_size) <=
		    (PAGESIZE * DEV_BSIZE)) {
			vdc->max_xfer_sz = attr_msg->max_xfer_sz;
			vdc->block_size = attr_msg->vdisk_block_size;
		} else {
			DMSG(vdc, 0, "[%d] vds block transfer size too big;"
			    " using max supported by vdc", vdc->instance);
		}

		if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) ||
		    (attr_msg->vdisk_size > INT64_MAX) ||
		    (attr_msg->operations == 0) ||
		    (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) {
			DMSG(vdc, 0, "[%d] Invalid attributes from vds",
			    vdc->instance);
			status = EINVAL;
			break;
		}

		/*
		 * Now that we have received all attributes we can create a
		 * fake geometry for the disk.
		 */
		vdc_create_fake_geometry(vdc);
		break;

	case VIO_SUBTYPE_NACK:
		/*
		 * vds could not handle the attributes we sent so we
		 * stop negotiating.
		 */
		status = EPROTO;
		break;

	case VIO_SUBTYPE_INFO:
		/*
		 * Handle the case where vds starts the handshake
		 * (for now vdc is the only supported instigator).
		 */
		status = ENOTSUP;
		break;

	default:
		status = ENOTSUP;
		break;
	}

	return (status);
}

/*
 * Function:
 *	vdc_handle_dring_reg_msg()
 *
 * Description:
 *	Handle a descriptor ring registration message (VIO_DRING_REG)
 *	from the vDisk server.
 *
 * Arguments:
 *	vdc		- soft state pointer for this instance of the driver.
 *	dring_msg	- LDC message sent by vDisk server
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg)
{
	int status = 0;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) {
		return (EPROTO);
	}

	switch (dring_msg->tag.vio_subtype) {
	case VIO_SUBTYPE_ACK:
		/* save the received dring_ident */
		vdc->dring_ident = dring_msg->dring_ident;
		DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n",
		    vdc->instance, vdc->dring_ident);
		break;

	case VIO_SUBTYPE_NACK:
		/*
		 * vds could not handle the DRing info we sent so we
		 * stop negotiating.
		 */
		DMSG(vdc, 0, "[%d] server could not register DRing\n",
		    vdc->instance);
		status = EPROTO;
		break;

	case VIO_SUBTYPE_INFO:
		/*
		 * Handle the case where vds starts the handshake
		 * (for now only vdc is the instigator).
		 */
		status = ENOTSUP;
		break;
	default:
		status = ENOTSUP;
	}

	return (status);
}

/*
 * Function:
 *	vdc_verify_seq_num()
 *
 * Description:
 *	This function verifies that the sequence number sent back by the
 *	vDisk server with the latest message is what is expected (i.e. it is
 *	greater than the last seq num sent by the vDisk server and less than
 *	or equal to the last seq num generated by vdc).
 *
 *	It then checks the request ID to see if any requests need processing
 *	in the DRing.
 *
 * Arguments:
 *	vdc		- soft state pointer for this instance of the driver.
 *	dring_msg	- pointer to the LDC message sent by vds
 *
 * Return Code:
 *	VDC_SEQ_NUM_TODO	- Message needs to be processed
 *	VDC_SEQ_NUM_SKIP	- Message has already been processed
 *	VDC_SEQ_NUM_INVALID	- The seq numbers are so out of sync,
 *				  vdc cannot deal with them
 */
static int
vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg)
{
	ASSERT(vdc != NULL);
	ASSERT(dring_msg != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	/*
	 * Check to see if the messages were responded to in the correct
	 * order by vds.
	 */
	if ((dring_msg->seq_num <= vdc->seq_num_reply) ||
	    (dring_msg->seq_num > vdc->seq_num)) {
		DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: "
		    "%lu > expected <= %lu (last proc req %lu sent %lu)\n",
		    vdc->instance, dring_msg->seq_num,
		    vdc->seq_num_reply, vdc->seq_num,
		    vdc->req_id_proc, vdc->req_id);
		return (VDC_SEQ_NUM_INVALID);
	}
	vdc->seq_num_reply = dring_msg->seq_num;

	if (vdc->req_id_proc < vdc->req_id)
		return (VDC_SEQ_NUM_TODO);
	else
		return (VDC_SEQ_NUM_SKIP);
}


/*
 * Function:
 *	vdc_is_supported_version()
 *
 * Description:
 *	This routine checks if the major/minor version numbers specified in
 *	'ver_msg' are supported. If not it finds the next version that is
 *	in the supported version list 'vdc_version[]' and sets the fields in
 *	'ver_msg' to those values.
 *
 * Arguments:
 *	ver_msg	- LDC message sent by vDisk server
 *
 * Return Code:
 *	B_TRUE	- Success
 *	B_FALSE	- Version not supported
 */
static boolean_t
vdc_is_supported_version(vio_ver_msg_t *ver_msg)
{
	int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]);

	for (int i = 0; i < vdc_num_versions; i++) {
		ASSERT(vdc_version[i].major > 0);
		ASSERT((i == 0) ||
		    (vdc_version[i].major < vdc_version[i-1].major));

		/*
		 * If the major versions match, adjust the minor version, if
		 * necessary, down to the highest value supported by this
		 * client. The server should support all minor versions lower
		 * than the value it sent.
		 */
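		/*
		 * Worked example (with a purely hypothetical table of
		 * vdc_version[] = { {2, 0}, {1, 1} }): a ver_msg of 1.3 is
		 * trimmed to 1.1 and accepted; 1.0 is accepted as is; and
		 * 3.0 is rewritten to 2.0 and rejected so that it gets
		 * resent.
		 */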
		if (ver_msg->ver_major == vdc_version[i].major) {
			if (ver_msg->ver_minor > vdc_version[i].minor) {
				DMSGX(0,
				    "Adjusting minor version from %u to %u",
				    ver_msg->ver_minor, vdc_version[i].minor);
				ver_msg->ver_minor = vdc_version[i].minor;
			}
			return (B_TRUE);
		}

		/*
		 * If the message contains a higher major version number, set
		 * the message's major/minor versions to the current values
		 * and return false, so this message will get resent with
		 * these values, and the server will potentially try again
		 * with the same or a lower version.
		 */
		if (ver_msg->ver_major > vdc_version[i].major) {
			ver_msg->ver_major = vdc_version[i].major;
			ver_msg->ver_minor = vdc_version[i].minor;
			DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n",
			    ver_msg->ver_major, ver_msg->ver_minor);

			return (B_FALSE);
		}

		/*
		 * Otherwise, the message's major version is less than the
		 * current major version, so continue the loop to the next
		 * (lower) supported version.
		 */
	}

	/*
	 * No common version was found; "ground" the version pair in the
	 * message to terminate negotiation.
	 */
	ver_msg->ver_major = 0;
	ver_msg->ver_minor = 0;

	return (B_FALSE);
}
/* -------------------------------------------------------------------------- */

/*
 * DKIO(7I) support
 */

typedef struct vdc_dk_arg {
	struct dk_callback	dkc;
	int			mode;
	dev_t			dev;
	vdc_t			*vdc;
} vdc_dk_arg_t;

/*
 * Function:
 * 	vdc_dkio_flush_cb()
 *
 * Description:
 *	This routine is a callback for DKIOCFLUSHWRITECACHE which can be called
 *	by kernel code.
 *
 * Arguments:
 *	arg	- a pointer to a vdc_dk_arg_t structure.
 */
void
vdc_dkio_flush_cb(void *arg)
{
	struct vdc_dk_arg	*dk_arg = (struct vdc_dk_arg *)arg;
	struct dk_callback	*dkc = NULL;
	vdc_t			*vdc = NULL;
	int			rv;

	if (dk_arg == NULL) {
		cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n");
		return;
	}
	dkc = &dk_arg->dkc;
	vdc = dk_arg->vdc;
	ASSERT(vdc != NULL);

	rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0,
	    VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);
	if (rv != 0) {
		DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n",
		    vdc->instance, rv,
		    ddi_model_convert_from(dk_arg->mode & FMODELS));
	}

	/*
	 * Trigger the callback to notify the caller that the ioctl call has
	 * completed.
	 */
	if ((dk_arg->mode & FKIOCTL) &&
	    (dkc != NULL) &&
	    (dkc->dkc_callback != NULL)) {
		ASSERT(dkc->dkc_cookie != NULL);
		(*dkc->dkc_callback)(dkc->dkc_cookie, rv);
	}

	/* Indicate that one less DKIO write flush is outstanding */
	mutex_enter(&vdc->lock);
	vdc->dkio_flush_pending--;
	ASSERT(vdc->dkio_flush_pending >= 0);
	mutex_exit(&vdc->lock);

	/* free the mem that was allocated when the callback was dispatched */
	kmem_free(arg, sizeof (vdc_dk_arg_t));
}

/*
 * Function:
 *	vdc_dkio_gapart()
 *
 * Description:
 *	This function implements the DKIOCGAPART ioctl.
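 *
 *	Illustrative userland usage (fd is a hypothetical file descriptor
 *	for a vdc device; see dkio(7I)):
 *
 *		struct dk_map map[NDKMAP];
 *
 *		if (ioctl(fd, DKIOCGAPART, map) == 0) {
 *			each map[i] then holds the starting cylinder
 *			(dkl_cylno) and size in blocks (dkl_nblk) of
 *			slice i
 *		}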
 *
 * Arguments:
 *	vdc	- soft state pointer
 *	arg	- a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure
 *	flag	- ioctl flags
 */
static int
vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag)
{
	struct dk_geom *geom;
	struct vtoc *vtoc;
	union {
		struct dk_map map[NDKMAP];
		struct dk_map32 map32[NDKMAP];
	} data;
	int i, rv, size;

	mutex_enter(&vdc->lock);

	if ((rv = vdc_validate_geometry(vdc)) != 0) {
		mutex_exit(&vdc->lock);
		return (rv);
	}

	vtoc = vdc->vtoc;
	geom = vdc->geom;

	if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) {

		for (i = 0; i < vtoc->v_nparts; i++) {
			data.map32[i].dkl_cylno = vtoc->v_part[i].p_start /
			    (geom->dkg_nhead * geom->dkg_nsect);
			data.map32[i].dkl_nblk = vtoc->v_part[i].p_size;
		}
		size = NDKMAP * sizeof (struct dk_map32);

	} else {

		for (i = 0; i < vtoc->v_nparts; i++) {
			data.map[i].dkl_cylno = vtoc->v_part[i].p_start /
			    (geom->dkg_nhead * geom->dkg_nsect);
			data.map[i].dkl_nblk = vtoc->v_part[i].p_size;
		}
		size = NDKMAP * sizeof (struct dk_map);

	}

	mutex_exit(&vdc->lock);

	if (ddi_copyout(&data, arg, size, flag) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Function:
 *	vdc_dkio_partition()
 *
 * Description:
 *	This function implements the DKIOCPARTITION ioctl.
 *
 * Arguments:
 *	vdc	- soft state pointer
 *	arg	- a pointer to a struct partition64 structure
 *	flag	- ioctl flags
 */
static int
vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag)
{
	struct partition64 p64;
	efi_gpt_t *gpt;
	efi_gpe_t *gpe;
	vd_efi_dev_t edev;
	uint_t partno;
	int rv;

	if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) {
		return (EFAULT);
	}

	VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl);

	if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) {
		return (rv);
	}

	partno = p64.p_partno;

	if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) {
		vd_efi_free(&edev, gpt, gpe);
		return (ESRCH);
	}

	bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type,
	    sizeof (struct uuid));
	p64.p_start = gpe[partno].efi_gpe_StartingLBA;
	p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1;

	if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) {
		vd_efi_free(&edev, gpt, gpe);
		return (EFAULT);
	}

	vd_efi_free(&edev, gpt, gpe);
	return (0);
}

/*
 * Function:
 *	vdc_dioctl_rwcmd()
 *
 * Description:
 *	This function implements the DIOCTL_RWCMD ioctl. This ioctl is used
 *	for DKC_DIRECT disks to read or write at an absolute disk offset.
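 *
 *	Illustrative userland usage (fd and buf are hypothetical; see
 *	<sys/dktp/dadkio.h> for the structure definitions):
 *
 *		struct dadkio_rwcmd rw;
 *
 *		bzero(&rw, sizeof (rw));
 *		rw.cmd = DADKIO_RWCMD_READ;
 *		rw.blkaddr = 0;			(absolute block 0)
 *		rw.buflen = DEV_BSIZE;
 *		rw.bufaddr = buf;
 *		(void) ioctl(fd, DIOCTL_RWCMD, &rw);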
 *
 * Arguments:
 *	dev	- device
 *	arg	- a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure
 *	flag	- ioctl flags
 */
static int
vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag)
{
	struct dadkio_rwcmd32 rwcmd32;
	struct dadkio_rwcmd rwcmd;
	struct iovec aiov;
	struct uio auio;
	int rw, status;
	struct buf *buf;

	if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) {
		if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32,
		    sizeof (struct dadkio_rwcmd32), flag)) {
			return (EFAULT);
		}
		rwcmd.cmd = rwcmd32.cmd;
		rwcmd.flags = rwcmd32.flags;
		rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr;
		rwcmd.buflen = rwcmd32.buflen;
		rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr;
	} else {
		if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd,
		    sizeof (struct dadkio_rwcmd), flag)) {
			return (EFAULT);
		}
	}

	switch (rwcmd.cmd) {
	case DADKIO_RWCMD_READ:
		rw = B_READ;
		break;
	case DADKIO_RWCMD_WRITE:
		rw = B_WRITE;
		break;
	default:
		return (EINVAL);
	}

	bzero((caddr_t)&aiov, sizeof (struct iovec));
	aiov.iov_base = rwcmd.bufaddr;
	aiov.iov_len = rwcmd.buflen;

	bzero((caddr_t)&auio, sizeof (struct uio));
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE;
	auio.uio_resid = rwcmd.buflen;
	auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE;

	buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
	bioinit(buf);
	/*
	 * We use the private field of buf to specify that this is an
	 * I/O using an absolute offset.
	 */
	buf->b_private = (void *)VD_SLICE_NONE;

	status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio);

	biofini(buf);
	kmem_free(buf, sizeof (buf_t));

	return (status);
}

/*
 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated
 * buffer is returned in alloc_len. The cdb, sense, data-in and data-out
 * buffers follow the vd_scsi_t header, each rounded up to an 8-byte boundary.
 */
static vd_scsi_t *
vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len,
    int *alloc_len)
{
	vd_scsi_t *vd_scsi;
	int vd_scsi_len = VD_SCSI_SIZE;

	vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t));
	vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t));
	vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t));
	vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t));

	ASSERT(vd_scsi_len % sizeof (uint64_t) == 0);

	vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP);

	vd_scsi->cdb_len = cdb_len;
	vd_scsi->sense_len = sense_len;
	vd_scsi->datain_len = datain_len;
	vd_scsi->dataout_len = dataout_len;

	*alloc_len = vd_scsi_len;

	return (vd_scsi);
}

/*
 * Convert the status of a SCSI command to a Solaris return code.
 *
 * Arguments:
 *	vdc		- soft state pointer for this instance of the driver.
 *	vd_scsi		- The SCSI operation buffer.
 *	log_error	- indicate if an error message should be logged.
 *
 * Note that our SCSI error messages are rather primitive for the moment
 * and could be improved by decoding some data like the SCSI command and
 * the sense key.
 *
 * Return value:
 *	0		- Status is good.
 *	EACCES		- Status reports a reservation conflict.
 *	ENOTSUP		- Status reports a check condition and sense key
 *			  reports an illegal request.
 *	EIO		- Any other status.
 */
static int
vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error)
{
	int rv;
	char path_str[MAXPATHLEN];
	char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN];
	union scsi_cdb *cdb;
	struct scsi_extended_sense *sense;

	if (vd_scsi->cmd_status == STATUS_GOOD)
		/* no error */
		return (0);

	/* when the tunable vdc_scsi_log_error is true we log all errors */
	if (vdc_scsi_log_error)
		log_error = B_TRUE;

	if (log_error) {
		cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x\n",
		    ddi_pathname(vdc->dip, path_str), vdc->instance,
		    GETCMD(VD_SCSI_DATA_CDB(vd_scsi)));
	}

	/* default returned value */
	rv = EIO;

	switch (vd_scsi->cmd_status) {

	case STATUS_CHECK:
	case STATUS_TERMINATED:
		if (log_error)
			cmn_err(CE_CONT, "\tCheck Condition Error\n");

		/* check sense buffer */
		if (vd_scsi->sense_len == 0 ||
		    vd_scsi->sense_status != STATUS_GOOD) {
			if (log_error)
				cmn_err(CE_CONT, "\tNo Sense Data Available\n");
			break;
		}

		sense = VD_SCSI_DATA_SENSE(vd_scsi);

		if (log_error) {
			cmn_err(CE_CONT, "\tSense Key: 0x%x\n"
			    "\tASC: 0x%x, ASCQ: 0x%x\n",
			    scsi_sense_key((uint8_t *)sense),
			    scsi_sense_asc((uint8_t *)sense),
			    scsi_sense_ascq((uint8_t *)sense));
		}

		if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST)
			rv = ENOTSUP;
		break;

	case STATUS_BUSY:
		if (log_error)
			cmn_err(CE_NOTE, "\tDevice Busy\n");
		break;

	case STATUS_RESERVATION_CONFLICT:
		/*
		 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then
		 * a reservation conflict could be due to various reasons
		 * (incorrect keys, not registered, not reserved, etc.), so
		 * we should not panic in that case.
		 */
		cdb = VD_SCSI_DATA_CDB(vd_scsi);
		if (vdc->failfast_interval != 0 &&
		    cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN &&
		    cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) {
			/* failfast is enabled so we have to panic */
			(void) snprintf(panic_str, sizeof (panic_str),
			    VDC_RESV_CONFLICT_FMT_STR "%s",
			    ddi_pathname(vdc->dip, path_str));
			panic(panic_str);
		}
		if (log_error)
			cmn_err(CE_NOTE, "\tReservation Conflict\n");
		rv = EACCES;
		break;

	case STATUS_QFULL:
		if (log_error)
			cmn_err(CE_NOTE, "\tQueue Full\n");
		break;

	case STATUS_MET:
	case STATUS_INTERMEDIATE:
	case STATUS_SCSI2:
	case STATUS_INTERMEDIATE_MET:
	case STATUS_ACA_ACTIVE:
		if (log_error)
			cmn_err(CE_CONT,
			    "\tUnexpected SCSI status received: 0x%x\n",
			    vd_scsi->cmd_status);
		break;

	default:
		if (log_error)
			cmn_err(CE_CONT,
			    "\tInvalid SCSI status received: 0x%x\n",
			    vd_scsi->cmd_status);
		break;
	}

	return (rv);
}

/*
 * Implement the USCSICMD uscsi(7I) ioctl. This ioctl is converted to
 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI
 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is
 * converted to a VD_OP_RESET operation.
 */
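/*
 * Illustrative userland usage (fd, cdb and buf are hypothetical; see
 * uscsi(7I)):
 *
 *	struct uscsi_cmd uc;
 *
 *	bzero(&uc, sizeof (uc));
 *	uc.uscsi_cdb = (caddr_t)cdb;
 *	uc.uscsi_cdblen = CDB_GROUP0;
 *	uc.uscsi_bufaddr = buf;
 *	uc.uscsi_buflen = sizeof (buf);
 *	uc.uscsi_flags = USCSI_READ | USCSI_SILENT;
 *	(void) ioctl(fd, USCSICMD, &uc);
 */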
static int
vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode)
{
	struct uscsi_cmd uscsi;
	struct uscsi_cmd32 uscsi32;
	vd_scsi_t *vd_scsi;
	int vd_scsi_len;
	union scsi_cdb *cdb;
	struct scsi_extended_sense *sense;
	char *datain, *dataout;
	size_t cdb_len, datain_len, dataout_len, sense_len;
	int rv;

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32),
		    mode) != 0)
			return (EFAULT);
		uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi));
	} else {
		if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd),
		    mode) != 0)
			return (EFAULT);
	}

	/* a uscsi reset is converted to a VD_OP_RESET operation */
	if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN |
	    USCSI_RESET_ALL)) {
		rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC,
		    (void *)(uint64_t)mode, VIO_both_dir, B_TRUE);
		return (rv);
	}

	/* cdb buffer length */
	cdb_len = uscsi.uscsi_cdblen;

	/* data in and out buffers length */
	if (uscsi.uscsi_flags & USCSI_READ) {
		datain_len = uscsi.uscsi_buflen;
		dataout_len = 0;
	} else {
		datain_len = 0;
		dataout_len = uscsi.uscsi_buflen;
	}

	/* sense buffer length */
	if (uscsi.uscsi_flags & USCSI_RQENABLE)
		sense_len = uscsi.uscsi_rqlen;
	else
		sense_len = 0;

	/* allocate buffer for the VD_SCSICMD_OP operation */
	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
	    &vd_scsi_len);

	/*
	 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague,
	 * but basically they prevent a SCSI command from being retried in case
	 * of an error.
	 */
	if ((uscsi.uscsi_flags & USCSI_ISOLATE) ||
	    (uscsi.uscsi_flags & USCSI_DIAGNOSE))
		vd_scsi->options |= VD_SCSI_OPT_NORETRY;

	/* set task attribute */
	if (uscsi.uscsi_flags & USCSI_NOTAG) {
		vd_scsi->task_attribute = 0;
	} else {
		if (uscsi.uscsi_flags & USCSI_HEAD)
			vd_scsi->task_attribute = VD_SCSI_TASK_ACA;
		else if (uscsi.uscsi_flags & USCSI_HTAG)
			vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE;
		else if (uscsi.uscsi_flags & USCSI_OTAG)
			vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED;
		else
			vd_scsi->task_attribute = 0;
	}

	/* set timeout */
	vd_scsi->timeout = uscsi.uscsi_timeout;

	/* copy-in cdb data */
	cdb = VD_SCSI_DATA_CDB(vd_scsi);
	if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) {
		rv = EFAULT;
		goto done;
	}

	/* keep a pointer to the sense buffer */
	sense = VD_SCSI_DATA_SENSE(vd_scsi);

	/* keep a pointer to the data-in buffer */
	datain = (char *)VD_SCSI_DATA_IN(vd_scsi);

	/* copy-in request data to the data-out buffer */
	dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi);
	if (!(uscsi.uscsi_flags & USCSI_READ)) {
		if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len,
		    mode)) {
			rv = EFAULT;
			goto done;
		}
	}

	/* submit the request */
	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);

	if (rv != 0)
		goto done;

	/* update scsi status */
	uscsi.uscsi_status = vd_scsi->cmd_status;

	/* update sense data */
	if ((uscsi.uscsi_flags & USCSI_RQENABLE) &&
	    (uscsi.uscsi_status == STATUS_CHECK ||
	    uscsi.uscsi_status == STATUS_TERMINATED)) {

		uscsi.uscsi_rqstatus = vd_scsi->sense_status;

		if (uscsi.uscsi_rqstatus == STATUS_GOOD) {
			uscsi.uscsi_rqresid = uscsi.uscsi_rqlen -
			    vd_scsi->sense_len;
			if (ddi_copyout(sense, uscsi.uscsi_rqbuf,
			    vd_scsi->sense_len, mode) != 0) {
				rv = EFAULT;
				goto done;
			}
		}
	}

	/* update request data */
	if (uscsi.uscsi_status == STATUS_GOOD) {
		if (uscsi.uscsi_flags & USCSI_READ) {
			uscsi.uscsi_resid = uscsi.uscsi_buflen -
			    vd_scsi->datain_len;
			if (ddi_copyout(datain, uscsi.uscsi_bufaddr,
			    vd_scsi->datain_len, mode) != 0) {
				rv = EFAULT;
				goto done;
			}
		} else {
			uscsi.uscsi_resid = uscsi.uscsi_buflen -
			    vd_scsi->dataout_len;
		}
	}

	/* copy-out result */
	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32));
		if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32),
		    mode) != 0) {
			rv = EFAULT;
			goto done;
		}
	} else {
		if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd),
		    mode) != 0) {
			rv = EFAULT;
			goto done;
		}
	}

	/* get the return code from the SCSI command status */
	rv = vdc_scsi_status(vdc, vd_scsi,
	    !(uscsi.uscsi_flags & USCSI_SILENT));

done:
	kmem_free(vd_scsi, vd_scsi_len);
	return (rv);
}

/*
 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command.
 *
 * Arguments:
 *	cmd		- SCSI PERSISTENT IN command
 *	len		- length of the SCSI input buffer
 *	vd_scsi_len	- return the length of the allocated buffer
 *
 * Returned Value:
 *	a pointer to the allocated VD_OP_SCSICMD buffer.
 */
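/*
 * For reference: the CDB built below is a 10-byte (CDB_GROUP1) PERSISTENT
 * RESERVE IN command; byte 0 holds the opcode, byte 1 the service action
 * passed in 'cmd', and FORMG1COUNT() stores the allocation length (the
 * size of the data-in buffer).
 */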
static vd_scsi_t *
vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len)
{
	int cdb_len, sense_len, datain_len, dataout_len;
	vd_scsi_t *vd_scsi;
	union scsi_cdb *cdb;

	cdb_len = CDB_GROUP1;
	sense_len = sizeof (struct scsi_extended_sense);
	datain_len = len;
	dataout_len = 0;

	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
	    vd_scsi_len);

	cdb = VD_SCSI_DATA_CDB(vd_scsi);

	/* set cdb */
	cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN;
	cdb->cdb_opaque[1] = cmd;
	FORMG1COUNT(cdb, datain_len);

	vd_scsi->timeout = vdc_scsi_timeout;

	return (vd_scsi);
}

/*
 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command.
 *
 * Arguments:
 *	cmd		- SCSI PERSISTENT OUT command
 *	len		- length of the SCSI output buffer
 *	vd_scsi_len	- return the length of the allocated buffer
 *
 * Returned Value:
 *	a pointer to the allocated VD_OP_SCSICMD buffer.
 */
static vd_scsi_t *
vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len)
{
	int cdb_len, sense_len, datain_len, dataout_len;
	vd_scsi_t *vd_scsi;
	union scsi_cdb *cdb;

	cdb_len = CDB_GROUP1;
	sense_len = sizeof (struct scsi_extended_sense);
	datain_len = 0;
	dataout_len = len;

	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
	    vd_scsi_len);

	cdb = VD_SCSI_DATA_CDB(vd_scsi);

	/* set cdb */
	cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT;
	cdb->cdb_opaque[1] = cmd;
	FORMG1COUNT(cdb, dataout_len);

	vd_scsi->timeout = vdc_scsi_timeout;

	return (vd_scsi);
}

/*
 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted
 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk
 * server with a VD_OP_SCSICMD operation.
 */
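/*
 * Illustrative userland usage (fd and NKEYS are hypothetical; see mhd(7i)):
 *
 *	mhioc_resv_key_t keys[NKEYS];
 *	mhioc_key_list_t kl;
 *	mhioc_inkeys_t ik;
 *
 *	kl.listsize = NKEYS;
 *	kl.list = keys;
 *	ik.li = &kl;
 *	if (ioctl(fd, MHIOCGRP_INKEYS, &ik) == 0) {
 *		kl.listlen is the number of keys on the device
 *	}
 */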
static int
vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode)
{
	vd_scsi_t *vd_scsi;
	mhioc_inkeys_t inkeys;
	mhioc_key_list_t klist;
	struct mhioc_inkeys32 inkeys32;
	struct mhioc_key_list32 klist32;
	sd_prin_readkeys_t *scsi_keys;
	void *user_keys;
	int vd_scsi_len;
	int listsize, listlen, rv;

	/* copyin arguments */
	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode);
		if (rv != 0)
			return (EFAULT);

		rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32,
		    sizeof (klist32), mode);
		if (rv != 0)
			return (EFAULT);

		listsize = klist32.listsize;
	} else {
		rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode);
		if (rv != 0)
			return (EFAULT);

		rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode);
		if (rv != 0)
			return (EFAULT);

		listsize = klist.listsize;
	}

	/* build SCSI VD_OP request */
	vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS,
	    sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) +
	    (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len);

	scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi);

	/* submit the request */
	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);

	if (rv != 0)
		goto done;

	listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE;

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		inkeys32.generation = scsi_keys->generation;
		rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode);
		if (rv != 0) {
			rv = EFAULT;
			goto done;
		}

		klist32.listlen = listlen;
		rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li,
		    sizeof (klist32), mode);
		if (rv != 0) {
			rv = EFAULT;
			goto done;
		}

		user_keys = (caddr_t)(uintptr_t)klist32.list;
	} else {
		inkeys.generation = scsi_keys->generation;
		rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode);
		if (rv != 0) {
			rv = EFAULT;
			goto done;
		}

		klist.listlen = listlen;
		rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode);
		if (rv != 0) {
			rv = EFAULT;
			goto done;
		}

		user_keys = klist.list;
	}

	/* copy out keys */
	if (listlen > 0 && listsize > 0) {
		if (listsize < listlen)
			listlen = listsize;
		rv = ddi_copyout(&scsi_keys->keylist, user_keys,
		    listlen * MHIOC_RESV_KEY_SIZE, mode);
		if (rv != 0)
			rv = EFAULT;
	}

	if (rv == 0)
		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);

done:
	kmem_free(vd_scsi, vd_scsi_len);

	return (rv);
}

/*
 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted
 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to
 * the vdisk server with a VD_OP_SCSICMD operation.
 */
static int
vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode)
{
	vd_scsi_t *vd_scsi;
	mhioc_inresvs_t inresv;
	mhioc_resv_desc_list_t rlist;
	struct mhioc_inresvs32 inresv32;
	struct mhioc_resv_desc_list32 rlist32;
	mhioc_resv_desc_t mhd_resv;
	sd_prin_readresv_t *scsi_resv;
	sd_readresv_desc_t *resv;
	mhioc_resv_desc_t *user_resv;
	int vd_scsi_len;
	int listsize, listlen, i, rv;

	/* copyin arguments */
	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode);
		if (rv != 0)
			return (EFAULT);

		rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32,
		    sizeof (rlist32), mode);
		if (rv != 0)
			return (EFAULT);

		listsize = rlist32.listsize;
	} else {
		rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode);
		if (rv != 0)
			return (EFAULT);

		rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode);
		if (rv != 0)
			return (EFAULT);

		listsize = rlist.listsize;
	}

	/* build SCSI VD_OP request */
	vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV,
	    sizeof (sd_prin_readresv_t) - sizeof (caddr_t) +
	    (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len);

	scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi);

	/* submit the request */
	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);

	if (rv != 0)
		goto done;

	listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN;

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		inresv32.generation = scsi_resv->generation;
		rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode);
		if (rv != 0) {
			rv = EFAULT;
			goto done;
		}

		rlist32.listlen = listlen;
		rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li,
		    sizeof (rlist32), mode);
		if (rv != 0) {
			rv = EFAULT;
			goto done;
		}

		user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list;
	} else {
		inresv.generation = scsi_resv->generation;
		rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode);
		if (rv != 0) {
			rv = EFAULT;
			goto done;
		}

		rlist.listlen = listlen;
		rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode);
		if (rv != 0) {
			rv = EFAULT;
			goto done;
		}

		user_resv = rlist.list;
	}

	/* copy out reservations */
	if (listsize > 0 && listlen > 0) {
		if (listsize < listlen)
			listlen = listsize;
		resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc;

		for (i = 0; i < listlen; i++) {
			mhd_resv.type = resv->type;
			mhd_resv.scope = resv->scope;
			mhd_resv.scope_specific_addr =
			    BE_32(resv->scope_specific_addr);
			bcopy(&resv->resvkey, &mhd_resv.key,
			    MHIOC_RESV_KEY_SIZE);

			rv = ddi_copyout(&mhd_resv, user_resv,
			    sizeof (mhd_resv), mode);
			if (rv != 0) {
				rv = EFAULT;
				goto done;
			}
			resv++;
			user_resv++;
		}
	}

	if (rv == 0)
		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);

done:
	kmem_free(vd_scsi, vd_scsi_len);
	return (rv);
}

/*
 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted
 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk
 * server with a VD_OP_SCSICMD operation.
 */
static int
vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode)
{
	vd_scsi_t *vd_scsi;
	sd_prout_t *scsi_prout;
	mhioc_register_t mhd_reg;
	int vd_scsi_len, rv;

	/* copyin arguments */
	rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode);
	if (rv != 0)
		return (EFAULT);

	/* build SCSI VD_OP request */
	vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER,
	    sizeof (sd_prout_t), &vd_scsi_len);

	/* set parameters */
	scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi);
	bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE);
	bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE);
	scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl;

	/* submit the request */
	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);

	if (rv == 0)
		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);

	kmem_free(vd_scsi, vd_scsi_len);
	return (rv);
}

/*
 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted
 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk
 * server with a VD_OP_SCSICMD operation.
 */
static int
vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode)
{
	union scsi_cdb *cdb;
	vd_scsi_t *vd_scsi;
	sd_prout_t *scsi_prout;
	mhioc_resv_desc_t mhd_resv;
	int vd_scsi_len, rv;

	/* copyin arguments */
	rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode);
	if (rv != 0)
		return (EFAULT);

	/* build SCSI VD_OP request */
	vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE,
	    sizeof (sd_prout_t), &vd_scsi_len);

	/* set parameters */
	cdb = VD_SCSI_DATA_CDB(vd_scsi);
	scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi);
	bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE);
	scsi_prout->scope_address = mhd_resv.scope_specific_addr;
	cdb->cdb_opaque[2] = mhd_resv.type;

	/* submit the request */
	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);

	if (rv == 0)
		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);

	kmem_free(vd_scsi, vd_scsi_len);
	return (rv);
}

/*
 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is
 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which
 * is sent to the vdisk server with a VD_OP_SCSICMD operation.
 */
static int
vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode)
{
	union scsi_cdb *cdb;
	vd_scsi_t *vd_scsi;
	sd_prout_t *scsi_prout;
	mhioc_preemptandabort_t mhd_preempt;
	int vd_scsi_len, rv;

	/* copyin arguments */
	rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode);
	if (rv != 0)
		return (EFAULT);

	/* build SCSI VD_OP request */
	vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT,
	    sizeof (sd_prout_t), &vd_scsi_len);

	/* set parameters */
	vd_scsi->task_attribute = VD_SCSI_TASK_ACA;
	cdb = VD_SCSI_DATA_CDB(vd_scsi);
	scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi);
	bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key,
	    MHIOC_RESV_KEY_SIZE);
	bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key,
	    MHIOC_RESV_KEY_SIZE);
	scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr;
	cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type;

	/* submit the request */
	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);

	if (rv == 0)
		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);

	kmem_free(vd_scsi, vd_scsi_len);
	return (rv);
}

/*
 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl
 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY
 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation.
 */
static int
vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode)
{
	vd_scsi_t *vd_scsi;
	sd_prout_t *scsi_prout;
	mhioc_registerandignorekey_t mhd_regi;
	int vd_scsi_len, rv;

	/* copyin arguments */
	rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode);
	if (rv != 0)
		return (EFAULT);

	/* build SCSI VD_OP request */
	vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY,
	    sizeof (sd_prout_t), &vd_scsi_len);

	/* set parameters */
	scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi);
	bcopy(mhd_regi.newkey.key, scsi_prout->service_key,
	    MHIOC_RESV_KEY_SIZE);
	scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl;

	/* submit the request */
	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);

	if (rv == 0)
		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);

	kmem_free(vd_scsi, vd_scsi_len);
	return (rv);
}

/*
 * This function is used by the failfast mechanism to send a SCSI command
 * to check for a reservation conflict.
 */
static int
vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd)
{
	int cdb_len, sense_len, vd_scsi_len;
	vd_scsi_t *vd_scsi;
	union scsi_cdb *cdb;
	int rv;

	ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1);

	if (scmd == SCMD_WRITE_G1)
		cdb_len = CDB_GROUP1;
	else
		cdb_len = CDB_GROUP0;

	sense_len = sizeof (struct scsi_extended_sense);

	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len);

	/* set cdb */
	cdb = VD_SCSI_DATA_CDB(vd_scsi);
	cdb->scc_cmd = scmd;

	vd_scsi->timeout = vdc_scsi_timeout;

	/*
	 * Submit the request.
	 * The last argument has to be B_FALSE so that vdc_do_sync_op does
	 * not loop checking for a reservation conflict if the operation
	 * returns an error.
	 */
	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE);

	if (rv == 0)
		(void) vdc_scsi_status(vdc, vd_scsi, B_FALSE);

	kmem_free(vd_scsi, vd_scsi_len);
	return (rv);
}

/*
 * This function is used by the failfast mechanism to check for a reservation
 * conflict. It sends SCSI commands that will fail with a reservation conflict
 * error if the system no longer has access to the disk; such a failure will
 * panic the system.
 *
 * Returned Code:
 *	0	- disk is accessible without reservation conflict error
 *	!= 0	- unable to check if disk is accessible
 */
int
vdc_failfast_check_resv(vdc_t *vdc)
{
	int failure = 0;

	/*
	 * Send a TEST UNIT READY command. The command will panic
	 * the system if it fails with a reservation conflict.
	 */
	if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0)
		failure++;

	/*
	 * With SPC-3 compliant devices TEST UNIT READY will succeed on
	 * a reserved device, so we also do a zero-byte WRITE(10) in
	 * order to provoke a Reservation Conflict status on those newer
	 * devices.
	 */
	if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0)
		failure++;

	return (failure);
}

/*
 * Add a pending I/O to the failfast I/O queue. An I/O is added to this
 * queue when it has failed and failfast is enabled. Then we have to check
 * if it has failed because of a reservation conflict, in which case we
 * have to panic the system.
 *
 * Async I/O should be queued with their block I/O data transfer structure
 * (buf). Sync I/O should be queued with buf = NULL.
 */
static vdc_io_t *
vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf)
{
	vdc_io_t *vio;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP);
	vio->vio_next = vdc->failfast_io_queue;
	vio->vio_buf = buf;
	vio->vio_qtime = ddi_get_lbolt();

	vdc->failfast_io_queue = vio;

	/* notify the failfast thread that a new I/O is queued */
	cv_signal(&vdc->failfast_cv);

	return (vio);
}

/*
 * Remove and complete I/O in the failfast I/O queue which was added before
 * (or at) the indicated deadline. A deadline of 0 means that all queued
 * I/O has to be unqueued and marked as completed.
 */
static void
vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline)
{
	vdc_io_t *vio, *vio_tmp;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vio_tmp = NULL;
	vio = vdc->failfast_io_queue;

	if (deadline != 0) {
		/*
		 * Skip any I/O queued after the deadline. The failfast
		 * I/O queue is ordered starting with the last I/O added
		 * to the queue.
		 */
		while (vio != NULL && vio->vio_qtime > deadline) {
			vio_tmp = vio;
			vio = vio->vio_next;
		}
	}

	if (vio == NULL)
		/* nothing to unqueue */
		return;

	/* update the queue */
	if (vio_tmp == NULL)
		vdc->failfast_io_queue = NULL;
	else
		vio_tmp->vio_next = NULL;

	/*
	 * Complete unqueued I/O.
	 * Async I/O have a block I/O data transfer structure (buf) and
	 * they are completed by calling biodone(). Sync I/O do not have
	 * a buf and they are completed by setting the vio_qtime to zero
	 * and signaling failfast_io_cv. In that case, the thread waiting
	 * for the I/O to complete is responsible for freeing the vio
	 * structure.
	 */
	while (vio != NULL) {
		vio_tmp = vio->vio_next;
		if (vio->vio_buf != NULL) {
			VD_KSTAT_RUNQ_EXIT(vdc->io_stats);
			DTRACE_IO1(done, buf_t *, vio->vio_buf);
			biodone(vio->vio_buf);
			kmem_free(vio, sizeof (vdc_io_t));
		} else {
			vio->vio_qtime = 0;
		}
		vio = vio_tmp;
	}

	cv_broadcast(&vdc->failfast_io_cv);
}

/*
 * Failfast Thread.
 *
 * While failfast is enabled, the failfast thread sends TEST UNIT READY
 * and zero-size WRITE(10) SCSI commands on a regular basis to check that
 * we still have access to the disk. If a command fails with a RESERVATION
 * CONFLICT error then the system will immediately panic.
 *
 * The failfast thread is also woken up when an I/O has failed. It then checks
 * the access to the disk to ensure that the I/O failure was not due to a
 * reservation conflict.
 *
 * There is one failfast thread for each virtual disk for which failfast is
 * enabled. We could have only one thread sending requests for all disks but
 * this would need vdc to send asynchronous requests and to have callbacks to
 * process replies.
 */
static void
vdc_failfast_thread(void *arg)
{
	int status;
	vdc_t *vdc = (vdc_t *)arg;
	clock_t timeout, starttime;

	mutex_enter(&vdc->lock);

	while (vdc->failfast_interval != 0) {

		starttime = ddi_get_lbolt();

		mutex_exit(&vdc->lock);

		/* check for reservation conflict */
		status = vdc_failfast_check_resv(vdc);

		mutex_enter(&vdc->lock);
		/*
		 * We have dropped the lock to send the SCSI command so we have
		 * to check that failfast is still enabled.
		 */
		if (vdc->failfast_interval == 0)
			break;

		/*
		 * If we have successfully checked the disk access and there
		 * was no reservation conflict then we can complete any I/O
		 * queued before the last check.
		 */
		if (status == 0)
			vdc_failfast_io_unqueue(vdc, starttime);

		/* proceed again if some I/O are still in the queue */
		if (vdc->failfast_io_queue != NULL)
			continue;

		timeout = ddi_get_lbolt() +
		    drv_usectohz(vdc->failfast_interval);
		(void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout);
	}

	/*
	 * Failfast is being stopped so we can complete any queued I/O.
	 */
	vdc_failfast_io_unqueue(vdc, 0);
	vdc->failfast_thread = NULL;
	mutex_exit(&vdc->lock);
	thread_exit();
}

/*
 * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
 */
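/*
 * Illustrative usage (fd is hypothetical; mh_time is in milliseconds and
 * 0 disables failfast):
 *
 *	unsigned int mh_time = 1000;
 *
 *	(void) ioctl(fd, MHIOCENFAILFAST, &mh_time);
 */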
6140 /* 6141 * Implement the MHIOCENFAILFAST mhd(7i) ioctl. 6142 */ 6143 static int 6144 vdc_failfast(vdc_t *vdc, caddr_t arg, int mode) 6145 { 6146 unsigned int mh_time; 6147 6148 if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode)) 6149 return (EFAULT); 6150 6151 mutex_enter(&vdc->lock); 6152 if (mh_time != 0 && vdc->failfast_thread == NULL) { 6153 vdc->failfast_thread = thread_create(NULL, 0, 6154 vdc_failfast_thread, vdc, 0, &p0, TS_RUN, 6155 v.v_maxsyspri - 2); 6156 } 6157 6158 vdc->failfast_interval = mh_time * 1000; 6159 cv_signal(&vdc->failfast_cv); 6160 mutex_exit(&vdc->lock); 6161 6162 return (0); 6163 } 6164 6165 /* 6166 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are 6167 * converted to VD_OP_SET_ACCESS operations. 6168 */ 6169 static int 6170 vdc_access_set(vdc_t *vdc, uint64_t flags, int mode) 6171 { 6172 int rv; 6173 6174 /* submit ownership command request */ 6175 rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags, 6176 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6177 VIO_both_dir, B_TRUE); 6178 6179 return (rv); 6180 } 6181 6182 /* 6183 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a 6184 * VD_OP_GET_ACCESS operation. 6185 */ 6186 static int 6187 vdc_access_get(vdc_t *vdc, uint64_t *status, int mode) 6188 { 6189 int rv; 6190 6191 /* submit ownership command request */ 6192 rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status, 6193 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6194 VIO_both_dir, B_TRUE); 6195 6196 return (rv); 6197 } 6198 6199 /* 6200 * Disk Ownership Thread. 6201 * 6202 * When we have taken ownership of a disk, this thread waits to be 6203 * notified when the LDC channel is reset so that it can recover the 6204 * ownership. 6205 * 6206 * Note that the thread handling the LDC reset (vdc_process_msg_thread()) 6207 * cannot be used to do the ownership recovery because it has to be 6208 * running to handle the reply message to the ownership operation. 6209 */ 6210 static void 6211 vdc_ownership_thread(void *arg) 6212 { 6213 vdc_t *vdc = (vdc_t *)arg; 6214 clock_t timeout; 6215 uint64_t status; 6216 6217 mutex_enter(&vdc->ownership_lock); 6218 mutex_enter(&vdc->lock); 6219 6220 while (vdc->ownership & VDC_OWNERSHIP_WANTED) { 6221 6222 if ((vdc->ownership & VDC_OWNERSHIP_RESET) || 6223 !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) { 6224 /* 6225 * There was a reset, so the ownership has been lost; 6226 * try to recover. We do this without using the preempt 6227 * option so that we don't steal the ownership from 6228 * someone who has preempted us. 6229 */ 6230 DMSG(vdc, 0, "[%d] Ownership lost, recovering", 6231 vdc->instance); 6232 6233 vdc->ownership &= ~(VDC_OWNERSHIP_RESET | 6234 VDC_OWNERSHIP_GRANTED); 6235 6236 mutex_exit(&vdc->lock); 6237 6238 status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6239 VD_ACCESS_SET_PRESERVE, FKIOCTL); 6240 6241 mutex_enter(&vdc->lock); 6242 6243 if (status == 0) { 6244 DMSG(vdc, 0, "[%d] Ownership recovered", 6245 vdc->instance); 6246 vdc->ownership |= VDC_OWNERSHIP_GRANTED; 6247 } else { 6248 DMSG(vdc, 0, "[%d] Failed to recover ownership", 6249 vdc->instance); 6250 } 6251 6252 } 6253 6254 /* 6255 * If we have the ownership then we just wait for an event 6256 * to happen (LDC reset), otherwise we retry the recovery 6257 * after a delay.
6258 */ 6259 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) 6260 timeout = 0; 6261 else 6262 timeout = ddi_get_lbolt() + 6263 drv_usectohz(vdc_ownership_delay); 6264 6265 /* Release the ownership_lock and wait on the vdc lock */ 6266 mutex_exit(&vdc->ownership_lock); 6267 6268 if (timeout == 0) 6269 (void) cv_wait(&vdc->ownership_cv, &vdc->lock); 6270 else 6271 (void) cv_timedwait(&vdc->ownership_cv, 6272 &vdc->lock, timeout); 6273 6274 mutex_exit(&vdc->lock); 6275 6276 mutex_enter(&vdc->ownership_lock); 6277 mutex_enter(&vdc->lock); 6278 } 6279 6280 vdc->ownership_thread = NULL; 6281 mutex_exit(&vdc->lock); 6282 mutex_exit(&vdc->ownership_lock); 6283 6284 thread_exit(); 6285 } 6286 6287 static void 6288 vdc_ownership_update(vdc_t *vdc, int ownership_flags) 6289 { 6290 ASSERT(MUTEX_HELD(&vdc->ownership_lock)); 6291 6292 mutex_enter(&vdc->lock); 6293 vdc->ownership = ownership_flags; 6294 if ((vdc->ownership & VDC_OWNERSHIP_WANTED) && 6295 vdc->ownership_thread == NULL) { 6296 /* start ownership thread */ 6297 vdc->ownership_thread = thread_create(NULL, 0, 6298 vdc_ownership_thread, vdc, 0, &p0, TS_RUN, 6299 v.v_maxsyspri - 2); 6300 } else { 6301 /* notify the ownership thread */ 6302 cv_signal(&vdc->ownership_cv); 6303 } 6304 mutex_exit(&vdc->lock); 6305 } 6306 6307 /* 6308 * Get the size and the block size of a virtual disk from the vdisk server. 6309 * We need to use this operation when the vdisk_size attribute was not 6310 * available during the handshake with the vdisk server. 6311 */ 6312 static int 6313 vdc_check_capacity(vdc_t *vdc) 6314 { 6315 int rv = 0; 6316 size_t alloc_len; 6317 vd_capacity_t *vd_cap; 6318 6319 if (vdc->vdisk_size != 0) 6320 return (0); 6321 6322 alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t)); 6323 6324 vd_cap = kmem_zalloc(alloc_len, KM_SLEEP); 6325 6326 rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len, 6327 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE); 6328 6329 if (rv == 0) { 6330 if (vd_cap->vdisk_block_size != vdc->block_size || 6331 vd_cap->vdisk_size == VD_SIZE_UNKNOWN || 6332 vd_cap->vdisk_size == 0) 6333 rv = EINVAL; 6334 else 6335 vdc->vdisk_size = vd_cap->vdisk_size; 6336 } 6337 6338 kmem_free(vd_cap, alloc_len); 6339 return (rv); 6340 } 6341 6342 /* 6343 * This structure is used in the DKIO(7I) array below. 
6344 */ 6345 typedef struct vdc_dk_ioctl { 6346 uint8_t op; /* VD_OP_XXX value */ 6347 int cmd; /* Solaris ioctl operation number */ 6348 size_t nbytes; /* size of structure to be copied */ 6349 6350 /* function to convert between vDisk and Solaris structure formats */ 6351 int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg, 6352 int mode, int dir); 6353 } vdc_dk_ioctl_t; 6354 6355 /* 6356 * Subset of DKIO(7I) operations currently supported 6357 */ 6358 static vdc_dk_ioctl_t dk_ioctl[] = { 6359 {VD_OP_FLUSH, DKIOCFLUSHWRITECACHE, 0, 6360 vdc_null_copy_func}, 6361 {VD_OP_GET_WCE, DKIOCGETWCE, sizeof (int), 6362 vdc_get_wce_convert}, 6363 {VD_OP_SET_WCE, DKIOCSETWCE, sizeof (int), 6364 vdc_set_wce_convert}, 6365 {VD_OP_GET_VTOC, DKIOCGVTOC, sizeof (vd_vtoc_t), 6366 vdc_get_vtoc_convert}, 6367 {VD_OP_SET_VTOC, DKIOCSVTOC, sizeof (vd_vtoc_t), 6368 vdc_set_vtoc_convert}, 6369 {VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t), 6370 vdc_get_geom_convert}, 6371 {VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, sizeof (vd_geom_t), 6372 vdc_get_geom_convert}, 6373 {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t), 6374 vdc_get_geom_convert}, 6375 {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t), 6376 vdc_set_geom_convert}, 6377 {VD_OP_GET_EFI, DKIOCGETEFI, 0, 6378 vdc_get_efi_convert}, 6379 {VD_OP_SET_EFI, DKIOCSETEFI, 0, 6380 vdc_set_efi_convert}, 6381 6382 /* DIOCTL_RWCMD is converted to a read or a write */ 6383 {0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL}, 6384 6385 /* mhd(7I) non-shared multihost disks ioctls */ 6386 {0, MHIOCTKOWN, 0, vdc_null_copy_func}, 6387 {0, MHIOCRELEASE, 0, vdc_null_copy_func}, 6388 {0, MHIOCSTATUS, 0, vdc_null_copy_func}, 6389 {0, MHIOCQRESERVE, 0, vdc_null_copy_func}, 6390 6391 /* mhd(7I) shared multihost disks ioctls */ 6392 {0, MHIOCGRP_INKEYS, 0, vdc_null_copy_func}, 6393 {0, MHIOCGRP_INRESV, 0, vdc_null_copy_func}, 6394 {0, MHIOCGRP_REGISTER, 0, vdc_null_copy_func}, 6395 {0, MHIOCGRP_RESERVE, 0, vdc_null_copy_func}, 6396 {0, MHIOCGRP_PREEMPTANDABORT, 0, vdc_null_copy_func}, 6397 {0, MHIOCGRP_REGISTERANDIGNOREKEY, 0, vdc_null_copy_func}, 6398 6399 /* mhd(7I) failfast ioctl */ 6400 {0, MHIOCENFAILFAST, 0, vdc_null_copy_func}, 6401 6402 /* 6403 * These particular ioctls are not sent to the server - vdc fakes up 6404 * the necessary info. 6405 */ 6406 {0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func}, 6407 {0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func}, 6408 {0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func}, 6409 {0, DKIOCPARTITION, 0, vdc_null_copy_func}, 6410 {0, DKIOCGAPART, 0, vdc_null_copy_func}, 6411 {0, DKIOCREMOVABLE, 0, vdc_null_copy_func}, 6412 {0, CDROMREADOFFSET, 0, vdc_null_copy_func} 6413 }; 6414
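/*
 * Editorial sketch (not part of the original source): the dk_ioctl[]
 * table above is the single place where a new DKIO(7I) operation would
 * be wired up. A hypothetical entry for an ioctl DKIOCEXAMPLE carried
 * by a VD_OP_EXAMPLE operation would look like:
 *
 *	{VD_OP_EXAMPLE, DKIOCEXAMPLE, sizeof (struct dk_example),
 *	    vdc_example_convert},
 *
 * where vdc_example_convert() follows the convert callback contract
 * used by vd_process_ioctl(): when dir == VD_COPYIN, translate the
 * Solaris ioctl argument into the vDisk buffer; when dir == VD_COPYOUT,
 * translate the vDisk reply back and copy it out to the caller;
 * return 0 or an errno value. All of the example names are hypothetical.
 */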
6415 /* 6416 * This function handles ioctl requests from the vd_efi_alloc_and_read() 6417 * function and forwards them to the vdisk. 6418 */ 6419 static int 6420 vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg) 6421 { 6422 vdc_t *vdc = (vdc_t *)vdisk; 6423 dev_t dev; 6424 int rval; 6425 6426 dev = makedevice(ddi_driver_major(vdc->dip), 6427 VD_MAKE_DEV(vdc->instance, 0)); 6428 6429 return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval)); 6430 } 6431 6432 /* 6433 * Function: 6434 * vd_process_ioctl() 6435 * 6436 * Description: 6437 * This routine processes disk-specific ioctl calls. 6438 * 6439 * Arguments: 6440 * dev - the device number 6441 * cmd - the operation [dkio(7I)] to be processed 6442 * arg - pointer to user-provided structure 6443 * (contains data to be set or reference parameter for get) 6444 * mode - bit flag, indicating open settings, 32/64 bit type, etc 6445 * rvalp - pointer to return value for calling process. 6446 * 6447 * Return Code: 6448 * 0 6449 * EFAULT 6450 * ENXIO 6451 * EIO 6452 * ENOTSUP 6453 */ 6454 static int 6455 vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp) 6456 { 6457 int instance = VDCUNIT(dev); 6458 vdc_t *vdc = NULL; 6459 int rv = -1; 6460 int idx = 0; /* index into dk_ioctl[] */ 6461 size_t len = 0; /* #bytes to send to vds */ 6462 size_t alloc_len = 0; /* #bytes to allocate mem for */ 6463 caddr_t mem_p = NULL; 6464 size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0])); 6465 vdc_dk_ioctl_t *iop; 6466 6467 vdc = ddi_get_soft_state(vdc_state, instance); 6468 if (vdc == NULL) { 6469 cmn_err(CE_NOTE, "![%d] Could not get soft state structure", 6470 instance); 6471 return (ENXIO); 6472 } 6473 6474 DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n", 6475 instance, cmd, dev, ddi_model_convert_from(mode & FMODELS)); 6476 6477 if (rvalp != NULL) { 6478 /* the return value of the ioctl is 0 by default */ 6479 *rvalp = 0; 6480 } 6481 6482 /* 6483 * Validate the ioctl operation to be performed. 6484 * 6485 * If we have looped through the array without finding a match then we 6486 * don't support this ioctl.
6487 */ 6488 for (idx = 0; idx < nioctls; idx++) { 6489 if (cmd == dk_ioctl[idx].cmd) 6490 break; 6491 } 6492 6493 if (idx >= nioctls) { 6494 DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n", 6495 vdc->instance, cmd); 6496 return (ENOTSUP); 6497 } 6498 6499 iop = &(dk_ioctl[idx]); 6500 6501 if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) { 6502 /* size is not fixed for EFI ioctls, it depends on ioctl arg */ 6503 dk_efi_t dk_efi; 6504 6505 rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode); 6506 if (rv != 0) 6507 return (EFAULT); 6508 6509 len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length; 6510 } else { 6511 len = iop->nbytes; 6512 } 6513 6514 /* check if the ioctl is applicable */ 6515 switch (cmd) { 6516 case CDROMREADOFFSET: 6517 case DKIOCREMOVABLE: 6518 return (ENOTTY); 6519 6520 case USCSICMD: 6521 case MHIOCTKOWN: 6522 case MHIOCSTATUS: 6523 case MHIOCQRESERVE: 6524 case MHIOCRELEASE: 6525 case MHIOCGRP_INKEYS: 6526 case MHIOCGRP_INRESV: 6527 case MHIOCGRP_REGISTER: 6528 case MHIOCGRP_RESERVE: 6529 case MHIOCGRP_PREEMPTANDABORT: 6530 case MHIOCGRP_REGISTERANDIGNOREKEY: 6531 case MHIOCENFAILFAST: 6532 if (vdc->cinfo == NULL) 6533 return (ENXIO); 6534 if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS) 6535 return (ENOTTY); 6536 break; 6537 6538 case DIOCTL_RWCMD: 6539 if (vdc->cinfo == NULL) 6540 return (ENXIO); 6541 if (vdc->cinfo->dki_ctype != DKC_DIRECT) 6542 return (ENOTTY); 6543 break; 6544 6545 case DKIOCINFO: 6546 if (vdc->cinfo == NULL) 6547 return (ENXIO); 6548 break; 6549 6550 case DKIOCGMEDIAINFO: 6551 if (vdc->minfo == NULL) 6552 return (ENXIO); 6553 if (vdc_check_capacity(vdc) != 0) 6554 /* disk capacity is not available */ 6555 return (EIO); 6556 break; 6557 } 6558 6559 /* 6560 * Deal with ioctls which require processing beyond simply 6561 * converting the ioctl arguments and sending a corresponding 6562 * VD operation. 6563 */ 6564 switch (cmd) { 6565 6566 case USCSICMD: 6567 { 6568 return (vdc_uscsi_cmd(vdc, arg, mode)); 6569 } 6570 6571 case MHIOCTKOWN: 6572 { 6573 mutex_enter(&vdc->ownership_lock); 6574 /* 6575 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership 6576 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset 6577 * while we are processing the ioctl. 6578 */ 6579 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED); 6580 6581 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6582 VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode); 6583 if (rv == 0) { 6584 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED | 6585 VDC_OWNERSHIP_GRANTED); 6586 } else { 6587 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6588 } 6589 mutex_exit(&vdc->ownership_lock); 6590 return (rv); 6591 } 6592 6593 case MHIOCRELEASE: 6594 { 6595 mutex_enter(&vdc->ownership_lock); 6596 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode); 6597 if (rv == 0) { 6598 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6599 } 6600 mutex_exit(&vdc->ownership_lock); 6601 return (rv); 6602 } 6603 6604 case MHIOCSTATUS: 6605 { 6606 uint64_t status; 6607 6608 rv = vdc_access_get(vdc, &status, mode); 6609 if (rv == 0 && rvalp != NULL) 6610 *rvalp = (status & VD_ACCESS_ALLOWED) ? 0 : 1; 6611 return (rv); 6612 }
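/*
 * Editorial sketch (not part of the original source): taken together,
 * the three cases above implement the usual mhd(7i) non-shared
 * ownership cycle that a userland agent would drive. The descriptor
 * below is hypothetical.
 *
 *	(void) ioctl(fd, MHIOCTKOWN, NULL);	take exclusive access
 *	... perform I/O while owning the disk ...
 *	if (ioctl(fd, MHIOCSTATUS, NULL) == 1)
 *		... the disk is reserved by another host ...
 *	(void) ioctl(fd, MHIOCRELEASE, NULL);	give up exclusive access
 *
 * On success, ioctl(MHIOCSTATUS) returns the rval set above: 0 when
 * access is still allowed, 1 when it has been lost to another host.
 */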
6613 6614 case MHIOCQRESERVE: 6615 { 6616 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode); 6617 return (rv); 6618 } 6619 6620 case MHIOCGRP_INKEYS: 6621 { 6622 return (vdc_mhd_inkeys(vdc, arg, mode)); 6623 } 6624 6625 case MHIOCGRP_INRESV: 6626 { 6627 return (vdc_mhd_inresv(vdc, arg, mode)); 6628 } 6629 6630 case MHIOCGRP_REGISTER: 6631 { 6632 return (vdc_mhd_register(vdc, arg, mode)); 6633 } 6634 6635 case MHIOCGRP_RESERVE: 6636 { 6637 return (vdc_mhd_reserve(vdc, arg, mode)); 6638 } 6639 6640 case MHIOCGRP_PREEMPTANDABORT: 6641 { 6642 return (vdc_mhd_preemptabort(vdc, arg, mode)); 6643 } 6644 6645 case MHIOCGRP_REGISTERANDIGNOREKEY: 6646 { 6647 return (vdc_mhd_registerignore(vdc, arg, mode)); 6648 } 6649 6650 case MHIOCENFAILFAST: 6651 { 6652 rv = vdc_failfast(vdc, arg, mode); 6653 return (rv); 6654 } 6655 6656 case DIOCTL_RWCMD: 6657 { 6658 return (vdc_dioctl_rwcmd(dev, arg, mode)); 6659 } 6660 6661 case DKIOCGAPART: 6662 { 6663 return (vdc_dkio_gapart(vdc, arg, mode)); 6664 } 6665 6666 case DKIOCPARTITION: 6667 { 6668 return (vdc_dkio_partition(vdc, arg, mode)); 6669 } 6670 6671 case DKIOCINFO: 6672 { 6673 struct dk_cinfo cinfo; 6674 6675 bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo)); 6676 cinfo.dki_partition = VDCPART(dev); 6677 6678 rv = ddi_copyout(&cinfo, (void *)arg, 6679 sizeof (struct dk_cinfo), mode); 6680 if (rv != 0) 6681 return (EFAULT); 6682 6683 return (0); 6684 } 6685 6686 case DKIOCGMEDIAINFO: 6687 { 6688 ASSERT(vdc->vdisk_size != 0); 6689 if (vdc->minfo->dki_capacity == 0) 6690 vdc->minfo->dki_capacity = vdc->vdisk_size; 6691 rv = ddi_copyout(vdc->minfo, (void *)arg, 6692 sizeof (struct dk_minfo), mode); 6693 if (rv != 0) 6694 return (EFAULT); 6695 6696 return (0); 6697 } 6698 6699 case DKIOCFLUSHWRITECACHE: 6700 { 6701 struct dk_callback *dkc = 6702 (struct dk_callback *)(uintptr_t)arg; 6703 vdc_dk_arg_t *dkarg = NULL; 6704 6705 DMSG(vdc, 1, "[%d] Flush W$: mode %x\n", 6706 instance, mode); 6707 6708 /* 6709 * If arg is NULL, then there is no callback function 6710 * registered and the call operates synchronously; we 6711 * break and continue with the rest of the function and 6712 * wait for vds to return (i.e. after the request to 6713 * vds returns successfully, all writes completed prior 6714 * to the ioctl will have been flushed from the disk 6715 * write cache to persistent media). 6716 * 6717 * If a callback function is registered, we dispatch 6718 * the request on a task queue and return immediately. 6719 * The callback will deal with informing the calling 6720 * thread that the flush request is completed. 6721 */ 6722 if (dkc == NULL) 6723 break; 6724 6725 /* 6726 * the asynchronous callback is only supported if 6727 * invoked from within the kernel 6728 */ 6729 if ((mode & FKIOCTL) == 0) 6730 return (ENOTSUP); 6731 6732 dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP); 6733 6734 dkarg->mode = mode; 6735 dkarg->dev = dev; 6736 bcopy(dkc, &dkarg->dkc, sizeof (*dkc)); 6737 6738 mutex_enter(&vdc->lock); 6739 vdc->dkio_flush_pending++; 6740 dkarg->vdc = vdc; 6741 mutex_exit(&vdc->lock); 6742 6743 /* put the request on a task queue */ 6744 rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb, 6745 (void *)dkarg, DDI_SLEEP); 6746 if (rv == NULL) { 6747 /* clean up if dispatch fails */ 6748 mutex_enter(&vdc->lock); 6749 vdc->dkio_flush_pending--; 6750 mutex_exit(&vdc->lock); 6751 kmem_free(dkarg, sizeof (vdc_dk_arg_t)); 6752 } 6753 6754 return (rv == NULL ? ENOMEM : 0); 6755 } 6756 }
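/*
 * Editorial sketch (not part of the original source): the asynchronous
 * flush path above is only available to in-kernel callers (FKIOCTL). A
 * hypothetical kernel consumer holding an LDI handle could issue a
 * non-blocking flush as follows; vdc_dkio_flush_cb() later invokes the
 * callback with the flush status. Only the DDI/LDI routines named here
 * are real; the other names are made up.
 *
 *	static void
 *	my_flush_done(void *cookie, int error)
 *	{
 *		... the flush completed with "error" ...
 *	}
 *
 *	struct dk_callback dkc;
 *	int rval;
 *
 *	dkc.dkc_callback = my_flush_done;
 *	dkc.dkc_cookie = my_state;
 *	(void) ldi_ioctl(lh, DKIOCFLUSHWRITECACHE, (intptr_t)&dkc,
 *	    FKIOCTL, kcred, &rval);
 *
 * vdc copies the dk_callback (see the bcopy above), so the structure
 * may live on the caller's stack.
 */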
6757 6758 /* catch programming error in vdc - should be a VD_OP_XXX ioctl */ 6759 ASSERT(iop->op != 0); 6760 6761 /* check if the vDisk server handles the operation for this vDisk */ 6762 if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) { 6763 DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n", 6764 vdc->instance, iop->op); 6765 return (ENOTSUP); 6766 } 6767 6768 /* LDC requires that the memory being mapped is 8-byte aligned */ 6769 alloc_len = P2ROUNDUP(len, sizeof (uint64_t)); 6770 DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n", 6771 instance, len, alloc_len); 6772 6773 if (alloc_len > 0) 6774 mem_p = kmem_zalloc(alloc_len, KM_SLEEP); 6775 6776 /* 6777 * Call the conversion function for this ioctl which, if necessary, 6778 * converts from the Solaris format to the format ARC'ed 6779 * as part of the vDisk protocol (FWARC 2006/195) 6780 */ 6781 ASSERT(iop->convert != NULL); 6782 rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN); 6783 if (rv != 0) { 6784 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 6785 instance, rv, cmd); 6786 if (mem_p != NULL) 6787 kmem_free(mem_p, alloc_len); 6788 return (rv); 6789 } 6790 6791 /* 6792 * send request to vds to service the ioctl. 6793 */ 6794 rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len, 6795 VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode, 6796 VIO_both_dir, B_TRUE); 6797 6798 if (rv != 0) { 6799 /* 6800 * This is not necessarily an error. The ioctl could 6801 * be returning a value such as ENOTTY to indicate 6802 * that the ioctl is not applicable. 6803 */ 6804 DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n", 6805 instance, rv, cmd); 6806 if (mem_p != NULL) 6807 kmem_free(mem_p, alloc_len); 6808 6809 return (rv); 6810 } 6811 6812 /* 6813 * Call the conversion function (if it exists) for this ioctl 6814 * which converts from the format ARC'ed as part of the vDisk 6815 * protocol (FWARC 2006/195) back to a format understood by 6816 * the rest of Solaris.
6817 */ 6818 rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT); 6819 if (rv != 0) { 6820 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 6821 instance, rv, cmd); 6822 if (mem_p != NULL) 6823 kmem_free(mem_p, alloc_len); 6824 return (rv); 6825 } 6826 6827 if (mem_p != NULL) 6828 kmem_free(mem_p, alloc_len); 6829 6830 return (rv); 6831 } 6832 6833 /* 6834 * Function: 6835 * vdc_null_copy_func() 6836 * Description: 6837 * This is an empty conversion function used by ioctl calls which 6838 * do not need to convert the data being passed in/out to userland 6839 */ 6840 static int 6841 vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir) 6842 { 6843 _NOTE(ARGUNUSED(vdc)) 6844 _NOTE(ARGUNUSED(from)) 6845 _NOTE(ARGUNUSED(to)) 6846 _NOTE(ARGUNUSED(mode)) 6847 _NOTE(ARGUNUSED(dir)) 6848 6849 return (0); 6850 } 6851 6852 static int 6853 vdc_get_wce_convert(vdc_t *vdc, void *from, void *to, 6854 int mode, int dir) 6855 { 6856 _NOTE(ARGUNUSED(vdc)) 6857 6858 if (dir == VD_COPYIN) 6859 return (0); /* nothing to do */ 6860 6861 if (ddi_copyout(from, to, sizeof (int), mode) != 0) 6862 return (EFAULT); 6863 6864 return (0); 6865 } 6866 6867 static int 6868 vdc_set_wce_convert(vdc_t *vdc, void *from, void *to, 6869 int mode, int dir) 6870 { 6871 _NOTE(ARGUNUSED(vdc)) 6872 6873 if (dir == VD_COPYOUT) 6874 return (0); /* nothing to do */ 6875 6876 if (ddi_copyin(from, to, sizeof (int), mode) != 0) 6877 return (EFAULT); 6878 6879 return (0); 6880 } 6881
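/*
 * Editorial sketch (not part of the original source): the two WCE
 * convert functions above back the dkio(7I) write cache ioctls. A
 * hypothetical userland caller would use them as:
 *
 *	int wce;
 *	if (ioctl(fd, DKIOCGETWCE, &wce) == 0 && wce != 0)
 *		... the vdisk write cache is enabled ...
 *	wce = 0;
 *	(void) ioctl(fd, DKIOCSETWCE, &wce);	disable write caching
 */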
6882 /* 6883 * Function: 6884 * vdc_get_vtoc_convert() 6885 * 6886 * Description: 6887 * This routine performs the necessary conversions from the vDisk VTOC 6888 * format defined in FWARC 2006/195 to the Solaris DKIOCGVTOC structure. 6889 * 6890 * In the struct vtoc definition, the timestamp field is marked as not 6891 * supported so it is not part of vDisk protocol (FWARC 2006/195). 6892 * However SVM uses that field to check that it can write into the VTOC, 6893 * so we fake up the info of that field. 6894 * 6895 * Arguments: 6896 * vdc - the vDisk client 6897 * from - the buffer containing the data to be copied from 6898 * to - the buffer to be copied to 6899 * mode - flags passed to ioctl() call 6900 * dir - the "direction" of the copy - VD_COPYIN or VD_COPYOUT 6901 * 6902 * Return Code: 6903 * 0 - Success 6904 * ENXIO - incorrect buffer passed in. 6905 * EFAULT - ddi_copyout routine encountered an error. 6906 */ 6907 static int 6908 vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 6909 { 6910 int i; 6911 void *tmp_mem = NULL; 6912 void *tmp_memp; 6913 struct vtoc vt; 6914 struct vtoc32 vt32; 6915 int copy_len = 0; 6916 int rv = 0; 6917 6918 if (dir != VD_COPYOUT) 6919 return (0); /* nothing to do */ 6920 6921 if ((from == NULL) || (to == NULL)) 6922 return (ENXIO); 6923 6924 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) 6925 copy_len = sizeof (struct vtoc32); 6926 else 6927 copy_len = sizeof (struct vtoc); 6928 6929 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 6930 6931 VD_VTOC2VTOC((vd_vtoc_t *)from, &vt); 6932 6933 /* fake the VTOC timestamp field */ 6934 for (i = 0; i < V_NUMPAR; i++) { 6935 vt.timestamp[i] = vdc->vtoc->timestamp[i]; 6936 } 6937 6938 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 6939 /* LINTED E_ASSIGN_NARROW_CONV */ 6940 vtoctovtoc32(vt, vt32); 6941 tmp_memp = &vt32; 6942 } else { 6943 tmp_memp = &vt; 6944 } 6945 rv = ddi_copyout(tmp_memp, to, copy_len, mode); 6946 if (rv != 0) 6947 rv = EFAULT; 6948 6949 kmem_free(tmp_mem, copy_len); 6950 return (rv); 6951 } 6952 6953 /* 6954 * Function: 6955 * vdc_set_vtoc_convert() 6956 * 6957 * Description: 6958 * This routine performs the necessary conversions from the DKIOCSVTOC 6959 * Solaris structure to the format defined in FWARC 2006/195. 6960 * 6961 * Arguments: 6962 * vdc - the vDisk client 6963 * from - Buffer with data 6964 * to - Buffer where data is to be copied to 6965 * mode - flags passed to ioctl 6966 * dir - direction of copy (in or out) 6967 * 6968 * Return Code: 6969 * 0 - Success 6970 * ENXIO - Invalid buffer passed in 6971 * EFAULT - ddi_copyin of data failed 6972 */ 6973 static int 6974 vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 6975 { 6976 6977 6978 void *tmp_mem = NULL, *uvtoc; 6979 struct vtoc vt; 6980 struct vtoc *vtp = &vt; 6981 vd_vtoc_t vtvd; 6982 int copy_len = 0; 6983 int i, rv = 0; 6984 6985 if ((from == NULL) || (to == NULL)) 6986 return (ENXIO); 6987 6988 if (dir == VD_COPYIN) 6989 uvtoc = from; 6990 else 6991 uvtoc = to; 6992 6993 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) 6994 copy_len = sizeof (struct vtoc32); 6995 else 6996 copy_len = sizeof (struct vtoc); 6997 6998 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 6999 7000 rv = ddi_copyin(uvtoc, tmp_mem, copy_len, mode); 7001 if (rv != 0) { 7002 kmem_free(tmp_mem, copy_len); 7003 return (EFAULT); 7004 } 7005 7006 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7007 vtoc32tovtoc((*(struct vtoc32 *)tmp_mem), vt); 7008 } else { 7009 vtp = tmp_mem; 7010 } 7011 7012 if (dir == VD_COPYOUT) { 7013 /* 7014 * The disk label may have changed. Revalidate the disk 7015 * geometry. This will also update the device nodes and 7016 * properties. 7017 */ 7018 vdc_validate(vdc); 7019 7020 /* 7021 * We also need to keep track of the timestamp fields.
7022 */ 7023 for (i = 0; i < V_NUMPAR; i++) { 7024 vdc->vtoc->timestamp[i] = vtp->timestamp[i]; 7025 } 7026 7027 kmem_free(tmp_mem, copy_len); /* do not leak the copyin buffer */ return (0); 7028 } 7029 7030 VTOC2VD_VTOC(vtp, &vtvd); 7031 bcopy(&vtvd, to, sizeof (vd_vtoc_t)); 7032 kmem_free(tmp_mem, copy_len); 7033 7034 return (0); 7035 } 7036 7037 /* 7038 * Function: 7039 * vdc_get_geom_convert() 7040 * 7041 * Description: 7042 * This routine performs the necessary conversions from the format 7043 * defined in FWARC 2006/195 to the Solaris structures used by the 7044 * DKIOCGGEOM, DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM ioctls. 7045 * 7046 * Arguments: 7047 * vdc - the vDisk client 7048 * from - Buffer with data 7049 * to - Buffer where data is to be copied to 7050 * mode - flags passed to ioctl 7051 * dir - direction of copy (in or out) 7052 * 7053 * Return Code: 7054 * 0 - Success 7055 * ENXIO - Invalid buffer passed in 7056 * EFAULT - ddi_copyout of data failed 7057 */ 7058 static int 7059 vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7060 { 7061 _NOTE(ARGUNUSED(vdc)) 7062 7063 struct dk_geom geom; 7064 int copy_len = sizeof (struct dk_geom); 7065 int rv = 0; 7066 7067 if (dir != VD_COPYOUT) 7068 return (0); /* nothing to do */ 7069 7070 if ((from == NULL) || (to == NULL)) 7071 return (ENXIO); 7072 7073 VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom); 7074 rv = ddi_copyout(&geom, to, copy_len, mode); 7075 if (rv != 0) 7076 rv = EFAULT; 7077 7078 return (rv); 7079 } 7080 7081 /* 7082 * Function: 7083 * vdc_set_geom_convert() 7084 * 7085 * Description: 7086 * This routine performs the necessary conversions from the DKIOCSGEOM 7087 * Solaris structure to the format defined in FWARC 2006/195. 7088 * 7089 * Arguments: 7090 * vdc - the vDisk client 7091 * from - Buffer with data 7092 * to - Buffer where data is to be copied to 7093 * mode - flags passed to ioctl 7094 * dir - direction of copy (in or out) 7095 * 7096 * Return Code: 7097 * 0 - Success 7098 * ENXIO - Invalid buffer passed in 7099 * EFAULT - ddi_copyin of data failed 7100 */ 7101 static int 7102 vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7103 { 7104 _NOTE(ARGUNUSED(vdc)) 7105 7106 vd_geom_t vdgeom; 7107 void *tmp_mem = NULL; 7108 int copy_len = sizeof (struct dk_geom); 7109 int rv = 0; 7110 7111 if (dir != VD_COPYIN) 7112 return (0); /* nothing to do */ 7113 7114 if ((from == NULL) || (to == NULL)) 7115 return (ENXIO); 7116 7117 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7118 7119 rv = ddi_copyin(from, tmp_mem, copy_len, mode); 7120 if (rv != 0) { 7121 kmem_free(tmp_mem, copy_len); 7122 return (EFAULT); 7123 } 7124 DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom); 7125 bcopy(&vdgeom, to, sizeof (vdgeom)); 7126 kmem_free(tmp_mem, copy_len); 7127 7128 return (0); 7129 } 7130 7131 static int 7132 vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7133 { 7134 _NOTE(ARGUNUSED(vdc)) 7135 7136 vd_efi_t *vd_efi; 7137 dk_efi_t dk_efi; 7138 int rv = 0; 7139 void *uaddr; 7140 7141 if ((from == NULL) || (to == NULL)) 7142 return (ENXIO); 7143 7144 if (dir == VD_COPYIN) { 7145 7146 vd_efi = (vd_efi_t *)to; 7147 7148 rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode); 7149 if (rv != 0) 7150 return (EFAULT); 7151 7152 vd_efi->lba = dk_efi.dki_lba; 7153 vd_efi->length = dk_efi.dki_length; 7154 bzero(vd_efi->data, vd_efi->length); 7155 7156 } else { 7157 7158 rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode); 7159 if (rv != 0) 7160 return (EFAULT); 7161 7162 uaddr = dk_efi.dki_data; 7163 7164 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);
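/*
 * Editorial note (not part of the original source): dki_data in the
 * caller's dk_efi_t is a pointer into the caller's address space. It
 * was saved in uaddr and replaced with a kernel buffer so that
 * VD_EFI2DK_EFI() below can unpack the flat vd_efi_t reply into it;
 * the buffer is then copied out to uaddr and freed.
 */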
7165 7166 VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi); 7167 7168 rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length, 7169 mode); 7170 if (rv != 0) { /* free the kernel buffer on the error path too */ kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7171 return (EFAULT); } 7172 7173 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7174 } 7175 7176 return (0); 7177 } 7178 7179 static int 7180 vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7181 { 7182 7183 7184 dk_efi_t dk_efi; 7185 void *uaddr; 7186 7187 if (dir == VD_COPYOUT) { 7188 /* 7189 * The disk label may have changed. Revalidate the disk 7190 * geometry. This will also update the device nodes and 7191 * properties. 7192 */ 7193 vdc_validate(vdc); 7194 return (0); 7195 } 7196 7197 if ((from == NULL) || (to == NULL)) 7198 return (ENXIO); 7199 7200 if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0) 7201 return (EFAULT); 7202 7203 uaddr = dk_efi.dki_data; 7204 7205 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 7206 7207 if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) { /* do not leak the kernel buffer on error */ kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7208 return (EFAULT); } 7209 7210 DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to); 7211 7212 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7213 7214 return (0); 7215 } 7216 7217 7218 /* -------------------------------------------------------------------------- */ 7219 7220 /* 7221 * Function: 7222 * vdc_create_fake_geometry() 7223 * 7224 * Description: 7225 * This routine fakes up the disk info needed for some DKIO ioctls such 7226 * as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do] 7227 * 7228 * Note: This function must not be called until the vDisk attributes have 7229 * been exchanged as part of the handshake with the vDisk server. 7230 * 7231 * Arguments: 7232 * vdc - soft state pointer for this instance of the device driver. 7233 * 7234 * Return Code: 7235 * none. 7236 */ 7237 static void 7238 vdc_create_fake_geometry(vdc_t *vdc) 7239 { 7240 ASSERT(vdc != NULL); 7241 ASSERT(vdc->max_xfer_sz != 0); 7242 7243 /* 7244 * DKIOCINFO support 7245 */ 7246 if (vdc->cinfo == NULL) 7247 vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); 7248 7249 (void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME); 7250 (void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME); 7251 /* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */ 7252 vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz; 7253 7254 /* 7255 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD 7256 * operation is supported, otherwise the controller type is DKC_DIRECT. 7257 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the 7258 * controller type is always DKC_DIRECT in that case.
7259 * 7260 * If the virtual disk is backed by a physical CD/DVD device or 7261 * an ISO image, modify the controller type to indicate this 7262 */ 7263 switch (vdc->vdisk_media) { 7264 case VD_MEDIA_CD: 7265 case VD_MEDIA_DVD: 7266 vdc->cinfo->dki_ctype = DKC_CDROM; 7267 break; 7268 case VD_MEDIA_FIXED: 7269 if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD)) 7270 vdc->cinfo->dki_ctype = DKC_SCSI_CCS; 7271 else 7272 vdc->cinfo->dki_ctype = DKC_DIRECT; 7273 break; 7274 default: 7275 /* in the case of v1.0 we default to a fixed disk */ 7276 vdc->cinfo->dki_ctype = DKC_DIRECT; 7277 break; 7278 } 7279 vdc->cinfo->dki_flags = DKI_FMTVOL; 7280 vdc->cinfo->dki_cnum = 0; 7281 vdc->cinfo->dki_addr = 0; 7282 vdc->cinfo->dki_space = 0; 7283 vdc->cinfo->dki_prio = 0; 7284 vdc->cinfo->dki_vec = 0; 7285 vdc->cinfo->dki_unit = vdc->instance; 7286 vdc->cinfo->dki_slave = 0; 7287 /* 7288 * The partition number will be created on the fly depending on the 7289 * actual slice (i.e. minor node) that is used to request the data. 7290 */ 7291 vdc->cinfo->dki_partition = 0; 7292 7293 /* 7294 * DKIOCGMEDIAINFO support 7295 */ 7296 if (vdc->minfo == NULL) 7297 vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); 7298 7299 if (vio_ver_is_supported(vdc->ver, 1, 1)) { 7300 vdc->minfo->dki_media_type = 7301 VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media); 7302 } else { 7303 vdc->minfo->dki_media_type = DK_FIXED_DISK; 7304 } 7305 7306 vdc->minfo->dki_capacity = vdc->vdisk_size; 7307 vdc->minfo->dki_lbsize = vdc->block_size; 7308 } 7309 7310 static ushort_t 7311 vdc_lbl2cksum(struct dk_label *label) 7312 { 7313 int count; 7314 ushort_t sum, *sp; 7315 7316 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 7317 sp = (ushort_t *)label; 7318 sum = 0; 7319 while (count--) { 7320 sum ^= *sp++; 7321 } 7322 7323 return (sum); 7324 } 7325 7326 /* 7327 * Function: 7328 * vdc_validate_geometry 7329 * 7330 * Description: 7331 * This routine discovers the label and geometry of the disk. It stores 7332 * the disk label and related information in the vdc structure. If it 7333 * fails to validate the geometry or to discover the disk label then 7334 * the label is marked as unknown (VD_DISK_LABEL_UNK). 7335 * 7336 * Arguments: 7337 * vdc - soft state pointer for this instance of the device driver. 7338 * 7339 * Return Code: 7340 * 0 - success. 7341 * EINVAL - unknown disk label. 7342 * ENOTSUP - geometry not applicable (EFI label). 7343 * EIO - error accessing the disk. 7344 */ 7345 static int 7346 vdc_validate_geometry(vdc_t *vdc) 7347 { 7348 buf_t *buf; /* BREAD requests need to be in a buf_t structure */ 7349 dev_t dev; 7350 int rv, rval; 7351 struct dk_label label; 7352 struct dk_geom geom; 7353 struct vtoc vtoc; 7354 efi_gpt_t *gpt; 7355 efi_gpe_t *gpe; 7356 vd_efi_dev_t edev; 7357 7358 ASSERT(vdc != NULL); 7359 ASSERT(vdc->vtoc != NULL && vdc->geom != NULL); 7360 ASSERT(MUTEX_HELD(&vdc->lock)); 7361 7362 mutex_exit(&vdc->lock); 7363 7364 dev = makedevice(ddi_driver_major(vdc->dip), 7365 VD_MAKE_DEV(vdc->instance, 0)); 7366 7367 rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval); 7368 if (rv == 0) 7369 rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc, 7370 FKIOCTL, &rval); 7371 7372 if (rv == ENOTSUP) { 7373 /* 7374 * If the device does not support VTOC then we try 7375 * to read an EFI label. 7376 * 7377 * We need to know the block size and the disk size to 7378 * be able to read an EFI label. 
7379 */ 7380 if (vdc->vdisk_size == 0) { 7381 if ((rv = vdc_check_capacity(vdc)) != 0) { 7382 mutex_enter(&vdc->lock); 7383 vdc_store_label_unk(vdc); 7384 return (rv); 7385 } 7386 } 7387 7388 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 7389 7390 rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe); 7391 7392 if (rv) { 7393 DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)", 7394 vdc->instance, rv); 7395 mutex_enter(&vdc->lock); 7396 vdc_store_label_unk(vdc); 7397 return (EIO); 7398 } 7399 7400 mutex_enter(&vdc->lock); 7401 vdc_store_label_efi(vdc, gpt, gpe); 7402 vd_efi_free(&edev, gpt, gpe); 7403 return (ENOTSUP); 7404 } 7405 7406 if (rv != 0) { 7407 DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)", 7408 vdc->instance, rv); 7409 mutex_enter(&vdc->lock); 7410 vdc_store_label_unk(vdc); 7411 if (rv != EINVAL) 7412 rv = EIO; 7413 return (rv); 7414 } 7415 7416 /* check that geometry and vtoc are valid */ 7417 if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 || 7418 vtoc.v_sanity != VTOC_SANE) { 7419 mutex_enter(&vdc->lock); 7420 vdc_store_label_unk(vdc); 7421 return (EINVAL); 7422 } 7423 7424 /* 7425 * We have a disk and a valid VTOC. However, this does not mean 7426 * that the disk currently has a VTOC label. The returned VTOC may 7427 * be a default VTOC to be used for configuring the disk (this is 7428 * what is done for disk images). So we read the label from the 7429 * beginning of the disk to ensure we really have a VTOC label. 7430 * 7431 * FUTURE: This could be the default way for reading the VTOC 7432 * from the disk as opposed to sending the VD_OP_GET_VTOC 7433 * to the server. This will be the default if vdc is implemented 7434 * on top of cmlb. 7435 */ 7436 7437 /* 7438 * A single-slice disk does not support reads using an absolute disk 7439 * offset, so we just rely on the DKIOCGVTOC ioctl in that case. 7440 */ 7441 if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) { 7442 mutex_enter(&vdc->lock); 7443 if (vtoc.v_nparts != 1) { 7444 vdc_store_label_unk(vdc); 7445 return (EINVAL); 7446 } 7447 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7448 return (0); 7449 } 7450 7451 if (vtoc.v_nparts != V_NUMPAR) { 7452 mutex_enter(&vdc->lock); 7453 vdc_store_label_unk(vdc); 7454 return (EINVAL); 7455 } 7456 7457 /* 7458 * Read disk label from start of disk 7459 */ 7460 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 7461 bioinit(buf); 7462 buf->b_un.b_addr = (caddr_t)&label; 7463 buf->b_bcount = DK_LABEL_SIZE; 7464 buf->b_flags = B_BUSY | B_READ; 7465 buf->b_dev = cmpdev(dev); 7466 rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label, 7467 DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir); 7468 if (rv) { 7469 DMSG(vdc, 1, "[%d] Failed to read disk block 0\n", 7470 vdc->instance); 7471 } else { 7472 rv = biowait(buf); 7473 biofini(buf); 7474 } 7475 kmem_free(buf, sizeof (buf_t)); 7476 7477 if (rv != 0 || label.dkl_magic != DKL_MAGIC || 7478 label.dkl_cksum != vdc_lbl2cksum(&label)) { 7479 DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n", 7480 vdc->instance); 7481 mutex_enter(&vdc->lock); 7482 vdc_store_label_unk(vdc); 7483 return (EINVAL); 7484 } 7485 7486 mutex_enter(&vdc->lock); 7487 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7488 return (0); 7489 } 7490 7491 /* 7492 * Function: 7493 * vdc_validate 7494 * 7495 * Description: 7496 * This routine discovers the label of the disk and creates the 7497 * appropriate device nodes if the label has changed. 7498 * 7499 * Arguments: 7500 * vdc - soft state pointer for this instance of the device driver. 7501 * 7502 * Return Code: 7503 * none.
7504 */ 7505 static void 7506 vdc_validate(vdc_t *vdc) 7507 { 7508 vd_disk_label_t old_label; 7509 vd_slice_t old_slice[V_NUMPAR]; 7510 int rv; 7511 7512 ASSERT(!MUTEX_HELD(&vdc->lock)); 7513 7514 mutex_enter(&vdc->lock); 7515 7516 /* save the current label and vtoc */ 7517 old_label = vdc->vdisk_label; 7518 bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR); 7519 7520 /* check the geometry */ 7521 (void) vdc_validate_geometry(vdc); 7522 7523 /* if the disk label has changed, update device nodes */ 7524 if (vdc->vdisk_label != old_label) { 7525 7526 if (vdc->vdisk_label == VD_DISK_LABEL_EFI) 7527 rv = vdc_create_device_nodes_efi(vdc); 7528 else 7529 rv = vdc_create_device_nodes_vtoc(vdc); 7530 7531 if (rv != 0) { 7532 DMSG(vdc, 0, "![%d] Failed to update device nodes", 7533 vdc->instance); 7534 } 7535 } 7536 7537 /* if the vtoc has changed, update device nodes properties */ 7538 if (bcmp(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR) != 0) { 7539 7540 if (vdc_create_device_nodes_props(vdc) != 0) { 7541 DMSG(vdc, 0, "![%d] Failed to update device nodes" 7542 " properties", vdc->instance); 7543 } 7544 } 7545 7546 mutex_exit(&vdc->lock); 7547 } 7548 7549 static void 7550 vdc_validate_task(void *arg) 7551 { 7552 vdc_t *vdc = (vdc_t *)arg; 7553 7554 vdc_validate(vdc); 7555 7556 mutex_enter(&vdc->lock); 7557 ASSERT(vdc->validate_pending > 0); 7558 vdc->validate_pending--; 7559 mutex_exit(&vdc->lock); 7560 } 7561 7562 /* 7563 * Function: 7564 * vdc_setup_devid() 7565 * 7566 * Description: 7567 * This routine discovers the devid of a vDisk. It requests the devid of 7568 * the underlying device from the vDisk server, builds an encapsulated 7569 * devid based on the retrieved devid and registers that new devid with 7570 * the vDisk. 7571 * 7572 * Arguments: 7573 * vdc - soft state pointer for this instance of the device driver. 7574 * 7575 * Return Code: 7576 * 0 - A devid was successfully registered for the vDisk 7577 */ 7578 static int 7579 vdc_setup_devid(vdc_t *vdc) 7580 { 7581 int rv; 7582 vd_devid_t *vd_devid; 7583 size_t bufsize, bufid_len; 7584 7585 /* 7586 * We don't know beforehand the size of the devid that the 7587 * server will return; that size is encoded into the 7588 * reply. So we do a first request using a default size, then we 7589 * check whether this size was large enough. If not, we do a second 7590 * request with the correct size returned by the server. Note that 7591 * LDC requires the size to be 8-byte aligned. 7592 */ 7593 bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN), 7594 sizeof (uint64_t)); 7595 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7596 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7597 7598 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, 7599 bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 7600 7601 DMSG(vdc, 2, "sync_op returned %d\n", rv); 7602 7603 if (rv) { 7604 kmem_free(vd_devid, bufsize); 7605 return (rv); 7606 } 7607 7608 if (vd_devid->length > bufid_len) { 7609 /* 7610 * The returned devid is larger than the buffer used. Try again 7611 * with a buffer of the right size.
7612 */ 7613 bufid_len = vd_devid->length; /* save the length before freeing the buffer */ kmem_free(vd_devid, bufsize); 7614 bufsize = P2ROUNDUP(VD_DEVID_SIZE(bufid_len), 7615 sizeof (uint64_t)); 7616 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7617 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7618 7619 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, 7620 (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0, 7621 VIO_both_dir, B_TRUE); 7622 7623 if (rv) { 7624 kmem_free(vd_devid, bufsize); 7625 return (rv); 7626 } 7627 } 7628 7629 /* 7630 * The virtual disk should have the same device id as the one associated 7631 * with the physical disk it is mapped on, otherwise sharing a disk 7632 * between an LDom and a non-LDom may not work (for example for a shared 7633 * SVM disk set). 7634 * 7635 * The DDI framework does not allow creating a device id of an arbitrary 7636 * type, so we first create a device id of type DEVID_ENCAP and then 7637 * we restore the original type of the physical device. 7638 */ 7639 7640 DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length); 7641 7642 /* build an encapsulated devid based on the returned devid */ 7643 if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length, 7644 vd_devid->id, &vdc->devid) != DDI_SUCCESS) { 7645 DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance); 7646 kmem_free(vd_devid, bufsize); 7647 return (1); 7648 } 7649 7650 DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type); 7651 7652 ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS); 7653 7654 kmem_free(vd_devid, bufsize); 7655 7656 if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) { 7657 DMSG(vdc, 1, "[%d] Failed to register devid\n", vdc->instance); 7658 return (1); 7659 } 7660 7661 return (0); 7662 } 7663 7664 static void 7665 vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe) 7666 { 7667 int i, nparts; 7668 7669 ASSERT(MUTEX_HELD(&vdc->lock)); 7670 7671 vdc->vdisk_label = VD_DISK_LABEL_EFI; 7672 bzero(vdc->vtoc, sizeof (struct vtoc)); 7673 bzero(vdc->geom, sizeof (struct dk_geom)); 7674 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7675 7676 nparts = gpt->efi_gpt_NumberOfPartitionEntries; 7677 7678 for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) { 7679 7680 if (gpe[i].efi_gpe_StartingLBA == 0 || 7681 gpe[i].efi_gpe_EndingLBA == 0) { 7682 continue; 7683 } 7684 7685 vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA; 7686 vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA - 7687 gpe[i].efi_gpe_StartingLBA + 1; 7688 } 7689 7690 ASSERT(vdc->vdisk_size != 0); 7691 vdc->slice[VD_EFI_WD_SLICE].start = 0; 7692 vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size; 7693 7694 } 7695 7696 static void 7697 vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct vtoc *vtoc) 7698 { 7699 int i; 7700 7701 ASSERT(MUTEX_HELD(&vdc->lock)); 7702 ASSERT(vdc->block_size == vtoc->v_sectorsz); 7703 7704 vdc->vdisk_label = VD_DISK_LABEL_VTOC; 7705 bcopy(vtoc, vdc->vtoc, sizeof (struct vtoc)); 7706 bcopy(geom, vdc->geom, sizeof (struct dk_geom)); 7707 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7708 7709 for (i = 0; i < vtoc->v_nparts; i++) { 7710 vdc->slice[i].start = vtoc->v_part[i].p_start; 7711 vdc->slice[i].nblocks = vtoc->v_part[i].p_size; 7712 } 7713 } 7714 7715 static void 7716 vdc_store_label_unk(vdc_t *vdc) 7717 { 7718 ASSERT(MUTEX_HELD(&vdc->lock)); 7719 7720 vdc->vdisk_label = VD_DISK_LABEL_UNK; 7721 bzero(vdc->vtoc, sizeof (struct vtoc)); 7722 bzero(vdc->geom, sizeof (struct dk_geom)); 7723 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7724 } 7725