/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * LDoms virtual disk client (vdc) device driver
 *
 * This driver runs on a guest logical domain and communicates with the virtual
 * disk server (vds) driver running on the service domain which is exporting
 * virtualized "disks" to the guest logical domain.
 *
 * The driver can be divided into four sections:
 *
 * 1) generic device driver housekeeping
 *	_init, _fini, attach, detach, ops structures, etc.
 *
 * 2) communication channel setup
 *	Setup the communications link over the LDC channel that vdc uses to
 *	talk to the vDisk server. Initialise the descriptor ring which
 *	allows the LDC clients to transfer data via memory mappings.
 *
 * 3) Support exported to upper layers (filesystems, etc)
 *	The upper layers call into vdc via strategy(9E) and DKIO(7I)
 *	ioctl calls. vdc will copy the data to be written to the descriptor
 *	ring or map the buffer used to store the data read by the vDisk
 *	server into the descriptor ring. It then sends a message to the
 *	vDisk server requesting it to complete the operation.
 *
 * 4) Handling responses from the vDisk server.
 *	The vDisk server will ACK some or all of the messages vdc sends to it
 *	(this is configured during the handshake). Upon receipt of an ACK
 *	vdc will check the descriptor ring and signal to the upper layer
 *	code waiting on the I/O.
 */
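
/*
 * Editorial sketch of the data path described above (illustrative only,
 * not part of the original source):
 *
 *	guest domain                            service domain
 *	+-----------+     LDC channel and      +-----------+
 *	|    vdc    |<----descriptor ring----->|    vds    |---> backend
 *	+-----------+   (requests, ACK/NACK)   +-----------+     storage
 */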

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kstat.h>
#include <sys/mach_descrip.h>
#include <sys/modctl.h>
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/vtoc.h>
#include <sys/archsystm.h>
#include <sys/sysmacros.h>

#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/scsi/impl/services.h>
#include <sys/scsi/targets/sddef.h>

#include <sys/ldoms.h>
#include <sys/ldc.h>
#include <sys/vio_common.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vdsk_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdc.h>

/*
 * function prototypes
 */

/* standard driver functions */
static int	vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
static int	vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int	vdc_strategy(struct buf *buf);
static int	vdc_print(dev_t dev, char *str);
static int	vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
static int	vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
		    cred_t *credp, int *rvalp);
static int	vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
static int	vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);

static int	vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
		    void *arg, void **resultp);
static int	vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int	vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int	vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
		    int mod_flags, char *name, caddr_t valuep, int *lengthp);

/* setup */
static void	vdc_min(struct buf *bufp);
static int	vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
static int	vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_start_ldc_connection(vdc_t *vdc);
static int	vdc_create_device_nodes(vdc_t *vdc);
static int	vdc_create_device_nodes_efi(vdc_t *vdc);
static int	vdc_create_device_nodes_vtoc(vdc_t *vdc);
static void	vdc_create_io_kstats(vdc_t *vdc);
static void	vdc_create_err_kstats(vdc_t *vdc);
static void	vdc_set_err_kstats(vdc_t *vdc);
static int	vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
		    mde_cookie_t *vd_nodep);
static int	vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep);
static void	vdc_fini_ports(vdc_t *vdc);
static void	vdc_switch_server(vdc_t *vdcp);
static int	vdc_do_ldc_up(vdc_t *vdc);
static void	vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_init_descriptor_ring(vdc_t *vdc);
static void	vdc_destroy_descriptor_ring(vdc_t *vdc);
static int	vdc_setup_devid(vdc_t *vdc);
static void	vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *);
static void	vdc_store_label_vtoc(vdc_t *, struct dk_geom *, struct vtoc *);
static void	vdc_store_label_unk(vdc_t *vdc);
static boolean_t vdc_is_opened(vdc_t *vdc);

/* handshake with vds */
static int	vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
static int	vdc_ver_negotiation(vdc_t *vdcp);
static int	vdc_init_attr_negotiation(vdc_t *vdc);
static int	vdc_attr_negotiation(vdc_t *vdcp);
static int	vdc_init_dring_negotiate(vdc_t *vdc);
static int	vdc_dring_negotiation(vdc_t *vdcp);
static int	vdc_send_rdx(vdc_t *vdcp);
static int	vdc_rdx_exchange(vdc_t *vdcp);
static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg);

/* processing incoming messages from vDisk server */
static void	vdc_process_msg_thread(vdc_t *vdc);
static int	vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp);

static uint_t	vdc_handle_cb(uint64_t event, caddr_t arg);
static int	vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg);
static int	vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg);
static int	vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
static int	vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
static int	vdc_send_request(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
static int	vdc_populate_descriptor(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
		    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
		    void *cb_arg, vio_desc_direction_t dir, boolean_t);

static int	vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
static int	vdc_drain_response(vdc_t *vdcp);
static int	vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
static int	vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
static int	vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);

/* dkio */
static int	vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode,
		    int *rvalp);
static int	vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg);
static void	vdc_create_fake_geometry(vdc_t *vdc);
static int	vdc_validate_geometry(vdc_t *vdc);
static void	vdc_validate(vdc_t *vdc);
static void	vdc_validate_task(void *arg);
static int	vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);

static void	vdc_ownership_update(vdc_t *vdc, int ownership_flags);
static int	vdc_access_set(vdc_t *vdc, uint64_t flags, int mode);
static vdc_io_t	*vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf);
static int	vdc_failfast_check_resv(vdc_t *vdc);

/*
 * Module variables
 */

/*
 * Tunable variables to control how long vdc waits before timing out on
 * various operations
 */
static int	vdc_hshake_retries = 3;

static int	vdc_timeout = 0;	/* units: seconds */
static int	vdc_ldcup_timeout = 1;	/* units: seconds */

static uint64_t vdc_hz_min_ldc_delay;
static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC;
static uint64_t vdc_hz_max_ldc_delay;
static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC;

static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC;
static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC;

/* values for dumping - need to run in a tighter loop */
static uint64_t	vdc_usec_timeout_dump = 100 * MILLISEC;	/* 0.1s units: usec */
static int	vdc_dump_retries = 100;

static uint16_t	vdc_scsi_timeout = 60;	/* 60s units: seconds */

static uint64_t vdc_ownership_delay = 6 * MICROSEC;	/* 6s units: usec */

/* Count of the number of vdc instances attached */
static volatile uint32_t	vdc_instance_count = 0;

/* Tunable to log all SCSI errors */
static boolean_t vdc_scsi_log_error = B_FALSE;

/* Soft state pointer */
static void	*vdc_state;

/*
 * Controlling the verbosity of the error/debug messages
 *
 * vdc_msglevel - controls level of messages
 * vdc_matchinst - 64-bit variable where each bit corresponds
 *                 to the vdc instance the vdc_msglevel applies.
 */
int		vdc_msglevel = 0x0;
uint64_t	vdc_matchinst = 0ull;
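
/*
 * Editorial note: like any module-global variables, the tunables above can
 * be set from /etc/system, e.g. (values are examples only):
 *
 *	set vdc:vdc_timeout = 60
 *	set vdc:vdc_msglevel = 1
 */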

/*
 * Supported vDisk protocol version pairs.
 *
 * The first array entry is the latest and preferred version.
 */
static const vio_ver_t	vdc_version[] = {{1, 1}};

static struct cb_ops vdc_cb_ops = {
	vdc_open,	/* cb_open */
	vdc_close,	/* cb_close */
	vdc_strategy,	/* cb_strategy */
	vdc_print,	/* cb_print */
	vdc_dump,	/* cb_dump */
	vdc_read,	/* cb_read */
	vdc_write,	/* cb_write */
	vdc_ioctl,	/* cb_ioctl */
	nodev,		/* cb_devmap */
	nodev,		/* cb_mmap */
	nodev,		/* cb_segmap */
	nochpoll,	/* cb_chpoll */
	vdc_prop_op,	/* cb_prop_op */
	NULL,		/* cb_str */
	D_MP | D_64BIT,	/* cb_flag */
	CB_REV,		/* cb_rev */
	vdc_aread,	/* cb_aread */
	vdc_awrite	/* cb_awrite */
};

static struct dev_ops vdc_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	vdc_getinfo,	/* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	vdc_attach,	/* devo_attach */
	vdc_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	&vdc_cb_ops,	/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	nulldev		/* devo_power */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"virtual disk client",
	&vdc_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

/* -------------------------------------------------------------------------- */

/*
 * Device Driver housekeeping and setup
 */

int
_init(void)
{
	int	status;

	if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
		return (status);
	if ((status = mod_install(&modlinkage)) != 0)
		ddi_soft_state_fini(&vdc_state);
	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int	status;

	if ((status = mod_remove(&modlinkage)) != 0)
		return (status);
	ddi_soft_state_fini(&vdc_state);
	return (0);
}
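
/*
 * Editorial note: the ordering in _init()/_fini() above matters.  The soft
 * state area is initialized before mod_install() makes the driver visible,
 * and is only torn down after mod_remove() succeeds, so attach/detach can
 * never observe an uninitialized soft state area.
 */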

static int
vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
	_NOTE(ARGUNUSED(dip))

	int	instance = VDCUNIT((dev_t)arg);
	vdc_t	*vdc = NULL;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
			*resultp = NULL;
			return (DDI_FAILURE);
		}
		*resultp = vdc->dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);
	default:
		*resultp = NULL;
		return (DDI_FAILURE);
	}
}

static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	kt_did_t failfast_tid, ownership_tid;
	int	instance;
	int	rv;
	vdc_t	*vdc = NULL;

	switch (cmd) {
	case DDI_DETACH:
		/* the real work happens below */
		break;
	case DDI_SUSPEND:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	ASSERT(cmd == DDI_DETACH);
	instance = ddi_get_instance(dip);
	DMSGX(1, "[%d] Entered\n", instance);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * This function is called when vdc is detached or when it has failed
	 * to attach. In the latter case, the attach may have failed before
	 * the vdisk type was set, so we can't call vdc_is_opened(). However,
	 * as the attach has failed, we know that the vdisk is not opened and
	 * we can safely detach.
	 */
	if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
		DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
		return (DDI_FAILURE);
	}

	if (vdc->dkio_flush_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
		    instance, vdc->dkio_flush_pending);
		return (DDI_FAILURE);
	}

	if (vdc->validate_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding validate request\n",
		    instance, vdc->validate_pending);
		return (DDI_FAILURE);
	}

	DMSG(vdc, 0, "[%d] proceeding...\n", instance);

	/* If we took ownership, release ownership */
	mutex_enter(&vdc->ownership_lock);
	if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
	}
	mutex_exit(&vdc->ownership_lock);

	/* mark instance as detaching */
	vdc->lifecycle = VDC_LC_DETACHING;

	/*
	 * try and disable callbacks to prevent another handshake
	 */
	if (vdc->curr_server != NULL) {
		rv = ldc_set_cb_mode(vdc->curr_server->ldc_handle,
		    LDC_CB_DISABLE);
		DMSG(vdc, 0, "callback disabled (rv=%d)\n", rv);
	}

	if (vdc->initialized & VDC_THREAD) {
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET)) {
			vdc->read_state = VDC_READ_RESET;
			cv_signal(&vdc->read_cv);
		}

		mutex_exit(&vdc->read_lock);

		/* wake up any thread waiting for connection to come online */
		mutex_enter(&vdc->lock);
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			DMSG(vdc, 0,
			    "[%d] write reset - move to resetting state...\n",
			    instance);
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		}
		mutex_exit(&vdc->lock);

		/* now wait until state transitions to VDC_STATE_DETACH */
		thread_join(vdc->msg_proc_thr->t_did);
		ASSERT(vdc->state == VDC_STATE_DETACH);
		DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
		    vdc->instance);
	}

	mutex_enter(&vdc->lock);

	if (vdc->initialized & VDC_DRING)
		vdc_destroy_descriptor_ring(vdc);

	vdc_fini_ports(vdc);

	if (vdc->failfast_thread) {
		failfast_tid = vdc->failfast_thread->t_did;
		vdc->failfast_interval = 0;
		cv_signal(&vdc->failfast_cv);
	} else {
		failfast_tid = 0;
	}

	if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
		ownership_tid = vdc->ownership_thread->t_did;
		vdc->ownership = VDC_OWNERSHIP_NONE;
		cv_signal(&vdc->ownership_cv);
	} else {
		ownership_tid = 0;
	}

	mutex_exit(&vdc->lock);

	if (failfast_tid != 0)
		thread_join(failfast_tid);

	if (ownership_tid != 0)
		thread_join(ownership_tid);

	if (vdc->initialized & VDC_MINOR)
		ddi_remove_minor_node(dip, NULL);

	if (vdc->io_stats) {
		kstat_delete(vdc->io_stats);
		vdc->io_stats = NULL;
	}

	if (vdc->err_stats) {
		kstat_delete(vdc->err_stats);
		vdc->err_stats = NULL;
	}

	if (vdc->initialized & VDC_LOCKS) {
		mutex_destroy(&vdc->lock);
		mutex_destroy(&vdc->read_lock);
		mutex_destroy(&vdc->ownership_lock);
		cv_destroy(&vdc->initwait_cv);
		cv_destroy(&vdc->dring_free_cv);
		cv_destroy(&vdc->membind_cv);
		cv_destroy(&vdc->sync_pending_cv);
		cv_destroy(&vdc->sync_blocked_cv);
		cv_destroy(&vdc->read_cv);
		cv_destroy(&vdc->running_cv);
		cv_destroy(&vdc->ownership_cv);
		cv_destroy(&vdc->failfast_cv);
		cv_destroy(&vdc->failfast_io_cv);
	}

	if (vdc->minfo)
		kmem_free(vdc->minfo, sizeof (struct dk_minfo));

	if (vdc->cinfo)
		kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));

	if (vdc->vtoc)
		kmem_free(vdc->vtoc, sizeof (struct vtoc));

	if (vdc->geom)
		kmem_free(vdc->geom, sizeof (struct dk_geom));

	if (vdc->devid) {
		ddi_devid_unregister(dip);
		ddi_devid_free(vdc->devid);
	}

	if (vdc->initialized & VDC_SOFT_STATE)
		ddi_soft_state_free(vdc_state, instance);

	DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);

	return (DDI_SUCCESS);
}


static int
vdc_do_attach(dev_info_t *dip)
{
	int		instance;
	vdc_t		*vdc = NULL;
	int		status;
	md_t		*mdp;
	mde_cookie_t	vd_node;

	ASSERT(dip != NULL);

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
		    instance);
		return (DDI_FAILURE);
	}

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * We assign the value to initialized in this case to zero out the
	 * variable and then set bits in it to indicate what has been done
	 */
	vdc->initialized = VDC_SOFT_STATE;

	vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
	vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);

	vdc->dip	= dip;
	vdc->instance	= instance;
	vdc->vdisk_type	= VD_DISK_TYPE_UNK;
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->state	= VDC_STATE_INIT;
	vdc->lifecycle	= VDC_LC_ATTACHING;
	vdc->session_id = 0;
	vdc->block_size = DEV_BSIZE;
	vdc->max_xfer_sz = maxphys / DEV_BSIZE;

	/*
	 * We assume, for now, that the vDisk server will export 'read'
	 * operations to us at a minimum (this is needed because of checks
	 * in vdc for supported operations early in the handshake process).
	 * The vDisk server will return ENOTSUP if this is not the case.
	 * The value will be overwritten during the attribute exchange with
	 * the bitmask of operations exported by the server.
	 */
	vdc->operations = VD_OP_MASK_READ;

	vdc->vtoc = NULL;
	vdc->geom = NULL;
	vdc->cinfo = NULL;
	vdc->minfo = NULL;

	mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);

	vdc->threads_pending = 0;
	vdc->sync_op_pending = B_FALSE;
	vdc->sync_op_blocked = B_FALSE;
	cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);

	/* init blocking msg read functionality */
	mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
	vdc->read_state = VDC_READ_IDLE;

	vdc->initialized |= VDC_LOCKS;

	/* get device and port MD node for this disk instance */
	if (vdc_get_md_node(dip, &mdp, &vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Could not get machine description node",
		    instance);
		return (DDI_FAILURE);
	}

	if (vdc_init_ports(vdc, mdp, vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Error initialising ports", instance);
		return (DDI_FAILURE);
	}

	(void) md_fini_handle(mdp);

	/* initialize the thread responsible for managing state with server */
	vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
	    vdc, 0, &p0, TS_RUN, minclsyspri);
	if (vdc->msg_proc_thr == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
		    instance);
		return (DDI_FAILURE);
	}

	vdc->initialized |= VDC_THREAD;

	/* Create the kstats for saving the I/O statistics used by iostat(1M) */
	vdc_create_io_kstats(vdc);
	vdc_create_err_kstats(vdc);

	atomic_inc_32(&vdc_instance_count);

	/*
	 * Check the disk label. This will send requests and do the handshake.
	 * We don't really care about the disk label now. What we really need
	 * is the handshake to be done so that we know the type of the disk
	 * (slice or full disk) and the appropriate device nodes can be
	 * created.
	 */
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP);
	vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
	vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	mutex_exit(&vdc->lock);

	/*
	 * Now that we have the device info we can create the device nodes
	 */
	status = vdc_create_device_nodes(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device nodes",
		    instance);
		goto return_status;
	}

	/*
	 * Setup devid
	 */
	if (vdc_setup_devid(vdc)) {
		DMSG(vdc, 0, "[%d] No device id available\n", instance);
	}

	/*
	 * Fill in the fields of the error statistics kstat that were not
	 * available when creating the kstat
	 */
	vdc_set_err_kstats(vdc);

	ddi_report_dev(dip);
	vdc->lifecycle = VDC_LC_ONLINE;
	DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);

return_status:
	DMSG(vdc, 0, "[%d] Attach completed\n", instance);
	return (status);
}

static int
vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	status;

	switch (cmd) {
	case DDI_ATTACH:
		if ((status = vdc_do_attach(dip)) != 0)
			(void) vdc_detach(dip, DDI_DETACH);
		return (status);
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}

static int
vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr)
{
	int		status = 0;
	ldc_status_t	ldc_state;
	ldc_attr_t	ldc_attr;

	ASSERT(vdc != NULL);
	ASSERT(srvr != NULL);

	ldc_attr.devclass = LDC_DEV_BLK;
	ldc_attr.instance = vdc->instance;
	ldc_attr.mode = LDC_MODE_UNRELIABLE;	/* unreliable transport */
	ldc_attr.mtu = VD_LDC_MTU;

	if ((srvr->state & VDC_LDC_INIT) == 0) {
		status = ldc_init(srvr->ldc_id, &ldc_attr,
		    &srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			return (status);
		}
		srvr->state |= VDC_LDC_INIT;
	}
	status = ldc_status(srvr->ldc_handle, &ldc_state);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
		    vdc->instance, status);
		goto init_exit;
	}
	srvr->ldc_state = ldc_state;

	if ((srvr->state & VDC_LDC_CB) == 0) {
		status = ldc_reg_callback(srvr->ldc_handle, vdc_handle_cb,
		    (caddr_t)srvr);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
			    vdc->instance, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_CB;
	}

	/*
	 * At this stage we have initialised LDC; we will now try to open
	 * the connection.
	 */
	if (srvr->ldc_state == LDC_INIT) {
		status = ldc_open(srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_OPEN;
	}

init_exit:
	if (status) {
		vdc_terminate_ldc(vdc, srvr);
	}

	return (status);
}
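
/*
 * Editorial note: vdc_do_ldc_init() records its progress in srvr->state
 * (VDC_LDC_INIT, VDC_LDC_CB, VDC_LDC_OPEN) so it can safely be called again
 * after a partial failure; steps that already completed are skipped.
 */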

static int
vdc_start_ldc_connection(vdc_t *vdc)
{
	int	status = 0;

	ASSERT(vdc != NULL);

	ASSERT(MUTEX_HELD(&vdc->lock));

	status = vdc_do_ldc_up(vdc);

	DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);

	return (status);
}

static int
vdc_stop_ldc_connection(vdc_t *vdcp)
{
	int	status;

	ASSERT(vdcp != NULL);

	ASSERT(MUTEX_HELD(&vdcp->lock));

	DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
	    vdcp->state);

	status = ldc_down(vdcp->curr_server->ldc_handle);
	DMSG(vdcp, 0, "ldc_down() = %d\n", status);

	vdcp->initialized &= ~VDC_HANDSHAKE;
	DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);

	return (status);
}

static void
vdc_create_io_kstats(vdc_t *vdc)
{
	if (vdc->io_stats != NULL) {
		DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance);
		return;
	}

	vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL,
	    "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (vdc->io_stats != NULL) {
		vdc->io_stats->ks_lock = &vdc->lock;
		kstat_install(vdc->io_stats);
	} else {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics"
		    " will not be gathered", vdc->instance);
	}
}

static void
vdc_create_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;
	char	kstatmodule_err[KSTAT_STRLEN];
	char	kstatname[KSTAT_STRLEN];
	int	ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t));
	int	instance = vdc->instance;

	if (vdc->err_stats != NULL) {
		DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance);
		return;
	}

	(void) snprintf(kstatmodule_err, sizeof (kstatmodule_err),
	    "%serr", VDC_DRIVER_NAME);
	(void) snprintf(kstatname, sizeof (kstatname),
	    "%s%d,err", VDC_DRIVER_NAME, instance);

	vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname,
	    "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

	if (vdc->err_stats == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics"
		    " will not be gathered", instance);
		return;
	}

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	kstat_named_init(&stp->vd_softerrs, "Soft Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_transerrs, "Transport Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_protoerrs, "Protocol Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_vid, "Vendor",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_pid, "Product",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_capacity, "Size",
	    KSTAT_DATA_ULONGLONG);

	vdc->err_stats->ks_update = nulldev;

	kstat_install(vdc->err_stats);
}

static void
vdc_set_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;

	if (vdc->err_stats == NULL)
		return;

	mutex_enter(&vdc->lock);

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	ASSERT(stp != NULL);

	stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->block_size;
	(void) strcpy(stp->vd_vid.value.c, "SUN");
	(void) strcpy(stp->vd_pid.value.c, "VDSK");

	mutex_exit(&vdc->lock);
}

static int
vdc_create_device_nodes_efi(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "h");
	ddi_remove_minor_node(vdc->dip, "h,raw");

	if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

static int
vdc_create_device_nodes_vtoc(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "wd");
	ddi_remove_minor_node(vdc->dip, "wd,raw");

	if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}
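
/*
 * Editorial summary of the whole-disk minor nodes created by the two
 * helpers above; both use the minor number of slice VD_EFI_WD_SLICE:
 *
 *	label	block node	raw node
 *	EFI	wd		wd,raw
 *	VTOC	h		h,raw
 */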

/*
 * Function:
 *	vdc_create_device_nodes
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices. It is called as part of the attach(9E) of the instance
 *	during the handshake with vds after vds has sent the attributes
 *	to vdc.
 *
 *	If the device is of type VD_DISK_TYPE_SLICE then the minor node
 *	of 2 is used in keeping with the Solaris convention that slice 2
 *	refers to a whole disk. Slices start at 'a'.
 *
 * Parameters:
 *	vdc	- soft state pointer
 *
 * Return Values
 *	0	- Success
 *	EIO	- Failed to create node
 *	EINVAL	- Unknown type of disk exported
 */
static int
vdc_create_device_nodes(vdc_t *vdc)
{
	char		name[sizeof ("s,raw")];
	dev_info_t	*dip = NULL;
	int		instance, status;
	int		num_slices = 1;
	int		i;

	ASSERT(vdc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		return (EINVAL);
	}

	/*
	 * Minor nodes are different for EFI disks: EFI disks do not have
	 * a minor node 'g' for the minor number corresponding to slice
	 * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
	 * representing the whole disk.
	 */
	for (i = 0; i < num_slices; i++) {

		if (i == VD_EFI_WD_SLICE) {
			if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
				status = vdc_create_device_nodes_efi(vdc);
			else
				status = vdc_create_device_nodes_vtoc(vdc);
			if (status != 0)
				return (status);
			continue;
		}

		(void) snprintf(name, sizeof (name), "%c", 'a' + i);
		if (ddi_create_minor_node(dip, name, S_IFBLK,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
			    instance, name);
			return (EIO);
		}

		/* if any device node is created we set this flag */
		vdc->initialized |= VDC_MINOR;

		(void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");

		if (ddi_create_minor_node(dip, name, S_IFCHR,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
			    instance, name);
			return (EIO);
		}
	}

	return (0);
}

/*
 * Driver prop_op(9e) entry point function. Return the number of blocks for
 * the partition in question or forward the request to the property facilities.
 */
static int
vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
    char *name, caddr_t valuep, int *lengthp)
{
	int		instance = ddi_get_instance(dip);
	vdc_t		*vdc;
	uint64_t	nblocks;
	uint_t		blksize;

	vdc = ddi_get_soft_state(vdc_state, instance);

	if (dev == DDI_DEV_T_ANY || vdc == NULL) {
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
		mutex_exit(&vdc->lock);
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}
	nblocks = vdc->slice[VDCPART(dev)].nblocks;
	blksize = vdc->block_size;
	mutex_exit(&vdc->lock);

	return (ddi_prop_op_nblocks_blksize(dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, nblocks, blksize));
}

/*
 * Function:
 *	vdc_is_opened
 *
 * Description:
 *	This function checks if any slice of a given virtual disk is
 *	currently opened.
 *
 * Parameters:
 *	vdc	- soft state pointer
 *
 * Return Values
 *	B_TRUE	- at least one slice is opened.
 *	B_FALSE	- no slice is opened.
 */
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
	int i, nslices;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		nslices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		nslices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		ASSERT(0);
		nslices = 0;	/* keep non-DEBUG builds well defined */
		break;
	}

	/* check if there's any layered open */
	for (i = 0; i < nslices; i++) {
		if (vdc->open_lyr[i] > 0)
			return (B_TRUE);
	}

	/* check if there is any other kind of open */
	for (i = 0; i < OTYPCNT; i++) {
		if (vdc->open[i] != 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}
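
/*
 * Editorial summary of the open-state bookkeeping used below:
 * vdc_mark_opened() and vdc_mark_closed() track opens in three fields of
 * the soft state: open_lyr[] counts layered (OTYP_LYR) opens per slice,
 * open[otyp] is a bitmask of slices opened with each other open type, and
 * open_excl is a bitmask of slices held exclusively (FEXCL).
 */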

static int
vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;
	int i;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	/* check if slice is already exclusively opened */
	if (vdc->open_excl & slicemask)
		return (EBUSY);

	/* if open exclusive, check if slice is already opened */
	if (flag & FEXCL) {
		if (vdc->open_lyr[slice] > 0)
			return (EBUSY);
		for (i = 0; i < OTYPCNT; i++) {
			if (vdc->open[i] & slicemask)
				return (EBUSY);
		}
		vdc->open_excl |= slicemask;
	}

	/* mark slice as opened */
	if (otyp == OTYP_LYR) {
		vdc->open_lyr[slice]++;
	} else {
		vdc->open[otyp] |= slicemask;
	}

	return (0);
}

static void
vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	if (otyp == OTYP_LYR) {
		ASSERT(vdc->open_lyr[slice] > 0);
		vdc->open_lyr[slice]--;
	} else {
		vdc->open[otyp] &= ~slicemask;
	}

	if (flag & FEXCL)
		vdc->open_excl &= ~slicemask;
}

static int
vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance, nodelay;
	int	slice, status = 0;
	vdc_t	*vdc;

	ASSERT(dev != NULL);
	instance = VDCUNIT(*dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
	    getminor(*dev), flag, otyp);

	slice = VDCPART(*dev);

	nodelay = flag & (FNDELAY | FNONBLOCK);

	if ((flag & FWRITE) && (!nodelay) &&
	    !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE))) {
		return (EROFS);
	}

	mutex_enter(&vdc->lock);

	status = vdc_mark_opened(vdc, slice, flag, otyp);

	if (status != 0) {
		mutex_exit(&vdc->lock);
		return (status);
	}

	if (nodelay) {

		/* don't resubmit a validate request if there's already one */
		if (vdc->validate_pending > 0) {
			mutex_exit(&vdc->lock);
			return (0);
		}

		/* call vdc_validate() asynchronously to avoid blocking */
		if (taskq_dispatch(system_taskq, vdc_validate_task,
		    (void *)vdc, TQ_NOSLEEP) == NULL) {
			vdc_mark_closed(vdc, slice, flag, otyp);
			mutex_exit(&vdc->lock);
			return (ENXIO);
		}

		vdc->validate_pending++;
		mutex_exit(&vdc->lock);
		return (0);
	}

	mutex_exit(&vdc->lock);
	vdc_validate(vdc);

	mutex_enter(&vdc->lock);

	if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
	    vdc->slice[slice].nblocks == 0) {
		vdc_mark_closed(vdc, slice, flag, otyp);
		status = EIO;
	}

	mutex_exit(&vdc->lock);

	return (status);
}

static int
vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance;
	int	slice;
	int	rv, rval;
	vdc_t	*vdc;

	instance = VDCUNIT(dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);

	slice = VDCPART(dev);

	/*
	 * Attempt to flush the W$ on a close operation. If this is
	 * not a supported IOCTL command or the backing device is read-only
	 * do not fail the close operation.
	 */
	rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);

	if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
		DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
		    instance, rv);
		return (EIO);
	}

	mutex_enter(&vdc->lock);
	vdc_mark_closed(vdc, slice, flag, otyp);
	mutex_exit(&vdc->lock);

	return (0);
}

static int
vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	_NOTE(ARGUNUSED(credp))

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
}

static int
vdc_print(dev_t dev, char *str)
{
	cmn_err(CE_NOTE, "vdc%d: %s", VDCUNIT(dev), str);
	return (0);
}

static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	int	rv;
	size_t	nbytes = nblk * DEV_BSIZE;
	int	instance = VDCUNIT(dev);
	vdc_t	*vdc = NULL;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
	    instance, nbytes, blkno, (void *)addr);
	rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
	    VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir);
	if (rv) {
		DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
		return (rv);
	}

	if (ddi_in_panic())
		(void) vdc_drain_response(vdc);

	DMSG(vdc, 0, "[%d] End\n", instance);

	return (0);
}
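
/*
 * Editorial note: when vdc_dump() runs in panic context (ddi_in_panic()),
 * the usual LDC callback cannot be relied upon to complete the write, so
 * outstanding responses are reaped by polling via vdc_drain_response().
 */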

/* -------------------------------------------------------------------------- */

/*
 * Disk access routines
 *
 */

/*
 * vdc_strategy()
 *
 * Return Value:
 *	0:	As per strategy(9E), the strategy() function must return 0
 *		[ bioerror(9F) sets b_error to the proper error code ]
 */
static int
vdc_strategy(struct buf *buf)
{
	int		rv = -1;
	vdc_t		*vdc = NULL;
	int		instance = VDCUNIT(buf->b_edev);
	int		op = (buf->b_flags & B_READ) ? VD_OP_BREAD :
			    VD_OP_BWRITE;
	int		slice;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		bioerror(buf, ENXIO);
		biodone(buf);
		return (0);
	}

	DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
	    instance, (buf->b_flags & B_READ) ? "Read" : "Write",
	    buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr);

	bp_mapin(buf);

	if ((long)buf->b_private == VD_SLICE_NONE) {
		/* I/O using an absolute disk offset */
		slice = VD_SLICE_NONE;
	} else {
		slice = VDCPART(buf->b_edev);
	}

	rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr,
	    buf->b_bcount, slice, buf->b_lblkno,
	    CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir :
	    VIO_write_dir);

	/*
	 * If the request was successfully sent, the strategy call returns and
	 * the ACK handler calls the bioxxx functions when the vDisk server is
	 * done. Otherwise we handle the error here.
	 */
	if (rv) {
		DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv);
		bioerror(buf, rv);
		biodone(buf);
	}

	return (0);
}

/*
 * Function:
 *	vdc_min
 *
 * Description:
 *	Routine to limit the size of a data transfer. Used in
 *	conjunction with physio(9F).
 *
 * Arguments:
 *	bp - pointer to the indicated buf(9S) struct.
 *
 */
static void
vdc_min(struct buf *bufp)
{
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(bufp->b_edev);

	vdc = ddi_get_soft_state(vdc_state, instance);
	VERIFY(vdc != NULL);

	if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) {
		bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size;
	}
}

static int
vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio));
}

static int
vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio));
}

static int
vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio));
}

static int
vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio));
}


/* -------------------------------------------------------------------------- */

/*
 * Handshake support
 */
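
/*
 * Editorial overview: the handshake with vds proceeds through four
 * exchanges, each of which sends an INFO message and then waits for the
 * matching ACK/NACK from the server:
 *
 *	1. version negotiation	- vdc_ver_negotiation()
 *	2. attribute exchange	- vdc_attr_negotiation()
 *	3. dring registration	- vdc_dring_negotiation()
 *	4. RDX exchange		- vdc_rdx_exchange()
 */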

/*
 * Function:
 *	vdc_init_ver_negotiation()
 *
 * Description:
 *	Build and send a version negotiation message (VIO_VER_INFO) to the
 *	vDisk server, proposing the given protocol version.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	ver	- version to propose to the server.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
{
	vio_ver_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status = -1;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);

	/*
	 * set the Session ID to a unique value
	 * (the lower 32 bits of the clock tick)
	 */
	vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
	DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id);

	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_VER_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	pkt.dev_class = VDEV_DISK;
	pkt.ver_major = ver.major;
	pkt.ver_minor = ver.minor;

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
	    vdc->instance, status);
	if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vio_ver_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_ver_negotiation()
 *
 * Description:
 *	Send the version negotiation message, wait for the server's
 *	response and hand it off to vdc_handle_ver_msg().
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_ver_negotiation(vdc_t *vdcp)
{
	vio_msg_t vio_msg;
	int status;

	if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0]))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Ver negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
}
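
/*
 * Editorial note: the tag check above accepts both ACK and NACK responses;
 * vdc_handle_ver_msg() inspects the payload and decides whether a version
 * proposed by the server is acceptable (see vdc_is_supported_version()).
 */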

/*
 * Function:
 *	vdc_init_attr_negotiation()
 *
 * Description:
 *	Build and send an attribute exchange message (VIO_ATTR_INFO) to
 *	the vDisk server, advertising our transfer parameters.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_attr_negotiation(vdc_t *vdc)
{
	vd_attr_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] entered\n", vdc->instance);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.max_xfer_sz = vdc->max_xfer_sz;
	pkt.vdisk_block_size = vdc->block_size;
	pkt.xfer_mode = VIO_DRING_MODE_V1_0;
	pkt.operations = 0;	/* server will set bits of valid operations */
	pkt.vdisk_type = 0;	/* server will set to valid device type */
	pkt.vdisk_media = 0;	/* server will set to valid media type */
	pkt.vdisk_size = 0;	/* server will set to valid size */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);

	if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vd_attr_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_attr_negotiation()
 *
 * Description:
 *	Send the attribute exchange message, wait for the server's
 *	response and hand it off to vdc_handle_attr_msg().
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_attr_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_attr_negotiation(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Attr negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_init_dring_negotiate()
 *
 * Description:
 *	Set up the local descriptor ring and send a registration message
 *	(VIO_DRING_REG) for it to the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_dring_negotiate(vdc_t *vdc)
{
	vio_dring_reg_msg_t	pkt;
	size_t			msglen = sizeof (pkt);
	int			status = -1;
	int			retry;
	int			nretries = 10;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	for (retry = 0; retry < nretries; retry++) {
		status = vdc_init_descriptor_ring(vdc);
		if (status != EAGAIN)
			break;
		drv_usecwait(vdc_min_timeout_ldc);
	}

	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
		    vdc->instance, status);
		return (status);
	}

	DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
	    vdc->instance, status);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_DRING_REG;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.dring_ident = 0;
	pkt.num_descriptors = vdc->dring_len;
	pkt.descriptor_size = vdc->dring_entry_size;
	pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
	pkt.ncookies = vdc->dring_cookie_count;
	pkt.cookie[0] = vdc->dring_cookie[0];	/* for now just one cookie */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
		    vdc->instance, status);
	}

	return (status);
}


/*
 * Function:
 *	vdc_dring_negotiation()
 *
 * Description:
 *	Send the dring registration message, wait for the server's
 *	response and hand it off to vdc_handle_dring_reg_msg().
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_dring_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_dring_negotiate(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Dring negotiation response,"
		    " rv(%d)", vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_dring_reg_msg(vdcp,
	    (vio_dring_reg_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_send_rdx()
 *
 * Description:
 *	Send an RDX message to the vDisk server to indicate that the
 *	handshake is complete and we are ready to transfer data.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_send_rdx(vdc_t *vdcp)
{
	vio_msg_t	msg;
	size_t		msglen = sizeof (vio_msg_t);
	int		status;

	/*
	 * Send an RDX message to vds to indicate we are ready
	 * to send data
	 */
	msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	msg.tag.vio_subtype_env = VIO_RDX;
	msg.tag.vio_sid = vdcp->session_id;
	status = vdc_send(vdcp, (caddr_t)&msg, &msglen);
	if (status != 0) {
		DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)",
		    vdcp->instance, status);
	}

	return (status);
}

/*
 * Function:
 *	vdc_handle_rdx()
 *
 * Description:
 *	Process the server's ACK to our RDX message; nothing is left to
 *	do beyond sanity-checking the message tag.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *	msgp	- received msg
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp)
{
	_NOTE(ARGUNUSED(vdcp))
	_NOTE(ARGUNUSED(msgp))

	ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL);
	ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK);
	ASSERT(msgp->tag.vio_subtype_env == VIO_RDX);

	DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance);

	return (0);
}

/*
 * Function:
 *	vdc_rdx_exchange()
 *
 * Description:
 *	Send the RDX message, wait for the server's ACK and hand it off
 *	to vdc_handle_rdx().
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_rdx_exchange(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_send_rdx(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) {
		DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg));
}


/* -------------------------------------------------------------------------- */

/*
 * LDC helper routines
 */

static int
vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp)
{
	int		status;
	boolean_t	q_has_pkts = B_FALSE;
	uint64_t	delay_time;
	size_t		len;

	mutex_enter(&vdc->read_lock);

	if (vdc->read_state == VDC_READ_IDLE)
		vdc->read_state = VDC_READ_WAITING;

	while (vdc->read_state != VDC_READ_PENDING) {

		/* detect if the connection has been reset */
		if (vdc->read_state == VDC_READ_RESET) {
			status = ECONNRESET;
			goto done;
		}

		cv_wait(&vdc->read_cv, &vdc->read_lock);
	}
	/*
	 * Until we get a blocking ldc read we have to retry
	 * until the entire LDC message has arrived before
	 * ldc_read() will succeed. Note we also bail out if
	 * the channel is reset or goes away.
	 */
	delay_time = vdc_ldc_read_init_delay;
loop:
	len = *nbytesp;
	status = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)msgp, &len);
	switch (status) {
	case EAGAIN:
		delay_time *= 2;
		if (delay_time >= vdc_ldc_read_max_delay)
			delay_time = vdc_ldc_read_max_delay;
		delay(delay_time);
		goto loop;

	case 0:
		if (len == 0) {
			DMSG(vdc, 1, "[%d] ldc_read returned 0 bytes with "
			    "no error!\n", vdc->instance);
			goto loop;
		}

		*nbytesp = len;

		/*
		 * If there are pending messages, leave the
		 * read state as pending. Otherwise, set the state
		 * back to idle.
		 */
		status = ldc_chkq(vdc->curr_server->ldc_handle, &q_has_pkts);
		if (status == 0 && !q_has_pkts)
			vdc->read_state = VDC_READ_IDLE;

		break;
	default:
		DMSG(vdc, 0, "ldc_read returned %d\n", status);
		break;
	}

done:
	mutex_exit(&vdc->read_lock);

	return (status);
}



#ifdef DEBUG
void
vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg)
{
	char *ms, *ss, *ses;
	switch (msg->tag.vio_msgtype) {
#define	Q(_s)	case _s : ms = #_s; break;
	Q(VIO_TYPE_CTRL)
	Q(VIO_TYPE_DATA)
	Q(VIO_TYPE_ERR)
#undef Q
	default: ms = "unknown"; break;
	}

	switch (msg->tag.vio_subtype) {
#define	Q(_s)	case _s : ss = #_s; break;
	Q(VIO_SUBTYPE_INFO)
	Q(VIO_SUBTYPE_ACK)
	Q(VIO_SUBTYPE_NACK)
#undef Q
	default: ss = "unknown"; break;
	}

	switch (msg->tag.vio_subtype_env) {
#define	Q(_s)	case _s : ses = #_s; break;
	Q(VIO_VER_INFO)
	Q(VIO_ATTR_INFO)
	Q(VIO_DRING_REG)
	Q(VIO_DRING_UNREG)
	Q(VIO_RDX)
	Q(VIO_PKT_DATA)
	Q(VIO_DESC_DATA)
	Q(VIO_DRING_DATA)
#undef Q
	default: ses = "unknown"; break;
	}

	DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, ms, ss, ses);
}
#endif
2098 */ 2099 delay_ticks = vdc_hz_min_ldc_delay; 2100 do { 2101 size = *msglen; 2102 status = ldc_write(vdc->curr_server->ldc_handle, pkt, &size); 2103 if (status == EWOULDBLOCK) { 2104 delay(delay_ticks); 2105 /* geometric backoff */ 2106 delay_ticks *= 2; 2107 if (delay_ticks > vdc_hz_max_ldc_delay) 2108 delay_ticks = vdc_hz_max_ldc_delay; 2109 } 2110 } while (status == EWOULDBLOCK); 2111 2112 /* if LDC had serious issues --- reset vdc state */ 2113 if (status == EIO || status == ECONNRESET) { 2114 /* wake up any readers blocked on this channel */ 2115 mutex_enter(&vdc->read_lock); 2116 if ((vdc->read_state == VDC_READ_WAITING) || 2117 (vdc->read_state == VDC_READ_RESET)) 2118 cv_signal(&vdc->read_cv); 2119 vdc->read_state = VDC_READ_RESET; 2120 mutex_exit(&vdc->read_lock); 2121 2122 /* wake up any waiters in the reset thread */ 2123 if (vdc->state == VDC_STATE_INIT_WAITING) { 2124 DMSG(vdc, 0, "[%d] write reset - " 2125 "vdc is resetting ..\n", vdc->instance); 2126 vdc->state = VDC_STATE_RESETTING; 2127 cv_signal(&vdc->initwait_cv); 2128 } 2129 2130 return (ECONNRESET); 2131 } 2132 2133 /* return the last size written */ 2134 *msglen = size; 2135 2136 return (status); 2137 } 2138 2139 /* 2140 * Function: 2141 * vdc_get_md_node 2142 * 2143 * Description: 2144 * Get the machine description (MD) handle and the device node for 2145 * the given disk instance. The caller is responsible for cleaning up 2146 * the reference to the returned MD (mdpp) by calling md_fini_handle(). 2147 * 2148 * Arguments: 2149 * dip - dev info pointer for this instance of the device driver. 2150 * mdpp - the returned MD. 2151 * vd_nodep - the returned device node. 2152 * 2153 * Return Code: 2154 * 0 - Success. 2155 * ENOENT - Expected node or property did not exist. 2156 * ENXIO - Unexpected error communicating with MD framework 2157 */ 2158 static int 2159 vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep) 2160 { 2161 int status = ENOENT; 2162 char *node_name = NULL; 2163 md_t *mdp = NULL; 2164 int num_nodes; 2165 int num_vdevs; 2166 mde_cookie_t rootnode; 2167 mde_cookie_t *listp = NULL; 2168 boolean_t found_inst = B_FALSE; 2169 int listsz; 2170 int idx; 2171 uint64_t md_inst; 2172 int obp_inst; 2173 int instance = ddi_get_instance(dip); 2174 2175 /* 2176 * Get the OBP instance number for comparison with the MD instance 2177 * 2178 * The "cfg-handle" property of a vdc node in an MD contains the MD's 2179 * notion of "instance", or unique identifier, for that node; OBP 2180 * stores the value of the "cfg-handle" MD property as the value of 2181 * the "reg" property on the node in the device tree it builds from 2182 * the MD and passes to Solaris. Thus, we look up the devinfo node's 2183 * "reg" property value to uniquely identify this device instance. 2184 * If the "reg" property cannot be found, the device tree state is 2185 * presumably so broken that there is no point in continuing. 2186 */ 2187 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) { 2188 cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG); 2189 return (ENOENT); 2190 } 2191 obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2192 OBP_REG, -1); 2193 DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst); 2194 2195 /* 2196 * We now walk the MD nodes to find the node for this vdisk.
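 * For example (hypothetical values): a disk node carrying cfg-handle = 0x1 in the MD appears in the guest device tree with "reg" = 0x1, so comparing the two identifies the MD node that describes this particular driver instance.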
2197 */ 2198 if ((mdp = md_get_handle()) == NULL) { 2199 cmn_err(CE_WARN, "unable to init machine description"); 2200 return (ENXIO); 2201 } 2202 2203 num_nodes = md_node_count(mdp); 2204 ASSERT(num_nodes > 0); 2205 2206 listsz = num_nodes * sizeof (mde_cookie_t); 2207 2208 /* allocate memory for nodes */ 2209 listp = kmem_zalloc(listsz, KM_SLEEP); 2210 2211 rootnode = md_root_node(mdp); 2212 ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); 2213 2214 /* 2215 * Search for all the virtual devices, we will then check to see which 2216 * ones are disk nodes. 2217 */ 2218 num_vdevs = md_scan_dag(mdp, rootnode, 2219 md_find_name(mdp, VDC_MD_VDEV_NAME), 2220 md_find_name(mdp, "fwd"), listp); 2221 2222 if (num_vdevs <= 0) { 2223 cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); 2224 status = ENOENT; 2225 goto done; 2226 } 2227 2228 DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs); 2229 for (idx = 0; idx < num_vdevs; idx++) { 2230 status = md_get_prop_str(mdp, listp[idx], "name", &node_name); 2231 if ((status != 0) || (node_name == NULL)) { 2232 cmn_err(CE_NOTE, "Unable to get name of node type '%s'" 2233 ": err %d", VDC_MD_VDEV_NAME, status); 2234 continue; 2235 } 2236 2237 DMSGX(1, "[%d] Found node '%s'\n", instance, node_name); 2238 if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { 2239 status = md_get_prop_val(mdp, listp[idx], 2240 VDC_MD_CFG_HDL, &md_inst); 2241 DMSGX(1, "[%d] vdc inst in MD=%lx\n", 2242 instance, md_inst); 2243 if ((status == 0) && (md_inst == obp_inst)) { 2244 found_inst = B_TRUE; 2245 break; 2246 } 2247 } 2248 } 2249 2250 if (!found_inst) { 2251 DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME); 2252 status = ENOENT; 2253 goto done; 2254 } 2255 DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst); 2256 2257 *vd_nodep = listp[idx]; 2258 *mdpp = mdp; 2259 done: 2260 kmem_free(listp, listsz); 2261 return (status); 2262 } 2263 2264 /* 2265 * Function: 2266 * vdc_init_ports 2267 * 2268 * Description: 2269 * Initialize all the ports for this vdisk instance. 2270 * 2271 * Arguments: 2272 * vdc - soft state pointer for this instance of the device driver. 2273 * mdp - md pointer 2274 * vd_nodep - device md node. 2275 * 2276 * Return Code: 2277 * 0 - Success. 2278 * ENOENT - Expected node or property did not exist. 2279 */ 2280 static int 2281 vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep) 2282 { 2283 int status = 0; 2284 int idx; 2285 int num_nodes; 2286 int num_vports; 2287 int num_chans; 2288 int listsz; 2289 mde_cookie_t vd_port; 2290 mde_cookie_t *chanp = NULL; 2291 mde_cookie_t *portp = NULL; 2292 vdc_server_t *srvr; 2293 vdc_server_t *prev_srvr = NULL; 2294 2295 /* 2296 * We now walk the MD nodes to find the port nodes for this vdisk. 
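 * The relevant MD layout is roughly: a virtual-device (disk) node with one virtual-device-port node per vdisk server, each port leading to a channel-endpoint node carrying the LDC id used to reach that server. Each port found below becomes one vdc_server_t on server_list.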
2297 */ 2298 num_nodes = md_node_count(mdp); 2299 ASSERT(num_nodes > 0); 2300 2301 listsz = num_nodes * sizeof (mde_cookie_t); 2302 2303 /* allocate memory for nodes */ 2304 portp = kmem_zalloc(listsz, KM_SLEEP); 2305 chanp = kmem_zalloc(listsz, KM_SLEEP); 2306 2307 num_vports = md_scan_dag(mdp, vd_nodep, 2308 md_find_name(mdp, VDC_MD_PORT_NAME), 2309 md_find_name(mdp, "fwd"), portp); 2310 if (num_vports == 0) { 2311 DMSGX(0, "Found no '%s' node for '%s' port\n", 2312 VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2313 status = ENOENT; 2314 goto done; 2315 } 2316 2317 DMSGX(1, "Found %d '%s' node(s) for '%s' port\n", 2318 num_vports, VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2319 2320 vdc->num_servers = 0; 2321 for (idx = 0; idx < num_vports; idx++) { 2322 2323 /* initialize this port */ 2324 vd_port = portp[idx]; 2325 srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP); 2326 srvr->vdcp = vdc; 2327 2328 /* get port id */ 2329 if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) { 2330 cmn_err(CE_NOTE, "vDisk port '%s' property not found", 2331 VDC_MD_ID); 2332 kmem_free(srvr, sizeof (vdc_server_t)); 2333 continue; 2334 } 2335 2336 /* set the connection timeout */ 2337 if (md_get_prop_val(mdp, vd_port, VDC_MD_TIMEOUT, 2338 &srvr->ctimeout) != 0) { 2339 srvr->ctimeout = 0; 2340 } 2341 2342 /* get the ldc id */ 2343 num_chans = md_scan_dag(mdp, vd_port, 2344 md_find_name(mdp, VDC_MD_CHAN_NAME), 2345 md_find_name(mdp, "fwd"), chanp); 2346 2347 /* expecting at least one channel */ 2348 if (num_chans <= 0) { 2349 cmn_err(CE_NOTE, "No '%s' node for '%s' port", 2350 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); 2351 kmem_free(srvr, sizeof (vdc_server_t)); 2352 continue; 2353 } else if (num_chans != 1) { 2354 DMSGX(0, "Expected 1 '%s' node for '%s' port, " 2355 "found %d\n", VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, 2356 num_chans); 2357 } 2358 2359 /* 2360 * We use the first channel found (index 0), irrespective of how 2361 * many there are in total. 2362 */ 2363 if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, 2364 &srvr->ldc_id) != 0) { 2365 cmn_err(CE_NOTE, "Channel '%s' property not found", 2366 VDC_MD_ID); 2367 kmem_free(srvr, sizeof (vdc_server_t)); 2368 continue; 2369 } 2370 2371 /* 2372 * now initialise the LDC channel which will be used to 2373 * communicate with this server 2374 */ 2375 if (vdc_do_ldc_init(vdc, srvr) != 0) { 2376 kmem_free(srvr, sizeof (vdc_server_t)); 2377 continue; 2378 } 2379 2380 /* add server to the end of the list */ 2381 if (prev_srvr) { 2382 prev_srvr->next = srvr; 2383 } else { 2384 vdc->server_list = srvr; 2385 } 2386 prev_srvr = srvr; 2387 2388 /* increment the number of servers */ 2389 vdc->num_servers++; 2390 } 2391 2392 /* 2393 * Adjust the max number of handshake retries to match 2394 * the number of vdisk servers. 2395 */ 2396 if (vdc_hshake_retries < vdc->num_servers) 2397 vdc_hshake_retries = vdc->num_servers; 2398 2399 /* pick first server as current server */ 2400 if (vdc->server_list != NULL) { 2401 vdc->curr_server = vdc->server_list; 2402 status = 0; 2403 } else { 2404 status = ENOENT; 2405 } 2406 2407 done: 2408 kmem_free(chanp, listsz); 2409 kmem_free(portp, listsz); 2410 return (status); 2411 } 2412 2413 2414 /* 2415 * Function: 2416 * vdc_do_ldc_up 2417 * 2418 * Description: 2419 * Bring the channel for the current server up. 2420 * 2421 * Arguments: 2422 * vdc - soft state pointer for this instance of the device driver. 2423 * 2424 * Return Code: 2425 * 0 - Success.
2426 * EINVAL - Driver is detaching / LDC error 2427 * ECONNREFUSED - Other end is not listening 2428 */ 2429 static int 2430 vdc_do_ldc_up(vdc_t *vdc) 2431 { 2432 int status; 2433 ldc_status_t ldc_state; 2434 2435 ASSERT(MUTEX_HELD(&vdc->lock)); 2436 2437 DMSG(vdc, 0, "[%d] Bringing up channel %lx\n", 2438 vdc->instance, vdc->curr_server->ldc_id); 2439 2440 if (vdc->lifecycle == VDC_LC_DETACHING) 2441 return (EINVAL); 2442 2443 if ((status = ldc_up(vdc->curr_server->ldc_handle)) != 0) { 2444 switch (status) { 2445 case ECONNREFUSED: /* listener not ready at other end */ 2446 DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n", 2447 vdc->instance, vdc->curr_server->ldc_id, status); 2448 status = 0; 2449 break; 2450 default: 2451 DMSG(vdc, 0, "[%d] Failed to bring up LDC: " 2452 "channel=%ld, err=%d", vdc->instance, 2453 vdc->curr_server->ldc_id, status); 2454 break; 2455 } 2456 } 2457 2458 if (ldc_status(vdc->curr_server->ldc_handle, &ldc_state) == 0) { 2459 vdc->curr_server->ldc_state = ldc_state; 2460 if (ldc_state == LDC_UP) { 2461 DMSG(vdc, 0, "[%d] LDC channel already up\n", 2462 vdc->instance); 2463 vdc->seq_num = 1; 2464 vdc->seq_num_reply = 0; 2465 } 2466 } 2467 2468 return (status); 2469 } 2470 2471 /* 2472 * Function: 2473 * vdc_terminate_ldc() 2474 * 2475 * Description: 2476 * Tear down the LDC connection with the given server: close the channel, unregister the callback and release the LDC resources, according to which VDC_LDC_* state flags are set. 2477 * Arguments: 2478 * vdc - soft state pointer for this instance of the device driver. 2479 * srvr - vdc per-server info structure 2480 * 2481 * Return Code: 2482 * None 2483 */ 2484 static void 2485 vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr) 2486 { 2487 int instance = ddi_get_instance(vdc->dip); 2488 2489 if (srvr->state & VDC_LDC_OPEN) { 2490 DMSG(vdc, 0, "[%d] ldc_close()\n", instance); 2491 (void) ldc_close(srvr->ldc_handle); 2492 } 2493 if (srvr->state & VDC_LDC_CB) { 2494 DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); 2495 (void) ldc_unreg_callback(srvr->ldc_handle); 2496 } 2497 if (srvr->state & VDC_LDC_INIT) { 2498 DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); 2499 (void) ldc_fini(srvr->ldc_handle); 2500 srvr->ldc_handle = NULL; 2501 } 2502 2503 srvr->state &= ~(VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN); 2504 } 2505 2506 /* 2507 * Function: 2508 * vdc_fini_ports() 2509 * 2510 * Description: 2511 * Finalize all ports by closing the channel associated with each 2512 * port and also freeing the server structure. 2513 * 2514 * Arguments: 2515 * vdc - soft state pointer for this instance of the device driver. 2516 * 2517 * Return Code: 2518 * None 2519 */ 2520 static void 2521 vdc_fini_ports(vdc_t *vdc) 2522 { 2523 int instance = ddi_get_instance(vdc->dip); 2524 vdc_server_t *srvr, *prev_srvr; 2525 2526 ASSERT(vdc != NULL); 2527 ASSERT(mutex_owned(&vdc->lock)); 2528 2529 DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized); 2530 2531 srvr = vdc->server_list; 2532 2533 while (srvr) { 2534 2535 vdc_terminate_ldc(vdc, srvr); 2536 2537 /* next server */ 2538 prev_srvr = srvr; 2539 srvr = srvr->next; 2540 2541 /* free server */ 2542 kmem_free(prev_srvr, sizeof (vdc_server_t)); 2543 } 2544 2545 vdc->server_list = NULL; 2546 } 2547 2548 /* -------------------------------------------------------------------------- */ 2549 2550 /* 2551 * Descriptor Ring helper routines 2552 */ 2553 2554 /* 2555 * Function: 2556 * vdc_init_descriptor_ring() 2557 * 2558 * Description: 2559 * Create the descriptor ring shared with the vDisk server, bind it to the current LDC channel and allocate the local dring that shadows it, sizing each entry for the largest transfer we may issue. 2560 * Arguments: 2561 * vdc - soft state pointer for this instance of the device driver.
2562 * 2563 * Return Code: 2564 * 0 - Success 2565 */ 2566 static int 2567 vdc_init_descriptor_ring(vdc_t *vdc) 2568 { 2569 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2570 int status = 0; 2571 int i; 2572 2573 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2574 2575 ASSERT(vdc != NULL); 2576 ASSERT(mutex_owned(&vdc->lock)); 2577 2578 /* ensure we have enough room to store max sized block */ 2579 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2580 2581 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2582 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2583 /* 2584 * Calculate the maximum block size we can transmit using one 2585 * Descriptor Ring entry from the attributes returned by the 2586 * vDisk server. This is subject to a minimum of 'maxphys' 2587 * as we do not have the capability to split requests over 2588 * multiple DRing entries. 2589 */ 2590 if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) { 2591 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2592 vdc->instance); 2593 vdc->dring_max_cookies = maxphys / PAGESIZE; 2594 } else { 2595 vdc->dring_max_cookies = 2596 (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE; 2597 } 2598 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2599 (sizeof (ldc_mem_cookie_t) * 2600 (vdc->dring_max_cookies - 1))); 2601 vdc->dring_len = VD_DRING_LEN; 2602 2603 status = ldc_mem_dring_create(vdc->dring_len, 2604 vdc->dring_entry_size, &vdc->dring_hdl); 2605 if ((vdc->dring_hdl == NULL) || (status != 0)) { 2606 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2607 vdc->instance); 2608 return (status); 2609 } 2610 vdc->initialized |= VDC_DRING_INIT; 2611 } 2612 2613 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2614 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2615 vdc->dring_cookie = 2616 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2617 2618 status = ldc_mem_dring_bind(vdc->curr_server->ldc_handle, 2619 vdc->dring_hdl, 2620 LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2621 &vdc->dring_cookie[0], 2622 &vdc->dring_cookie_count); 2623 if (status != 0) { 2624 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2625 "(%lx) to channel (%lx) status=%d\n", 2626 vdc->instance, vdc->dring_hdl, 2627 vdc->curr_server->ldc_handle, status); 2628 return (status); 2629 } 2630 ASSERT(vdc->dring_cookie_count == 1); 2631 vdc->initialized |= VDC_DRING_BOUND; 2632 } 2633 2634 status = ldc_mem_dring_info(vdc->dring_hdl, &vdc->dring_mem_info); 2635 if (status != 0) { 2636 DMSG(vdc, 0, 2637 "[%d] Failed to get info for descriptor ring (%lx)\n", 2638 vdc->instance, vdc->dring_hdl); 2639 return (status); 2640 } 2641 2642 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2643 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2644 2645 /* Allocate the local copy of this dring */ 2646 vdc->local_dring = 2647 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 2648 KM_SLEEP); 2649 vdc->initialized |= VDC_DRING_LOCAL; 2650 } 2651 2652 /* 2653 * Mark all DRing entries as free and initialize the private 2654 * descriptor's memory handles. If any entry is initialized, 2655 * we need to free it later so we set the bit in 'initialized' 2656 * at the start. 
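 * (A descriptor then normally cycles VIO_DESC_FREE -> VIO_DESC_READY, set by vdc when a request is mapped in, -> VIO_DESC_DONE, set by the server once the I/O completes, before being marked free again by vdc_depopulate_descriptor().)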
2657 */ 2658 vdc->initialized |= VDC_DRING_ENTRY; 2659 for (i = 0; i < vdc->dring_len; i++) { 2660 dep = VDC_GET_DRING_ENTRY_PTR(vdc, i); 2661 dep->hdr.dstate = VIO_DESC_FREE; 2662 2663 status = ldc_mem_alloc_handle(vdc->curr_server->ldc_handle, 2664 &vdc->local_dring[i].desc_mhdl); 2665 if (status != 0) { 2666 DMSG(vdc, 0, "![%d] Failed to alloc mem handle for" 2667 " descriptor %d", vdc->instance, i); 2668 return (status); 2669 } 2670 vdc->local_dring[i].is_free = B_TRUE; 2671 vdc->local_dring[i].dep = dep; 2672 } 2673 2674 /* Initialize the starting index */ 2675 vdc->dring_curr_idx = 0; 2676 2677 return (status); 2678 } 2679 2680 /* 2681 * Function: 2682 * vdc_destroy_descriptor_ring() 2683 * 2684 * Description: 2685 * Undo vdc_init_descriptor_ring(): free the memory handles of the local dring entries, free the local dring itself, then unbind and destroy the shared descriptor ring, guided by the 'initialized' flags. 2686 * Arguments: 2687 * vdc - soft state pointer for this instance of the device driver. 2688 * 2689 * Return Code: 2690 * None 2691 */ 2692 static void 2693 vdc_destroy_descriptor_ring(vdc_t *vdc) 2694 { 2695 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 2696 ldc_mem_handle_t mhdl = NULL; 2697 ldc_mem_info_t minfo; 2698 int status = -1; 2699 int i; /* loop counter */ 2700 2701 ASSERT(vdc != NULL); 2702 ASSERT(mutex_owned(&vdc->lock)); 2703 2704 DMSG(vdc, 0, "[%d] Entered\n", vdc->instance); 2705 2706 if (vdc->initialized & VDC_DRING_ENTRY) { 2707 DMSG(vdc, 0, 2708 "[%d] Removing Local DRing entries\n", vdc->instance); 2709 for (i = 0; i < vdc->dring_len; i++) { 2710 ldep = &vdc->local_dring[i]; 2711 mhdl = ldep->desc_mhdl; 2712 2713 if (mhdl == NULL) 2714 continue; 2715 2716 if ((status = ldc_mem_info(mhdl, &minfo)) != 0) { 2717 DMSG(vdc, 0, 2718 "ldc_mem_info returned an error: %d\n", 2719 status); 2720 2721 /* 2722 * This must mean that the mem handle 2723 * is not valid. Clear it out so that 2724 * no one tries to use it. 2725 */ 2726 ldep->desc_mhdl = NULL; 2727 continue; 2728 } 2729 2730 if (minfo.status == LDC_BOUND) { 2731 (void) ldc_mem_unbind_handle(mhdl); 2732 } 2733 2734 (void) ldc_mem_free_handle(mhdl); 2735 2736 ldep->desc_mhdl = NULL; 2737 } 2738 vdc->initialized &= ~VDC_DRING_ENTRY; 2739 } 2740 2741 if (vdc->initialized & VDC_DRING_LOCAL) { 2742 DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance); 2743 kmem_free(vdc->local_dring, 2744 vdc->dring_len * sizeof (vdc_local_desc_t)); 2745 vdc->initialized &= ~VDC_DRING_LOCAL; 2746 } 2747 2748 if (vdc->initialized & VDC_DRING_BOUND) { 2749 DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance); 2750 status = ldc_mem_dring_unbind(vdc->dring_hdl); 2751 if (status == 0) { 2752 vdc->initialized &= ~VDC_DRING_BOUND; 2753 } else { 2754 DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx", 2755 vdc->instance, status, vdc->dring_hdl); 2756 } 2757 kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t)); 2758 } 2759 2760 if (vdc->initialized & VDC_DRING_INIT) { 2761 DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance); 2762 status = ldc_mem_dring_destroy(vdc->dring_hdl); 2763 if (status == 0) { 2764 vdc->dring_hdl = NULL; 2765 bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t)); 2766 vdc->initialized &= ~VDC_DRING_INIT; 2767 } else { 2768 DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)", 2769 vdc->instance, status, vdc->dring_hdl); 2770 } 2771 } 2772 } 2773 2774 /* 2775 * Function: 2776 * vdc_map_to_shared_dring() 2777 * 2778 * Description: 2779 * Copy contents of the local descriptor to the shared 2780 * memory descriptor. 2781 * 2782 * Arguments: 2783 * vdcp - soft state pointer for this instance of the device driver.
2784 * idx - descriptor ring index 2785 * 2786 * Return Code: 2787 * 0 - Success; otherwise the error returned by vdc_populate_mem_hdl(). 2788 */ 2789 static int 2790 vdc_map_to_shared_dring(vdc_t *vdcp, int idx) 2791 { 2792 vdc_local_desc_t *ldep; 2793 vd_dring_entry_t *dep; 2794 int rv; 2795 2796 ldep = &(vdcp->local_dring[idx]); 2797 2798 /* for now leave in the old pop_mem_hdl stuff */ 2799 if (ldep->nbytes > 0) { 2800 rv = vdc_populate_mem_hdl(vdcp, ldep); 2801 if (rv) { 2802 DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n", 2803 vdcp->instance); 2804 return (rv); 2805 } 2806 } 2807 2808 /* 2809 * fill in the data details into the DRing 2810 */ 2811 dep = ldep->dep; 2812 ASSERT(dep != NULL); 2813 2814 dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp); 2815 dep->payload.operation = ldep->operation; 2816 dep->payload.addr = ldep->offset; 2817 dep->payload.nbytes = ldep->nbytes; 2818 dep->payload.status = (uint32_t)-1; /* vds will set valid value */ 2819 dep->payload.slice = ldep->slice; 2820 dep->hdr.dstate = VIO_DESC_READY; 2821 dep->hdr.ack = 1; /* request an ACK for every message */ 2822 2823 return (0); 2824 } 2825 2826 /* 2827 * Function: 2828 * vdc_send_request 2829 * 2830 * Description: 2831 * This routine writes the data to be transmitted to vds into the 2832 * descriptor, notifies vds that the ring has been updated and 2833 * then waits for the request to be processed. 2834 * 2835 * Arguments: 2836 * vdcp - the soft state pointer 2837 * operation - operation we want vds to perform (VD_OP_XXX) 2838 * addr - address of data buf to be read/written. 2839 * nbytes - number of bytes to read/write 2840 * slice - the disk slice this request is for 2841 * offset - relative disk offset 2842 * cb_type - type of call - STRATEGY or SYNC 2843 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2844 * . mode for ioctl(9e) 2845 * . LP64 diskaddr_t (block I/O) 2846 * dir - direction of operation (READ/WRITE/BOTH) 2847 * 2848 * Return Codes: 2849 * 0 2850 * EIO, ENXIO 2851 */ 2852 static int 2853 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr, 2854 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2855 void *cb_arg, vio_desc_direction_t dir) 2856 { 2857 int rv = 0; 2858 2859 ASSERT(vdcp != NULL); 2860 ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR); 2861 2862 mutex_enter(&vdcp->lock); 2863 2864 /* 2865 * If this is a block read/write operation we update the I/O statistics 2866 * to indicate that the request is being put on the waitq to be 2867 * serviced. 2868 * 2869 * We do it here (a common routine for both synchronous and strategy 2870 * calls) for performance reasons - we are already holding vdc->lock 2871 * so there is no extra locking overhead. We would have to explicitly 2872 * grab the 'lock' mutex to update the stats if we were to do this 2873 * higher up the stack in vdc_strategy() et al. 2874 */ 2875 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2876 DTRACE_IO1(start, buf_t *, cb_arg); 2877 VD_KSTAT_WAITQ_ENTER(vdcp); 2878 } 2879 2880 do { 2881 while (vdcp->state != VDC_STATE_RUNNING) { 2882 2883 /* return error if detaching */ 2884 if (vdcp->state == VDC_STATE_DETACH) { 2885 rv = ENXIO; 2886 goto done; 2887 } 2888 2889 /* fail request if connection timeout is reached */ 2890 if (vdcp->ctimeout_reached) { 2891 rv = EIO; 2892 goto done; 2893 } 2894 2895 /* 2896 * If we are panicking and the disk is not ready then 2897 * we can't send any request because we can't complete 2898 * the handshake now.
2899 */ 2900 if (ddi_in_panic()) { 2901 rv = EIO; 2902 goto done; 2903 } 2904 2905 cv_wait(&vdcp->running_cv, &vdcp->lock); 2906 } 2907 2908 } while (vdc_populate_descriptor(vdcp, operation, addr, 2909 nbytes, slice, offset, cb_type, cb_arg, dir)); 2910 2911 done: 2912 /* 2913 * If this is a block read/write we update the I/O statistics kstat 2914 * to indicate that this request has been placed on the queue for 2915 * processing (i.e sent to the vDisk server) - iostat(1M) will 2916 * report the time waiting for the vDisk server under the %b column 2917 * In the case of an error we simply take it off the wait queue. 2918 */ 2919 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2920 if (rv == 0) { 2921 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 2922 DTRACE_PROBE1(send, buf_t *, cb_arg); 2923 } else { 2924 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 2925 VD_KSTAT_WAITQ_EXIT(vdcp); 2926 DTRACE_IO1(done, buf_t *, cb_arg); 2927 } 2928 } 2929 2930 mutex_exit(&vdcp->lock); 2931 2932 return (rv); 2933 } 2934 2935 2936 /* 2937 * Function: 2938 * vdc_populate_descriptor 2939 * 2940 * Description: 2941 * This routine writes the data to be transmitted to vds into the 2942 * descriptor, notifies vds that the ring has been updated and 2943 * then waits for the request to be processed. 2944 * 2945 * Arguments: 2946 * vdcp - the soft state pointer 2947 * operation - operation we want vds to perform (VD_OP_XXX) 2948 * addr - address of data buf to be read/written. 2949 * nbytes - number of bytes to read/write 2950 * slice - the disk slice this request is for 2951 * offset - relative disk offset 2952 * cb_type - type of call - STRATEGY or SYNC 2953 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2954 * . mode for ioctl(9e) 2955 * . LP64 diskaddr_t (block I/O) 2956 * dir - direction of operation (READ/WRITE/BOTH) 2957 * 2958 * Return Codes: 2959 * 0 2960 * EAGAIN 2961 * ECONNRESET 2962 * ENXIO 2963 */ 2964 static int 2965 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 2966 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2967 void *cb_arg, vio_desc_direction_t dir) 2968 { 2969 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 2970 int idx; /* Index of DRing entry used */ 2971 int next_idx; 2972 vio_dring_msg_t dmsg; 2973 size_t msglen; 2974 int rv; 2975 2976 ASSERT(MUTEX_HELD(&vdcp->lock)); 2977 vdcp->threads_pending++; 2978 loop: 2979 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 2980 2981 /* Get next available D-Ring entry */ 2982 idx = vdcp->dring_curr_idx; 2983 local_dep = &(vdcp->local_dring[idx]); 2984 2985 if (!local_dep->is_free) { 2986 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 2987 vdcp->instance); 2988 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 2989 if (vdcp->state == VDC_STATE_RUNNING || 2990 vdcp->state == VDC_STATE_HANDLE_PENDING) { 2991 goto loop; 2992 } 2993 vdcp->threads_pending--; 2994 return (ECONNRESET); 2995 } 2996 2997 next_idx = idx + 1; 2998 if (next_idx >= vdcp->dring_len) 2999 next_idx = 0; 3000 vdcp->dring_curr_idx = next_idx; 3001 3002 ASSERT(local_dep->is_free); 3003 3004 local_dep->operation = operation; 3005 local_dep->addr = addr; 3006 local_dep->nbytes = nbytes; 3007 local_dep->slice = slice; 3008 local_dep->offset = offset; 3009 local_dep->cb_type = cb_type; 3010 local_dep->cb_arg = cb_arg; 3011 local_dep->dir = dir; 3012 3013 local_dep->is_free = B_FALSE; 3014 3015 rv = vdc_map_to_shared_dring(vdcp, idx); 3016 if (rv) { 3017 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 
3018 vdcp->instance); 3019 /* free the descriptor */ 3020 local_dep->is_free = B_TRUE; 3021 vdcp->dring_curr_idx = idx; 3022 cv_wait(&vdcp->membind_cv, &vdcp->lock); 3023 if (vdcp->state == VDC_STATE_RUNNING || 3024 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3025 goto loop; 3026 } 3027 vdcp->threads_pending--; 3028 return (ECONNRESET); 3029 } 3030 3031 /* 3032 * Send a msg with the DRing details to vds 3033 */ 3034 VIO_INIT_DRING_DATA_TAG(dmsg); 3035 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 3036 dmsg.dring_ident = vdcp->dring_ident; 3037 dmsg.start_idx = idx; 3038 dmsg.end_idx = idx; 3039 vdcp->seq_num++; 3040 3041 DTRACE_PROBE2(populate, int, vdcp->instance, 3042 vdc_local_desc_t *, local_dep); 3043 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 3044 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 3045 3046 /* 3047 * note we're still holding the lock here to 3048 * make sure the message goes out in order !!!... 3049 */ 3050 msglen = sizeof (dmsg); 3051 rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen); 3052 switch (rv) { 3053 case ECONNRESET: 3054 /* 3055 * vdc_send initiates the reset on failure. 3056 * Since the transaction has already been put 3057 * on the local dring, it will automatically get 3058 * retried when the channel is reset. Given that, 3059 * it is ok to just return success even though the 3060 * send failed. 3061 */ 3062 rv = 0; 3063 break; 3064 3065 case 0: /* EOK */ 3066 DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv); 3067 break; 3068 3069 default: 3070 goto cleanup_and_exit; 3071 } 3072 3073 vdcp->threads_pending--; 3074 return (rv); 3075 3076 cleanup_and_exit: 3077 DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv); 3078 return (ENXIO); 3079 } 3080 3081 /* 3082 * Function: 3083 * vdc_do_sync_op 3084 * 3085 * Description: 3086 * Wrapper around vdc_populate_descriptor that blocks until the 3087 * response to the message is available. 3088 * 3089 * Arguments: 3090 * vdcp - the soft state pointer 3091 * operation - operation we want vds to perform (VD_OP_XXX) 3092 * addr - address of data buf to be read/written. 3093 * nbytes - number of bytes to read/write 3094 * slice - the disk slice this request is for 3095 * offset - relative disk offset 3096 * cb_type - type of call - STRATEGY or SYNC 3097 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3098 * . mode for ioctl(9e) 3099 * . LP64 diskaddr_t (block I/O) 3100 * dir - direction of operation (READ/WRITE/BOTH) 3101 * rconflict - check for reservation conflict in case of failure 3102 * 3103 * rconflict should be set to B_TRUE by most callers. Callers invoking the 3104 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 3105 * result of a successful operation with vd_scsi_status(). 3106 * 3107 * Return Codes: 3108 * 0 3109 * EAGAIN 3110 * EFAULT 3111 * ENXIO 3112 * EIO 3113 */ 3114 static int 3115 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 3116 int slice, diskaddr_t offset, int cb_type, void *cb_arg, 3117 vio_desc_direction_t dir, boolean_t rconflict) 3118 { 3119 int status; 3120 vdc_io_t *vio; 3121 boolean_t check_resv_conflict = B_FALSE; 3122 3123 ASSERT(cb_type == CB_SYNC); 3124 3125 /* 3126 * Grab the lock, if blocked wait until the server 3127 * response causes us to wake up again. 
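 * In effect sync_op_blocked acts as a gate: only one synchronous operation is in flight at a time, and any others queue on sync_blocked_cv until the current one completes or the driver detaches.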
3128 */ 3129 mutex_enter(&vdcp->lock); 3130 vdcp->sync_op_cnt++; 3131 while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH) 3132 cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock); 3133 3134 if (vdcp->state == VDC_STATE_DETACH) { 3135 cv_broadcast(&vdcp->sync_blocked_cv); 3136 vdcp->sync_op_cnt--; 3137 mutex_exit(&vdcp->lock); 3138 return (ENXIO); 3139 } 3140 3141 /* now block any other thread entering after us */ 3142 vdcp->sync_op_blocked = B_TRUE; 3143 vdcp->sync_op_pending = B_TRUE; 3144 mutex_exit(&vdcp->lock); 3145 3146 status = vdc_send_request(vdcp, operation, addr, 3147 nbytes, slice, offset, cb_type, cb_arg, dir); 3148 3149 mutex_enter(&vdcp->lock); 3150 3151 if (status != 0) { 3152 vdcp->sync_op_pending = B_FALSE; 3153 } else { 3154 /* 3155 * block until our transaction completes; 3156 * anyone else waiting then gets to go next. 3157 */ 3158 while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH) 3159 cv_wait(&vdcp->sync_pending_cv, &vdcp->lock); 3160 3161 DMSG(vdcp, 2, ": operation returned %d\n", 3162 vdcp->sync_op_status); 3163 if (vdcp->state == VDC_STATE_DETACH) { 3164 vdcp->sync_op_pending = B_FALSE; 3165 status = ENXIO; 3166 } else { 3167 status = vdcp->sync_op_status; 3168 if (status != 0 && vdcp->failfast_interval != 0) { 3169 /* 3170 * Operation has failed and failfast is enabled. 3171 * We need to check if the failure is due to a 3172 * reservation conflict if this was requested. 3173 */ 3174 check_resv_conflict = rconflict; 3175 } 3176 3177 } 3178 } 3179 3180 vdcp->sync_op_status = 0; 3181 vdcp->sync_op_blocked = B_FALSE; 3182 vdcp->sync_op_cnt--; 3183 3184 /* signal the next waiting thread */ 3185 cv_signal(&vdcp->sync_blocked_cv); 3186 3187 /* 3188 * We have to check for reservation conflict after unblocking sync 3189 * operations because some sync operations will be used to do this 3190 * check. 3191 */ 3192 if (check_resv_conflict) { 3193 vio = vdc_failfast_io_queue(vdcp, NULL); 3194 while (vio->vio_qtime != 0) 3195 cv_wait(&vdcp->failfast_io_cv, &vdcp->lock); 3196 kmem_free(vio, sizeof (vdc_io_t)); 3197 } 3198 3199 mutex_exit(&vdcp->lock); 3200 3201 return (status); 3202 } 3203 3204 3205 /* 3206 * Function: 3207 * vdc_drain_response() 3208 * 3209 * Description: 3210 * When a guest is panicking, the completion of requests needs to be 3211 * handled differently because interrupts are disabled and vdc 3212 * will not get messages. We have to poll for the messages instead. 3213 * 3214 * Note: since we don't have a buf_t available we cannot implement 3215 * the io:::done DTrace probe in this specific case. 3216 * 3217 * Arguments: 3218 * vdc - soft state pointer for this instance of the device driver. 3219 * 3220 * Return Code: 3221 * 0 - Success 3222 */ 3223 static int 3224 vdc_drain_response(vdc_t *vdc) 3225 { 3226 int rv, idx, retries; 3227 size_t msglen; 3228 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3229 vio_dring_msg_t dmsg; 3230 3231 mutex_enter(&vdc->lock); 3232 3233 retries = 0; 3234 for (;;) { 3235 msglen = sizeof (dmsg); 3236 rv = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)&dmsg, 3237 &msglen); 3238 if (rv) { 3239 rv = EINVAL; 3240 break; 3241 } 3242 3243 /* 3244 * if there are no packets wait and check again 3245 */ 3246 if ((rv == 0) && (msglen == 0)) { 3247 if (retries++ > vdc_dump_retries) { 3248 rv = EAGAIN; 3249 break; 3250 } 3251 3252 drv_usecwait(vdc_usec_timeout_dump); 3253 continue; 3254 } 3255 3256 /* 3257 * Ignore all messages that are not ACKs/NACKs to 3258 * DRing requests.
3259 */ 3260 if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) || 3261 (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) { 3262 DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n", 3263 dmsg.tag.vio_msgtype, 3264 dmsg.tag.vio_subtype, 3265 dmsg.tag.vio_subtype_env); 3266 continue; 3267 } 3268 3269 /* 3270 * set the appropriate return value for the current request. 3271 */ 3272 switch (dmsg.tag.vio_subtype) { 3273 case VIO_SUBTYPE_ACK: 3274 rv = 0; 3275 break; 3276 case VIO_SUBTYPE_NACK: 3277 rv = EAGAIN; 3278 break; 3279 default: 3280 continue; 3281 } 3282 3283 idx = dmsg.start_idx; 3284 if (idx >= vdc->dring_len) { 3285 DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n", 3286 vdc->instance, idx); 3287 continue; 3288 } 3289 ldep = &vdc->local_dring[idx]; 3290 if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { 3291 DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n", 3292 vdc->instance, idx, ldep->dep->hdr.dstate); 3293 continue; 3294 } 3295 3296 DMSG(vdc, 1, "[%d] Depopulating idx=%d state=%d\n", 3297 vdc->instance, idx, ldep->dep->hdr.dstate); 3298 3299 rv = vdc_depopulate_descriptor(vdc, idx); 3300 if (rv) { 3301 DMSG(vdc, 0, 3302 "[%d] Entry @ %d - depopulate failed ..\n", 3303 vdc->instance, idx); 3304 } 3305 3306 /* if this is the last descriptor - break out of loop */ 3307 if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) 3308 break; 3309 } 3310 3311 mutex_exit(&vdc->lock); 3312 DMSG(vdc, 0, "End idx=%d\n", idx); 3313 3314 return (rv); 3315 } 3316 3317 3318 /* 3319 * Function: 3320 * vdc_depopulate_descriptor() 3321 * 3322 * Description: 3323 * Mark the given descriptor ring entry as free again, pick up the completion status set by the vDisk server and unbind the memory handle that was used for the transfer. 3324 * Arguments: 3325 * vdc - soft state pointer for this instance of the device driver. 3326 * idx - Index of the Descriptor Ring entry being modified 3327 * 3328 * Return Code: 3329 * 0 - Success 3330 */ 3331 static int 3332 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) 3333 { 3334 vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ 3335 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3336 int status = ENXIO; 3337 int rv = 0; 3338 3339 ASSERT(vdc != NULL); 3340 ASSERT(idx < vdc->dring_len); 3341 ldep = &vdc->local_dring[idx]; 3342 ASSERT(ldep != NULL); 3343 ASSERT(MUTEX_HELD(&vdc->lock)); 3344 3345 DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep); 3346 DMSG(vdc, 2, ": idx = %d\n", idx); 3347 3348 dep = ldep->dep; 3349 ASSERT(dep != NULL); 3350 ASSERT((dep->hdr.dstate == VIO_DESC_DONE) || 3351 (dep->payload.status == ECANCELED)); 3352 3353 VDC_MARK_DRING_ENTRY_FREE(vdc, idx); 3354 3355 ldep->is_free = B_TRUE; 3356 status = dep->payload.status; 3357 DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status); 3358 3359 /* 3360 * If no buffers were used to transfer information to the server when 3361 * populating the descriptor then no memory handles need to be unbound 3362 * and we can return now. 3363 */ 3364 if (ldep->nbytes == 0) { 3365 cv_signal(&vdc->dring_free_cv); 3366 return (status); 3367 } 3368 3369 /* 3370 * If the upper layer passed in a misaligned address we copied the 3371 * data into an aligned buffer before sending it to LDC - we now 3372 * copy it back to the original buffer.
3373 */ 3374 if (ldep->align_addr) { 3375 ASSERT(ldep->addr != NULL); 3376 3377 if (dep->payload.nbytes > 0) 3378 bcopy(ldep->align_addr, ldep->addr, 3379 dep->payload.nbytes); 3380 kmem_free(ldep->align_addr, 3381 sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8)); 3382 ldep->align_addr = NULL; 3383 } 3384 3385 rv = ldc_mem_unbind_handle(ldep->desc_mhdl); 3386 if (rv != 0) { 3387 DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)", 3388 vdc->instance, ldep->desc_mhdl, idx, rv); 3389 /* 3390 * The error returned by the vDisk server is more informative 3391 * and thus has a higher priority but if it isn't set we ensure 3392 * that this function returns an error. 3393 */ 3394 if (status == 0) 3395 status = EINVAL; 3396 } 3397 3398 cv_signal(&vdc->membind_cv); 3399 cv_signal(&vdc->dring_free_cv); 3400 3401 return (status); 3402 } 3403 3404 /* 3405 * Function: 3406 * vdc_populate_mem_hdl() 3407 * 3408 * Description: 3409 * Bind the buffer of the given local dring entry to its LDC memory handle and fill in the resulting cookies in the shared descriptor, copying misaligned buffers into an aligned bounce buffer first. 3410 * Arguments: 3411 * vdcp - soft state pointer for this instance of the device driver. 3412 * ldep - local dring entry being populated; it supplies the 3413 * buffer address, the number of bytes, the operation 3414 * (VD_OP_xxx) and the transfer direction. 3415 * 3416 * 3417 * Return Code: 3418 * 0 - Success; EAGAIN if the buffer could not be bound. 3419 */ 3420 static int 3421 vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep) 3422 { 3423 vd_dring_entry_t *dep = NULL; 3424 ldc_mem_handle_t mhdl; 3425 caddr_t vaddr; 3426 size_t nbytes; 3427 uint8_t perm = LDC_MEM_RW; 3428 uint8_t maptype; 3429 int rv = 0; 3430 int i; 3431 3432 ASSERT(vdcp != NULL); 3433 3434 dep = ldep->dep; 3435 mhdl = ldep->desc_mhdl; 3436 3437 switch (ldep->dir) { 3438 case VIO_read_dir: 3439 perm = LDC_MEM_W; 3440 break; 3441 3442 case VIO_write_dir: 3443 perm = LDC_MEM_R; 3444 break; 3445 3446 case VIO_both_dir: 3447 perm = LDC_MEM_RW; 3448 break; 3449 3450 default: 3451 ASSERT(0); /* catch bad programming in vdc */ 3452 } 3453 3454 /* 3455 * LDC expects any addresses passed in to be 8-byte aligned. We need 3456 * to copy the contents of any misaligned buffers to a newly allocated 3457 * buffer and bind it instead (and copy the contents back to the 3458 * original buffer passed in when depopulating the descriptor) 3459 */ 3460 vaddr = ldep->addr; 3461 nbytes = ldep->nbytes; 3462 if (((uint64_t)vaddr & 0x7) != 0) { 3463 ASSERT(ldep->align_addr == NULL); 3464 ldep->align_addr = 3465 kmem_alloc(sizeof (caddr_t) * 3466 P2ROUNDUP(nbytes, 8), KM_SLEEP); 3467 DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating " 3468 "(buf=%p nb=%ld op=%d)\n", 3469 vdcp->instance, (void *)vaddr, (void *)ldep->align_addr, 3470 nbytes, ldep->operation); 3471 if (perm != LDC_MEM_W) 3472 bcopy(vaddr, ldep->align_addr, nbytes); 3473 vaddr = ldep->align_addr; 3474 } 3475 3476 maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP; 3477 rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8), 3478 maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies); 3479 DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n", 3480 vdcp->instance, dep->payload.ncookies); 3481 if (rv != 0) { 3482 DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle " 3483 "(mhdl=%p, buf=%p, err=%d)\n", 3484 vdcp->instance, (void *)mhdl, (void *)vaddr, rv); 3485 if (ldep->align_addr) { 3486 kmem_free(ldep->align_addr, 3487 sizeof (caddr_t) * P2ROUNDUP(nbytes, 8)); 3488 ldep->align_addr = NULL; 3489 } 3490 return (EAGAIN); 3491 } 3492 3493 /* 3494 * Get the other cookies (if any).
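 * ldc_mem_bind_handle() returned the first cookie and the total count in ncookies; a buffer spanning non-contiguous pages needs one cookie per exported region, so the remainder are fetched one at a time with ldc_mem_nextcookie().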
3495 */ 3496 for (i = 1; i < dep->payload.ncookies; i++) { 3497 rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]); 3498 if (rv != 0) { 3499 (void) ldc_mem_unbind_handle(mhdl); 3500 DMSG(vdcp, 0, "?[%d] Failed to get next cookie " 3501 "(mhdl=%lx cnum=%d), err=%d", 3502 vdcp->instance, mhdl, i, rv); 3503 if (ldep->align_addr) { 3504 kmem_free(ldep->align_addr, 3505 sizeof (caddr_t) * ldep->nbytes); 3506 ldep->align_addr = NULL; 3507 } 3508 return (EAGAIN); 3509 } 3510 } 3511 3512 return (rv); 3513 } 3514 3515 /* 3516 * Interrupt handlers for messages from LDC 3517 */ 3518 3519 /* 3520 * Function: 3521 * vdc_handle_cb() 3522 * 3523 * Description: 3524 * Callback invoked by LDC for events on the channel: record when the channel comes up, signal readers when data arrives, and flag a reset so the message processing thread restarts the handshake. 3525 * Arguments: 3526 * event - Type of event (LDC_EVT_xxx) that triggered the callback 3527 * arg - soft state pointer for this instance of the device driver. 3528 * 3529 * Return Code: 3530 * 0 - Success 3531 */ 3532 static uint_t 3533 vdc_handle_cb(uint64_t event, caddr_t arg) 3534 { 3535 ldc_status_t ldc_state; 3536 int rv = 0; 3537 vdc_server_t *srvr = (vdc_server_t *)(void *)arg; 3538 vdc_t *vdc = srvr->vdcp; 3539 3540 ASSERT(vdc != NULL); 3541 3542 DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num); 3543 3544 /* If callback is not for the current server, ignore it */ 3545 mutex_enter(&vdc->lock); 3546 3547 if (vdc->curr_server != srvr) { 3548 DMSG(vdc, 0, "[%d] Ignoring event 0x%lx for port@%ld\n", 3549 vdc->instance, event, srvr->id); 3550 mutex_exit(&vdc->lock); 3551 return (LDC_SUCCESS); 3552 } 3553 3554 /* 3555 * Depending on the type of event that triggered this callback, 3556 * we modify the handshake state or read the data. 3557 * 3558 * NOTE: not done as a switch() as event could be triggered by 3559 * a state change and a read request. Also the ordering of the 3560 * check for the event types is deliberate. 3561 */ 3562 if (event & LDC_EVT_UP) { 3563 DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance); 3564 3565 /* get LDC state */ 3566 rv = ldc_status(srvr->ldc_handle, &ldc_state); 3567 if (rv != 0) { 3568 DMSG(vdc, 0, "[%d] Couldn't get LDC status %d", 3569 vdc->instance, rv); 3570 mutex_exit(&vdc->lock); 3571 return (LDC_SUCCESS); 3572 } 3573 if (srvr->ldc_state != LDC_UP && 3574 ldc_state == LDC_UP) { 3575 /* 3576 * Reset the transaction sequence numbers when 3577 * LDC comes up. We then kick off the handshake 3578 * negotiation with the vDisk server. 3579 */ 3580 vdc->seq_num = 1; 3581 vdc->seq_num_reply = 0; 3582 srvr->ldc_state = ldc_state; 3583 cv_signal(&vdc->initwait_cv); 3584 } 3585 } 3586 3587 if (event & LDC_EVT_READ) { 3588 DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance); 3589 mutex_enter(&vdc->read_lock); 3590 cv_signal(&vdc->read_cv); 3591 vdc->read_state = VDC_READ_PENDING; 3592 mutex_exit(&vdc->read_lock); 3593 mutex_exit(&vdc->lock); 3594 3595 /* that's all we have to do - no need to handle DOWN/RESET */ 3596 return (LDC_SUCCESS); 3597 } 3598 3599 if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) { 3600 3601 DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance); 3602 3603 /* 3604 * Need to wake up any readers so they will 3605 * detect that a reset has occurred.
3606 */ 3607 mutex_enter(&vdc->read_lock); 3608 if ((vdc->read_state == VDC_READ_WAITING) || 3609 (vdc->read_state == VDC_READ_RESET)) 3610 cv_signal(&vdc->read_cv); 3611 vdc->read_state = VDC_READ_RESET; 3612 mutex_exit(&vdc->read_lock); 3613 3614 /* wake up any threads waiting for connection to come up */ 3615 if (vdc->state == VDC_STATE_INIT_WAITING) { 3616 vdc->state = VDC_STATE_RESETTING; 3617 cv_signal(&vdc->initwait_cv); 3618 } 3619 3620 } 3621 3622 mutex_exit(&vdc->lock); 3623 3624 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) 3625 DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received", 3626 vdc->instance, event); 3627 3628 return (LDC_SUCCESS); 3629 } 3630 3631 /* 3632 * Function: 3633 * vdc_wait_for_response() 3634 * 3635 * Description: 3636 * Block waiting for a response from the server. If there is 3637 * no data, the thread blocks on read_cv, which is signalled 3638 * by the callback when an LDC_EVT_READ event occurs. 3639 * 3640 * Arguments: 3641 * vdcp - soft state pointer for this instance of the device driver. 3642 * msgp - buffer in which the received message is returned. 3643 * Return Code: 3644 * 0 - Success 3645 */ 3646 static int 3647 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp) 3648 { 3649 size_t nbytes = sizeof (*msgp); 3650 int status; 3651 3652 ASSERT(vdcp != NULL); 3653 3654 DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance); 3655 3656 status = vdc_recv(vdcp, msgp, &nbytes); 3657 DMSG(vdcp, 3, "vdc_read() done.. status=0x%x size=0x%x\n", 3658 status, (int)nbytes); 3659 if (status) { 3660 DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n", 3661 vdcp->instance, status); 3662 return (status); 3663 } 3664 3665 if (nbytes < sizeof (vio_msg_tag_t)) { 3666 DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n", 3667 vdcp->instance, sizeof (vio_msg_tag_t), nbytes); 3668 return (ENOMSG); 3669 } 3670 3671 DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance, 3672 msgp->tag.vio_msgtype, 3673 msgp->tag.vio_subtype, 3674 msgp->tag.vio_subtype_env); 3675 3676 /* 3677 * Verify the Session ID of the message 3678 * 3679 * Every message after the Version has been negotiated should 3680 * have the correct session ID set. 3681 */ 3682 if ((msgp->tag.vio_sid != vdcp->session_id) && 3683 (msgp->tag.vio_subtype_env != VIO_VER_INFO)) { 3684 DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, " 3685 "expected 0x%lx [seq num %lx @ %d]", 3686 vdcp->instance, msgp->tag.vio_sid, 3687 vdcp->session_id, 3688 ((vio_dring_msg_t *)msgp)->seq_num, 3689 ((vio_dring_msg_t *)msgp)->start_idx); 3690 return (ENOMSG); 3691 } 3692 return (0); 3693 } 3694 3695 3696 /* 3697 * Function: 3698 * vdc_resubmit_backup_dring() 3699 * 3700 * Description: 3701 * Resubmit each pending descriptor in the backed-up dring to 3702 * the vDisk server. The dring was backed up during connection 3703 * reset. 3704 * 3705 * Arguments: 3706 * vdcp - soft state pointer for this instance of the device driver.
3707 * 3708 * Return Code: 3709 * 0 - Success 3710 */ 3711 static int 3712 vdc_resubmit_backup_dring(vdc_t *vdcp) 3713 { 3714 int processed = 0; 3715 int count; 3716 int b_idx; 3717 int rv = 0; 3718 int dring_size; 3719 int op; 3720 vio_msg_t vio_msg; 3721 vdc_local_desc_t *curr_ldep; 3722 3723 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3724 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3725 3726 if (vdcp->local_dring_backup == NULL) { 3727 /* the pending requests have already been processed */ 3728 return (0); 3729 } 3730 3731 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3732 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3733 3734 /* 3735 * Walk the backup copy of the local descriptor ring and 3736 * resubmit all the outstanding transactions. 3737 */ 3738 b_idx = vdcp->local_dring_backup_tail; 3739 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3740 3741 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3742 3743 /* only resubmit outstanding transactions */ 3744 if (!curr_ldep->is_free) { 3745 /* 3746 * If we are retrying a block read/write operation we 3747 * need to update the I/O statistics to indicate that 3748 * the request is being put back on the waitq to be 3749 * serviced (it will have been taken off after the 3750 * error was reported). 3751 */ 3752 mutex_enter(&vdcp->lock); 3753 op = curr_ldep->operation; 3754 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3755 DTRACE_IO1(start, buf_t *, curr_ldep->cb_arg); 3756 VD_KSTAT_WAITQ_ENTER(vdcp); 3757 } 3758 3759 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3760 rv = vdc_populate_descriptor(vdcp, op, 3761 curr_ldep->addr, curr_ldep->nbytes, 3762 curr_ldep->slice, curr_ldep->offset, 3763 curr_ldep->cb_type, curr_ldep->cb_arg, 3764 curr_ldep->dir); 3765 3766 if (rv) { 3767 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3768 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3769 VD_KSTAT_WAITQ_EXIT(vdcp); 3770 DTRACE_IO1(done, buf_t *, 3771 curr_ldep->cb_arg); 3772 } 3773 DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", 3774 vdcp->instance, b_idx); 3775 mutex_exit(&vdcp->lock); 3776 goto done; 3777 } 3778 3779 /* 3780 * If this is a block read/write we update the I/O 3781 * statistics kstat to indicate that the request 3782 * has been sent back to the vDisk server and should 3783 * now be put on the run queue. 3784 */ 3785 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3786 DTRACE_PROBE1(send, buf_t *, curr_ldep->cb_arg); 3787 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 3788 } 3789 mutex_exit(&vdcp->lock); 3790 3791 /* Wait for the response message. */ 3792 DMSG(vdcp, 1, "waiting for response to idx=%x\n", 3793 b_idx); 3794 rv = vdc_wait_for_response(vdcp, &vio_msg); 3795 if (rv) { 3796 /* 3797 * If this is a block read/write we update 3798 * the I/O statistics kstat to take it 3799 * off the run queue. 
3800 */ 3801 mutex_enter(&vdcp->lock); 3802 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3803 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3804 VD_KSTAT_RUNQ_EXIT(vdcp); 3805 DTRACE_IO1(done, buf_t *, 3806 curr_ldep->cb_arg); 3807 } 3808 DMSG(vdcp, 1, "[%d] wait_for_response " 3809 "returned err=%d\n", vdcp->instance, 3810 rv); 3811 mutex_exit(&vdcp->lock); 3812 goto done; 3813 } 3814 3815 DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx); 3816 rv = vdc_process_data_msg(vdcp, &vio_msg); 3817 if (rv) { 3818 DMSG(vdcp, 1, "[%d] process_data_msg " 3819 "returned err=%d\n", vdcp->instance, 3820 rv); 3821 goto done; 3822 } 3823 /* 3824 * Mark this entry as free so that we will not resubmit 3825 * this "done" request again, if we were to use the same 3826 * backup_dring again in future. This could happen when 3827 * a reset happens while processing the backup_dring. 3828 */ 3829 curr_ldep->is_free = B_TRUE; 3830 processed++; 3831 } 3832 3833 /* get the next element to submit */ 3834 if (++b_idx >= vdcp->local_dring_backup_len) 3835 b_idx = 0; 3836 } 3837 3838 /* all done - now clear up pending dring copy */ 3839 dring_size = vdcp->local_dring_backup_len * 3840 sizeof (vdcp->local_dring_backup[0]); 3841 3842 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3843 3844 vdcp->local_dring_backup = NULL; 3845 3846 done: 3847 DTRACE_PROBE2(processed, int, processed, vdc_t *, vdcp); 3848 3849 return (rv); 3850 } 3851 3852 /* 3853 * Function: 3854 * vdc_cancel_backup_dring 3855 * 3856 * Description: 3857 * Cancel each pending descriptor in the backed-up dring. 3858 * The dring was backed up during connection reset. 3859 * 3860 * Arguments: 3861 * vdcp - soft state pointer for this instance of the device driver. 3862 * 3863 * Return Code: 3864 * None 3865 */ 3866 void 3867 vdc_cancel_backup_dring(vdc_t *vdcp) 3868 { 3869 vdc_local_desc_t *ldep; 3870 struct buf *bufp; 3871 int count; 3872 int b_idx; 3873 int dring_size; 3874 int cancelled = 0; 3875 3876 ASSERT(MUTEX_HELD(&vdcp->lock)); 3877 ASSERT(vdcp->state == VDC_STATE_INIT || 3878 vdcp->state == VDC_STATE_INIT_WAITING || 3879 vdcp->state == VDC_STATE_NEGOTIATE || 3880 vdcp->state == VDC_STATE_RESETTING); 3881 3882 if (vdcp->local_dring_backup == NULL) { 3883 /* the pending requests have already been processed */ 3884 return; 3885 } 3886 3887 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n", 3888 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3889 3890 /* 3891 * Walk the backup copy of the local descriptor ring and 3892 * cancel all the outstanding transactions. 3893 */ 3894 b_idx = vdcp->local_dring_backup_tail; 3895 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3896 3897 ldep = &(vdcp->local_dring_backup[b_idx]); 3898 3899 /* only cancel outstanding transactions */ 3900 if (!ldep->is_free) { 3901 3902 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx); 3903 cancelled++; 3904 3905 /* 3906 * All requests have already been cleared from the 3907 * local descriptor ring and the LDC channel has been 3908 * reset so we will never get any reply for these 3909 * requests. Now we just have to notify threads waiting 3910 * for replies that the request has failed.
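 * How a request is failed depends on how it was issued: CB_SYNC callers sleep on sync_pending_cv and only need sync_op_status set, while CB_STRATEGY callers get their buf_t errored and completed with biodone().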
3911 */ 3912 switch (ldep->cb_type) { 3913 case CB_SYNC: 3914 ASSERT(vdcp->sync_op_pending); 3915 vdcp->sync_op_status = EIO; 3916 vdcp->sync_op_pending = B_FALSE; 3917 cv_signal(&vdcp->sync_pending_cv); 3918 break; 3919 3920 case CB_STRATEGY: 3921 bufp = ldep->cb_arg; 3922 ASSERT(bufp != NULL); 3923 bufp->b_resid = bufp->b_bcount; 3924 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 3925 VD_KSTAT_RUNQ_EXIT(vdcp); 3926 DTRACE_IO1(done, buf_t *, bufp); 3927 bioerror(bufp, EIO); 3928 biodone(bufp); 3929 break; 3930 3931 default: 3932 ASSERT(0); 3933 } 3934 3935 } 3936 3937 /* get the next element to cancel */ 3938 if (++b_idx >= vdcp->local_dring_backup_len) 3939 b_idx = 0; 3940 } 3941 3942 /* all done - now clear up pending dring copy */ 3943 dring_size = vdcp->local_dring_backup_len * 3944 sizeof (vdcp->local_dring_backup[0]); 3945 3946 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3947 3948 vdcp->local_dring_backup = NULL; 3949 3950 DTRACE_PROBE2(cancelled, int, cancelled, vdc_t *, vdcp); 3951 } 3952 3953 /* 3954 * Function: 3955 * vdc_connection_timeout 3956 * 3957 * Description: 3958 * This function is invoked if the timeout set to establish the connection 3959 * with vds expires. This will happen if we spend too much time in the 3960 * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. Then we 3961 * cancel any pending requests and mark them as failed. 3962 * 3963 * If the connection is established in time, the timeout is cancelled when 3964 * we reach the VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This 3965 * function may still fire while we are in one of those states, in 3966 * which case we do nothing because the 3967 * timeout is about to be cancelled. 3968 * 3969 * Arguments: 3970 * arg - argument of the timeout function; actually a soft state 3971 * pointer for the instance of the device driver. 3972 * 3973 * Return Code: 3974 * None 3975 */ 3976 void 3977 vdc_connection_timeout(void *arg) 3978 { 3979 vdc_t *vdcp = (vdc_t *)arg; 3980 3981 mutex_enter(&vdcp->lock); 3982 3983 if (vdcp->state == VDC_STATE_HANDLE_PENDING || 3984 vdcp->state == VDC_STATE_DETACH) { 3985 /* 3986 * The connection has just been re-established or 3987 * we are detaching. 3988 */ 3989 vdcp->ctimeout_reached = B_FALSE; 3990 mutex_exit(&vdcp->lock); 3991 return; 3992 } 3993 3994 vdcp->ctimeout_reached = B_TRUE; 3995 3996 /* notify requests waiting for sending */ 3997 cv_broadcast(&vdcp->running_cv); 3998 3999 /* cancel requests waiting for a result */ 4000 vdc_cancel_backup_dring(vdcp); 4001 4002 mutex_exit(&vdcp->lock); 4003 4004 cmn_err(CE_NOTE, "[%d] connection to service domain timeout", 4005 vdcp->instance); 4006 } 4007 4008 /* 4009 * Function: 4010 * vdc_backup_local_dring() 4011 * 4012 * Description: 4013 * Back up the current dring in the event of a reset. The dring 4014 * transactions will be resubmitted to the server when the 4015 * connection is restored. 4016 * 4017 * Arguments: 4018 * vdcp - soft state pointer for this instance of the device driver. 4019 * 4020 * Return Code: 4021 * None 4022 */ 4023 static void 4024 vdc_backup_local_dring(vdc_t *vdcp) 4025 { 4026 int dring_size; 4027 4028 ASSERT(MUTEX_HELD(&vdcp->lock)); 4029 ASSERT(vdcp->state == VDC_STATE_RESETTING); 4030 4031 /* 4032 * If the backup dring is still around, it means 4033 * that the last restore did not complete. However, 4034 * since we never got back into the running state, 4035 * the backup copy we have is still valid.
4036 */ 4037 if (vdcp->local_dring_backup != NULL) { 4038 DMSG(vdcp, 1, "reusing local descriptor ring backup " 4039 "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len, 4040 vdcp->local_dring_backup_tail); 4041 return; 4042 } 4043 4044 /* 4045 * The backup dring can be NULL and the local dring may not be 4046 * initialized. This can happen if we had a reset while establishing 4047 * a new connection but after the connection has timed out. In that 4048 * case the backup dring is NULL because the requests have been 4049 * cancelled, and the reset occurred before the local dring was 4050 * initialized. 4051 */ 4052 if (!(vdcp->initialized & VDC_DRING_LOCAL)) 4053 return; 4054 4055 DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, " 4056 "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx); 4057 4058 dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]); 4059 4060 vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP); 4061 bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size); 4062 4063 vdcp->local_dring_backup_tail = vdcp->dring_curr_idx; 4064 vdcp->local_dring_backup_len = vdcp->dring_len; 4065 } 4066 4067 static void 4068 vdc_switch_server(vdc_t *vdcp) 4069 { 4070 int rv; 4071 vdc_server_t *curr_server, *new_server; 4072 4073 ASSERT(MUTEX_HELD(&vdcp->lock)); 4074 4075 /* if there is only one server there is nothing to switch to */ 4076 if (vdcp->num_servers == 1) { 4077 return; 4078 } 4079 4080 /* Get current and next server */ 4081 curr_server = vdcp->curr_server; 4082 new_server = 4083 (curr_server->next) ? curr_server->next : vdcp->server_list; 4084 ASSERT(curr_server != new_server); 4085 4086 /* bring current server's channel down */ 4087 rv = ldc_down(curr_server->ldc_handle); 4088 if (rv) { 4089 DMSG(vdcp, 0, "[%d] Cannot bring channel down, port %ld\n", 4090 vdcp->instance, curr_server->id); 4091 return; 4092 } 4093 4094 /* switch the server */ 4095 vdcp->curr_server = new_server; 4096 4097 cmn_err(CE_NOTE, "Successfully failed over from VDS on port@%ld to " 4098 "VDS on port@%ld.\n", curr_server->id, new_server->id); 4099 DMSG(vdcp, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n", 4100 vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id); 4101 } 4102 4103 /* -------------------------------------------------------------------------- */ 4104 4105 /* 4106 * The following functions process the incoming messages from vds 4107 */ 4108 4109 /* 4110 * Function: 4111 * vdc_process_msg_thread() 4112 * 4113 * Description: 4114 * 4115 * Main VDC message processing thread. Each vDisk instance 4116 * runs its own copy of this thread. The thread drives 4117 * all the handshakes and data exchange with the server. It 4118 * also handles all channel resets. 4119 * 4120 * Arguments: 4121 * vdc - soft state pointer for this instance of the device driver. 4122 * 4123 * Return Code: 4124 * None 4125 */ 4126 static void 4127 vdc_process_msg_thread(vdc_t *vdcp) 4128 { 4129 int status; 4130 int ctimeout; 4131 timeout_id_t tmid = 0; 4132 clock_t ldcup_timeout = 0; 4133 4134 mutex_enter(&vdcp->lock); 4135 4136 for (;;) { 4137 4138 #define Q(_s) (vdcp->state == _s) ?
#_s : 4139 DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state, 4140 Q(VDC_STATE_INIT) 4141 Q(VDC_STATE_INIT_WAITING) 4142 Q(VDC_STATE_NEGOTIATE) 4143 Q(VDC_STATE_HANDLE_PENDING) 4144 Q(VDC_STATE_RUNNING) 4145 Q(VDC_STATE_RESETTING) 4146 Q(VDC_STATE_DETACH) 4147 "UNKNOWN"); 4148 4149 switch (vdcp->state) { 4150 case VDC_STATE_INIT: 4151 4152 /* 4153 * If requested, start a timeout to check if the 4154 * connection with vds is established within the 4155 * specified delay. If the timeout expires, we 4156 * will cancel any pending requests. 4157 * 4158 * If a reset has occurred while establishing 4159 * the connection, we already have a timeout armed 4160 * and in that case we don't need to arm a new one. 4161 * 4162 * The same rule applies when there are multiple vds servers. 4163 * If either a connection cannot be established or 4164 * the handshake times out, the connection thread will 4165 * try another server. The 'ctimeout' will report 4166 * back an error after it expires irrespective of 4167 * whether the vdisk is trying to connect to just 4168 * one or multiple servers. 4169 */ 4170 ctimeout = (vdc_timeout != 0)? 4171 vdc_timeout : vdcp->curr_server->ctimeout; 4172 4173 if (ctimeout != 0 && tmid == 0) { 4174 tmid = timeout(vdc_connection_timeout, vdcp, 4175 ctimeout * drv_usectohz(MICROSEC)); 4176 } 4177 4178 /* Check if we are re-initializing repeatedly */ 4179 if (vdcp->hshake_cnt > vdc_hshake_retries && 4180 vdcp->lifecycle != VDC_LC_ONLINE) { 4181 4182 DMSG(vdcp, 0, "[%d] too many handshakes, cnt=%d", 4183 vdcp->instance, vdcp->hshake_cnt); 4184 cmn_err(CE_NOTE, "[%d] disk access failed.\n", 4185 vdcp->instance); 4186 vdcp->state = VDC_STATE_DETACH; 4187 break; 4188 } 4189 4190 /* Switch to STATE_DETACH if drv is detaching */ 4191 if (vdcp->lifecycle == VDC_LC_DETACHING) { 4192 vdcp->state = VDC_STATE_DETACH; 4193 break; 4194 } 4195 4196 /* Switch server */ 4197 if (vdcp->hshake_cnt > 0) 4198 vdc_switch_server(vdcp); 4199 vdcp->hshake_cnt++; 4200 4201 /* Bring up connection with vds via LDC */ 4202 status = vdc_start_ldc_connection(vdcp); 4203 if (status != EINVAL) { 4204 vdcp->state = VDC_STATE_INIT_WAITING; 4205 } 4206 break; 4207 4208 case VDC_STATE_INIT_WAITING: 4209 4210 /* if channel is UP, start negotiation */ 4211 if (vdcp->curr_server->ldc_state == LDC_UP) { 4212 vdcp->state = VDC_STATE_NEGOTIATE; 4213 break; 4214 } 4215 4216 /* check if only one server exists */ 4217 if (vdcp->num_servers == 1) { 4218 cv_wait(&vdcp->initwait_cv, &vdcp->lock); 4219 } else { 4220 /* 4221 * wait for LDC_UP; if it times out, switch 4222 * to another server.
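 *
 * The wait below is bounded in clock ticks. As an
 * illustration (the actual tunable values may differ):
 * with vdc_ldcup_timeout set to 2 seconds and a clock
 * rate of hz = 100, drv_usectohz(MICROSEC) is 100, so
 * cv_timedwait() is given ddi_get_lbolt() + 200 ticks,
 * i.e. roughly two seconds from now; on expiry the
 * thread goes back to VDC_STATE_INIT where
 * vdc_switch_server() picks the next server.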
4223 */ 4224 ldcup_timeout = ddi_get_lbolt() + 4225 (vdc_ldcup_timeout * 4226 drv_usectohz(MICROSEC)); 4227 status = cv_timedwait(&vdcp->initwait_cv, 4228 &vdcp->lock, ldcup_timeout); 4229 if (status == -1 && 4230 vdcp->state == VDC_STATE_INIT_WAITING && 4231 vdcp->curr_server->ldc_state != LDC_UP) { 4232 /* timed out & still waiting */ 4233 vdcp->state = VDC_STATE_INIT; 4234 break; 4235 } 4236 } 4237 4238 if (vdcp->state != VDC_STATE_INIT_WAITING) { 4239 DMSG(vdcp, 0, 4240 "state moved to %d out from under us...\n", 4241 vdcp->state); 4242 } 4243 break; 4244 4245 case VDC_STATE_NEGOTIATE: 4246 switch (status = vdc_ver_negotiation(vdcp)) { 4247 case 0: 4248 break; 4249 default: 4250 DMSG(vdcp, 0, "ver negotiate failed (%d)..\n", 4251 status); 4252 goto reset; 4253 } 4254 4255 switch (status = vdc_attr_negotiation(vdcp)) { 4256 case 0: 4257 break; 4258 default: 4259 DMSG(vdcp, 0, "attr negotiate failed (%d)..\n", 4260 status); 4261 goto reset; 4262 } 4263 4264 switch (status = vdc_dring_negotiation(vdcp)) { 4265 case 0: 4266 break; 4267 default: 4268 DMSG(vdcp, 0, "dring negotiate failed (%d)..\n", 4269 status); 4270 goto reset; 4271 } 4272 4273 switch (status = vdc_rdx_exchange(vdcp)) { 4274 case 0: 4275 vdcp->state = VDC_STATE_HANDLE_PENDING; 4276 goto done; 4277 default: 4278 DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n", 4279 status); 4280 goto reset; 4281 } 4282 reset: 4283 DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n", 4284 status); 4285 vdcp->state = VDC_STATE_RESETTING; 4286 vdcp->self_reset = B_TRUE; 4287 done: 4288 DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", 4289 vdcp->state); 4290 break; 4291 4292 case VDC_STATE_HANDLE_PENDING: 4293 4294 if (vdcp->ctimeout_reached) { 4295 /* 4296 * The connection timeout had been reached so 4297 * pending requests have been cancelled. Now 4298 * that the connection is back we can reset 4299 * the timeout. 4300 */ 4301 ASSERT(vdcp->local_dring_backup == NULL); 4302 ASSERT(tmid != 0); 4303 tmid = 0; 4304 vdcp->ctimeout_reached = B_FALSE; 4305 vdcp->state = VDC_STATE_RUNNING; 4306 DMSG(vdcp, 0, "[%d] connection to service " 4307 "domain is up", vdcp->instance); 4308 break; 4309 } 4310 4311 mutex_exit(&vdcp->lock); 4312 if (tmid != 0) { 4313 (void) untimeout(tmid); 4314 tmid = 0; 4315 } 4316 status = vdc_resubmit_backup_dring(vdcp); 4317 mutex_enter(&vdcp->lock); 4318 4319 if (status) 4320 vdcp->state = VDC_STATE_RESETTING; 4321 else 4322 vdcp->state = VDC_STATE_RUNNING; 4323 4324 break; 4325 4326 /* enter running state */ 4327 case VDC_STATE_RUNNING: 4328 /* 4329 * Signal anyone waiting for the connection 4330 * to come on line. 
4331 */ 4332 vdcp->hshake_cnt = 0; 4333 cv_broadcast(&vdcp->running_cv); 4334 4335 /* failfast has to be checked after a reset */ 4336 cv_signal(&vdcp->failfast_cv); 4337 4338 /* ownership is lost during reset */ 4339 if (vdcp->ownership & VDC_OWNERSHIP_WANTED) 4340 vdcp->ownership |= VDC_OWNERSHIP_RESET; 4341 cv_signal(&vdcp->ownership_cv); 4342 4343 mutex_exit(&vdcp->lock); 4344 4345 for (;;) { 4346 vio_msg_t msg; 4347 status = vdc_wait_for_response(vdcp, &msg); 4348 if (status) break; 4349 4350 DMSG(vdcp, 1, "[%d] new pkt(s) available\n", 4351 vdcp->instance); 4352 status = vdc_process_data_msg(vdcp, &msg); 4353 if (status) { 4354 DMSG(vdcp, 1, "[%d] process_data_msg " 4355 "returned err=%d\n", vdcp->instance, 4356 status); 4357 break; 4358 } 4359 4360 } 4361 4362 mutex_enter(&vdcp->lock); 4363 4364 vdcp->state = VDC_STATE_RESETTING; 4365 vdcp->self_reset = B_TRUE; 4366 break; 4367 4368 case VDC_STATE_RESETTING: 4369 /* 4370 * When we reach this state, we either come from the 4371 * VDC_STATE_RUNNING state and we can have pending 4372 * requests but no timeout is armed; or we come from 4373 * the VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE or 4374 * VDC_STATE_HANDLE_PENDING state and there are no pending 4375 * requests or the pending requests have already been copied 4376 * into the backup dring. So we can safely keep the 4377 * connection timeout armed while we are in this state. 4378 */ 4379 4380 DMSG(vdcp, 0, "Initiating channel reset " 4381 "(pending = %d)\n", (int)vdcp->threads_pending); 4382 4383 if (vdcp->self_reset) { 4384 DMSG(vdcp, 0, 4385 "[%d] calling stop_ldc_connection.\n", 4386 vdcp->instance); 4387 status = vdc_stop_ldc_connection(vdcp); 4388 vdcp->self_reset = B_FALSE; 4389 } 4390 4391 /* 4392 * Wait for all threads currently waiting 4393 * for a free dring entry.
4394 */ 4395 while (vdcp->threads_pending) { 4396 cv_broadcast(&vdcp->membind_cv); 4397 cv_broadcast(&vdcp->dring_free_cv); 4398 mutex_exit(&vdcp->lock); 4399 /* give the waiters enough time to wake up */ 4400 delay(vdc_hz_min_ldc_delay); 4401 mutex_enter(&vdcp->lock); 4402 } 4403 4404 ASSERT(vdcp->threads_pending == 0); 4405 4406 /* Sanity check that no thread is receiving */ 4407 ASSERT(vdcp->read_state != VDC_READ_WAITING); 4408 4409 vdcp->read_state = VDC_READ_IDLE; 4410 4411 vdc_backup_local_dring(vdcp); 4412 4413 /* cleanup the old d-ring */ 4414 vdc_destroy_descriptor_ring(vdcp); 4415 4416 /* go and start again */ 4417 vdcp->state = VDC_STATE_INIT; 4418 4419 break; 4420 4421 case VDC_STATE_DETACH: 4422 DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n", 4423 vdcp->instance); 4424 4425 /* cancel any pending timeout */ 4426 mutex_exit(&vdcp->lock); 4427 if (tmid != 0) { 4428 (void) untimeout(tmid); 4429 tmid = 0; 4430 } 4431 mutex_enter(&vdcp->lock); 4432 4433 /* 4434 * Signal anyone waiting for connection 4435 * to come online 4436 */ 4437 cv_broadcast(&vdcp->running_cv); 4438 4439 while (vdcp->sync_op_pending) { 4440 cv_signal(&vdcp->sync_pending_cv); 4441 cv_signal(&vdcp->sync_blocked_cv); 4442 mutex_exit(&vdcp->lock); 4443 /* give the waiters enough time to wake up */ 4444 delay(vdc_hz_min_ldc_delay); 4445 mutex_enter(&vdcp->lock); 4446 } 4447 4448 mutex_exit(&vdcp->lock); 4449 4450 DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n", 4451 vdcp->instance); 4452 thread_exit(); 4453 break; 4454 } 4455 } 4456 } 4457 4458 4459 /* 4460 * Function: 4461 * vdc_process_data_msg() 4462 * 4463 * Description: 4464 * This function is called by the message processing thread each time 4465 * a message with a msgtype of VIO_TYPE_DATA is received. It will either 4466 * be an ACK or NACK from vds[1] which vdc handles as follows. 4467 * ACK - wake up the waiting thread 4468 * NACK - resend any messages necessary 4469 * 4470 * [1] Although the message format allows it, vds should not send a 4471 * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for 4472 * some bizarre reason it does, vdc will reset the connection. 4473 * 4474 * Arguments: 4475 * vdc - soft state pointer for this instance of the device driver. 4476 * msg - the LDC message sent by vds 4477 * 4478 * Return Code: 4479 * 0 - Success. 4480 * > 0 - error value returned by LDC 4481 */ 4482 static int 4483 vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg) 4484 { 4485 int status = 0; 4486 vio_dring_msg_t *dring_msg; 4487 vdc_local_desc_t *ldep = NULL; 4488 int start, end; 4489 int idx; 4490 int op; 4491 4492 dring_msg = (vio_dring_msg_t *)msg; 4493 4494 ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA); 4495 ASSERT(vdcp != NULL); 4496 4497 mutex_enter(&vdcp->lock); 4498 4499 /* 4500 * Check to see if the message has bogus data 4501 */ 4502 idx = start = dring_msg->start_idx; 4503 end = dring_msg->end_idx; 4504 if ((start >= vdcp->dring_len) || 4505 (end >= vdcp->dring_len) || (end < -1)) { 4506 /* 4507 * Update the I/O statistics to indicate that an error occurred. 4508 * No need to update the wait/run queues as no specific read or 4509 * write request is being completed in response to this 'msg'. 4510 */ 4511 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4512 DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n", 4513 vdcp->instance, start, end); 4514 mutex_exit(&vdcp->lock); 4515 return (EINVAL); 4516 } 4517 4518 /* 4519 * Verify that the sequence number is what vdc expects.
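 *
 * As a worked example (the values are illustrative): if the last
 * sequence number vdc generated is 15 (vdcp->seq_num) and the last
 * one the server replied to is 10 (vdcp->seq_num_reply), then a
 * reply carrying 11 through 15 is acceptable. A reply carrying 10
 * or below has already been answered, and one carrying 16 or above
 * was never sent, so both cases make vdc_verify_seq_num() return
 * VDC_SEQ_NUM_INVALID and the connection will be reset.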
4520 */ 4521 switch (vdc_verify_seq_num(vdcp, dring_msg)) { 4522 case VDC_SEQ_NUM_TODO: 4523 break; /* keep processing this message */ 4524 case VDC_SEQ_NUM_SKIP: 4525 mutex_exit(&vdcp->lock); 4526 return (0); 4527 case VDC_SEQ_NUM_INVALID: 4528 /* 4529 * Update the I/O statistics to indicate that an error occurred. 4530 * No need to update the wait/run queues as no specific read or 4531 * write request is being completed in response to this 'msg'. 4532 */ 4533 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4534 DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance); 4535 mutex_exit(&vdcp->lock); 4536 return (ENXIO); 4537 } 4538 4539 if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) { 4540 /* 4541 * Update the I/O statistics to indicate that an error occurred. 4542 * 4543 * We need to update the run queue if a read or write request 4544 * is being NACKed - otherwise there will appear to be an 4545 * indefinite outstanding request and statistics reported by 4546 * iostat(1M) will be incorrect. The transaction will be 4547 * resubmitted from the backup DRing following the reset 4548 * and the wait/run queues will be entered again. 4549 */ 4550 ldep = &vdcp->local_dring[idx]; 4551 op = ldep->operation; 4552 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 4553 DTRACE_IO1(done, buf_t *, ldep->cb_arg); 4554 VD_KSTAT_RUNQ_EXIT(vdcp); 4555 } 4556 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4557 VDC_DUMP_DRING_MSG(dring_msg); 4558 DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance); 4559 mutex_exit(&vdcp->lock); 4560 return (EIO); 4561 4562 } else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) { 4563 /* 4564 * Update the I/O statistics to indicate that an error occurred. 4565 * No need to update the wait/run queues as no specific read or 4566 * write request is being completed in response to this 'msg'. 4567 */ 4568 VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs); 4569 mutex_exit(&vdcp->lock); 4570 return (EPROTO); 4571 } 4572 4573 DMSG(vdcp, 1, ": start %d end %d\n", start, end); 4574 ASSERT(start == end); 4575 4576 ldep = &vdcp->local_dring[idx]; 4577 4578 DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n", 4579 ldep->dep->hdr.dstate, ldep->cb_type); 4580 4581 if (ldep->dep->hdr.dstate == VIO_DESC_DONE) { 4582 struct buf *bufp; 4583 4584 switch (ldep->cb_type) { 4585 case CB_SYNC: 4586 ASSERT(vdcp->sync_op_pending); 4587 4588 status = vdc_depopulate_descriptor(vdcp, idx); 4589 vdcp->sync_op_status = status; 4590 vdcp->sync_op_pending = B_FALSE; 4591 cv_signal(&vdcp->sync_pending_cv); 4592 break; 4593 4594 case CB_STRATEGY: 4595 bufp = ldep->cb_arg; 4596 ASSERT(bufp != NULL); 4597 bufp->b_resid = 4598 bufp->b_bcount - ldep->dep->payload.nbytes; 4599 status = ldep->dep->payload.status; /* Future:ntoh */ 4600 if (status != 0) { 4601 DMSG(vdcp, 1, "strategy status=%d\n", status); 4602 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4603 bioerror(bufp, status); 4604 } 4605 4606 (void) vdc_depopulate_descriptor(vdcp, idx); 4607 4608 DMSG(vdcp, 1, 4609 "strategy complete req=%ld bytes resp=%ld bytes\n", 4610 bufp->b_bcount, ldep->dep->payload.nbytes); 4611 4612 if (status != 0 && vdcp->failfast_interval != 0) { 4613 /* 4614 * The I/O has failed and failfast is enabled. 4615 * We need the failfast thread to check if the 4616 * failure is due to a reservation conflict. 4617 */ 4618 (void) vdc_failfast_io_queue(vdcp, bufp); 4619 } else { 4620 if (status == 0) { 4621 op = (bufp->b_flags & B_READ) ?
4622 VD_OP_BREAD : VD_OP_BWRITE; 4623 VD_UPDATE_IO_STATS(vdcp, op, 4624 ldep->dep->payload.nbytes); 4625 } 4626 VD_KSTAT_RUNQ_EXIT(vdcp); 4627 DTRACE_IO1(done, buf_t *, bufp); 4628 biodone(bufp); 4629 } 4630 break; 4631 4632 default: 4633 ASSERT(0); 4634 } 4635 } 4636 4637 /* let the arrival signal propagate */ 4638 mutex_exit(&vdcp->lock); 4639 4640 /* probe gives the count of how many entries were processed */ 4641 DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp); 4642 4643 return (0); 4644 } 4645 4646 4647 /* 4648 * Function: 4649 * vdc_handle_ver_msg() 4650 * 4651 * Description: 4652 * Handle a VIO_VER_INFO message sent by the vDisk server during version negotiation. 4653 * Arguments: 4654 * vdc - soft state pointer for this instance of the device driver. 4655 * ver_msg - LDC message sent by vDisk server 4656 * 4657 * Return Code: 4658 * 0 - Success 4659 */ 4660 static int 4661 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg) 4662 { 4663 int status = 0; 4664 4665 ASSERT(vdc != NULL); 4666 ASSERT(mutex_owned(&vdc->lock)); 4667 4668 if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) { 4669 return (EPROTO); 4670 } 4671 4672 if (ver_msg->dev_class != VDEV_DISK_SERVER) { 4673 return (EINVAL); 4674 } 4675 4676 switch (ver_msg->tag.vio_subtype) { 4677 case VIO_SUBTYPE_ACK: 4678 /* 4679 * We check to see if the version returned is indeed supported 4680 * (The server may have also adjusted the minor number downwards 4681 * and if so 'ver_msg' will contain the actual version agreed) 4682 */ 4683 if (vdc_is_supported_version(ver_msg)) { 4684 vdc->ver.major = ver_msg->ver_major; 4685 vdc->ver.minor = ver_msg->ver_minor; 4686 ASSERT(vdc->ver.major > 0); 4687 } else { 4688 status = EPROTO; 4689 } 4690 break; 4691 4692 case VIO_SUBTYPE_NACK: 4693 /* 4694 * call vdc_is_supported_version() which will return the next 4695 * supported version (if any) in 'ver_msg' 4696 */ 4697 (void) vdc_is_supported_version(ver_msg); 4698 if (ver_msg->ver_major > 0) { 4699 size_t len = sizeof (*ver_msg); 4700 4701 ASSERT(vdc->ver.major > 0); 4702 4703 /* reset the necessary fields and resend */ 4704 ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO; 4705 ver_msg->dev_class = VDEV_DISK; 4706 4707 status = vdc_send(vdc, (caddr_t)ver_msg, &len); 4708 DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n", 4709 vdc->instance, status); 4710 if (len != sizeof (*ver_msg)) 4711 status = EBADMSG; 4712 } else { 4713 DMSG(vdc, 0, "[%d] No common version with vDisk server", 4714 vdc->instance); 4715 status = ENOTSUP; 4716 } 4717 4718 break; 4719 case VIO_SUBTYPE_INFO: 4720 /* 4721 * Handle the case where vds starts the handshake 4722 * (for now only vdc is the instigator) 4723 */ 4724 status = ENOTSUP; 4725 break; 4726 4727 default: 4728 status = EINVAL; 4729 break; 4730 } 4731 4732 return (status); 4733 } 4734 4735 /* 4736 * Function: 4737 * vdc_handle_attr_msg() 4738 * 4739 * Description: 4740 * Handle a VIO_ATTR_INFO message sent by the vDisk server during attribute exchange. 4741 * Arguments: 4742 * vdc - soft state pointer for this instance of the device driver. 4743 * attr_msg - LDC message sent by vDisk server 4744 * 4745 * Return Code: 4746 * 0 - Success 4747 */ 4748 static int 4749 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg) 4750 { 4751 int status = 0; 4752 4753 ASSERT(vdc != NULL); 4754 ASSERT(mutex_owned(&vdc->lock)); 4755 4756 if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) { 4757 return (EPROTO); 4758 } 4759 4760 switch (attr_msg->tag.vio_subtype) { 4761 case VIO_SUBTYPE_ACK: 4762 /* 4763 * We now verify the attributes sent by vds.
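 *
 * For example, an ACK carrying a vdisk_size or a max_xfer_sz of zero
 * is rejected with EINVAL, and a vdisk_size of VD_SIZE_UNKNOWN is
 * remapped to 0 so that the actual size can be discovered later.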
4764 */ 4765 if (attr_msg->vdisk_size == 0) { 4766 DMSG(vdc, 0, "[%d] Invalid disk size from vds", 4767 vdc->instance); 4768 status = EINVAL; 4769 break; 4770 } 4771 4772 if (attr_msg->max_xfer_sz == 0) { 4773 DMSG(vdc, 0, "[%d] Invalid transfer size from vds", 4774 vdc->instance); 4775 status = EINVAL; 4776 break; 4777 } 4778 4779 if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) { 4780 DMSG(vdc, 0, "[%d] Unknown disk size from vds", 4781 vdc->instance); 4782 attr_msg->vdisk_size = 0; 4783 } 4784 4785 /* 4786 * If the disk size is already set, check that it hasn't changed. 4787 */ 4788 if ((vdc->vdisk_size != 0) && (attr_msg->vdisk_size != 0) && 4789 (vdc->vdisk_size != attr_msg->vdisk_size)) { 4790 DMSG(vdc, 0, "[%d] Different disk size from vds " 4791 "(old=0x%lx - new=0x%lx)", vdc->instance, 4792 vdc->vdisk_size, attr_msg->vdisk_size); 4793 status = EINVAL; 4794 break; 4795 } 4796 4797 vdc->vdisk_size = attr_msg->vdisk_size; 4798 vdc->vdisk_type = attr_msg->vdisk_type; 4799 vdc->operations = attr_msg->operations; 4800 if (vio_ver_is_supported(vdc->ver, 1, 1)) 4801 vdc->vdisk_media = attr_msg->vdisk_media; 4802 else 4803 vdc->vdisk_media = 0; 4804 4805 DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n", 4806 vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz); 4807 DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n", 4808 vdc->instance, vdc->block_size, 4809 attr_msg->vdisk_block_size); 4810 4811 /* 4812 * We don't know at compile time what the vDisk server will 4813 * think are good values but we apply a large (arbitrary) 4814 * upper bound to prevent memory exhaustion in vdc if it was 4815 * allocating a DRing based on huge values sent by the server. 4816 * We probably will never exceed this except if the message 4817 * was garbage. 4818 */ 4819 if ((attr_msg->max_xfer_sz * attr_msg->vdisk_block_size) <= 4820 (PAGESIZE * DEV_BSIZE)) { 4821 vdc->max_xfer_sz = attr_msg->max_xfer_sz; 4822 vdc->block_size = attr_msg->vdisk_block_size; 4823 } else { 4824 DMSG(vdc, 0, "[%d] vds block transfer size too big;" 4825 " using max supported by vdc", vdc->instance); 4826 } 4827 4828 if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) || 4829 (attr_msg->vdisk_size > INT64_MAX) || 4830 (attr_msg->operations == 0) || 4831 (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) { 4832 DMSG(vdc, 0, "[%d] Invalid attributes from vds", 4833 vdc->instance); 4834 status = EINVAL; 4835 break; 4836 } 4837 4838 /* 4839 * Now that we have received all attributes we can create a 4840 * fake geometry for the disk. 4841 */ 4842 vdc_create_fake_geometry(vdc); 4843 break; 4844 4845 case VIO_SUBTYPE_NACK: 4846 /* 4847 * vds could not handle the attributes we sent so we 4848 * stop negotiating. 4849 */ 4850 status = EPROTO; 4851 break; 4852 4853 case VIO_SUBTYPE_INFO: 4854 /* 4855 * Handle the case where vds starts the handshake 4856 * (for now, vdc is the only supported instigator) 4857 */ 4858 status = ENOTSUP; 4859 break; 4860 4861 default: 4862 status = ENOTSUP; 4863 break; 4864 } 4865 4866 return (status); 4867 } 4868 4869 /* 4870 * Function: 4871 * vdc_handle_dring_reg_msg() 4872 * 4873 * Description: 4874 * Handle a VIO_DRING_REG message sent by the vDisk server. 4875 * Arguments: 4876 * vdc - soft state pointer for this instance of the driver.
4877 * dring_msg - LDC message sent by vDisk server 4878 * 4879 * Return Code: 4880 * 0 - Success 4881 */ 4882 static int 4883 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg) 4884 { 4885 int status = 0; 4886 4887 ASSERT(vdc != NULL); 4888 ASSERT(mutex_owned(&vdc->lock)); 4889 4890 if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) { 4891 return (EPROTO); 4892 } 4893 4894 switch (dring_msg->tag.vio_subtype) { 4895 case VIO_SUBTYPE_ACK: 4896 /* save the received dring_ident */ 4897 vdc->dring_ident = dring_msg->dring_ident; 4898 DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n", 4899 vdc->instance, vdc->dring_ident); 4900 break; 4901 4902 case VIO_SUBTYPE_NACK: 4903 /* 4904 * vds could not handle the DRing info we sent so we 4905 * stop negotiating. 4906 */ 4907 DMSG(vdc, 0, "[%d] server could not register DRing\n", 4908 vdc->instance); 4909 status = EPROTO; 4910 break; 4911 4912 case VIO_SUBTYPE_INFO: 4913 /* 4914 * Handle the case where vds starts the handshake 4915 * (for now only vdc is the instigator) 4916 */ 4917 status = ENOTSUP; 4918 break; 4919 default: 4920 status = ENOTSUP; 4921 } 4922 4923 return (status); 4924 } 4925 4926 /* 4927 * Function: 4928 * vdc_verify_seq_num() 4929 * 4930 * Description: 4931 * This function verifies that the sequence number sent back by the vDisk 4932 * server with the latest message is what is expected (i.e. it is greater 4933 * than the last seq num sent by the vDisk server and less than or equal 4934 * to the last seq num generated by vdc). 4935 * 4936 * It then checks the request ID to see if any requests need processing 4937 * in the DRing. 4938 * 4939 * Arguments: 4940 * vdc - soft state pointer for this instance of the driver. 4941 * dring_msg - pointer to the LDC message sent by vds 4942 * 4943 * Return Code: 4944 * VDC_SEQ_NUM_TODO - Message needs to be processed 4945 * VDC_SEQ_NUM_SKIP - Message has already been processed 4946 * VDC_SEQ_NUM_INVALID - The seq numbers are so out of sync, 4947 * vdc cannot deal with them 4948 */ 4949 static int 4950 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg) 4951 { 4952 ASSERT(vdc != NULL); 4953 ASSERT(dring_msg != NULL); 4954 ASSERT(mutex_owned(&vdc->lock)); 4955 4956 /* 4957 * Check to see if the messages were responded to in the correct 4958 * order by vds. 4959 */ 4960 if ((dring_msg->seq_num <= vdc->seq_num_reply) || 4961 (dring_msg->seq_num > vdc->seq_num)) { 4962 DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: " 4963 "%lu > expected <= %lu (last proc req %lu sent %lu)\n", 4964 vdc->instance, dring_msg->seq_num, 4965 vdc->seq_num_reply, vdc->seq_num, 4966 vdc->req_id_proc, vdc->req_id); 4967 return (VDC_SEQ_NUM_INVALID); 4968 } 4969 vdc->seq_num_reply = dring_msg->seq_num; 4970 4971 if (vdc->req_id_proc < vdc->req_id) 4972 return (VDC_SEQ_NUM_TODO); 4973 else 4974 return (VDC_SEQ_NUM_SKIP); 4975 } 4976 4977 4978 /* 4979 * Function: 4980 * vdc_is_supported_version() 4981 * 4982 * Description: 4983 * This routine checks if the major/minor version numbers specified in 4984 * 'ver_msg' are supported.
If not, it finds the next version that is 4985 * in the supported version list 'vdc_version[]' and sets the fields in 4986 * 'ver_msg' to those values. 4987 * 4988 * Arguments: 4989 * ver_msg - LDC message sent by vDisk server 4990 * 4991 * Return Code: 4992 * B_TRUE - Success 4993 * B_FALSE - Version not supported 4994 */ 4995 static boolean_t 4996 vdc_is_supported_version(vio_ver_msg_t *ver_msg) 4997 { 4998 int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]); 4999 5000 for (int i = 0; i < vdc_num_versions; i++) { 5001 ASSERT(vdc_version[i].major > 0); 5002 ASSERT((i == 0) || 5003 (vdc_version[i].major < vdc_version[i-1].major)); 5004 5005 /* 5006 * If the major versions match, adjust the minor version, if 5007 * necessary, down to the highest value supported by this 5008 * client. The server should support all minor versions lower 5009 * than the value it sent. 5010 */ 5011 if (ver_msg->ver_major == vdc_version[i].major) { 5012 if (ver_msg->ver_minor > vdc_version[i].minor) { 5013 DMSGX(0, 5014 "Adjusting minor version from %u to %u", 5015 ver_msg->ver_minor, vdc_version[i].minor); 5016 ver_msg->ver_minor = vdc_version[i].minor; 5017 } 5018 return (B_TRUE); 5019 } 5020 5021 /* 5022 * If the message contains a higher major version number, set 5023 * the message's major/minor versions to the current values 5024 * and return false, so this message will get resent with 5025 * these values, and the server will potentially try again 5026 * with the same or a lower version. 5027 */ 5028 if (ver_msg->ver_major > vdc_version[i].major) { 5029 ver_msg->ver_major = vdc_version[i].major; 5030 ver_msg->ver_minor = vdc_version[i].minor; 5031 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n", 5032 ver_msg->ver_major, ver_msg->ver_minor); 5033 5034 return (B_FALSE); 5035 } 5036 5037 /* 5038 * Otherwise, the message's major version is less than the 5039 * current major version, so continue the loop to the next 5040 * (lower) supported version. 5041 */ 5042 } 5043 5044 /* 5045 * No common version was found; "ground" the version pair in the 5046 * message to terminate negotiation. 5047 */ 5048 ver_msg->ver_major = 0; 5049 ver_msg->ver_minor = 0; 5050 5051 return (B_FALSE); 5052 } 5053 /* -------------------------------------------------------------------------- */ 5054 5055 /* 5056 * DKIO(7I) support 5057 */ 5058 5059 typedef struct vdc_dk_arg { 5060 struct dk_callback dkc; 5061 int mode; 5062 dev_t dev; 5063 vdc_t *vdc; 5064 } vdc_dk_arg_t; 5065 5066 /* 5067 * Function: 5068 * vdc_dkio_flush_cb() 5069 * 5070 * Description: 5071 * This routine services an asynchronous DKIOCFLUSHWRITECACHE request, 5072 * which can be issued by kernel code. 5073 * 5074 * Arguments: 5075 * arg - a pointer to a vdc_dk_arg_t structure. 5076 */ 5077 void 5078 vdc_dkio_flush_cb(void *arg) 5079 { 5080 struct vdc_dk_arg *dk_arg = (struct vdc_dk_arg *)arg; 5081 struct dk_callback *dkc = NULL; 5082 vdc_t *vdc = NULL; 5083 int rv; 5084 5085 if (dk_arg == NULL) { 5086 cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n"); 5087 return; 5088 } 5089 dkc = &dk_arg->dkc; 5090 vdc = dk_arg->vdc; 5091 ASSERT(vdc != NULL); 5092 5093 rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, 5094 VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 5095 if (rv != 0) { 5096 DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n", 5097 vdc->instance, rv, 5098 ddi_model_convert_from(dk_arg->mode & FMODELS)); 5099 } 5100 5101 /* 5102 * Trigger the call back to notify the caller that the ioctl call has 5103 * been completed.
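 *
 * A minimal sketch of how a kernel caller could arm this callback
 * (hypothetical caller, not part of vdc; my_flush_done, my_state, lh
 * and rval are made-up names):
 *
 *	static void
 *	my_flush_done(void *cookie, int error)
 *	{
 *		... record that the flush completed with 'error' ...
 *	}
 *
 *	struct dk_callback dkc;
 *
 *	dkc.dkc_callback = my_flush_done;
 *	dkc.dkc_cookie = my_state;
 *	(void) ldi_ioctl(lh, DKIOCFLUSHWRITECACHE, (intptr_t)&dkc,
 *	    FKIOCTL, kcred, &rval);
 *
 * The ioctl then returns without waiting and my_flush_done() runs once
 * the VD_OP_FLUSH operation completes.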
5104 */ 5105 if ((dk_arg->mode & FKIOCTL) && 5106 (dkc != NULL) && 5107 (dkc->dkc_callback != NULL)) { 5108 ASSERT(dkc->dkc_cookie != NULL); 5109 (*dkc->dkc_callback)(dkc->dkc_cookie, rv); 5110 } 5111 5112 /* Indicate that one less DKIO write flush is outstanding */ 5113 mutex_enter(&vdc->lock); 5114 vdc->dkio_flush_pending--; 5115 ASSERT(vdc->dkio_flush_pending >= 0); 5116 mutex_exit(&vdc->lock); 5117 5118 /* free the mem that was allocated when the callback was dispatched */ 5119 kmem_free(arg, sizeof (vdc_dk_arg_t)); 5120 } 5121 5122 /* 5123 * Function: 5124 * vdc_dkio_gapart() 5125 * 5126 * Description: 5127 * This function implements the DKIOCGAPART ioctl. 5128 * 5129 * Arguments: 5130 * vdc - soft state pointer 5131 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 5132 * flag - ioctl flags 5133 */ 5134 static int 5135 vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag) 5136 { 5137 struct dk_geom *geom; 5138 struct vtoc *vtoc; 5139 union { 5140 struct dk_map map[NDKMAP]; 5141 struct dk_map32 map32[NDKMAP]; 5142 } data; 5143 int i, rv, size; 5144 5145 mutex_enter(&vdc->lock); 5146 5147 if ((rv = vdc_validate_geometry(vdc)) != 0) { 5148 mutex_exit(&vdc->lock); 5149 return (rv); 5150 } 5151 5152 vtoc = vdc->vtoc; 5153 geom = vdc->geom; 5154 5155 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5156 5157 for (i = 0; i < vtoc->v_nparts; i++) { 5158 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 5159 (geom->dkg_nhead * geom->dkg_nsect); 5160 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 5161 } 5162 size = NDKMAP * sizeof (struct dk_map32); 5163 5164 } else { 5165 5166 for (i = 0; i < vtoc->v_nparts; i++) { 5167 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 5168 (geom->dkg_nhead * geom->dkg_nsect); 5169 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 5170 } 5171 size = NDKMAP * sizeof (struct dk_map); 5172 5173 } 5174 5175 mutex_exit(&vdc->lock); 5176 5177 if (ddi_copyout(&data, arg, size, flag) != 0) 5178 return (EFAULT); 5179 5180 return (0); 5181 } 5182 5183 /* 5184 * Function: 5185 * vdc_dkio_partition() 5186 * 5187 * Description: 5188 * This function implements the DKIOCPARTITION ioctl. 5189 * 5190 * Arguments: 5191 * vdc - soft state pointer 5192 * arg - a pointer to a struct partition64 structure 5193 * flag - ioctl flags 5194 */ 5195 static int 5196 vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag) 5197 { 5198 struct partition64 p64; 5199 efi_gpt_t *gpt; 5200 efi_gpe_t *gpe; 5201 vd_efi_dev_t edev; 5202 uint_t partno; 5203 int rv; 5204 5205 if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) { 5206 return (EFAULT); 5207 } 5208 5209 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 5210 5211 if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) { 5212 return (rv); 5213 } 5214 5215 partno = p64.p_partno; 5216 5217 if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) { 5218 vd_efi_free(&edev, gpt, gpe); 5219 return (ESRCH); 5220 } 5221 5222 bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type, 5223 sizeof (struct uuid)); 5224 p64.p_start = gpe[partno].efi_gpe_StartingLBA; 5225 p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1; 5226 5227 if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) { 5228 vd_efi_free(&edev, gpt, gpe); 5229 return (EFAULT); 5230 } 5231 5232 vd_efi_free(&edev, gpt, gpe); 5233 return (0); 5234 } 5235 5236 /* 5237 * Function: 5238 * vdc_dioctl_rwcmd() 5239 * 5240 * Description: 5241 * This function implements the DIOCTL_RWCMD ioctl. 
This ioctl is used 5242 * for DKC_DIRECT disks to read or write at an absolute disk offset. 5243 * 5244 * Arguments: 5245 * dev - device 5246 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 5247 * flag - ioctl flags 5248 */ 5249 static int 5250 vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag) 5251 { 5252 struct dadkio_rwcmd32 rwcmd32; 5253 struct dadkio_rwcmd rwcmd; 5254 struct iovec aiov; 5255 struct uio auio; 5256 int rw, status; 5257 struct buf *buf; 5258 5259 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5260 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 5261 sizeof (struct dadkio_rwcmd32), flag)) { 5262 return (EFAULT); 5263 } 5264 rwcmd.cmd = rwcmd32.cmd; 5265 rwcmd.flags = rwcmd32.flags; 5266 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 5267 rwcmd.buflen = rwcmd32.buflen; 5268 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 5269 } else { 5270 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 5271 sizeof (struct dadkio_rwcmd), flag)) { 5272 return (EFAULT); 5273 } 5274 } 5275 5276 switch (rwcmd.cmd) { 5277 case DADKIO_RWCMD_READ: 5278 rw = B_READ; 5279 break; 5280 case DADKIO_RWCMD_WRITE: 5281 rw = B_WRITE; 5282 break; 5283 default: 5284 return (EINVAL); 5285 } 5286 5287 bzero((caddr_t)&aiov, sizeof (struct iovec)); 5288 aiov.iov_base = rwcmd.bufaddr; 5289 aiov.iov_len = rwcmd.buflen; 5290 5291 bzero((caddr_t)&auio, sizeof (struct uio)); 5292 auio.uio_iov = &aiov; 5293 auio.uio_iovcnt = 1; 5294 auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE; 5295 auio.uio_resid = rwcmd.buflen; 5296 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 5297 5298 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 5299 bioinit(buf); 5300 /* 5301 * We use the private field of buf to specify that this is an 5302 * I/O using an absolute offset. 5303 */ 5304 buf->b_private = (void *)VD_SLICE_NONE; 5305 5306 status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio); 5307 5308 biofini(buf); 5309 kmem_free(buf, sizeof (buf_t)); 5310 5311 return (status); 5312 } 5313 5314 /* 5315 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated 5316 * buffer is returned in alloc_len. 5317 */ 5318 static vd_scsi_t * 5319 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, 5320 int *alloc_len) 5321 { 5322 vd_scsi_t *vd_scsi; 5323 int vd_scsi_len = VD_SCSI_SIZE; 5324 5325 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); 5326 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); 5327 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); 5328 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); 5329 5330 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 5331 5332 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); 5333 5334 vd_scsi->cdb_len = cdb_len; 5335 vd_scsi->sense_len = sense_len; 5336 vd_scsi->datain_len = datain_len; 5337 vd_scsi->dataout_len = dataout_len; 5338 5339 *alloc_len = vd_scsi_len; 5340 5341 return (vd_scsi); 5342 } 5343 5344 /* 5345 * Convert the status of a SCSI command to a Solaris return code. 5346 * 5347 * Arguments: 5348 * vd_scsi - The SCSI operation buffer. 5349 * log_error - indicate if an error message should be logged. 5350 * 5351 * Note that our SCSI error messages are rather primitive for the moment 5352 * and could be improved by decoding some data like the SCSI command and 5353 * the sense key. 5354 * 5355 * Return value: 5356 * 0 - Status is good. 5357 * EACCES - Status reports a reservation conflict. 5358 * ENOTSUP - Status reports a check condition and sense key 5359 * reports an illegal request. 
5360 * EIO - Any other status. 5361 */ 5362 static int 5363 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error) 5364 { 5365 int rv; 5366 char path_str[MAXPATHLEN]; 5367 char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN]; 5368 union scsi_cdb *cdb; 5369 struct scsi_extended_sense *sense; 5370 5371 if (vd_scsi->cmd_status == STATUS_GOOD) 5372 /* no error */ 5373 return (0); 5374 5375 /* when the tunable vdc_scsi_log_error is true we log all errors */ 5376 if (vdc_scsi_log_error) 5377 log_error = B_TRUE; 5378 5379 if (log_error) { 5380 cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x\n", 5381 ddi_pathname(vdc->dip, path_str), vdc->instance, 5382 GETCMD(VD_SCSI_DATA_CDB(vd_scsi))); 5383 } 5384 5385 /* default returned value */ 5386 rv = EIO; 5387 5388 switch (vd_scsi->cmd_status) { 5389 5390 case STATUS_CHECK: 5391 case STATUS_TERMINATED: 5392 if (log_error) 5393 cmn_err(CE_CONT, "\tCheck Condition Error\n"); 5394 5395 /* check sense buffer */ 5396 if (vd_scsi->sense_len == 0 || 5397 vd_scsi->sense_status != STATUS_GOOD) { 5398 if (log_error) 5399 cmn_err(CE_CONT, "\tNo Sense Data Available\n"); 5400 break; 5401 } 5402 5403 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5404 5405 if (log_error) { 5406 cmn_err(CE_CONT, "\tSense Key: 0x%x\n" 5407 "\tASC: 0x%x, ASCQ: 0x%x\n", 5408 scsi_sense_key((uint8_t *)sense), 5409 scsi_sense_asc((uint8_t *)sense), 5410 scsi_sense_ascq((uint8_t *)sense)); 5411 } 5412 5413 if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST) 5414 rv = ENOTSUP; 5415 break; 5416 5417 case STATUS_BUSY: 5418 if (log_error) 5419 cmn_err(CE_NOTE, "\tDevice Busy\n"); 5420 break; 5421 5422 case STATUS_RESERVATION_CONFLICT: 5423 /* 5424 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then the 5425 * reservation conflict could be due to various reasons, such 5426 * as incorrect keys or the host not being registered or 5427 * reserved, so we should not panic in that case. 5428 */ 5429 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5430 if (vdc->failfast_interval != 0 && 5431 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN && 5432 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) { 5433 /* failfast is enabled so we have to panic */ 5434 (void) snprintf(panic_str, sizeof (panic_str), 5435 VDC_RESV_CONFLICT_FMT_STR "%s", 5436 ddi_pathname(vdc->dip, path_str)); 5437 panic(panic_str); 5438 } 5439 if (log_error) 5440 cmn_err(CE_NOTE, "\tReservation Conflict\n"); 5441 rv = EACCES; 5442 break; 5443 5444 case STATUS_QFULL: 5445 if (log_error) 5446 cmn_err(CE_NOTE, "\tQueue Full\n"); 5447 break; 5448 5449 case STATUS_MET: 5450 case STATUS_INTERMEDIATE: 5451 case STATUS_SCSI2: 5452 case STATUS_INTERMEDIATE_MET: 5453 case STATUS_ACA_ACTIVE: 5454 if (log_error) 5455 cmn_err(CE_CONT, 5456 "\tUnexpected SCSI status received: 0x%x\n", 5457 vd_scsi->cmd_status); 5458 break; 5459 5460 default: 5461 if (log_error) 5462 cmn_err(CE_CONT, 5463 "\tInvalid SCSI status received: 0x%x\n", 5464 vd_scsi->cmd_status); 5465 break; 5466 } 5467 5468 return (rv); 5469 } 5470 5471 /* 5472 * Implement the USCSICMD uscsi(7I) ioctl. This ioctl is converted to 5473 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI 5474 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is 5475 * converted to a VD_OP_RESET operation.
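 *
 * A minimal sketch of the userland view of this ioctl (hypothetical
 * caller, not part of vdc; fd is an open descriptor on the vdisk):
 *
 *	union scsi_cdb cdb;
 *	struct uscsi_cmd ucmd;
 *	char inq[36], rq[20];
 *
 *	bzero(&cdb, sizeof (cdb));
 *	bzero(&ucmd, sizeof (ucmd));
 *	cdb.scc_cmd = SCMD_INQUIRY;
 *	FORMG0COUNT(&cdb, sizeof (inq));
 *	ucmd.uscsi_cdb = (caddr_t)&cdb;
 *	ucmd.uscsi_cdblen = CDB_GROUP0;
 *	ucmd.uscsi_bufaddr = inq;
 *	ucmd.uscsi_buflen = sizeof (inq);
 *	ucmd.uscsi_rqbuf = rq;
 *	ucmd.uscsi_rqlen = sizeof (rq);
 *	ucmd.uscsi_flags = USCSI_READ | USCSI_RQENABLE;
 *	ucmd.uscsi_timeout = 30;
 *	(void) ioctl(fd, USCSICMD, &ucmd);
 *
 * vdc_uscsi_cmd() below would turn such a request into a VD_OP_SCSICMD
 * with a CDB_GROUP0 cdb area, a 20-byte sense area and a 36-byte
 * data-in area.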
5476 */ 5477 static int 5478 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) 5479 { 5480 struct uscsi_cmd uscsi; 5481 struct uscsi_cmd32 uscsi32; 5482 vd_scsi_t *vd_scsi; 5483 int vd_scsi_len; 5484 union scsi_cdb *cdb; 5485 struct scsi_extended_sense *sense; 5486 char *datain, *dataout; 5487 size_t cdb_len, datain_len, dataout_len, sense_len; 5488 int rv; 5489 5490 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5491 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), 5492 mode) != 0) 5493 return (EFAULT); 5494 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); 5495 } else { 5496 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), 5497 mode) != 0) 5498 return (EFAULT); 5499 } 5500 5501 /* a uscsi reset is converted to a VD_OP_RESET operation */ 5502 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | 5503 USCSI_RESET_ALL)) { 5504 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, 5505 (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); 5506 return (rv); 5507 } 5508 5509 /* cdb buffer length */ 5510 cdb_len = uscsi.uscsi_cdblen; 5511 5512 /* data in and out buffers length */ 5513 if (uscsi.uscsi_flags & USCSI_READ) { 5514 datain_len = uscsi.uscsi_buflen; 5515 dataout_len = 0; 5516 } else { 5517 datain_len = 0; 5518 dataout_len = uscsi.uscsi_buflen; 5519 } 5520 5521 /* sense buffer length */ 5522 if (uscsi.uscsi_flags & USCSI_RQENABLE) 5523 sense_len = uscsi.uscsi_rqlen; 5524 else 5525 sense_len = 0; 5526 5527 /* allocate buffer for the VD_SCSICMD_OP operation */ 5528 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5529 &vd_scsi_len); 5530 5531 /* 5532 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, 5533 * but basically they prevent a SCSI command from being retried in case 5534 * of an error. 
5535 */ 5536 if ((uscsi.uscsi_flags & USCSI_ISOLATE) || 5537 (uscsi.uscsi_flags & USCSI_DIAGNOSE)) 5538 vd_scsi->options |= VD_SCSI_OPT_NORETRY; 5539 5540 /* set task attribute */ 5541 if (uscsi.uscsi_flags & USCSI_NOTAG) { 5542 vd_scsi->task_attribute = 0; 5543 } else { 5544 if (uscsi.uscsi_flags & USCSI_HEAD) 5545 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5546 else if (uscsi.uscsi_flags & USCSI_HTAG) 5547 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; 5548 else if (uscsi.uscsi_flags & USCSI_OTAG) 5549 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; 5550 else 5551 vd_scsi->task_attribute = 0; 5552 } 5553 5554 /* set timeout */ 5555 vd_scsi->timeout = uscsi.uscsi_timeout; 5556 5557 /* copy-in cdb data */ 5558 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5559 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { 5560 rv = EFAULT; 5561 goto done; 5562 } 5563 5564 /* keep a pointer to the sense buffer */ 5565 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5566 5567 /* keep a pointer to the data-in buffer */ 5568 datain = (char *)VD_SCSI_DATA_IN(vd_scsi); 5569 5570 /* copy-in request data to the data-out buffer */ 5571 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); 5572 if (!(uscsi.uscsi_flags & USCSI_READ)) { 5573 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, 5574 mode)) { 5575 rv = EFAULT; 5576 goto done; 5577 } 5578 } 5579 5580 /* submit the request */ 5581 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5582 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5583 5584 if (rv != 0) 5585 goto done; 5586 5587 /* update scsi status */ 5588 uscsi.uscsi_status = vd_scsi->cmd_status; 5589 5590 /* update sense data */ 5591 if ((uscsi.uscsi_flags & USCSI_RQENABLE) && 5592 (uscsi.uscsi_status == STATUS_CHECK || 5593 uscsi.uscsi_status == STATUS_TERMINATED)) { 5594 5595 uscsi.uscsi_rqstatus = vd_scsi->sense_status; 5596 5597 if (uscsi.uscsi_rqstatus == STATUS_GOOD) { 5598 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - 5599 vd_scsi->sense_len; 5600 if (ddi_copyout(sense, uscsi.uscsi_rqbuf, 5601 vd_scsi->sense_len, mode) != 0) { 5602 rv = EFAULT; 5603 goto done; 5604 } 5605 } 5606 } 5607 5608 /* update request data */ 5609 if (uscsi.uscsi_status == STATUS_GOOD) { 5610 if (uscsi.uscsi_flags & USCSI_READ) { 5611 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5612 vd_scsi->datain_len; 5613 if (ddi_copyout(datain, uscsi.uscsi_bufaddr, 5614 vd_scsi->datain_len, mode) != 0) { 5615 rv = EFAULT; 5616 goto done; 5617 } 5618 } else { 5619 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5620 vd_scsi->dataout_len; 5621 } 5622 } 5623 5624 /* copy-out result */ 5625 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5626 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); 5627 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), 5628 mode) != 0) { 5629 rv = EFAULT; 5630 goto done; 5631 } 5632 } else { 5633 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), 5634 mode) != 0) { 5635 rv = EFAULT; 5636 goto done; 5637 } 5638 } 5639 5640 /* get the return code from the SCSI command status */ 5641 rv = vdc_scsi_status(vdc, vd_scsi, 5642 !(uscsi.uscsi_flags & USCSI_SILENT)); 5643 5644 done: 5645 kmem_free(vd_scsi, vd_scsi_len); 5646 return (rv); 5647 } 5648 5649 /* 5650 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. 5651 * 5652 * Arguments: 5653 * cmd - SCSI PERSISTENT IN command 5654 * len - length of the SCSI input buffer 5655 * vd_scsi_len - return the length of the allocated buffer 5656 * 5657 * Returned Value: 5658 * a pointer to the allocated VD_OP_SCSICMD buffer. 
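 *
 * As an illustration of the resulting layout (the sizes follow from
 * vdc_scsi_alloc() above): with len = 64 the buffer holds the vd_scsi_t
 * header, then P2ROUNDUP(CDB_GROUP1, 8) = 16 bytes of cdb,
 * P2ROUNDUP(sizeof (struct scsi_extended_sense), 8) bytes of sense data
 * and a 64-byte data-in area; no data-out area is allocated.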
5659 */ 5660 static vd_scsi_t * 5661 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len) 5662 { 5663 int cdb_len, sense_len, datain_len, dataout_len; 5664 vd_scsi_t *vd_scsi; 5665 union scsi_cdb *cdb; 5666 5667 cdb_len = CDB_GROUP1; 5668 sense_len = sizeof (struct scsi_extended_sense); 5669 datain_len = len; 5670 dataout_len = 0; 5671 5672 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5673 vd_scsi_len); 5674 5675 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5676 5677 /* set cdb */ 5678 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN; 5679 cdb->cdb_opaque[1] = cmd; 5680 FORMG1COUNT(cdb, datain_len); 5681 5682 vd_scsi->timeout = vdc_scsi_timeout; 5683 5684 return (vd_scsi); 5685 } 5686 5687 /* 5688 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command. 5689 * 5690 * Arguments: 5691 * cmd - SCSI PERSISTENT OUT command 5692 * len - length of the SCSI output buffer 5693 * vd_scsi_len - return the length of the allocated buffer 5694 * 5695 * Returned Value: 5696 * a pointer to the allocated VD_OP_SCSICMD buffer. 5697 */ 5698 static vd_scsi_t * 5699 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len) 5700 { 5701 int cdb_len, sense_len, datain_len, dataout_len; 5702 vd_scsi_t *vd_scsi; 5703 union scsi_cdb *cdb; 5704 5705 cdb_len = CDB_GROUP1; 5706 sense_len = sizeof (struct scsi_extended_sense); 5707 datain_len = 0; 5708 dataout_len = len; 5709 5710 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5711 vd_scsi_len); 5712 5713 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5714 5715 /* set cdb */ 5716 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT; 5717 cdb->cdb_opaque[1] = cmd; 5718 FORMG1COUNT(cdb, dataout_len); 5719 5720 vd_scsi->timeout = vdc_scsi_timeout; 5721 5722 return (vd_scsi); 5723 } 5724 5725 /* 5726 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted 5727 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk 5728 * server with a VD_OP_SCSICMD operation.
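 *
 * A minimal sketch of the userland view (hypothetical caller, not part
 * of vdc; fd is an open descriptor on the vdisk):
 *
 *	mhioc_key_list_t klist;
 *	mhioc_inkeys_t inkeys;
 *	mhioc_resv_key_t keys[4];
 *
 *	bzero(&klist, sizeof (klist));
 *	klist.listsize = 4;
 *	klist.list = keys;
 *	inkeys.li = &klist;
 *	if (ioctl(fd, MHIOCGRP_INKEYS, &inkeys) == 0) {
 *		... at most klist.listsize keys have been copied into
 *		    keys[], the total number of registered keys is
 *		    reported in klist.listlen and the generation in
 *		    inkeys.generation ...
 *	}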
5729 */ 5730 static int 5731 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5732 { 5733 vd_scsi_t *vd_scsi; 5734 mhioc_inkeys_t inkeys; 5735 mhioc_key_list_t klist; 5736 struct mhioc_inkeys32 inkeys32; 5737 struct mhioc_key_list32 klist32; 5738 sd_prin_readkeys_t *scsi_keys; 5739 void *user_keys; 5740 int vd_scsi_len; 5741 int listsize, listlen, rv; 5742 5743 /* copyin arguments */ 5744 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5745 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5746 if (rv != 0) 5747 return (EFAULT); 5748 5749 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5750 sizeof (klist32), mode); 5751 if (rv != 0) 5752 return (EFAULT); 5753 5754 listsize = klist32.listsize; 5755 } else { 5756 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5757 if (rv != 0) 5758 return (EFAULT); 5759 5760 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5761 if (rv != 0) 5762 return (EFAULT); 5763 5764 listsize = klist.listsize; 5765 } 5766 5767 /* build SCSI VD_OP request */ 5768 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5769 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5770 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5771 5772 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5773 5774 /* submit the request */ 5775 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5776 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5777 5778 if (rv != 0) 5779 goto done; 5780 5781 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5782 5783 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5784 inkeys32.generation = scsi_keys->generation; 5785 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5786 if (rv != 0) { 5787 rv = EFAULT; 5788 goto done; 5789 } 5790 5791 klist32.listlen = listlen; 5792 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5793 sizeof (klist32), mode); 5794 if (rv != 0) { 5795 rv = EFAULT; 5796 goto done; 5797 } 5798 5799 user_keys = (caddr_t)(uintptr_t)klist32.list; 5800 } else { 5801 inkeys.generation = scsi_keys->generation; 5802 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5803 if (rv != 0) { 5804 rv = EFAULT; 5805 goto done; 5806 } 5807 5808 klist.listlen = listlen; 5809 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5810 if (rv != 0) { 5811 rv = EFAULT; 5812 goto done; 5813 } 5814 5815 user_keys = klist.list; 5816 } 5817 5818 /* copy out keys */ 5819 if (listlen > 0 && listsize > 0) { 5820 if (listsize < listlen) 5821 listlen = listsize; 5822 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5823 listlen * MHIOC_RESV_KEY_SIZE, mode); 5824 if (rv != 0) 5825 rv = EFAULT; 5826 } 5827 5828 if (rv == 0) 5829 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5830 5831 done: 5832 kmem_free(vd_scsi, vd_scsi_len); 5833 5834 return (rv); 5835 } 5836 5837 /* 5838 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5839 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5840 * the vdisk server with a VD_OP_SCSICMD operation. 
5841 */ 5842 static int 5843 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5844 { 5845 vd_scsi_t *vd_scsi; 5846 mhioc_inresvs_t inresv; 5847 mhioc_resv_desc_list_t rlist; 5848 struct mhioc_inresvs32 inresv32; 5849 struct mhioc_resv_desc_list32 rlist32; 5850 mhioc_resv_desc_t mhd_resv; 5851 sd_prin_readresv_t *scsi_resv; 5852 sd_readresv_desc_t *resv; 5853 mhioc_resv_desc_t *user_resv; 5854 int vd_scsi_len; 5855 int listsize, listlen, i, rv; 5856 5857 /* copyin arguments */ 5858 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5859 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5860 if (rv != 0) 5861 return (EFAULT); 5862 5863 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5864 sizeof (rlist32), mode); 5865 if (rv != 0) 5866 return (EFAULT); 5867 5868 listsize = rlist32.listsize; 5869 } else { 5870 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5871 if (rv != 0) 5872 return (EFAULT); 5873 5874 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5875 if (rv != 0) 5876 return (EFAULT); 5877 5878 listsize = rlist.listsize; 5879 } 5880 5881 /* build SCSI VD_OP request */ 5882 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5883 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5884 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5885 5886 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5887 5888 /* submit the request */ 5889 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5890 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5891 5892 if (rv != 0) 5893 goto done; 5894 5895 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5896 5897 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5898 inresv32.generation = scsi_resv->generation; 5899 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5900 if (rv != 0) { 5901 rv = EFAULT; 5902 goto done; 5903 } 5904 5905 rlist32.listlen = listlen; 5906 rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, 5907 sizeof (rlist32), mode); 5908 if (rv != 0) { 5909 rv = EFAULT; 5910 goto done; 5911 } 5912 5913 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 5914 } else { 5915 inresv.generation = scsi_resv->generation; 5916 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 5917 if (rv != 0) { 5918 rv = EFAULT; 5919 goto done; 5920 } 5921 5922 rlist.listlen = listlen; 5923 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 5924 if (rv != 0) { 5925 rv = EFAULT; 5926 goto done; 5927 } 5928 5929 user_resv = rlist.list; 5930 } 5931 5932 /* copy out reservations */ 5933 if (listsize > 0 && listlen > 0) { 5934 if (listsize < listlen) 5935 listlen = listsize; 5936 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 5937 5938 for (i = 0; i < listlen; i++) { 5939 mhd_resv.type = resv->type; 5940 mhd_resv.scope = resv->scope; 5941 mhd_resv.scope_specific_addr = 5942 BE_32(resv->scope_specific_addr); 5943 bcopy(&resv->resvkey, &mhd_resv.key, 5944 MHIOC_RESV_KEY_SIZE); 5945 5946 rv = ddi_copyout(&mhd_resv, user_resv, 5947 sizeof (mhd_resv), mode); 5948 if (rv != 0) { 5949 rv = EFAULT; 5950 goto done; 5951 } 5952 resv++; 5953 user_resv++; 5954 } 5955 } 5956 5957 if (rv == 0) 5958 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5959 5960 done: 5961 kmem_free(vd_scsi, vd_scsi_len); 5962 return (rv); 5963 } 5964 5965 /* 5966 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 5967 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 5968 * server with a VD_OP_SCSICMD operation. 
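 *
 * A minimal sketch of the userland view (hypothetical caller, not part
 * of vdc; fd and mykey are made-up names): to register a key, leave
 * oldkey zeroed, set newkey, and use aptpl to request persistence
 * across power loss:
 *
 *	mhioc_register_t reg;
 *
 *	bzero(&reg, sizeof (reg));
 *	bcopy(mykey, reg.newkey.key, MHIOC_RESV_KEY_SIZE);
 *	reg.aptpl = B_TRUE;
 *	(void) ioctl(fd, MHIOCGRP_REGISTER, &reg);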
5969 */ 5970 static int 5971 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 5972 { 5973 vd_scsi_t *vd_scsi; 5974 sd_prout_t *scsi_prout; 5975 mhioc_register_t mhd_reg; 5976 int vd_scsi_len, rv; 5977 5978 /* copyin arguments */ 5979 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 5980 if (rv != 0) 5981 return (EFAULT); 5982 5983 /* build SCSI VD_OP request */ 5984 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 5985 sizeof (sd_prout_t), &vd_scsi_len); 5986 5987 /* set parameters */ 5988 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5989 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 5990 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 5991 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 5992 5993 /* submit the request */ 5994 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5995 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5996 5997 if (rv == 0) 5998 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5999 6000 kmem_free(vd_scsi, vd_scsi_len); 6001 return (rv); 6002 } 6003 6004 /* 6005 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 6006 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 6007 * server with a VD_OP_SCSICMD operation. 6008 */ 6009 static int 6010 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 6011 { 6012 union scsi_cdb *cdb; 6013 vd_scsi_t *vd_scsi; 6014 sd_prout_t *scsi_prout; 6015 mhioc_resv_desc_t mhd_resv; 6016 int vd_scsi_len, rv; 6017 6018 /* copyin arguments */ 6019 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 6020 if (rv != 0) 6021 return (EFAULT); 6022 6023 /* build SCSI VD_OP request */ 6024 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 6025 sizeof (sd_prout_t), &vd_scsi_len); 6026 6027 /* set parameters */ 6028 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6029 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6030 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6031 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 6032 cdb->cdb_opaque[2] = mhd_resv.type; 6033 6034 /* submit the request */ 6035 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6036 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6037 6038 if (rv == 0) 6039 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6040 6041 kmem_free(vd_scsi, vd_scsi_len); 6042 return (rv); 6043 } 6044 6045 /* 6046 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 6047 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 6048 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
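 *
 * For example (illustrative): a surviving cluster node that still holds
 * a valid registration can pass its own reservation descriptor in
 * resvdesc and the key of a failed node in victim_key; the victim's
 * registration is then removed and its outstanding tasks are aborted
 * by the device.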
6049 */ 6050 static int 6051 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode) 6052 { 6053 union scsi_cdb *cdb; 6054 vd_scsi_t *vd_scsi; 6055 sd_prout_t *scsi_prout; 6056 mhioc_preemptandabort_t mhd_preempt; 6057 int vd_scsi_len, rv; 6058 6059 /* copyin arguments */ 6060 rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode); 6061 if (rv != 0) 6062 return (EFAULT); 6063 6064 /* build SCSI VD_OP request */ 6065 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT, 6066 sizeof (sd_prout_t), &vd_scsi_len); 6067 6068 /* set parameters */ 6069 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 6070 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6071 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6072 bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key, 6073 MHIOC_RESV_KEY_SIZE); 6074 bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key, 6075 MHIOC_RESV_KEY_SIZE); 6076 scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr; 6077 cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type; 6078 6079 /* submit the request */ 6080 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6081 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6082 6083 if (rv == 0) 6084 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6085 6086 kmem_free(vd_scsi, vd_scsi_len); 6087 return (rv); 6088 } 6089 6090 /* 6091 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl 6092 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY 6093 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation. 6094 */ 6095 static int 6096 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode) 6097 { 6098 vd_scsi_t *vd_scsi; 6099 sd_prout_t *scsi_prout; 6100 mhioc_registerandignorekey_t mhd_regi; 6101 int vd_scsi_len, rv; 6102 6103 /* copyin arguments */ 6104 rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode); 6105 if (rv != 0) 6106 return (EFAULT); 6107 6108 /* build SCSI VD_OP request */ 6109 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY, 6110 sizeof (sd_prout_t), &vd_scsi_len); 6111 6112 /* set parameters */ 6113 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6114 bcopy(mhd_regi.newkey.key, scsi_prout->service_key, 6115 MHIOC_RESV_KEY_SIZE); 6116 scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl; 6117 6118 /* submit the request */ 6119 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6120 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6121 6122 if (rv == 0) 6123 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6124 6125 kmem_free(vd_scsi, vd_scsi_len); 6126 return (rv); 6127 } 6128 6129 /* 6130 * This function is used by the failfast mechanism to send a SCSI command 6131 * to check for reservation conflict. 6132 */ 6133 static int 6134 vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) 6135 { 6136 int cdb_len, sense_len, vd_scsi_len; 6137 vd_scsi_t *vd_scsi; 6138 union scsi_cdb *cdb; 6139 int rv; 6140 6141 ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1); 6142 6143 if (scmd == SCMD_WRITE_G1) 6144 cdb_len = CDB_GROUP1; 6145 else 6146 cdb_len = CDB_GROUP0; 6147 6148 sense_len = sizeof (struct scsi_extended_sense); 6149 6150 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len); 6151 6152 /* set cdb */ 6153 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6154 cdb->scc_cmd = scmd; 6155 6156 vd_scsi->timeout = vdc_scsi_timeout; 6157 6158 /* 6159 * Submit the request. 
The last argument has to be B_FALSE so that 6160 * vdc_do_sync_op does not loop checking for reservation conflict if 6161 * the operation returns an error. 6162 */ 6163 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6164 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE); 6165 6166 if (rv == 0) 6167 (void) vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6168 6169 kmem_free(vd_scsi, vd_scsi_len); 6170 return (rv); 6171 } 6172 6173 /* 6174 * This function is used by the failfast mechanism to check for reservation 6175 * conflict. It sends SCSI commands which will fail with a reservation 6176 * conflict error if the system does not have access to the disk; such a 6177 * failure will panic the system. 6178 * 6179 * Return Code: 6180 * 0 - disk is accessible without reservation conflict error 6181 * != 0 - unable to check if disk is accessible 6182 */ 6183 int 6184 vdc_failfast_check_resv(vdc_t *vdc) 6185 { 6186 int failure = 0; 6187 6188 /* 6189 * Send a TEST UNIT READY command. The command will panic 6190 * the system if it fails with a reservation conflict. 6191 */ 6192 if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0) 6193 failure++; 6194 6195 /* 6196 * With SPC-3 compliant devices TEST UNIT READY will succeed on 6197 * a reserved device, so we also do a WRITE(10) of zero bytes in 6198 * order to provoke a Reservation Conflict status on those newer 6199 * devices. 6200 */ 6201 if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0) 6202 failure++; 6203 6204 return (failure); 6205 } 6206 6207 /* 6208 * Add a pending I/O to the failfast I/O queue. An I/O is added to this 6209 * queue when it has failed and failfast is enabled. We then have to check 6210 * whether it failed because of a reservation conflict, in which case we have 6211 * to panic the system. 6212 * 6213 * Async I/O should be queued with their block I/O data transfer structure 6214 * (buf). Sync I/O should be queued with buf = NULL. 6215 */ 6216 static vdc_io_t * 6217 vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf) 6218 { 6219 vdc_io_t *vio; 6220 6221 ASSERT(MUTEX_HELD(&vdc->lock)); 6222 6223 vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP); 6224 vio->vio_next = vdc->failfast_io_queue; 6225 vio->vio_buf = buf; 6226 vio->vio_qtime = ddi_get_lbolt(); 6227 6228 vdc->failfast_io_queue = vio; 6229 6230 /* notify the failfast thread that a new I/O is queued */ 6231 cv_signal(&vdc->failfast_cv); 6232 6233 return (vio); 6234 } 6235 6236 /* 6237 * Remove and complete I/O in the failfast I/O queue which have been 6238 * queued before the indicated deadline. A deadline of 0 means that all 6239 * I/O have to be unqueued and marked as completed. 6240 */ 6241 static void 6242 vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline) 6243 { 6244 vdc_io_t *vio, *vio_tmp; 6245 6246 ASSERT(MUTEX_HELD(&vdc->lock)); 6247 6248 vio_tmp = NULL; 6249 vio = vdc->failfast_io_queue; 6250 6251 if (deadline != 0) { 6252 /* 6253 * Skip any I/O queued after the deadline. The failfast 6254 * I/O queue is ordered starting with the last I/O added 6255 * to the queue. 6256 */ 6257 while (vio != NULL && vio->vio_qtime > deadline) { 6258 vio_tmp = vio; 6259 vio = vio->vio_next; 6260 } 6261 } 6262 6263 if (vio == NULL) 6264 /* nothing to unqueue */ 6265 return; 6266 6267 /* update the queue */ 6268 if (vio_tmp == NULL) 6269 vdc->failfast_io_queue = NULL; 6270 else 6271 vio_tmp->vio_next = NULL; 6272 6273 /* 6274 * Complete unqueued I/O.
Async I/O have a block I/O data transfer 6275 * structure (buf) and they are completed by calling biodone(). Sync 6276 * I/O do not have a buf and they are completed by setting the 6277 * vio_qtime to zero and signaling failfast_io_cv. In that case, the 6278 * thread waiting for the I/O to complete is responsible for freeing 6279 * the vio structure. 6280 */ 6281 while (vio != NULL) { 6282 vio_tmp = vio->vio_next; 6283 if (vio->vio_buf != NULL) { 6284 VD_KSTAT_RUNQ_EXIT(vdc); 6285 DTRACE_IO1(done, buf_t *, vio->vio_buf); 6286 biodone(vio->vio_buf); 6287 kmem_free(vio, sizeof (vdc_io_t)); 6288 } else { 6289 vio->vio_qtime = 0; 6290 } 6291 vio = vio_tmp; 6292 } 6293 6294 cv_broadcast(&vdc->failfast_io_cv); 6295 } 6296 6297 /* 6298 * Failfast Thread. 6299 * 6300 * While failfast is enabled, the failfast thread sends TEST UNIT READY 6301 * and zero-size WRITE(10) SCSI commands on a regular basis to check that 6302 * we still have access to the disk. If a command fails with a RESERVATION 6303 * CONFLICT error then the system will immediately panic. 6304 * 6305 * The failfast thread is also woken up when an I/O has failed. It then checks 6306 * the access to the disk to ensure that the I/O failure was not due to a 6307 * reservation conflict. 6308 * 6309 * There is one failfast thread for each virtual disk for which failfast is 6310 * enabled. We could have only one thread sending requests for all disks but 6311 * this would need vdc to send asynchronous requests and to have callbacks to 6312 * process replies. 6313 */ 6314 static void 6315 vdc_failfast_thread(void *arg) 6316 { 6317 int status; 6318 vdc_t *vdc = (vdc_t *)arg; 6319 clock_t timeout, starttime; 6320 6321 mutex_enter(&vdc->lock); 6322 6323 while (vdc->failfast_interval != 0) { 6324 6325 starttime = ddi_get_lbolt(); 6326 6327 mutex_exit(&vdc->lock); 6328 6329 /* check for reservation conflict */ 6330 status = vdc_failfast_check_resv(vdc); 6331 6332 mutex_enter(&vdc->lock); 6333 /* 6334 * We have dropped the lock to send the SCSI command so we have 6335 * to check that failfast is still enabled. 6336 */ 6337 if (vdc->failfast_interval == 0) 6338 break; 6339 6340 /* 6341 * If we have successfully checked the disk access and there was 6342 * no reservation conflict then we can complete any I/O queued 6343 * before the last check. 6344 */ 6345 if (status == 0) 6346 vdc_failfast_io_unqueue(vdc, starttime); 6347 6348 /* proceed again if some I/O are still in the queue */ 6349 if (vdc->failfast_io_queue != NULL) 6350 continue; 6351 6352 timeout = ddi_get_lbolt() + 6353 drv_usectohz(vdc->failfast_interval); 6354 (void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout); 6355 } 6356 6357 /* 6358 * Failfast is being stopped so we can complete any queued I/O. 6359 */ 6360 vdc_failfast_io_unqueue(vdc, 0); 6361 vdc->failfast_thread = NULL; 6362 mutex_exit(&vdc->lock); 6363 thread_exit(); 6364 } 6365 6366 /* 6367 * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
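 *
 * The ioctl argument is the failfast polling interval in milliseconds
 * and a value of 0 disables failfast (note the mh_time * 1000
 * conversion to microseconds below). For example (a hypothetical
 * userland sketch, not part of this driver):
 *
 *	unsigned int mh_time = 1000;	-- check the disk every second
 *
 *	if (ioctl(fd, MHIOCENFAILFAST, &mh_time) != 0)
 *		perror("MHIOCENFAILFAST");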
6368 */ 6369 static int 6370 vdc_failfast(vdc_t *vdc, caddr_t arg, int mode) 6371 { 6372 unsigned int mh_time; 6373 6374 if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode)) 6375 return (EFAULT); 6376 6377 mutex_enter(&vdc->lock); 6378 if (mh_time != 0 && vdc->failfast_thread == NULL) { 6379 vdc->failfast_thread = thread_create(NULL, 0, 6380 vdc_failfast_thread, vdc, 0, &p0, TS_RUN, 6381 v.v_maxsyspri - 2); 6382 } 6383 6384 vdc->failfast_interval = mh_time * 1000; 6385 cv_signal(&vdc->failfast_cv); 6386 mutex_exit(&vdc->lock); 6387 6388 return (0); 6389 } 6390 6391 /* 6392 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are 6393 * converted to VD_OP_SET_ACCESS operations. 6394 */ 6395 static int 6396 vdc_access_set(vdc_t *vdc, uint64_t flags, int mode) 6397 { 6398 int rv; 6399 6400 /* submit ownership command request */ 6401 rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags, 6402 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6403 VIO_both_dir, B_TRUE); 6404 6405 return (rv); 6406 } 6407 6408 /* 6409 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a 6410 * VD_OP_GET_ACCESS operation. 6411 */ 6412 static int 6413 vdc_access_get(vdc_t *vdc, uint64_t *status, int mode) 6414 { 6415 int rv; 6416 6417 /* submit ownership command request */ 6418 rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status, 6419 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6420 VIO_both_dir, B_TRUE); 6421 6422 return (rv); 6423 } 6424 6425 /* 6426 * Disk Ownership Thread. 6427 * 6428 * When we have taken the ownership of a disk, this thread waits to be 6429 * notified when the LDC channel is reset so that it can recover the 6430 * ownership. 6431 * 6432 * Note that the thread handling the LDC reset (vdc_process_msg_thread()) 6433 * cannot be used to do the ownership recovery because it has to be 6434 * running to handle the reply message to the ownership operation. 6435 */ 6436 static void 6437 vdc_ownership_thread(void *arg) 6438 { 6439 vdc_t *vdc = (vdc_t *)arg; 6440 clock_t timeout; 6441 uint64_t status; 6442 6443 mutex_enter(&vdc->ownership_lock); 6444 mutex_enter(&vdc->lock); 6445 6446 while (vdc->ownership & VDC_OWNERSHIP_WANTED) { 6447 6448 if ((vdc->ownership & VDC_OWNERSHIP_RESET) || 6449 !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) { 6450 /* 6451 * There was a reset so the ownership has been lost, 6452 * try to recover. We do this without using the preempt 6453 * option so that we don't steal the ownership from 6454 * someone who has preempted us. 6455 */ 6456 DMSG(vdc, 0, "[%d] Ownership lost, recovering", 6457 vdc->instance); 6458 6459 vdc->ownership &= ~(VDC_OWNERSHIP_RESET | 6460 VDC_OWNERSHIP_GRANTED); 6461 6462 mutex_exit(&vdc->lock); 6463 6464 status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6465 VD_ACCESS_SET_PRESERVE, FKIOCTL); 6466 6467 mutex_enter(&vdc->lock); 6468 6469 if (status == 0) { 6470 DMSG(vdc, 0, "[%d] Ownership recovered", 6471 vdc->instance); 6472 vdc->ownership |= VDC_OWNERSHIP_GRANTED; 6473 } else { 6474 DMSG(vdc, 0, "[%d] Failed to recover ownership", 6475 vdc->instance); 6476 } 6477 6478 } 6479 6480 /* 6481 * If we have the ownership then we just wait for an event 6482 * to happen (LDC reset), otherwise we will retry to recover 6483 * after a delay.
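 *
 * A timeout of 0 is used below as a sentinel meaning "wait with no
 * deadline": the thread then blocks in cv_wait() until signaled.
 * Otherwise cv_timedwait() is given an absolute deadline in lbolt
 * ticks, computed from the recovery delay (in microseconds) as:
 *
 *	timeout = ddi_get_lbolt() + drv_usectohz(vdc_ownership_delay);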
6484 */ 6485 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) 6486 timeout = 0; 6487 else 6488 timeout = ddi_get_lbolt() + 6489 drv_usectohz(vdc_ownership_delay); 6490 6491 /* Release the ownership_lock and wait on the vdc lock */ 6492 mutex_exit(&vdc->ownership_lock); 6493 6494 if (timeout == 0) 6495 (void) cv_wait(&vdc->ownership_cv, &vdc->lock); 6496 else 6497 (void) cv_timedwait(&vdc->ownership_cv, 6498 &vdc->lock, timeout); 6499 6500 mutex_exit(&vdc->lock); 6501 6502 mutex_enter(&vdc->ownership_lock); 6503 mutex_enter(&vdc->lock); 6504 } 6505 6506 vdc->ownership_thread = NULL; 6507 mutex_exit(&vdc->lock); 6508 mutex_exit(&vdc->ownership_lock); 6509 6510 thread_exit(); 6511 } 6512 6513 static void 6514 vdc_ownership_update(vdc_t *vdc, int ownership_flags) 6515 { 6516 ASSERT(MUTEX_HELD(&vdc->ownership_lock)); 6517 6518 mutex_enter(&vdc->lock); 6519 vdc->ownership = ownership_flags; 6520 if ((vdc->ownership & VDC_OWNERSHIP_WANTED) && 6521 vdc->ownership_thread == NULL) { 6522 /* start ownership thread */ 6523 vdc->ownership_thread = thread_create(NULL, 0, 6524 vdc_ownership_thread, vdc, 0, &p0, TS_RUN, 6525 v.v_maxsyspri - 2); 6526 } else { 6527 /* notify the ownership thread */ 6528 cv_signal(&vdc->ownership_cv); 6529 } 6530 mutex_exit(&vdc->lock); 6531 } 6532 6533 /* 6534 * Get the size and the block size of a virtual disk from the vdisk server. 6535 * We need to use this operation when the vdisk_size attribute was not 6536 * available during the handshake with the vdisk server. 6537 */ 6538 static int 6539 vdc_check_capacity(vdc_t *vdc) 6540 { 6541 int rv = 0; 6542 size_t alloc_len; 6543 vd_capacity_t *vd_cap; 6544 6545 if (vdc->vdisk_size != 0) 6546 return (0); 6547 6548 alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t)); 6549 6550 vd_cap = kmem_zalloc(alloc_len, KM_SLEEP); 6551 6552 rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len, 6553 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE); 6554 6555 if (rv == 0) { 6556 if (vd_cap->vdisk_block_size != vdc->block_size || 6557 vd_cap->vdisk_size == VD_SIZE_UNKNOWN || 6558 vd_cap->vdisk_size == 0) 6559 rv = EINVAL; 6560 else 6561 vdc->vdisk_size = vd_cap->vdisk_size; 6562 } 6563 6564 kmem_free(vd_cap, alloc_len); 6565 return (rv); 6566 } 6567 6568 /* 6569 * This structure is used in the DKIO(7I) array below. 
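 *
 * Each entry ties a Solaris ioctl to the vDisk operation and conversion
 * callback that implement it. For example, the DKIOCGGEOM entry in the
 * table below,
 *
 *	{VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t),
 *	    vdc_get_geom_convert},
 *
 * makes vd_process_ioctl() send a VD_OP_GET_DISKGEOM request carrying a
 * vd_geom_t payload, then call vdc_get_geom_convert() to translate the
 * reply into the struct dk_geom expected by the caller.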
6570 */ 6571 typedef struct vdc_dk_ioctl { 6572 uint8_t op; /* VD_OP_XXX value */ 6573 int cmd; /* Solaris ioctl operation number */ 6574 size_t nbytes; /* size of structure to be copied */ 6575 6576 /* function to convert between vDisk and Solaris structure formats */ 6577 int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg, 6578 int mode, int dir); 6579 } vdc_dk_ioctl_t; 6580 6581 /* 6582 * Subset of DKIO(7I) operations currently supported 6583 */ 6584 static vdc_dk_ioctl_t dk_ioctl[] = { 6585 {VD_OP_FLUSH, DKIOCFLUSHWRITECACHE, 0, 6586 vdc_null_copy_func}, 6587 {VD_OP_GET_WCE, DKIOCGETWCE, sizeof (int), 6588 vdc_get_wce_convert}, 6589 {VD_OP_SET_WCE, DKIOCSETWCE, sizeof (int), 6590 vdc_set_wce_convert}, 6591 {VD_OP_GET_VTOC, DKIOCGVTOC, sizeof (vd_vtoc_t), 6592 vdc_get_vtoc_convert}, 6593 {VD_OP_SET_VTOC, DKIOCSVTOC, sizeof (vd_vtoc_t), 6594 vdc_set_vtoc_convert}, 6595 {VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t), 6596 vdc_get_geom_convert}, 6597 {VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, sizeof (vd_geom_t), 6598 vdc_get_geom_convert}, 6599 {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t), 6600 vdc_get_geom_convert}, 6601 {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t), 6602 vdc_set_geom_convert}, 6603 {VD_OP_GET_EFI, DKIOCGETEFI, 0, 6604 vdc_get_efi_convert}, 6605 {VD_OP_SET_EFI, DKIOCSETEFI, 0, 6606 vdc_set_efi_convert}, 6607 6608 /* DIOCTL_RWCMD is converted to a read or a write */ 6609 {0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL}, 6610 6611 /* mhd(7I) non-shared multihost disks ioctls */ 6612 {0, MHIOCTKOWN, 0, vdc_null_copy_func}, 6613 {0, MHIOCRELEASE, 0, vdc_null_copy_func}, 6614 {0, MHIOCSTATUS, 0, vdc_null_copy_func}, 6615 {0, MHIOCQRESERVE, 0, vdc_null_copy_func}, 6616 6617 /* mhd(7I) shared multihost disks ioctls */ 6618 {0, MHIOCGRP_INKEYS, 0, vdc_null_copy_func}, 6619 {0, MHIOCGRP_INRESV, 0, vdc_null_copy_func}, 6620 {0, MHIOCGRP_REGISTER, 0, vdc_null_copy_func}, 6621 {0, MHIOCGRP_RESERVE, 0, vdc_null_copy_func}, 6622 {0, MHIOCGRP_PREEMPTANDABORT, 0, vdc_null_copy_func}, 6623 {0, MHIOCGRP_REGISTERANDIGNOREKEY, 0, vdc_null_copy_func}, 6624 6625 /* mhd(7I) failfast ioctl */ 6626 {0, MHIOCENFAILFAST, 0, vdc_null_copy_func}, 6627 6628 /* 6629 * These particular ioctls are not sent to the server - vdc fakes up 6630 * the necessary info. 6631 */ 6632 {0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func}, 6633 {0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func}, 6634 {0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func}, 6635 {0, DKIOCPARTITION, 0, vdc_null_copy_func }, 6636 {0, DKIOCGAPART, 0, vdc_null_copy_func }, 6637 {0, DKIOCREMOVABLE, 0, vdc_null_copy_func}, 6638 {0, CDROMREADOFFSET, 0, vdc_null_copy_func} 6639 }; 6640 6641 /* 6642 * This function handles ioctl requests from the vd_efi_alloc_and_read() 6643 * function and forwards them to the vdisk.
6644 */ 6645 static int 6646 vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg) 6647 { 6648 vdc_t *vdc = (vdc_t *)vdisk; 6649 dev_t dev; 6650 int rval; 6651 6652 dev = makedevice(ddi_driver_major(vdc->dip), 6653 VD_MAKE_DEV(vdc->instance, 0)); 6654 6655 return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval)); 6656 } 6657 6658 /* 6659 * Function: 6660 * vd_process_ioctl() 6661 * 6662 * Description: 6663 * This routine processes disk specific ioctl calls 6664 * 6665 * Arguments: 6666 * dev - the device number 6667 * cmd - the operation [dkio(7I)] to be processed 6668 * arg - pointer to user provided structure 6669 * (contains data to be set or reference parameter for get) 6670 * mode - bit flag, indicating open settings, 32/64 bit type, etc 6671 * rvalp - pointer to return value for calling process. 6672 * 6673 * Return Code: 6674 * 0 6675 * EFAULT 6676 * ENXIO 6677 * EIO 6678 * ENOTSUP 6679 */ 6680 static int 6681 vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp) 6682 { 6683 int instance = VDCUNIT(dev); 6684 vdc_t *vdc = NULL; 6685 int rv = -1; 6686 int idx = 0; /* index into dk_ioctl[] */ 6687 size_t len = 0; /* #bytes to send to vds */ 6688 size_t alloc_len = 0; /* #bytes to allocate mem for */ 6689 caddr_t mem_p = NULL; 6690 size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0])); 6691 vdc_dk_ioctl_t *iop; 6692 6693 vdc = ddi_get_soft_state(vdc_state, instance); 6694 if (vdc == NULL) { 6695 cmn_err(CE_NOTE, "![%d] Could not get soft state structure", 6696 instance); 6697 return (ENXIO); 6698 } 6699 6700 DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n", 6701 instance, cmd, dev, ddi_model_convert_from(mode & FMODELS)); 6702 6703 if (rvalp != NULL) { 6704 /* the return value of the ioctl is 0 by default */ 6705 *rvalp = 0; 6706 } 6707 6708 /* 6709 * Validate the ioctl operation to be performed. 6710 * 6711 * If we have looped through the array without finding a match then we 6712 * don't support this ioctl. 
6713 */ 6714 for (idx = 0; idx < nioctls; idx++) { 6715 if (cmd == dk_ioctl[idx].cmd) 6716 break; 6717 } 6718 6719 if (idx >= nioctls) { 6720 DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n", 6721 vdc->instance, cmd); 6722 return (ENOTSUP); 6723 } 6724 6725 iop = &(dk_ioctl[idx]); 6726 6727 if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) { 6728 /* size is not fixed for EFI ioctls; it depends on the ioctl arg */ 6729 dk_efi_t dk_efi; 6730 6731 rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode); 6732 if (rv != 0) 6733 return (EFAULT); 6734 6735 len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length; 6736 } else { 6737 len = iop->nbytes; 6738 } 6739 6740 /* check if the ioctl is applicable */ 6741 switch (cmd) { 6742 case CDROMREADOFFSET: 6743 case DKIOCREMOVABLE: 6744 return (ENOTTY); 6745 6746 case USCSICMD: 6747 case MHIOCTKOWN: 6748 case MHIOCSTATUS: 6749 case MHIOCQRESERVE: 6750 case MHIOCRELEASE: 6751 case MHIOCGRP_INKEYS: 6752 case MHIOCGRP_INRESV: 6753 case MHIOCGRP_REGISTER: 6754 case MHIOCGRP_RESERVE: 6755 case MHIOCGRP_PREEMPTANDABORT: 6756 case MHIOCGRP_REGISTERANDIGNOREKEY: 6757 case MHIOCENFAILFAST: 6758 if (vdc->cinfo == NULL) 6759 return (ENXIO); 6760 if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS) 6761 return (ENOTTY); 6762 break; 6763 6764 case DIOCTL_RWCMD: 6765 if (vdc->cinfo == NULL) 6766 return (ENXIO); 6767 if (vdc->cinfo->dki_ctype != DKC_DIRECT) 6768 return (ENOTTY); 6769 break; 6770 6771 case DKIOCINFO: 6772 if (vdc->cinfo == NULL) 6773 return (ENXIO); 6774 break; 6775 6776 case DKIOCGMEDIAINFO: 6777 if (vdc->minfo == NULL) 6778 return (ENXIO); 6779 if (vdc_check_capacity(vdc) != 0) 6780 /* disk capacity is not available */ 6781 return (EIO); 6782 break; 6783 } 6784 6785 /* 6786 * Deal with ioctls which require processing different from 6787 * converting ioctl arguments and sending a corresponding 6788 * VD operation. 6789 */ 6790 switch (cmd) { 6791 6792 case USCSICMD: 6793 { 6794 return (vdc_uscsi_cmd(vdc, arg, mode)); 6795 } 6796 6797 case MHIOCTKOWN: 6798 { 6799 mutex_enter(&vdc->ownership_lock); 6800 /* 6801 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership 6802 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset 6803 * while we are processing the ioctl. 6804 */ 6805 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED); 6806 6807 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6808 VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode); 6809 if (rv == 0) { 6810 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED | 6811 VDC_OWNERSHIP_GRANTED); 6812 } else { 6813 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6814 } 6815 mutex_exit(&vdc->ownership_lock); 6816 return (rv); 6817 } 6818 6819 case MHIOCRELEASE: 6820 { 6821 mutex_enter(&vdc->ownership_lock); 6822 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode); 6823 if (rv == 0) { 6824 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6825 } 6826 mutex_exit(&vdc->ownership_lock); 6827 return (rv); 6828 } 6829 6830 case MHIOCSTATUS: 6831 { 6832 uint64_t status; 6833 6834 rv = vdc_access_get(vdc, &status, mode); 6835 if (rv == 0 && rvalp != NULL) 6836 *rvalp = (status & VD_ACCESS_ALLOWED)?
0 : 1; 6837 return (rv); 6838 } 6839 6840 case MHIOCQRESERVE: 6841 { 6842 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode); 6843 return (rv); 6844 } 6845 6846 case MHIOCGRP_INKEYS: 6847 { 6848 return (vdc_mhd_inkeys(vdc, arg, mode)); 6849 } 6850 6851 case MHIOCGRP_INRESV: 6852 { 6853 return (vdc_mhd_inresv(vdc, arg, mode)); 6854 } 6855 6856 case MHIOCGRP_REGISTER: 6857 { 6858 return (vdc_mhd_register(vdc, arg, mode)); 6859 } 6860 6861 case MHIOCGRP_RESERVE: 6862 { 6863 return (vdc_mhd_reserve(vdc, arg, mode)); 6864 } 6865 6866 case MHIOCGRP_PREEMPTANDABORT: 6867 { 6868 return (vdc_mhd_preemptabort(vdc, arg, mode)); 6869 } 6870 6871 case MHIOCGRP_REGISTERANDIGNOREKEY: 6872 { 6873 return (vdc_mhd_registerignore(vdc, arg, mode)); 6874 } 6875 6876 case MHIOCENFAILFAST: 6877 { 6878 rv = vdc_failfast(vdc, arg, mode); 6879 return (rv); 6880 } 6881 6882 case DIOCTL_RWCMD: 6883 { 6884 return (vdc_dioctl_rwcmd(dev, arg, mode)); 6885 } 6886 6887 case DKIOCGAPART: 6888 { 6889 return (vdc_dkio_gapart(vdc, arg, mode)); 6890 } 6891 6892 case DKIOCPARTITION: 6893 { 6894 return (vdc_dkio_partition(vdc, arg, mode)); 6895 } 6896 6897 case DKIOCINFO: 6898 { 6899 struct dk_cinfo cinfo; 6900 6901 bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo)); 6902 cinfo.dki_partition = VDCPART(dev); 6903 6904 rv = ddi_copyout(&cinfo, (void *)arg, 6905 sizeof (struct dk_cinfo), mode); 6906 if (rv != 0) 6907 return (EFAULT); 6908 6909 return (0); 6910 } 6911 6912 case DKIOCGMEDIAINFO: 6913 { 6914 ASSERT(vdc->vdisk_size != 0); 6915 if (vdc->minfo->dki_capacity == 0) 6916 vdc->minfo->dki_capacity = vdc->vdisk_size; 6917 rv = ddi_copyout(vdc->minfo, (void *)arg, 6918 sizeof (struct dk_minfo), mode); 6919 if (rv != 0) 6920 return (EFAULT); 6921 6922 return (0); 6923 } 6924 6925 case DKIOCFLUSHWRITECACHE: 6926 { 6927 struct dk_callback *dkc = 6928 (struct dk_callback *)(uintptr_t)arg; 6929 vdc_dk_arg_t *dkarg = NULL; 6930 6931 DMSG(vdc, 1, "[%d] Flush W$: mode %x\n", 6932 instance, mode); 6933 6934 /* 6935 * If arg is NULL, then there is no callback function 6936 * registered and the call operates synchronously; we 6937 * break and continue with the rest of the function and 6938 * wait for vds to return (i.e. after the request to 6939 * vds returns successfully, all writes completed prior 6940 * to the ioctl will have been flushed from the disk 6941 * write cache to persistent media). 6942 * 6943 * If a callback function is registered, we dispatch 6944 * the request on a task queue and return immediately. 6945 * The callback will deal with informing the calling 6946 * thread that the flush request is completed. 6947 */ 6948 if (dkc == NULL) 6949 break; 6950 6951 /* 6952 * the asynchronous callback is only supported if 6953 * invoked from within the kernel 6954 */ 6955 if ((mode & FKIOCTL) == 0) 6956 return (ENOTSUP); 6957 6958 dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP); 6959 6960 dkarg->mode = mode; 6961 dkarg->dev = dev; 6962 bcopy(dkc, &dkarg->dkc, sizeof (*dkc)); 6963 6964 mutex_enter(&vdc->lock); 6965 vdc->dkio_flush_pending++; 6966 dkarg->vdc = vdc; 6967 mutex_exit(&vdc->lock); 6968 6969 /* put the request on a task queue */ 6970 rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb, 6971 (void *)dkarg, DDI_SLEEP); 6972 if (rv == NULL) { 6973 /* clean up if dispatch fails */ 6974 mutex_enter(&vdc->lock); 6975 vdc->dkio_flush_pending--; 6976 mutex_exit(&vdc->lock); 6977 kmem_free(dkarg, sizeof (vdc_dk_arg_t)); 6978 } 6979 6980 return (rv == NULL ?
ENOMEM : 0); 6981 } 6982 } 6983 6984 /* catch programming error in vdc - should be a VD_OP_XXX ioctl */ 6985 ASSERT(iop->op != 0); 6986 6987 /* check if the vDisk server handles the operation for this vDisk */ 6988 if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) { 6989 DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n", 6990 vdc->instance, iop->op); 6991 return (ENOTSUP); 6992 } 6993 6994 /* LDC requires that the memory being mapped is 8-byte aligned */ 6995 alloc_len = P2ROUNDUP(len, sizeof (uint64_t)); 6996 DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n", 6997 instance, len, alloc_len); 6998 6999 if (alloc_len > 0) 7000 mem_p = kmem_zalloc(alloc_len, KM_SLEEP); 7001 7002 /* 7003 * Call the conversion function for this ioctl which, if necessary, 7004 * converts from the Solaris format to the format ARC'ed 7005 * as part of the vDisk protocol (FWARC 2006/195) 7006 */ 7007 ASSERT(iop->convert != NULL); 7008 rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN); 7009 if (rv != 0) { 7010 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7011 instance, rv, cmd); 7012 if (mem_p != NULL) 7013 kmem_free(mem_p, alloc_len); 7014 return (rv); 7015 } 7016 7017 /* 7018 * send request to vds to service the ioctl. 7019 */ 7020 rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len, 7021 VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode, 7022 VIO_both_dir, B_TRUE); 7023 7024 if (rv != 0) { 7025 /* 7026 * This is not necessarily an error. The ioctl could 7027 * be returning a value such as ENOTTY to indicate 7028 * that the ioctl is not applicable. 7029 */ 7030 DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n", 7031 instance, rv, cmd); 7032 if (mem_p != NULL) 7033 kmem_free(mem_p, alloc_len); 7034 7035 return (rv); 7036 } 7037 7038 /* 7039 * Call the conversion function (if it exists) for this ioctl 7040 * which converts from the format ARC'ed as part of the vDisk 7041 * protocol (FWARC 2006/195) back to a format understood by 7042 * the rest of Solaris. 
7043 */ 7044 rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT); 7045 if (rv != 0) { 7046 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7047 instance, rv, cmd); 7048 if (mem_p != NULL) 7049 kmem_free(mem_p, alloc_len); 7050 return (rv); 7051 } 7052 7053 if (mem_p != NULL) 7054 kmem_free(mem_p, alloc_len); 7055 7056 return (rv); 7057 } 7058 7059 /* 7060 * Function: 7061 * vdc_null_copy_func() 7062 * 7063 * Description: 7064 * This is an empty conversion function used by ioctl calls which 7065 * do not need to convert the data being passed in/out to userland 7066 */ 7067 static int 7068 vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir) 7069 { 7070 _NOTE(ARGUNUSED(vdc)) 7071 _NOTE(ARGUNUSED(from)) 7072 _NOTE(ARGUNUSED(to)) 7073 _NOTE(ARGUNUSED(mode)) 7074 _NOTE(ARGUNUSED(dir)) 7075 7076 return (0); 7077 } 7078 static int 7079 vdc_get_wce_convert(vdc_t *vdc, void *from, void *to, 7080 int mode, int dir) 7081 { 7082 _NOTE(ARGUNUSED(vdc)) 7083 7084 if (dir == VD_COPYIN) 7085 return (0); /* nothing to do */ 7086 7087 if (ddi_copyout(from, to, sizeof (int), mode) != 0) 7088 return (EFAULT); 7089 7090 return (0); 7091 } 7092 7093 static int 7094 vdc_set_wce_convert(vdc_t *vdc, void *from, void *to, 7095 int mode, int dir) 7096 { 7097 _NOTE(ARGUNUSED(vdc)) 7098 7099 if (dir == VD_COPYOUT) 7100 return (0); /* nothing to do */ 7101 7102 if (ddi_copyin(from, to, sizeof (int), mode) != 0) 7103 return (EFAULT); 7104 7105 return (0); 7106 } 7107 7108 /* 7109 * Function: 7110 * vdc_get_vtoc_convert() 7111 * 7112 * Description: 7113 * This routine performs the necessary conversions from the DKIOCGVTOC 7114 * Solaris structure to the format defined in FWARC 2006/195. 7115 * 7116 * In the struct vtoc definition, the timestamp field is marked as not 7117 * supported so it is not part of the vDisk protocol (FWARC 2006/195). 7118 * However SVM uses that field to check that it can write into the VTOC, 7119 * so we fake up the info of that field. 7120 * 7121 * Arguments: 7122 * vdc - the vDisk client 7123 * from - the buffer containing the data to be copied from 7124 * to - the buffer to be copied to 7125 * mode - flags passed to ioctl() call 7126 * dir - the "direction" of the copy - VD_COPYIN or VD_COPYOUT 7127 * 7128 * Return Code: 7129 * 0 - Success 7130 * ENXIO - incorrect buffer passed in. 7131 * EFAULT - ddi_copyout routine encountered an error.
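 *
 * As a sketch of the data model handling performed below, an ILP32
 * caller receives a struct vtoc32 while an LP64 caller receives a
 * struct vtoc, the model being derived from the ioctl mode bits:
 *
 *	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32)
 *		copy_len = sizeof (struct vtoc32);
 *	else
 *		copy_len = sizeof (struct vtoc);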
7132 */ 7133 static int 7134 vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7135 { 7136 int i; 7137 void *tmp_mem = NULL; 7138 void *tmp_memp; 7139 struct vtoc vt; 7140 struct vtoc32 vt32; 7141 int copy_len = 0; 7142 int rv = 0; 7143 7144 if (dir != VD_COPYOUT) 7145 return (0); /* nothing to do */ 7146 7147 if ((from == NULL) || (to == NULL)) 7148 return (ENXIO); 7149 7150 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) 7151 copy_len = sizeof (struct vtoc32); 7152 else 7153 copy_len = sizeof (struct vtoc); 7154 7155 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7156 7157 VD_VTOC2VTOC((vd_vtoc_t *)from, &vt); 7158 7159 /* fake the VTOC timestamp field */ 7160 for (i = 0; i < V_NUMPAR; i++) { 7161 vt.timestamp[i] = vdc->vtoc->timestamp[i]; 7162 } 7163 7164 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7165 /* LINTED E_ASSIGN_NARROW_CONV */ 7166 vtoctovtoc32(vt, vt32); 7167 tmp_memp = &vt32; 7168 } else { 7169 tmp_memp = &vt; 7170 } 7171 rv = ddi_copyout(tmp_memp, to, copy_len, mode); 7172 if (rv != 0) 7173 rv = EFAULT; 7174 7175 kmem_free(tmp_mem, copy_len); 7176 return (rv); 7177 } 7178 7179 /* 7180 * Function: 7181 * vdc_set_vtoc_convert() 7182 * 7183 * Description: 7184 * This routine performs the necessary conversions from the DKIOCSVTOC 7185 * Solaris structure to the format defined in FWARC 2006/195. 7186 * 7187 * Arguments: 7188 * vdc - the vDisk client 7189 * from - Buffer with data 7190 * to - Buffer where data is to be copied to 7191 * mode - flags passed to ioctl 7192 * dir - direction of copy (in or out) 7193 * 7194 * Return Code: 7195 * 0 - Success 7196 * ENXIO - Invalid buffer passed in 7197 * EFAULT - ddi_copyin of data failed 7198 */ 7199 static int 7200 vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7201 { 7202 _NOTE(ARGUNUSED(vdc)) 7203 7204 void *tmp_mem = NULL, *uvtoc; 7205 struct vtoc vt; 7206 struct vtoc *vtp = &vt; 7207 vd_vtoc_t vtvd; 7208 int copy_len = 0; 7209 int i, rv = 0; 7210 7211 if ((from == NULL) || (to == NULL)) 7212 return (ENXIO); 7213 7214 if (dir == VD_COPYIN) 7215 uvtoc = from; 7216 else 7217 uvtoc = to; 7218 7219 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) 7220 copy_len = sizeof (struct vtoc32); 7221 else 7222 copy_len = sizeof (struct vtoc); 7223 7224 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7225 7226 rv = ddi_copyin(uvtoc, tmp_mem, copy_len, mode); 7227 if (rv != 0) { 7228 kmem_free(tmp_mem, copy_len); 7229 return (EFAULT); 7230 } 7231 7232 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7233 vtoc32tovtoc((*(struct vtoc32 *)tmp_mem), vt); 7234 } else { 7235 vtp = tmp_mem; 7236 } 7237 7238 if (dir == VD_COPYOUT) { 7239 /* 7240 * The disk label may have changed. Revalidate the disk 7241 * geometry. This will also update the device nodes. 7242 */ 7243 vdc_validate(vdc); 7244 7245 /* 7246 * We also need to keep track of the timestamp fields.
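 * The timestamp is not part of the vDisk protocol (FWARC 2006/195), so
 * the copy saved in vdc->vtoc here is what vdc_get_vtoc_convert() later
 * hands back to callers, such as SVM, that expect it to be preserved.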
7247 */ 7248 for (i = 0; i < V_NUMPAR; i++) { 7249 vdc->vtoc->timestamp[i] = vtp->timestamp[i]; 7250 } 7251 kmem_free(tmp_mem, copy_len); 7252 return (0); 7253 } 7254 7255 VTOC2VD_VTOC(vtp, &vtvd); 7256 bcopy(&vtvd, to, sizeof (vd_vtoc_t)); 7257 kmem_free(tmp_mem, copy_len); 7258 7259 return (0); 7260 } 7261 7262 /* 7263 * Function: 7264 * vdc_get_geom_convert() 7265 * 7266 * Description: 7267 * This routine performs the necessary conversions from the DKIOCGGEOM, 7268 * DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format 7269 * defined in FWARC 2006/195 7270 * 7271 * Arguments: 7272 * vdc - the vDisk client 7273 * from - Buffer with data 7274 * to - Buffer where data is to be copied to 7275 * mode - flags passed to ioctl 7276 * dir - direction of copy (in or out) 7277 * 7278 * Return Code: 7279 * 0 - Success 7280 * ENXIO - Invalid buffer passed in 7281 * EFAULT - ddi_copyout of data failed 7282 */ 7283 static int 7284 vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7285 { 7286 _NOTE(ARGUNUSED(vdc)) 7287 7288 struct dk_geom geom; 7289 int copy_len = sizeof (struct dk_geom); 7290 int rv = 0; 7291 7292 if (dir != VD_COPYOUT) 7293 return (0); /* nothing to do */ 7294 7295 if ((from == NULL) || (to == NULL)) 7296 return (ENXIO); 7297 7298 VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom); 7299 rv = ddi_copyout(&geom, to, copy_len, mode); 7300 if (rv != 0) 7301 rv = EFAULT; 7302 7303 return (rv); 7304 } 7305 7306 /* 7307 * Function: 7308 * vdc_set_geom_convert() 7309 * 7310 * Description: 7311 * This routine performs the necessary conversions from the DKIOCSGEOM 7312 * Solaris structure to the format defined in FWARC 2006/195. 7313 * 7314 * Arguments: 7315 * vdc - the vDisk client 7316 * from - Buffer with data 7317 * to - Buffer where data is to be copied to 7318 * mode - flags passed to ioctl 7319 * dir - direction of copy (in or out) 7320 * 7321 * Return Code: 7322 * 0 - Success 7323 * ENXIO - Invalid buffer passed in 7324 * EFAULT - ddi_copyin of data failed 7325 */ 7326 static int 7327 vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7328 { 7329 _NOTE(ARGUNUSED(vdc)) 7330 7331 vd_geom_t vdgeom; 7332 void *tmp_mem = NULL; 7333 int copy_len = sizeof (struct dk_geom); 7334 int rv = 0; 7335 7336 if (dir != VD_COPYIN) 7337 return (0); /* nothing to do */ 7338 7339 if ((from == NULL) || (to == NULL)) 7340 return (ENXIO); 7341 7342 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7343 7344 rv = ddi_copyin(from, tmp_mem, copy_len, mode); 7345 if (rv != 0) { 7346 kmem_free(tmp_mem, copy_len); 7347 return (EFAULT); 7348 } 7349 DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom); 7350 bcopy(&vdgeom, to, sizeof (vdgeom)); 7351 kmem_free(tmp_mem, copy_len); 7352 7353 return (0); 7354 } 7355 7356 static int 7357 vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7358 { 7359 _NOTE(ARGUNUSED(vdc)) 7360 7361 vd_efi_t *vd_efi; 7362 dk_efi_t dk_efi; 7363 int rv = 0; 7364 void *uaddr; 7365 7366 if ((from == NULL) || (to == NULL)) 7367 return (ENXIO); 7368 7369 if (dir == VD_COPYIN) { 7370 7371 vd_efi = (vd_efi_t *)to; 7372 7373 rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode); 7374 if (rv != 0) 7375 return (EFAULT); 7376 7377 vd_efi->lba = dk_efi.dki_lba; 7378 vd_efi->length = dk_efi.dki_length; 7379 bzero(vd_efi->data, vd_efi->length); 7380 7381 } else { 7382 7383 rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode); 7384 if (rv != 0) 7385 return (EFAULT); 7386 7387 uaddr = dk_efi.dki_data; 7388 7389 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);
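/*
 * dki_data has been redirected to a kernel buffer; the caller's buffer
 * address was saved in uaddr above and is used as the copyout
 * destination once the EFI data has been converted below.
 */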
7390 7391 VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi); 7392 7393 rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length, 7394 mode); 7395 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7396 if (rv != 0) 7397 return (EFAULT); 7398 7399 } 7400 7401 return (0); 7402 } 7403 7404 static int 7405 vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7406 { 7407 _NOTE(ARGUNUSED(vdc)) 7408 7409 dk_efi_t dk_efi; 7410 void *uaddr; 7411 7412 if (dir == VD_COPYOUT) { 7413 /* 7414 * The disk label may have changed. Revalidate the disk 7415 * geometry. This will also update the device nodes. 7416 */ 7417 vdc_validate(vdc); 7418 return (0); 7419 } 7420 7421 if ((from == NULL) || (to == NULL)) 7422 return (ENXIO); 7423 7424 if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0) 7425 return (EFAULT); 7426 7427 uaddr = dk_efi.dki_data; 7428 7429 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 7430 7431 if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) { 7432 kmem_free(dk_efi.dki_data, dk_efi.dki_length); return (EFAULT); 7433 } 7434 DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to); 7435 7436 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7437 7438 return (0); 7439 } 7440 7441 7442 /* -------------------------------------------------------------------------- */ 7443 7444 /* 7445 * Function: 7446 * vdc_create_fake_geometry() 7447 * 7448 * Description: 7449 * This routine fakes up the disk info needed for some DKIO ioctls such 7450 * as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do] 7451 * 7452 * Note: This function must not be called until the vDisk attributes have 7453 * been exchanged as part of the handshake with the vDisk server. 7454 * 7455 * Arguments: 7456 * vdc - soft state pointer for this instance of the device driver. 7457 * 7458 * Return Code: 7459 * none. 7460 */ 7461 static void 7462 vdc_create_fake_geometry(vdc_t *vdc) 7463 { 7464 ASSERT(vdc != NULL); 7465 ASSERT(vdc->max_xfer_sz != 0); 7466 7467 /* 7468 * DKIOCINFO support 7469 */ 7470 if (vdc->cinfo == NULL) 7471 vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); 7472 7473 (void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME); 7474 (void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME); 7475 /* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */ 7476 vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz; 7477 7478 /* 7479 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD 7480 * operation is supported, otherwise the controller type is DKC_DIRECT. 7481 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the 7482 * controller type is always DKC_DIRECT in that case.
7483 * 7484 * If the virtual disk is backed by a physical CD/DVD device or 7485 * an ISO image, modify the controller type to indicate this 7486 */ 7487 switch (vdc->vdisk_media) { 7488 case VD_MEDIA_CD: 7489 case VD_MEDIA_DVD: 7490 vdc->cinfo->dki_ctype = DKC_CDROM; 7491 break; 7492 case VD_MEDIA_FIXED: 7493 if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD)) 7494 vdc->cinfo->dki_ctype = DKC_SCSI_CCS; 7495 else 7496 vdc->cinfo->dki_ctype = DKC_DIRECT; 7497 break; 7498 default: 7499 /* in the case of v1.0 we default to a fixed disk */ 7500 vdc->cinfo->dki_ctype = DKC_DIRECT; 7501 break; 7502 } 7503 vdc->cinfo->dki_flags = DKI_FMTVOL; 7504 vdc->cinfo->dki_cnum = 0; 7505 vdc->cinfo->dki_addr = 0; 7506 vdc->cinfo->dki_space = 0; 7507 vdc->cinfo->dki_prio = 0; 7508 vdc->cinfo->dki_vec = 0; 7509 vdc->cinfo->dki_unit = vdc->instance; 7510 vdc->cinfo->dki_slave = 0; 7511 /* 7512 * The partition number will be created on the fly depending on the 7513 * actual slice (i.e. minor node) that is used to request the data. 7514 */ 7515 vdc->cinfo->dki_partition = 0; 7516 7517 /* 7518 * DKIOCGMEDIAINFO support 7519 */ 7520 if (vdc->minfo == NULL) 7521 vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); 7522 7523 if (vio_ver_is_supported(vdc->ver, 1, 1)) { 7524 vdc->minfo->dki_media_type = 7525 VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media); 7526 } else { 7527 vdc->minfo->dki_media_type = DK_FIXED_DISK; 7528 } 7529 7530 vdc->minfo->dki_capacity = vdc->vdisk_size; 7531 vdc->minfo->dki_lbsize = vdc->block_size; 7532 } 7533 7534 static ushort_t 7535 vdc_lbl2cksum(struct dk_label *label) 7536 { 7537 int count; 7538 ushort_t sum, *sp; 7539 7540 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 7541 sp = (ushort_t *)label; 7542 sum = 0; 7543 while (count--) { 7544 sum ^= *sp++; 7545 } 7546 7547 return (sum); 7548 } 7549 7550 /* 7551 * Function: 7552 * vdc_validate_geometry 7553 * 7554 * Description: 7555 * This routine discovers the label and geometry of the disk. It stores 7556 * the disk label and related information in the vdc structure. If it 7557 * fails to validate the geometry or to discover the disk label then 7558 * the label is marked as unknown (VD_DISK_LABEL_UNK). 7559 * 7560 * Arguments: 7561 * vdc - soft state pointer for this instance of the device driver. 7562 * 7563 * Return Code: 7564 * 0 - success. 7565 * EINVAL - unknown disk label. 7566 * ENOTSUP - geometry not applicable (EFI label). 7567 * EIO - error accessing the disk. 7568 */ 7569 static int 7570 vdc_validate_geometry(vdc_t *vdc) 7571 { 7572 buf_t *buf; /* BREAD requests need to be in a buf_t structure */ 7573 dev_t dev; 7574 int rv, rval; 7575 struct dk_label label; 7576 struct dk_geom geom; 7577 struct vtoc vtoc; 7578 efi_gpt_t *gpt; 7579 efi_gpe_t *gpe; 7580 vd_efi_dev_t edev; 7581 7582 ASSERT(vdc != NULL); 7583 ASSERT(vdc->vtoc != NULL && vdc->geom != NULL); 7584 ASSERT(MUTEX_HELD(&vdc->lock)); 7585 7586 mutex_exit(&vdc->lock); 7587 7588 dev = makedevice(ddi_driver_major(vdc->dip), 7589 VD_MAKE_DEV(vdc->instance, 0)); 7590 7591 rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval); 7592 if (rv == 0) 7593 rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc, 7594 FKIOCTL, &rval); 7595 7596 if (rv == ENOTSUP) { 7597 /* 7598 * If the device does not support VTOC then we try 7599 * to read an EFI label. 7600 * 7601 * We need to know the block size and the disk size to 7602 * be able to read an EFI label. 
7603 */ 7604 if (vdc->vdisk_size == 0) { 7605 if ((rv = vdc_check_capacity(vdc)) != 0) { 7606 mutex_enter(&vdc->lock); 7607 vdc_store_label_unk(vdc); 7608 return (rv); 7609 } 7610 } 7611 7612 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 7613 7614 rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe); 7615 7616 if (rv) { 7617 DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)", 7618 vdc->instance, rv); 7619 mutex_enter(&vdc->lock); 7620 vdc_store_label_unk(vdc); 7621 return (EIO); 7622 } 7623 7624 mutex_enter(&vdc->lock); 7625 vdc_store_label_efi(vdc, gpt, gpe); 7626 vd_efi_free(&edev, gpt, gpe); 7627 return (ENOTSUP); 7628 } 7629 7630 if (rv != 0) { 7631 DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)", 7632 vdc->instance, rv); 7633 mutex_enter(&vdc->lock); 7634 vdc_store_label_unk(vdc); 7635 if (rv != EINVAL) 7636 rv = EIO; 7637 return (rv); 7638 } 7639 7640 /* check that geometry and vtoc are valid */ 7641 if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 || 7642 vtoc.v_sanity != VTOC_SANE) { 7643 mutex_enter(&vdc->lock); 7644 vdc_store_label_unk(vdc); 7645 return (EINVAL); 7646 } 7647 7648 /* 7649 * We have a disk and a valid VTOC. However this does not mean 7650 * that the disk currently has a VTOC label. The returned VTOC may 7651 * be a default VTOC to be used for configuring the disk (this is 7652 * what is done for disk images). So we read the label from the 7653 * beginning of the disk to ensure we really have a VTOC label. 7654 * 7655 * FUTURE: This could be the default way for reading the VTOC 7656 * from the disk as opposed to sending the VD_OP_GET_VTOC 7657 * to the server. This will be the default if vdc is implemented 7658 * on top of cmlb. 7659 */ 7660 7661 /* 7662 * A single-slice disk does not support reads using an absolute disk 7663 * offset so we just rely on the DKIOCGVTOC ioctl in that case. 7664 */ 7665 if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) { 7666 mutex_enter(&vdc->lock); 7667 if (vtoc.v_nparts != 1) { 7668 vdc_store_label_unk(vdc); 7669 return (EINVAL); 7670 } 7671 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7672 return (0); 7673 } 7674 7675 if (vtoc.v_nparts != V_NUMPAR) { 7676 mutex_enter(&vdc->lock); 7677 vdc_store_label_unk(vdc); 7678 return (EINVAL); 7679 } 7680 7681 /* 7682 * Read disk label from start of disk 7683 */ 7684 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 7685 bioinit(buf); 7686 buf->b_un.b_addr = (caddr_t)&label; 7687 buf->b_bcount = DK_LABEL_SIZE; 7688 buf->b_flags = B_BUSY | B_READ; 7689 buf->b_dev = cmpdev(dev); 7690 rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label, 7691 DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir); 7692 if (rv) { 7693 DMSG(vdc, 1, "[%d] Failed to read disk block 0\n", 7694 vdc->instance); 7695 } else { 7696 rv = biowait(buf); 7697 biofini(buf); 7698 } 7699 kmem_free(buf, sizeof (buf_t)); 7700 7701 if (rv != 0 || label.dkl_magic != DKL_MAGIC || 7702 label.dkl_cksum != vdc_lbl2cksum(&label)) { 7703 DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n", 7704 vdc->instance); 7705 mutex_enter(&vdc->lock); 7706 vdc_store_label_unk(vdc); 7707 return (EINVAL); 7708 } 7709 7710 mutex_enter(&vdc->lock); 7711 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7712 return (0); 7713 } 7714 7715 /* 7716 * Function: 7717 * vdc_validate 7718 * 7719 * Description: 7720 * This routine discovers the label of the disk and creates the 7721 * appropriate device nodes if the label has changed. 7722 * 7723 * Arguments: 7724 * vdc - soft state pointer for this instance of the device driver. 7725 * 7726 * Return Code: 7727 * none.
7728 */ 7729 static void 7730 vdc_validate(vdc_t *vdc) 7731 { 7732 vd_disk_label_t old_label; 7733 vd_slice_t old_slice[V_NUMPAR]; 7734 int rv; 7735 7736 ASSERT(!MUTEX_HELD(&vdc->lock)); 7737 7738 mutex_enter(&vdc->lock); 7739 7740 /* save the current label and vtoc */ 7741 old_label = vdc->vdisk_label; 7742 bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR); 7743 7744 /* check the geometry */ 7745 (void) vdc_validate_geometry(vdc); 7746 7747 /* if the disk label has changed, update device nodes */ 7748 if (vdc->vdisk_label != old_label) { 7749 7750 if (vdc->vdisk_label == VD_DISK_LABEL_EFI) 7751 rv = vdc_create_device_nodes_efi(vdc); 7752 else 7753 rv = vdc_create_device_nodes_vtoc(vdc); 7754 7755 if (rv != 0) { 7756 DMSG(vdc, 0, "![%d] Failed to update device nodes", 7757 vdc->instance); 7758 } 7759 } 7760 7761 mutex_exit(&vdc->lock); 7762 } 7763 7764 static void 7765 vdc_validate_task(void *arg) 7766 { 7767 vdc_t *vdc = (vdc_t *)arg; 7768 7769 vdc_validate(vdc); 7770 7771 mutex_enter(&vdc->lock); 7772 ASSERT(vdc->validate_pending > 0); 7773 vdc->validate_pending--; 7774 mutex_exit(&vdc->lock); 7775 } 7776 7777 /* 7778 * Function: 7779 * vdc_setup_devid() 7780 * 7781 * Description: 7782 * This routine discovers the devid of a vDisk. It requests the devid of 7783 * the underlying device from the vDisk server, builds an encapsulated 7784 * devid based on the retrieved devid and registers that new devid to 7785 * the vDisk. 7786 * 7787 * Arguments: 7788 * vdc - soft state pointer for this instance of the device driver. 7789 * 7790 * Return Code: 7791 * 0 - A devid was successfully registered for the vDisk 7792 */ 7793 static int 7794 vdc_setup_devid(vdc_t *vdc) 7795 { 7796 int rv; 7797 vd_devid_t *vd_devid; 7798 size_t bufsize, bufid_len, new_bufsize; 7799 7800 /* 7801 * We don't know in advance the size of the devid that the 7802 * server will return; that size is encoded into the 7803 * reply. So we do a first request using a default size, then we 7804 * check whether this size was large enough. If not then we do a second 7805 * request with the correct size returned by the server. Note that 7806 * ldc requires size to be 8-byte aligned. 7807 */ 7808 bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN), 7809 sizeof (uint64_t)); 7810 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7811 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7812 7813 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, 7814 bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 7815 7816 DMSG(vdc, 2, "sync_op returned %d\n", rv); 7817 7818 if (rv) { 7819 kmem_free(vd_devid, bufsize); 7820 return (rv); 7821 } 7822 7823 if (vd_devid->length > bufid_len) { 7824 /* 7825 * The returned devid is larger than the buffer used. Try again 7826 * with a buffer of the right size. Compute the new size before 7827 * freeing the old buffer since vd_devid->length is read from it. 7828 */ 7829 new_bufsize = P2ROUNDUP(VD_DEVID_SIZE(vd_devid->length), sizeof (uint64_t)); 7830 kmem_free(vd_devid, bufsize); bufsize = new_bufsize; 7831 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7832 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7833 7834 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, 7835 (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0, 7836 VIO_both_dir, B_TRUE); 7837 7838 if (rv) { 7839 kmem_free(vd_devid, bufsize); 7840 return (rv); 7841 } 7842 } 7843 7844 /* 7845 * The virtual disk should have the same device id as the one associated 7846 * with the physical disk it is mapped on, otherwise sharing a disk 7847 * between an LDom and a non-LDom may not work (for example for a shared 7848 * SVM disk set).
* 7850 * The DDI framework does not allow creating a device id with any 7851 * type so we first create a device id of type DEVID_ENCAP and then 7852 * we restore the original type of the physical device. 7853 */ 7854 7855 DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length); 7856 7857 /* build an encapsulated devid based on the returned devid */ 7858 if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length, 7859 vd_devid->id, &vdc->devid) != DDI_SUCCESS) { 7860 DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance); 7861 kmem_free(vd_devid, bufsize); 7862 return (1); 7863 } 7864 7865 DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type); 7866 7867 ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS); 7868 7869 kmem_free(vd_devid, bufsize); 7870 7871 if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) { 7872 DMSG(vdc, 1, "[%d] Failed to register devid\n", vdc->instance); 7873 return (1); 7874 } 7875 7876 return (0); 7877 } 7878 7879 static void 7880 vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe) 7881 { 7882 int i, nparts; 7883 7884 ASSERT(MUTEX_HELD(&vdc->lock)); 7885 7886 vdc->vdisk_label = VD_DISK_LABEL_EFI; 7887 bzero(vdc->vtoc, sizeof (struct vtoc)); 7888 bzero(vdc->geom, sizeof (struct dk_geom)); 7889 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7890 7891 nparts = gpt->efi_gpt_NumberOfPartitionEntries; 7892 7893 for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) { 7894 7895 if (gpe[i].efi_gpe_StartingLBA == 0 || 7896 gpe[i].efi_gpe_EndingLBA == 0) { 7897 continue; 7898 } 7899 7900 vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA; 7901 vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA - 7902 gpe[i].efi_gpe_StartingLBA + 1; 7903 } 7904 7905 ASSERT(vdc->vdisk_size != 0); 7906 vdc->slice[VD_EFI_WD_SLICE].start = 0; 7907 vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size; 7908 7909 } 7910 7911 static void 7912 vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct vtoc *vtoc) 7913 { 7914 int i; 7915 7916 ASSERT(MUTEX_HELD(&vdc->lock)); 7917 ASSERT(vdc->block_size == vtoc->v_sectorsz); 7918 7919 vdc->vdisk_label = VD_DISK_LABEL_VTOC; 7920 bcopy(vtoc, vdc->vtoc, sizeof (struct vtoc)); 7921 bcopy(geom, vdc->geom, sizeof (struct dk_geom)); 7922 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7923 7924 for (i = 0; i < vtoc->v_nparts; i++) { 7925 vdc->slice[i].start = vtoc->v_part[i].p_start; 7926 vdc->slice[i].nblocks = vtoc->v_part[i].p_size; 7927 } 7928 } 7929 7930 static void 7931 vdc_store_label_unk(vdc_t *vdc) 7932 { 7933 ASSERT(MUTEX_HELD(&vdc->lock)); 7934 7935 vdc->vdisk_label = VD_DISK_LABEL_UNK; 7936 bzero(vdc->vtoc, sizeof (struct vtoc)); 7937 bzero(vdc->geom, sizeof (struct dk_geom)); 7938 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7939 } 7940
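/*
 * Usage illustration (a hypothetical userland sketch, not part of this
 * driver): the geometry faked up by vdc_create_fake_geometry() is what
 * callers observe through DKIOCGMEDIAINFO, e.g. to compute the raw size
 * of a virtual disk in bytes:
 *
 *	struct dk_minfo minfo;
 *	uint64_t size;
 *
 *	if (ioctl(fd, DKIOCGMEDIAINFO, &minfo) == 0)
 *		size = minfo.dki_capacity * minfo.dki_lbsize;
 */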