/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * LDoms virtual disk client (vdc) device driver
 *
 * This driver runs on a guest logical domain and communicates with the virtual
 * disk server (vds) driver running on the service domain which is exporting
 * virtualized "disks" to the guest logical domain.
 *
 * The driver can be divided into four sections:
 *
 * 1) generic device driver housekeeping
 *	_init, _fini, attach, detach, ops structures, etc.
 *
 * 2) communication channel setup
 *	Setup the communications link over the LDC channel that vdc uses to
 *	talk to the vDisk server. Initialise the descriptor ring which
 *	allows the LDC clients to transfer data via memory mappings.
 *
 * 3) Support exported to upper layers (filesystems, etc)
 *	The upper layers call into vdc via strategy(9E) and DKIO(7I)
 *	ioctl calls. vdc will copy the data to be written to the descriptor
 *	ring, or map the buffer that will store the data read by the vDisk
 *	server into the descriptor ring. It then sends a message to the
 *	vDisk server requesting it to complete the operation.
 *
 * 4) Handling responses from the vDisk server.
 *	The vDisk server will ACK some or all of the messages vdc sends to it
 *	(this is configured during the handshake). Upon receipt of an ACK
 *	vdc will check the descriptor ring and signal to the upper layer
 *	code waiting on the IO.
 */
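
/*
 * Illustrative sketch of the life of a typical request, summarising the
 * four sections above (a summary only; it introduces no new interfaces):
 *
 *	strategy(9E)					guest domain
 *	   vdc_strategy(buf)
 *	      vdc_send_request(VD_OP_BWRITE, ...)
 *		 - place the buffer in a free descriptor ring entry
 *		 - send a dring message over LDC to vds
 *			|
 *			v				service domain
 *		 vds performs the I/O against the backing device
 *			|
 *			v
 *	      ACK arrives on the LDC channel
 *		 the callback/message thread checks the ring and
 *		 calls biodone(9F) on the waiting buf
 */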

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kstat.h>
#include <sys/mach_descrip.h>
#include <sys/modctl.h>
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/vtoc.h>
#include <sys/archsystm.h>
#include <sys/sysmacros.h>

#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/scsi/impl/services.h>
#include <sys/scsi/targets/sddef.h>

#include <sys/ldoms.h>
#include <sys/ldc.h>
#include <sys/vio_common.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vdsk_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdc.h>

/*
 * function prototypes
 */

/* standard driver functions */
static int	vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
static int	vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int	vdc_strategy(struct buf *buf);
static int	vdc_print(dev_t dev, char *str);
static int	vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
static int	vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
		    cred_t *credp, int *rvalp);
static int	vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
static int	vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);

static int	vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
		    void *arg, void **resultp);
static int	vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int	vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int	vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
		    int mod_flags, char *name, caddr_t valuep, int *lengthp);

/* setup */
static void	vdc_min(struct buf *bufp);
static int	vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
static int	vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_start_ldc_connection(vdc_t *vdc);
static int	vdc_create_device_nodes(vdc_t *vdc);
static int	vdc_create_device_nodes_efi(vdc_t *vdc);
static int	vdc_create_device_nodes_vtoc(vdc_t *vdc);
static void	vdc_create_io_kstats(vdc_t *vdc);
static void	vdc_create_err_kstats(vdc_t *vdc);
static void	vdc_set_err_kstats(vdc_t *vdc);
static int	vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
		    mde_cookie_t *vd_nodep);
static int	vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep);
static void	vdc_fini_ports(vdc_t *vdc);
static void	vdc_switch_server(vdc_t *vdcp);
static int	vdc_do_ldc_up(vdc_t *vdc);
static void	vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_init_descriptor_ring(vdc_t *vdc);
static void	vdc_destroy_descriptor_ring(vdc_t *vdc);
static int	vdc_setup_devid(vdc_t *vdc);
static void	vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *);
static void	vdc_store_label_vtoc(vdc_t *, struct dk_geom *, struct vtoc *);
static void	vdc_store_label_unk(vdc_t *vdc);
static boolean_t vdc_is_opened(vdc_t *vdc);

/* handshake with vds */
static int		vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
static int		vdc_ver_negotiation(vdc_t *vdcp);
static int		vdc_init_attr_negotiation(vdc_t *vdc);
static int		vdc_attr_negotiation(vdc_t *vdcp);
static int		vdc_init_dring_negotiate(vdc_t *vdc);
static int		vdc_dring_negotiation(vdc_t *vdcp);
static int		vdc_send_rdx(vdc_t *vdcp);
static int		vdc_rdx_exchange(vdc_t *vdcp);
static boolean_t	vdc_is_supported_version(vio_ver_msg_t *ver_msg);

/* processing incoming messages from vDisk server */
static void	vdc_process_msg_thread(vdc_t *vdc);
static int	vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp);

static uint_t	vdc_handle_cb(uint64_t event, caddr_t arg);
static int	vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg);
static int	vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg);
static int	vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
static int	vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
static int	vdc_send_request(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
static int	vdc_populate_descriptor(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
		    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
		    void *cb_arg, vio_desc_direction_t dir, boolean_t);

static int	vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
static int	vdc_drain_response(vdc_t *vdcp);
static int	vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
static int	vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
static int	vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);

/* dkio */
static int	vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode,
		    int *rvalp);
static int	vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg);
static void	vdc_create_fake_geometry(vdc_t *vdc);
static int	vdc_validate_geometry(vdc_t *vdc);
static void	vdc_validate(vdc_t *vdc);
static void	vdc_validate_task(void *arg);
static int	vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);

static void	vdc_ownership_update(vdc_t *vdc, int ownership_flags);
static int	vdc_access_set(vdc_t *vdc, uint64_t flags, int mode);
static vdc_io_t	*vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf);
static int	vdc_failfast_check_resv(vdc_t *vdc);

/*
 * Module variables
 */

/*
 * Tunable variables to control how long vdc waits before timing out on
 * various operations
 */
static int	vdc_hshake_retries = 3;

static int	vdc_timeout = 0;	/* units: seconds */
static int	vdc_ldcup_timeout = 1;	/* units: seconds */

static uint64_t vdc_hz_min_ldc_delay;
static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC;
static uint64_t vdc_hz_max_ldc_delay;
static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC;

static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC;
static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC;

/* values for dumping - need to run in a tighter loop */
static uint64_t	vdc_usec_timeout_dump = 100 * MILLISEC;	/* 0.1s units: usec */
static int	vdc_dump_retries = 100;

static uint16_t	vdc_scsi_timeout = 60;	/* 60s units: seconds */

static uint64_t vdc_ownership_delay = 6 * MICROSEC;	/* 6s units: usec */

/* Count of the number of vdc instances attached */
static volatile uint32_t	vdc_instance_count = 0;

/* Tunable to log all SCSI errors */
static boolean_t vdc_scsi_log_error = B_FALSE;

/* Soft state pointer */
static void	*vdc_state;

/*
 * Controlling the verbosity of the error/debug messages
 *
 * vdc_msglevel - controls level of messages
 * vdc_matchinst - 64-bit variable where each bit corresponds
 *                 to a vdc instance to which vdc_msglevel applies.
 */
int		vdc_msglevel = 0x0;
uint64_t	vdc_matchinst = 0ull;

/*
 * Supported vDisk protocol version pairs.
 *
 * The first array entry is the latest and preferred version.
 */
static const vio_ver_t	vdc_version[] = {{1, 1}};

static struct cb_ops vdc_cb_ops = {
	vdc_open,	/* cb_open */
	vdc_close,	/* cb_close */
	vdc_strategy,	/* cb_strategy */
	vdc_print,	/* cb_print */
	vdc_dump,	/* cb_dump */
	vdc_read,	/* cb_read */
	vdc_write,	/* cb_write */
	vdc_ioctl,	/* cb_ioctl */
	nodev,		/* cb_devmap */
	nodev,		/* cb_mmap */
	nodev,		/* cb_segmap */
	nochpoll,	/* cb_chpoll */
	vdc_prop_op,	/* cb_prop_op */
	NULL,		/* cb_str */
	D_MP | D_64BIT,	/* cb_flag */
	CB_REV,		/* cb_rev */
	vdc_aread,	/* cb_aread */
	vdc_awrite	/* cb_awrite */
};

static struct dev_ops vdc_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	vdc_getinfo,	/* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	vdc_attach,	/* devo_attach */
	vdc_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	&vdc_cb_ops,	/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	nulldev		/* devo_power */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"virtual disk client",
	&vdc_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

/* -------------------------------------------------------------------------- */

/*
 * Device Driver housekeeping and setup
 */

int
_init(void)
{
	int	status;

	if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
		return (status);
	if ((status = mod_install(&modlinkage)) != 0)
		ddi_soft_state_fini(&vdc_state);
	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int	status;

	if ((status = mod_remove(&modlinkage)) != 0)
		return (status);
	ddi_soft_state_fini(&vdc_state);
	return (0);
}
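
/*
 * Note on minor numbers (illustrative; the exact encoding lives in
 * <sys/vdc.h> and is an assumption here, not defined in this file):
 * minor numbers are built with VD_MAKE_DEV(instance, slice) and decoded
 * with the VDCUNIT() and VDCPART() macros used throughout, e.g.
 *
 *	dev_t dev = VD_MAKE_DEV(1, 2);		instance 1, slice "c"
 *	VDCUNIT(dev) == 1 && VDCPART(dev) == 2
 */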

static int
vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
	_NOTE(ARGUNUSED(dip))

	int	instance = VDCUNIT((dev_t)arg);
	vdc_t	*vdc = NULL;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
			*resultp = NULL;
			return (DDI_FAILURE);
		}
		*resultp = vdc->dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);
	default:
		*resultp = NULL;
		return (DDI_FAILURE);
	}
}

static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	kt_did_t	failfast_tid, ownership_tid;
	int		instance;
	int		rv;
	vdc_server_t	*srvr;
	vdc_t		*vdc = NULL;

	switch (cmd) {
	case DDI_DETACH:
		/* the real work happens below */
		break;
	case DDI_SUSPEND:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	ASSERT(cmd == DDI_DETACH);
	instance = ddi_get_instance(dip);
	DMSGX(1, "[%d] Entered\n", instance);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * This function is called when vdc is detached or if it has failed to
	 * attach. In the latter case, the attach may have failed before the
	 * vdisk type was set, so we can't call vdc_is_opened(). However, as
	 * the attach has failed, we know that the vdisk is not opened and we
	 * can safely detach.
	 */
	if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
		DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
		return (DDI_FAILURE);
	}

	if (vdc->dkio_flush_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
		    instance, vdc->dkio_flush_pending);
		return (DDI_FAILURE);
	}

	if (vdc->validate_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding validate requests\n",
		    instance, vdc->validate_pending);
		return (DDI_FAILURE);
	}

	DMSG(vdc, 0, "[%d] proceeding...\n", instance);

	/* If we took ownership, release ownership */
	mutex_enter(&vdc->ownership_lock);
	if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
	}
	mutex_exit(&vdc->ownership_lock);

	/* mark instance as detaching */
	vdc->lifecycle = VDC_LC_DETACHING;

	/*
	 * Try to disable callbacks to prevent another handshake. We have to
	 * disable callbacks for all servers.
	 */
	for (srvr = vdc->server_list; srvr != NULL; srvr = srvr->next) {
		rv = ldc_set_cb_mode(srvr->ldc_handle, LDC_CB_DISABLE);
		DMSG(vdc, 0, "callback disabled (ldc=%lu, rv=%d)\n",
		    srvr->ldc_id, rv);
	}

	if (vdc->initialized & VDC_THREAD) {
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET)) {
			vdc->read_state = VDC_READ_RESET;
			cv_signal(&vdc->read_cv);
		}

		mutex_exit(&vdc->read_lock);

		/* wake up any thread waiting for connection to come online */
		mutex_enter(&vdc->lock);
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			DMSG(vdc, 0,
			    "[%d] write reset - move to resetting state...\n",
			    instance);
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		}
		mutex_exit(&vdc->lock);

		/* now wait until state transitions to VDC_STATE_DETACH */
		thread_join(vdc->msg_proc_thr->t_did);
		ASSERT(vdc->state == VDC_STATE_DETACH);
		DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
		    vdc->instance);
	}

	mutex_enter(&vdc->lock);

	if (vdc->initialized & VDC_DRING)
		vdc_destroy_descriptor_ring(vdc);

	vdc_fini_ports(vdc);

	if (vdc->failfast_thread) {
		failfast_tid = vdc->failfast_thread->t_did;
		vdc->failfast_interval = 0;
		cv_signal(&vdc->failfast_cv);
	} else {
		failfast_tid = 0;
	}

	if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
		ownership_tid = vdc->ownership_thread->t_did;
		vdc->ownership = VDC_OWNERSHIP_NONE;
		cv_signal(&vdc->ownership_cv);
	} else {
		ownership_tid = 0;
	}

	mutex_exit(&vdc->lock);

	if (failfast_tid != 0)
		thread_join(failfast_tid);

	if (ownership_tid != 0)
		thread_join(ownership_tid);

	if (vdc->initialized & VDC_MINOR)
		ddi_remove_minor_node(dip, NULL);

	if (vdc->io_stats) {
		kstat_delete(vdc->io_stats);
		vdc->io_stats = NULL;
	}

	if (vdc->err_stats) {
		kstat_delete(vdc->err_stats);
		vdc->err_stats = NULL;
	}

	if (vdc->initialized & VDC_LOCKS) {
		mutex_destroy(&vdc->lock);
		mutex_destroy(&vdc->read_lock);
		mutex_destroy(&vdc->ownership_lock);
		cv_destroy(&vdc->initwait_cv);
		cv_destroy(&vdc->dring_free_cv);
		cv_destroy(&vdc->membind_cv);
		cv_destroy(&vdc->sync_pending_cv);
		cv_destroy(&vdc->sync_blocked_cv);
		cv_destroy(&vdc->read_cv);
		cv_destroy(&vdc->running_cv);
		cv_destroy(&vdc->ownership_cv);
		cv_destroy(&vdc->failfast_cv);
		cv_destroy(&vdc->failfast_io_cv);
	}

	if (vdc->minfo)
		kmem_free(vdc->minfo, sizeof (struct dk_minfo));

	if (vdc->cinfo)
		kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));

	if (vdc->vtoc)
		kmem_free(vdc->vtoc, sizeof (struct vtoc));

	if (vdc->geom)
		kmem_free(vdc->geom, sizeof (struct dk_geom));

	if (vdc->devid) {
		ddi_devid_unregister(dip);
		ddi_devid_free(vdc->devid);
	}

	if (vdc->initialized & VDC_SOFT_STATE)
		ddi_soft_state_free(vdc_state, instance);

	DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);

	return (DDI_SUCCESS);
}

static int
vdc_do_attach(dev_info_t *dip)
{
	int		instance;
	vdc_t		*vdc = NULL;
	int		status;
	md_t		*mdp;
	mde_cookie_t	vd_node;

	ASSERT(dip != NULL);

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
		    instance);
		return (DDI_FAILURE);
	}

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * We assign the value to initialized in this case to zero out the
	 * variable and then set bits in it to indicate what has been done
	 */
	vdc->initialized = VDC_SOFT_STATE;

	vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
	vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);

	vdc->dip	= dip;
	vdc->instance	= instance;
	vdc->vdisk_type	= VD_DISK_TYPE_UNK;
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->state	= VDC_STATE_INIT;
	vdc->lifecycle	= VDC_LC_ATTACHING;
	vdc->session_id = 0;
	vdc->block_size = DEV_BSIZE;
	vdc->max_xfer_sz = maxphys / DEV_BSIZE;

	/*
	 * We assume, for now, that the vDisk server will export 'read'
	 * operations to us at a minimum (this is needed because of checks
	 * in vdc for supported operations early in the handshake process).
	 * The vDisk server will return ENOTSUP if this is not the case.
	 * The value will be overwritten during the attribute exchange with
	 * the bitmask of operations exported by the server.
	 */
	vdc->operations = VD_OP_MASK_READ;

	vdc->vtoc = NULL;
	vdc->geom = NULL;
	vdc->cinfo = NULL;
	vdc->minfo = NULL;

	mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);

	vdc->threads_pending = 0;
	vdc->sync_op_pending = B_FALSE;
	vdc->sync_op_blocked = B_FALSE;
	cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);

	/* init blocking msg read functionality */
	mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
	vdc->read_state = VDC_READ_IDLE;

	vdc->initialized |= VDC_LOCKS;

	/* get device and port MD node for this disk instance */
	if (vdc_get_md_node(dip, &mdp, &vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Could not get machine description node",
		    instance);
		return (DDI_FAILURE);
	}

	if (vdc_init_ports(vdc, mdp, vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Error initialising ports", instance);
		return (DDI_FAILURE);
	}

	(void) md_fini_handle(mdp);

	/* initialize the thread responsible for managing state with server */
	vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
	    vdc, 0, &p0, TS_RUN, minclsyspri);
	if (vdc->msg_proc_thr == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
		    instance);
		return (DDI_FAILURE);
	}

	vdc->initialized |= VDC_THREAD;

	/* Create the kstats for saving the I/O statistics used by iostat(1M) */
	vdc_create_io_kstats(vdc);
	vdc_create_err_kstats(vdc);

	atomic_inc_32(&vdc_instance_count);

	/*
	 * Check the disk label. This will send requests and do the handshake.
	 * We don't really care about the disk label now. What we really need
	 * is the handshake to be done so that we know the type of the disk
	 * (slice or full disk) and the appropriate device nodes can be
	 * created.
	 */
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP);
	vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
	vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	mutex_exit(&vdc->lock);

	/*
	 * Now that we have the device info we can create the device nodes
	 */
	status = vdc_create_device_nodes(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device nodes",
		    instance);
		goto return_status;
	}

	/*
	 * Setup devid
	 */
	if (vdc_setup_devid(vdc)) {
		DMSG(vdc, 0, "[%d] No device id available\n", instance);
	}

	/*
	 * Fill in the fields of the error statistics kstat that were not
	 * available when creating the kstat
	 */
	vdc_set_err_kstats(vdc);

	ddi_report_dev(dip);
	vdc->lifecycle = VDC_LC_ONLINE;
	DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);

return_status:
	DMSG(vdc, 0, "[%d] Attach completed\n", instance);
	return (status);
}

static int
vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	status;

	switch (cmd) {
	case DDI_ATTACH:
		if ((status = vdc_do_attach(dip)) != 0)
			(void) vdc_detach(dip, DDI_DETACH);
		return (status);
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}

static int
vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr)
{
	int		status = 0;
	ldc_status_t	ldc_state;
	ldc_attr_t	ldc_attr;

	ASSERT(vdc != NULL);
	ASSERT(srvr != NULL);

	ldc_attr.devclass = LDC_DEV_BLK;
	ldc_attr.instance = vdc->instance;
	ldc_attr.mode = LDC_MODE_UNRELIABLE;	/* unreliable transport */
	ldc_attr.mtu = VD_LDC_MTU;

	if ((srvr->state & VDC_LDC_INIT) == 0) {
		status = ldc_init(srvr->ldc_id, &ldc_attr,
		    &srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			return (status);
		}
		srvr->state |= VDC_LDC_INIT;
	}
	status = ldc_status(srvr->ldc_handle, &ldc_state);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
		    vdc->instance, status);
		goto init_exit;
	}
	srvr->ldc_state = ldc_state;

	if ((srvr->state & VDC_LDC_CB) == 0) {
		status = ldc_reg_callback(srvr->ldc_handle, vdc_handle_cb,
		    (caddr_t)srvr);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
			    vdc->instance, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_CB;
	}

	/*
	 * At this stage we have initialised LDC; we will now try to open
	 * the connection.
	 */
	if (srvr->ldc_state == LDC_INIT) {
		status = ldc_open(srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_OPEN;
	}

init_exit:
	if (status) {
		vdc_terminate_ldc(vdc, srvr);
	}

	return (status);
}

static int
vdc_start_ldc_connection(vdc_t *vdc)
{
	int	status = 0;

	ASSERT(vdc != NULL);

	ASSERT(MUTEX_HELD(&vdc->lock));

	status = vdc_do_ldc_up(vdc);

	DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);

	return (status);
}

static int
vdc_stop_ldc_connection(vdc_t *vdcp)
{
	int	status;

	ASSERT(vdcp != NULL);

	ASSERT(MUTEX_HELD(&vdcp->lock));

	DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
	    vdcp->state);

	status = ldc_down(vdcp->curr_server->ldc_handle);
	DMSG(vdcp, 0, "ldc_down() = %d\n", status);

	vdcp->initialized &= ~VDC_HANDSHAKE;
	DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);

	return (status);
}

static void
vdc_create_io_kstats(vdc_t *vdc)
{
	if (vdc->io_stats != NULL) {
		DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance);
		return;
	}

	vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL,
	    "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (vdc->io_stats != NULL) {
		vdc->io_stats->ks_lock = &vdc->lock;
		kstat_install(vdc->io_stats);
	} else {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics"
		    " will not be gathered", vdc->instance);
	}
}

static void
vdc_create_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;
	char	kstatmodule_err[KSTAT_STRLEN];
	char	kstatname[KSTAT_STRLEN];
	int	ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t));
	int	instance = vdc->instance;

	if (vdc->err_stats != NULL) {
		DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance);
		return;
	}

	(void) snprintf(kstatmodule_err, sizeof (kstatmodule_err),
	    "%serr", VDC_DRIVER_NAME);
	(void) snprintf(kstatname, sizeof (kstatname),
	    "%s%d,err", VDC_DRIVER_NAME, instance);

	vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname,
	    "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

	if (vdc->err_stats == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics"
		    " will not be gathered", instance);
		return;
	}

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	kstat_named_init(&stp->vd_softerrs, "Soft Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_transerrs, "Transport Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_protoerrs, "Protocol Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_vid, "Vendor",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_pid, "Product",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_capacity, "Size",
	    KSTAT_DATA_ULONGLONG);

	vdc->err_stats->ks_update = nulldev;

	kstat_install(vdc->err_stats);
}
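
/*
 * Illustrative only (assuming VDC_DRIVER_NAME is "vdc"): with the names
 * built above, instance 0 of the driver would expose its statistics to
 * the guest as
 *
 *	# kstat -m vdc -i 0		I/O statistics (iostat(1M) "disk")
 *	# kstat -m vdcerr -n vdc0,err	named error statistics
 *
 * where the error kstat carries the Soft Errors, Transport Errors,
 * Protocol Errors, Vendor, Product and Size entries initialised above.
 */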

static void
vdc_set_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;

	if (vdc->err_stats == NULL)
		return;

	mutex_enter(&vdc->lock);

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	ASSERT(stp != NULL);

	stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->block_size;
	(void) strcpy(stp->vd_vid.value.c, "SUN");
	(void) strcpy(stp->vd_pid.value.c, "VDSK");

	mutex_exit(&vdc->lock);
}

static int
vdc_create_device_nodes_efi(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "h");
	ddi_remove_minor_node(vdc->dip, "h,raw");

	if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

static int
vdc_create_device_nodes_vtoc(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "wd");
	ddi_remove_minor_node(vdc->dip, "wd,raw");

	if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices. It is called as part of the attach(9E) of the instance
 *	during the handshake with vds after vds has sent the attributes
 *	to vdc.
 *
 *	If the device is of type VD_DISK_TYPE_SLICE then the minor node
 *	of 2 is used in keeping with the Solaris convention that slice 2
 *	refers to a whole disk. Slices start at 'a'.
 *
 * Parameters:
 *	vdc	- soft state pointer
 *
 * Return Values
 *	0	- Success
 *	EIO	- Failed to create node
 *	EINVAL	- Unknown type of disk exported
 */
static int
vdc_create_device_nodes(vdc_t *vdc)
{
	char		name[sizeof ("s,raw")];
	dev_info_t	*dip = NULL;
	int		instance, status;
	int		num_slices = 1;
	int		i;

	ASSERT(vdc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		return (EINVAL);
	}

	/*
	 * Minor nodes are different for EFI disks: EFI disks do not have
	 * a minor node 'g' for the minor number corresponding to slice
	 * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
	 * representing the whole disk.
	 */
	for (i = 0; i < num_slices; i++) {

		if (i == VD_EFI_WD_SLICE) {
			if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
				status = vdc_create_device_nodes_efi(vdc);
			else
				status = vdc_create_device_nodes_vtoc(vdc);
			if (status != 0)
				return (status);
			continue;
		}

		(void) snprintf(name, sizeof (name), "%c", 'a' + i);
		if (ddi_create_minor_node(dip, name, S_IFBLK,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
			    instance, name);
			return (EIO);
		}

		/* if any device node is created we set this flag */
		vdc->initialized |= VDC_MINOR;

		(void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");

		if (ddi_create_minor_node(dip, name, S_IFCHR,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
			    instance, name);
			return (EIO);
		}
	}

	return (0);
}

/*
 * Driver prop_op(9e) entry point function. Return the number of blocks for
 * the partition in question or forward the request to the property facilities.
 */
static int
vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
    char *name, caddr_t valuep, int *lengthp)
{
	int		instance = ddi_get_instance(dip);
	vdc_t		*vdc;
	uint64_t	nblocks;
	uint_t		blksize;

	vdc = ddi_get_soft_state(vdc_state, instance);

	if (dev == DDI_DEV_T_ANY || vdc == NULL) {
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
		mutex_exit(&vdc->lock);
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}
	nblocks = vdc->slice[VDCPART(dev)].nblocks;
	blksize = vdc->block_size;
	mutex_exit(&vdc->lock);

	return (ddi_prop_op_nblocks_blksize(dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, nblocks, blksize));
}
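
/*
 * Illustrative example of the minor nodes that result from the code
 * above for a full (VD_DISK_TYPE_DISK) virtual disk:
 *
 *	VTOC label:	a, a,raw, b, b,raw, ... h, h,raw
 *	EFI label:	a, a,raw, ... g, g,raw, wd, wd,raw
 *
 * i.e. with an EFI label the slice-7 names "h"/"h,raw" are replaced by
 * the whole-disk nodes "wd"/"wd,raw". A VD_DISK_TYPE_SLICE disk gets
 * only the "a" and "a,raw" nodes.
 */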

/*
 * Function:
 *	vdc_is_opened
 *
 * Description:
 *	This function checks if any slice of a given virtual disk is
 *	currently opened.
 *
 * Parameters:
 *	vdc	- soft state pointer
 *
 * Return Values
 *	B_TRUE	- at least one slice is opened.
 *	B_FALSE	- no slice is opened.
 */
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
	int i, nslices;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		nslices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		nslices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		ASSERT(0);
		nslices = 0;	/* defensive: an unknown disk has no slices */
		break;
	}

	/* check if there's any layered open */
	for (i = 0; i < nslices; i++) {
		if (vdc->open_lyr[i] > 0)
			return (B_TRUE);
	}

	/* check if there is any other kind of open */
	for (i = 0; i < OTYPCNT; i++) {
		if (vdc->open[i] != 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}

static int
vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;
	int i;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	/* check if slice is already exclusively opened */
	if (vdc->open_excl & slicemask)
		return (EBUSY);

	/* if open exclusive, check if slice is already opened */
	if (flag & FEXCL) {
		if (vdc->open_lyr[slice] > 0)
			return (EBUSY);
		for (i = 0; i < OTYPCNT; i++) {
			if (vdc->open[i] & slicemask)
				return (EBUSY);
		}
		vdc->open_excl |= slicemask;
	}

	/* mark slice as opened */
	if (otyp == OTYP_LYR) {
		vdc->open_lyr[slice]++;
	} else {
		vdc->open[otyp] |= slicemask;
	}

	return (0);
}

static void
vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	if (otyp == OTYP_LYR) {
		ASSERT(vdc->open_lyr[slice] > 0);
		vdc->open_lyr[slice]--;
	} else {
		vdc->open[otyp] &= ~slicemask;
	}

	if (flag & FEXCL)
		vdc->open_excl &= ~slicemask;
}
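
/*
 * Illustrative summary of the open-tracking state manipulated above:
 * layered opens (OTYP_LYR) are counted per slice in open_lyr[], every
 * other open type sets the slice's bit in open[otyp], and FEXCL opens
 * additionally set the slice's bit in open_excl. For example, after
 *
 *	(void) vdc_mark_opened(vdc, 2, FREAD, OTYP_BLK);
 *	(void) vdc_mark_opened(vdc, 2, FREAD, OTYP_LYR);
 *
 * open[OTYP_BLK] has bit 2 set and open_lyr[2] == 1, so both loops in
 * vdc_is_opened() would report the disk as open.
 */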

static int
vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance, nodelay;
	int	slice, status = 0;
	vdc_t	*vdc;

	ASSERT(dev != NULL);
	instance = VDCUNIT(*dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
	    getminor(*dev), flag, otyp);

	slice = VDCPART(*dev);

	nodelay = flag & (FNDELAY | FNONBLOCK);

	if ((flag & FWRITE) && (!nodelay) &&
	    !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE))) {
		return (EROFS);
	}

	mutex_enter(&vdc->lock);

	status = vdc_mark_opened(vdc, slice, flag, otyp);

	if (status != 0) {
		mutex_exit(&vdc->lock);
		return (status);
	}

	if (nodelay) {

		/* don't resubmit a validate request if there's already one */
		if (vdc->validate_pending > 0) {
			mutex_exit(&vdc->lock);
			return (0);
		}

		/* call vdc_validate() asynchronously to avoid blocking */
		if (taskq_dispatch(system_taskq, vdc_validate_task,
		    (void *)vdc, TQ_NOSLEEP) == NULL) {
			vdc_mark_closed(vdc, slice, flag, otyp);
			mutex_exit(&vdc->lock);
			return (ENXIO);
		}

		vdc->validate_pending++;
		mutex_exit(&vdc->lock);
		return (0);
	}

	mutex_exit(&vdc->lock);

	vdc_validate(vdc);

	mutex_enter(&vdc->lock);

	if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
	    vdc->slice[slice].nblocks == 0) {
		vdc_mark_closed(vdc, slice, flag, otyp);
		status = EIO;
	}

	mutex_exit(&vdc->lock);

	return (status);
}

static int
vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance;
	int	slice;
	int	rv, rval;
	vdc_t	*vdc;

	instance = VDCUNIT(dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);

	slice = VDCPART(dev);

	/*
	 * Attempt to flush the write cache (W$) on a close operation. If this
	 * is not a supported IOCTL command or the backing device is read-only,
	 * do not fail the close operation.
	 */
	rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);

	if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
		DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
		    instance, rv);
		return (EIO);
	}

	mutex_enter(&vdc->lock);
	vdc_mark_closed(vdc, slice, flag, otyp);
	mutex_exit(&vdc->lock);

	return (0);
}

static int
vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	_NOTE(ARGUNUSED(credp))

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
}

static int
vdc_print(dev_t dev, char *str)
{
	cmn_err(CE_NOTE, "vdc%d: %s", VDCUNIT(dev), str);
	return (0);
}

static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	int	rv;
	size_t	nbytes = nblk * DEV_BSIZE;
	int	instance = VDCUNIT(dev);
	vdc_t	*vdc = NULL;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
	    instance, nbytes, blkno, (void *)addr);
	rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
	    VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir);
	if (rv) {
		DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
		return (rv);
	}

	if (ddi_in_panic())
		(void) vdc_drain_response(vdc);

	DMSG(vdc, 0, "[%d] End\n", instance);

	return (0);
}

/* -------------------------------------------------------------------------- */

/*
 * Disk access routines
 */

/*
 * vdc_strategy()
 *
 * Return Value:
 *	0:	As per strategy(9E), the strategy() function must return 0
 *		[ bioerror(9f) sets b_flags to the proper error code ]
 */
static int
vdc_strategy(struct buf *buf)
{
	int	rv = -1;
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(buf->b_edev);
	int	op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE;
	int	slice;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		bioerror(buf, ENXIO);
		biodone(buf);
		return (0);
	}

	DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
	    instance, (buf->b_flags & B_READ) ? "Read" : "Write",
	    buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr);

	bp_mapin(buf);

	if ((long)buf->b_private == VD_SLICE_NONE) {
		/* I/O using an absolute disk offset */
		slice = VD_SLICE_NONE;
	} else {
		slice = VDCPART(buf->b_edev);
	}

	rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr,
	    buf->b_bcount, slice, buf->b_lblkno,
	    CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir :
	    VIO_write_dir);

	/*
	 * If the request was successfully sent, the strategy call returns and
	 * the ACK handler calls the bioxxx functions when the vDisk server is
	 * done; otherwise we handle the error here.
	 */
	if (rv) {
		DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv);
		bioerror(buf, rv);
		biodone(buf);
	}

	return (0);
}

/*
 * Function:
 *	vdc_min
 *
 * Description:
 *	Routine to limit the size of a data transfer. Used in
 *	conjunction with physio(9F).
 *
 * Arguments:
 *	bufp	- pointer to the indicated buf(9S) struct.
 *
 */
static void
vdc_min(struct buf *bufp)
{
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(bufp->b_edev);

	vdc = ddi_get_soft_state(vdc_state, instance);
	VERIFY(vdc != NULL);

	if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) {
		bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size;
	}
}

static int
vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio));
}

static int
vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio));
}

static int
vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio));
}

static int
vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio));
}


/* -------------------------------------------------------------------------- */

/*
 * Handshake support
 */
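
/*
 * Illustrative overview of the handshake driven by the routines below
 * (message names are the VIO subtype_env values used in this file):
 *
 *	vdc					vds
 *	 |  VIO_VER_INFO  (propose version)	 |
 *	 |-------------------------------------->|
 *	 |<--------------- ACK/NACK -------------|
 *	 |  VIO_ATTR_INFO (xfer size, etc.)	 |
 *	 |-------------------------------------->|
 *	 |<--------------- ACK ------------------|
 *	 |  VIO_DRING_REG (export the ring)	 |
 *	 |-------------------------------------->|
 *	 |<--------------- ACK ------------------|
 *	 |  VIO_RDX	  (ready for data)	 |
 *	 |-------------------------------------->|
 *	 |<--------------- ACK ------------------|
 *
 * Each step is a vdc_init_*()/vdc_send() call followed by
 * vdc_wait_for_response() and a vdc_handle_*() of the reply.
 */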

/*
 * Function:
 *	vdc_init_ver_negotiation()
 *
 * Description:
 *	Build and send a version negotiation message (VIO_VER_INFO)
 *	proposing the given protocol version to the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	ver	- protocol version to propose.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
{
	vio_ver_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status = -1;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);

	/*
	 * set the Session ID to a unique value
	 * (the lower 32 bits of the clock tick)
	 */
	vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
	DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id);

	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_VER_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	pkt.dev_class = VDEV_DISK;
	pkt.ver_major = ver.major;
	pkt.ver_minor = ver.minor;

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
	    vdc->instance, status);
	if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vio_ver_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_ver_negotiation()
 *
 * Description:
 *	Run the version negotiation step of the handshake: send the
 *	initial proposal, wait for the server's response and hand it
 *	to vdc_handle_ver_msg().
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_ver_negotiation(vdc_t *vdcp)
{
	vio_msg_t vio_msg;
	int status;

	if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0]))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Ver negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
}
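
/*
 * Example of the version exchange (a sketch; vdc_handle_ver_msg() is
 * where the reply is actually interpreted): vdc proposes the first
 * entry of vdc_version[], i.e. {1, 1}. If the server ACKs, that pair
 * becomes the negotiated protocol version; if the server NACKs and
 * offers a version that vdc_is_supported_version() accepts, the
 * exchange can be retried with that version, otherwise the handshake
 * fails.
 */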

/*
 * Function:
 *	vdc_init_attr_negotiation()
 *
 * Description:
 *	Build and send an attribute negotiation message (VIO_ATTR_INFO)
 *	describing this client's transfer parameters to the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_attr_negotiation(vdc_t *vdc)
{
	vd_attr_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] entered\n", vdc->instance);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.max_xfer_sz = vdc->max_xfer_sz;
	pkt.vdisk_block_size = vdc->block_size;
	pkt.xfer_mode = VIO_DRING_MODE_V1_0;
	pkt.operations = 0;	/* server will set bits of valid operations */
	pkt.vdisk_type = 0;	/* server will set to valid device type */
	pkt.vdisk_media = 0;	/* server will set to valid media type */
	pkt.vdisk_size = 0;	/* server will set to valid size */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);

	if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vd_attr_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_attr_negotiation()
 *
 * Description:
 *	Run the attribute negotiation step of the handshake: send the
 *	attribute message, wait for the server's response and hand it
 *	to vdc_handle_attr_msg().
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_attr_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_attr_negotiation(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Attr negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
}

/*
 * Function:
 *	vdc_init_dring_negotiate()
 *
 * Description:
 *	Initialize the local descriptor ring (retrying if resources are
 *	temporarily unavailable) and send a VIO_DRING_REG message to
 *	register it with the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_dring_negotiate(vdc_t *vdc)
{
	vio_dring_reg_msg_t	pkt;
	size_t			msglen = sizeof (pkt);
	int			status = -1;
	int			retry;
	int			nretries = 10;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	for (retry = 0; retry < nretries; retry++) {
		status = vdc_init_descriptor_ring(vdc);
		if (status != EAGAIN)
			break;
		drv_usecwait(vdc_min_timeout_ldc);
	}

	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
		    vdc->instance, status);
		return (status);
	}

	DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
	    vdc->instance, status);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_DRING_REG;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.dring_ident = 0;
	pkt.num_descriptors = vdc->dring_len;
	pkt.descriptor_size = vdc->dring_entry_size;
	pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
	pkt.ncookies = vdc->dring_cookie_count;
	pkt.cookie[0] = vdc->dring_cookie[0];	/* for now just one cookie */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
		    vdc->instance, status);
	}

	return (status);
}


/*
 * Function:
 *	vdc_dring_negotiation()
 *
 * Description:
 *	Run the descriptor ring registration step of the handshake: send
 *	the registration message, wait for the server's response and hand
 *	it to vdc_handle_dring_reg_msg().
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_dring_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_dring_negotiate(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Dring negotiation response,"
		    " rv(%d)", vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_dring_reg_msg(vdcp,
	    (vio_dring_reg_msg_t *)&vio_msg));
}
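
/*
 * Illustrative note on the registration above: the descriptor ring is
 * allocated locally by vdc_init_descriptor_ring() and exported to the
 * server as LDC memory cookies, i.e. a ring of vdc->dring_len entries
 * of vdc->dring_entry_size bytes described (for now) by a single
 * cookie. dring_ident is sent as 0 because it has not been assigned
 * yet; the identifier established during registration (handled in
 * vdc_handle_dring_reg_msg()) is what subsequent VIO_DRING_DATA
 * messages refer to.
 */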

/*
 * Function:
 *	vdc_send_rdx()
 *
 * Description:
 *	Build and send an RDX message indicating that vdc is ready to
 *	exchange data with the vDisk server.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_send_rdx(vdc_t *vdcp)
{
	vio_msg_t	msg;
	size_t		msglen = sizeof (vio_msg_t);
	int		status;

	/*
	 * Send an RDX message to vds to indicate we are ready
	 * to send data
	 */
	msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	msg.tag.vio_subtype_env = VIO_RDX;
	msg.tag.vio_sid = vdcp->session_id;
	status = vdc_send(vdcp, (caddr_t)&msg, &msglen);
	if (status != 0) {
		DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)",
		    vdcp->instance, status);
	}

	return (status);
}

/*
 * Function:
 *	vdc_handle_rdx()
 *
 * Description:
 *	Sanity-check the server's RDX ACK; its receipt completes the
 *	handshake.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *	msgp	- received msg
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp)
{
	_NOTE(ARGUNUSED(vdcp))
	_NOTE(ARGUNUSED(msgp))

	ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL);
	ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK);
	ASSERT(msgp->tag.vio_subtype_env == VIO_RDX);

	DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance);

	return (0);
}

/*
 * Function:
 *	vdc_rdx_exchange()
 *
 * Description:
 *	Run the RDX step of the handshake: send the RDX message and
 *	validate the server's ACK.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_rdx_exchange(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_send_rdx(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) {
		DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg));
}


/* -------------------------------------------------------------------------- */

/*
 * LDC helper routines
 */
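
/*
 * Illustrative summary of the read_state values that coordinate the
 * routines below with the LDC callback: VDC_READ_IDLE (nothing to do),
 * VDC_READ_WAITING (vdc_recv() is blocked on read_cv), VDC_READ_PENDING
 * (data has been seen on the channel, so vdc_recv() may call ldc_read())
 * and VDC_READ_RESET (the channel was reset; readers give up with
 * ECONNRESET). For example, a reset arriving while a reader is waiting
 * makes vdc_recv() return ECONNRESET rather than block forever.
 */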

static int
vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp)
{
	int		status;
	boolean_t	q_has_pkts = B_FALSE;
	uint64_t	delay_time;
	size_t		len;

	mutex_enter(&vdc->read_lock);

	if (vdc->read_state == VDC_READ_IDLE)
		vdc->read_state = VDC_READ_WAITING;

	while (vdc->read_state != VDC_READ_PENDING) {

		/* detect if the connection has been reset */
		if (vdc->read_state == VDC_READ_RESET) {
			status = ECONNRESET;
			goto done;
		}

		cv_wait(&vdc->read_cv, &vdc->read_lock);
	}

	/*
	 * Until we get a blocking ldc read, ldc_read() will not succeed
	 * until the entire LDC message has arrived, so we have to retry.
	 * Note we also bail out if the channel is reset or goes away.
	 */
	delay_time = vdc_ldc_read_init_delay;
loop:
	len = *nbytesp;
	status = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)msgp, &len);
	switch (status) {
	case EAGAIN:
		delay_time *= 2;
		if (delay_time >= vdc_ldc_read_max_delay)
			delay_time = vdc_ldc_read_max_delay;
		delay(delay_time);
		goto loop;

	case 0:
		if (len == 0) {
			DMSG(vdc, 1, "[%d] ldc_read returned 0 bytes with "
			    "no error!\n", vdc->instance);
			goto loop;
		}

		*nbytesp = len;

		/*
		 * If there are pending messages, leave the
		 * read state as pending. Otherwise, set the state
		 * back to idle.
		 */
		status = ldc_chkq(vdc->curr_server->ldc_handle, &q_has_pkts);
		if (status == 0 && !q_has_pkts)
			vdc->read_state = VDC_READ_IDLE;

		break;
	default:
		DMSG(vdc, 0, "ldc_read returned %d\n", status);
		break;
	}

done:
	mutex_exit(&vdc->read_lock);

	return (status);
}



#ifdef DEBUG
void
vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg)
{
	char *ms, *ss, *ses;

	switch (msg->tag.vio_msgtype) {
#define	Q(_s)	case _s : ms = #_s; break;
	Q(VIO_TYPE_CTRL)
	Q(VIO_TYPE_DATA)
	Q(VIO_TYPE_ERR)
#undef Q
	default: ms = "unknown"; break;
	}

	switch (msg->tag.vio_subtype) {
#define	Q(_s)	case _s : ss = #_s; break;
	Q(VIO_SUBTYPE_INFO)
	Q(VIO_SUBTYPE_ACK)
	Q(VIO_SUBTYPE_NACK)
#undef Q
	default: ss = "unknown"; break;
	}

	switch (msg->tag.vio_subtype_env) {
#define	Q(_s)	case _s : ses = #_s; break;
	Q(VIO_VER_INFO)
	Q(VIO_ATTR_INFO)
	Q(VIO_DRING_REG)
	Q(VIO_DRING_UNREG)
	Q(VIO_RDX)
	Q(VIO_PKT_DATA)
	Q(VIO_DESC_DATA)
	Q(VIO_DRING_DATA)
#undef Q
	default: ses = "unknown"; break;
	}

	DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, ms, ss, ses);
}
#endif
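
/*
 * Example of the trace vdc_decode_tag() emits for a version negotiation
 * request, msglevel permitting (the numeric values are hypothetical;
 * they are the raw tag fields printed in hex):
 *
 *	(x/x/x) message : (VIO_TYPE_CTRL/VIO_SUBTYPE_INFO/VIO_VER_INFO)
 */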
2100 	 */
2101 	delay_ticks = vdc_hz_min_ldc_delay;
2102 	do {
2103 		size = *msglen;
2104 		status = ldc_write(vdc->curr_server->ldc_handle, pkt, &size);
2105 		if (status == EWOULDBLOCK) {
2106 			delay(delay_ticks);
2107 			/* geometric backoff */
2108 			delay_ticks *= 2;
2109 			if (delay_ticks > vdc_hz_max_ldc_delay)
2110 				delay_ticks = vdc_hz_max_ldc_delay;
2111 		}
2112 	} while (status == EWOULDBLOCK);
2113 
2114 	/* if LDC had serious issues --- reset vdc state */
2115 	if (status == EIO || status == ECONNRESET) {
2117 		mutex_enter(&vdc->read_lock);
2118 		if ((vdc->read_state == VDC_READ_WAITING) ||
2119 		    (vdc->read_state == VDC_READ_RESET))
2120 			cv_signal(&vdc->read_cv);
2121 		vdc->read_state = VDC_READ_RESET;
2122 		mutex_exit(&vdc->read_lock);
2123 
2124 		/* wake up any waiters in the reset thread */
2125 		if (vdc->state == VDC_STATE_INIT_WAITING) {
2126 			DMSG(vdc, 0, "[%d] write reset - "
2127 			    "vdc is resetting ..\n", vdc->instance);
2128 			vdc->state = VDC_STATE_RESETTING;
2129 			cv_signal(&vdc->initwait_cv);
2130 		}
2131 
2132 		return (ECONNRESET);
2133 	}
2134 
2135 	/* return the last size written */
2136 	*msglen = size;
2137 
2138 	return (status);
2139 }
2140 
2141 /*
2142  * Function:
2143  *	vdc_get_md_node
2144  *
2145  * Description:
2146  *	Get the machine description (MD) handle and the MD device node
2147  *	for the given disk instance. The caller is responsible for
2148  *	releasing the returned MD handle (mdpp) by calling md_fini_handle().
2149  *
2150  * Arguments:
2151  *	dip	- dev info pointer for this instance of the device driver.
2152  *	mdpp	- the returned MD.
2153  *	vd_nodep - the returned device node.
2154  *
2155  * Return Code:
2156  *	0 - Success.
2157  *	ENOENT - Expected node or property did not exist.
2158  *	ENXIO - Unexpected error communicating with MD framework
2159  */
2160 static int
2161 vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep)
2162 {
2163 	int		status = ENOENT;
2164 	char		*node_name = NULL;
2165 	md_t		*mdp = NULL;
2166 	int		num_nodes;
2167 	int		num_vdevs;
2168 	mde_cookie_t	rootnode;
2169 	mde_cookie_t	*listp = NULL;
2170 	boolean_t	found_inst = B_FALSE;
2171 	int		listsz;
2172 	int		idx;
2173 	uint64_t	md_inst;
2174 	int		obp_inst;
2175 	int		instance = ddi_get_instance(dip);
2176 
2177 	/*
2178 	 * Get the OBP instance number for comparison with the MD instance
2179 	 *
2180 	 * The "cfg-handle" property of a vdc node in an MD contains the MD's
2181 	 * notion of "instance", or unique identifier, for that node; OBP
2182 	 * stores the value of the "cfg-handle" MD property as the value of
2183 	 * the "reg" property on the node in the device tree it builds from
2184 	 * the MD and passes to Solaris. Thus, we look up the devinfo node's
2185 	 * "reg" property value to uniquely identify this device instance.
2186 	 * If the "reg" property cannot be found, the device tree state is
2187 	 * presumably so broken that there is no point in continuing.
2188 	 */
2189 	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) {
2190 		cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG);
2191 		return (ENOENT);
2192 	}
2193 	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2194 	    OBP_REG, -1);
2195 	DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst);
2196 
2197 	/*
2198 	 * We now walk the MD nodes to find the node for this vdisk.
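	 *
	 * A minimal sketch of the walk below (error handling omitted;
	 * all names are the ones used in this function):
	 *
	 *	mdp = md_get_handle();
	 *	listp = kmem_zalloc(md_node_count(mdp) *
	 *	    sizeof (mde_cookie_t), KM_SLEEP);
	 *	num_vdevs = md_scan_dag(mdp, md_root_node(mdp),
	 *	    md_find_name(mdp, VDC_MD_VDEV_NAME),
	 *	    md_find_name(mdp, "fwd"), listp);
	 *
	 * followed by matching each disk node's VDC_MD_CFG_HDL
	 * ("cfg-handle") value against the OBP "reg" value found above.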
2199 */ 2200 if ((mdp = md_get_handle()) == NULL) { 2201 cmn_err(CE_WARN, "unable to init machine description"); 2202 return (ENXIO); 2203 } 2204 2205 num_nodes = md_node_count(mdp); 2206 ASSERT(num_nodes > 0); 2207 2208 listsz = num_nodes * sizeof (mde_cookie_t); 2209 2210 /* allocate memory for nodes */ 2211 listp = kmem_zalloc(listsz, KM_SLEEP); 2212 2213 rootnode = md_root_node(mdp); 2214 ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); 2215 2216 /* 2217 * Search for all the virtual devices, we will then check to see which 2218 * ones are disk nodes. 2219 */ 2220 num_vdevs = md_scan_dag(mdp, rootnode, 2221 md_find_name(mdp, VDC_MD_VDEV_NAME), 2222 md_find_name(mdp, "fwd"), listp); 2223 2224 if (num_vdevs <= 0) { 2225 cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); 2226 status = ENOENT; 2227 goto done; 2228 } 2229 2230 DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs); 2231 for (idx = 0; idx < num_vdevs; idx++) { 2232 status = md_get_prop_str(mdp, listp[idx], "name", &node_name); 2233 if ((status != 0) || (node_name == NULL)) { 2234 cmn_err(CE_NOTE, "Unable to get name of node type '%s'" 2235 ": err %d", VDC_MD_VDEV_NAME, status); 2236 continue; 2237 } 2238 2239 DMSGX(1, "[%d] Found node '%s'\n", instance, node_name); 2240 if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { 2241 status = md_get_prop_val(mdp, listp[idx], 2242 VDC_MD_CFG_HDL, &md_inst); 2243 DMSGX(1, "[%d] vdc inst in MD=%lx\n", 2244 instance, md_inst); 2245 if ((status == 0) && (md_inst == obp_inst)) { 2246 found_inst = B_TRUE; 2247 break; 2248 } 2249 } 2250 } 2251 2252 if (!found_inst) { 2253 DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME); 2254 status = ENOENT; 2255 goto done; 2256 } 2257 DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst); 2258 2259 *vd_nodep = listp[idx]; 2260 *mdpp = mdp; 2261 done: 2262 kmem_free(listp, listsz); 2263 return (status); 2264 } 2265 2266 /* 2267 * Function: 2268 * vdc_init_ports 2269 * 2270 * Description: 2271 * Initialize all the ports for this vdisk instance. 2272 * 2273 * Arguments: 2274 * vdc - soft state pointer for this instance of the device driver. 2275 * mdp - md pointer 2276 * vd_nodep - device md node. 2277 * 2278 * Return Code: 2279 * 0 - Success. 2280 * ENOENT - Expected node or property did not exist. 2281 */ 2282 static int 2283 vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep) 2284 { 2285 int status = 0; 2286 int idx; 2287 int num_nodes; 2288 int num_vports; 2289 int num_chans; 2290 int listsz; 2291 mde_cookie_t vd_port; 2292 mde_cookie_t *chanp = NULL; 2293 mde_cookie_t *portp = NULL; 2294 vdc_server_t *srvr; 2295 vdc_server_t *prev_srvr = NULL; 2296 2297 /* 2298 * We now walk the MD nodes to find the port nodes for this vdisk. 
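	 *
	 * Each usable port found becomes one vdc_server_t appended to
	 * vdc->server_list. For a hypothetical disk exported through two
	 * ports, the resulting list is
	 *
	 *	vdc->server_list -> srvr(port 0) -> srvr(port 1) -> NULL
	 *
	 * and vdc->curr_server is initially set to the head of the list.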
2299 */ 2300 num_nodes = md_node_count(mdp); 2301 ASSERT(num_nodes > 0); 2302 2303 listsz = num_nodes * sizeof (mde_cookie_t); 2304 2305 /* allocate memory for nodes */ 2306 portp = kmem_zalloc(listsz, KM_SLEEP); 2307 chanp = kmem_zalloc(listsz, KM_SLEEP); 2308 2309 num_vports = md_scan_dag(mdp, vd_nodep, 2310 md_find_name(mdp, VDC_MD_PORT_NAME), 2311 md_find_name(mdp, "fwd"), portp); 2312 if (num_vports == 0) { 2313 DMSGX(0, "Found no '%s' node for '%s' port\n", 2314 VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2315 status = ENOENT; 2316 goto done; 2317 } 2318 2319 DMSGX(1, "Found %d '%s' node(s) for '%s' port\n", 2320 num_vports, VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2321 2322 vdc->num_servers = 0; 2323 for (idx = 0; idx < num_vports; idx++) { 2324 2325 /* initialize this port */ 2326 vd_port = portp[idx]; 2327 srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP); 2328 srvr->vdcp = vdc; 2329 2330 /* get port id */ 2331 if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) { 2332 cmn_err(CE_NOTE, "vDisk port '%s' property not found", 2333 VDC_MD_ID); 2334 kmem_free(srvr, sizeof (vdc_server_t)); 2335 continue; 2336 } 2337 2338 /* set the connection timeout */ 2339 if (md_get_prop_val(mdp, vd_port, VDC_MD_TIMEOUT, 2340 &srvr->ctimeout) != 0) { 2341 srvr->ctimeout = 0; 2342 } 2343 2344 /* get the ldc id */ 2345 num_chans = md_scan_dag(mdp, vd_port, 2346 md_find_name(mdp, VDC_MD_CHAN_NAME), 2347 md_find_name(mdp, "fwd"), chanp); 2348 2349 /* expecting at least one channel */ 2350 if (num_chans <= 0) { 2351 cmn_err(CE_NOTE, "No '%s' node for '%s' port", 2352 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); 2353 kmem_free(srvr, sizeof (vdc_server_t)); 2354 continue; 2355 } else if (num_chans != 1) { 2356 DMSGX(0, "Expected 1 '%s' node for '%s' port, " 2357 "found %d\n", VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, 2358 num_chans); 2359 } 2360 2361 /* 2362 * We use the first channel found (index 0), irrespective of how 2363 * many are there in total. 2364 */ 2365 if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, 2366 &srvr->ldc_id) != 0) { 2367 cmn_err(CE_NOTE, "Channel '%s' property not found", 2368 VDC_MD_ID); 2369 kmem_free(srvr, sizeof (vdc_server_t)); 2370 continue; 2371 } 2372 2373 /* 2374 * now initialise LDC channel which will be used to 2375 * communicate with this server 2376 */ 2377 if (vdc_do_ldc_init(vdc, srvr) != 0) { 2378 kmem_free(srvr, sizeof (vdc_server_t)); 2379 continue; 2380 } 2381 2382 /* add server to list */ 2383 if (prev_srvr) 2384 prev_srvr->next = srvr; 2385 else 2386 vdc->server_list = srvr; 2387 2388 prev_srvr = srvr; 2389 2390 /* inc numbers of servers */ 2391 vdc->num_servers++; 2392 } 2393 2394 /* 2395 * Adjust the max number of handshake retries to match 2396 * the number of vdisk servers. 2397 */ 2398 if (vdc_hshake_retries < vdc->num_servers) 2399 vdc_hshake_retries = vdc->num_servers; 2400 2401 /* pick first server as current server */ 2402 if (vdc->server_list != NULL) { 2403 vdc->curr_server = vdc->server_list; 2404 status = 0; 2405 } else { 2406 status = ENOENT; 2407 } 2408 2409 done: 2410 kmem_free(chanp, listsz); 2411 kmem_free(portp, listsz); 2412 return (status); 2413 } 2414 2415 2416 /* 2417 * Function: 2418 * vdc_do_ldc_up 2419 * 2420 * Description: 2421 * Bring the channel for the current server up. 2422 * 2423 * Arguments: 2424 * vdc - soft state pointer for this instance of the device driver. 2425 * 2426 * Return Code: 2427 * 0 - Success. 
2428 * EINVAL - Driver is detaching / LDC error 2429 * ECONNREFUSED - Other end is not listening 2430 */ 2431 static int 2432 vdc_do_ldc_up(vdc_t *vdc) 2433 { 2434 int status; 2435 ldc_status_t ldc_state; 2436 2437 ASSERT(MUTEX_HELD(&vdc->lock)); 2438 2439 DMSG(vdc, 0, "[%d] Bringing up channel %lx\n", 2440 vdc->instance, vdc->curr_server->ldc_id); 2441 2442 if (vdc->lifecycle == VDC_LC_DETACHING) 2443 return (EINVAL); 2444 2445 if ((status = ldc_up(vdc->curr_server->ldc_handle)) != 0) { 2446 switch (status) { 2447 case ECONNREFUSED: /* listener not ready at other end */ 2448 DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n", 2449 vdc->instance, vdc->curr_server->ldc_id, status); 2450 status = 0; 2451 break; 2452 default: 2453 DMSG(vdc, 0, "[%d] Failed to bring up LDC: " 2454 "channel=%ld, err=%d", vdc->instance, 2455 vdc->curr_server->ldc_id, status); 2456 break; 2457 } 2458 } 2459 2460 if (ldc_status(vdc->curr_server->ldc_handle, &ldc_state) == 0) { 2461 vdc->curr_server->ldc_state = ldc_state; 2462 if (ldc_state == LDC_UP) { 2463 DMSG(vdc, 0, "[%d] LDC channel already up\n", 2464 vdc->instance); 2465 vdc->seq_num = 1; 2466 vdc->seq_num_reply = 0; 2467 } 2468 } 2469 2470 return (status); 2471 } 2472 2473 /* 2474 * Function: 2475 * vdc_terminate_ldc() 2476 * 2477 * Description: 2478 * 2479 * Arguments: 2480 * vdc - soft state pointer for this instance of the device driver. 2481 * srvr - vdc per-server info structure 2482 * 2483 * Return Code: 2484 * None 2485 */ 2486 static void 2487 vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr) 2488 { 2489 int instance = ddi_get_instance(vdc->dip); 2490 2491 if (srvr->state & VDC_LDC_OPEN) { 2492 DMSG(vdc, 0, "[%d] ldc_close()\n", instance); 2493 (void) ldc_close(srvr->ldc_handle); 2494 } 2495 if (srvr->state & VDC_LDC_CB) { 2496 DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); 2497 (void) ldc_unreg_callback(srvr->ldc_handle); 2498 } 2499 if (srvr->state & VDC_LDC_INIT) { 2500 DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); 2501 (void) ldc_fini(srvr->ldc_handle); 2502 srvr->ldc_handle = NULL; 2503 } 2504 2505 srvr->state &= ~(VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN); 2506 } 2507 2508 /* 2509 * Function: 2510 * vdc_fini_ports() 2511 * 2512 * Description: 2513 * Finalize all ports by closing the channel associated with each 2514 * port and also freeing the server structure. 2515 * 2516 * Arguments: 2517 * vdc - soft state pointer for this instance of the device driver. 2518 * 2519 * Return Code: 2520 * None 2521 */ 2522 static void 2523 vdc_fini_ports(vdc_t *vdc) 2524 { 2525 int instance = ddi_get_instance(vdc->dip); 2526 vdc_server_t *srvr, *prev_srvr; 2527 2528 ASSERT(vdc != NULL); 2529 ASSERT(mutex_owned(&vdc->lock)); 2530 2531 DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized); 2532 2533 srvr = vdc->server_list; 2534 2535 while (srvr) { 2536 2537 vdc_terminate_ldc(vdc, srvr); 2538 2539 /* next server */ 2540 prev_srvr = srvr; 2541 srvr = srvr->next; 2542 2543 /* free server */ 2544 kmem_free(prev_srvr, sizeof (vdc_server_t)); 2545 } 2546 2547 vdc->server_list = NULL; 2548 } 2549 2550 /* -------------------------------------------------------------------------- */ 2551 2552 /* 2553 * Descriptor Ring helper routines 2554 */ 2555 2556 /* 2557 * Function: 2558 * vdc_init_descriptor_ring() 2559 * 2560 * Description: 2561 * 2562 * Arguments: 2563 * vdc - soft state pointer for this instance of the device driver. 
2564 * 2565 * Return Code: 2566 * 0 - Success 2567 */ 2568 static int 2569 vdc_init_descriptor_ring(vdc_t *vdc) 2570 { 2571 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2572 int status = 0; 2573 int i; 2574 2575 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2576 2577 ASSERT(vdc != NULL); 2578 ASSERT(mutex_owned(&vdc->lock)); 2579 2580 /* ensure we have enough room to store max sized block */ 2581 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2582 2583 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2584 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2585 /* 2586 * Calculate the maximum block size we can transmit using one 2587 * Descriptor Ring entry from the attributes returned by the 2588 * vDisk server. This is subject to a minimum of 'maxphys' 2589 * as we do not have the capability to split requests over 2590 * multiple DRing entries. 2591 */ 2592 if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) { 2593 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2594 vdc->instance); 2595 vdc->dring_max_cookies = maxphys / PAGESIZE; 2596 } else { 2597 vdc->dring_max_cookies = 2598 (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE; 2599 } 2600 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2601 (sizeof (ldc_mem_cookie_t) * 2602 (vdc->dring_max_cookies - 1))); 2603 vdc->dring_len = VD_DRING_LEN; 2604 2605 status = ldc_mem_dring_create(vdc->dring_len, 2606 vdc->dring_entry_size, &vdc->dring_hdl); 2607 if ((vdc->dring_hdl == NULL) || (status != 0)) { 2608 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2609 vdc->instance); 2610 return (status); 2611 } 2612 vdc->initialized |= VDC_DRING_INIT; 2613 } 2614 2615 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2616 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2617 vdc->dring_cookie = 2618 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2619 2620 status = ldc_mem_dring_bind(vdc->curr_server->ldc_handle, 2621 vdc->dring_hdl, 2622 LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2623 &vdc->dring_cookie[0], 2624 &vdc->dring_cookie_count); 2625 if (status != 0) { 2626 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2627 "(%lx) to channel (%lx) status=%d\n", 2628 vdc->instance, vdc->dring_hdl, 2629 vdc->curr_server->ldc_handle, status); 2630 return (status); 2631 } 2632 ASSERT(vdc->dring_cookie_count == 1); 2633 vdc->initialized |= VDC_DRING_BOUND; 2634 } 2635 2636 status = ldc_mem_dring_info(vdc->dring_hdl, &vdc->dring_mem_info); 2637 if (status != 0) { 2638 DMSG(vdc, 0, 2639 "[%d] Failed to get info for descriptor ring (%lx)\n", 2640 vdc->instance, vdc->dring_hdl); 2641 return (status); 2642 } 2643 2644 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2645 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2646 2647 /* Allocate the local copy of this dring */ 2648 vdc->local_dring = 2649 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 2650 KM_SLEEP); 2651 vdc->initialized |= VDC_DRING_LOCAL; 2652 } 2653 2654 /* 2655 * Mark all DRing entries as free and initialize the private 2656 * descriptor's memory handles. If any entry is initialized, 2657 * we need to free it later so we set the bit in 'initialized' 2658 * at the start. 
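	 *
	 * Sizing illustration (assumed example values): with maxphys =
	 * 1 MB and an 8 KB PAGESIZE, dring_max_cookies computed above
	 * is 128, so dring_entry_size is
	 *
	 *	sizeof (vd_dring_entry_t) + 127 * sizeof (ldc_mem_cookie_t)
	 *
	 * for each of the vdc->dring_len shared descriptors.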
2659 */ 2660 vdc->initialized |= VDC_DRING_ENTRY; 2661 for (i = 0; i < vdc->dring_len; i++) { 2662 dep = VDC_GET_DRING_ENTRY_PTR(vdc, i); 2663 dep->hdr.dstate = VIO_DESC_FREE; 2664 2665 status = ldc_mem_alloc_handle(vdc->curr_server->ldc_handle, 2666 &vdc->local_dring[i].desc_mhdl); 2667 if (status != 0) { 2668 DMSG(vdc, 0, "![%d] Failed to alloc mem handle for" 2669 " descriptor %d", vdc->instance, i); 2670 return (status); 2671 } 2672 vdc->local_dring[i].is_free = B_TRUE; 2673 vdc->local_dring[i].dep = dep; 2674 } 2675 2676 /* Initialize the starting index */ 2677 vdc->dring_curr_idx = 0; 2678 2679 return (status); 2680 } 2681 2682 /* 2683 * Function: 2684 * vdc_destroy_descriptor_ring() 2685 * 2686 * Description: 2687 * 2688 * Arguments: 2689 * vdc - soft state pointer for this instance of the device driver. 2690 * 2691 * Return Code: 2692 * None 2693 */ 2694 static void 2695 vdc_destroy_descriptor_ring(vdc_t *vdc) 2696 { 2697 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 2698 ldc_mem_handle_t mhdl = NULL; 2699 ldc_mem_info_t minfo; 2700 int status = -1; 2701 int i; /* loop */ 2702 2703 ASSERT(vdc != NULL); 2704 ASSERT(mutex_owned(&vdc->lock)); 2705 2706 DMSG(vdc, 0, "[%d] Entered\n", vdc->instance); 2707 2708 if (vdc->initialized & VDC_DRING_ENTRY) { 2709 DMSG(vdc, 0, 2710 "[%d] Removing Local DRing entries\n", vdc->instance); 2711 for (i = 0; i < vdc->dring_len; i++) { 2712 ldep = &vdc->local_dring[i]; 2713 mhdl = ldep->desc_mhdl; 2714 2715 if (mhdl == NULL) 2716 continue; 2717 2718 if ((status = ldc_mem_info(mhdl, &minfo)) != 0) { 2719 DMSG(vdc, 0, 2720 "ldc_mem_info returned an error: %d\n", 2721 status); 2722 2723 /* 2724 * This must mean that the mem handle 2725 * is not valid. Clear it out so that 2726 * no one tries to use it. 2727 */ 2728 ldep->desc_mhdl = NULL; 2729 continue; 2730 } 2731 2732 if (minfo.status == LDC_BOUND) { 2733 (void) ldc_mem_unbind_handle(mhdl); 2734 } 2735 2736 (void) ldc_mem_free_handle(mhdl); 2737 2738 ldep->desc_mhdl = NULL; 2739 } 2740 vdc->initialized &= ~VDC_DRING_ENTRY; 2741 } 2742 2743 if (vdc->initialized & VDC_DRING_LOCAL) { 2744 DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance); 2745 kmem_free(vdc->local_dring, 2746 vdc->dring_len * sizeof (vdc_local_desc_t)); 2747 vdc->initialized &= ~VDC_DRING_LOCAL; 2748 } 2749 2750 if (vdc->initialized & VDC_DRING_BOUND) { 2751 DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance); 2752 status = ldc_mem_dring_unbind(vdc->dring_hdl); 2753 if (status == 0) { 2754 vdc->initialized &= ~VDC_DRING_BOUND; 2755 } else { 2756 DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx", 2757 vdc->instance, status, vdc->dring_hdl); 2758 } 2759 kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t)); 2760 } 2761 2762 if (vdc->initialized & VDC_DRING_INIT) { 2763 DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance); 2764 status = ldc_mem_dring_destroy(vdc->dring_hdl); 2765 if (status == 0) { 2766 vdc->dring_hdl = NULL; 2767 bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t)); 2768 vdc->initialized &= ~VDC_DRING_INIT; 2769 } else { 2770 DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)", 2771 vdc->instance, status, vdc->dring_hdl); 2772 } 2773 } 2774 } 2775 2776 /* 2777 * Function: 2778 * vdc_map_to_shared_dring() 2779 * 2780 * Description: 2781 * Copy contents of the local descriptor to the shared 2782 * memory descriptor. 2783 * 2784 * Arguments: 2785 * vdcp - soft state pointer for this instance of the device driver. 
2786  *	idx	- descriptor ring index
2787  *
2788  * Return Code:
2789  *	0 - Success; otherwise an error from vdc_populate_mem_hdl().
2790  */
2791 static int
2792 vdc_map_to_shared_dring(vdc_t *vdcp, int idx)
2793 {
2794 	vdc_local_desc_t	*ldep;
2795 	vd_dring_entry_t	*dep;
2796 	int			rv;
2797 
2798 	ldep = &(vdcp->local_dring[idx]);
2799 
2800 	/* bind the buffer for this request, if it carries any data */
2801 	if (ldep->nbytes > 0) {
2802 		rv = vdc_populate_mem_hdl(vdcp, ldep);
2803 		if (rv) {
2804 			DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n",
2805 			    vdcp->instance);
2806 			return (rv);
2807 		}
2808 	}
2809 
2810 	/*
2811 	 * fill in the data details into the DRing
2812 	 */
2813 	dep = ldep->dep;
2814 	ASSERT(dep != NULL);
2815 
2816 	dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp);
2817 	dep->payload.operation = ldep->operation;
2818 	dep->payload.addr = ldep->offset;
2819 	dep->payload.nbytes = ldep->nbytes;
2820 	dep->payload.status = (uint32_t)-1;	/* vds will set valid value */
2821 	dep->payload.slice = ldep->slice;
2822 	dep->hdr.dstate = VIO_DESC_READY;
2823 	dep->hdr.ack = 1;	/* request an ACK for every message */
2824 
2825 	return (0);
2826 }
2827 
2828 /*
2829  * Function:
2830  *	vdc_send_request
2831  *
2832  * Description:
2833  *	This routine writes the data to be transmitted to vds into the
2834  *	descriptor, notifies vds that the ring has been updated and
2835  *	then waits for the request to be processed.
2836  *
2837  * Arguments:
2838  *	vdcp	  - the soft state pointer
2839  *	operation - operation we want vds to perform (VD_OP_XXX)
2840  *	addr	  - address of data buf to be read/written.
2841  *	nbytes	  - number of bytes to read/write
2842  *	slice	  - the disk slice this request is for
2843  *	offset	  - relative disk offset
2844  *	cb_type   - type of call - STRATEGY or SYNC
2845  *	cb_arg	  - parameter to be sent to server (depends on VD_OP_XXX type)
2846  *			. mode for ioctl(9e)
2847  *			. LP64 diskaddr_t (block I/O)
2848  *	dir	  - direction of operation (READ/WRITE/BOTH)
2849  *
2850  * Return Codes:
2851  *	0
2852  *	ENXIO
2853  */
2854 static int
2855 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
2856     size_t nbytes, int slice, diskaddr_t offset, int cb_type,
2857     void *cb_arg, vio_desc_direction_t dir)
2858 {
2859 	int	rv = 0;
2860 
2861 	ASSERT(vdcp != NULL);
2862 	ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR);
2863 
2864 	mutex_enter(&vdcp->lock);
2865 
2866 	/*
2867 	 * If this is a block read/write operation we update the I/O statistics
2868 	 * to indicate that the request is being put on the waitq to be
2869 	 * serviced.
2870 	 *
2871 	 * We do it here (a common routine for both synchronous and strategy
2872 	 * calls) for performance reasons - we are already holding vdc->lock
2873 	 * so there is no extra locking overhead. We would have to explicitly
2874 	 * grab the 'lock' mutex to update the stats if we were to do this
2875 	 * higher up the stack in vdc_strategy() et al.
2876 	 */
2877 	if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
2878 		DTRACE_IO1(start, buf_t *, cb_arg);
2879 		VD_KSTAT_WAITQ_ENTER(vdcp);
2880 	}
2881 
2882 	do {
2883 		while (vdcp->state != VDC_STATE_RUNNING) {
2884 
2885 			/* return error if detaching */
2886 			if (vdcp->state == VDC_STATE_DETACH) {
2887 				rv = ENXIO;
2888 				goto done;
2889 			}
2890 
2891 			/* fail request if connection timeout is reached */
2892 			if (vdcp->ctimeout_reached) {
2893 				rv = EIO;
2894 				goto done;
2895 			}
2896 
2897 			/*
2898 			 * If we are panicking and the disk is not ready then
2899 			 * we can't send any request because we can't complete
2900 			 * the handshake now.
2901 */ 2902 if (ddi_in_panic()) { 2903 rv = EIO; 2904 goto done; 2905 } 2906 2907 cv_wait(&vdcp->running_cv, &vdcp->lock); 2908 } 2909 2910 } while (vdc_populate_descriptor(vdcp, operation, addr, 2911 nbytes, slice, offset, cb_type, cb_arg, dir)); 2912 2913 done: 2914 /* 2915 * If this is a block read/write we update the I/O statistics kstat 2916 * to indicate that this request has been placed on the queue for 2917 * processing (i.e sent to the vDisk server) - iostat(1M) will 2918 * report the time waiting for the vDisk server under the %b column 2919 * In the case of an error we simply take it off the wait queue. 2920 */ 2921 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2922 if (rv == 0) { 2923 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 2924 DTRACE_PROBE1(send, buf_t *, cb_arg); 2925 } else { 2926 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 2927 VD_KSTAT_WAITQ_EXIT(vdcp); 2928 DTRACE_IO1(done, buf_t *, cb_arg); 2929 } 2930 } 2931 2932 mutex_exit(&vdcp->lock); 2933 2934 return (rv); 2935 } 2936 2937 2938 /* 2939 * Function: 2940 * vdc_populate_descriptor 2941 * 2942 * Description: 2943 * This routine writes the data to be transmitted to vds into the 2944 * descriptor, notifies vds that the ring has been updated and 2945 * then waits for the request to be processed. 2946 * 2947 * Arguments: 2948 * vdcp - the soft state pointer 2949 * operation - operation we want vds to perform (VD_OP_XXX) 2950 * addr - address of data buf to be read/written. 2951 * nbytes - number of bytes to read/write 2952 * slice - the disk slice this request is for 2953 * offset - relative disk offset 2954 * cb_type - type of call - STRATEGY or SYNC 2955 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2956 * . mode for ioctl(9e) 2957 * . LP64 diskaddr_t (block I/O) 2958 * dir - direction of operation (READ/WRITE/BOTH) 2959 * 2960 * Return Codes: 2961 * 0 2962 * EAGAIN 2963 * ECONNRESET 2964 * ENXIO 2965 */ 2966 static int 2967 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 2968 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2969 void *cb_arg, vio_desc_direction_t dir) 2970 { 2971 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 2972 int idx; /* Index of DRing entry used */ 2973 int next_idx; 2974 vio_dring_msg_t dmsg; 2975 size_t msglen; 2976 int rv; 2977 2978 ASSERT(MUTEX_HELD(&vdcp->lock)); 2979 vdcp->threads_pending++; 2980 loop: 2981 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 2982 2983 /* Get next available D-Ring entry */ 2984 idx = vdcp->dring_curr_idx; 2985 local_dep = &(vdcp->local_dring[idx]); 2986 2987 if (!local_dep->is_free) { 2988 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 2989 vdcp->instance); 2990 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 2991 if (vdcp->state == VDC_STATE_RUNNING || 2992 vdcp->state == VDC_STATE_HANDLE_PENDING) { 2993 goto loop; 2994 } 2995 vdcp->threads_pending--; 2996 return (ECONNRESET); 2997 } 2998 2999 next_idx = idx + 1; 3000 if (next_idx >= vdcp->dring_len) 3001 next_idx = 0; 3002 vdcp->dring_curr_idx = next_idx; 3003 3004 ASSERT(local_dep->is_free); 3005 3006 local_dep->operation = operation; 3007 local_dep->addr = addr; 3008 local_dep->nbytes = nbytes; 3009 local_dep->slice = slice; 3010 local_dep->offset = offset; 3011 local_dep->cb_type = cb_type; 3012 local_dep->cb_arg = cb_arg; 3013 local_dep->dir = dir; 3014 3015 local_dep->is_free = B_FALSE; 3016 3017 rv = vdc_map_to_shared_dring(vdcp, idx); 3018 if (rv) { 3019 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 
3020 vdcp->instance); 3021 /* free the descriptor */ 3022 local_dep->is_free = B_TRUE; 3023 vdcp->dring_curr_idx = idx; 3024 cv_wait(&vdcp->membind_cv, &vdcp->lock); 3025 if (vdcp->state == VDC_STATE_RUNNING || 3026 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3027 goto loop; 3028 } 3029 vdcp->threads_pending--; 3030 return (ECONNRESET); 3031 } 3032 3033 /* 3034 * Send a msg with the DRing details to vds 3035 */ 3036 VIO_INIT_DRING_DATA_TAG(dmsg); 3037 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 3038 dmsg.dring_ident = vdcp->dring_ident; 3039 dmsg.start_idx = idx; 3040 dmsg.end_idx = idx; 3041 vdcp->seq_num++; 3042 3043 DTRACE_PROBE2(populate, int, vdcp->instance, 3044 vdc_local_desc_t *, local_dep); 3045 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 3046 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 3047 3048 /* 3049 * note we're still holding the lock here to 3050 * make sure the message goes out in order !!!... 3051 */ 3052 msglen = sizeof (dmsg); 3053 rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen); 3054 switch (rv) { 3055 case ECONNRESET: 3056 /* 3057 * vdc_send initiates the reset on failure. 3058 * Since the transaction has already been put 3059 * on the local dring, it will automatically get 3060 * retried when the channel is reset. Given that, 3061 * it is ok to just return success even though the 3062 * send failed. 3063 */ 3064 rv = 0; 3065 break; 3066 3067 case 0: /* EOK */ 3068 DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv); 3069 break; 3070 3071 default: 3072 goto cleanup_and_exit; 3073 } 3074 3075 vdcp->threads_pending--; 3076 return (rv); 3077 3078 cleanup_and_exit: 3079 DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv); 3080 return (ENXIO); 3081 } 3082 3083 /* 3084 * Function: 3085 * vdc_do_sync_op 3086 * 3087 * Description: 3088 * Wrapper around vdc_populate_descriptor that blocks until the 3089 * response to the message is available. 3090 * 3091 * Arguments: 3092 * vdcp - the soft state pointer 3093 * operation - operation we want vds to perform (VD_OP_XXX) 3094 * addr - address of data buf to be read/written. 3095 * nbytes - number of bytes to read/write 3096 * slice - the disk slice this request is for 3097 * offset - relative disk offset 3098 * cb_type - type of call - STRATEGY or SYNC 3099 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3100 * . mode for ioctl(9e) 3101 * . LP64 diskaddr_t (block I/O) 3102 * dir - direction of operation (READ/WRITE/BOTH) 3103 * rconflict - check for reservation conflict in case of failure 3104 * 3105 * rconflict should be set to B_TRUE by most callers. Callers invoking the 3106 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 3107 * result of a successful operation with vd_scsi_status(). 3108 * 3109 * Return Codes: 3110 * 0 3111 * EAGAIN 3112 * EFAULT 3113 * ENXIO 3114 * EIO 3115 */ 3116 static int 3117 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 3118 int slice, diskaddr_t offset, int cb_type, void *cb_arg, 3119 vio_desc_direction_t dir, boolean_t rconflict) 3120 { 3121 int status; 3122 vdc_io_t *vio; 3123 boolean_t check_resv_conflict = B_FALSE; 3124 3125 ASSERT(cb_type == CB_SYNC); 3126 3127 /* 3128 * Grab the lock, if blocked wait until the server 3129 * response causes us to wake up again. 
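	 *
	 * Schematically (an illustrative summary of the code below, not
	 * additional logic):
	 *
	 *	mutex_enter(&vdcp->lock);
	 *	while (vdcp->sync_op_blocked && state != VDC_STATE_DETACH)
	 *		cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock);
	 *	vdcp->sync_op_blocked = B_TRUE;     -- serialize later callers
	 *	... send the request, then cv_wait on sync_pending_cv ...
	 *	vdcp->sync_op_blocked = B_FALSE;
	 *	cv_signal(&vdcp->sync_blocked_cv);  -- admit the next thread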
3130 */ 3131 mutex_enter(&vdcp->lock); 3132 vdcp->sync_op_cnt++; 3133 while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH) 3134 cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock); 3135 3136 if (vdcp->state == VDC_STATE_DETACH) { 3137 cv_broadcast(&vdcp->sync_blocked_cv); 3138 vdcp->sync_op_cnt--; 3139 mutex_exit(&vdcp->lock); 3140 return (ENXIO); 3141 } 3142 3143 /* now block anyone other thread entering after us */ 3144 vdcp->sync_op_blocked = B_TRUE; 3145 vdcp->sync_op_pending = B_TRUE; 3146 mutex_exit(&vdcp->lock); 3147 3148 status = vdc_send_request(vdcp, operation, addr, 3149 nbytes, slice, offset, cb_type, cb_arg, dir); 3150 3151 mutex_enter(&vdcp->lock); 3152 3153 if (status != 0) { 3154 vdcp->sync_op_pending = B_FALSE; 3155 } else { 3156 /* 3157 * block until our transaction completes. 3158 * Also anyone else waiting also gets to go next. 3159 */ 3160 while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH) 3161 cv_wait(&vdcp->sync_pending_cv, &vdcp->lock); 3162 3163 DMSG(vdcp, 2, ": operation returned %d\n", 3164 vdcp->sync_op_status); 3165 if (vdcp->state == VDC_STATE_DETACH) { 3166 vdcp->sync_op_pending = B_FALSE; 3167 status = ENXIO; 3168 } else { 3169 status = vdcp->sync_op_status; 3170 if (status != 0 && vdcp->failfast_interval != 0) { 3171 /* 3172 * Operation has failed and failfast is enabled. 3173 * We need to check if the failure is due to a 3174 * reservation conflict if this was requested. 3175 */ 3176 check_resv_conflict = rconflict; 3177 } 3178 3179 } 3180 } 3181 3182 vdcp->sync_op_status = 0; 3183 vdcp->sync_op_blocked = B_FALSE; 3184 vdcp->sync_op_cnt--; 3185 3186 /* signal the next waiting thread */ 3187 cv_signal(&vdcp->sync_blocked_cv); 3188 3189 /* 3190 * We have to check for reservation conflict after unblocking sync 3191 * operations because some sync operations will be used to do this 3192 * check. 3193 */ 3194 if (check_resv_conflict) { 3195 vio = vdc_failfast_io_queue(vdcp, NULL); 3196 while (vio->vio_qtime != 0) 3197 cv_wait(&vdcp->failfast_io_cv, &vdcp->lock); 3198 kmem_free(vio, sizeof (vdc_io_t)); 3199 } 3200 3201 mutex_exit(&vdcp->lock); 3202 3203 return (status); 3204 } 3205 3206 3207 /* 3208 * Function: 3209 * vdc_drain_response() 3210 * 3211 * Description: 3212 * When a guest is panicking, the completion of requests needs to be 3213 * handled differently because interrupts are disabled and vdc 3214 * will not get messages. We have to poll for the messages instead. 3215 * 3216 * Note: since we don't have a buf_t available we cannot implement 3217 * the io:::done DTrace probe in this specific case. 3218 * 3219 * Arguments: 3220 * vdc - soft state pointer for this instance of the device driver. 3221 * 3222 * Return Code: 3223 * 0 - Success 3224 */ 3225 static int 3226 vdc_drain_response(vdc_t *vdc) 3227 { 3228 int rv, idx, retries; 3229 size_t msglen; 3230 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3231 vio_dring_msg_t dmsg; 3232 3233 mutex_enter(&vdc->lock); 3234 3235 retries = 0; 3236 for (;;) { 3237 msglen = sizeof (dmsg); 3238 rv = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)&dmsg, 3239 &msglen); 3240 if (rv) { 3241 rv = EINVAL; 3242 break; 3243 } 3244 3245 /* 3246 * if there are no packets wait and check again 3247 */ 3248 if ((rv == 0) && (msglen == 0)) { 3249 if (retries++ > vdc_dump_retries) { 3250 rv = EAGAIN; 3251 break; 3252 } 3253 3254 drv_usecwait(vdc_usec_timeout_dump); 3255 continue; 3256 } 3257 3258 /* 3259 * Ignore all messages that are not ACKs/NACKs to 3260 * DRing requests. 
3261 */ 3262 if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) || 3263 (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) { 3264 DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n", 3265 dmsg.tag.vio_msgtype, 3266 dmsg.tag.vio_subtype, 3267 dmsg.tag.vio_subtype_env); 3268 continue; 3269 } 3270 3271 /* 3272 * set the appropriate return value for the current request. 3273 */ 3274 switch (dmsg.tag.vio_subtype) { 3275 case VIO_SUBTYPE_ACK: 3276 rv = 0; 3277 break; 3278 case VIO_SUBTYPE_NACK: 3279 rv = EAGAIN; 3280 break; 3281 default: 3282 continue; 3283 } 3284 3285 idx = dmsg.start_idx; 3286 if (idx >= vdc->dring_len) { 3287 DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n", 3288 vdc->instance, idx); 3289 continue; 3290 } 3291 ldep = &vdc->local_dring[idx]; 3292 if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { 3293 DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n", 3294 vdc->instance, idx, ldep->dep->hdr.dstate); 3295 continue; 3296 } 3297 3298 DMSG(vdc, 1, "[%d] Depopulating idx=%d state=%d\n", 3299 vdc->instance, idx, ldep->dep->hdr.dstate); 3300 3301 rv = vdc_depopulate_descriptor(vdc, idx); 3302 if (rv) { 3303 DMSG(vdc, 0, 3304 "[%d] Entry @ %d - depopulate failed ..\n", 3305 vdc->instance, idx); 3306 } 3307 3308 /* if this is the last descriptor - break out of loop */ 3309 if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) 3310 break; 3311 } 3312 3313 mutex_exit(&vdc->lock); 3314 DMSG(vdc, 0, "End idx=%d\n", idx); 3315 3316 return (rv); 3317 } 3318 3319 3320 /* 3321 * Function: 3322 * vdc_depopulate_descriptor() 3323 * 3324 * Description: 3325 * 3326 * Arguments: 3327 * vdc - soft state pointer for this instance of the device driver. 3328 * idx - Index of the Descriptor Ring entry being modified 3329 * 3330 * Return Code: 3331 * 0 - Success 3332 */ 3333 static int 3334 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) 3335 { 3336 vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ 3337 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3338 int status = ENXIO; 3339 int rv = 0; 3340 3341 ASSERT(vdc != NULL); 3342 ASSERT(idx < vdc->dring_len); 3343 ldep = &vdc->local_dring[idx]; 3344 ASSERT(ldep != NULL); 3345 ASSERT(MUTEX_HELD(&vdc->lock)); 3346 3347 DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep); 3348 DMSG(vdc, 2, ": idx = %d\n", idx); 3349 3350 dep = ldep->dep; 3351 ASSERT(dep != NULL); 3352 ASSERT((dep->hdr.dstate == VIO_DESC_DONE) || 3353 (dep->payload.status == ECANCELED)); 3354 3355 VDC_MARK_DRING_ENTRY_FREE(vdc, idx); 3356 3357 ldep->is_free = B_TRUE; 3358 status = dep->payload.status; 3359 DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status); 3360 3361 /* 3362 * If no buffers were used to transfer information to the server when 3363 * populating the descriptor then no memory handles need to be unbound 3364 * and we can return now. 3365 */ 3366 if (ldep->nbytes == 0) { 3367 cv_signal(&vdc->dring_free_cv); 3368 return (status); 3369 } 3370 3371 /* 3372 * If the upper layer passed in a misaligned address we copied the 3373 * data into an aligned buffer before sending it to LDC - we now 3374 * copy it back to the original buffer. 
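	 *
	 * Example (illustrative values): a 13-byte read into a buffer
	 * whose address ends in ...3 is not 8-byte aligned, so
	 * vdc_populate_mem_hdl() bound a bounce buffer sized with
	 * P2ROUNDUP(13, 8) == 16 instead; the bcopy() below moves
	 * dep->payload.nbytes bytes back to the caller's buffer.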
3375 	 */
3376 	if (ldep->align_addr) {
3377 		ASSERT(ldep->addr != NULL);
3378 
3379 		if (dep->payload.nbytes > 0)
3380 			bcopy(ldep->align_addr, ldep->addr,
3381 			    dep->payload.nbytes);
3382 		kmem_free(ldep->align_addr,
3383 		    sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8));
3384 		ldep->align_addr = NULL;
3385 	}
3386 
3387 	rv = ldc_mem_unbind_handle(ldep->desc_mhdl);
3388 	if (rv != 0) {
3389 		DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)",
3390 		    vdc->instance, ldep->desc_mhdl, idx, rv);
3391 		/*
3392 		 * The error returned by the vDisk server is more informative
3393 		 * and thus has a higher priority but if it isn't set we ensure
3394 		 * that this function returns an error.
3395 		 */
3396 		if (status == 0)
3397 			status = EINVAL;
3398 	}
3399 
3400 	cv_signal(&vdc->membind_cv);
3401 	cv_signal(&vdc->dring_free_cv);
3402 
3403 	return (status);
3404 }
3405 
3406 /*
3407  * Function:
3408  *	vdc_populate_mem_hdl()
3409  *
3410  * Description:
3411  *	Bind a request's data buffer to its LDC memory handle.
3412  * Arguments:
3413  *	vdcp	- soft state pointer for this instance of the device driver.
3414  *	ldep	- the local descriptor ring entry to bind; its addr, nbytes,
3415  *		  operation and dir fields describe the buffer being mapped
3416  *		  in and the vDisk operation (VD_OP_xxx) being performed.
3418  *
3419  * Return Code:
3420  *	0 - Success; EAGAIN if the buffer could not be bound.
3421  */
3422 static int
3423 vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep)
3424 {
3425 	vd_dring_entry_t	*dep = NULL;
3426 	ldc_mem_handle_t	mhdl;
3427 	caddr_t			vaddr;
3428 	size_t			nbytes;
3429 	uint8_t			perm = LDC_MEM_RW;
3430 	uint8_t			maptype;
3431 	int			rv = 0;
3432 	int			i;
3433 
3434 	ASSERT(vdcp != NULL);
3435 
3436 	dep = ldep->dep;
3437 	mhdl = ldep->desc_mhdl;
3438 
3439 	switch (ldep->dir) {
3440 	case VIO_read_dir:
3441 		perm = LDC_MEM_W;
3442 		break;
3443 
3444 	case VIO_write_dir:
3445 		perm = LDC_MEM_R;
3446 		break;
3447 
3448 	case VIO_both_dir:
3449 		perm = LDC_MEM_RW;
3450 		break;
3451 
3452 	default:
3453 		ASSERT(0);	/* catch bad programming in vdc */
3454 	}
3455 
3456 	/*
3457 	 * LDC expects any addresses passed in to be 8-byte aligned. We need
3458 	 * to copy the contents of any misaligned buffers to a newly allocated
3459 	 * buffer and bind it instead (and copy the contents back to the
3460 	 * original buffer passed in when depopulating the descriptor)
3461 	 */
3462 	vaddr = ldep->addr;
3463 	nbytes = ldep->nbytes;
3464 	if (((uint64_t)vaddr & 0x7) != 0) {
3465 		ASSERT(ldep->align_addr == NULL);
3466 		ldep->align_addr =
3467 		    kmem_alloc(sizeof (caddr_t) *
3468 		    P2ROUNDUP(nbytes, 8), KM_SLEEP);
3469 		DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating "
3470 		    "(buf=%p nb=%ld op=%d)\n",
3471 		    vdcp->instance, (void *)vaddr, (void *)ldep->align_addr,
3472 		    nbytes, ldep->operation);
3473 		if (perm != LDC_MEM_W)
3474 			bcopy(vaddr, ldep->align_addr, nbytes);
3475 		vaddr = ldep->align_addr;
3476 	}
3477 
3478 	maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP;
3479 	rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
3480 	    maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies);
3481 	DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n",
3482 	    vdcp->instance, dep->payload.ncookies);
3483 	if (rv != 0) {
3484 		DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle "
3485 		    "(mhdl=%p, buf=%p, err=%d)\n",
3486 		    vdcp->instance, (void *)mhdl, (void *)vaddr, rv);
3487 		if (ldep->align_addr) {
3488 			kmem_free(ldep->align_addr,
3489 			    sizeof (caddr_t) * P2ROUNDUP(nbytes, 8));
3490 			ldep->align_addr = NULL;
3491 		}
3492 		return (EAGAIN);
3493 	}
3494 
3495 	/*
3496 	 * Get the other cookies (if any).
3497 	 */
3498 	for (i = 1; i < dep->payload.ncookies; i++) {
3499 		rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]);
3500 		if (rv != 0) {
3501 			(void) ldc_mem_unbind_handle(mhdl);
3502 			DMSG(vdcp, 0, "?[%d] Failed to get next cookie "
3503 			    "(mhdl=%lx cnum=%d), err=%d",
3504 			    vdcp->instance, mhdl, i, rv);
3505 			if (ldep->align_addr) {
3506 				kmem_free(ldep->align_addr, sizeof (caddr_t) *
3507 				    P2ROUNDUP(ldep->nbytes, 8));
3508 				ldep->align_addr = NULL;
3509 			}
3510 			return (EAGAIN);
3511 		}
3512 	}
3513 
3514 	return (rv);
3515 }
3516 
3517 /*
3518  * Interrupt handlers for messages from LDC
3519  */
3520 
3521 /*
3522  * Function:
3523  *	vdc_handle_cb()
3524  *
3525  * Description:
3526  *	Callback invoked by LDC for channel events (up, read, reset, down).
3527  * Arguments:
3528  *	event	- Type of event (LDC_EVT_xxx) that triggered the callback
3529  *	arg	- soft state pointer for this instance of the device driver.
3530  *
3531  * Return Code:
3532  *	LDC_SUCCESS (always; errors are handled and logged internally)
3533  */
3534 static uint_t
3535 vdc_handle_cb(uint64_t event, caddr_t arg)
3536 {
3537 	ldc_status_t	ldc_state;
3538 	int		rv = 0;
3539 	vdc_server_t	*srvr = (vdc_server_t *)(void *)arg;
3540 	vdc_t		*vdc = srvr->vdcp;
3541 
3542 	ASSERT(vdc != NULL);
3543 
3544 	DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num);
3545 
3546 	/* If callback is not for the current server, ignore it */
3547 	mutex_enter(&vdc->lock);
3548 
3549 	if (vdc->curr_server != srvr) {
3550 		DMSG(vdc, 0, "[%d] Ignoring event 0x%lx for port@%ld\n",
3551 		    vdc->instance, event, srvr->id);
3552 		mutex_exit(&vdc->lock);
3553 		return (LDC_SUCCESS);
3554 	}
3555 
3556 	/*
3557 	 * Depending on the type of event that triggered this callback,
3558 	 * we modify the handshake state or read the data.
3559 	 *
3560 	 * NOTE: not done as a switch() as event could be triggered by
3561 	 * a state change and a read request. Also the ordering of the
3562 	 * check for the event types is deliberate.
3563 	 */
3564 	if (event & LDC_EVT_UP) {
3565 		DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance);
3566 
3567 		/* get LDC state */
3568 		rv = ldc_status(srvr->ldc_handle, &ldc_state);
3569 		if (rv != 0) {
3570 			DMSG(vdc, 0, "[%d] Couldn't get LDC status %d",
3571 			    vdc->instance, rv);
3572 			mutex_exit(&vdc->lock);
3573 			return (LDC_SUCCESS);
3574 		}
3575 		if (srvr->ldc_state != LDC_UP &&
3576 		    ldc_state == LDC_UP) {
3577 			/*
3578 			 * Reset the transaction sequence numbers when
3579 			 * LDC comes up. We then kick off the handshake
3580 			 * negotiation with the vDisk server.
3581 			 */
3582 			vdc->seq_num = 1;
3583 			vdc->seq_num_reply = 0;
3584 			srvr->ldc_state = ldc_state;
3585 			cv_signal(&vdc->initwait_cv);
3586 		}
3587 	}
3588 
3589 	if (event & LDC_EVT_READ) {
3590 		DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance);
3591 		mutex_enter(&vdc->read_lock);
3592 		cv_signal(&vdc->read_cv);
3593 		vdc->read_state = VDC_READ_PENDING;
3594 		mutex_exit(&vdc->read_lock);
3595 		mutex_exit(&vdc->lock);
3596 
3597 		/* that's all we have to do - no need to handle DOWN/RESET */
3598 		return (LDC_SUCCESS);
3599 	}
3600 
3601 	if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) {
3602 
3603 		DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance);
3604 
3605 		/*
3606 		 * Need to wake up any readers so they will
3607 		 * detect that a reset has occurred.
3608 		 */
3609 		mutex_enter(&vdc->read_lock);
3610 		if ((vdc->read_state == VDC_READ_WAITING) ||
3611 		    (vdc->read_state == VDC_READ_RESET))
3612 			cv_signal(&vdc->read_cv);
3613 		vdc->read_state = VDC_READ_RESET;
3614 		mutex_exit(&vdc->read_lock);
3615 
3616 		/* wake up any threads waiting for connection to come up */
3617 		if (vdc->state == VDC_STATE_INIT_WAITING) {
3618 			vdc->state = VDC_STATE_RESETTING;
3619 			cv_signal(&vdc->initwait_cv);
3620 		}
3621 
3622 	}
3623 
3624 	mutex_exit(&vdc->lock);
3625 
3626 	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ))
3627 		DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received",
3628 		    vdc->instance, event);
3629 
3630 	return (LDC_SUCCESS);
3631 }
3632 
3633 /*
3634  * Function:
3635  *	vdc_wait_for_response()
3636  *
3637  * Description:
3638  *	Block waiting for a response from the server. If there is no
3639  *	data, the thread blocks on read_cv, which is signalled by the
3640  *	callback when an LDC_EVT_READ event occurs.
3641  *
3642  * Arguments:
3643  *	vdcp	- soft state pointer for this instance of the device driver.
3644  *	msgp	- buffer in which the received message is returned.
3645  * Return Code:
3646  *	0 - Success; otherwise ENOMSG or an error from vdc_recv().
3647  */
3648 static int
3649 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp)
3650 {
3651 	size_t		nbytes = sizeof (*msgp);
3652 	int		status;
3653 
3654 	ASSERT(vdcp != NULL);
3655 
3656 	DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance);
3657 
3658 	status = vdc_recv(vdcp, msgp, &nbytes);
3659 	DMSG(vdcp, 3, "vdc_recv() done.. status=0x%x size=0x%x\n",
3660 	    status, (int)nbytes);
3661 	if (status) {
3662 		DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n",
3663 		    vdcp->instance, status);
3664 		return (status);
3665 	}
3666 
3667 	if (nbytes < sizeof (vio_msg_tag_t)) {
3668 		DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n",
3669 		    vdcp->instance, sizeof (vio_msg_tag_t), nbytes);
3670 		return (ENOMSG);
3671 	}
3672 
3673 	DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance,
3674 	    msgp->tag.vio_msgtype,
3675 	    msgp->tag.vio_subtype,
3676 	    msgp->tag.vio_subtype_env);
3677 
3678 	/*
3679 	 * Verify the Session ID of the message
3680 	 *
3681 	 * Every message after the Version has been negotiated should
3682 	 * have the correct session ID set.
3683 	 */
3684 	if ((msgp->tag.vio_sid != vdcp->session_id) &&
3685 	    (msgp->tag.vio_subtype_env != VIO_VER_INFO)) {
3686 		DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, "
3687 		    "expected 0x%lx [seq num %lx @ %d]",
3688 		    vdcp->instance, msgp->tag.vio_sid,
3689 		    vdcp->session_id,
3690 		    ((vio_dring_msg_t *)msgp)->seq_num,
3691 		    ((vio_dring_msg_t *)msgp)->start_idx);
3692 		return (ENOMSG);
3693 	}
3694 	return (0);
3695 }
3696 
3697 
3698 /*
3699  * Function:
3700  *	vdc_resubmit_backup_dring()
3701  *
3702  * Description:
3703  *	Resubmit each outstanding descriptor in the backed-up dring to
3704  *	the vDisk server. The dring was backed up during connection
3705  *	reset.
3706  *
3707  * Arguments:
3708  *	vdcp	- soft state pointer for this instance of the device driver.
3709 * 3710 * Return Code: 3711 * 0 - Success 3712 */ 3713 static int 3714 vdc_resubmit_backup_dring(vdc_t *vdcp) 3715 { 3716 int processed = 0; 3717 int count; 3718 int b_idx; 3719 int rv = 0; 3720 int dring_size; 3721 int op; 3722 vio_msg_t vio_msg; 3723 vdc_local_desc_t *curr_ldep; 3724 3725 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3726 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3727 3728 if (vdcp->local_dring_backup == NULL) { 3729 /* the pending requests have already been processed */ 3730 return (0); 3731 } 3732 3733 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3734 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3735 3736 /* 3737 * Walk the backup copy of the local descriptor ring and 3738 * resubmit all the outstanding transactions. 3739 */ 3740 b_idx = vdcp->local_dring_backup_tail; 3741 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3742 3743 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3744 3745 /* only resubmit outstanding transactions */ 3746 if (!curr_ldep->is_free) { 3747 /* 3748 * If we are retrying a block read/write operation we 3749 * need to update the I/O statistics to indicate that 3750 * the request is being put back on the waitq to be 3751 * serviced (it will have been taken off after the 3752 * error was reported). 3753 */ 3754 mutex_enter(&vdcp->lock); 3755 op = curr_ldep->operation; 3756 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3757 DTRACE_IO1(start, buf_t *, curr_ldep->cb_arg); 3758 VD_KSTAT_WAITQ_ENTER(vdcp); 3759 } 3760 3761 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3762 rv = vdc_populate_descriptor(vdcp, op, 3763 curr_ldep->addr, curr_ldep->nbytes, 3764 curr_ldep->slice, curr_ldep->offset, 3765 curr_ldep->cb_type, curr_ldep->cb_arg, 3766 curr_ldep->dir); 3767 3768 if (rv) { 3769 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3770 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3771 VD_KSTAT_WAITQ_EXIT(vdcp); 3772 DTRACE_IO1(done, buf_t *, 3773 curr_ldep->cb_arg); 3774 } 3775 DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", 3776 vdcp->instance, b_idx); 3777 mutex_exit(&vdcp->lock); 3778 goto done; 3779 } 3780 3781 /* 3782 * If this is a block read/write we update the I/O 3783 * statistics kstat to indicate that the request 3784 * has been sent back to the vDisk server and should 3785 * now be put on the run queue. 3786 */ 3787 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3788 DTRACE_PROBE1(send, buf_t *, curr_ldep->cb_arg); 3789 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 3790 } 3791 mutex_exit(&vdcp->lock); 3792 3793 /* Wait for the response message. */ 3794 DMSG(vdcp, 1, "waiting for response to idx=%x\n", 3795 b_idx); 3796 rv = vdc_wait_for_response(vdcp, &vio_msg); 3797 if (rv) { 3798 /* 3799 * If this is a block read/write we update 3800 * the I/O statistics kstat to take it 3801 * off the run queue. 
3802 */ 3803 mutex_enter(&vdcp->lock); 3804 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3805 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3806 VD_KSTAT_RUNQ_EXIT(vdcp); 3807 DTRACE_IO1(done, buf_t *, 3808 curr_ldep->cb_arg); 3809 } 3810 DMSG(vdcp, 1, "[%d] wait_for_response " 3811 "returned err=%d\n", vdcp->instance, 3812 rv); 3813 mutex_exit(&vdcp->lock); 3814 goto done; 3815 } 3816 3817 DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx); 3818 rv = vdc_process_data_msg(vdcp, &vio_msg); 3819 if (rv) { 3820 DMSG(vdcp, 1, "[%d] process_data_msg " 3821 "returned err=%d\n", vdcp->instance, 3822 rv); 3823 goto done; 3824 } 3825 /* 3826 * Mark this entry as free so that we will not resubmit 3827 * this "done" request again, if we were to use the same 3828 * backup_dring again in future. This could happen when 3829 * a reset happens while processing the backup_dring. 3830 */ 3831 curr_ldep->is_free = B_TRUE; 3832 processed++; 3833 } 3834 3835 /* get the next element to submit */ 3836 if (++b_idx >= vdcp->local_dring_backup_len) 3837 b_idx = 0; 3838 } 3839 3840 /* all done - now clear up pending dring copy */ 3841 dring_size = vdcp->local_dring_backup_len * 3842 sizeof (vdcp->local_dring_backup[0]); 3843 3844 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3845 3846 vdcp->local_dring_backup = NULL; 3847 3848 done: 3849 DTRACE_PROBE2(processed, int, processed, vdc_t *, vdcp); 3850 3851 return (rv); 3852 } 3853 3854 /* 3855 * Function: 3856 * vdc_cancel_backup_dring 3857 * 3858 * Description: 3859 * Cancel each descriptor in the backed up dring to vDisk server. 3860 * The Dring was backed up during connection reset. 3861 * 3862 * Arguments: 3863 * vdcp - soft state pointer for this instance of the device driver. 3864 * 3865 * Return Code: 3866 * None 3867 */ 3868 void 3869 vdc_cancel_backup_dring(vdc_t *vdcp) 3870 { 3871 vdc_local_desc_t *ldep; 3872 struct buf *bufp; 3873 int count; 3874 int b_idx; 3875 int dring_size; 3876 int cancelled = 0; 3877 3878 ASSERT(MUTEX_HELD(&vdcp->lock)); 3879 ASSERT(vdcp->state == VDC_STATE_INIT || 3880 vdcp->state == VDC_STATE_INIT_WAITING || 3881 vdcp->state == VDC_STATE_NEGOTIATE || 3882 vdcp->state == VDC_STATE_RESETTING); 3883 3884 if (vdcp->local_dring_backup == NULL) { 3885 /* the pending requests have already been processed */ 3886 return; 3887 } 3888 3889 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n", 3890 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3891 3892 /* 3893 * Walk the backup copy of the local descriptor ring and 3894 * cancel all the outstanding transactions. 3895 */ 3896 b_idx = vdcp->local_dring_backup_tail; 3897 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3898 3899 ldep = &(vdcp->local_dring_backup[b_idx]); 3900 3901 /* only cancel outstanding transactions */ 3902 if (!ldep->is_free) { 3903 3904 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx); 3905 cancelled++; 3906 3907 /* 3908 * All requests have already been cleared from the 3909 * local descriptor ring and the LDC channel has been 3910 * reset so we will never get any reply for these 3911 * requests. Now we just have to notify threads waiting 3912 * for replies that the request has failed. 
3913 			 */
3914 			switch (ldep->cb_type) {
3915 			case CB_SYNC:
3916 				ASSERT(vdcp->sync_op_pending);
3917 				vdcp->sync_op_status = EIO;
3918 				vdcp->sync_op_pending = B_FALSE;
3919 				cv_signal(&vdcp->sync_pending_cv);
3920 				break;
3921 
3922 			case CB_STRATEGY:
3923 				bufp = ldep->cb_arg;
3924 				ASSERT(bufp != NULL);
3925 				bufp->b_resid = bufp->b_bcount;
3926 				VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
3927 				VD_KSTAT_RUNQ_EXIT(vdcp);
3928 				DTRACE_IO1(done, buf_t *, bufp);
3929 				bioerror(bufp, EIO);
3930 				biodone(bufp);
3931 				break;
3932 
3933 			default:
3934 				ASSERT(0);
3935 			}
3936 
3937 		}
3938 
3939 		/* get the next element to cancel */
3940 		if (++b_idx >= vdcp->local_dring_backup_len)
3941 			b_idx = 0;
3942 	}
3943 
3944 	/* all done - now clear up pending dring copy */
3945 	dring_size = vdcp->local_dring_backup_len *
3946 	    sizeof (vdcp->local_dring_backup[0]);
3947 
3948 	(void) kmem_free(vdcp->local_dring_backup, dring_size);
3949 
3950 	vdcp->local_dring_backup = NULL;
3951 
3952 	DTRACE_PROBE2(cancelled, int, cancelled, vdc_t *, vdcp);
3953 }
3954 
3955 /*
3956  * Function:
3957  *	vdc_connection_timeout
3958  *
3959  * Description:
3960  *	This function is invoked if the timeout set to establish the connection
3961  *	with vds expires. This will happen if we spend too much time in the
3962  *	VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. In that case
3963  *	we cancel any pending requests and mark them as failed.
3964  *
3965  *	If the timeout does not expire, it will be cancelled when we reach the
3966  *	VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can
3967  *	be invoked while we are in the VDC_STATE_HANDLE_PENDING or
3968  *	VDC_STATE_RESETTING state in which case we do nothing because the
3969  *	timeout is being cancelled.
3970  *
3971  * Arguments:
3972  *	arg	- argument of the timeout function, actually a soft state
3973  *		  pointer for the instance of the device driver.
3974  *
3975  * Return Code:
3976  *	None
3977  */
3978 void
3979 vdc_connection_timeout(void *arg)
3980 {
3981 	vdc_t	*vdcp = (vdc_t *)arg;
3982 
3983 	mutex_enter(&vdcp->lock);
3984 
3985 	if (vdcp->state == VDC_STATE_HANDLE_PENDING ||
3986 	    vdcp->state == VDC_STATE_DETACH) {
3987 		/*
3988 		 * The connection has just been re-established or
3989 		 * we are detaching.
3990 		 */
3991 		vdcp->ctimeout_reached = B_FALSE;
3992 		mutex_exit(&vdcp->lock);
3993 		return;
3994 	}
3995 
3996 	vdcp->ctimeout_reached = B_TRUE;
3997 
3998 	/* notify requests waiting to be sent */
3999 	cv_broadcast(&vdcp->running_cv);
4000 
4001 	/* cancel requests waiting for a result */
4002 	vdc_cancel_backup_dring(vdcp);
4003 
4004 	mutex_exit(&vdcp->lock);
4005 
4006 	cmn_err(CE_NOTE, "[%d] connection to service domain timed out",
4007 	    vdcp->instance);
4008 }
4009 
4010 /*
4011  * Function:
4012  *	vdc_backup_local_dring()
4013  *
4014  * Description:
4015  *	Backup the current dring in the event of a reset. The Dring
4016  *	transactions will be resubmitted to the server when the
4017  *	connection is restored.
4018  *
4019  * Arguments:
4020  *	vdcp	- soft state pointer for this instance of the device driver.
4021  *
4022  * Return Code:
4023  *	None
4024  */
4025 static void
4026 vdc_backup_local_dring(vdc_t *vdcp)
4027 {
4028 	int dring_size;
4029 
4030 	ASSERT(MUTEX_HELD(&vdcp->lock));
4031 	ASSERT(vdcp->state == VDC_STATE_RESETTING);
4032 
4033 	/*
4034 	 * If the backup dring is still around, it means
4035 	 * that the last restore did not complete. However,
4036 	 * since we never got back into the running state,
4037 	 * the backup copy we have is still valid.
4038 	 */
4039 	if (vdcp->local_dring_backup != NULL) {
4040 		DMSG(vdcp, 1, "reusing local descriptor ring backup "
4041 		    "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len,
4042 		    vdcp->local_dring_backup_tail);
4043 		return;
4044 	}
4045 
4046 	/*
4047 	 * The backup dring can be NULL and the local dring may not be
4048 	 * initialized. This can happen if we had a reset while establishing
4049 	 * a new connection but after the connection has timed out. In that
4050 	 * case the backup dring is NULL because the requests have been
4051 	 * cancelled, and the reset occurred before the local dring was
4052 	 * initialized.
4053 	 */
4054 	if (!(vdcp->initialized & VDC_DRING_LOCAL))
4055 		return;
4056 
4057 	DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, "
4058 	    "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx);
4059 
4060 	dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]);
4061 
4062 	vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP);
4063 	bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size);
4064 
4065 	vdcp->local_dring_backup_tail = vdcp->dring_curr_idx;
4066 	vdcp->local_dring_backup_len = vdcp->dring_len;
4067 }
4068 
4069 static void
4070 vdc_switch_server(vdc_t *vdcp)
4071 {
4072 	int		rv;
4073 	vdc_server_t	*curr_server, *new_server;
4074 
4075 	ASSERT(MUTEX_HELD(&vdcp->lock));
4076 
4077 	/* nothing to do if there is only one server */
4078 	if (vdcp->num_servers == 1) {
4079 		return;
4080 	}
4081 
4082 	/* Get current and next server */
4083 	curr_server = vdcp->curr_server;
4084 	new_server =
4085 	    (curr_server->next) ? curr_server->next : vdcp->server_list;
4086 	ASSERT(curr_server != new_server);
4087 
4088 	/* bring current server's channel down */
4089 	rv = ldc_down(curr_server->ldc_handle);
4090 	if (rv) {
4091 		DMSG(vdcp, 0, "[%d] Cannot bring channel down, port %ld\n",
4092 		    vdcp->instance, curr_server->id);
4093 		return;
4094 	}
4095 
4096 	/* switch the server */
4097 	vdcp->curr_server = new_server;
4098 
4099 	DMSG(vdcp, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n",
4100 	    vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id);
4101 }
4102 
4103 /* -------------------------------------------------------------------------- */
4104 
4105 /*
4106  * The following functions process the incoming messages from vds
4107  */
4108 
4109 /*
4110  * Function:
4111  *	vdc_process_msg_thread()
4112  *
4113  * Description:
4114  *
4115  *	Main VDC message processing thread. Each vDisk instance has
4116  *	its own copy of this thread. It triggers all the handshakes
4117  *	and data exchange with the server, and it also handles all
4118  *	channel resets.
4119  *
4120  * Arguments:
4121  *	vdc	- soft state pointer for this instance of the device driver.
4122  *
4123  * Return Code:
4124  *	None
4125  */
4126 static void
4127 vdc_process_msg_thread(vdc_t *vdcp)
4128 {
4129 	int		status;
4130 	int		ctimeout;
4131 	timeout_id_t	tmid = 0;
4132 	clock_t		ldcup_timeout = 0;
4133 
4134 	mutex_enter(&vdcp->lock);
4135 
4136 	for (;;) {
4137 
4138 #define	Q(_s)	(vdcp->state == _s) ? #_s :
4139 		DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state,
4140 		    Q(VDC_STATE_INIT)
4141 		    Q(VDC_STATE_INIT_WAITING)
4142 		    Q(VDC_STATE_NEGOTIATE)
4143 		    Q(VDC_STATE_HANDLE_PENDING)
4144 		    Q(VDC_STATE_RUNNING)
4145 		    Q(VDC_STATE_RESETTING)
4146 		    Q(VDC_STATE_DETACH)
4147 		    "UNKNOWN");
4148 
4149 		switch (vdcp->state) {
4150 		case VDC_STATE_INIT:
4151 
4152 			/*
4153 			 * If requested, start a timeout to check if the
4154 			 * connection with vds is established in the
4155 			 * specified delay. If the timeout expires, we
4156 			 * will cancel any pending requests.
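			 *
			 * For illustration (assumed value): with ctimeout set
			 * to 5 seconds, the timeout() call below arms a
			 * one-shot callout of
			 *
			 *	5 * drv_usectohz(MICROSEC) ticks
			 *
			 * after which vdc_connection_timeout() runs and fails
			 * any pending requests.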
4103 /* -------------------------------------------------------------------------- */
4104
4105 /*
4106 * The following functions process the incoming messages from vds
4107 */
4108
4109 /*
4110 * Function:
4111 * vdc_process_msg_thread()
4112 *
4113 * Description:
4114 *
4115 * Main VDC message processing thread. Each vDisk instance
4116 * has its own copy of this thread. This thread triggers
4117 * all the handshakes and data exchange with the server. It
4118 * also handles all channel resets.
4119 *
4120 * Arguments:
4121 * vdc - soft state pointer for this instance of the device driver.
4122 *
4123 * Return Code:
4124 * None
4125 */
4126 static void
4127 vdc_process_msg_thread(vdc_t *vdcp)
4128 {
4129 int status;
4130 int ctimeout;
4131 timeout_id_t tmid = 0;
4132 clock_t ldcup_timeout = 0;
4133
4134 mutex_enter(&vdcp->lock);
4135
4136 for (;;) {
4137
4138 #define Q(_s) (vdcp->state == _s) ? #_s :
4139 DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state,
4140 Q(VDC_STATE_INIT)
4141 Q(VDC_STATE_INIT_WAITING)
4142 Q(VDC_STATE_NEGOTIATE)
4143 Q(VDC_STATE_HANDLE_PENDING)
4144 Q(VDC_STATE_RUNNING)
4145 Q(VDC_STATE_RESETTING)
4146 Q(VDC_STATE_DETACH)
4147 "UNKNOWN");
4148
4149 switch (vdcp->state) {
4150 case VDC_STATE_INIT:
4151
4152 /*
4153 * If requested, start a timeout to check if the
4154 * connection with vds is established in the
4155 * specified delay. If the timeout expires, we
4156 * will cancel any pending request.
4157 *
4158 * If a reset has occurred while establishing
4159 * the connection, we already have a timeout armed
4160 * and in that case we don't need to arm a new one.
4161 *
4162 * The same rule applies when there are multiple vds instances.
4163 * If either a connection cannot be established or
4164 * the handshake times out, the connection thread will
4165 * try another server. The 'ctimeout' will report
4166 * back an error after it expires irrespective of
4167 * whether the vdisk is trying to connect to just
4168 * one or multiple servers.
4169 */
4170 ctimeout = (vdc_timeout != 0)?
4171 vdc_timeout : vdcp->curr_server->ctimeout;
4172
4173 if (ctimeout != 0 && tmid == 0) {
4174 tmid = timeout(vdc_connection_timeout, vdcp,
4175 ctimeout * drv_usectohz(MICROSEC));
4176 }
4177
4178 /* Check if we are re-initializing repeatedly */
4179 if (vdcp->hshake_cnt > vdc_hshake_retries &&
4180 vdcp->lifecycle != VDC_LC_ONLINE) {
4181
4182 DMSG(vdcp, 0, "[%d] too many handshakes, cnt=%d",
4183 vdcp->instance, vdcp->hshake_cnt);
4184 cmn_err(CE_NOTE, "[%d] disk access failed.\n",
4185 vdcp->instance);
4186 vdcp->state = VDC_STATE_DETACH;
4187 break;
4188 }
4189
4190 /* Switch to STATE_DETACH if drv is detaching */
4191 if (vdcp->lifecycle == VDC_LC_DETACHING) {
4192 vdcp->state = VDC_STATE_DETACH;
4193 break;
4194 }
4195
4196 /* Switch server */
4197 if (vdcp->hshake_cnt > 0)
4198 vdc_switch_server(vdcp);
4199 vdcp->hshake_cnt++;
4200
4201 /* Bring up connection with vds via LDC */
4202 status = vdc_start_ldc_connection(vdcp);
4203 if (status != EINVAL) {
4204 vdcp->state = VDC_STATE_INIT_WAITING;
4205 }
4206 break;
4207
4208 case VDC_STATE_INIT_WAITING:
4209
4210 /* if channel is UP, start negotiation */
4211 if (vdcp->curr_server->ldc_state == LDC_UP) {
4212 vdcp->state = VDC_STATE_NEGOTIATE;
4213 break;
4214 }
4215
4216 /* check if only one server exists */
4217 if (vdcp->num_servers == 1) {
4218 cv_wait(&vdcp->initwait_cv, &vdcp->lock);
4219 } else {
4220 /*
4221 * wait for LDC_UP; if it times out, switch
4222 * to another server.
4223 */ 4224 ldcup_timeout = ddi_get_lbolt() + 4225 (vdc_ldcup_timeout * 4226 drv_usectohz(MICROSEC)); 4227 status = cv_timedwait(&vdcp->initwait_cv, 4228 &vdcp->lock, ldcup_timeout); 4229 if (status == -1 && 4230 vdcp->state == VDC_STATE_INIT_WAITING && 4231 vdcp->curr_server->ldc_state != LDC_UP) { 4232 /* timed out & still waiting */ 4233 vdcp->state = VDC_STATE_INIT; 4234 break; 4235 } 4236 } 4237 4238 if (vdcp->state != VDC_STATE_INIT_WAITING) { 4239 DMSG(vdcp, 0, 4240 "state moved to %d out from under us...\n", 4241 vdcp->state); 4242 } 4243 break; 4244 4245 case VDC_STATE_NEGOTIATE: 4246 switch (status = vdc_ver_negotiation(vdcp)) { 4247 case 0: 4248 break; 4249 default: 4250 DMSG(vdcp, 0, "ver negotiate failed (%d)..\n", 4251 status); 4252 goto reset; 4253 } 4254 4255 switch (status = vdc_attr_negotiation(vdcp)) { 4256 case 0: 4257 break; 4258 default: 4259 DMSG(vdcp, 0, "attr negotiate failed (%d)..\n", 4260 status); 4261 goto reset; 4262 } 4263 4264 switch (status = vdc_dring_negotiation(vdcp)) { 4265 case 0: 4266 break; 4267 default: 4268 DMSG(vdcp, 0, "dring negotiate failed (%d)..\n", 4269 status); 4270 goto reset; 4271 } 4272 4273 switch (status = vdc_rdx_exchange(vdcp)) { 4274 case 0: 4275 vdcp->state = VDC_STATE_HANDLE_PENDING; 4276 goto done; 4277 default: 4278 DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n", 4279 status); 4280 goto reset; 4281 } 4282 reset: 4283 DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n", 4284 status); 4285 vdcp->state = VDC_STATE_RESETTING; 4286 vdcp->self_reset = B_TRUE; 4287 done: 4288 DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", 4289 vdcp->state); 4290 break; 4291 4292 case VDC_STATE_HANDLE_PENDING: 4293 4294 if (vdcp->ctimeout_reached) { 4295 /* 4296 * The connection timeout had been reached so 4297 * pending requests have been cancelled. Now 4298 * that the connection is back we can reset 4299 * the timeout. 4300 */ 4301 ASSERT(vdcp->local_dring_backup == NULL); 4302 ASSERT(tmid != 0); 4303 tmid = 0; 4304 vdcp->ctimeout_reached = B_FALSE; 4305 vdcp->state = VDC_STATE_RUNNING; 4306 DMSG(vdcp, 0, "[%d] connection to service " 4307 "domain is up", vdcp->instance); 4308 break; 4309 } 4310 4311 mutex_exit(&vdcp->lock); 4312 if (tmid != 0) { 4313 (void) untimeout(tmid); 4314 tmid = 0; 4315 } 4316 status = vdc_resubmit_backup_dring(vdcp); 4317 mutex_enter(&vdcp->lock); 4318 4319 if (status) 4320 vdcp->state = VDC_STATE_RESETTING; 4321 else 4322 vdcp->state = VDC_STATE_RUNNING; 4323 4324 break; 4325 4326 /* enter running state */ 4327 case VDC_STATE_RUNNING: 4328 /* 4329 * Signal anyone waiting for the connection 4330 * to come on line. 
4331 */
4332 vdcp->hshake_cnt = 0;
4333 cv_broadcast(&vdcp->running_cv);
4334
4335 /* failfast has to be checked after reset */
4336 cv_signal(&vdcp->failfast_cv);
4337
4338 /* ownership is lost during reset */
4339 if (vdcp->ownership & VDC_OWNERSHIP_WANTED)
4340 vdcp->ownership |= VDC_OWNERSHIP_RESET;
4341 cv_signal(&vdcp->ownership_cv);
4342
4343 cmn_err(CE_CONT, "?vdisk@%d is online using "
4344 "ldc@%ld,%ld\n", vdcp->instance,
4345 vdcp->curr_server->ldc_id, vdcp->curr_server->id);
4346
4347 mutex_exit(&vdcp->lock);
4348
4349 for (;;) {
4350 vio_msg_t msg;
4351 status = vdc_wait_for_response(vdcp, &msg);
4352 if (status) break;
4353
4354 DMSG(vdcp, 1, "[%d] new pkt(s) available\n",
4355 vdcp->instance);
4356 status = vdc_process_data_msg(vdcp, &msg);
4357 if (status) {
4358 DMSG(vdcp, 1, "[%d] process_data_msg "
4359 "returned err=%d\n", vdcp->instance,
4360 status);
4361 break;
4362 }
4363
4364 }
4365
4366 mutex_enter(&vdcp->lock);
4367
4368 cmn_err(CE_CONT, "?vdisk@%d is offline\n",
4369 vdcp->instance);
4370
4371 vdcp->state = VDC_STATE_RESETTING;
4372 vdcp->self_reset = B_TRUE;
4373 break;
4374
4375 case VDC_STATE_RESETTING:
4376 /*
4377 * When we reach this state, we either come from the
4378 * VDC_STATE_RUNNING state and we can have pending
4379 * requests but no timeout is armed; or we come from
4380 * the VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE or
4381 * VDC_STATE_HANDLE_PENDING state and there is no pending
4382 * request or pending requests have already been copied
4383 * into the backup dring. So we can safely keep the
4384 * connection timeout armed while we are in this state.
4385 */
4386
4387 DMSG(vdcp, 0, "Initiating channel reset "
4388 "(pending = %d)\n", (int)vdcp->threads_pending);
4389
4390 if (vdcp->self_reset) {
4391 DMSG(vdcp, 0,
4392 "[%d] calling stop_ldc_connection.\n",
4393 vdcp->instance);
4394 status = vdc_stop_ldc_connection(vdcp);
4395 vdcp->self_reset = B_FALSE;
4396 }
4397
4398 /*
4399 * Wait for all threads currently waiting
4400 * for a free dring entry to use.
4401 */
4402 while (vdcp->threads_pending) {
4403 cv_broadcast(&vdcp->membind_cv);
4404 cv_broadcast(&vdcp->dring_free_cv);
4405 mutex_exit(&vdcp->lock);
4406 /* give the waiters enough time to wake up */
4407 delay(vdc_hz_min_ldc_delay);
4408 mutex_enter(&vdcp->lock);
4409 }
4410
4411 ASSERT(vdcp->threads_pending == 0);
4412
4413 /* Sanity check that no thread is receiving */
4414 ASSERT(vdcp->read_state != VDC_READ_WAITING);
4415
4416 vdcp->read_state = VDC_READ_IDLE;
4417
4418 vdc_backup_local_dring(vdcp);
4419
4420 /* clean up the old d-ring */
4421 vdc_destroy_descriptor_ring(vdcp);
4422
4423 /* go and start again */
4424 vdcp->state = VDC_STATE_INIT;
4425
4426 break;
4427
4428 case VDC_STATE_DETACH:
4429 DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n",
4430 vdcp->instance);
4431
4432 /* cancel any pending timeout */
4433 mutex_exit(&vdcp->lock);
4434 if (tmid != 0) {
4435 (void) untimeout(tmid);
4436 tmid = 0;
4437 }
4438 mutex_enter(&vdcp->lock);
4439
4440 /*
4441 * Signal anyone waiting for the connection
4442 * to come online.
4443 */
4444 cv_broadcast(&vdcp->running_cv);
4445
4446 while (vdcp->sync_op_pending) {
4447 cv_signal(&vdcp->sync_pending_cv);
4448 cv_signal(&vdcp->sync_blocked_cv);
4449 mutex_exit(&vdcp->lock);
4450 /* give the waiters enough time to wake up */
4451 delay(vdc_hz_min_ldc_delay);
4452 mutex_enter(&vdcp->lock);
4453 }
4454
4455 mutex_exit(&vdcp->lock);
4456
4457 DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n",
4458 vdcp->instance);
4459 thread_exit();
4460 break;
4461 }
4462 }
4463 }
4464
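/*
 * Illustrative sketch (not part of the driver): the INIT_WAITING state
 * above uses the classic cv_timedwait(9F) idiom, where the deadline is
 * an absolute lbolt value and a return of -1 indicates that the wait
 * timed out. Hypothetical names:
 *
 *	clock_t deadline = ddi_get_lbolt() + 5 * drv_usectohz(MICROSEC);
 *
 *	mutex_enter(&sp->lock);
 *	while (!sp->ready) {
 *		if (cv_timedwait(&sp->cv, &sp->lock, deadline) == -1)
 *			break;			// timed out, give up
 *	}
 *	mutex_exit(&sp->lock);
 *
 * Re-checking the condition after the wait returns, as the driver does
 * with ldc_state, guards against spurious wakeups.
 */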
4465
4466 /*
4467 * Function:
4468 * vdc_process_data_msg()
4469 *
4470 * Description:
4471 * This function is called by the message processing thread each time
4472 * a message with a msgtype of VIO_TYPE_DATA is received. It will either
4473 * be an ACK or NACK from vds[1] which vdc handles as follows.
4474 * ACK - wake up the waiting thread
4475 * NACK - resend any messages necessary
4476 *
4477 * [1] Although the message format allows it, vds should not send a
4478 * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for
4479 * some bizarre reason it does, vdc will reset the connection.
4480 *
4481 * Arguments:
4482 * vdc - soft state pointer for this instance of the device driver.
4483 * msg - the LDC message sent by vds
4484 *
4485 * Return Code:
4486 * 0 - Success.
4487 * > 0 - error value returned by LDC
4488 */
4489 static int
4490 vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg)
4491 {
4492 int status = 0;
4493 vio_dring_msg_t *dring_msg;
4494 vdc_local_desc_t *ldep = NULL;
4495 int start, end;
4496 int idx;
4497 int op;
4498
4499 dring_msg = (vio_dring_msg_t *)msg;
4500
4501 ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA);
4502 ASSERT(vdcp != NULL);
4503
4504 mutex_enter(&vdcp->lock);
4505
4506 /*
4507 * Check to see if the message has bogus data
4508 */
4509 idx = start = dring_msg->start_idx;
4510 end = dring_msg->end_idx;
4511 if ((start >= vdcp->dring_len) ||
4512 (end >= vdcp->dring_len) || (end < -1)) {
4513 /*
4514 * Update the I/O statistics to indicate that an error occurred.
4515 * No need to update the wait/run queues as no specific read or
4516 * write request is being completed in response to this 'msg'.
4517 */
4518 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4519 DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n",
4520 vdcp->instance, start, end);
4521 mutex_exit(&vdcp->lock);
4522 return (EINVAL);
4523 }
4524
4525 /*
4526 * Verify that the sequence number is what vdc expects.
4527 */
4528 switch (vdc_verify_seq_num(vdcp, dring_msg)) {
4529 case VDC_SEQ_NUM_TODO:
4530 break; /* keep processing this message */
4531 case VDC_SEQ_NUM_SKIP:
4532 mutex_exit(&vdcp->lock);
4533 return (0);
4534 case VDC_SEQ_NUM_INVALID:
4535 /*
4536 * Update the I/O statistics to indicate that an error occurred.
4537 * No need to update the wait/run queues as no specific read or
4538 * write request is being completed in response to this 'msg'.
4539 */
4540 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4541 DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance);
4542 mutex_exit(&vdcp->lock);
4543 return (ENXIO);
4544 }
4545
4546 if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) {
4547 /*
4548 * Update the I/O statistics to indicate that an error occurred.
4549 *
4550 * We need to update the run queue if a read or write request
4551 * is being NACKed - otherwise there will appear to be an
4552 * indefinite outstanding request and statistics reported by
4553 * iostat(1M) will be incorrect. The transaction will be
4554 * resubmitted from the backup DRing following the reset
4555 * and the wait/run queues will be entered again.
4556 */
4557 ldep = &vdcp->local_dring[idx];
4558 op = ldep->operation;
4559 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) {
4560 DTRACE_IO1(done, buf_t *, ldep->cb_arg);
4561 VD_KSTAT_RUNQ_EXIT(vdcp);
4562 }
4563 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4564 VDC_DUMP_DRING_MSG(dring_msg);
4565 DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance);
4566 mutex_exit(&vdcp->lock);
4567 return (EIO);
4568
4569 } else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) {
4570 /*
4571 * Update the I/O statistics to indicate that an error occurred.
4572 * No need to update the wait/run queues as no specific read or
4573 * write request is being completed in response to this 'msg'.
4574 */
4575 VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs);
4576 mutex_exit(&vdcp->lock);
4577 return (EPROTO);
4578 }
4579
4580 DMSG(vdcp, 1, ": start %d end %d\n", start, end);
4581 ASSERT(start == end);
4582
4583 ldep = &vdcp->local_dring[idx];
4584
4585 DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n",
4586 ldep->dep->hdr.dstate, ldep->cb_type);
4587
4588 if (ldep->dep->hdr.dstate == VIO_DESC_DONE) {
4589 struct buf *bufp;
4590
4591 switch (ldep->cb_type) {
4592 case CB_SYNC:
4593 ASSERT(vdcp->sync_op_pending);
4594
4595 status = vdc_depopulate_descriptor(vdcp, idx);
4596 vdcp->sync_op_status = status;
4597 vdcp->sync_op_pending = B_FALSE;
4598 cv_signal(&vdcp->sync_pending_cv);
4599 break;
4600
4601 case CB_STRATEGY:
4602 bufp = ldep->cb_arg;
4603 ASSERT(bufp != NULL);
4604 bufp->b_resid =
4605 bufp->b_bcount - ldep->dep->payload.nbytes;
4606 status = ldep->dep->payload.status; /* Future:ntoh */
4607 if (status != 0) {
4608 DMSG(vdcp, 1, "strategy status=%d\n", status);
4609 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4610 bioerror(bufp, status);
4611 }
4612
4613 (void) vdc_depopulate_descriptor(vdcp, idx);
4614
4615 DMSG(vdcp, 1,
4616 "strategy complete req=%ld bytes resp=%ld bytes\n",
4617 bufp->b_bcount, ldep->dep->payload.nbytes);
4618
4619 if (status != 0 && vdcp->failfast_interval != 0) {
4620 /*
4621 * The I/O has failed and failfast is enabled.
4622 * We need the failfast thread to check if the
4623 * failure is due to a reservation conflict.
4624 */
4625 (void) vdc_failfast_io_queue(vdcp, bufp);
4626 } else {
4627 if (status == 0) {
4628 op = (bufp->b_flags & B_READ) ?
4629 VD_OP_BREAD : VD_OP_BWRITE;
4630 VD_UPDATE_IO_STATS(vdcp, op,
4631 ldep->dep->payload.nbytes);
4632 }
4633 VD_KSTAT_RUNQ_EXIT(vdcp);
4634 DTRACE_IO1(done, buf_t *, bufp);
4635 biodone(bufp);
4636 }
4637 break;
4638
4639 default:
4640 ASSERT(0);
4641 }
4642 }
4643
4644 /* let the arrival signal propagate */
4645 mutex_exit(&vdcp->lock);
4646
4647 /* probe gives the count of how many entries were processed */
4648 DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp);
4649
4650 return (0);
4651 }
4652
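/*
 * Illustrative sketch (not part of the driver): the CB_SYNC completion
 * above pairs with a waiter blocked on sync_pending_cv. The generic
 * shape of that handshake is (hypothetical names):
 *
 *	// waiter, called with sp->lock held
 *	sp->op_pending = B_TRUE;
 *	...submit the request...
 *	while (sp->op_pending)
 *		cv_wait(&sp->pending_cv, &sp->lock);
 *	err = sp->op_status;
 *
 *	// completion path, also under sp->lock
 *	sp->op_status = status;
 *	sp->op_pending = B_FALSE;
 *	cv_signal(&sp->pending_cv);
 *
 * Looping on the flag, rather than doing a single cv_wait(9F), is what
 * makes the handshake safe against spurious wakeups.
 */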
4653
4654 /*
4655 * Function:
4656 * vdc_handle_ver_msg()
4657 *
4658 * Description:
4659 * Handle a version negotiation message (VIO_VER_INFO) sent by vds.
4660 *
4661 * Arguments:
4662 * vdc - soft state pointer for this instance of the device driver.
4663 * ver_msg - LDC message sent by vDisk server
4664 *
4665 * Return Code:
4666 * 0 - Success
4667 */
4668 static int
4669 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg)
4670 {
4671 int status = 0;
4672
4673 ASSERT(vdc != NULL);
4674 ASSERT(mutex_owned(&vdc->lock));
4675
4676 if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) {
4677 return (EPROTO);
4678 }
4679
4680 if (ver_msg->dev_class != VDEV_DISK_SERVER) {
4681 return (EINVAL);
4682 }
4683
4684 switch (ver_msg->tag.vio_subtype) {
4685 case VIO_SUBTYPE_ACK:
4686 /*
4687 * We check to see if the version returned is indeed supported
4688 * (the server may have adjusted the minor number downwards,
4689 * in which case 'ver_msg' contains the actual version agreed)
4690 */
4691 if (vdc_is_supported_version(ver_msg)) {
4692 vdc->ver.major = ver_msg->ver_major;
4693 vdc->ver.minor = ver_msg->ver_minor;
4694 ASSERT(vdc->ver.major > 0);
4695 } else {
4696 status = EPROTO;
4697 }
4698 break;
4699
4700 case VIO_SUBTYPE_NACK:
4701 /*
4702 * call vdc_is_supported_version() which will return the next
4703 * supported version (if any) in 'ver_msg'
4704 */
4705 (void) vdc_is_supported_version(ver_msg);
4706 if (ver_msg->ver_major > 0) {
4707 size_t len = sizeof (*ver_msg);
4708
4709 ASSERT(vdc->ver.major > 0);
4710
4711 /* reset the necessary fields and resend */
4712 ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO;
4713 ver_msg->dev_class = VDEV_DISK;
4714
4715 status = vdc_send(vdc, (caddr_t)ver_msg, &len);
4716 DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n",
4717 vdc->instance, status);
4718 if (len != sizeof (*ver_msg))
4719 status = EBADMSG;
4720 } else {
4721 DMSG(vdc, 0, "[%d] No common version with vDisk server",
4722 vdc->instance);
4723 status = ENOTSUP;
4724 }
4725
4726 break;
4727 case VIO_SUBTYPE_INFO:
4728 /*
4729 * Handle the case where vds starts the handshake
4730 * (for now only vdc is the instigator)
4731 */
4732 status = ENOTSUP;
4733 break;
4734
4735 default:
4736 status = EINVAL;
4737 break;
4738 }
4739
4740 return (status);
4741 }
4742
4743 /*
4744 * Function:
4745 * vdc_handle_attr_msg()
4746 *
4747 * Description:
4748 * Handle an attribute exchange message (VIO_ATTR_INFO) sent by vds.
4749 *
4750 * Arguments:
4751 * vdc - soft state pointer for this instance of the device driver.
4752 * attr_msg - LDC message sent by vDisk server
4753 *
4754 * Return Code:
4755 * 0 - Success
4756 */
4757 static int
4758 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg)
4759 {
4760 int status = 0;
4761
4762 ASSERT(vdc != NULL);
4763 ASSERT(mutex_owned(&vdc->lock));
4764
4765 if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) {
4766 return (EPROTO);
4767 }
4768
4769 switch (attr_msg->tag.vio_subtype) {
4770 case VIO_SUBTYPE_ACK:
4771 /*
4772 * We now verify the attributes sent by vds.
4773 */
4774 if (attr_msg->vdisk_size == 0) {
4775 DMSG(vdc, 0, "[%d] Invalid disk size from vds",
4776 vdc->instance);
4777 status = EINVAL;
4778 break;
4779 }
4780
4781 if (attr_msg->max_xfer_sz == 0) {
4782 DMSG(vdc, 0, "[%d] Invalid transfer size from vds",
4783 vdc->instance);
4784 status = EINVAL;
4785 break;
4786 }
4787
4788 if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) {
4789 DMSG(vdc, 0, "[%d] Unknown disk size from vds",
4790 vdc->instance);
4791 attr_msg->vdisk_size = 0;
4792 }
4793
4794 /*
4795 * If the disk size is already set check that it hasn't changed.
4796 */
4797 if ((vdc->vdisk_size != 0) && (attr_msg->vdisk_size != 0) &&
4798 (vdc->vdisk_size != attr_msg->vdisk_size)) {
4799 DMSG(vdc, 0, "[%d] Different disk size from vds "
4800 "(old=0x%lx - new=0x%lx)", vdc->instance,
4801 vdc->vdisk_size, attr_msg->vdisk_size);
4802 status = EINVAL;
4803 break;
4804 }
4805
4806 vdc->vdisk_size = attr_msg->vdisk_size;
4807 vdc->vdisk_type = attr_msg->vdisk_type;
4808 vdc->operations = attr_msg->operations;
4809 if (vio_ver_is_supported(vdc->ver, 1, 1))
4810 vdc->vdisk_media = attr_msg->vdisk_media;
4811 else
4812 vdc->vdisk_media = 0;
4813
4814 DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n",
4815 vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz);
4816 DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n",
4817 vdc->instance, vdc->block_size,
4818 attr_msg->vdisk_block_size);
4819
4820 /*
4821 * We don't know at compile time what the vDisk server will
4822 * think are good values but we apply a large (arbitrary)
4823 * upper bound to prevent memory exhaustion in vdc if it was
4824 * allocating a DRing based on huge values sent by the server.
4825 * We probably will never exceed this except if the message
4826 * was garbage.
4827 */
4828 if ((attr_msg->max_xfer_sz * attr_msg->vdisk_block_size) <=
4829 (PAGESIZE * DEV_BSIZE)) {
4830 vdc->max_xfer_sz = attr_msg->max_xfer_sz;
4831 vdc->block_size = attr_msg->vdisk_block_size;
4832 } else {
4833 DMSG(vdc, 0, "[%d] vds block transfer size too big;"
4834 " using max supported by vdc", vdc->instance);
4835 }
4836
4837 if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) ||
4838 (attr_msg->vdisk_size > INT64_MAX) ||
4839 (attr_msg->operations == 0) ||
4840 (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) {
4841 DMSG(vdc, 0, "[%d] Invalid attributes from vds",
4842 vdc->instance);
4843 status = EINVAL;
4844 break;
4845 }
4846
4847 /*
4848 * Now that we have received all attributes we can create a
4849 * fake geometry for the disk.
4850 */
4851 vdc_create_fake_geometry(vdc);
4852 break;
4853
4854 case VIO_SUBTYPE_NACK:
4855 /*
4856 * vds could not handle the attributes we sent so we
4857 * stop negotiating.
4858 */
4859 status = EPROTO;
4860 break;
4861
4862 case VIO_SUBTYPE_INFO:
4863 /*
4864 * Handle the case where vds starts the handshake
4865 * (for now, vdc is the only supported instigator)
4866 */
4867 status = ENOTSUP;
4868 break;
4869
4870 default:
4871 status = ENOTSUP;
4872 break;
4873 }
4874
4875 return (status);
4876 }
4877
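/*
 * Illustrative note (not part of the driver): with the usual sun4v
 * values of PAGESIZE = 8192 and DEV_BSIZE = 512, the sanity bound used
 * in vdc_handle_attr_msg() above works out to
 *
 *	PAGESIZE * DEV_BSIZE = 8192 * 512 = 4 MB
 *
 * so an attribute message advertising, say, max_xfer_sz = 2048 blocks
 * of vdisk_block_size = 512 bytes (a 1 MB transfer) is accepted, while
 * values whose product exceeds 4 MB are ignored and vdc keeps the
 * transfer size it originally proposed.
 */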
4878 /*
4879 * Function:
4880 * vdc_handle_dring_reg_msg()
4881 *
4882 * Description:
4883 * Handle a descriptor ring registration message (VIO_DRING_REG)
4884 * sent by vds.
4885 *
4886 * Arguments:
4887 * vdc - soft state pointer for this instance of the driver.
4888 * dring_msg - LDC message sent by vDisk server
4889 *
4890 * Return Code:
4891 * 0 - Success
4892 */
4893 static int
4894 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg)
4895 {
4896 int status = 0;
4897
4898 ASSERT(vdc != NULL);
4899 ASSERT(mutex_owned(&vdc->lock));
4900
4901 if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) {
4902 return (EPROTO);
4903 }
4904
4905 switch (dring_msg->tag.vio_subtype) {
4906 case VIO_SUBTYPE_ACK:
4907 /* save the received dring_ident */
4908 vdc->dring_ident = dring_msg->dring_ident;
4909 DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n",
4910 vdc->instance, vdc->dring_ident);
4911 break;
4912
4913 case VIO_SUBTYPE_NACK:
4914 /*
4915 * vds could not handle the DRing info we sent so we
4916 * stop negotiating.
4917 */
4918 DMSG(vdc, 0, "[%d] server could not register DRing\n",
4919 vdc->instance);
4920 status = EPROTO;
4921 break;
4922
4923 case VIO_SUBTYPE_INFO:
4924 /*
4925 * Handle the case where vds starts the handshake
4926 * (for now only vdc is the instigator)
4927 */
4928 status = ENOTSUP;
4929 break;
4930 default:
4931 status = ENOTSUP;
4932 }
4933
4934 return (status);
4935 }
4936
4937 /*
4938 * Function:
4939 * vdc_verify_seq_num()
4940 *
4941 * Description:
4942 * This function verifies that the sequence number sent back by the vDisk
4943 * server with the latest message is what is expected (i.e. it is greater
4944 * than the last seq num received from the vDisk server and less than or
4945 * equal to the last seq num generated by vdc).
4946 *
4947 * It then checks the request ID to see if any requests need processing
4948 * in the DRing.
4949 *
4950 * Arguments:
4951 * vdc - soft state pointer for this instance of the driver.
4952 * dring_msg - pointer to the LDC message sent by vds
4953 *
4954 * Return Code:
4955 * VDC_SEQ_NUM_TODO - Message needs to be processed
4956 * VDC_SEQ_NUM_SKIP - Message has already been processed
4957 * VDC_SEQ_NUM_INVALID - The seq numbers are so out of sync,
4958 * vdc cannot deal with them
4959 */
4960 static int
4961 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg)
4962 {
4963 ASSERT(vdc != NULL);
4964 ASSERT(dring_msg != NULL);
4965 ASSERT(mutex_owned(&vdc->lock));
4966
4967 /*
4968 * Check to see if the messages were responded to in the correct
4969 * order by vds.
4970 */
4971 if ((dring_msg->seq_num <= vdc->seq_num_reply) ||
4972 (dring_msg->seq_num > vdc->seq_num)) {
4973 DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: "
4974 "%lu > expected <= %lu (last proc req %lu sent %lu)\n",
4975 vdc->instance, dring_msg->seq_num,
4976 vdc->seq_num_reply, vdc->seq_num,
4977 vdc->req_id_proc, vdc->req_id);
4978 return (VDC_SEQ_NUM_INVALID);
4979 }
4980 vdc->seq_num_reply = dring_msg->seq_num;
4981
4982 if (vdc->req_id_proc < vdc->req_id)
4983 return (VDC_SEQ_NUM_TODO);
4984 else
4985 return (VDC_SEQ_NUM_SKIP);
4986 }
4987
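/*
 * Illustrative sketch (not part of the driver): the window check above
 * accepts a reply sequence number only if it falls in the half-open
 * interval (last_reply, last_sent]. A self-contained version of the
 * predicate (hypothetical names):
 *
 *	static boolean_t
 *	seq_num_in_window(uint64_t seq, uint64_t last_reply,
 *	    uint64_t last_sent)
 *	{
 *		return (seq > last_reply && seq <= last_sent);
 *	}
 *
 * Anything at or below last_reply is a duplicate of an already
 * processed reply; anything above last_sent answers a request that was
 * never issued. Both cases indicate a protocol failure.
 */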
4988 /*
4989 * Function:
4990 * vdc_is_supported_version()
4991 *
4992 * Description:
4993 * This routine checks if the major/minor version numbers specified in
4994 * 'ver_msg' are supported. If not, it finds the next version that is
4995 * in the supported version list 'vdc_version[]' and sets the fields in
4996 * 'ver_msg' to those values.
4997 *
4998 * Arguments:
4999 * ver_msg - LDC message sent by vDisk server
5000 *
5001 * Return Code:
5002 * B_TRUE - Success
5003 * B_FALSE - Version not supported
5004 */
5005 static boolean_t
5006 vdc_is_supported_version(vio_ver_msg_t *ver_msg)
5007 {
5008 int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]);
5009
5010 for (int i = 0; i < vdc_num_versions; i++) {
5011 ASSERT(vdc_version[i].major > 0);
5012 ASSERT((i == 0) ||
5013 (vdc_version[i].major < vdc_version[i-1].major));
5014
5015 /*
5016 * If the major versions match, adjust the minor version, if
5017 * necessary, down to the highest value supported by this
5018 * client. The server should support all minor versions lower
5019 * than the value it sent.
5020 */
5021 if (ver_msg->ver_major == vdc_version[i].major) {
5022 if (ver_msg->ver_minor > vdc_version[i].minor) {
5023 DMSGX(0,
5024 "Adjusting minor version from %u to %u",
5025 ver_msg->ver_minor, vdc_version[i].minor);
5026 ver_msg->ver_minor = vdc_version[i].minor;
5027 }
5028 return (B_TRUE);
5029 }
5030
5031 /*
5032 * If the message contains a higher major version number, set
5033 * the message's major/minor versions to the current values
5034 * and return false, so this message will get resent with
5035 * these values, and the server will potentially try again
5036 * with the same or a lower version.
5037 */
5038 if (ver_msg->ver_major > vdc_version[i].major) {
5039 ver_msg->ver_major = vdc_version[i].major;
5040 ver_msg->ver_minor = vdc_version[i].minor;
5041 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n",
5042 ver_msg->ver_major, ver_msg->ver_minor);
5043
5044 return (B_FALSE);
5045 }
5046
5047 /*
5048 * Otherwise, the message's major version is less than the
5049 * current major version, so continue the loop to the next
5050 * (lower) supported version.
5051 */
5052 }
5053
5054 /*
5055 * No common version was found; "ground" the version pair in the
5056 * message to terminate negotiation.
5057 */
5058 ver_msg->ver_major = 0;
5059 ver_msg->ver_minor = 0;
5060
5061 return (B_FALSE);
5062 }
5063 /* -------------------------------------------------------------------------- */
5064
5065 /*
5066 * DKIO(7I) support
5067 */
5068
5069 typedef struct vdc_dk_arg {
5070 struct dk_callback dkc;
5071 int mode;
5072 dev_t dev;
5073 vdc_t *vdc;
5074 } vdc_dk_arg_t;
5075
5076 /*
5077 * Function:
5078 * vdc_dkio_flush_cb()
5079 *
5080 * Description:
5081 * This routine is a callback for DKIOCFLUSHWRITECACHE which can be called
5082 * by kernel code.
5083 *
5084 * Arguments:
5085 * arg - a pointer to a vdc_dk_arg_t structure.
5086 */
5087 void
5088 vdc_dkio_flush_cb(void *arg)
5089 {
5090 struct vdc_dk_arg *dk_arg = (struct vdc_dk_arg *)arg;
5091 struct dk_callback *dkc = NULL;
5092 vdc_t *vdc = NULL;
5093 int rv;
5094
5095 if (dk_arg == NULL) {
5096 cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n");
5097 return;
5098 }
5099 dkc = &dk_arg->dkc;
5100 vdc = dk_arg->vdc;
5101 ASSERT(vdc != NULL);
5102
5103 rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0,
5104 VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);
5105 if (rv != 0) {
5106 DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n",
5107 vdc->instance, rv,
5108 ddi_model_convert_from(dk_arg->mode & FMODELS));
5109 }
5110
5111 /*
5112 * Trigger the callback to notify the caller that the ioctl call has
5113 * been completed.
5111 */ 5112 if ((dk_arg->mode & FKIOCTL) && 5113 (dkc != NULL) && 5114 (dkc->dkc_callback != NULL)) { 5115 ASSERT(dkc->dkc_cookie != NULL); 5116 (*dkc->dkc_callback)(dkc->dkc_cookie, rv); 5117 } 5118 5119 /* Indicate that one less DKIO write flush is outstanding */ 5120 mutex_enter(&vdc->lock); 5121 vdc->dkio_flush_pending--; 5122 ASSERT(vdc->dkio_flush_pending >= 0); 5123 mutex_exit(&vdc->lock); 5124 5125 /* free the mem that was allocated when the callback was dispatched */ 5126 kmem_free(arg, sizeof (vdc_dk_arg_t)); 5127 } 5128 5129 /* 5130 * Function: 5131 * vdc_dkio_gapart() 5132 * 5133 * Description: 5134 * This function implements the DKIOCGAPART ioctl. 5135 * 5136 * Arguments: 5137 * vdc - soft state pointer 5138 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 5139 * flag - ioctl flags 5140 */ 5141 static int 5142 vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag) 5143 { 5144 struct dk_geom *geom; 5145 struct vtoc *vtoc; 5146 union { 5147 struct dk_map map[NDKMAP]; 5148 struct dk_map32 map32[NDKMAP]; 5149 } data; 5150 int i, rv, size; 5151 5152 mutex_enter(&vdc->lock); 5153 5154 if ((rv = vdc_validate_geometry(vdc)) != 0) { 5155 mutex_exit(&vdc->lock); 5156 return (rv); 5157 } 5158 5159 vtoc = vdc->vtoc; 5160 geom = vdc->geom; 5161 5162 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5163 5164 for (i = 0; i < vtoc->v_nparts; i++) { 5165 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 5166 (geom->dkg_nhead * geom->dkg_nsect); 5167 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 5168 } 5169 size = NDKMAP * sizeof (struct dk_map32); 5170 5171 } else { 5172 5173 for (i = 0; i < vtoc->v_nparts; i++) { 5174 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 5175 (geom->dkg_nhead * geom->dkg_nsect); 5176 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 5177 } 5178 size = NDKMAP * sizeof (struct dk_map); 5179 5180 } 5181 5182 mutex_exit(&vdc->lock); 5183 5184 if (ddi_copyout(&data, arg, size, flag) != 0) 5185 return (EFAULT); 5186 5187 return (0); 5188 } 5189 5190 /* 5191 * Function: 5192 * vdc_dkio_partition() 5193 * 5194 * Description: 5195 * This function implements the DKIOCPARTITION ioctl. 5196 * 5197 * Arguments: 5198 * vdc - soft state pointer 5199 * arg - a pointer to a struct partition64 structure 5200 * flag - ioctl flags 5201 */ 5202 static int 5203 vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag) 5204 { 5205 struct partition64 p64; 5206 efi_gpt_t *gpt; 5207 efi_gpe_t *gpe; 5208 vd_efi_dev_t edev; 5209 uint_t partno; 5210 int rv; 5211 5212 if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) { 5213 return (EFAULT); 5214 } 5215 5216 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 5217 5218 if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) { 5219 return (rv); 5220 } 5221 5222 partno = p64.p_partno; 5223 5224 if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) { 5225 vd_efi_free(&edev, gpt, gpe); 5226 return (ESRCH); 5227 } 5228 5229 bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type, 5230 sizeof (struct uuid)); 5231 p64.p_start = gpe[partno].efi_gpe_StartingLBA; 5232 p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1; 5233 5234 if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) { 5235 vd_efi_free(&edev, gpt, gpe); 5236 return (EFAULT); 5237 } 5238 5239 vd_efi_free(&edev, gpt, gpe); 5240 return (0); 5241 } 5242 5243 /* 5244 * Function: 5245 * vdc_dioctl_rwcmd() 5246 * 5247 * Description: 5248 * This function implements the DIOCTL_RWCMD ioctl. 
This ioctl is used
5249 * for DKC_DIRECT disks to read or write at an absolute disk offset.
5250 *
5251 * Arguments:
5252 * dev - device
5253 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure
5254 * flag - ioctl flags
5255 */
5256 static int
5257 vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag)
5258 {
5259 struct dadkio_rwcmd32 rwcmd32;
5260 struct dadkio_rwcmd rwcmd;
5261 struct iovec aiov;
5262 struct uio auio;
5263 int rw, status;
5264 struct buf *buf;
5265
5266 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) {
5267 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32,
5268 sizeof (struct dadkio_rwcmd32), flag)) {
5269 return (EFAULT);
5270 }
5271 rwcmd.cmd = rwcmd32.cmd;
5272 rwcmd.flags = rwcmd32.flags;
5273 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr;
5274 rwcmd.buflen = rwcmd32.buflen;
5275 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr;
5276 } else {
5277 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd,
5278 sizeof (struct dadkio_rwcmd), flag)) {
5279 return (EFAULT);
5280 }
5281 }
5282
5283 switch (rwcmd.cmd) {
5284 case DADKIO_RWCMD_READ:
5285 rw = B_READ;
5286 break;
5287 case DADKIO_RWCMD_WRITE:
5288 rw = B_WRITE;
5289 break;
5290 default:
5291 return (EINVAL);
5292 }
5293
5294 bzero((caddr_t)&aiov, sizeof (struct iovec));
5295 aiov.iov_base = rwcmd.bufaddr;
5296 aiov.iov_len = rwcmd.buflen;
5297
5298 bzero((caddr_t)&auio, sizeof (struct uio));
5299 auio.uio_iov = &aiov;
5300 auio.uio_iovcnt = 1;
5301 auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE;
5302 auio.uio_resid = rwcmd.buflen;
5303 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE;
5304
5305 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
5306 bioinit(buf);
5307 /*
5308 * We use the private field of buf to specify that this is an
5309 * I/O using an absolute offset.
5310 */
5311 buf->b_private = (void *)VD_SLICE_NONE;
5312
5313 status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio);
5314
5315 biofini(buf);
5316 kmem_free(buf, sizeof (buf_t));
5317
5318 return (status);
5319 }
5320
5321 /*
5322 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated
5323 * buffer is returned in alloc_len.
5324 */
5325 static vd_scsi_t *
5326 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len,
5327 int *alloc_len)
5328 {
5329 vd_scsi_t *vd_scsi;
5330 int vd_scsi_len = VD_SCSI_SIZE;
5331
5332 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t));
5333 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t));
5334 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t));
5335 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t));
5336
5337 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0);
5338
5339 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP);
5340
5341 vd_scsi->cdb_len = cdb_len;
5342 vd_scsi->sense_len = sense_len;
5343 vd_scsi->datain_len = datain_len;
5344 vd_scsi->dataout_len = dataout_len;
5345
5346 *alloc_len = vd_scsi_len;
5347
5348 return (vd_scsi);
5349 }
5350
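/*
 * Illustrative note (not part of the driver): vdc_scsi_alloc() packs
 * four variable-size regions after the fixed vd_scsi_t header, each
 * rounded up to an 8-byte boundary. For example, with cdb_len = 10 and
 * sense_len = 20 (and no data buffers) the message layout would be:
 *
 *	offset 0:			vd_scsi_t header (VD_SCSI_SIZE)
 *	offset VD_SCSI_SIZE:		CDB, P2ROUNDUP(10, 8) = 16 bytes
 *	offset VD_SCSI_SIZE + 16:	sense, P2ROUNDUP(20, 8) = 24 bytes
 *
 * For a power-of-2 alignment, P2ROUNDUP(x, 8) == ((x + 7) & ~7), which
 * keeps every region 64-bit aligned within the LDC message.
 */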
5351 /*
5352 * Convert the status of a SCSI command to a Solaris return code.
5353 *
5354 * Arguments:
5355 * vd_scsi - The SCSI operation buffer.
5356 * log_error - indicate if an error message should be logged.
5357 *
5358 * Note that our SCSI error messages are rather primitive for the moment
5359 * and could be improved by decoding some data like the SCSI command and
5360 * the sense key.
5361 *
5362 * Return value:
5363 * 0 - Status is good.
5364 * EACCES - Status reports a reservation conflict.
5365 * ENOTSUP - Status reports a check condition and sense key
5366 * reports an illegal request.
5367 * EIO - Any other status.
5368 */
5369 static int
5370 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error)
5371 {
5372 int rv;
5373 char path_str[MAXPATHLEN];
5374 char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN];
5375 union scsi_cdb *cdb;
5376 struct scsi_extended_sense *sense;
5377
5378 if (vd_scsi->cmd_status == STATUS_GOOD)
5379 /* no error */
5380 return (0);
5381
5382 /* when the tunable vdc_scsi_log_error is true we log all errors */
5383 if (vdc_scsi_log_error)
5384 log_error = B_TRUE;
5385
5386 if (log_error) {
5387 cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x\n",
5388 ddi_pathname(vdc->dip, path_str), vdc->instance,
5389 GETCMD(VD_SCSI_DATA_CDB(vd_scsi)));
5390 }
5391
5392 /* default returned value */
5393 rv = EIO;
5394
5395 switch (vd_scsi->cmd_status) {
5396
5397 case STATUS_CHECK:
5398 case STATUS_TERMINATED:
5399 if (log_error)
5400 cmn_err(CE_CONT, "\tCheck Condition Error\n");
5401
5402 /* check sense buffer */
5403 if (vd_scsi->sense_len == 0 ||
5404 vd_scsi->sense_status != STATUS_GOOD) {
5405 if (log_error)
5406 cmn_err(CE_CONT, "\tNo Sense Data Available\n");
5407 break;
5408 }
5409
5410 sense = VD_SCSI_DATA_SENSE(vd_scsi);
5411
5412 if (log_error) {
5413 cmn_err(CE_CONT, "\tSense Key: 0x%x\n"
5414 "\tASC: 0x%x, ASCQ: 0x%x\n",
5415 scsi_sense_key((uint8_t *)sense),
5416 scsi_sense_asc((uint8_t *)sense),
5417 scsi_sense_ascq((uint8_t *)sense));
5418 }
5419
5420 if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST)
5421 rv = ENOTSUP;
5422 break;
5423
5424 case STATUS_BUSY:
5425 if (log_error)
5426 cmn_err(CE_NOTE, "\tDevice Busy\n");
5427 break;
5428
5429 case STATUS_RESERVATION_CONFLICT:
5430 /*
5431 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then
5432 * a reservation conflict could be due to various reasons
5433 * (incorrect keys, a key not being registered, no reservation
5434 * held etc.), so we should not panic in that case.
5435 */
5436 cdb = VD_SCSI_DATA_CDB(vd_scsi);
5437 if (vdc->failfast_interval != 0 &&
5438 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN &&
5439 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) {
5440 /* failfast is enabled so we have to panic */
5441 (void) snprintf(panic_str, sizeof (panic_str),
5442 VDC_RESV_CONFLICT_FMT_STR "%s",
5443 ddi_pathname(vdc->dip, path_str));
5444 panic(panic_str);
5445 }
5446 if (log_error)
5447 cmn_err(CE_NOTE, "\tReservation Conflict\n");
5448 rv = EACCES;
5449 break;
5450
5451 case STATUS_QFULL:
5452 if (log_error)
5453 cmn_err(CE_NOTE, "\tQueue Full\n");
5454 break;
5455
5456 case STATUS_MET:
5457 case STATUS_INTERMEDIATE:
5458 case STATUS_SCSI2:
5459 case STATUS_INTERMEDIATE_MET:
5460 case STATUS_ACA_ACTIVE:
5461 if (log_error)
5462 cmn_err(CE_CONT,
5463 "\tUnexpected SCSI status received: 0x%x\n",
5464 vd_scsi->cmd_status);
5465 break;
5466
5467 default:
5468 if (log_error)
5469 cmn_err(CE_CONT,
5470 "\tInvalid SCSI status received: 0x%x\n",
5471 vd_scsi->cmd_status);
5472 break;
5473 }
5474
5475 return (rv);
5476 }
5477
5478 /*
5479 * Implement the USCSICMD uscsi(7I) ioctl. This ioctl is converted to
5480 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI
5481 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is
5482 * converted to a VD_OP_RESET operation.
5483 */
5484 static int
5485 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode)
5486 {
5487 struct uscsi_cmd uscsi;
5488 struct uscsi_cmd32 uscsi32;
5489 vd_scsi_t *vd_scsi;
5490 int vd_scsi_len;
5491 union scsi_cdb *cdb;
5492 struct scsi_extended_sense *sense;
5493 char *datain, *dataout;
5494 size_t cdb_len, datain_len, dataout_len, sense_len;
5495 int rv;
5496
5497 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
5498 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32),
5499 mode) != 0)
5500 return (EFAULT);
5501 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi));
5502 } else {
5503 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd),
5504 mode) != 0)
5505 return (EFAULT);
5506 }
5507
5508 /* a uscsi reset is converted to a VD_OP_RESET operation */
5509 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN |
5510 USCSI_RESET_ALL)) {
5511 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC,
5512 (void *)(uint64_t)mode, VIO_both_dir, B_TRUE);
5513 return (rv);
5514 }
5515
5516 /* cdb buffer length */
5517 cdb_len = uscsi.uscsi_cdblen;
5518
5519 /* data in and out buffers length */
5520 if (uscsi.uscsi_flags & USCSI_READ) {
5521 datain_len = uscsi.uscsi_buflen;
5522 dataout_len = 0;
5523 } else {
5524 datain_len = 0;
5525 dataout_len = uscsi.uscsi_buflen;
5526 }
5527
5528 /* sense buffer length */
5529 if (uscsi.uscsi_flags & USCSI_RQENABLE)
5530 sense_len = uscsi.uscsi_rqlen;
5531 else
5532 sense_len = 0;
5533
5534 /* allocate buffer for the VD_OP_SCSICMD operation */
5535 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
5536 &vd_scsi_len);
5537
5538 /*
5539 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague,
5540 * but basically they prevent a SCSI command from being retried in case
5541 * of an error.
5542 */
5543 if ((uscsi.uscsi_flags & USCSI_ISOLATE) ||
5544 (uscsi.uscsi_flags & USCSI_DIAGNOSE))
5545 vd_scsi->options |= VD_SCSI_OPT_NORETRY;
5546
5547 /* set task attribute */
5548 if (uscsi.uscsi_flags & USCSI_NOTAG) {
5549 vd_scsi->task_attribute = 0;
5550 } else {
5551 if (uscsi.uscsi_flags & USCSI_HEAD)
5552 vd_scsi->task_attribute = VD_SCSI_TASK_ACA;
5553 else if (uscsi.uscsi_flags & USCSI_HTAG)
5554 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE;
5555 else if (uscsi.uscsi_flags & USCSI_OTAG)
5556 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED;
5557 else
5558 vd_scsi->task_attribute = 0;
5559 }
5560
5561 /* set timeout */
5562 vd_scsi->timeout = uscsi.uscsi_timeout;
5563
5564 /* copy-in cdb data */
5565 cdb = VD_SCSI_DATA_CDB(vd_scsi);
5566 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) {
5567 rv = EFAULT;
5568 goto done;
5569 }
5570
5571 /* keep a pointer to the sense buffer */
5572 sense = VD_SCSI_DATA_SENSE(vd_scsi);
5573
5574 /* keep a pointer to the data-in buffer */
5575 datain = (char *)VD_SCSI_DATA_IN(vd_scsi);
5576
5577 /* copy-in request data to the data-out buffer */
5578 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi);
5579 if (!(uscsi.uscsi_flags & USCSI_READ)) {
5580 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len,
5581 mode)) {
5582 rv = EFAULT;
5583 goto done;
5584 }
5585 }
5586
5587 /* submit the request */
5588 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
5589 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
5590
5591 if (rv != 0)
5592 goto done;
5593
5594 /* update SCSI status */
5595 uscsi.uscsi_status = vd_scsi->cmd_status;
5596
5597 /* update sense data */
5598 if ((uscsi.uscsi_flags & USCSI_RQENABLE) &&
5599 (uscsi.uscsi_status == STATUS_CHECK ||
5600 uscsi.uscsi_status == STATUS_TERMINATED)) {
5601
5602 uscsi.uscsi_rqstatus = vd_scsi->sense_status;
5603
5604 if (uscsi.uscsi_rqstatus == STATUS_GOOD) {
5605 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen -
5606 vd_scsi->sense_len;
5607 if (ddi_copyout(sense, uscsi.uscsi_rqbuf,
5608 vd_scsi->sense_len, mode) != 0) {
5609 rv = EFAULT;
5610 goto done;
5611 }
5612 }
5613 }
5614
5615 /* update request data */
5616 if (uscsi.uscsi_status == STATUS_GOOD) {
5617 if (uscsi.uscsi_flags & USCSI_READ) {
5618 uscsi.uscsi_resid = uscsi.uscsi_buflen -
5619 vd_scsi->datain_len;
5620 if (ddi_copyout(datain, uscsi.uscsi_bufaddr,
5621 vd_scsi->datain_len, mode) != 0) {
5622 rv = EFAULT;
5623 goto done;
5624 }
5625 } else {
5626 uscsi.uscsi_resid = uscsi.uscsi_buflen -
5627 vd_scsi->dataout_len;
5628 }
5629 }
5630
5631 /* copy-out result */
5632 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
5633 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32));
5634 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32),
5635 mode) != 0) {
5636 rv = EFAULT;
5637 goto done;
5638 }
5639 } else {
5640 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd),
5641 mode) != 0) {
5642 rv = EFAULT;
5643 goto done;
5644 }
5645 }
5646
5647 /* get the return code from the SCSI command status */
5648 rv = vdc_scsi_status(vdc, vd_scsi,
5649 !(uscsi.uscsi_flags & USCSI_SILENT));
5650
5651 done:
5652 kmem_free(vd_scsi, vd_scsi_len);
5653 return (rv);
5654 }
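/*
 * Illustrative sketch (not part of the driver): a minimal userland
 * consumer of the USCSICMD path above, issuing a TEST UNIT READY
 * through uscsi(7I). It assumes <sys/scsi/impl/uscsi.h>, <fcntl.h>,
 * <string.h> and <stdio.h>; the device path is hypothetical and error
 * handling is omitted.
 *
 *	struct uscsi_cmd ucmd;
 *	uchar_t cdb[6] = { 0 };		// TEST UNIT READY (opcode 0x00)
 *	int fd;
 *
 *	fd = open("/dev/rdsk/c0d0s2", O_RDONLY);
 *	(void) memset(&ucmd, 0, sizeof (ucmd));
 *	ucmd.uscsi_cdb = (caddr_t)cdb;
 *	ucmd.uscsi_cdblen = sizeof (cdb);
 *	ucmd.uscsi_flags = USCSI_SILENT;
 *	ucmd.uscsi_timeout = 30;	// seconds
 *	if (ioctl(fd, USCSICMD, &ucmd) != 0)
 *		perror("USCSICMD");
 */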
5655
5656 /*
5657 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command.
5658 *
5659 * Arguments:
5660 * cmd - SCSI PERSISTENT IN command
5661 * len - length of the SCSI input buffer
5662 * vd_scsi_len - return the length of the allocated buffer
5663 *
5664 * Returned Value:
5665 * a pointer to the allocated VD_OP_SCSICMD buffer.
5666 */
5667 static vd_scsi_t *
5668 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len)
5669 {
5670 int cdb_len, sense_len, datain_len, dataout_len;
5671 vd_scsi_t *vd_scsi;
5672 union scsi_cdb *cdb;
5673
5674 cdb_len = CDB_GROUP1;
5675 sense_len = sizeof (struct scsi_extended_sense);
5676 datain_len = len;
5677 dataout_len = 0;
5678
5679 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
5680 vd_scsi_len);
5681
5682 cdb = VD_SCSI_DATA_CDB(vd_scsi);
5683
5684 /* set cdb */
5685 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN;
5686 cdb->cdb_opaque[1] = cmd;
5687 FORMG1COUNT(cdb, datain_len);
5688
5689 vd_scsi->timeout = vdc_scsi_timeout;
5690
5691 return (vd_scsi);
5692 }
5693
5694 /*
5695 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command.
5696 *
5697 * Arguments:
5698 * cmd - SCSI PERSISTENT OUT command
5699 * len - length of the SCSI output buffer
5700 * vd_scsi_len - return the length of the allocated buffer
5701 *
5702 * Returned Value:
5703 * a pointer to the allocated VD_OP_SCSICMD buffer.
5704 */
5705 static vd_scsi_t *
5706 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len)
5707 {
5708 int cdb_len, sense_len, datain_len, dataout_len;
5709 vd_scsi_t *vd_scsi;
5710 union scsi_cdb *cdb;
5711
5712 cdb_len = CDB_GROUP1;
5713 sense_len = sizeof (struct scsi_extended_sense);
5714 datain_len = 0;
5715 dataout_len = len;
5716
5717 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
5718 vd_scsi_len);
5719
5720 cdb = VD_SCSI_DATA_CDB(vd_scsi);
5721
5722 /* set cdb */
5723 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT;
5724 cdb->cdb_opaque[1] = cmd;
5725 FORMG1COUNT(cdb, dataout_len);
5726
5727 vd_scsi->timeout = vdc_scsi_timeout;
5728
5729 return (vd_scsi);
5730 }
5731
5732 /*
5733 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted
5734 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk
5735 * server with a VD_OP_SCSICMD operation.
5736 */ 5737 static int 5738 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5739 { 5740 vd_scsi_t *vd_scsi; 5741 mhioc_inkeys_t inkeys; 5742 mhioc_key_list_t klist; 5743 struct mhioc_inkeys32 inkeys32; 5744 struct mhioc_key_list32 klist32; 5745 sd_prin_readkeys_t *scsi_keys; 5746 void *user_keys; 5747 int vd_scsi_len; 5748 int listsize, listlen, rv; 5749 5750 /* copyin arguments */ 5751 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5752 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5753 if (rv != 0) 5754 return (EFAULT); 5755 5756 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5757 sizeof (klist32), mode); 5758 if (rv != 0) 5759 return (EFAULT); 5760 5761 listsize = klist32.listsize; 5762 } else { 5763 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5764 if (rv != 0) 5765 return (EFAULT); 5766 5767 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5768 if (rv != 0) 5769 return (EFAULT); 5770 5771 listsize = klist.listsize; 5772 } 5773 5774 /* build SCSI VD_OP request */ 5775 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5776 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5777 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5778 5779 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5780 5781 /* submit the request */ 5782 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5783 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5784 5785 if (rv != 0) 5786 goto done; 5787 5788 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5789 5790 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5791 inkeys32.generation = scsi_keys->generation; 5792 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5793 if (rv != 0) { 5794 rv = EFAULT; 5795 goto done; 5796 } 5797 5798 klist32.listlen = listlen; 5799 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5800 sizeof (klist32), mode); 5801 if (rv != 0) { 5802 rv = EFAULT; 5803 goto done; 5804 } 5805 5806 user_keys = (caddr_t)(uintptr_t)klist32.list; 5807 } else { 5808 inkeys.generation = scsi_keys->generation; 5809 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5810 if (rv != 0) { 5811 rv = EFAULT; 5812 goto done; 5813 } 5814 5815 klist.listlen = listlen; 5816 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5817 if (rv != 0) { 5818 rv = EFAULT; 5819 goto done; 5820 } 5821 5822 user_keys = klist.list; 5823 } 5824 5825 /* copy out keys */ 5826 if (listlen > 0 && listsize > 0) { 5827 if (listsize < listlen) 5828 listlen = listsize; 5829 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5830 listlen * MHIOC_RESV_KEY_SIZE, mode); 5831 if (rv != 0) 5832 rv = EFAULT; 5833 } 5834 5835 if (rv == 0) 5836 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5837 5838 done: 5839 kmem_free(vd_scsi, vd_scsi_len); 5840 5841 return (rv); 5842 } 5843 5844 /* 5845 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5846 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5847 * the vdisk server with a VD_OP_SCSICMD operation. 
5848 */ 5849 static int 5850 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5851 { 5852 vd_scsi_t *vd_scsi; 5853 mhioc_inresvs_t inresv; 5854 mhioc_resv_desc_list_t rlist; 5855 struct mhioc_inresvs32 inresv32; 5856 struct mhioc_resv_desc_list32 rlist32; 5857 mhioc_resv_desc_t mhd_resv; 5858 sd_prin_readresv_t *scsi_resv; 5859 sd_readresv_desc_t *resv; 5860 mhioc_resv_desc_t *user_resv; 5861 int vd_scsi_len; 5862 int listsize, listlen, i, rv; 5863 5864 /* copyin arguments */ 5865 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5866 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5867 if (rv != 0) 5868 return (EFAULT); 5869 5870 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5871 sizeof (rlist32), mode); 5872 if (rv != 0) 5873 return (EFAULT); 5874 5875 listsize = rlist32.listsize; 5876 } else { 5877 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5878 if (rv != 0) 5879 return (EFAULT); 5880 5881 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5882 if (rv != 0) 5883 return (EFAULT); 5884 5885 listsize = rlist.listsize; 5886 } 5887 5888 /* build SCSI VD_OP request */ 5889 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5890 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5891 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5892 5893 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5894 5895 /* submit the request */ 5896 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5897 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5898 5899 if (rv != 0) 5900 goto done; 5901 5902 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5903 5904 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5905 inresv32.generation = scsi_resv->generation; 5906 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5907 if (rv != 0) { 5908 rv = EFAULT; 5909 goto done; 5910 } 5911 5912 rlist32.listlen = listlen; 5913 rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, 5914 sizeof (rlist32), mode); 5915 if (rv != 0) { 5916 rv = EFAULT; 5917 goto done; 5918 } 5919 5920 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 5921 } else { 5922 inresv.generation = scsi_resv->generation; 5923 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 5924 if (rv != 0) { 5925 rv = EFAULT; 5926 goto done; 5927 } 5928 5929 rlist.listlen = listlen; 5930 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 5931 if (rv != 0) { 5932 rv = EFAULT; 5933 goto done; 5934 } 5935 5936 user_resv = rlist.list; 5937 } 5938 5939 /* copy out reservations */ 5940 if (listsize > 0 && listlen > 0) { 5941 if (listsize < listlen) 5942 listlen = listsize; 5943 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 5944 5945 for (i = 0; i < listlen; i++) { 5946 mhd_resv.type = resv->type; 5947 mhd_resv.scope = resv->scope; 5948 mhd_resv.scope_specific_addr = 5949 BE_32(resv->scope_specific_addr); 5950 bcopy(&resv->resvkey, &mhd_resv.key, 5951 MHIOC_RESV_KEY_SIZE); 5952 5953 rv = ddi_copyout(&mhd_resv, user_resv, 5954 sizeof (mhd_resv), mode); 5955 if (rv != 0) { 5956 rv = EFAULT; 5957 goto done; 5958 } 5959 resv++; 5960 user_resv++; 5961 } 5962 } 5963 5964 if (rv == 0) 5965 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5966 5967 done: 5968 kmem_free(vd_scsi, vd_scsi_len); 5969 return (rv); 5970 } 5971 5972 /* 5973 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 5974 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 5975 * server with a VD_OP_SCSICMD operation. 
5976 */ 5977 static int 5978 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 5979 { 5980 vd_scsi_t *vd_scsi; 5981 sd_prout_t *scsi_prout; 5982 mhioc_register_t mhd_reg; 5983 int vd_scsi_len, rv; 5984 5985 /* copyin arguments */ 5986 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 5987 if (rv != 0) 5988 return (EFAULT); 5989 5990 /* build SCSI VD_OP request */ 5991 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 5992 sizeof (sd_prout_t), &vd_scsi_len); 5993 5994 /* set parameters */ 5995 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 5996 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 5997 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 5998 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 5999 6000 /* submit the request */ 6001 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6002 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6003 6004 if (rv == 0) 6005 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6006 6007 kmem_free(vd_scsi, vd_scsi_len); 6008 return (rv); 6009 } 6010 6011 /* 6012 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 6013 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 6014 * server with a VD_OP_SCSICMD operation. 6015 */ 6016 static int 6017 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 6018 { 6019 union scsi_cdb *cdb; 6020 vd_scsi_t *vd_scsi; 6021 sd_prout_t *scsi_prout; 6022 mhioc_resv_desc_t mhd_resv; 6023 int vd_scsi_len, rv; 6024 6025 /* copyin arguments */ 6026 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 6027 if (rv != 0) 6028 return (EFAULT); 6029 6030 /* build SCSI VD_OP request */ 6031 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 6032 sizeof (sd_prout_t), &vd_scsi_len); 6033 6034 /* set parameters */ 6035 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6036 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6037 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6038 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 6039 cdb->cdb_opaque[2] = mhd_resv.type; 6040 6041 /* submit the request */ 6042 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6043 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6044 6045 if (rv == 0) 6046 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6047 6048 kmem_free(vd_scsi, vd_scsi_len); 6049 return (rv); 6050 } 6051 6052 /* 6053 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 6054 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 6055 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
6056 */ 6057 static int 6058 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode) 6059 { 6060 union scsi_cdb *cdb; 6061 vd_scsi_t *vd_scsi; 6062 sd_prout_t *scsi_prout; 6063 mhioc_preemptandabort_t mhd_preempt; 6064 int vd_scsi_len, rv; 6065 6066 /* copyin arguments */ 6067 rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode); 6068 if (rv != 0) 6069 return (EFAULT); 6070 6071 /* build SCSI VD_OP request */ 6072 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT, 6073 sizeof (sd_prout_t), &vd_scsi_len); 6074 6075 /* set parameters */ 6076 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 6077 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6078 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6079 bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key, 6080 MHIOC_RESV_KEY_SIZE); 6081 bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key, 6082 MHIOC_RESV_KEY_SIZE); 6083 scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr; 6084 cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type; 6085 6086 /* submit the request */ 6087 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6088 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6089 6090 if (rv == 0) 6091 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6092 6093 kmem_free(vd_scsi, vd_scsi_len); 6094 return (rv); 6095 } 6096 6097 /* 6098 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl 6099 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY 6100 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation. 6101 */ 6102 static int 6103 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode) 6104 { 6105 vd_scsi_t *vd_scsi; 6106 sd_prout_t *scsi_prout; 6107 mhioc_registerandignorekey_t mhd_regi; 6108 int vd_scsi_len, rv; 6109 6110 /* copyin arguments */ 6111 rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode); 6112 if (rv != 0) 6113 return (EFAULT); 6114 6115 /* build SCSI VD_OP request */ 6116 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY, 6117 sizeof (sd_prout_t), &vd_scsi_len); 6118 6119 /* set parameters */ 6120 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6121 bcopy(mhd_regi.newkey.key, scsi_prout->service_key, 6122 MHIOC_RESV_KEY_SIZE); 6123 scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl; 6124 6125 /* submit the request */ 6126 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6127 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6128 6129 if (rv == 0) 6130 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6131 6132 kmem_free(vd_scsi, vd_scsi_len); 6133 return (rv); 6134 } 6135 6136 /* 6137 * This function is used by the failfast mechanism to send a SCSI command 6138 * to check for reservation conflict. 6139 */ 6140 static int 6141 vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) 6142 { 6143 int cdb_len, sense_len, vd_scsi_len; 6144 vd_scsi_t *vd_scsi; 6145 union scsi_cdb *cdb; 6146 int rv; 6147 6148 ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1); 6149 6150 if (scmd == SCMD_WRITE_G1) 6151 cdb_len = CDB_GROUP1; 6152 else 6153 cdb_len = CDB_GROUP0; 6154 6155 sense_len = sizeof (struct scsi_extended_sense); 6156 6157 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len); 6158 6159 /* set cdb */ 6160 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6161 cdb->scc_cmd = scmd; 6162 6163 vd_scsi->timeout = vdc_scsi_timeout; 6164 6165 /* 6166 * Submit the request. 
The last argument has to be B_FALSE so that 6167 * vdc_do_sync_op does not loop checking for reservation conflict if 6168 * the operation returns an error. 6169 */ 6170 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6171 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE); 6172 6173 if (rv == 0) 6174 (void) vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6175 6176 kmem_free(vd_scsi, vd_scsi_len); 6177 return (rv); 6178 } 6179 6180 /* 6181 * This function is used by the failfast mechanism to check for a reservation 6182 * conflict. It sends some SCSI commands which will fail with a reservation 6183 * conflict error if the system does not have access to the disk, and this 6184 * will panic the system. 6185 * 6186 * Return Code: 6187 * 0 - disk is accessible without reservation conflict error 6188 * != 0 - unable to check if disk is accessible 6189 */ 6190 int 6191 vdc_failfast_check_resv(vdc_t *vdc) 6192 { 6193 int failure = 0; 6194 6195 /* 6196 * Send a TEST UNIT READY command. The command will panic 6197 * the system if it fails with a reservation conflict. 6198 */ 6199 if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0) 6200 failure++; 6201 6202 /* 6203 * With SPC-3 compliant devices TEST UNIT READY will succeed on 6204 * a reserved device, so we also issue a zero-length WRITE(10) in 6205 * order to provoke a Reservation Conflict status on those newer 6206 * devices. 6207 */ 6208 if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0) 6209 failure++; 6210 6211 return (failure); 6212 } 6213 6214 /* 6215 * Add a pending I/O to the failfast I/O queue. An I/O is added to this 6216 * queue when it has failed and failfast is enabled. Then we have to check 6217 * if it has failed because of a reservation conflict, in which case we have 6218 * to panic the system. 6219 * 6220 * Async I/O should be queued with their block I/O data transfer structure 6221 * (buf). Sync I/O should be queued with buf = NULL. 6222 */ 6223 static vdc_io_t * 6224 vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf) 6225 { 6226 vdc_io_t *vio; 6227 6228 ASSERT(MUTEX_HELD(&vdc->lock)); 6229 6230 vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP); 6231 vio->vio_next = vdc->failfast_io_queue; 6232 vio->vio_buf = buf; 6233 vio->vio_qtime = ddi_get_lbolt(); 6234 6235 vdc->failfast_io_queue = vio; 6236 6237 /* notify the failfast thread that a new I/O is queued */ 6238 cv_signal(&vdc->failfast_cv); 6239 6240 return (vio); 6241 } 6242 6243 /* 6244 * Remove and complete I/O in the failfast I/O queue which have been 6245 * queued before the indicated deadline. A deadline of 0 means that all 6246 * I/O have to be unqueued and marked as completed. 6247 */ 6248 static void 6249 vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline) 6250 { 6251 vdc_io_t *vio, *vio_tmp; 6252 6253 ASSERT(MUTEX_HELD(&vdc->lock)); 6254 6255 vio_tmp = NULL; 6256 vio = vdc->failfast_io_queue; 6257 6258 if (deadline != 0) { 6259 /* 6260 * Skip any I/O queued after the deadline. The failfast 6261 * I/O queue is ordered starting with the last I/O added 6262 * to the queue. 6263 */ 6264 while (vio != NULL && vio->vio_qtime > deadline) { 6265 vio_tmp = vio; 6266 vio = vio->vio_next; 6267 } 6268 } 6269 6270 if (vio == NULL) 6271 /* nothing to unqueue */ 6272 return; 6273 6274 /* update the queue */ 6275 if (vio_tmp == NULL) 6276 vdc->failfast_io_queue = NULL; 6277 else 6278 vio_tmp->vio_next = NULL; 6279 6280 /* 6281 * Complete unqueued I/O.
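 * (The completed sublist is walked from the most recently queued entry
 * down to the oldest, following the vio_next links.)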
Async I/O have a block I/O data transfer 6282 * structure (buf) and they are completed by calling biodone(). Sync 6283 * I/O do not have a buf and they are completed by setting the 6284 * vio_qtime to zero and signaling failfast_io_cv. In that case, the 6285 * thread waiting for the I/O to complete is responsible for freeing 6286 * the vio structure. 6287 */ 6288 while (vio != NULL) { 6289 vio_tmp = vio->vio_next; 6290 if (vio->vio_buf != NULL) { 6291 VD_KSTAT_RUNQ_EXIT(vdc); 6292 DTRACE_IO1(done, buf_t *, vio->vio_buf); 6293 biodone(vio->vio_buf); 6294 kmem_free(vio, sizeof (vdc_io_t)); 6295 } else { 6296 vio->vio_qtime = 0; 6297 } 6298 vio = vio_tmp; 6299 } 6300 6301 cv_broadcast(&vdc->failfast_io_cv); 6302 } 6303 6304 /* 6305 * Failfast Thread. 6306 * 6307 * While failfast is enabled, the failfast thread sends TEST UNIT READY 6308 * and zero-length WRITE(10) SCSI commands on a regular basis to check that 6309 * we still have access to the disk. If a command fails with a RESERVATION 6310 * CONFLICT error then the system will immediately panic. 6311 * 6312 * The failfast thread is also woken up when an I/O has failed. It then checks 6313 * the access to the disk to ensure that the I/O failure was not due to a 6314 * reservation conflict. 6315 * 6316 * There is one failfast thread for each virtual disk for which failfast is 6317 * enabled. We could have only one thread sending requests for all disks but 6318 * that would require vdc to send asynchronous requests and to have callbacks 6319 * to process replies. 6320 */ 6321 static void 6322 vdc_failfast_thread(void *arg) 6323 { 6324 int status; 6325 vdc_t *vdc = (vdc_t *)arg; 6326 clock_t timeout, starttime; 6327 6328 mutex_enter(&vdc->lock); 6329 6330 while (vdc->failfast_interval != 0) { 6331 6332 starttime = ddi_get_lbolt(); 6333 6334 mutex_exit(&vdc->lock); 6335 6336 /* check for reservation conflict */ 6337 status = vdc_failfast_check_resv(vdc); 6338 6339 mutex_enter(&vdc->lock); 6340 /* 6341 * We have dropped the lock to send the SCSI command so we have 6342 * to check that failfast is still enabled. 6343 */ 6344 if (vdc->failfast_interval == 0) 6345 break; 6346 6347 /* 6348 * If we have successfully checked the disk access and there was 6349 * no reservation conflict then we can complete any I/O queued 6350 * before the last check. 6351 */ 6352 if (status == 0) 6353 vdc_failfast_io_unqueue(vdc, starttime); 6354 6355 /* proceed again if some I/O are still in the queue */ 6356 if (vdc->failfast_io_queue != NULL) 6357 continue; 6358 6359 timeout = ddi_get_lbolt() + 6360 drv_usectohz(vdc->failfast_interval); 6361 (void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout); 6362 } 6363 6364 /* 6365 * Failfast is being stopped, so we can complete any queued I/O. 6366 */ 6367 vdc_failfast_io_unqueue(vdc, 0); 6368 vdc->failfast_thread = NULL; 6369 mutex_exit(&vdc->lock); 6370 thread_exit(); 6371 } 6372 6373 /* 6374 * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
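 *
 * The argument is an unsigned int giving the failfast probing interval
 * in milliseconds; a value of 0 disables failfast and causes the
 * failfast thread to exit. An illustrative caller-side usage, with a
 * hypothetical file descriptor "fd" and a one second interval:
 *
 *	unsigned int mh_time = 1000;
 *	(void) ioctl(fd, MHIOCENFAILFAST, &mh_time);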
6375 */ 6376 static int 6377 vdc_failfast(vdc_t *vdc, caddr_t arg, int mode) 6378 { 6379 unsigned int mh_time; 6380 6381 if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode)) 6382 return (EFAULT); 6383 6384 mutex_enter(&vdc->lock); 6385 if (mh_time != 0 && vdc->failfast_thread == NULL) { 6386 vdc->failfast_thread = thread_create(NULL, 0, 6387 vdc_failfast_thread, vdc, 0, &p0, TS_RUN, 6388 v.v_maxsyspri - 2); 6389 } 6390 6391 vdc->failfast_interval = mh_time * 1000; 6392 cv_signal(&vdc->failfast_cv); 6393 mutex_exit(&vdc->lock); 6394 6395 return (0); 6396 } 6397 6398 /* 6399 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are 6400 * converted to VD_OP_SET_ACCESS operations. 6401 */ 6402 static int 6403 vdc_access_set(vdc_t *vdc, uint64_t flags, int mode) 6404 { 6405 int rv; 6406 6407 /* submit ownership command request */ 6408 rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags, 6409 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6410 VIO_both_dir, B_TRUE); 6411 6412 return (rv); 6413 } 6414 6415 /* 6416 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a 6417 * VD_OP_GET_ACCESS operation. 6418 */ 6419 static int 6420 vdc_access_get(vdc_t *vdc, uint64_t *status, int mode) 6421 { 6422 int rv; 6423 6424 /* submit ownership command request */ 6425 rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status, 6426 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6427 VIO_both_dir, B_TRUE); 6428 6429 return (rv); 6430 } 6431 6432 /* 6433 * Disk Ownership Thread. 6434 * 6435 * When we have taken the ownership of a disk, this thread waits to be 6436 * notified when the LDC channel is reset so that it can recover the 6437 * ownership. 6438 * 6439 * Note that the thread handling the LDC reset (vdc_process_msg_thread()) 6440 * cannot be used to do the ownership recovery because it has to be 6441 * running to handle the reply message to the ownership operation. 6442 */ 6443 static void 6444 vdc_ownership_thread(void *arg) 6445 { 6446 vdc_t *vdc = (vdc_t *)arg; 6447 clock_t timeout; 6448 uint64_t status; 6449 6450 mutex_enter(&vdc->ownership_lock); 6451 mutex_enter(&vdc->lock); 6452 6453 while (vdc->ownership & VDC_OWNERSHIP_WANTED) { 6454 6455 if ((vdc->ownership & VDC_OWNERSHIP_RESET) || 6456 !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) { 6457 /* 6458 * There was a reset so the ownership has been lost, 6459 * try to recover. We do this without using the preempt 6460 * option so that we don't steal the ownership from 6461 * someone who has preempted us. 6462 */ 6463 DMSG(vdc, 0, "[%d] Ownership lost, recovering", 6464 vdc->instance); 6465 6466 vdc->ownership &= ~(VDC_OWNERSHIP_RESET | 6467 VDC_OWNERSHIP_GRANTED); 6468 6469 mutex_exit(&vdc->lock); 6470 6471 status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6472 VD_ACCESS_SET_PRESERVE, FKIOCTL); 6473 6474 mutex_enter(&vdc->lock); 6475 6476 if (status == 0) { 6477 DMSG(vdc, 0, "[%d] Ownership recovered", 6478 vdc->instance); 6479 vdc->ownership |= VDC_OWNERSHIP_GRANTED; 6480 } else { 6481 DMSG(vdc, 0, "[%d] Failed to recover ownership", 6482 vdc->instance); 6483 } 6484 6485 } 6486 6487 /* 6488 * If we have the ownership then we just wait for an event 6489 * to happen (LDC reset), otherwise we will retry to recover 6490 * after a delay.
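 * (vdc_ownership_delay is expressed in microseconds; it is converted
 * to clock ticks with drv_usectohz() and added to the current lbolt
 * value to build the absolute wakeup time passed to cv_timedwait().)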
6491 */ 6492 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) 6493 timeout = 0; 6494 else 6495 timeout = ddi_get_lbolt() + 6496 drv_usectohz(vdc_ownership_delay); 6497 6498 /* Release the ownership_lock and wait on the vdc lock */ 6499 mutex_exit(&vdc->ownership_lock); 6500 6501 if (timeout == 0) 6502 (void) cv_wait(&vdc->ownership_cv, &vdc->lock); 6503 else 6504 (void) cv_timedwait(&vdc->ownership_cv, 6505 &vdc->lock, timeout); 6506 6507 mutex_exit(&vdc->lock); 6508 6509 mutex_enter(&vdc->ownership_lock); 6510 mutex_enter(&vdc->lock); 6511 } 6512 6513 vdc->ownership_thread = NULL; 6514 mutex_exit(&vdc->lock); 6515 mutex_exit(&vdc->ownership_lock); 6516 6517 thread_exit(); 6518 } 6519 6520 static void 6521 vdc_ownership_update(vdc_t *vdc, int ownership_flags) 6522 { 6523 ASSERT(MUTEX_HELD(&vdc->ownership_lock)); 6524 6525 mutex_enter(&vdc->lock); 6526 vdc->ownership = ownership_flags; 6527 if ((vdc->ownership & VDC_OWNERSHIP_WANTED) && 6528 vdc->ownership_thread == NULL) { 6529 /* start ownership thread */ 6530 vdc->ownership_thread = thread_create(NULL, 0, 6531 vdc_ownership_thread, vdc, 0, &p0, TS_RUN, 6532 v.v_maxsyspri - 2); 6533 } else { 6534 /* notify the ownership thread */ 6535 cv_signal(&vdc->ownership_cv); 6536 } 6537 mutex_exit(&vdc->lock); 6538 } 6539 6540 /* 6541 * Get the size and the block size of a virtual disk from the vdisk server. 6542 * We need to use this operation when the vdisk_size attribute was not 6543 * available during the handshake with the vdisk server. 6544 */ 6545 static int 6546 vdc_check_capacity(vdc_t *vdc) 6547 { 6548 int rv = 0; 6549 size_t alloc_len; 6550 vd_capacity_t *vd_cap; 6551 6552 if (vdc->vdisk_size != 0) 6553 return (0); 6554 6555 alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t)); 6556 6557 vd_cap = kmem_zalloc(alloc_len, KM_SLEEP); 6558 6559 rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len, 6560 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE); 6561 6562 if (rv == 0) { 6563 if (vd_cap->vdisk_block_size != vdc->block_size || 6564 vd_cap->vdisk_size == VD_SIZE_UNKNOWN || 6565 vd_cap->vdisk_size == 0) 6566 rv = EINVAL; 6567 else 6568 vdc->vdisk_size = vd_cap->vdisk_size; 6569 } 6570 6571 kmem_free(vd_cap, alloc_len); 6572 return (rv); 6573 } 6574 6575 /* 6576 * This structure is used in the DKIO(7I) array below. 
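 * Each entry maps a Solaris ioctl command to the vDisk operation used
 * to service it. An op of 0 means that the ioctl is not simply
 * forwarded and is instead handled by dedicated code in
 * vd_process_ioctl(). An nbytes of 0 means either that the ioctl
 * carries no fixed-size payload or, for the EFI ioctls, that the size
 * is computed at runtime from the ioctl argument.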
6577 */ 6578 typedef struct vdc_dk_ioctl { 6579 uint8_t op; /* VD_OP_XXX value */ 6580 int cmd; /* Solaris ioctl operation number */ 6581 size_t nbytes; /* size of structure to be copied */ 6582 6583 /* function to convert between vDisk and Solaris structure formats */ 6584 int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg, 6585 int mode, int dir); 6586 } vdc_dk_ioctl_t; 6587 6588 /* 6589 * Subset of DKIO(7I) operations currently supported 6590 */ 6591 static vdc_dk_ioctl_t dk_ioctl[] = { 6592 {VD_OP_FLUSH, DKIOCFLUSHWRITECACHE, 0, 6593 vdc_null_copy_func}, 6594 {VD_OP_GET_WCE, DKIOCGETWCE, sizeof (int), 6595 vdc_get_wce_convert}, 6596 {VD_OP_SET_WCE, DKIOCSETWCE, sizeof (int), 6597 vdc_set_wce_convert}, 6598 {VD_OP_GET_VTOC, DKIOCGVTOC, sizeof (vd_vtoc_t), 6599 vdc_get_vtoc_convert}, 6600 {VD_OP_SET_VTOC, DKIOCSVTOC, sizeof (vd_vtoc_t), 6601 vdc_set_vtoc_convert}, 6602 {VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t), 6603 vdc_get_geom_convert}, 6604 {VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, sizeof (vd_geom_t), 6605 vdc_get_geom_convert}, 6606 {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t), 6607 vdc_get_geom_convert}, 6608 {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t), 6609 vdc_set_geom_convert}, 6610 {VD_OP_GET_EFI, DKIOCGETEFI, 0, 6611 vdc_get_efi_convert}, 6612 {VD_OP_SET_EFI, DKIOCSETEFI, 0, 6613 vdc_set_efi_convert}, 6614 6615 /* DIOCTL_RWCMD is converted to a read or a write */ 6616 {0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL}, 6617 6618 /* mhd(7I) non-shared multihost disks ioctls */ 6619 {0, MHIOCTKOWN, 0, vdc_null_copy_func}, 6620 {0, MHIOCRELEASE, 0, vdc_null_copy_func}, 6621 {0, MHIOCSTATUS, 0, vdc_null_copy_func}, 6622 {0, MHIOCQRESERVE, 0, vdc_null_copy_func}, 6623 6624 /* mhd(7I) shared multihost disks ioctls */ 6625 {0, MHIOCGRP_INKEYS, 0, vdc_null_copy_func}, 6626 {0, MHIOCGRP_INRESV, 0, vdc_null_copy_func}, 6627 {0, MHIOCGRP_REGISTER, 0, vdc_null_copy_func}, 6628 {0, MHIOCGRP_RESERVE, 0, vdc_null_copy_func}, 6629 {0, MHIOCGRP_PREEMPTANDABORT, 0, vdc_null_copy_func}, 6630 {0, MHIOCGRP_REGISTERANDIGNOREKEY, 0, vdc_null_copy_func}, 6631 6632 /* mhd(7I) failfast ioctl */ 6633 {0, MHIOCENFAILFAST, 0, vdc_null_copy_func}, 6634 6635 /* 6636 * These particular ioctls are not sent to the server - vdc fakes up 6637 * the necessary info. 6638 */ 6639 {0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func}, 6640 {0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func}, 6641 {0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func}, 6642 {0, DKIOCPARTITION, 0, vdc_null_copy_func }, 6643 {0, DKIOCGAPART, 0, vdc_null_copy_func }, 6644 {0, DKIOCREMOVABLE, 0, vdc_null_copy_func}, 6645 {0, CDROMREADOFFSET, 0, vdc_null_copy_func} 6646 }; 6647 6648 /* 6649 * This function handles ioctl requests from the vd_efi_alloc_and_read() 6650 * function and forwards them to the vdisk.
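 * The EFI module reaches this function through the callback installed
 * with VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl) in
 * vdc_validate_geometry(), so EFI label accesses end up as EFI ioctls
 * issued from within the kernel (hence the FKIOCTL flag below).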
6651 */ 6652 static int 6653 vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg) 6654 { 6655 vdc_t *vdc = (vdc_t *)vdisk; 6656 dev_t dev; 6657 int rval; 6658 6659 dev = makedevice(ddi_driver_major(vdc->dip), 6660 VD_MAKE_DEV(vdc->instance, 0)); 6661 6662 return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval)); 6663 } 6664 6665 /* 6666 * Function: 6667 * vd_process_ioctl() 6668 * 6669 * Description: 6670 * This routine processes disk-specific ioctl calls 6671 * 6672 * Arguments: 6673 * dev - the device number 6674 * cmd - the operation [dkio(7I)] to be processed 6675 * arg - pointer to user-provided structure 6676 * (contains data to be set or reference parameter for get) 6677 * mode - bit flag, indicating open settings, 32/64 bit type, etc. 6678 * rvalp - pointer to return value for calling process. 6679 * 6680 * Return Code: 6681 * 0 6682 * EFAULT 6683 * ENXIO 6684 * EIO 6685 * ENOTSUP 6686 */ 6687 static int 6688 vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp) 6689 { 6690 int instance = VDCUNIT(dev); 6691 vdc_t *vdc = NULL; 6692 int rv = -1; 6693 int idx = 0; /* index into dk_ioctl[] */ 6694 size_t len = 0; /* #bytes to send to vds */ 6695 size_t alloc_len = 0; /* #bytes to allocate mem for */ 6696 caddr_t mem_p = NULL; 6697 size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0])); 6698 vdc_dk_ioctl_t *iop; 6699 6700 vdc = ddi_get_soft_state(vdc_state, instance); 6701 if (vdc == NULL) { 6702 cmn_err(CE_NOTE, "![%d] Could not get soft state structure", 6703 instance); 6704 return (ENXIO); 6705 } 6706 6707 DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n", 6708 instance, cmd, dev, ddi_model_convert_from(mode & FMODELS)); 6709 6710 if (rvalp != NULL) { 6711 /* the return value of the ioctl is 0 by default */ 6712 *rvalp = 0; 6713 } 6714 6715 /* 6716 * Validate the ioctl operation to be performed. 6717 * 6718 * If we have looped through the array without finding a match then we 6719 * don't support this ioctl.
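 * For example, a DKIOCGVTOC request matches the dk_ioctl[] entry
 * {VD_OP_GET_VTOC, DKIOCGVTOC, sizeof (vd_vtoc_t), vdc_get_vtoc_convert},
 * so len is set to sizeof (vd_vtoc_t) and vdc_get_vtoc_convert() is
 * used to translate between the vDisk and Solaris representations.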
6720 */ 6721 for (idx = 0; idx < nioctls; idx++) { 6722 if (cmd == dk_ioctl[idx].cmd) 6723 break; 6724 } 6725 6726 if (idx >= nioctls) { 6727 DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n", 6728 vdc->instance, cmd); 6729 return (ENOTSUP); 6730 } 6731 6732 iop = &(dk_ioctl[idx]); 6733 6734 if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) { 6735 /* size is not fixed for EFI ioctls, it depends on ioctl arg */ 6736 dk_efi_t dk_efi; 6737 6738 rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode); 6739 if (rv != 0) 6740 return (EFAULT); 6741 6742 len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length; 6743 } else { 6744 len = iop->nbytes; 6745 } 6746 6747 /* check if the ioctl is applicable */ 6748 switch (cmd) { 6749 case CDROMREADOFFSET: 6750 case DKIOCREMOVABLE: 6751 return (ENOTTY); 6752 6753 case USCSICMD: 6754 case MHIOCTKOWN: 6755 case MHIOCSTATUS: 6756 case MHIOCQRESERVE: 6757 case MHIOCRELEASE: 6758 case MHIOCGRP_INKEYS: 6759 case MHIOCGRP_INRESV: 6760 case MHIOCGRP_REGISTER: 6761 case MHIOCGRP_RESERVE: 6762 case MHIOCGRP_PREEMPTANDABORT: 6763 case MHIOCGRP_REGISTERANDIGNOREKEY: 6764 case MHIOCENFAILFAST: 6765 if (vdc->cinfo == NULL) 6766 return (ENXIO); 6767 if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS) 6768 return (ENOTTY); 6769 break; 6770 6771 case DIOCTL_RWCMD: 6772 if (vdc->cinfo == NULL) 6773 return (ENXIO); 6774 if (vdc->cinfo->dki_ctype != DKC_DIRECT) 6775 return (ENOTTY); 6776 break; 6777 6778 case DKIOCINFO: 6779 if (vdc->cinfo == NULL) 6780 return (ENXIO); 6781 break; 6782 6783 case DKIOCGMEDIAINFO: 6784 if (vdc->minfo == NULL) 6785 return (ENXIO); 6786 if (vdc_check_capacity(vdc) != 0) 6787 /* disk capacity is not available */ 6788 return (EIO); 6789 break; 6790 } 6791 6792 /* 6793 * Deal with ioctls which require a processing different than 6794 * converting ioctl arguments and sending a corresponding 6795 * VD operation. 6796 */ 6797 switch (cmd) { 6798 6799 case USCSICMD: 6800 { 6801 return (vdc_uscsi_cmd(vdc, arg, mode)); 6802 } 6803 6804 case MHIOCTKOWN: 6805 { 6806 mutex_enter(&vdc->ownership_lock); 6807 /* 6808 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership 6809 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset 6810 * while we are processing the ioctl. 6811 */ 6812 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED); 6813 6814 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6815 VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode); 6816 if (rv == 0) { 6817 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED | 6818 VDC_OWNERSHIP_GRANTED); 6819 } else { 6820 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6821 } 6822 mutex_exit(&vdc->ownership_lock); 6823 return (rv); 6824 } 6825 6826 case MHIOCRELEASE: 6827 { 6828 mutex_enter(&vdc->ownership_lock); 6829 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode); 6830 if (rv == 0) { 6831 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6832 } 6833 mutex_exit(&vdc->ownership_lock); 6834 return (rv); 6835 } 6836 6837 case MHIOCSTATUS: 6838 { 6839 uint64_t status; 6840 6841 rv = vdc_access_get(vdc, &status, mode); 6842 if (rv == 0 && rvalp != NULL) 6843 *rvalp = (status & VD_ACCESS_ALLOWED)? 
0 : 1; 6844 return (rv); 6845 } 6846 6847 case MHIOCQRESERVE: 6848 { 6849 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode); 6850 return (rv); 6851 } 6852 6853 case MHIOCGRP_INKEYS: 6854 { 6855 return (vdc_mhd_inkeys(vdc, arg, mode)); 6856 } 6857 6858 case MHIOCGRP_INRESV: 6859 { 6860 return (vdc_mhd_inresv(vdc, arg, mode)); 6861 } 6862 6863 case MHIOCGRP_REGISTER: 6864 { 6865 return (vdc_mhd_register(vdc, arg, mode)); 6866 } 6867 6868 case MHIOCGRP_RESERVE: 6869 { 6870 return (vdc_mhd_reserve(vdc, arg, mode)); 6871 } 6872 6873 case MHIOCGRP_PREEMPTANDABORT: 6874 { 6875 return (vdc_mhd_preemptabort(vdc, arg, mode)); 6876 } 6877 6878 case MHIOCGRP_REGISTERANDIGNOREKEY: 6879 { 6880 return (vdc_mhd_registerignore(vdc, arg, mode)); 6881 } 6882 6883 case MHIOCENFAILFAST: 6884 { 6885 rv = vdc_failfast(vdc, arg, mode); 6886 return (rv); 6887 } 6888 6889 case DIOCTL_RWCMD: 6890 { 6891 return (vdc_dioctl_rwcmd(dev, arg, mode)); 6892 } 6893 6894 case DKIOCGAPART: 6895 { 6896 return (vdc_dkio_gapart(vdc, arg, mode)); 6897 } 6898 6899 case DKIOCPARTITION: 6900 { 6901 return (vdc_dkio_partition(vdc, arg, mode)); 6902 } 6903 6904 case DKIOCINFO: 6905 { 6906 struct dk_cinfo cinfo; 6907 6908 bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo)); 6909 cinfo.dki_partition = VDCPART(dev); 6910 6911 rv = ddi_copyout(&cinfo, (void *)arg, 6912 sizeof (struct dk_cinfo), mode); 6913 if (rv != 0) 6914 return (EFAULT); 6915 6916 return (0); 6917 } 6918 6919 case DKIOCGMEDIAINFO: 6920 { 6921 ASSERT(vdc->vdisk_size != 0); 6922 if (vdc->minfo->dki_capacity == 0) 6923 vdc->minfo->dki_capacity = vdc->vdisk_size; 6924 rv = ddi_copyout(vdc->minfo, (void *)arg, 6925 sizeof (struct dk_minfo), mode); 6926 if (rv != 0) 6927 return (EFAULT); 6928 6929 return (0); 6930 } 6931 6932 case DKIOCFLUSHWRITECACHE: 6933 { 6934 struct dk_callback *dkc = 6935 (struct dk_callback *)(uintptr_t)arg; 6936 vdc_dk_arg_t *dkarg = NULL; 6937 6938 DMSG(vdc, 1, "[%d] Flush W$: mode %x\n", 6939 instance, mode); 6940 6941 /* 6942 * If arg is NULL, then there is no callback function 6943 * registered and the call operates synchronously; we 6944 * break and continue with the rest of the function and 6945 * wait for vds to return (i.e. after the request to 6946 * vds returns successfully, all writes completed prior 6947 * to the ioctl will have been flushed from the disk 6948 * write cache to persistent media). 6949 * 6950 * If a callback function is registered, we dispatch 6951 * the request on a task queue and return immediately. 6952 * The callback will deal with informing the calling 6953 * thread that the flush request is completed. 6954 */ 6955 if (dkc == NULL) 6956 break; 6957 6958 /* 6959 * the asynchronous callback is only supported if 6960 * invoked from within the kernel 6961 */ 6962 if ((mode & FKIOCTL) == 0) 6963 return (ENOTSUP); 6964 6965 dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP); 6966 6967 dkarg->mode = mode; 6968 dkarg->dev = dev; 6969 bcopy(dkc, &dkarg->dkc, sizeof (*dkc)); 6970 6971 mutex_enter(&vdc->lock); 6972 vdc->dkio_flush_pending++; 6973 dkarg->vdc = vdc; 6974 mutex_exit(&vdc->lock); 6975 6976 /* put the request on a task queue */ 6977 rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb, 6978 (void *)dkarg, DDI_SLEEP); 6979 if (rv == NULL) { 6980 /* clean up if dispatch fails */ 6981 mutex_enter(&vdc->lock); 6982 vdc->dkio_flush_pending--; 6983 mutex_exit(&vdc->lock); 6984 kmem_free(dkarg, sizeof (vdc_dk_arg_t)); 6985 } 6986 6987 return (rv == NULL ?
ENOMEM : 0); 6988 } 6989 } 6990 6991 /* catch programming error in vdc - should be a VD_OP_XXX ioctl */ 6992 ASSERT(iop->op != 0); 6993 6994 /* check if the vDisk server handles the operation for this vDisk */ 6995 if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) { 6996 DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n", 6997 vdc->instance, iop->op); 6998 return (ENOTSUP); 6999 } 7000 7001 /* LDC requires that the memory being mapped is 8-byte aligned */ 7002 alloc_len = P2ROUNDUP(len, sizeof (uint64_t)); 7003 DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n", 7004 instance, len, alloc_len); 7005 7006 if (alloc_len > 0) 7007 mem_p = kmem_zalloc(alloc_len, KM_SLEEP); 7008 7009 /* 7010 * Call the conversion function for this ioctl which, if necessary, 7011 * converts from the Solaris format to the format ARC'ed 7012 * as part of the vDisk protocol (FWARC 2006/195). 7013 */ 7014 ASSERT(iop->convert != NULL); 7015 rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN); 7016 if (rv != 0) { 7017 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7018 instance, rv, cmd); 7019 if (mem_p != NULL) 7020 kmem_free(mem_p, alloc_len); 7021 return (rv); 7022 } 7023 7024 /* 7025 * Send the request to vds to service the ioctl. 7026 */ 7027 rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len, 7028 VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode, 7029 VIO_both_dir, B_TRUE); 7030 7031 if (rv != 0) { 7032 /* 7033 * This is not necessarily an error. The ioctl could 7034 * be returning a value such as ENOTTY to indicate 7035 * that the ioctl is not applicable. 7036 */ 7037 DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n", 7038 instance, rv, cmd); 7039 if (mem_p != NULL) 7040 kmem_free(mem_p, alloc_len); 7041 7042 return (rv); 7043 } 7044 7045 /* 7046 * Call the conversion function (if it exists) for this ioctl 7047 * which converts from the format ARC'ed as part of the vDisk 7048 * protocol (FWARC 2006/195) back to a format understood by 7049 * the rest of Solaris.
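 * Continuing the DKIOCGVTOC example, vdc_get_vtoc_convert() is invoked
 * here with dir == VD_COPYOUT: it expands the vd_vtoc_t returned by
 * the server into a struct vtoc (or struct vtoc32 for ILP32 callers)
 * and copies the result out to the user buffer.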
7050 */ 7051 rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT); 7052 if (rv != 0) { 7053 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7054 instance, rv, cmd); 7055 if (mem_p != NULL) 7056 kmem_free(mem_p, alloc_len); 7057 return (rv); 7058 } 7059 7060 if (mem_p != NULL) 7061 kmem_free(mem_p, alloc_len); 7062 7063 return (rv); 7064 } 7065 7066 /* 7067 * Function: 7068 * vdc_null_copy_func() 7069 * 7070 * Description: 7071 * This is an empty conversion function used by ioctl calls which 7072 * do not need to convert the data being passed in/out to userland. 7073 */ 7074 static int 7075 vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir) 7076 { 7077 _NOTE(ARGUNUSED(vdc)) 7078 _NOTE(ARGUNUSED(from)) 7079 _NOTE(ARGUNUSED(to)) 7080 _NOTE(ARGUNUSED(mode)) 7081 _NOTE(ARGUNUSED(dir)) 7082 7083 return (0); 7084 } 7085 7086 static int 7087 vdc_get_wce_convert(vdc_t *vdc, void *from, void *to, 7088 int mode, int dir) 7089 { 7090 _NOTE(ARGUNUSED(vdc)) 7091 7092 if (dir == VD_COPYIN) 7093 return (0); /* nothing to do */ 7094 7095 if (ddi_copyout(from, to, sizeof (int), mode) != 0) 7096 return (EFAULT); 7097 7098 return (0); 7099 } 7100 7101 static int 7102 vdc_set_wce_convert(vdc_t *vdc, void *from, void *to, 7103 int mode, int dir) 7104 { 7105 _NOTE(ARGUNUSED(vdc)) 7106 7107 if (dir == VD_COPYOUT) 7108 return (0); /* nothing to do */ 7109 7110 if (ddi_copyin(from, to, sizeof (int), mode) != 0) 7111 return (EFAULT); 7112 7113 return (0); 7114 } 7115 7116 /* 7117 * Function: 7118 * vdc_get_vtoc_convert() 7119 * 7120 * Description: 7121 * This routine performs the necessary conversions from the DKIOCGVTOC 7122 * Solaris structure to the format defined in FWARC 2006/195. 7123 * 7124 * In the struct vtoc definition, the timestamp field is marked as not 7125 * supported so it is not part of the vDisk protocol (FWARC 2006/195). 7126 * However SVM uses that field to check that it can write to the VTOC, 7127 * so we fake up the contents of that field. 7128 * 7129 * Arguments: 7130 * vdc - the vDisk client 7131 * from - the buffer containing the data to be copied from 7132 * to - the buffer to be copied to 7133 * mode - flags passed to ioctl() call 7134 * dir - the "direction" of the copy - VD_COPYIN or VD_COPYOUT 7135 * 7136 * Return Code: 7137 * 0 - Success 7138 * ENXIO - incorrect buffer passed in. 7139 * EFAULT - ddi_copyout routine encountered an error.
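 *
 * Note that this converter only acts when dir == VD_COPYOUT: for
 * DKIOCGVTOC the VTOC data flows from the server to the caller, so
 * there is nothing to copy in on the way to the server.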
7139 */ 7140 static int 7141 vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7142 { 7143 int i; 7144 7145 void *tmp_memp; 7146 struct vtoc vt; 7147 struct vtoc32 vt32; 7148 int copy_len = 0; 7149 int rv = 0; 7150 7151 if (dir != VD_COPYOUT) 7152 return (0); /* nothing to do */ 7153 7154 if ((from == NULL) || (to == NULL)) 7155 return (ENXIO); 7156 7157 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) 7158 copy_len = sizeof (struct vtoc32); 7159 else 7160 copy_len = sizeof (struct vtoc); 7161 7162 7163 7164 VD_VTOC2VTOC((vd_vtoc_t *)from, &vt); 7165 7166 /* fake the VTOC timestamp field */ 7167 for (i = 0; i < V_NUMPAR; i++) { 7168 vt.timestamp[i] = vdc->vtoc->timestamp[i]; 7169 } 7170 7171 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7172 /* LINTED E_ASSIGN_NARROW_CONV */ 7173 vtoctovtoc32(vt, vt32); 7174 tmp_memp = &vt32; 7175 } else { 7176 tmp_memp = &vt; 7177 } 7178 rv = ddi_copyout(tmp_memp, to, copy_len, mode); 7179 if (rv != 0) 7180 rv = EFAULT; 7181 7182 7183 return (rv); 7184 } 7185 7186 /* 7187 * Function: 7188 * vdc_set_vtoc_convert() 7189 * 7190 * Description: 7191 * This routine performs the necessary conversions from the DKIOCSVTOC 7192 * Solaris structure to the format defined in FWARC 2006/195. 7193 * 7194 * Arguments: 7195 * vdc - the vDisk client 7196 * from - Buffer with data 7197 * to - Buffer where data is to be copied to 7198 * mode - flags passed to ioctl 7199 * dir - direction of copy (in or out) 7200 * 7201 * Return Code: 7202 * 0 - Success 7203 * ENXIO - Invalid buffer passed in 7204 * EFAULT - ddi_copyin of data failed 7205 */ 7206 static int 7207 vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7208 { 7209 7210 7211 void *tmp_mem = NULL, *uvtoc; 7212 struct vtoc vt; 7213 struct vtoc *vtp = &vt; 7214 vd_vtoc_t vtvd; 7215 int copy_len = 0; 7216 int i, rv = 0; 7217 7218 if ((from == NULL) || (to == NULL)) 7219 return (ENXIO); 7220 7221 if (dir == VD_COPYIN) 7222 uvtoc = from; 7223 else 7224 uvtoc = to; 7225 7226 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) 7227 copy_len = sizeof (struct vtoc32); 7228 else 7229 copy_len = sizeof (struct vtoc); 7230 7231 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7232 7233 rv = ddi_copyin(uvtoc, tmp_mem, copy_len, mode); 7234 if (rv != 0) { 7235 kmem_free(tmp_mem, copy_len); 7236 return (EFAULT); 7237 } 7238 7239 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7240 vtoc32tovtoc((*(struct vtoc32 *)tmp_mem), vt); 7241 } else { 7242 vtp = tmp_mem; 7243 } 7244 7245 if (dir == VD_COPYOUT) { 7246 /* 7247 * The disk label may have changed. Revalidate the disk 7248 * geometry. This will also update the device nodes. 7249 */ 7250 vdc_validate(vdc); 7251 7252 /* 7253 * We also need to keep track of the timestamp fields.
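 * The timestamps are not part of the vDisk protocol (FWARC 2006/195),
 * so the copy kept in vdc->vtoc is the only place where the values set
 * by DKIOCSVTOC survive; vdc_get_vtoc_convert() reads them back from
 * there to fake up the field on a subsequent DKIOCGVTOC.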
7254 */ 7255 for (i = 0; i < V_NUMPAR; i++) { 7256 vdc->vtoc->timestamp[i] = vtp->timestamp[i]; 7257 } 7258 kmem_free(tmp_mem, copy_len); 7259 return (0); 7260 } 7261 7262 VTOC2VD_VTOC(vtp, &vtvd); 7263 bcopy(&vtvd, to, sizeof (vd_vtoc_t)); 7264 kmem_free(tmp_mem, copy_len); 7265 7266 return (0); 7267 } 7268 7269 /* 7270 * Function: 7271 * vdc_get_geom_convert() 7272 * 7273 * Description: 7274 * This routine performs the necessary conversions from the DKIOCGGEOM, 7275 * DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format 7276 * defined in FWARC 2006/195. 7277 * 7278 * Arguments: 7279 * vdc - the vDisk client 7280 * from - Buffer with data 7281 * to - Buffer where data is to be copied to 7282 * mode - flags passed to ioctl 7283 * dir - direction of copy (in or out) 7284 * 7285 * Return Code: 7286 * 0 - Success 7287 * ENXIO - Invalid buffer passed in 7288 * EFAULT - ddi_copyout of data failed 7289 */ 7290 static int 7291 vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7292 { 7293 _NOTE(ARGUNUSED(vdc)) 7294 7295 struct dk_geom geom; 7296 int copy_len = sizeof (struct dk_geom); 7297 int rv = 0; 7298 7299 if (dir != VD_COPYOUT) 7300 return (0); /* nothing to do */ 7301 7302 if ((from == NULL) || (to == NULL)) 7303 return (ENXIO); 7304 7305 VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom); 7306 rv = ddi_copyout(&geom, to, copy_len, mode); 7307 if (rv != 0) 7308 rv = EFAULT; 7309 7310 return (rv); 7311 } 7312 7313 /* 7314 * Function: 7315 * vdc_set_geom_convert() 7316 * 7317 * Description: 7318 * This routine performs the necessary conversions from the DKIOCSGEOM 7319 * Solaris structure to the format defined in FWARC 2006/195. 7320 * 7321 * Arguments: 7322 * vdc - the vDisk client 7323 * from - Buffer with data 7324 * to - Buffer where data is to be copied to 7325 * mode - flags passed to ioctl 7326 * dir - direction of copy (in or out) 7327 * 7328 * Return Code: 7329 * 0 - Success 7330 * ENXIO - Invalid buffer passed in 7331 * EFAULT - ddi_copyin of data failed 7332 */ 7333 static int 7334 vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7335 { 7336 _NOTE(ARGUNUSED(vdc)) 7337 7338 vd_geom_t vdgeom; 7339 void *tmp_mem = NULL; 7340 int copy_len = sizeof (struct dk_geom); 7341 int rv = 0; 7342 7343 if (dir != VD_COPYIN) 7344 return (0); /* nothing to do */ 7345 7346 if ((from == NULL) || (to == NULL)) 7347 return (ENXIO); 7348 7349 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7350 7351 rv = ddi_copyin(from, tmp_mem, copy_len, mode); 7352 if (rv != 0) { 7353 kmem_free(tmp_mem, copy_len); 7354 return (EFAULT); 7355 } 7356 DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom); 7357 bcopy(&vdgeom, to, sizeof (vdgeom)); 7358 kmem_free(tmp_mem, copy_len); 7359 7360 return (0); 7361 } 7362 7363 static int 7364 vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7365 { 7366 _NOTE(ARGUNUSED(vdc)) 7367 7368 vd_efi_t *vd_efi; 7369 dk_efi_t dk_efi; 7370 int rv = 0; 7371 void *uaddr; 7372 7373 if ((from == NULL) || (to == NULL)) 7374 return (ENXIO); 7375 7376 if (dir == VD_COPYIN) { 7377 7378 vd_efi = (vd_efi_t *)to; 7379 7380 rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode); 7381 if (rv != 0) 7382 return (EFAULT); 7383 7384 vd_efi->lba = dk_efi.dki_lba; 7385 vd_efi->length = dk_efi.dki_length; 7386 bzero(vd_efi->data, vd_efi->length); 7387 7388 } else { 7389 7390 rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode); 7391 if (rv != 0) 7392 return (EFAULT); 7393 7394 uaddr = dk_efi.dki_data; 7395 7396 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);
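		/*
		 * dki_data now points to a kernel staging buffer while
		 * the original user address has been saved in uaddr;
		 * the EFI data converted below is built in the kernel
		 * buffer and then copied out to uaddr.
		 */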
7397 7398 VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi); 7399 7400 rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length, 7401 mode); 7402 7403 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7404 if (rv != 0) 7405 return (EFAULT); 7406 } 7407 7408 return (0); 7409 } 7410 7411 static int 7412 vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7413 { 7414 7415 7416 dk_efi_t dk_efi; 7417 void *uaddr; 7418 7419 if (dir == VD_COPYOUT) { 7420 /* 7421 * The disk label may have changed. Revalidate the disk 7422 * geometry. This will also update the device nodes. 7423 */ 7424 vdc_validate(vdc); 7425 return (0); 7426 } 7427 7428 if ((from == NULL) || (to == NULL)) 7429 return (ENXIO); 7430 7431 if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0) 7432 return (EFAULT); 7433 7434 uaddr = dk_efi.dki_data; 7435 7436 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 7437 7438 if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) { 7439 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7440 return (EFAULT); 7441 } 7442 7443 DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to); 7444 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7445 7446 return (0); 7447 } 7448 7449 /* -------------------------------------------------------------------------- */ 7450 7451 /* 7452 * Function: 7453 * vdc_create_fake_geometry() 7454 * 7455 * Description: 7456 * This routine fakes up the disk info needed for some DKIO ioctls such 7457 * as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do]. 7458 * 7459 * Note: This function must not be called until the vDisk attributes have 7460 * been exchanged as part of the handshake with the vDisk server. 7461 * 7462 * Arguments: 7463 * vdc - soft state pointer for this instance of the device driver. 7464 * 7465 * Return Code: 7466 * none. 7467 */ 7468 static void 7469 vdc_create_fake_geometry(vdc_t *vdc) 7470 { 7471 ASSERT(vdc != NULL); 7472 ASSERT(vdc->max_xfer_sz != 0); 7473 7474 /* 7475 * DKIOCINFO support 7476 */ 7477 if (vdc->cinfo == NULL) 7478 vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); 7479 7480 (void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME); 7481 (void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME); 7482 /* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */ 7483 vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz; 7484 7485 /* 7486 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD 7487 * operation is supported, otherwise the controller type is DKC_DIRECT. 7488 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the 7489 * controller type is always DKC_DIRECT in that case.
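 * (The set of operations exported by the vDisk server is negotiated
 * during the attribute exchange and recorded in vdc->operations, which
 * is what VD_OP_SUPPORTED() tests here.)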
7490 * 7491 * If the virtual disk is backed by a physical CD/DVD device or 7492 * an ISO image, modify the controller type to indicate this 7493 */ 7494 switch (vdc->vdisk_media) { 7495 case VD_MEDIA_CD: 7496 case VD_MEDIA_DVD: 7497 vdc->cinfo->dki_ctype = DKC_CDROM; 7498 break; 7499 case VD_MEDIA_FIXED: 7500 if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD)) 7501 vdc->cinfo->dki_ctype = DKC_SCSI_CCS; 7502 else 7503 vdc->cinfo->dki_ctype = DKC_DIRECT; 7504 break; 7505 default: 7506 /* in the case of v1.0 we default to a fixed disk */ 7507 vdc->cinfo->dki_ctype = DKC_DIRECT; 7508 break; 7509 } 7510 vdc->cinfo->dki_flags = DKI_FMTVOL; 7511 vdc->cinfo->dki_cnum = 0; 7512 vdc->cinfo->dki_addr = 0; 7513 vdc->cinfo->dki_space = 0; 7514 vdc->cinfo->dki_prio = 0; 7515 vdc->cinfo->dki_vec = 0; 7516 vdc->cinfo->dki_unit = vdc->instance; 7517 vdc->cinfo->dki_slave = 0; 7518 /* 7519 * The partition number will be created on the fly depending on the 7520 * actual slice (i.e. minor node) that is used to request the data. 7521 */ 7522 vdc->cinfo->dki_partition = 0; 7523 7524 /* 7525 * DKIOCGMEDIAINFO support 7526 */ 7527 if (vdc->minfo == NULL) 7528 vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); 7529 7530 if (vio_ver_is_supported(vdc->ver, 1, 1)) { 7531 vdc->minfo->dki_media_type = 7532 VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media); 7533 } else { 7534 vdc->minfo->dki_media_type = DK_FIXED_DISK; 7535 } 7536 7537 vdc->minfo->dki_capacity = vdc->vdisk_size; 7538 vdc->minfo->dki_lbsize = vdc->block_size; 7539 } 7540 7541 static ushort_t 7542 vdc_lbl2cksum(struct dk_label *label) 7543 { 7544 int count; 7545 ushort_t sum, *sp; 7546 7547 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 7548 sp = (ushort_t *)label; 7549 sum = 0; 7550 while (count--) { 7551 sum ^= *sp++; 7552 } 7553 7554 return (sum); 7555 } 7556 7557 /* 7558 * Function: 7559 * vdc_validate_geometry 7560 * 7561 * Description: 7562 * This routine discovers the label and geometry of the disk. It stores 7563 * the disk label and related information in the vdc structure. If it 7564 * fails to validate the geometry or to discover the disk label then 7565 * the label is marked as unknown (VD_DISK_LABEL_UNK). 7566 * 7567 * Arguments: 7568 * vdc - soft state pointer for this instance of the device driver. 7569 * 7570 * Return Code: 7571 * 0 - success. 7572 * EINVAL - unknown disk label. 7573 * ENOTSUP - geometry not applicable (EFI label). 7574 * EIO - error accessing the disk. 7575 */ 7576 static int 7577 vdc_validate_geometry(vdc_t *vdc) 7578 { 7579 buf_t *buf; /* BREAD requests need to be in a buf_t structure */ 7580 dev_t dev; 7581 int rv, rval; 7582 struct dk_label label; 7583 struct dk_geom geom; 7584 struct vtoc vtoc; 7585 efi_gpt_t *gpt; 7586 efi_gpe_t *gpe; 7587 vd_efi_dev_t edev; 7588 7589 ASSERT(vdc != NULL); 7590 ASSERT(vdc->vtoc != NULL && vdc->geom != NULL); 7591 ASSERT(MUTEX_HELD(&vdc->lock)); 7592 7593 mutex_exit(&vdc->lock); 7594 7595 dev = makedevice(ddi_driver_major(vdc->dip), 7596 VD_MAKE_DEV(vdc->instance, 0)); 7597 7598 rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval); 7599 if (rv == 0) 7600 rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc, 7601 FKIOCTL, &rval); 7602 7603 if (rv == ENOTSUP) { 7604 /* 7605 * If the device does not support VTOC then we try 7606 * to read an EFI label. 7607 * 7608 * We need to know the block size and the disk size to 7609 * be able to read an EFI label. 
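 * The disk size is obtained with vdc_check_capacity(), which issues a
 * VD_OP_GET_CAPACITY request to the server if the size was not
 * provided during the attribute exchange.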
7610 */ 7611 if (vdc->vdisk_size == 0) { 7612 if ((rv = vdc_check_capacity(vdc)) != 0) { 7613 mutex_enter(&vdc->lock); 7614 vdc_store_label_unk(vdc); 7615 return (rv); 7616 } 7617 } 7618 7619 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 7620 7621 rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe); 7622 7623 if (rv) { 7624 DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)", 7625 vdc->instance, rv); 7626 mutex_enter(&vdc->lock); 7627 vdc_store_label_unk(vdc); 7628 return (EIO); 7629 } 7630 7631 mutex_enter(&vdc->lock); 7632 vdc_store_label_efi(vdc, gpt, gpe); 7633 vd_efi_free(&edev, gpt, gpe); 7634 return (ENOTSUP); 7635 } 7636 7637 if (rv != 0) { 7638 DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)", 7639 vdc->instance, rv); 7640 mutex_enter(&vdc->lock); 7641 vdc_store_label_unk(vdc); 7642 if (rv != EINVAL) 7643 rv = EIO; 7644 return (rv); 7645 } 7646 7647 /* check that geometry and vtoc are valid */ 7648 if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 || 7649 vtoc.v_sanity != VTOC_SANE) { 7650 mutex_enter(&vdc->lock); 7651 vdc_store_label_unk(vdc); 7652 return (EINVAL); 7653 } 7654 7655 /* 7656 * We have a disk and a valid VTOC. However, this does not mean 7657 * that the disk currently has a VTOC label. The returned VTOC may 7658 * be a default VTOC to be used for configuring the disk (this is 7659 * what is done for disk images). So we read the label from the 7660 * beginning of the disk to ensure we really have a VTOC label. 7661 * 7662 * FUTURE: This could be the default way for reading the VTOC 7663 * from the disk as opposed to sending the VD_OP_GET_VTOC 7664 * to the server. This will be the default if vdc is implemented 7665 * on top of cmlb. 7666 */ 7667 7668 /* 7669 * A single-slice disk does not support reads using an absolute disk 7670 * offset, so we just rely on the DKIOCGVTOC ioctl in that case. 7671 */ 7672 if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) { 7673 mutex_enter(&vdc->lock); 7674 if (vtoc.v_nparts != 1) { 7675 vdc_store_label_unk(vdc); 7676 return (EINVAL); 7677 } 7678 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7679 return (0); 7680 } 7681 7682 if (vtoc.v_nparts != V_NUMPAR) { 7683 mutex_enter(&vdc->lock); 7684 vdc_store_label_unk(vdc); 7685 return (EINVAL); 7686 } 7687 7688 /* 7689 * Read disk label from start of disk 7690 */ 7691 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 7692 bioinit(buf); 7693 buf->b_un.b_addr = (caddr_t)&label; 7694 buf->b_bcount = DK_LABEL_SIZE; 7695 buf->b_flags = B_BUSY | B_READ; 7696 buf->b_dev = cmpdev(dev); 7697 rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label, 7698 DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir); 7699 if (rv) { 7700 DMSG(vdc, 1, "[%d] Failed to read disk block 0\n", 7701 vdc->instance); 7702 } else { 7703 rv = biowait(buf); 7704 biofini(buf); 7705 } 7706 kmem_free(buf, sizeof (buf_t)); 7707 7708 if (rv != 0 || label.dkl_magic != DKL_MAGIC || 7709 label.dkl_cksum != vdc_lbl2cksum(&label)) { 7710 DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n", 7711 vdc->instance); 7712 mutex_enter(&vdc->lock); 7713 vdc_store_label_unk(vdc); 7714 return (EINVAL); 7715 } 7716 7717 mutex_enter(&vdc->lock); 7718 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7719 return (0); 7720 } 7721 7722 /* 7723 * Function: 7724 * vdc_validate 7725 * 7726 * Description: 7727 * This routine discovers the label of the disk and creates the 7728 * appropriate device nodes if the label has changed. 7729 * 7730 * Arguments: 7731 * vdc - soft state pointer for this instance of the device driver. 7732 * 7733 * Return Code: 7734 * none.
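 *
 * Note:
 *	The caller must not hold vdc->lock; this routine acquires and
 *	releases it itself. It can also be dispatched from a task queue
 *	via vdc_validate_task().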
7735 */ 7736 static void 7737 vdc_validate(vdc_t *vdc) 7738 { 7739 vd_disk_label_t old_label; 7740 vd_slice_t old_slice[V_NUMPAR]; 7741 int rv; 7742 7743 ASSERT(!MUTEX_HELD(&vdc->lock)); 7744 7745 mutex_enter(&vdc->lock); 7746 7747 /* save the current label and vtoc */ 7748 old_label = vdc->vdisk_label; 7749 bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR); 7750 7751 /* check the geometry */ 7752 (void) vdc_validate_geometry(vdc); 7753 7754 /* if the disk label has changed, update device nodes */ 7755 if (vdc->vdisk_label != old_label) { 7756 7757 if (vdc->vdisk_label == VD_DISK_LABEL_EFI) 7758 rv = vdc_create_device_nodes_efi(vdc); 7759 else 7760 rv = vdc_create_device_nodes_vtoc(vdc); 7761 7762 if (rv != 0) { 7763 DMSG(vdc, 0, "![%d] Failed to update device nodes", 7764 vdc->instance); 7765 } 7766 } 7767 7768 mutex_exit(&vdc->lock); 7769 } 7770 7771 static void 7772 vdc_validate_task(void *arg) 7773 { 7774 vdc_t *vdc = (vdc_t *)arg; 7775 7776 vdc_validate(vdc); 7777 7778 mutex_enter(&vdc->lock); 7779 ASSERT(vdc->validate_pending > 0); 7780 vdc->validate_pending--; 7781 mutex_exit(&vdc->lock); 7782 } 7783 7784 /* 7785 * Function: 7786 * vdc_setup_devid() 7787 * 7788 * Description: 7789 * This routine discovers the devid of a vDisk. It requests the devid of 7790 * the underlying device from the vDisk server, builds an encapsulated 7791 * devid based on the retrieved devid and registers that new devid to 7792 * the vDisk. 7793 * 7794 * Arguments: 7795 * vdc - soft state pointer for this instance of the device driver. 7796 * 7797 * Return Code: 7798 * 0 - A devid was successfully registered for the vDisk 7799 */ 7800 static int 7801 vdc_setup_devid(vdc_t *vdc) 7802 { 7803 int rv; 7804 vd_devid_t *vd_devid; 7805 size_t bufsize, bufid_len, id_len; 7806 7807 /* 7808 * At first sight, we don't know the size of the devid that the 7809 * server will return but this size will be encoded into the 7810 * reply. So we do a first request using a default size, then we 7811 * check if this size was large enough. If not then we do a second 7812 * request with the correct size returned by the server. Note that 7813 * LDC requires the size to be 8-byte aligned. 7814 */ 7815 bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN), 7816 sizeof (uint64_t)); 7817 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7818 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7819 7820 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, 7821 bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 7822 7823 DMSG(vdc, 2, "sync_op returned %d\n", rv); 7824 7825 if (rv) { 7826 kmem_free(vd_devid, bufsize); 7827 return (rv); 7828 } 7829 7830 if (vd_devid->length > bufid_len) { 7831 /* 7832 * The returned devid is larger than the buffer used. Try again 7833 * with a buffer with the right size. 7834 */ 7835 id_len = vd_devid->length; 7836 kmem_free(vd_devid, bufsize); 7837 bufsize = P2ROUNDUP(VD_DEVID_SIZE(id_len), sizeof (uint64_t)); 7838 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7839 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7840 7841 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, 7842 (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0, 7843 VIO_both_dir, B_TRUE); 7844 7845 if (rv) { 7846 kmem_free(vd_devid, bufsize); 7847 return (rv); 7848 } 7849 } 7850 7851 /* 7852 * The virtual disk should have the same device id as the one associated 7853 * with the physical disk it is mapped on, otherwise sharing a disk 7854 * between an LDom and a non-LDom may not work (for example for a shared 7855 * SVM disk set).
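 * SVM, for example, identifies the disks of a shared disk set by their
 * device ids, so the guest domain and the other hosts must all derive
 * the same device id from the same backend device.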
7856 * 7857 * The DDI framework does not allow creating a device id with an 7858 * arbitrary type, so we first create a device id of type DEVID_ENCAP and 7859 * then we restore the original type of the physical device. 7860 */ 7861 7862 DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length); 7863 7864 /* build an encapsulated devid based on the returned devid */ 7865 if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length, 7866 vd_devid->id, &vdc->devid) != DDI_SUCCESS) { 7867 DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance); 7868 kmem_free(vd_devid, bufsize); 7869 return (1); 7870 } 7871 7872 DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type); 7873 7874 ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS); 7875 7876 kmem_free(vd_devid, bufsize); 7877 7878 if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) { 7879 DMSG(vdc, 1, "[%d] Failed to register devid\n", vdc->instance); 7880 return (1); 7881 } 7882 7883 return (0); 7884 } 7885 7886 static void 7887 vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe) 7888 { 7889 int i, nparts; 7890 7891 ASSERT(MUTEX_HELD(&vdc->lock)); 7892 7893 vdc->vdisk_label = VD_DISK_LABEL_EFI; 7894 bzero(vdc->vtoc, sizeof (struct vtoc)); 7895 bzero(vdc->geom, sizeof (struct dk_geom)); 7896 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7897 7898 nparts = gpt->efi_gpt_NumberOfPartitionEntries; 7899 7900 for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) { 7901 7902 if (gpe[i].efi_gpe_StartingLBA == 0 || 7903 gpe[i].efi_gpe_EndingLBA == 0) { 7904 continue; 7905 } 7906 7907 vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA; 7908 vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA - 7909 gpe[i].efi_gpe_StartingLBA + 1; 7910 } 7911 7912 ASSERT(vdc->vdisk_size != 0); 7913 vdc->slice[VD_EFI_WD_SLICE].start = 0; 7914 vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size; 7915 7916 } 7917 7918 static void 7919 vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct vtoc *vtoc) 7920 { 7921 int i; 7922 7923 ASSERT(MUTEX_HELD(&vdc->lock)); 7924 ASSERT(vdc->block_size == vtoc->v_sectorsz); 7925 7926 vdc->vdisk_label = VD_DISK_LABEL_VTOC; 7927 bcopy(vtoc, vdc->vtoc, sizeof (struct vtoc)); 7928 bcopy(geom, vdc->geom, sizeof (struct dk_geom)); 7929 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7930 7931 for (i = 0; i < vtoc->v_nparts; i++) { 7932 vdc->slice[i].start = vtoc->v_part[i].p_start; 7933 vdc->slice[i].nblocks = vtoc->v_part[i].p_size; 7934 } 7935 } 7936 7937 static void 7938 vdc_store_label_unk(vdc_t *vdc) 7939 { 7940 ASSERT(MUTEX_HELD(&vdc->lock)); 7941 7942 vdc->vdisk_label = VD_DISK_LABEL_UNK; 7943 bzero(vdc->vtoc, sizeof (struct vtoc)); 7944 bzero(vdc->geom, sizeof (struct dk_geom)); 7945 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7946 } 7947