/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */


/*
 * LDoms virtual disk client (vdc) device driver
 *
 * This driver runs on a guest logical domain and communicates with the
 * virtual disk server (vds) driver running on the service domain which is
 * exporting virtualized "disks" to the guest logical domain.
 *
 * The driver can be divided into four sections:
 *
 * 1) generic device driver housekeeping
 *	_init, _fini, attach, detach, ops structures, etc.
 *
 * 2) communication channel setup
 *	Set up the communications link over the LDC channel that vdc uses to
 *	talk to the vDisk server. Initialize the descriptor ring which
 *	allows the LDC clients to transfer data via memory mappings.
 *
 * 3) Support exported to upper layers (filesystems, etc)
 *	The upper layers call into vdc via strategy(9E) and DKIO(7I)
 *	ioctl calls. vdc copies the data to be written into the descriptor
 *	ring, or maps the buffer that will hold the data read by the vDisk
 *	server into the descriptor ring. It then sends a message to the
 *	vDisk server requesting it to complete the operation.
 *
 * 4) Handling responses from the vDisk server.
 *	The vDisk server will ACK some or all of the messages vdc sends to it
 *	(this is configured during the handshake). Upon receipt of an ACK,
 *	vdc will check the descriptor ring and signal the upper layer
 *	code waiting on the I/O.
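 *
 * As a rough sketch of the read path, drawn from the code below: a read(2)
 * on a slice device enters vdc_read(), which hands vdc_strategy() to
 * physio(9F); vdc_strategy() calls vdc_send_request() to populate a
 * descriptor ring entry and notify the vDisk server, and the server's ACK
 * eventually completes the buf via bioerror(9F)/biodone(9F).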
 */

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kstat.h>
#include <sys/mach_descrip.h>
#include <sys/modctl.h>
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/vtoc.h>
#include <sys/archsystm.h>
#include <sys/sysmacros.h>

#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/scsi/impl/services.h>
#include <sys/scsi/targets/sddef.h>

#include <sys/ldoms.h>
#include <sys/ldc.h>
#include <sys/vio_common.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vdsk_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdc.h>

/*
 * function prototypes
 */

/* standard driver functions */
static int      vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
static int      vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int      vdc_strategy(struct buf *buf);
static int      vdc_print(dev_t dev, char *str);
static int      vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
static int      vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
static int      vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
static int      vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
                    cred_t *credp, int *rvalp);
static int      vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
static int      vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);

static int      vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
                    void *arg, void **resultp);
static int      vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int      vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int      vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
                    int mod_flags, char *name, caddr_t valuep, int *lengthp);

/* setup */
static void     vdc_min(struct buf *bufp);
static int      vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
static int      vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr);
static int      vdc_start_ldc_connection(vdc_t *vdc);
static int      vdc_create_device_nodes(vdc_t *vdc);
static int      vdc_create_device_nodes_efi(vdc_t *vdc);
static int      vdc_create_device_nodes_vtoc(vdc_t *vdc);
static void     vdc_create_io_kstats(vdc_t *vdc);
static void     vdc_create_err_kstats(vdc_t *vdc);
static void     vdc_set_err_kstats(vdc_t *vdc);
static int      vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
                    mde_cookie_t *vd_nodep);
static int      vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep);
static void     vdc_fini_ports(vdc_t *vdc);
static void     vdc_switch_server(vdc_t *vdcp);
static int      vdc_do_ldc_up(vdc_t *vdc);
static void     vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr);
static int      vdc_init_descriptor_ring(vdc_t *vdc);
static void     vdc_destroy_descriptor_ring(vdc_t *vdc);
static int      vdc_setup_devid(vdc_t *vdc);
static void     vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *);
static void     vdc_store_label_vtoc(vdc_t *, struct dk_geom *, struct vtoc *);
static void     vdc_store_label_unk(vdc_t *vdc);
static boolean_t vdc_is_opened(vdc_t *vdc);
static void     vdc_update_size(vdc_t *vdc, size_t, size_t, size_t);

/* handshake with vds */
static int      vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
static int      vdc_ver_negotiation(vdc_t *vdcp);
static int      vdc_init_attr_negotiation(vdc_t *vdc);
static int      vdc_attr_negotiation(vdc_t *vdcp);
static int      vdc_init_dring_negotiate(vdc_t *vdc);
static int      vdc_dring_negotiation(vdc_t *vdcp);
static int      vdc_send_rdx(vdc_t *vdcp);
static int      vdc_rdx_exchange(vdc_t *vdcp);
static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg);

/* processing incoming messages from vDisk server */
static void     vdc_process_msg_thread(vdc_t *vdc);
static int      vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp);

static uint_t   vdc_handle_cb(uint64_t event, caddr_t arg);
static int      vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg);
static int      vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg);
static int      vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
static int      vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
static int      vdc_send_request(vdc_t *vdcp, int operation,
                    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
                    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int      vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
static int      vdc_populate_descriptor(vdc_t *vdcp, int operation,
                    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
                    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int      vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
                    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
                    void *cb_arg, vio_desc_direction_t dir, boolean_t);

static int      vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
static int      vdc_drain_response(vdc_t *vdcp, struct buf *buf);
static int      vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
static int      vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
static int      vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);

/* dkio */
static int      vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode,
                    int *rvalp);
static int      vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg);
static void     vdc_create_fake_geometry(vdc_t *vdc);
static int      vdc_validate_geometry(vdc_t *vdc);
static void     vdc_validate(vdc_t *vdc);
static void     vdc_validate_task(void *arg);
static int      vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
                    int mode, int dir);
static int      vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
                    int mode, int dir);
static int      vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
                    int mode, int dir);
static int      vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
                    int mode, int dir);
static int      vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
                    int mode, int dir);
static int      vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
                    int mode, int dir);
static int      vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
                    int mode, int dir);
static int      vdc_get_efi_convert(vdc_t *vdc, void *from, void *to,
                    int mode, int dir);
static int      vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
                    int mode, int dir);

static void     vdc_ownership_update(vdc_t *vdc, int ownership_flags);
static int      vdc_access_set(vdc_t *vdc, uint64_t flags, int mode);
static vdc_io_t *vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf);
static int      vdc_failfast_check_resv(vdc_t *vdc);

/*
 * Module variables
 */

/*
 * Tunable variables to control how long vdc waits before timing out on
 * various operations
 */
static int      vdc_hshake_retries = 3;

static int      vdc_timeout = 0;                /* units: seconds */
static int      vdc_ldcup_timeout = 1;          /* units: seconds */

static uint64_t vdc_hz_min_ldc_delay;
static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC;
static uint64_t vdc_hz_max_ldc_delay;
static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC;

static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC;
static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC;

/* values for dumping - need to run in a tighter loop */
static uint64_t vdc_usec_timeout_dump = 100 * MILLISEC; /* 0.1s units: usec */
static int      vdc_dump_retries = 100;

static uint16_t vdc_scsi_timeout = 60;          /* 60s units: seconds */

static uint64_t vdc_ownership_delay = 6 * MICROSEC;     /* 6s units: usec */

/* Count of the number of vdc instances attached */
static volatile uint32_t vdc_instance_count = 0;

/* Tunable to log all SCSI errors */
static boolean_t vdc_scsi_log_error = B_FALSE;

/* Soft state pointer */
static void     *vdc_state;

/*
 * Controlling the verbosity of the error/debug messages
 *
 * vdc_msglevel - controls the level of messages
 * vdc_matchinst - 64-bit variable where each bit corresponds
 *                 to the vdc instance that vdc_msglevel applies to.
 */
int             vdc_msglevel = 0x0;
uint64_t        vdc_matchinst = 0ull;
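
/*
 * Note: the variables above are plain kernel variables, so they can in
 * principle be tuned at boot time from /etc/system (a sketch, assuming the
 * usual "set <module>:<symbol>" syntax reaches this module's symbols):
 *
 *      set vdc:vdc_timeout = 60
 *      set vdc:vdc_msglevel = 1
 */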

/*
 * Supported vDisk protocol version pairs.
 *
 * The first array entry is the latest and preferred version.
 */
static const vio_ver_t  vdc_version[] = {{1, 1}};

static struct cb_ops vdc_cb_ops = {
        vdc_open,       /* cb_open */
        vdc_close,      /* cb_close */
        vdc_strategy,   /* cb_strategy */
        vdc_print,      /* cb_print */
        vdc_dump,       /* cb_dump */
        vdc_read,       /* cb_read */
        vdc_write,      /* cb_write */
        vdc_ioctl,      /* cb_ioctl */
        nodev,          /* cb_devmap */
        nodev,          /* cb_mmap */
        nodev,          /* cb_segmap */
        nochpoll,       /* cb_chpoll */
        vdc_prop_op,    /* cb_prop_op */
        NULL,           /* cb_str */
        D_MP | D_64BIT, /* cb_flag */
        CB_REV,         /* cb_rev */
        vdc_aread,      /* cb_aread */
        vdc_awrite      /* cb_awrite */
};

static struct dev_ops vdc_ops = {
        DEVO_REV,       /* devo_rev */
        0,              /* devo_refcnt */
        vdc_getinfo,    /* devo_getinfo */
        nulldev,        /* devo_identify */
        nulldev,        /* devo_probe */
        vdc_attach,     /* devo_attach */
        vdc_detach,     /* devo_detach */
        nodev,          /* devo_reset */
        &vdc_cb_ops,    /* devo_cb_ops */
        NULL,           /* devo_bus_ops */
        nulldev         /* devo_power */
};

static struct modldrv modldrv = {
        &mod_driverops,
        "virtual disk client",
        &vdc_ops,
};

static struct modlinkage modlinkage = {
        MODREV_1,
        &modldrv,
        NULL
};

/* -------------------------------------------------------------------------- */

/*
 * Device Driver housekeeping and setup
 */

int
_init(void)
{
        int     status;

        if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
                return (status);
        if ((status = mod_install(&modlinkage)) != 0)
                ddi_soft_state_fini(&vdc_state);
        return (status);
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
        int     status;

        if ((status = mod_remove(&modlinkage)) != 0)
                return (status);
        ddi_soft_state_fini(&vdc_state);
        return (0);
}

static int
vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
        _NOTE(ARGUNUSED(dip))

        int     instance = VDCUNIT((dev_t)arg);
        vdc_t   *vdc = NULL;

        switch (cmd) {
        case DDI_INFO_DEVT2DEVINFO:
                if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                        *resultp = NULL;
                        return (DDI_FAILURE);
                }
                *resultp = vdc->dip;
                return (DDI_SUCCESS);
        case DDI_INFO_DEVT2INSTANCE:
                *resultp = (void *)(uintptr_t)instance;
                return (DDI_SUCCESS);
        default:
                *resultp = NULL;
                return (DDI_FAILURE);
        }
}

static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        kt_did_t        failfast_tid, ownership_tid;
        int             instance;
        int             rv;
        vdc_server_t    *srvr;
        vdc_t           *vdc = NULL;

        switch (cmd) {
        case DDI_DETACH:
                /* the real work happens below */
                break;
        case DDI_SUSPEND:
                /* nothing to do for this non-device */
                return (DDI_SUCCESS);
        default:
                return (DDI_FAILURE);
        }

        ASSERT(cmd == DDI_DETACH);
        instance = ddi_get_instance(dip);
        DMSGX(1, "[%d] Entered\n", instance);

        if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
                return (DDI_FAILURE);
        }

        /*
         * This function is called when vdc is detached or when it has failed
         * to attach. In the latter case, the attach may have failed before
         * the vdisk type was set, so we can't call vdc_is_opened(). However,
         * since the attach has failed, we know that the vdisk is not opened
         * and we can safely detach.
         */
        if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
                DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
                return (DDI_FAILURE);
        }

        if (vdc->dkio_flush_pending) {
                DMSG(vdc, 0,
                    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
                    instance, vdc->dkio_flush_pending);
                return (DDI_FAILURE);
        }

        if (vdc->validate_pending) {
                DMSG(vdc, 0,
                    "[%d] Cannot detach: %d outstanding validate requests\n",
                    instance, vdc->validate_pending);
                return (DDI_FAILURE);
        }

        DMSG(vdc, 0, "[%d] proceeding...\n", instance);

        /* If we took ownership, release ownership */
        mutex_enter(&vdc->ownership_lock);
        if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
                rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
                if (rv == 0) {
                        vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
                }
        }
        mutex_exit(&vdc->ownership_lock);

        /* mark instance as detaching */
        vdc->lifecycle = VDC_LC_DETACHING;

        /*
         * Try to disable callbacks to prevent another handshake. We have to
         * disable callbacks for all servers.
         */
        for (srvr = vdc->server_list; srvr != NULL; srvr = srvr->next) {
                rv = ldc_set_cb_mode(srvr->ldc_handle, LDC_CB_DISABLE);
                DMSG(vdc, 0, "callback disabled (ldc=%lu, rv=%d)\n",
                    srvr->ldc_id, rv);
        }

        if (vdc->initialized & VDC_THREAD) {
                mutex_enter(&vdc->read_lock);
                if ((vdc->read_state == VDC_READ_WAITING) ||
                    (vdc->read_state == VDC_READ_RESET)) {
                        vdc->read_state = VDC_READ_RESET;
                        cv_signal(&vdc->read_cv);
                }

                mutex_exit(&vdc->read_lock);

                /* wake up any thread waiting for connection to come online */
                mutex_enter(&vdc->lock);
                if (vdc->state == VDC_STATE_INIT_WAITING) {
                        DMSG(vdc, 0,
                            "[%d] write reset - move to resetting state...\n",
                            instance);
                        vdc->state = VDC_STATE_RESETTING;
                        cv_signal(&vdc->initwait_cv);
                }
                mutex_exit(&vdc->lock);

                /* now wait until state transitions to VDC_STATE_DETACH */
                thread_join(vdc->msg_proc_thr->t_did);
                ASSERT(vdc->state == VDC_STATE_DETACH);
                DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
                    vdc->instance);
        }

        mutex_enter(&vdc->lock);

        if (vdc->initialized & VDC_DRING)
                vdc_destroy_descriptor_ring(vdc);

        vdc_fini_ports(vdc);

        if (vdc->failfast_thread) {
                failfast_tid = vdc->failfast_thread->t_did;
                vdc->failfast_interval = 0;
                cv_signal(&vdc->failfast_cv);
        } else {
                failfast_tid = 0;
        }

        if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
                ownership_tid = vdc->ownership_thread->t_did;
                vdc->ownership = VDC_OWNERSHIP_NONE;
                cv_signal(&vdc->ownership_cv);
        } else {
                ownership_tid = 0;
        }

        mutex_exit(&vdc->lock);

        if (failfast_tid != 0)
                thread_join(failfast_tid);

        if (ownership_tid != 0)
                thread_join(ownership_tid);

        if (vdc->initialized & VDC_MINOR)
                ddi_remove_minor_node(dip, NULL);

        if (vdc->io_stats) {
                kstat_delete(vdc->io_stats);
                vdc->io_stats = NULL;
        }

        if (vdc->err_stats) {
                kstat_delete(vdc->err_stats);
                vdc->err_stats = NULL;
        }

        if (vdc->initialized & VDC_LOCKS) {
                mutex_destroy(&vdc->lock);
                mutex_destroy(&vdc->read_lock);
                mutex_destroy(&vdc->ownership_lock);
                cv_destroy(&vdc->initwait_cv);
                cv_destroy(&vdc->dring_free_cv);
                cv_destroy(&vdc->membind_cv);
                cv_destroy(&vdc->sync_pending_cv);
                cv_destroy(&vdc->sync_blocked_cv);
                cv_destroy(&vdc->read_cv);
                cv_destroy(&vdc->running_cv);
                cv_destroy(&vdc->ownership_cv);
                cv_destroy(&vdc->failfast_cv);
                cv_destroy(&vdc->failfast_io_cv);
        }

        if (vdc->minfo)
                kmem_free(vdc->minfo, sizeof (struct dk_minfo));

        if (vdc->cinfo)
                kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));

        if (vdc->vtoc)
                kmem_free(vdc->vtoc, sizeof (struct vtoc));

        if (vdc->geom)
                kmem_free(vdc->geom, sizeof (struct dk_geom));

        if (vdc->devid) {
                ddi_devid_unregister(dip);
                ddi_devid_free(vdc->devid);
        }

        if (vdc->initialized & VDC_SOFT_STATE)
                ddi_soft_state_free(vdc_state, instance);

        DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);

        return (DDI_SUCCESS);
}


static int
vdc_do_attach(dev_info_t *dip)
{
        int             instance;
        vdc_t           *vdc = NULL;
        int             status;
        md_t            *mdp;
        mde_cookie_t    vd_node;

        ASSERT(dip != NULL);

        instance = ddi_get_instance(dip);
        if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
                cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
                    instance);
                return (DDI_FAILURE);
        }

        if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
                return (DDI_FAILURE);
        }

        /*
         * Assigning the value to 'initialized' (rather than OR-ing it in)
         * zeroes out the variable; bits are then set in it to indicate
         * what has been done.
         */
        vdc->initialized = VDC_SOFT_STATE;

        vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
        vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);

        vdc->dip = dip;
        vdc->instance = instance;
        vdc->vdisk_type = VD_DISK_TYPE_UNK;
        vdc->vdisk_label = VD_DISK_LABEL_UNK;
        vdc->state = VDC_STATE_INIT;
        vdc->lifecycle = VDC_LC_ATTACHING;
        vdc->session_id = 0;
        vdc->block_size = DEV_BSIZE;
        vdc->max_xfer_sz = maxphys / DEV_BSIZE;

        /*
         * We assume, for now, that the vDisk server will export 'read'
         * operations to us at a minimum (this is needed because of checks
         * in vdc for supported operations early in the handshake process).
         * The vDisk server will return ENOTSUP if this is not the case.
         * The value will be overwritten during the attribute exchange with
         * the bitmask of operations exported by the server.
         */
        vdc->operations = VD_OP_MASK_READ;

        vdc->vtoc = NULL;
        vdc->geom = NULL;
        vdc->cinfo = NULL;
        vdc->minfo = NULL;

        mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
        cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
        cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
        cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);

        vdc->threads_pending = 0;
        vdc->sync_op_pending = B_FALSE;
        vdc->sync_op_blocked = B_FALSE;
        cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
        cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

        mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
        cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
        cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);

        /* init blocking msg read functionality */
        mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
        vdc->read_state = VDC_READ_IDLE;

        vdc->initialized |= VDC_LOCKS;

        /* get device and port MD node for this disk instance */
        if (vdc_get_md_node(dip, &mdp, &vd_node) != 0) {
                cmn_err(CE_NOTE, "[%d] Could not get machine description node",
                    instance);
                return (DDI_FAILURE);
        }

        if (vdc_init_ports(vdc, mdp, vd_node) != 0) {
                cmn_err(CE_NOTE, "[%d] Error initializing ports", instance);
                return (DDI_FAILURE);
        }

        (void) md_fini_handle(mdp);

        /* Create the kstats for saving the I/O statistics used by iostat(1M) */
        vdc_create_io_kstats(vdc);
        vdc_create_err_kstats(vdc);

        /* Initialize remaining structures before starting the msg thread */
        vdc->vdisk_label = VD_DISK_LABEL_UNK;
        vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP);
        vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
        vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

        /* initialize the thread responsible for managing state with server */
        vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
            vdc, 0, &p0, TS_RUN, minclsyspri);
        if (vdc->msg_proc_thr == NULL) {
                cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
                    instance);
                return (DDI_FAILURE);
        }

        vdc->initialized |= VDC_THREAD;

        atomic_inc_32(&vdc_instance_count);

        /*
         * Check the disk label. This will send requests and do the handshake.
         * We don't really care about the disk label now. What we really need
         * is the handshake to be done so that we know the type of the disk
         * (slice or full disk) and the appropriate device nodes can be
         * created.
         */

        mutex_enter(&vdc->lock);
        (void) vdc_validate_geometry(vdc);
        mutex_exit(&vdc->lock);

        /*
         * Now that we have the device info we can create the device nodes
         */
        status = vdc_create_device_nodes(vdc);
        if (status) {
                DMSG(vdc, 0, "[%d] Failed to create device nodes",
                    instance);
                goto return_status;
        }

        /*
         * Setup devid
         */
        if (vdc_setup_devid(vdc)) {
                DMSG(vdc, 0, "[%d] No device id available\n", instance);
        }

        /*
         * Fill in the fields of the error statistics kstat that were not
         * available when creating the kstat
         */
        vdc_set_err_kstats(vdc);

        ddi_report_dev(dip);
        vdc->lifecycle = VDC_LC_ONLINE;
        DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);

return_status:
        DMSG(vdc, 0, "[%d] Attach completed\n", instance);
        return (status);
}

static int
vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        int     status;

        switch (cmd) {
        case DDI_ATTACH:
                if ((status = vdc_do_attach(dip)) != 0)
                        (void) vdc_detach(dip, DDI_DETACH);
                return (status);
        case DDI_RESUME:
                /* nothing to do for this non-device */
                return (DDI_SUCCESS);
        default:
                return (DDI_FAILURE);
        }
}

static int
vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr)
{
        int             status = 0;
        ldc_status_t    ldc_state;
        ldc_attr_t      ldc_attr;

        ASSERT(vdc != NULL);
        ASSERT(srvr != NULL);

        ldc_attr.devclass = LDC_DEV_BLK;
        ldc_attr.instance = vdc->instance;
        ldc_attr.mode = LDC_MODE_UNRELIABLE;    /* unreliable transport */
        ldc_attr.mtu = VD_LDC_MTU;

        if ((srvr->state & VDC_LDC_INIT) == 0) {
                status = ldc_init(srvr->ldc_id, &ldc_attr,
                    &srvr->ldc_handle);
                if (status != 0) {
                        DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
                            vdc->instance, srvr->ldc_id, status);
                        return (status);
                }
                srvr->state |= VDC_LDC_INIT;
        }
        status = ldc_status(srvr->ldc_handle, &ldc_state);
        if (status != 0) {
                DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
                    vdc->instance, status);
                goto init_exit;
        }
        srvr->ldc_state = ldc_state;

        if ((srvr->state & VDC_LDC_CB) == 0) {
                status = ldc_reg_callback(srvr->ldc_handle, vdc_handle_cb,
                    (caddr_t)srvr);
                if (status != 0) {
                        DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
                            vdc->instance, status);
                        goto init_exit;
                }
                srvr->state |= VDC_LDC_CB;
        }

        /*
         * At this stage we have initialized LDC; we will now try to open
         * the connection.
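         *
         * (For reference, the bring-up sequence in this function is
         * ldc_init() -> ldc_reg_callback() -> ldc_open(); the channel
         * itself is only brought up later, via vdc_do_ldc_up().)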
         */
        if (srvr->ldc_state == LDC_INIT) {
                status = ldc_open(srvr->ldc_handle);
                if (status != 0) {
                        DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
                            vdc->instance, srvr->ldc_id, status);
                        goto init_exit;
                }
                srvr->state |= VDC_LDC_OPEN;
        }

init_exit:
        if (status) {
                vdc_terminate_ldc(vdc, srvr);
        }

        return (status);
}

static int
vdc_start_ldc_connection(vdc_t *vdc)
{
        int     status = 0;

        ASSERT(vdc != NULL);

        ASSERT(MUTEX_HELD(&vdc->lock));

        status = vdc_do_ldc_up(vdc);

        DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);

        return (status);
}

static int
vdc_stop_ldc_connection(vdc_t *vdcp)
{
        int     status;

        ASSERT(vdcp != NULL);

        ASSERT(MUTEX_HELD(&vdcp->lock));

        DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
            vdcp->state);

        status = ldc_down(vdcp->curr_server->ldc_handle);
        DMSG(vdcp, 0, "ldc_down() = %d\n", status);

        vdcp->initialized &= ~VDC_HANDSHAKE;
        DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);

        return (status);
}

static void
vdc_create_io_kstats(vdc_t *vdc)
{
        if (vdc->io_stats != NULL) {
                DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance);
                return;
        }

        vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL,
            "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
        if (vdc->io_stats != NULL) {
                vdc->io_stats->ks_lock = &vdc->lock;
                kstat_install(vdc->io_stats);
        } else {
                cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics"
                    " will not be gathered", vdc->instance);
        }
}

static void
vdc_create_err_kstats(vdc_t *vdc)
{
        vd_err_stats_t  *stp;
        char    kstatmodule_err[KSTAT_STRLEN];
        char    kstatname[KSTAT_STRLEN];
        int     ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t));
        int     instance = vdc->instance;

        if (vdc->err_stats != NULL) {
                DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance);
                return;
        }

        (void) snprintf(kstatmodule_err, sizeof (kstatmodule_err),
            "%serr", VDC_DRIVER_NAME);
        (void) snprintf(kstatname, sizeof (kstatname),
            "%s%d,err", VDC_DRIVER_NAME, instance);

        vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname,
            "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

        if (vdc->err_stats == NULL) {
                cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics"
                    " will not be gathered", instance);
                return;
        }

        stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
        kstat_named_init(&stp->vd_softerrs, "Soft Errors",
            KSTAT_DATA_UINT32);
        kstat_named_init(&stp->vd_transerrs, "Transport Errors",
            KSTAT_DATA_UINT32);
        kstat_named_init(&stp->vd_protoerrs, "Protocol Errors",
            KSTAT_DATA_UINT32);
        kstat_named_init(&stp->vd_vid, "Vendor",
            KSTAT_DATA_CHAR);
        kstat_named_init(&stp->vd_pid, "Product",
            KSTAT_DATA_CHAR);
        kstat_named_init(&stp->vd_capacity, "Size",
            KSTAT_DATA_ULONGLONG);

        vdc->err_stats->ks_update = nulldev;

        kstat_install(vdc->err_stats);
}

static void
vdc_set_err_kstats(vdc_t *vdc)
{
        vd_err_stats_t  *stp;

        if (vdc->err_stats == NULL)
                return;

        mutex_enter(&vdc->lock);

        stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
        ASSERT(stp != NULL);

        stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->block_size;
        (void) strcpy(stp->vd_vid.value.c, "SUN");
        (void) strcpy(stp->vd_pid.value.c, "VDSK");

        mutex_exit(&vdc->lock);
}

static int
vdc_create_device_nodes_efi(vdc_t *vdc)
{
        ddi_remove_minor_node(vdc->dip, "h");
        ddi_remove_minor_node(vdc->dip, "h,raw");

        if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
            VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
            DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
                cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
                    vdc->instance);
                return (EIO);
        }

        /* if any device node is created we set this flag */
        vdc->initialized |= VDC_MINOR;

        if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
            VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
            DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
                cmn_err(CE_NOTE, "[%d] Couldn't add raw node 'wd,raw'",
                    vdc->instance);
                return (EIO);
        }

        return (0);
}

static int
vdc_create_device_nodes_vtoc(vdc_t *vdc)
{
        ddi_remove_minor_node(vdc->dip, "wd");
        ddi_remove_minor_node(vdc->dip, "wd,raw");

        if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
            VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
            DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
                cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
                    vdc->instance);
                return (EIO);
        }

        /* if any device node is created we set this flag */
        vdc->initialized |= VDC_MINOR;

        if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
            VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
            DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
                cmn_err(CE_NOTE, "[%d] Couldn't add raw node 'h,raw'",
                    vdc->instance);
                return (EIO);
        }

        return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices. It is called as part of the attach(9E) of the instance
 *	during the handshake with vds after vds has sent the attributes
 *	to vdc.
 *
 *	If the device is of type VD_DISK_TYPE_SLICE then the minor node
 *	of 2 is used, in keeping with the Solaris convention that slice 2
 *	refers to a whole disk. Slices start at 'a'.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	0		- Success
 *	EIO		- Failed to create node
 *	EINVAL		- Unknown type of disk exported
 */
static int
vdc_create_device_nodes(vdc_t *vdc)
{
        char            name[sizeof ("s,raw")];
        dev_info_t      *dip = NULL;
        int             instance, status;
        int             num_slices = 1;
        int             i;

        ASSERT(vdc != NULL);

        instance = vdc->instance;
        dip = vdc->dip;

        switch (vdc->vdisk_type) {
        case VD_DISK_TYPE_DISK:
                num_slices = V_NUMPAR;
                break;
        case VD_DISK_TYPE_SLICE:
                num_slices = 1;
                break;
        case VD_DISK_TYPE_UNK:
        default:
                return (EINVAL);
        }

        /*
         * Minor nodes are different for EFI disks: EFI disks do not have
         * a minor node 'g' for the minor number corresponding to slice
         * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
         * representing the whole disk.
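         *
         * For example, reading off the loop below (and assuming V_NUMPAR
         * is 8, as on sun4v): a full disk with a VTOC label gets block
         * nodes 'a' through 'h' plus the matching ',raw' character nodes,
         * while an EFI-labelled disk gets 'a' through 'g' plus 'wd' and
         * 'wd,raw' in place of the slice 7 nodes.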
         */
        for (i = 0; i < num_slices; i++) {

                if (i == VD_EFI_WD_SLICE) {
                        if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
                                status = vdc_create_device_nodes_efi(vdc);
                        else
                                status = vdc_create_device_nodes_vtoc(vdc);
                        if (status != 0)
                                return (status);
                        continue;
                }

                (void) snprintf(name, sizeof (name), "%c", 'a' + i);
                if (ddi_create_minor_node(dip, name, S_IFBLK,
                    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
                        cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
                            instance, name);
                        return (EIO);
                }

                /* if any device node is created we set this flag */
                vdc->initialized |= VDC_MINOR;

                (void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");

                if (ddi_create_minor_node(dip, name, S_IFCHR,
                    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
                        cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
                            instance, name);
                        return (EIO);
                }
        }

        return (0);
}

/*
 * Driver prop_op(9e) entry point function. Return the number of blocks for
 * the partition in question or forward the request to the property facilities.
 */
static int
vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
    char *name, caddr_t valuep, int *lengthp)
{
        int             instance = ddi_get_instance(dip);
        vdc_t           *vdc;
        uint64_t        nblocks;
        uint_t          blksize;

        vdc = ddi_get_soft_state(vdc_state, instance);

        if (dev == DDI_DEV_T_ANY || vdc == NULL) {
                return (ddi_prop_op(dev, dip, prop_op, mod_flags,
                    name, valuep, lengthp));
        }

        mutex_enter(&vdc->lock);
        (void) vdc_validate_geometry(vdc);
        if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
                mutex_exit(&vdc->lock);
                return (ddi_prop_op(dev, dip, prop_op, mod_flags,
                    name, valuep, lengthp));
        }
        nblocks = vdc->slice[VDCPART(dev)].nblocks;
        blksize = vdc->block_size;
        mutex_exit(&vdc->lock);

        return (ddi_prop_op_nblocks_blksize(dev, dip, prop_op, mod_flags,
            name, valuep, lengthp, nblocks, blksize));
}

/*
 * Function:
 *	vdc_is_opened
 *
 * Description:
 *	This function checks if any slice of a given virtual disk is
 *	currently opened.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	B_TRUE		- at least one slice is opened.
 *	B_FALSE		- no slice is opened.
 */
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
        int     i, nslices;

        switch (vdc->vdisk_type) {
        case VD_DISK_TYPE_DISK:
                nslices = V_NUMPAR;
                break;
        case VD_DISK_TYPE_SLICE:
                nslices = 1;
                break;
        case VD_DISK_TYPE_UNK:
        default:
                ASSERT(0);
                /*
                 * Not reached on DEBUG kernels; on non-DEBUG kernels treat
                 * an unknown type as not opened rather than using 'nslices'
                 * uninitialized.
                 */
                return (B_FALSE);
        }

        /* check if there's any layered open */
        for (i = 0; i < nslices; i++) {
                if (vdc->open_lyr[i] > 0)
                        return (B_TRUE);
        }

        /* check if there is any other kind of open */
        for (i = 0; i < OTYPCNT; i++) {
                if (vdc->open[i] != 0)
                        return (B_TRUE);
        }

        return (B_FALSE);
}

static int
vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
{
        uint8_t slicemask;
        int     i;

        ASSERT(otyp < OTYPCNT);
        ASSERT(slice < V_NUMPAR);
        ASSERT(MUTEX_HELD(&vdc->lock));

        slicemask = 1 << slice;

        /* check if slice is already exclusively opened */
        if (vdc->open_excl & slicemask)
                return (EBUSY);

        /* if open exclusive, check if slice is already opened */
        if (flag & FEXCL) {
                if (vdc->open_lyr[slice] > 0)
                        return (EBUSY);
                for (i = 0; i < OTYPCNT; i++) {
                        if (vdc->open[i] & slicemask)
                                return (EBUSY);
                }
                vdc->open_excl |= slicemask;
        }

        /* mark slice as opened */
        if (otyp == OTYP_LYR) {
                vdc->open_lyr[slice]++;
        } else {
                vdc->open[otyp] |= slicemask;
        }

        return (0);
}

static void
vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
{
        uint8_t slicemask;

        ASSERT(otyp < OTYPCNT);
        ASSERT(slice < V_NUMPAR);
        ASSERT(MUTEX_HELD(&vdc->lock));

        slicemask = 1 << slice;

        if (otyp == OTYP_LYR) {
                ASSERT(vdc->open_lyr[slice] > 0);
                vdc->open_lyr[slice]--;
        } else {
                vdc->open[otyp] &= ~slicemask;
        }

        if (flag & FEXCL)
                vdc->open_excl &= ~slicemask;
}

static int
vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
{
        _NOTE(ARGUNUSED(cred))

        int     instance, nodelay;
        int     slice, status = 0;
        vdc_t   *vdc;

        ASSERT(dev != NULL);
        instance = VDCUNIT(*dev);

        if (otyp >= OTYPCNT)
                return (EINVAL);

        if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
                return (ENXIO);
        }

        DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
            getminor(*dev), flag, otyp);

        slice = VDCPART(*dev);

        nodelay = flag & (FNDELAY | FNONBLOCK);

        if ((flag & FWRITE) && (!nodelay) &&
            !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE))) {
                return (EROFS);
        }

        mutex_enter(&vdc->lock);

        status = vdc_mark_opened(vdc, slice, flag, otyp);

        if (status != 0) {
                mutex_exit(&vdc->lock);
                return (status);
        }

        if (nodelay) {

                /* don't resubmit a validate request if there's already one */
                if (vdc->validate_pending > 0) {
                        mutex_exit(&vdc->lock);
                        return (0);
                }

                /* call vdc_validate() asynchronously to avoid blocking */
                if (taskq_dispatch(system_taskq, vdc_validate_task,
                    (void *)vdc, TQ_NOSLEEP) == NULL) {
                        vdc_mark_closed(vdc, slice, flag, otyp);
                        mutex_exit(&vdc->lock);
                        return (ENXIO);
                }

                vdc->validate_pending++;
                mutex_exit(&vdc->lock);
                return (0);
        }

        mutex_exit(&vdc->lock);

        vdc_validate(vdc);

        mutex_enter(&vdc->lock);

        if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
            vdc->slice[slice].nblocks == 0) {
                vdc_mark_closed(vdc, slice, flag, otyp);
                status = EIO;
        }

        mutex_exit(&vdc->lock);

        return (status);
}

static int
vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
        _NOTE(ARGUNUSED(cred))

        int     instance;
        int     slice;
        int     rv, rval;
        vdc_t   *vdc;

        instance = VDCUNIT(dev);

        if (otyp >= OTYPCNT)
                return (EINVAL);

        if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
                return (ENXIO);
        }

        DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);

        slice = VDCPART(dev);

        /*
         * Attempt to flush the write cache (W$) on a close operation. If
         * this is not a supported IOCTL command, or the backing device is
         * read-only, do not fail the close operation.
         */
        rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);

        if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
                DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
                    instance, rv);
                return (EIO);
        }

        mutex_enter(&vdc->lock);
        vdc_mark_closed(vdc, slice, flag, otyp);
        mutex_exit(&vdc->lock);

        return (0);
}

static int
vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
        _NOTE(ARGUNUSED(credp))

        return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
}

static int
vdc_print(dev_t dev, char *str)
{
        cmn_err(CE_NOTE, "vdc%d:  %s", VDCUNIT(dev), str);
        return (0);
}

static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
        int     rv;
        size_t  nbytes = nblk * DEV_BSIZE;
        int     instance = VDCUNIT(dev);
        vdc_t   *vdc = NULL;

        if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
                return (ENXIO);
        }

        DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
            instance, nbytes, blkno, (void *)addr);
        rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
            VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir);
        if (rv) {
                DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
                return (rv);
        }

        if (ddi_in_panic())
                (void) vdc_drain_response(vdc, NULL);

        DMSG(vdc, 0, "[%d] End\n", instance);

        return (0);
}

/* -------------------------------------------------------------------------- */

/*
 * Disk access routines
 */

/*
 * vdc_strategy()
 *
 * Return Value:
 *	0:	As per strategy(9E), the strategy() function must return 0
 *		[ bioerror(9f) sets b_flags to the proper error code ]
 */
static int
vdc_strategy(struct buf *buf)
{
        int     rv = -1;
        vdc_t   *vdc = NULL;
        int     instance = VDCUNIT(buf->b_edev);
        int     op = (buf->b_flags & B_READ) ? VD_OP_BREAD :
            VD_OP_BWRITE;
        int     slice;

        if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
                bioerror(buf, ENXIO);
                biodone(buf);
                return (0);
        }

        DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
            instance, (buf->b_flags & B_READ) ? "Read" : "Write",
            buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr);

        bp_mapin(buf);

        if ((long)buf->b_private == VD_SLICE_NONE) {
                /* I/O using an absolute disk offset */
                slice = VD_SLICE_NONE;
        } else {
                slice = VDCPART(buf->b_edev);
        }

        rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr,
            buf->b_bcount, slice, buf->b_lblkno,
            CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir :
            VIO_write_dir);

        /*
         * If the request was successfully sent, the strategy call returns
         * and the ACK handler calls the bioxxx functions when the vDisk
         * server is done. Otherwise we handle the error here.
         */
        if (rv) {
                DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv);
                bioerror(buf, rv);
                biodone(buf);
        } else if (ddi_in_panic()) {
                (void) vdc_drain_response(vdc, buf);
        }

        return (0);
}

/*
 * Function:
 *	vdc_min
 *
 * Description:
 *	Routine to limit the size of a data transfer. Used in
 *	conjunction with physio(9F).
 *
 * Arguments:
 *	bufp	- pointer to the indicated buf(9S) struct.
 *
 */
static void
vdc_min(struct buf *bufp)
{
        vdc_t   *vdc = NULL;
        int     instance = VDCUNIT(bufp->b_edev);

        vdc = ddi_get_soft_state(vdc_state, instance);
        VERIFY(vdc != NULL);

        if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) {
                bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size;
        }
}

static int
vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
{
        _NOTE(ARGUNUSED(cred))

        DMSGX(1, "[%d] Entered", VDCUNIT(dev));
        return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio));
}

static int
vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
{
        _NOTE(ARGUNUSED(cred))

        DMSGX(1, "[%d] Entered", VDCUNIT(dev));
        return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio));
}

static int
vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
{
        _NOTE(ARGUNUSED(cred))

        DMSGX(1, "[%d] Entered", VDCUNIT(dev));
        return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio));
}

static int
vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
{
        _NOTE(ARGUNUSED(cred))

        DMSGX(1, "[%d] Entered", VDCUNIT(dev));
        return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio));
}


/* -------------------------------------------------------------------------- */

/*
 * Handshake support
 */
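
/*
 * For orientation: the handshake proceeds through four exchanges, each of
 * which sends a VIO_SUBTYPE_INFO message and then waits for the server's
 * response - version negotiation (vdc_ver_negotiation), attribute exchange
 * (vdc_attr_negotiation), descriptor ring registration
 * (vdc_dring_negotiation) and finally the RDX "ready to exchange data"
 * message (vdc_rdx_exchange).
 */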

/*
 * Function:
 *	vdc_init_ver_negotiation()
 *
 * Description:
 *	Send a version negotiation (VIO_VER_INFO) message to the vDisk
 *	server, proposing the given protocol version.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	ver	- protocol version to propose.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
{
        vio_ver_msg_t   pkt;
        size_t          msglen = sizeof (pkt);
        int             status = -1;

        ASSERT(vdc != NULL);
        ASSERT(mutex_owned(&vdc->lock));

        DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);

        /*
         * set the Session ID to a unique value
         * (the lower 32 bits of the clock tick)
         */
        vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
        DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id);

        pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
        pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
        pkt.tag.vio_subtype_env = VIO_VER_INFO;
        pkt.tag.vio_sid = vdc->session_id;
        pkt.dev_class = VDEV_DISK;
        pkt.ver_major = ver.major;
        pkt.ver_minor = ver.minor;

        status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
        DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
            vdc->instance, status);
        if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
                DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
                    "id(%lx) rv(%d) size(%ld)", vdc->instance,
                    vdc->curr_server->ldc_handle, status, msglen);
                if (msglen != sizeof (vio_ver_msg_t))
                        status = ENOMSG;
        }

        return (status);
}

/*
 * Function:
 *	vdc_ver_negotiation()
 *
 * Description:
 *	Perform the version negotiation exchange: send the version info
 *	and wait for the server's response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_ver_negotiation(vdc_t *vdcp)
{
        vio_msg_t vio_msg;
        int status;

        if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0]))
                return (status);

        /* release lock and wait for response */
        mutex_exit(&vdcp->lock);
        status = vdc_wait_for_response(vdcp, &vio_msg);
        mutex_enter(&vdcp->lock);
        if (status) {
                DMSG(vdcp, 0,
                    "[%d] Failed waiting for Ver negotiation response, rv(%d)",
                    vdcp->instance, status);
                return (status);
        }

        /* check type and sub_type ... */
        if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
            vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
                DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
                    vdcp->instance);
                return (EPROTO);
        }

        return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
}

/*
 * Function:
 *	vdc_init_attr_negotiation()
 *
 * Description:
 *	Send an attribute (VIO_ATTR_INFO) message to the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_attr_negotiation(vdc_t *vdc)
{
        vd_attr_msg_t   pkt;
        size_t          msglen = sizeof (pkt);
        int             status;

        ASSERT(vdc != NULL);
        ASSERT(mutex_owned(&vdc->lock));

        DMSG(vdc, 0, "[%d] entered\n", vdc->instance);

        /* fill in tag */
        pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
        pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
        pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
        pkt.tag.vio_sid = vdc->session_id;
        /* fill in payload */
        pkt.max_xfer_sz = vdc->max_xfer_sz;
        pkt.vdisk_block_size = vdc->block_size;
        pkt.xfer_mode = VIO_DRING_MODE_V1_0;
        pkt.operations = 0;     /* server will set bits of valid operations */
        pkt.vdisk_type = 0;     /* server will set to valid device type */
        pkt.vdisk_media = 0;    /* server will set to valid media type */
        pkt.vdisk_size = 0;     /* server will set to valid size */

        status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
        DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);

        if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
                DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
                    "id(%lx) rv(%d) size(%ld)", vdc->instance,
                    vdc->curr_server->ldc_handle, status, msglen);
                if (msglen != sizeof (vd_attr_msg_t))
                        status = ENOMSG;
        }

        return (status);
}

/*
 * Function:
 *	vdc_attr_negotiation()
 *
 * Description:
 *	Perform the attribute exchange: send the attribute info and wait
 *	for the server's response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_attr_negotiation(vdc_t *vdcp)
{
        int status;
        vio_msg_t vio_msg;

        if (status = vdc_init_attr_negotiation(vdcp))
                return (status);

        /* release lock and wait for response */
        mutex_exit(&vdcp->lock);
        status = vdc_wait_for_response(vdcp, &vio_msg);
        mutex_enter(&vdcp->lock);
        if (status) {
                DMSG(vdcp, 0,
                    "[%d] Failed waiting for Attr negotiation response, rv(%d)",
                    vdcp->instance, status);
                return (status);
        }

        /* check type and sub_type ... */
        if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
            vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
                DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
                    vdcp->instance);
                return (EPROTO);
        }

        return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_init_dring_negotiate()
 *
 * Description:
 *	Initialize the local descriptor ring and register it with the
 *	vDisk server via a VIO_DRING_REG message.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_dring_negotiate(vdc_t *vdc)
{
        vio_dring_reg_msg_t     pkt;
        size_t                  msglen = sizeof (pkt);
        int                     status = -1;
        int                     retry;
        int                     nretries = 10;

        ASSERT(vdc != NULL);
        ASSERT(mutex_owned(&vdc->lock));

        for (retry = 0; retry < nretries; retry++) {
                status = vdc_init_descriptor_ring(vdc);
                if (status != EAGAIN)
                        break;
                drv_usecwait(vdc_min_timeout_ldc);
        }

        if (status != 0) {
                DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
                    vdc->instance, status);
                return (status);
        }

        DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
            vdc->instance, status);

        /* fill in tag */
        pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
        pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
        pkt.tag.vio_subtype_env = VIO_DRING_REG;
        pkt.tag.vio_sid = vdc->session_id;
        /* fill in payload */
        pkt.dring_ident = 0;
        pkt.num_descriptors = vdc->dring_len;
        pkt.descriptor_size = vdc->dring_entry_size;
        pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
        pkt.ncookies = vdc->dring_cookie_count;
        pkt.cookie[0] = vdc->dring_cookie[0];   /* for now just one cookie */

        status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
        if (status != 0) {
                DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
                    vdc->instance, status);
        }

        return (status);
}


/*
 * Function:
 *	vdc_dring_negotiation()
 *
 * Description:
 *	Perform the descriptor ring registration exchange: register the
 *	ring and wait for the server's response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_dring_negotiation(vdc_t *vdcp)
{
        int status;
        vio_msg_t vio_msg;

        if (status = vdc_init_dring_negotiate(vdcp))
                return (status);

        /* release lock and wait for response */
        mutex_exit(&vdcp->lock);
        status = vdc_wait_for_response(vdcp, &vio_msg);
        mutex_enter(&vdcp->lock);
        if (status) {
                DMSG(vdcp, 0,
                    "[%d] Failed waiting for Dring negotiation response,"
                    " rv(%d)", vdcp->instance, status);
                return (status);
        }

        /* check type and sub_type ... */
        if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
            vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
                DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
                    vdcp->instance);
                return (EPROTO);
        }

        return (vdc_handle_dring_reg_msg(vdcp,
            (vio_dring_reg_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_send_rdx()
 *
 * Description:
 *	Send an RDX message to the vDisk server to indicate that we are
 *	ready to exchange data.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_send_rdx(vdc_t *vdcp)
{
        vio_msg_t       msg;
        size_t          msglen = sizeof (vio_msg_t);
        int             status;

        /*
         * Send an RDX message to vds to indicate we are ready
         * to send data
         */
        msg.tag.vio_msgtype = VIO_TYPE_CTRL;
        msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
        msg.tag.vio_subtype_env = VIO_RDX;
        msg.tag.vio_sid = vdcp->session_id;
        status = vdc_send(vdcp, (caddr_t)&msg, &msglen);
        if (status != 0) {
                DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)",
                    vdcp->instance, status);
        }

        return (status);
}

/*
 * Function:
 *	vdc_handle_rdx()
 *
 * Description:
 *	Handle the server's acknowledgement of the RDX message.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *	msgp	- received msg
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp)
{
        _NOTE(ARGUNUSED(vdcp))
        _NOTE(ARGUNUSED(msgp))

        ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL);
        ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK);
        ASSERT(msgp->tag.vio_subtype_env == VIO_RDX);

        DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance);

        return (0);
}

/*
 * Function:
 *	vdc_rdx_exchange()
 *
 * Description:
 *	Perform the RDX exchange: send the RDX message and wait for the
 *	server's acknowledgement.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_rdx_exchange(vdc_t *vdcp)
{
        int status;
        vio_msg_t vio_msg;

        if (status = vdc_send_rdx(vdcp))
                return (status);

        /* release lock and wait for response */
        mutex_exit(&vdcp->lock);
        status = vdc_wait_for_response(vdcp, &vio_msg);
        mutex_enter(&vdcp->lock);
        if (status) {
                DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)",
                    vdcp->instance, status);
                return (status);
        }

        /* check type and sub_type ... */
        if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
            vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) {
                DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance);
                return (EPROTO);
        }

        return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg));
}


/* -------------------------------------------------------------------------- */

/*
 * LDC helper routines
 */

static int
vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp)
{
        int             status;
        boolean_t       q_has_pkts = B_FALSE;
        uint64_t        delay_time;
        size_t          len;

        mutex_enter(&vdc->read_lock);

        if (vdc->read_state == VDC_READ_IDLE)
                vdc->read_state = VDC_READ_WAITING;

        while (vdc->read_state != VDC_READ_PENDING) {

                /* detect if the connection has been reset */
                if (vdc->read_state == VDC_READ_RESET) {
                        status = ECONNRESET;
                        goto done;
                }

                cv_wait(&vdc->read_cv, &vdc->read_lock);
        }

        /*
         * Until we get a blocking ldc read we have to retry
         * until the entire LDC message has arrived before
         * ldc_read() will succeed. Note we also bail out if
         * the channel is reset or goes away.
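         *
         * The retry uses geometric backoff: the delay starts at
         * vdc_ldc_read_init_delay, doubles on each EAGAIN returned by
         * ldc_read(), and is capped at vdc_ldc_read_max_delay.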
         */
        delay_time = vdc_ldc_read_init_delay;
loop:
        len = *nbytesp;
        status = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)msgp, &len);
        switch (status) {
        case EAGAIN:
                delay_time *= 2;
                if (delay_time >= vdc_ldc_read_max_delay)
                        delay_time = vdc_ldc_read_max_delay;
                delay(delay_time);
                goto loop;

        case 0:
                if (len == 0) {
                        DMSG(vdc, 1, "[%d] ldc_read returned 0 bytes with "
                            "no error!\n", vdc->instance);
                        goto loop;
                }

                *nbytesp = len;

                /*
                 * If there are pending messages, leave the
                 * read state as pending. Otherwise, set the state
                 * back to idle.
                 */
                status = ldc_chkq(vdc->curr_server->ldc_handle, &q_has_pkts);
                if (status == 0 && !q_has_pkts)
                        vdc->read_state = VDC_READ_IDLE;

                break;
        default:
                DMSG(vdc, 0, "ldc_read returned %d\n", status);
                break;
        }

done:
        mutex_exit(&vdc->read_lock);

        return (status);
}



#ifdef DEBUG
void
vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg)
{
        char *ms, *ss, *ses;

        switch (msg->tag.vio_msgtype) {
#define	Q(_s)	case _s : ms = #_s; break;
        Q(VIO_TYPE_CTRL)
        Q(VIO_TYPE_DATA)
        Q(VIO_TYPE_ERR)
#undef Q
        default: ms = "unknown"; break;
        }

        switch (msg->tag.vio_subtype) {
#define	Q(_s)	case _s : ss = #_s; break;
        Q(VIO_SUBTYPE_INFO)
        Q(VIO_SUBTYPE_ACK)
        Q(VIO_SUBTYPE_NACK)
#undef Q
        default: ss = "unknown"; break;
        }

        switch (msg->tag.vio_subtype_env) {
#define	Q(_s)	case _s : ses = #_s; break;
        Q(VIO_VER_INFO)
        Q(VIO_ATTR_INFO)
        Q(VIO_DRING_REG)
        Q(VIO_DRING_UNREG)
        Q(VIO_RDX)
        Q(VIO_PKT_DATA)
        Q(VIO_DESC_DATA)
        Q(VIO_DRING_DATA)
#undef Q
        default: ses = "unknown"; break;
        }

        DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n",
            msg->tag.vio_msgtype, msg->tag.vio_subtype,
            msg->tag.vio_subtype_env, ms, ss, ses);
}
#endif

/*
 * Function:
 *	vdc_send()
 *
 * Description:
 *	The function encapsulates the call to write a message using LDC.
 *	If LDC indicates that the call failed due to the queue being full,
 *	we retry the ldc_write(); otherwise we return the error returned
 *	by LDC.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver;
 *		  the message is written over this instance's current server
 *		  LDC channel.
 *	pkt	- address of LDC message to be sent
 *	msglen	- the size of the message being sent. When the function
 *		  returns, this contains the number of bytes written.
 *
 * Return Code:
 *	0	- Success.
 *	EINVAL	- pkt or msglen were NULL
 *	ECONNRESET - The connection was not up.
 *	EWOULDBLOCK - LDC queue is full
 *	xxx	- other error codes returned by ldc_write
 */
static int
vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen)
{
        size_t  size = 0;
        int     status = 0;
        clock_t delay_ticks;

        ASSERT(vdc != NULL);
        ASSERT(mutex_owned(&vdc->lock));
        ASSERT(msglen != NULL);
        ASSERT(*msglen != 0);

#ifdef DEBUG
        vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt);
#endif
        /*
         * Wait indefinitely to send if channel
         * is busy, but bail out if we succeed or
         * if the channel closes or is reset.
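         *
         * As in vdc_recv(), the wait uses geometric backoff: delay_ticks
         * starts at vdc_hz_min_ldc_delay (vdc_min_timeout_ldc converted to
         * clock ticks in vdc_do_attach()) and doubles on each EWOULDBLOCK
         * from ldc_write(), up to vdc_hz_max_ldc_delay.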
2104 	 */
2105 	delay_ticks = vdc_hz_min_ldc_delay;
2106 	do {
2107 		size = *msglen;
2108 		status = ldc_write(vdc->curr_server->ldc_handle, pkt, &size);
2109 		if (status == EWOULDBLOCK) {
2110 			delay(delay_ticks);
2111 			/* geometric backoff */
2112 			delay_ticks *= 2;
2113 			if (delay_ticks > vdc_hz_max_ldc_delay)
2114 				delay_ticks = vdc_hz_max_ldc_delay;
2115 		}
2116 	} while (status == EWOULDBLOCK);
2117 
2118 	/* if LDC had serious issues --- reset vdc state */
2119 	if (status == EIO || status == ECONNRESET) {
2120 		/* wake up any blocked readers so they notice the reset */
2121 		mutex_enter(&vdc->read_lock);
2122 		if ((vdc->read_state == VDC_READ_WAITING) ||
2123 		    (vdc->read_state == VDC_READ_RESET))
2124 			cv_signal(&vdc->read_cv);
2125 		vdc->read_state = VDC_READ_RESET;
2126 		mutex_exit(&vdc->read_lock);
2127 
2128 		/* wake up any waiters in the reset thread */
2129 		if (vdc->state == VDC_STATE_INIT_WAITING) {
2130 			DMSG(vdc, 0, "[%d] write reset - "
2131 			    "vdc is resetting ..\n", vdc->instance);
2132 			vdc->state = VDC_STATE_RESETTING;
2133 			cv_signal(&vdc->initwait_cv);
2134 		}
2135 
2136 		return (ECONNRESET);
2137 	}
2138 
2139 	/* return the last size written */
2140 	*msglen = size;
2141 
2142 	return (status);
2143 }
2144 
2145 /*
2146  * Function:
2147  *	vdc_get_md_node
2148  *
2149  * Description:
2150  *	Get the machine description (MD) and the device node for the given
2151  *	disk instance. The caller is responsible for releasing the reference
2152  *	to the returned MD (mdpp) by calling md_fini_handle().
2153  *
2154  * Arguments:
2155  *	dip	- dev info pointer for this instance of the device driver.
2156  *	mdpp	- the returned MD.
2157  *	vd_nodep - the returned device node.
2158  *
2159  * Return Code:
2160  *	0	- Success.
2161  *	ENOENT	- Expected node or property did not exist.
2162  *	ENXIO	- Unexpected error communicating with MD framework
2163  */
2164 static int
2165 vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep)
2166 {
2167 	int		status = ENOENT;
2168 	char		*node_name = NULL;
2169 	md_t		*mdp = NULL;
2170 	int		num_nodes;
2171 	int		num_vdevs;
2172 	mde_cookie_t	rootnode;
2173 	mde_cookie_t	*listp = NULL;
2174 	boolean_t	found_inst = B_FALSE;
2175 	int		listsz;
2176 	int		idx;
2177 	uint64_t	md_inst;
2178 	int		obp_inst;
2179 	int		instance = ddi_get_instance(dip);
2180 
2181 	/*
2182 	 * Get the OBP instance number for comparison with the MD instance
2183 	 *
2184 	 * The "cfg-handle" property of a vdc node in an MD contains the MD's
2185 	 * notion of "instance", or unique identifier, for that node; OBP
2186 	 * stores the value of the "cfg-handle" MD property as the value of
2187 	 * the "reg" property on the node in the device tree it builds from
2188 	 * the MD and passes to Solaris.  Thus, we look up the devinfo node's
2189 	 * "reg" property value to uniquely identify this device instance.
2190 	 * If the "reg" property cannot be found, the device tree state is
2191 	 * presumably so broken that there is no point in continuing.
2192 	 */
2193 	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) {
2194 		cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG);
2195 		return (ENOENT);
2196 	}
2197 	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2198 	    OBP_REG, -1);
2199 	DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst);
2200 
2201 	/*
2202 	 * We now walk the MD nodes to find the node for this vdisk.
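 */

/*
 * An illustrative sketch of the instance-matching rule described above
 * (not driver code): a vdisk MD node belongs to this devinfo instance
 * when its "cfg-handle" value equals the devinfo node's "reg" value.
 */
#if 0
static boolean_t
vdc_inst_matches_sketch(md_t *mdp, mde_cookie_t node, int obp_inst)
{
	uint64_t md_inst;

	if (md_get_prop_val(mdp, node, VDC_MD_CFG_HDL, &md_inst) != 0)
		return (B_FALSE);
	return (md_inst == (uint64_t)obp_inst ? B_TRUE : B_FALSE);
}
#endif

/*
 * (end of sketch; the walk over the MD nodes follows)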
2203 */ 2204 if ((mdp = md_get_handle()) == NULL) { 2205 cmn_err(CE_WARN, "unable to init machine description"); 2206 return (ENXIO); 2207 } 2208 2209 num_nodes = md_node_count(mdp); 2210 ASSERT(num_nodes > 0); 2211 2212 listsz = num_nodes * sizeof (mde_cookie_t); 2213 2214 /* allocate memory for nodes */ 2215 listp = kmem_zalloc(listsz, KM_SLEEP); 2216 2217 rootnode = md_root_node(mdp); 2218 ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); 2219 2220 /* 2221 * Search for all the virtual devices, we will then check to see which 2222 * ones are disk nodes. 2223 */ 2224 num_vdevs = md_scan_dag(mdp, rootnode, 2225 md_find_name(mdp, VDC_MD_VDEV_NAME), 2226 md_find_name(mdp, "fwd"), listp); 2227 2228 if (num_vdevs <= 0) { 2229 cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); 2230 status = ENOENT; 2231 goto done; 2232 } 2233 2234 DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs); 2235 for (idx = 0; idx < num_vdevs; idx++) { 2236 status = md_get_prop_str(mdp, listp[idx], "name", &node_name); 2237 if ((status != 0) || (node_name == NULL)) { 2238 cmn_err(CE_NOTE, "Unable to get name of node type '%s'" 2239 ": err %d", VDC_MD_VDEV_NAME, status); 2240 continue; 2241 } 2242 2243 DMSGX(1, "[%d] Found node '%s'\n", instance, node_name); 2244 if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { 2245 status = md_get_prop_val(mdp, listp[idx], 2246 VDC_MD_CFG_HDL, &md_inst); 2247 DMSGX(1, "[%d] vdc inst in MD=%lx\n", 2248 instance, md_inst); 2249 if ((status == 0) && (md_inst == obp_inst)) { 2250 found_inst = B_TRUE; 2251 break; 2252 } 2253 } 2254 } 2255 2256 if (!found_inst) { 2257 DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME); 2258 status = ENOENT; 2259 goto done; 2260 } 2261 DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst); 2262 2263 *vd_nodep = listp[idx]; 2264 *mdpp = mdp; 2265 done: 2266 kmem_free(listp, listsz); 2267 return (status); 2268 } 2269 2270 /* 2271 * Function: 2272 * vdc_init_ports 2273 * 2274 * Description: 2275 * Initialize all the ports for this vdisk instance. 2276 * 2277 * Arguments: 2278 * vdc - soft state pointer for this instance of the device driver. 2279 * mdp - md pointer 2280 * vd_nodep - device md node. 2281 * 2282 * Return Code: 2283 * 0 - Success. 2284 * ENOENT - Expected node or property did not exist. 2285 */ 2286 static int 2287 vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep) 2288 { 2289 int status = 0; 2290 int idx; 2291 int num_nodes; 2292 int num_vports; 2293 int num_chans; 2294 int listsz; 2295 mde_cookie_t vd_port; 2296 mde_cookie_t *chanp = NULL; 2297 mde_cookie_t *portp = NULL; 2298 vdc_server_t *srvr; 2299 vdc_server_t *prev_srvr = NULL; 2300 2301 /* 2302 * We now walk the MD nodes to find the port nodes for this vdisk. 
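 */

/*
 * Each port found below becomes a vdc_server_t on a singly-linked
 * list; a sketch of the append step used in the loop (illustrative
 * only, not driver code; 'prevp' tracks the current tail):
 */
#if 0
static void
vdc_append_server_sketch(vdc_t *vdc, vdc_server_t **prevp, vdc_server_t *srvr)
{
	if (*prevp != NULL)
		(*prevp)->next = srvr;		/* link after current tail */
	else
		vdc->server_list = srvr;	/* first entry heads the list */
	*prevp = srvr;
	vdc->num_servers++;
}
#endif

/*
 * (end of sketch; the walk over the port nodes follows)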
2303 */ 2304 num_nodes = md_node_count(mdp); 2305 ASSERT(num_nodes > 0); 2306 2307 listsz = num_nodes * sizeof (mde_cookie_t); 2308 2309 /* allocate memory for nodes */ 2310 portp = kmem_zalloc(listsz, KM_SLEEP); 2311 chanp = kmem_zalloc(listsz, KM_SLEEP); 2312 2313 num_vports = md_scan_dag(mdp, vd_nodep, 2314 md_find_name(mdp, VDC_MD_PORT_NAME), 2315 md_find_name(mdp, "fwd"), portp); 2316 if (num_vports == 0) { 2317 DMSGX(0, "Found no '%s' node for '%s' port\n", 2318 VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2319 status = ENOENT; 2320 goto done; 2321 } 2322 2323 DMSGX(1, "Found %d '%s' node(s) for '%s' port\n", 2324 num_vports, VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2325 2326 vdc->num_servers = 0; 2327 for (idx = 0; idx < num_vports; idx++) { 2328 2329 /* initialize this port */ 2330 vd_port = portp[idx]; 2331 srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP); 2332 srvr->vdcp = vdc; 2333 2334 /* get port id */ 2335 if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) { 2336 cmn_err(CE_NOTE, "vDisk port '%s' property not found", 2337 VDC_MD_ID); 2338 kmem_free(srvr, sizeof (vdc_server_t)); 2339 continue; 2340 } 2341 2342 /* set the connection timeout */ 2343 if (md_get_prop_val(mdp, vd_port, VDC_MD_TIMEOUT, 2344 &srvr->ctimeout) != 0) { 2345 srvr->ctimeout = 0; 2346 } 2347 2348 /* get the ldc id */ 2349 num_chans = md_scan_dag(mdp, vd_port, 2350 md_find_name(mdp, VDC_MD_CHAN_NAME), 2351 md_find_name(mdp, "fwd"), chanp); 2352 2353 /* expecting at least one channel */ 2354 if (num_chans <= 0) { 2355 cmn_err(CE_NOTE, "No '%s' node for '%s' port", 2356 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); 2357 kmem_free(srvr, sizeof (vdc_server_t)); 2358 continue; 2359 } else if (num_chans != 1) { 2360 DMSGX(0, "Expected 1 '%s' node for '%s' port, " 2361 "found %d\n", VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, 2362 num_chans); 2363 } 2364 2365 /* 2366 * We use the first channel found (index 0), irrespective of how 2367 * many are there in total. 2368 */ 2369 if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, 2370 &srvr->ldc_id) != 0) { 2371 cmn_err(CE_NOTE, "Channel '%s' property not found", 2372 VDC_MD_ID); 2373 kmem_free(srvr, sizeof (vdc_server_t)); 2374 continue; 2375 } 2376 2377 /* 2378 * now initialise LDC channel which will be used to 2379 * communicate with this server 2380 */ 2381 if (vdc_do_ldc_init(vdc, srvr) != 0) { 2382 kmem_free(srvr, sizeof (vdc_server_t)); 2383 continue; 2384 } 2385 2386 /* add server to list */ 2387 if (prev_srvr) 2388 prev_srvr->next = srvr; 2389 else 2390 vdc->server_list = srvr; 2391 2392 prev_srvr = srvr; 2393 2394 /* inc numbers of servers */ 2395 vdc->num_servers++; 2396 } 2397 2398 /* 2399 * Adjust the max number of handshake retries to match 2400 * the number of vdisk servers. 2401 */ 2402 if (vdc_hshake_retries < vdc->num_servers) 2403 vdc_hshake_retries = vdc->num_servers; 2404 2405 /* pick first server as current server */ 2406 if (vdc->server_list != NULL) { 2407 vdc->curr_server = vdc->server_list; 2408 status = 0; 2409 } else { 2410 status = ENOENT; 2411 } 2412 2413 done: 2414 kmem_free(chanp, listsz); 2415 kmem_free(portp, listsz); 2416 return (status); 2417 } 2418 2419 2420 /* 2421 * Function: 2422 * vdc_do_ldc_up 2423 * 2424 * Description: 2425 * Bring the channel for the current server up. 2426 * 2427 * Arguments: 2428 * vdc - soft state pointer for this instance of the device driver. 2429 * 2430 * Return Code: 2431 * 0 - Success. 
2432  *	EINVAL		- Driver is detaching / LDC error
2433  *	ECONNREFUSED	- Other end is not listening
2434  */
2435 static int
2436 vdc_do_ldc_up(vdc_t *vdc)
2437 {
2438 	int		status;
2439 	ldc_status_t	ldc_state;
2440 
2441 	ASSERT(MUTEX_HELD(&vdc->lock));
2442 
2443 	DMSG(vdc, 0, "[%d] Bringing up channel %lx\n",
2444 	    vdc->instance, vdc->curr_server->ldc_id);
2445 
2446 	if (vdc->lifecycle == VDC_LC_DETACHING)
2447 		return (EINVAL);
2448 
2449 	if ((status = ldc_up(vdc->curr_server->ldc_handle)) != 0) {
2450 		switch (status) {
2451 		case ECONNREFUSED:	/* listener not ready at other end */
2452 			DMSG(vdc, 0, "[%d] ldc_up(%lx,...) returned %d\n",
2453 			    vdc->instance, vdc->curr_server->ldc_id, status);
2454 			status = 0;
2455 			break;
2456 		default:
2457 			DMSG(vdc, 0, "[%d] Failed to bring up LDC: "
2458 			    "channel=%ld, err=%d", vdc->instance,
2459 			    vdc->curr_server->ldc_id, status);
2460 			break;
2461 		}
2462 	}
2463 
2464 	if (ldc_status(vdc->curr_server->ldc_handle, &ldc_state) == 0) {
2465 		vdc->curr_server->ldc_state = ldc_state;
2466 		if (ldc_state == LDC_UP) {
2467 			DMSG(vdc, 0, "[%d] LDC channel already up\n",
2468 			    vdc->instance);
2469 			vdc->seq_num = 1;
2470 			vdc->seq_num_reply = 0;
2471 		}
2472 	}
2473 
2474 	return (status);
2475 }
2476 
2477 /*
2478  * Function:
2479  *	vdc_terminate_ldc()
2480  *
2481  * Description:
2482  *	Tear down the LDC connection that was set up for the given server.
2483  *
2484  * Arguments:
2485  *	vdc	- soft state pointer for this instance of the device driver.
2486  *	srvr	- vdc per-server info structure
2487  *
2488  * Return Code:
2489  *	None
2490  */
2491 static void
2568 * 2569 * Return Code: 2570 * 0 - Success 2571 */ 2572 static int 2573 vdc_init_descriptor_ring(vdc_t *vdc) 2574 { 2575 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2576 int status = 0; 2577 int i; 2578 2579 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2580 2581 ASSERT(vdc != NULL); 2582 ASSERT(mutex_owned(&vdc->lock)); 2583 2584 /* ensure we have enough room to store max sized block */ 2585 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2586 2587 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2588 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2589 /* 2590 * Calculate the maximum block size we can transmit using one 2591 * Descriptor Ring entry from the attributes returned by the 2592 * vDisk server. This is subject to a minimum of 'maxphys' 2593 * as we do not have the capability to split requests over 2594 * multiple DRing entries. 2595 */ 2596 if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) { 2597 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2598 vdc->instance); 2599 vdc->dring_max_cookies = maxphys / PAGESIZE; 2600 } else { 2601 vdc->dring_max_cookies = 2602 (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE; 2603 } 2604 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2605 (sizeof (ldc_mem_cookie_t) * 2606 (vdc->dring_max_cookies - 1))); 2607 vdc->dring_len = VD_DRING_LEN; 2608 2609 status = ldc_mem_dring_create(vdc->dring_len, 2610 vdc->dring_entry_size, &vdc->dring_hdl); 2611 if ((vdc->dring_hdl == NULL) || (status != 0)) { 2612 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2613 vdc->instance); 2614 return (status); 2615 } 2616 vdc->initialized |= VDC_DRING_INIT; 2617 } 2618 2619 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2620 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2621 vdc->dring_cookie = 2622 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2623 2624 status = ldc_mem_dring_bind(vdc->curr_server->ldc_handle, 2625 vdc->dring_hdl, 2626 LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2627 &vdc->dring_cookie[0], 2628 &vdc->dring_cookie_count); 2629 if (status != 0) { 2630 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2631 "(%lx) to channel (%lx) status=%d\n", 2632 vdc->instance, vdc->dring_hdl, 2633 vdc->curr_server->ldc_handle, status); 2634 return (status); 2635 } 2636 ASSERT(vdc->dring_cookie_count == 1); 2637 vdc->initialized |= VDC_DRING_BOUND; 2638 } 2639 2640 status = ldc_mem_dring_info(vdc->dring_hdl, &vdc->dring_mem_info); 2641 if (status != 0) { 2642 DMSG(vdc, 0, 2643 "[%d] Failed to get info for descriptor ring (%lx)\n", 2644 vdc->instance, vdc->dring_hdl); 2645 return (status); 2646 } 2647 2648 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2649 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2650 2651 /* Allocate the local copy of this dring */ 2652 vdc->local_dring = 2653 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 2654 KM_SLEEP); 2655 vdc->initialized |= VDC_DRING_LOCAL; 2656 } 2657 2658 /* 2659 * Mark all DRing entries as free and initialize the private 2660 * descriptor's memory handles. If any entry is initialized, 2661 * we need to free it later so we set the bit in 'initialized' 2662 * at the start. 
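 */

/*
 * A worked example of the DRing sizing above (illustrative only; the
 * concrete values are assumptions): if the server advertises
 * max_xfer_sz = 256 blocks of block_size = 512 bytes (128K), and
 * maxphys is 1M with 8K pages, then 128K < maxphys, so
 * dring_max_cookies = 1M / 8K = 128 and each entry embeds 127 extra
 * cookies beyond the one already contained in vd_dring_entry_t.
 */
#if 0
static size_t
vdc_entry_size_sketch(uint64_t max_xfer_sz, uint64_t block_size)
{
	uint64_t ncookies;

	if ((max_xfer_sz * block_size) < maxphys)
		ncookies = maxphys / PAGESIZE;
	else
		ncookies = (max_xfer_sz * block_size) / PAGESIZE;

	/* vd_dring_entry_t already contains one cookie, hence the "- 1" */
	return (sizeof (vd_dring_entry_t) +
	    (sizeof (ldc_mem_cookie_t) * (ncookies - 1)));
}
#endif

/*
 * (end of example; the entry initialization loop follows)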
2663 	 */
2664 	vdc->initialized |= VDC_DRING_ENTRY;
2665 	for (i = 0; i < vdc->dring_len; i++) {
2666 		dep = VDC_GET_DRING_ENTRY_PTR(vdc, i);
2667 		dep->hdr.dstate = VIO_DESC_FREE;
2668 
2669 		status = ldc_mem_alloc_handle(vdc->curr_server->ldc_handle,
2670 		    &vdc->local_dring[i].desc_mhdl);
2671 		if (status != 0) {
2672 			DMSG(vdc, 0, "![%d] Failed to alloc mem handle for"
2673 			    " descriptor %d", vdc->instance, i);
2674 			return (status);
2675 		}
2676 		vdc->local_dring[i].is_free = B_TRUE;
2677 		vdc->local_dring[i].dep = dep;
2678 	}
2679 
2680 	/* Initialize the starting index */
2681 	vdc->dring_curr_idx = 0;
2682 
2683 	return (status);
2684 }
2685 
2686 /*
2687  * Function:
2688  *	vdc_destroy_descriptor_ring()
2689  *
2690  * Description:
2691  *	Undo vdc_init_descriptor_ring() based on the 'initialized' flags.
2692  * Arguments:
2693  *	vdc	- soft state pointer for this instance of the device driver.
2694  *
2695  * Return Code:
2696  *	None
2697  */
2698 static void
2699 vdc_destroy_descriptor_ring(vdc_t *vdc)
2700 {
2701 	vdc_local_desc_t	*ldep = NULL;	/* Local Dring Entry Pointer */
2702 	ldc_mem_handle_t	mhdl = NULL;
2703 	ldc_mem_info_t		minfo;
2704 	int			status = -1;
2705 	int			i;	/* loop */
2706 
2707 	ASSERT(vdc != NULL);
2708 	ASSERT(mutex_owned(&vdc->lock));
2709 
2710 	DMSG(vdc, 0, "[%d] Entered\n", vdc->instance);
2711 
2712 	if (vdc->initialized & VDC_DRING_ENTRY) {
2713 		DMSG(vdc, 0,
2714 		    "[%d] Removing Local DRing entries\n", vdc->instance);
2715 		for (i = 0; i < vdc->dring_len; i++) {
2716 			ldep = &vdc->local_dring[i];
2717 			mhdl = ldep->desc_mhdl;
2718 
2719 			if (mhdl == NULL)
2720 				continue;
2721 
2722 			if ((status = ldc_mem_info(mhdl, &minfo)) != 0) {
2723 				DMSG(vdc, 0,
2724 				    "ldc_mem_info returned an error: %d\n",
2725 				    status);
2726 
2727 				/*
2728 				 * This must mean that the mem handle
2729 				 * is not valid. Clear it out so that
2730 				 * no one tries to use it.
2731 				 */
2732 				ldep->desc_mhdl = NULL;
2733 				continue;
2734 			}
2735 
2736 			if (minfo.status == LDC_BOUND) {
2737 				(void) ldc_mem_unbind_handle(mhdl);
2738 			}
2739 
2740 			(void) ldc_mem_free_handle(mhdl);
2741 
2742 			ldep->desc_mhdl = NULL;
2743 		}
2744 		vdc->initialized &= ~VDC_DRING_ENTRY;
2745 	}
2746 
2747 	if (vdc->initialized & VDC_DRING_LOCAL) {
2748 		DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance);
2749 		kmem_free(vdc->local_dring,
2750 		    vdc->dring_len * sizeof (vdc_local_desc_t));
2751 		vdc->initialized &= ~VDC_DRING_LOCAL;
2752 	}
2753 
2754 	if (vdc->initialized & VDC_DRING_BOUND) {
2755 		DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance);
2756 		status = ldc_mem_dring_unbind(vdc->dring_hdl);
2757 		if (status == 0) {
2758 			vdc->initialized &= ~VDC_DRING_BOUND;
2759 		} else {
2760 			DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx",
2761 			    vdc->instance, status, vdc->dring_hdl);
2762 		}
2763 		kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t));
2764 	}
2765 
2766 	if (vdc->initialized & VDC_DRING_INIT) {
2767 		DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance);
2768 		status = ldc_mem_dring_destroy(vdc->dring_hdl);
2769 		if (status == 0) {
2770 			vdc->dring_hdl = NULL;
2771 			bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t));
2772 			vdc->initialized &= ~VDC_DRING_INIT;
2773 		} else {
2774 			DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)",
2775 			    vdc->instance, status, vdc->dring_hdl);
2776 		}
2777 	}
2778 }
2779 
2780 /*
2781  * Function:
2782  *	vdc_map_to_shared_dring()
2783  *
2784  * Description:
2785  *	Copy contents of the local descriptor to the shared
2786  *	memory descriptor.
2787  *
2788  * Arguments:
2789  *	vdcp	- soft state pointer for this instance of the device driver.
2790  *	idx	- descriptor ring index
2791  *
2792  * Return Code:
2793  *	0 - Success, or EAGAIN if the buffer could not be bound.
2794  */
2795 static int
2796 vdc_map_to_shared_dring(vdc_t *vdcp, int idx)
2797 {
2798 	vdc_local_desc_t	*ldep;
2799 	vd_dring_entry_t	*dep;
2800 	int			rv;
2801 
2802 	ldep = &(vdcp->local_dring[idx]);
2803 
2804 	/* bind the buffer to a memory handle if there is data to transfer */
2805 	if (ldep->nbytes > 0) {
2806 		rv = vdc_populate_mem_hdl(vdcp, ldep);
2807 		if (rv) {
2808 			DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n",
2809 			    vdcp->instance);
2810 			return (rv);
2811 		}
2812 	}
2813 
2814 	/*
2815 	 * fill in the data details into the DRing
2816 	 */
2817 	dep = ldep->dep;
2818 	ASSERT(dep != NULL);
2819 
2820 	dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp);
2821 	dep->payload.operation = ldep->operation;
2822 	dep->payload.addr = ldep->offset;
2823 	dep->payload.nbytes = ldep->nbytes;
2824 	dep->payload.status = (uint32_t)-1;	/* vds will set valid value */
2825 	dep->payload.slice = ldep->slice;
2826 	dep->hdr.dstate = VIO_DESC_READY;
2827 	dep->hdr.ack = 1;	/* request an ACK for every message */
2828 
2829 	return (0);
2830 }
2831 
2832 /*
2833  * Function:
2834  *	vdc_send_request
2835  *
2836  * Description:
2837  *	This routine writes the data to be transmitted to vds into the
2838  *	descriptor, notifies vds that the ring has been updated and
2839  *	then waits for the request to be processed.
2840  *
2841  * Arguments:
2842  *	vdcp	  - the soft state pointer
2843  *	operation - operation we want vds to perform (VD_OP_XXX)
2844  *	addr	  - address of data buf to be read/written.
2845  *	nbytes	  - number of bytes to read/write
2846  *	slice	  - the disk slice this request is for
2847  *	offset	  - relative disk offset
2848  *	cb_type   - type of call - STRATEGY or SYNC
2849  *	cb_arg	  - parameter to be sent to server (depends on VD_OP_XXX type)
2850  *			. mode for ioctl(9e)
2851  *			. LP64 diskaddr_t (block I/O)
2852  *	dir	  - direction of operation (READ/WRITE/BOTH)
2853  *
2854  * Return Codes:
2855  *	0
2856  *	ENXIO
2857  */
2858 static int
2859 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
2860     size_t nbytes, int slice, diskaddr_t offset, int cb_type,
2861     void *cb_arg, vio_desc_direction_t dir)
2862 {
2863 	int rv = 0;
2864 
2865 	ASSERT(vdcp != NULL);
2866 	ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR);
2867 
2868 	mutex_enter(&vdcp->lock);
2869 
2870 	/*
2871 	 * If this is a block read/write operation we update the I/O statistics
2872 	 * to indicate that the request is being put on the waitq to be
2873 	 * serviced.
2874 	 *
2875 	 * We do it here (a common routine for both synchronous and strategy
2876 	 * calls) for performance reasons - we are already holding vdc->lock
2877 	 * so there is no extra locking overhead. We would have to explicitly
2878 	 * grab the 'lock' mutex to update the stats if we were to do this
2879 	 * higher up the stack in vdc_strategy() et. al.
2880 	 */
2881 	if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
2882 		DTRACE_IO1(start, buf_t *, cb_arg);
2883 		VD_KSTAT_WAITQ_ENTER(vdcp);
2884 	}
2885 
2886 	do {
2887 		while (vdcp->state != VDC_STATE_RUNNING) {
2888 
2889 			/* return error if detaching */
2890 			if (vdcp->state == VDC_STATE_DETACH) {
2891 				rv = ENXIO;
2892 				goto done;
2893 			}
2894 
2895 			/* fail request if connection timeout is reached */
2896 			if (vdcp->ctimeout_reached) {
2897 				rv = EIO;
2898 				goto done;
2899 			}
2900 
2901 			/*
2902 			 * If we are panicking and the disk is not ready then
2903 			 * we can't send any request because we can't complete
2904 			 * the handshake now.
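 */

/*
 * An aside, illustrating how vdc_populate_descriptor() (below) advances
 * its cursor through the ring: the index simply wraps at dring_len
 * (sketch only, not driver code; no power-of-2 assumption is made).
 */
#if 0
static int
vdc_next_idx_sketch(int idx, int dring_len)
{
	int next_idx = idx + 1;

	if (next_idx >= dring_len)
		next_idx = 0;		/* wrap around to the start */
	return (next_idx);
}
#endif

/*
 * (end of sketch; vdc_send_request() continues below)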
2905 */ 2906 if (ddi_in_panic()) { 2907 rv = EIO; 2908 goto done; 2909 } 2910 2911 cv_wait(&vdcp->running_cv, &vdcp->lock); 2912 } 2913 2914 } while (vdc_populate_descriptor(vdcp, operation, addr, 2915 nbytes, slice, offset, cb_type, cb_arg, dir)); 2916 2917 done: 2918 /* 2919 * If this is a block read/write we update the I/O statistics kstat 2920 * to indicate that this request has been placed on the queue for 2921 * processing (i.e sent to the vDisk server) - iostat(1M) will 2922 * report the time waiting for the vDisk server under the %b column 2923 * In the case of an error we simply take it off the wait queue. 2924 */ 2925 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2926 if (rv == 0) { 2927 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 2928 DTRACE_PROBE1(send, buf_t *, cb_arg); 2929 } else { 2930 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 2931 VD_KSTAT_WAITQ_EXIT(vdcp); 2932 DTRACE_IO1(done, buf_t *, cb_arg); 2933 } 2934 } 2935 2936 mutex_exit(&vdcp->lock); 2937 2938 return (rv); 2939 } 2940 2941 2942 /* 2943 * Function: 2944 * vdc_populate_descriptor 2945 * 2946 * Description: 2947 * This routine writes the data to be transmitted to vds into the 2948 * descriptor, notifies vds that the ring has been updated and 2949 * then waits for the request to be processed. 2950 * 2951 * Arguments: 2952 * vdcp - the soft state pointer 2953 * operation - operation we want vds to perform (VD_OP_XXX) 2954 * addr - address of data buf to be read/written. 2955 * nbytes - number of bytes to read/write 2956 * slice - the disk slice this request is for 2957 * offset - relative disk offset 2958 * cb_type - type of call - STRATEGY or SYNC 2959 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2960 * . mode for ioctl(9e) 2961 * . LP64 diskaddr_t (block I/O) 2962 * dir - direction of operation (READ/WRITE/BOTH) 2963 * 2964 * Return Codes: 2965 * 0 2966 * EAGAIN 2967 * ECONNRESET 2968 * ENXIO 2969 */ 2970 static int 2971 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 2972 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2973 void *cb_arg, vio_desc_direction_t dir) 2974 { 2975 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 2976 int idx; /* Index of DRing entry used */ 2977 int next_idx; 2978 vio_dring_msg_t dmsg; 2979 size_t msglen; 2980 int rv; 2981 2982 ASSERT(MUTEX_HELD(&vdcp->lock)); 2983 vdcp->threads_pending++; 2984 loop: 2985 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 2986 2987 /* Get next available D-Ring entry */ 2988 idx = vdcp->dring_curr_idx; 2989 local_dep = &(vdcp->local_dring[idx]); 2990 2991 if (!local_dep->is_free) { 2992 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 2993 vdcp->instance); 2994 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 2995 if (vdcp->state == VDC_STATE_RUNNING || 2996 vdcp->state == VDC_STATE_HANDLE_PENDING) { 2997 goto loop; 2998 } 2999 vdcp->threads_pending--; 3000 return (ECONNRESET); 3001 } 3002 3003 next_idx = idx + 1; 3004 if (next_idx >= vdcp->dring_len) 3005 next_idx = 0; 3006 vdcp->dring_curr_idx = next_idx; 3007 3008 ASSERT(local_dep->is_free); 3009 3010 local_dep->operation = operation; 3011 local_dep->addr = addr; 3012 local_dep->nbytes = nbytes; 3013 local_dep->slice = slice; 3014 local_dep->offset = offset; 3015 local_dep->cb_type = cb_type; 3016 local_dep->cb_arg = cb_arg; 3017 local_dep->dir = dir; 3018 3019 local_dep->is_free = B_FALSE; 3020 3021 rv = vdc_map_to_shared_dring(vdcp, idx); 3022 if (rv) { 3023 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 
3024 vdcp->instance); 3025 /* free the descriptor */ 3026 local_dep->is_free = B_TRUE; 3027 vdcp->dring_curr_idx = idx; 3028 cv_wait(&vdcp->membind_cv, &vdcp->lock); 3029 if (vdcp->state == VDC_STATE_RUNNING || 3030 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3031 goto loop; 3032 } 3033 vdcp->threads_pending--; 3034 return (ECONNRESET); 3035 } 3036 3037 /* 3038 * Send a msg with the DRing details to vds 3039 */ 3040 VIO_INIT_DRING_DATA_TAG(dmsg); 3041 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 3042 dmsg.dring_ident = vdcp->dring_ident; 3043 dmsg.start_idx = idx; 3044 dmsg.end_idx = idx; 3045 vdcp->seq_num++; 3046 3047 DTRACE_PROBE2(populate, int, vdcp->instance, 3048 vdc_local_desc_t *, local_dep); 3049 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 3050 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 3051 3052 /* 3053 * note we're still holding the lock here to 3054 * make sure the message goes out in order !!!... 3055 */ 3056 msglen = sizeof (dmsg); 3057 rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen); 3058 switch (rv) { 3059 case ECONNRESET: 3060 /* 3061 * vdc_send initiates the reset on failure. 3062 * Since the transaction has already been put 3063 * on the local dring, it will automatically get 3064 * retried when the channel is reset. Given that, 3065 * it is ok to just return success even though the 3066 * send failed. 3067 */ 3068 rv = 0; 3069 break; 3070 3071 case 0: /* EOK */ 3072 DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv); 3073 break; 3074 3075 default: 3076 goto cleanup_and_exit; 3077 } 3078 3079 vdcp->threads_pending--; 3080 return (rv); 3081 3082 cleanup_and_exit: 3083 DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv); 3084 return (ENXIO); 3085 } 3086 3087 /* 3088 * Function: 3089 * vdc_do_sync_op 3090 * 3091 * Description: 3092 * Wrapper around vdc_populate_descriptor that blocks until the 3093 * response to the message is available. 3094 * 3095 * Arguments: 3096 * vdcp - the soft state pointer 3097 * operation - operation we want vds to perform (VD_OP_XXX) 3098 * addr - address of data buf to be read/written. 3099 * nbytes - number of bytes to read/write 3100 * slice - the disk slice this request is for 3101 * offset - relative disk offset 3102 * cb_type - type of call - STRATEGY or SYNC 3103 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3104 * . mode for ioctl(9e) 3105 * . LP64 diskaddr_t (block I/O) 3106 * dir - direction of operation (READ/WRITE/BOTH) 3107 * rconflict - check for reservation conflict in case of failure 3108 * 3109 * rconflict should be set to B_TRUE by most callers. Callers invoking the 3110 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 3111 * result of a successful operation with vd_scsi_status(). 3112 * 3113 * Return Codes: 3114 * 0 3115 * EAGAIN 3116 * EFAULT 3117 * ENXIO 3118 * EIO 3119 */ 3120 static int 3121 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 3122 int slice, diskaddr_t offset, int cb_type, void *cb_arg, 3123 vio_desc_direction_t dir, boolean_t rconflict) 3124 { 3125 int status; 3126 vdc_io_t *vio; 3127 boolean_t check_resv_conflict = B_FALSE; 3128 3129 ASSERT(cb_type == CB_SYNC); 3130 3131 /* 3132 * Grab the lock, if blocked wait until the server 3133 * response causes us to wake up again. 
3134 	 */
3135 	mutex_enter(&vdcp->lock);
3136 	vdcp->sync_op_cnt++;
3137 	while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH)
3138 		cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock);
3139 
3140 	if (vdcp->state == VDC_STATE_DETACH) {
3141 		cv_broadcast(&vdcp->sync_blocked_cv);
3142 		vdcp->sync_op_cnt--;
3143 		mutex_exit(&vdcp->lock);
3144 		return (ENXIO);
3145 	}
3146 
3147 	/* now block any other thread entering after us */
3148 	vdcp->sync_op_blocked = B_TRUE;
3149 	vdcp->sync_op_pending = B_TRUE;
3150 	mutex_exit(&vdcp->lock);
3151 
3152 	status = vdc_send_request(vdcp, operation, addr,
3153 	    nbytes, slice, offset, cb_type, cb_arg, dir);
3154 
3155 	mutex_enter(&vdcp->lock);
3156 
3157 	if (status != 0) {
3158 		vdcp->sync_op_pending = B_FALSE;
3159 	} else {
3160 		/*
3161 		 * Block until our transaction completes; anyone else
3162 		 * waiting then gets to go next.
3163 		 */
3164 		while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH)
3165 			cv_wait(&vdcp->sync_pending_cv, &vdcp->lock);
3166 
3167 		DMSG(vdcp, 2, ": operation returned %d\n",
3168 		    vdcp->sync_op_status);
3169 		if (vdcp->state == VDC_STATE_DETACH) {
3170 			vdcp->sync_op_pending = B_FALSE;
3171 			status = ENXIO;
3172 		} else {
3173 			status = vdcp->sync_op_status;
3174 			if (status != 0 && vdcp->failfast_interval != 0) {
3175 				/*
3176 				 * Operation has failed and failfast is enabled.
3177 				 * We need to check if the failure is due to a
3178 				 * reservation conflict if this was requested.
3179 				 */
3180 				check_resv_conflict = rconflict;
3181 			}
3182 
3183 		}
3184 	}
3185 
3186 	vdcp->sync_op_status = 0;
3187 	vdcp->sync_op_blocked = B_FALSE;
3188 	vdcp->sync_op_cnt--;
3189 
3190 	/* signal the next waiting thread */
3191 	cv_signal(&vdcp->sync_blocked_cv);
3192 
3193 	/*
3194 	 * We have to check for reservation conflict after unblocking sync
3195 	 * operations because some sync operations will be used to do this
3196 	 * check.
3197 	 */
3198 	if (check_resv_conflict) {
3199 		vio = vdc_failfast_io_queue(vdcp, NULL);
3200 		while (vio->vio_qtime != 0)
3201 			cv_wait(&vdcp->failfast_io_cv, &vdcp->lock);
3202 		kmem_free(vio, sizeof (vdc_io_t));
3203 	}
3204 
3205 	mutex_exit(&vdcp->lock);
3206 
3207 	return (status);
3208 }
3209 
3210 
3211 /*
3212  * Function:
3213  *	vdc_drain_response()
3214  *
3215  * Description:
3216  *	When a guest is panicking, the completion of requests needs to be
3217  *	handled differently because interrupts are disabled and vdc
3218  *	will not get messages. We have to poll for the messages instead.
3219  *
3220  *	Note: since we are panicking we don't implement the io:::done
3221  *	DTrace probe or update the I/O statistics kstats.
3222  *
3223  * Arguments:
3224  *	vdc	- soft state pointer for this instance of the device driver.
3225  *	buf	- if buf is NULL then we drain all responses, otherwise we
3226  *		  poll until we receive an ACK/NACK for the specific I/O
3227  *		  described by buf.
3228  *
3229  * Return Code:
3230  *	0	- Success
3231  */
3232 static int
3233 vdc_drain_response(vdc_t *vdc, struct buf *buf)
3234 {
3235 	int		rv, idx, retries;
3236 	size_t		msglen;
3237 	vdc_local_desc_t *ldep = NULL;	/* Local Dring Entry Pointer */
3238 	vio_dring_msg_t	dmsg;
3239 	struct buf	*mbuf;
3240 
3241 	mutex_enter(&vdc->lock);
3242 
3243 	retries = 0;
3244 	for (;;) {
3245 		msglen = sizeof (dmsg);
3246 		rv = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)&dmsg,
3247 		    &msglen);
3248 		if (rv) {
3249 			rv = EINVAL;
3250 			break;
3251 		}
3252 
3253 		/*
3254 		 * if there are no packets wait and check again
3255 		 */
3256 		if ((rv == 0) && (msglen == 0)) {
3257 			if (retries++ > vdc_dump_retries) {
3258 				rv = EAGAIN;
3259 				break;
3260 			}
3261 
3262 			drv_usecwait(vdc_usec_timeout_dump);
3263 			continue;
3264 		}
3265 
3266 		/*
3267 		 * Ignore all messages that are not ACKs/NACKs to
3268 		 * DRing requests.
3269 		 */
3270 		if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) ||
3271 		    (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) {
3272 			DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n",
3273 			    dmsg.tag.vio_msgtype,
3274 			    dmsg.tag.vio_subtype,
3275 			    dmsg.tag.vio_subtype_env);
3276 			continue;
3277 		}
3278 
3279 		/*
3280 		 * set the appropriate return value for the current request.
3281 		 */
3282 		switch (dmsg.tag.vio_subtype) {
3283 		case VIO_SUBTYPE_ACK:
3284 			rv = 0;
3285 			break;
3286 		case VIO_SUBTYPE_NACK:
3287 			rv = EAGAIN;
3288 			break;
3289 		default:
3290 			continue;
3291 		}
3292 
3293 		idx = dmsg.start_idx;
3294 		if (idx >= vdc->dring_len) {
3295 			DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n",
3296 			    vdc->instance, idx);
3297 			continue;
3298 		}
3299 		ldep = &vdc->local_dring[idx];
3300 		if (ldep->dep->hdr.dstate != VIO_DESC_DONE) {
3301 			DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n",
3302 			    vdc->instance, idx, ldep->dep->hdr.dstate);
3303 			continue;
3304 		}
3305 
3306 		if (buf != NULL && ldep->cb_type == CB_STRATEGY) {
3307 			mbuf = ldep->cb_arg;
3308 			mbuf->b_resid = mbuf->b_bcount -
3309 			    ldep->dep->payload.nbytes;
3310 			bioerror(mbuf, (rv == EAGAIN)? EIO:
3311 			    ldep->dep->payload.status);
3312 			biodone(mbuf);
3313 		} else {
3314 			mbuf = NULL;
3315 		}
3316 
3317 		DMSG(vdc, 1, "[%d] Depopulating idx=%d state=%d\n",
3318 		    vdc->instance, idx, ldep->dep->hdr.dstate);
3319 
3320 		rv = vdc_depopulate_descriptor(vdc, idx);
3321 		if (rv) {
3322 			DMSG(vdc, 0,
3323 			    "[%d] Entry @ %d - depopulate failed ..\n",
3324 			    vdc->instance, idx);
3325 		}
3326 
3327 		/* we have received an ACK/NACK for the specified buffer */
3328 		if (buf != NULL && buf == mbuf) {
3329 			rv = 0;
3330 			break;
3331 		}
3332 
3333 		/* if this is the last descriptor - break out of loop */
3334 		if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) {
3335 			if (buf != NULL) {
3336 				/*
3337 				 * We never got a response for the specified
3338 				 * buffer so we fail the I/O.
3339 				 */
3340 				bioerror(buf, EIO);
3341 				biodone(buf);
3342 			}
3343 			break;
3344 		}
3345 	}
3346 
3347 	mutex_exit(&vdc->lock);
3348 	DMSG(vdc, 0, "End idx=%d\n", idx);
3349 
3350 	return (rv);
3351 }
3352 
3353 
3354 /*
3355  * Function:
3356  *	vdc_depopulate_descriptor()
3357  *
3358  * Description:
3359  *	Mark the given descriptor ring entry as free and unbind its handle.
3360  * Arguments:
3361  *	vdc	- soft state pointer for this instance of the device driver.
3362  *	idx	- Index of the Descriptor Ring entry being modified
3363  *
3364  * Return Code:
3365  *	0	- Success
3366  */
3367 static int
3368 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx)
3369 {
3370 	vd_dring_entry_t *dep = NULL;		/* Dring Entry Pointer */
3371 	vdc_local_desc_t *ldep = NULL;		/* Local Dring Entry Pointer */
3372 	int		status = ENXIO;
3373 	int		rv = 0;
3374 
3375 	ASSERT(vdc != NULL);
3376 	ASSERT(idx < vdc->dring_len);
3377 	ldep = &vdc->local_dring[idx];
3378 	ASSERT(ldep != NULL);
3379 	ASSERT(MUTEX_HELD(&vdc->lock));
3380 
3381 	DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep);
3382 	DMSG(vdc, 2, ": idx = %d\n", idx);
3383 
3384 	dep = ldep->dep;
3385 	ASSERT(dep != NULL);
3386 	ASSERT((dep->hdr.dstate == VIO_DESC_DONE) ||
3387 	    (dep->payload.status == ECANCELED));
3388 
3389 	VDC_MARK_DRING_ENTRY_FREE(vdc, idx);
3390 
3391 	ldep->is_free = B_TRUE;
3392 	status = dep->payload.status;
3393 	DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status);
3394 
3395 	/*
3396 	 * If no buffers were used to transfer information to the server when
3397 	 * populating the descriptor then no memory handles need to be unbound
3398 	 * and we can return now.
3399 	 */
3400 	if (ldep->nbytes == 0) {
3401 		cv_signal(&vdc->dring_free_cv);
3402 		return (status);
3403 	}
3404 
3405 	/*
3406 	 * If the upper layer passed in a misaligned address we copied the
3407 	 * data into an aligned buffer before sending it to LDC - we now
3408 	 * copy it back to the original buffer.
3409 	 */
3410 	if (ldep->align_addr) {
3411 		ASSERT(ldep->addr != NULL);
3412 
3413 		if (dep->payload.nbytes > 0)
3414 			bcopy(ldep->align_addr, ldep->addr,
3415 			    dep->payload.nbytes);
3416 		kmem_free(ldep->align_addr,
3417 		    sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8));
3418 		ldep->align_addr = NULL;
3419 	}
3420 
3421 	rv = ldc_mem_unbind_handle(ldep->desc_mhdl);
3422 	if (rv != 0) {
3423 		DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)",
3424 		    vdc->instance, ldep->desc_mhdl, idx, rv);
3425 		/*
3426 		 * The error returned by the vDisk server is more informative
3427 		 * and thus has a higher priority but if it isn't set we ensure
3428 		 * that this function returns an error.
3429 		 */
3430 		if (status == 0)
3431 			status = EINVAL;
3432 	}
3433 
3434 	cv_signal(&vdc->membind_cv);
3435 	cv_signal(&vdc->dring_free_cv);
3436 
3437 	return (status);
3438 }
3439 
3440 /*
3441  * Function:
3442  *	vdc_populate_mem_hdl()
3443  *
3444  * Description:
3445  *	Bind the entry's data buffer to LDC memory and record its cookies.
3446  * Arguments:
3447  *	vdcp	- soft state pointer for this instance of the device driver.
3448  *	ldep	- local descriptor ring entry whose buffer is to be mapped;
3449  *		  it supplies the address, length, direction and operation
3450  *		  of the transfer.
3451  *		  Misaligned buffers are bounced through an aligned copy.
3452  *
3453  * Return Code:
3454  *	0	- Success, EAGAIN if the buffer cannot be bound
3455  */
3456 static int
3457 vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep)
3458 {
3459 	vd_dring_entry_t	*dep = NULL;
3460 	ldc_mem_handle_t	mhdl;
3461 	caddr_t			vaddr;
3462 	size_t			nbytes;
3463 	uint8_t			perm = LDC_MEM_RW;
3464 	uint8_t			maptype;
3465 	int			rv = 0;
3466 	int			i;
3467 
3468 	ASSERT(vdcp != NULL);
3469 
3470 	dep = ldep->dep;
3471 	mhdl = ldep->desc_mhdl;
3472 
3473 	switch (ldep->dir) {
3474 	case VIO_read_dir:
3475 		perm = LDC_MEM_W;
3476 		break;
3477 
3478 	case VIO_write_dir:
3479 		perm = LDC_MEM_R;
3480 		break;
3481 
3482 	case VIO_both_dir:
3483 		perm = LDC_MEM_RW;
3484 		break;
3485 
3486 	default:
3487 		ASSERT(0);	/* catch bad programming in vdc */
3488 	}
3489 
3490 	/*
3491 	 * LDC expects any addresses passed in to be 8-byte aligned. We need
3492 	 * to copy the contents of any misaligned buffers to a newly allocated
3493 	 * buffer and bind it instead (and copy the contents back to the
3494 	 * original buffer passed in when depopulating the descriptor)
3495 	 */
3496 	vaddr = ldep->addr;
3497 	nbytes = ldep->nbytes;
3498 	if (((uint64_t)vaddr & 0x7) != 0) {
3499 		ASSERT(ldep->align_addr == NULL);
3500 		ldep->align_addr =
3501 		    kmem_alloc(sizeof (caddr_t) *
3502 		    P2ROUNDUP(nbytes, 8), KM_SLEEP);
3503 		DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating "
3504 		    "(buf=%p nb=%ld op=%d)\n",
3505 		    vdcp->instance, (void *)vaddr, (void *)ldep->align_addr,
3506 		    nbytes, ldep->operation);
3507 		if (perm != LDC_MEM_W)
3508 			bcopy(vaddr, ldep->align_addr, nbytes);
3509 		vaddr = ldep->align_addr;
3510 	}
3511 
3512 	maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP;
3513 	rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
3514 	    maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies);
3515 	DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n",
3516 	    vdcp->instance, dep->payload.ncookies);
3517 	if (rv != 0) {
3518 		DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle "
3519 		    "(mhdl=%p, buf=%p, err=%d)\n",
3520 		    vdcp->instance, (void *)mhdl, (void *)vaddr, rv);
3521 		if (ldep->align_addr) {
3522 			kmem_free(ldep->align_addr,
3523 			    sizeof (caddr_t) * P2ROUNDUP(nbytes, 8));
3524 			ldep->align_addr = NULL;
3525 		}
3526 		return (EAGAIN);
3527 	}
3528 
3529 	/*
3530 	 * Get the other cookies (if any).
3531 	 */
3532 	for (i = 1; i < dep->payload.ncookies; i++) {
3533 		rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]);
3534 		if (rv != 0) {
3535 			(void) ldc_mem_unbind_handle(mhdl);
3536 			DMSG(vdcp, 0, "?[%d] Failed to get next cookie "
3537 			    "(mhdl=%lx cnum=%d), err=%d",
3538 			    vdcp->instance, mhdl, i, rv);
3539 			if (ldep->align_addr) {
3540 				kmem_free(ldep->align_addr,
3541 				    sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8));
3542 				ldep->align_addr = NULL;
3543 			}
3544 			return (EAGAIN);
3545 		}
3546 	}
3547 
3548 	return (rv);
3549 }
3550 
3551 /*
3552  * Interrupt handlers for messages from LDC
3553  */
3554 
3555 /*
3556  * Function:
3557  *	vdc_handle_cb()
3558  *
3559  * Description:
3560  *	Callback invoked by LDC when an event occurs on the channel.
3561  * Arguments:
3562  *	event	- Type of event (LDC_EVT_xxx) that triggered the callback
3563  *	arg	- soft state pointer for this instance of the device driver.
3564  *
3565  * Return Code:
3566  *	0 - Success
3567  */
3568 static uint_t
3569 vdc_handle_cb(uint64_t event, caddr_t arg)
3570 {
3571 	ldc_status_t	ldc_state;
3572 	int		rv = 0;
3573 	vdc_server_t	*srvr = (vdc_server_t *)(void *)arg;
3574 	vdc_t		*vdc = srvr->vdcp;
3575 
3576 	ASSERT(vdc != NULL);
3577 
3578 	DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num);
3579 
3580 	/* If callback is not for the current server, ignore it */
3581 	mutex_enter(&vdc->lock);
3582 
3583 	if (vdc->curr_server != srvr) {
3584 		DMSG(vdc, 0, "[%d] Ignoring event 0x%lx for port@%ld\n",
3585 		    vdc->instance, event, srvr->id);
3586 		mutex_exit(&vdc->lock);
3587 		return (LDC_SUCCESS);
3588 	}
3589 
3590 	/*
3591 	 * Depending on the type of event that triggered this callback,
3592 	 * we modify the handshake state or read the data.
3593 	 *
3594 	 * NOTE: not done as a switch() as event could be triggered by
3595 	 * a state change and a read request. Also the ordering of the
3596 	 * check for the event types is deliberate.
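 */

/*
 * A sketch of the dispatch pattern used here (illustrative only, not
 * driver code): the event argument is a bitmask, so several conditions
 * are tested in a deliberate order rather than with a switch().
 */
#if 0
static void
vdc_event_order_sketch(uint64_t event)
{
	if (event & LDC_EVT_UP) {
		/* 1st: channel came up - reset sequence numbers, handshake */
	}
	if (event & LDC_EVT_READ) {
		/* 2nd: data is pending - wake up the reader and return */
	}
	if (event & (LDC_EVT_RESET | LDC_EVT_DOWN)) {
		/* last: channel failed - wake up readers, trigger a reset */
	}
}
#endif

/*
 * (end of sketch; the actual event handling follows)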
3597 	 */
3598 	if (event & LDC_EVT_UP) {
3599 		DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance);
3600 
3601 		/* get LDC state */
3602 		rv = ldc_status(srvr->ldc_handle, &ldc_state);
3603 		if (rv != 0) {
3604 			DMSG(vdc, 0, "[%d] Couldn't get LDC status %d",
3605 			    vdc->instance, rv);
3606 			mutex_exit(&vdc->lock);
3607 			return (LDC_SUCCESS);
3608 		}
3609 		if (srvr->ldc_state != LDC_UP &&
3610 		    ldc_state == LDC_UP) {
3611 			/*
3612 			 * Reset the transaction sequence numbers when
3613 			 * LDC comes up. We then kick off the handshake
3614 			 * negotiation with the vDisk server.
3615 			 */
3616 			vdc->seq_num = 1;
3617 			vdc->seq_num_reply = 0;
3618 			srvr->ldc_state = ldc_state;
3619 			cv_signal(&vdc->initwait_cv);
3620 		}
3621 	}
3622 
3623 	if (event & LDC_EVT_READ) {
3624 		DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance);
3625 		mutex_enter(&vdc->read_lock);
3626 		cv_signal(&vdc->read_cv);
3627 		vdc->read_state = VDC_READ_PENDING;
3628 		mutex_exit(&vdc->read_lock);
3629 		mutex_exit(&vdc->lock);
3630 
3631 		/* that's all we have to do - no need to handle DOWN/RESET */
3632 		return (LDC_SUCCESS);
3633 	}
3634 
3635 	if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) {
3636 
3637 		DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance);
3638 
3639 		/*
3640 		 * Need to wake up any readers so they will
3641 		 * detect that a reset has occurred.
3642 		 */
3643 		mutex_enter(&vdc->read_lock);
3644 		if ((vdc->read_state == VDC_READ_WAITING) ||
3645 		    (vdc->read_state == VDC_READ_RESET))
3646 			cv_signal(&vdc->read_cv);
3647 		vdc->read_state = VDC_READ_RESET;
3648 		mutex_exit(&vdc->read_lock);
3649 
3650 		/* wake up any threads waiting for connection to come up */
3651 		if (vdc->state == VDC_STATE_INIT_WAITING) {
3652 			vdc->state = VDC_STATE_RESETTING;
3653 			cv_signal(&vdc->initwait_cv);
3654 		}
3655 
3656 	}
3657 
3658 	mutex_exit(&vdc->lock);
3659 
3660 	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ))
3661 		DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received",
3662 		    vdc->instance, event);
3663 
3664 	return (LDC_SUCCESS);
3665 }
3666 
3667 /*
3668  * Function:
3669  *	vdc_wait_for_response()
3670  *
3671  * Description:
3672  *	Block waiting for a response from the server. If there is no data,
3673  *	the thread blocks on read_cv, which is signalled by the callback
3674  *	when an EVT_READ occurs.
3675  *
3676  * Arguments:
3677  *	vdcp	- soft state pointer for this instance of the device driver.
3678  *	msgp	- buffer in which the received message is returned.
3679  *
3680  * Return Code:
3681  *	0 - Success
3682  */
3683 static int
3684 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp)
3685 {
3686 	size_t		nbytes = sizeof (*msgp);
3687 	int		status;
3688 
3689 	ASSERT(vdcp != NULL);
3690 
3691 	DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance);
3692 
3693 	status = vdc_recv(vdcp, msgp, &nbytes);
3694 	DMSG(vdcp, 3, "vdc_read() done.. status=0x%x size=0x%x\n",
3695 	    status, (int)nbytes);
3696 	if (status) {
3697 		DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n",
3698 		    vdcp->instance, status);
3699 		return (status);
3700 	}
3701 
3702 	if (nbytes < sizeof (vio_msg_tag_t)) {
3703 		DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n",
3704 		    vdcp->instance, sizeof (vio_msg_tag_t), nbytes);
3705 		return (ENOMSG);
3706 	}
3707 
3708 	DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance,
3709 	    msgp->tag.vio_msgtype,
3710 	    msgp->tag.vio_subtype,
3711 	    msgp->tag.vio_subtype_env);
3712 
3713 	/*
3714 	 * Verify the Session ID of the message
3715 	 *
3716 	 * Every message after the Version has been negotiated should
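 */

/*
 * A sketch of the session-id rule stated above (illustrative only,
 * not driver code): VER_INFO is exempt because it is exchanged before
 * the session id has been negotiated.
 */
#if 0
static boolean_t
vdc_sid_valid_sketch(vio_msg_t *msgp, uint64_t session_id)
{
	if (msgp->tag.vio_subtype_env == VIO_VER_INFO)
		return (B_TRUE);
	return (msgp->tag.vio_sid == session_id ? B_TRUE : B_FALSE);
}
#endif

/*
 * (end of sketch; the check itself follows)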
3717 */ 3718 if ((msgp->tag.vio_sid != vdcp->session_id) && 3719 (msgp->tag.vio_subtype_env != VIO_VER_INFO)) { 3720 DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, " 3721 "expected 0x%lx [seq num %lx @ %d]", 3722 vdcp->instance, msgp->tag.vio_sid, 3723 vdcp->session_id, 3724 ((vio_dring_msg_t *)msgp)->seq_num, 3725 ((vio_dring_msg_t *)msgp)->start_idx); 3726 return (ENOMSG); 3727 } 3728 return (0); 3729 } 3730 3731 3732 /* 3733 * Function: 3734 * vdc_resubmit_backup_dring() 3735 * 3736 * Description: 3737 * Resubmit each descriptor in the backed up dring to 3738 * vDisk server. The Dring was backed up during connection 3739 * reset. 3740 * 3741 * Arguments: 3742 * vdcp - soft state pointer for this instance of the device driver. 3743 * 3744 * Return Code: 3745 * 0 - Success 3746 */ 3747 static int 3748 vdc_resubmit_backup_dring(vdc_t *vdcp) 3749 { 3750 int processed = 0; 3751 int count; 3752 int b_idx; 3753 int rv = 0; 3754 int dring_size; 3755 int op; 3756 vio_msg_t vio_msg; 3757 vdc_local_desc_t *curr_ldep; 3758 3759 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3760 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3761 3762 if (vdcp->local_dring_backup == NULL) { 3763 /* the pending requests have already been processed */ 3764 return (0); 3765 } 3766 3767 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3768 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3769 3770 /* 3771 * Walk the backup copy of the local descriptor ring and 3772 * resubmit all the outstanding transactions. 3773 */ 3774 b_idx = vdcp->local_dring_backup_tail; 3775 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3776 3777 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3778 3779 /* only resubmit outstanding transactions */ 3780 if (!curr_ldep->is_free) { 3781 /* 3782 * If we are retrying a block read/write operation we 3783 * need to update the I/O statistics to indicate that 3784 * the request is being put back on the waitq to be 3785 * serviced (it will have been taken off after the 3786 * error was reported). 3787 */ 3788 mutex_enter(&vdcp->lock); 3789 op = curr_ldep->operation; 3790 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3791 DTRACE_IO1(start, buf_t *, curr_ldep->cb_arg); 3792 VD_KSTAT_WAITQ_ENTER(vdcp); 3793 } 3794 3795 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3796 rv = vdc_populate_descriptor(vdcp, op, 3797 curr_ldep->addr, curr_ldep->nbytes, 3798 curr_ldep->slice, curr_ldep->offset, 3799 curr_ldep->cb_type, curr_ldep->cb_arg, 3800 curr_ldep->dir); 3801 3802 if (rv) { 3803 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3804 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3805 VD_KSTAT_WAITQ_EXIT(vdcp); 3806 DTRACE_IO1(done, buf_t *, 3807 curr_ldep->cb_arg); 3808 } 3809 DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", 3810 vdcp->instance, b_idx); 3811 mutex_exit(&vdcp->lock); 3812 goto done; 3813 } 3814 3815 /* 3816 * If this is a block read/write we update the I/O 3817 * statistics kstat to indicate that the request 3818 * has been sent back to the vDisk server and should 3819 * now be put on the run queue. 3820 */ 3821 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3822 DTRACE_PROBE1(send, buf_t *, curr_ldep->cb_arg); 3823 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 3824 } 3825 mutex_exit(&vdcp->lock); 3826 3827 /* Wait for the response message. 
*/ 3828 DMSG(vdcp, 1, "waiting for response to idx=%x\n", 3829 b_idx); 3830 rv = vdc_wait_for_response(vdcp, &vio_msg); 3831 if (rv) { 3832 /* 3833 * If this is a block read/write we update 3834 * the I/O statistics kstat to take it 3835 * off the run queue. 3836 */ 3837 mutex_enter(&vdcp->lock); 3838 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3839 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3840 VD_KSTAT_RUNQ_EXIT(vdcp); 3841 DTRACE_IO1(done, buf_t *, 3842 curr_ldep->cb_arg); 3843 } 3844 DMSG(vdcp, 1, "[%d] wait_for_response " 3845 "returned err=%d\n", vdcp->instance, 3846 rv); 3847 mutex_exit(&vdcp->lock); 3848 goto done; 3849 } 3850 3851 DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx); 3852 rv = vdc_process_data_msg(vdcp, &vio_msg); 3853 if (rv) { 3854 DMSG(vdcp, 1, "[%d] process_data_msg " 3855 "returned err=%d\n", vdcp->instance, 3856 rv); 3857 goto done; 3858 } 3859 /* 3860 * Mark this entry as free so that we will not resubmit 3861 * this "done" request again, if we were to use the same 3862 * backup_dring again in future. This could happen when 3863 * a reset happens while processing the backup_dring. 3864 */ 3865 curr_ldep->is_free = B_TRUE; 3866 processed++; 3867 } 3868 3869 /* get the next element to submit */ 3870 if (++b_idx >= vdcp->local_dring_backup_len) 3871 b_idx = 0; 3872 } 3873 3874 /* all done - now clear up pending dring copy */ 3875 dring_size = vdcp->local_dring_backup_len * 3876 sizeof (vdcp->local_dring_backup[0]); 3877 3878 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3879 3880 vdcp->local_dring_backup = NULL; 3881 3882 done: 3883 DTRACE_PROBE2(processed, int, processed, vdc_t *, vdcp); 3884 3885 return (rv); 3886 } 3887 3888 /* 3889 * Function: 3890 * vdc_cancel_backup_dring 3891 * 3892 * Description: 3893 * Cancel each descriptor in the backed up dring to vDisk server. 3894 * The Dring was backed up during connection reset. 3895 * 3896 * Arguments: 3897 * vdcp - soft state pointer for this instance of the device driver. 3898 * 3899 * Return Code: 3900 * None 3901 */ 3902 void 3903 vdc_cancel_backup_dring(vdc_t *vdcp) 3904 { 3905 vdc_local_desc_t *ldep; 3906 struct buf *bufp; 3907 int count; 3908 int b_idx; 3909 int dring_size; 3910 int cancelled = 0; 3911 3912 ASSERT(MUTEX_HELD(&vdcp->lock)); 3913 ASSERT(vdcp->state == VDC_STATE_INIT || 3914 vdcp->state == VDC_STATE_INIT_WAITING || 3915 vdcp->state == VDC_STATE_NEGOTIATE || 3916 vdcp->state == VDC_STATE_RESETTING); 3917 3918 if (vdcp->local_dring_backup == NULL) { 3919 /* the pending requests have already been processed */ 3920 return; 3921 } 3922 3923 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n", 3924 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3925 3926 /* 3927 * Walk the backup copy of the local descriptor ring and 3928 * cancel all the outstanding transactions. 3929 */ 3930 b_idx = vdcp->local_dring_backup_tail; 3931 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3932 3933 ldep = &(vdcp->local_dring_backup[b_idx]); 3934 3935 /* only cancel outstanding transactions */ 3936 if (!ldep->is_free) { 3937 3938 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx); 3939 cancelled++; 3940 3941 /* 3942 * All requests have already been cleared from the 3943 * local descriptor ring and the LDC channel has been 3944 * reset so we will never get any reply for these 3945 * requests. Now we just have to notify threads waiting 3946 * for replies that the request has failed. 
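 */

/*
 * The two completion paths used below, in sketch form (illustrative
 * only, not driver code): synchronous callers sleep on a condition
 * variable, while strategy(9E) requests carry a buf(9S) that must be
 * completed with biodone().
 */
#if 0
static void
vdc_complete_sketch(vdc_t *vdcp, vdc_local_desc_t *ldep, int error)
{
	if (ldep->cb_type == CB_SYNC) {
		vdcp->sync_op_status = error;
		vdcp->sync_op_pending = B_FALSE;
		cv_signal(&vdcp->sync_pending_cv);
	} else {				/* CB_STRATEGY */
		struct buf *bp = ldep->cb_arg;

		bioerror(bp, error);
		biodone(bp);
	}
}
#endif

/*
 * (end of sketch; the actual notification follows)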
3947 			 */
3948 			switch (ldep->cb_type) {
3949 			case CB_SYNC:
3950 				ASSERT(vdcp->sync_op_pending);
3951 				vdcp->sync_op_status = EIO;
3952 				vdcp->sync_op_pending = B_FALSE;
3953 				cv_signal(&vdcp->sync_pending_cv);
3954 				break;
3955 
3956 			case CB_STRATEGY:
3957 				bufp = ldep->cb_arg;
3958 				ASSERT(bufp != NULL);
3959 				bufp->b_resid = bufp->b_bcount;
3960 				VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
3961 				VD_KSTAT_RUNQ_EXIT(vdcp);
3962 				DTRACE_IO1(done, buf_t *, bufp);
3963 				bioerror(bufp, EIO);
3964 				biodone(bufp);
3965 				break;
3966 
3967 			default:
3968 				ASSERT(0);
3969 			}
3970 
3971 		}
3972 
3973 		/* get the next element to cancel */
3974 		if (++b_idx >= vdcp->local_dring_backup_len)
3975 			b_idx = 0;
3976 	}
3977 
3978 	/* all done - now clear up pending dring copy */
3979 	dring_size = vdcp->local_dring_backup_len *
3980 	    sizeof (vdcp->local_dring_backup[0]);
3981 
3982 	(void) kmem_free(vdcp->local_dring_backup, dring_size);
3983 
3984 	vdcp->local_dring_backup = NULL;
3985 
3986 	DTRACE_PROBE2(cancelled, int, cancelled, vdc_t *, vdcp);
3987 }
3988 
3989 /*
3990  * Function:
3991  *	vdc_connection_timeout
3992  *
3993  * Description:
3994  *	This function is invoked if the timeout set to establish the connection
3995  *	with vds expires. This will happen if we spend too much time in the
3996  *	VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. Then we will
3997  *	cancel any pending requests and mark them as failed.
3998  *
3999  *	If the timeout does not expire, it will be cancelled when we reach the
4000  *	VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can
4001  *	be invoked while we are in the VDC_STATE_HANDLE_PENDING or
4002  *	VDC_STATE_RESETTING state in which case we do nothing because the
4003  *	timeout is being cancelled.
4004  *
4005  * Arguments:
4006  *	arg	- argument of the timeout function; actually a soft state
4007  *		  pointer for the instance of the device driver.
4008  *
4009  * Return Code:
4010  *	None
4011  */
4012 void
4013 vdc_connection_timeout(void *arg)
4014 {
4015 	vdc_t	*vdcp = (vdc_t *)arg;
4016 
4017 	mutex_enter(&vdcp->lock);
4018 
4019 	if (vdcp->state == VDC_STATE_HANDLE_PENDING ||
4020 	    vdcp->state == VDC_STATE_DETACH) {
4021 		/*
4022 		 * The connection has just been re-established or
4023 		 * we are detaching.
4024 		 */
4025 		vdcp->ctimeout_reached = B_FALSE;
4026 		mutex_exit(&vdcp->lock);
4027 		return;
4028 	}
4029 
4030 	vdcp->ctimeout_reached = B_TRUE;
4031 
4032 	/* notify threads waiting to send requests */
4033 	cv_broadcast(&vdcp->running_cv);
4034 
4035 	/* cancel requests waiting for a result */
4036 	vdc_cancel_backup_dring(vdcp);
4037 
4038 	mutex_exit(&vdcp->lock);
4039 
4040 	cmn_err(CE_NOTE, "[%d] connection to service domain timeout",
4041 	    vdcp->instance);
4042 }
4043 
4044 /*
4045  * Function:
4046  *	vdc_backup_local_dring()
4047  *
4048  * Description:
4049  *	Backup the current dring in the event of a reset. The Dring
4050  *	transactions will be resubmitted to the server when the
4051  *	connection is restored.
4052  *
4053  * Arguments:
4054  *	vdcp	- soft state pointer for this instance of the device driver.
4055  *
4056  * Return Code:
4057  *	NONE
4058  */
4059 static void
4060 vdc_backup_local_dring(vdc_t *vdcp)
4061 {
4062 	int	dring_size;
4063 
4064 	ASSERT(MUTEX_HELD(&vdcp->lock));
4065 	ASSERT(vdcp->state == VDC_STATE_RESETTING);
4066 
4067 	/*
4068 	 * If the backup dring is still around, it means
4069 	 * that the last restore did not complete. However,
4070 	 * since we never got back into the running state,
4071 	 * the backup copy we have is still valid.
4072 	 */
4073 	if (vdcp->local_dring_backup != NULL) {
4074 		DMSG(vdcp, 1, "reusing local descriptor ring backup "
4075 		    "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len,
4076 		    vdcp->local_dring_backup_tail);
4077 		return;
4078 	}
4079 
4080 	/*
4081 	 * The backup dring can be NULL and the local dring may not be
4082 	 * initialized. This can happen if we had a reset while establishing
4083 	 * a new connection but after the connection has timed out. In that
4084 	 * case the backup dring is NULL because the requests have already
4085 	 * been cancelled and the reset occurred before the local dring was
4086 	 * initialized.
4087 	 */
4088 	if (!(vdcp->initialized & VDC_DRING_LOCAL))
4089 		return;
4090 
4091 	DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, "
4092 	    "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx);
4093 
4094 	dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]);
4095 
4096 	vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP);
4097 	bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size);
4098 
4099 	vdcp->local_dring_backup_tail = vdcp->dring_curr_idx;
4100 	vdcp->local_dring_backup_len = vdcp->dring_len;
4101 }
4102 
4103 static void
4104 vdc_switch_server(vdc_t *vdcp)
4105 {
4106 	int		rv;
4107 	vdc_server_t	*curr_server, *new_server;
4108 
4109 	ASSERT(MUTEX_HELD(&vdcp->lock));
4110 
4111 	/* if there is only one server, there is nothing to switch to */
4112 	if (vdcp->num_servers == 1) {
4113 		return;
4114 	}
4115 
4116 	/* Get current and next server */
4117 	curr_server = vdcp->curr_server;
4118 	new_server =
4119 	    (curr_server->next) ? curr_server->next : vdcp->server_list;
4120 	ASSERT(curr_server != new_server);
4121 
4122 	/* bring current server's channel down */
4123 	rv = ldc_down(curr_server->ldc_handle);
4124 	if (rv) {
4125 		DMSG(vdcp, 0, "[%d] Cannot bring channel down, port %ld\n",
4126 		    vdcp->instance, curr_server->id);
4127 		return;
4128 	}
4129 
4130 	/* switch the server */
4131 	vdcp->curr_server = new_server;
4132 
4133 	DMSG(vdcp, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n",
4134 	    vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id);
4135 }
4136 
4137 /* -------------------------------------------------------------------------- */
4138 
4139 /*
4140  * The following functions process the incoming messages from vds
4141  */
4142 
4143 /*
4144  * Function:
4145  *	vdc_process_msg_thread()
4146  *
4147  * Description:
4148  *
4149  *	Main VDC message processing thread. Each vDisk instance
4150  *	consists of a copy of this thread. This thread triggers
4151  *	all the handshakes and data exchange with the server. It
4152  *	also handles all channel resets.
4153  *
4154  * Arguments:
4155  *	vdc	- soft state pointer for this instance of the device driver.
4156  *
4157  * Return Code:
4158  *	None
4159  */
4160 static void
4161 vdc_process_msg_thread(vdc_t *vdcp)
4162 {
4163 	int		status;
4164 	int		ctimeout;
4165 	timeout_id_t	tmid = 0;
4166 	clock_t		ldcup_timeout = 0;
4167 
4168 	mutex_enter(&vdcp->lock);
4169 
4170 	for (;;) {
4171 
4172 #define	Q(_s)	(vdcp->state == _s) ? #_s :
4173 		DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state,
4174 		    Q(VDC_STATE_INIT)
4175 		    Q(VDC_STATE_INIT_WAITING)
4176 		    Q(VDC_STATE_NEGOTIATE)
4177 		    Q(VDC_STATE_HANDLE_PENDING)
4178 		    Q(VDC_STATE_RUNNING)
4179 		    Q(VDC_STATE_RESETTING)
4180 		    Q(VDC_STATE_DETACH)
4181 		    "UNKNOWN");
4182 
4183 		switch (vdcp->state) {
4184 		case VDC_STATE_INIT:
4185 
4186 			/*
4187 			 * If requested, start a timeout to check if the
4188 			 * connection with vds is established in the
4189 			 * specified delay. If the timeout expires, we
4190 			 * will cancel any pending request.
			 *
			 * If a reset has occurred while establishing
			 * the connection, we already have a timeout armed
			 * and in that case we don't need to arm a new one.
			 *
			 * The same rule applies when there are multiple vds
			 * servers. If either a connection cannot be
			 * established or the handshake times out, the
			 * connection thread will try another server. The
			 * 'ctimeout' will report back an error after it
			 * expires irrespective of whether the vdisk is
			 * trying to connect to just one or multiple servers.
			 */
			ctimeout = (vdc_timeout != 0) ?
			    vdc_timeout : vdcp->curr_server->ctimeout;

			if (ctimeout != 0 && tmid == 0) {
				tmid = timeout(vdc_connection_timeout, vdcp,
				    ctimeout * drv_usectohz(MICROSEC));
			}

			/* Check if we are re-initializing repeatedly */
			if (vdcp->hshake_cnt > vdc_hshake_retries &&
			    vdcp->lifecycle != VDC_LC_ONLINE) {

				DMSG(vdcp, 0, "[%d] too many handshakes, "
				    "cnt=%d", vdcp->instance,
				    vdcp->hshake_cnt);
				cmn_err(CE_NOTE, "[%d] disk access failed.\n",
				    vdcp->instance);
				vdcp->state = VDC_STATE_DETACH;
				break;
			}

			/* Switch to STATE_DETACH if drv is detaching */
			if (vdcp->lifecycle == VDC_LC_DETACHING) {
				vdcp->state = VDC_STATE_DETACH;
				break;
			}

			/* Switch server */
			if (vdcp->hshake_cnt > 0)
				vdc_switch_server(vdcp);
			vdcp->hshake_cnt++;

			/* Bring up connection with vds via LDC */
			status = vdc_start_ldc_connection(vdcp);
			if (status != EINVAL) {
				vdcp->state = VDC_STATE_INIT_WAITING;
			}
			break;

		case VDC_STATE_INIT_WAITING:

			/* if channel is UP, start negotiation */
			if (vdcp->curr_server->ldc_state == LDC_UP) {
				vdcp->state = VDC_STATE_NEGOTIATE;
				break;
			}

			/* check if only one server exists */
			if (vdcp->num_servers == 1) {
				cv_wait(&vdcp->initwait_cv, &vdcp->lock);
			} else {
				/*
				 * Wait for LDC_UP; if the wait times out,
				 * switch to another server.
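				 *
				 * (Note added for clarity: cv_timedwait(9F)
				 * returns -1 when the deadline passes
				 * without a wakeup, which is what the
				 * timeout check below relies on.)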
4257 */ 4258 ldcup_timeout = ddi_get_lbolt() + 4259 (vdc_ldcup_timeout * 4260 drv_usectohz(MICROSEC)); 4261 status = cv_timedwait(&vdcp->initwait_cv, 4262 &vdcp->lock, ldcup_timeout); 4263 if (status == -1 && 4264 vdcp->state == VDC_STATE_INIT_WAITING && 4265 vdcp->curr_server->ldc_state != LDC_UP) { 4266 /* timed out & still waiting */ 4267 vdcp->state = VDC_STATE_INIT; 4268 break; 4269 } 4270 } 4271 4272 if (vdcp->state != VDC_STATE_INIT_WAITING) { 4273 DMSG(vdcp, 0, 4274 "state moved to %d out from under us...\n", 4275 vdcp->state); 4276 } 4277 break; 4278 4279 case VDC_STATE_NEGOTIATE: 4280 switch (status = vdc_ver_negotiation(vdcp)) { 4281 case 0: 4282 break; 4283 default: 4284 DMSG(vdcp, 0, "ver negotiate failed (%d)..\n", 4285 status); 4286 goto reset; 4287 } 4288 4289 switch (status = vdc_attr_negotiation(vdcp)) { 4290 case 0: 4291 break; 4292 default: 4293 DMSG(vdcp, 0, "attr negotiate failed (%d)..\n", 4294 status); 4295 goto reset; 4296 } 4297 4298 switch (status = vdc_dring_negotiation(vdcp)) { 4299 case 0: 4300 break; 4301 default: 4302 DMSG(vdcp, 0, "dring negotiate failed (%d)..\n", 4303 status); 4304 goto reset; 4305 } 4306 4307 switch (status = vdc_rdx_exchange(vdcp)) { 4308 case 0: 4309 vdcp->state = VDC_STATE_HANDLE_PENDING; 4310 goto done; 4311 default: 4312 DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n", 4313 status); 4314 goto reset; 4315 } 4316 reset: 4317 DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n", 4318 status); 4319 vdcp->state = VDC_STATE_RESETTING; 4320 vdcp->self_reset = B_TRUE; 4321 done: 4322 DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", 4323 vdcp->state); 4324 break; 4325 4326 case VDC_STATE_HANDLE_PENDING: 4327 4328 if (vdcp->ctimeout_reached) { 4329 /* 4330 * The connection timeout had been reached so 4331 * pending requests have been cancelled. Now 4332 * that the connection is back we can reset 4333 * the timeout. 4334 */ 4335 ASSERT(vdcp->local_dring_backup == NULL); 4336 ASSERT(tmid != 0); 4337 tmid = 0; 4338 vdcp->ctimeout_reached = B_FALSE; 4339 vdcp->state = VDC_STATE_RUNNING; 4340 DMSG(vdcp, 0, "[%d] connection to service " 4341 "domain is up", vdcp->instance); 4342 break; 4343 } 4344 4345 mutex_exit(&vdcp->lock); 4346 if (tmid != 0) { 4347 (void) untimeout(tmid); 4348 tmid = 0; 4349 } 4350 status = vdc_resubmit_backup_dring(vdcp); 4351 mutex_enter(&vdcp->lock); 4352 4353 if (status) 4354 vdcp->state = VDC_STATE_RESETTING; 4355 else 4356 vdcp->state = VDC_STATE_RUNNING; 4357 4358 break; 4359 4360 /* enter running state */ 4361 case VDC_STATE_RUNNING: 4362 /* 4363 * Signal anyone waiting for the connection 4364 * to come on line. 
			 */
			vdcp->hshake_cnt = 0;
			cv_broadcast(&vdcp->running_cv);

			/* failfast has to be checked after a reset */
			cv_signal(&vdcp->failfast_cv);

			/* ownership is lost during reset */
			if (vdcp->ownership & VDC_OWNERSHIP_WANTED)
				vdcp->ownership |= VDC_OWNERSHIP_RESET;
			cv_signal(&vdcp->ownership_cv);

			cmn_err(CE_CONT, "?vdisk@%d is online using "
			    "ldc@%ld,%ld\n", vdcp->instance,
			    vdcp->curr_server->ldc_id, vdcp->curr_server->id);

			mutex_exit(&vdcp->lock);

			for (;;) {
				vio_msg_t msg;
				status = vdc_wait_for_response(vdcp, &msg);
				if (status)
					break;

				DMSG(vdcp, 1, "[%d] new pkt(s) available\n",
				    vdcp->instance);
				status = vdc_process_data_msg(vdcp, &msg);
				if (status) {
					DMSG(vdcp, 1, "[%d] process_data_msg "
					    "returned err=%d\n", vdcp->instance,
					    status);
					break;
				}

			}

			mutex_enter(&vdcp->lock);

			cmn_err(CE_CONT, "?vdisk@%d is offline\n",
			    vdcp->instance);

			vdcp->state = VDC_STATE_RESETTING;
			vdcp->self_reset = B_TRUE;
			break;

		case VDC_STATE_RESETTING:
			/*
			 * When we reach this state, we either come from the
			 * VDC_STATE_RUNNING state and we can have pending
			 * requests but no timeout is armed; or we come from
			 * the VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE or
			 * VDC_STATE_HANDLE_PENDING state and there is no
			 * pending request, or pending requests have already
			 * been copied into the backup dring. So we can safely
			 * keep the connection timeout armed while we are in
			 * this state.
			 */

			DMSG(vdcp, 0, "Initiating channel reset "
			    "(pending = %d)\n", (int)vdcp->threads_pending);

			if (vdcp->self_reset) {
				DMSG(vdcp, 0,
				    "[%d] calling stop_ldc_connection.\n",
				    vdcp->instance);
				status = vdc_stop_ldc_connection(vdcp);
				vdcp->self_reset = B_FALSE;
			}

			/*
			 * Wait for all threads currently waiting
			 * for a free dring entry to use.
			 */
			while (vdcp->threads_pending) {
				cv_broadcast(&vdcp->membind_cv);
				cv_broadcast(&vdcp->dring_free_cv);
				mutex_exit(&vdcp->lock);
				/* give the waiters enough time to wake up */
				delay(vdc_hz_min_ldc_delay);
				mutex_enter(&vdcp->lock);
			}

			ASSERT(vdcp->threads_pending == 0);

			/* Sanity check that no thread is receiving */
			ASSERT(vdcp->read_state != VDC_READ_WAITING);

			vdcp->read_state = VDC_READ_IDLE;

			vdc_backup_local_dring(vdcp);

			/* clean up the old d-ring */
			vdc_destroy_descriptor_ring(vdcp);

			/* go and start again */
			vdcp->state = VDC_STATE_INIT;

			break;

		case VDC_STATE_DETACH:
			DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n",
			    vdcp->instance);

			/* cancel any pending timeout */
			mutex_exit(&vdcp->lock);
			if (tmid != 0) {
				(void) untimeout(tmid);
				tmid = 0;
			}
			mutex_enter(&vdcp->lock);

			/*
			 * Signal anyone waiting for the connection
			 * to come online.
			 */
			cv_broadcast(&vdcp->running_cv);

			while (vdcp->sync_op_pending) {
				cv_signal(&vdcp->sync_pending_cv);
				cv_signal(&vdcp->sync_blocked_cv);
				mutex_exit(&vdcp->lock);
				/* give the waiters enough time to wake up */
				delay(vdc_hz_min_ldc_delay);
				mutex_enter(&vdcp->lock);
			}

			mutex_exit(&vdcp->lock);

			DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n",
			    vdcp->instance);
			thread_exit();
			break;
		}
	}
}


/*
 * Function:
 *	vdc_process_data_msg()
 *
 * Description:
 *	This function is called by the message processing thread each time
 *	a message with a msgtype of VIO_TYPE_DATA is received. It will either
 *	be an ACK or NACK from vds[1] which vdc handles as follows.
 *		ACK	- wake up the waiting thread
 *		NACK	- resend any messages necessary
 *
 *	[1] Although the message format allows it, vds should not send a
 *	    VIO_SUBTYPE_INFO message to vdc asking it to read data; if for
 *	    some bizarre reason it does, vdc will reset the connection.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	msg	- the LDC message sent by vds
 *
 * Return Code:
 *	0	- Success.
 *	> 0	- error value returned by LDC
 */
static int
vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg)
{
	int status = 0;
	vio_dring_msg_t *dring_msg;
	vdc_local_desc_t *ldep = NULL;
	int start, end;
	int idx;
	int op;

	dring_msg = (vio_dring_msg_t *)msg;

	ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA);
	ASSERT(vdcp != NULL);

	mutex_enter(&vdcp->lock);

	/*
	 * Check to see if the message has bogus data
	 */
	idx = start = dring_msg->start_idx;
	end = dring_msg->end_idx;
	if ((start >= vdcp->dring_len) ||
	    (end >= vdcp->dring_len) || (end < -1)) {
		/*
		 * Update the I/O statistics to indicate that an error occurred.
		 * No need to update the wait/run queues as no specific read or
		 * write request is being completed in response to this 'msg'.
		 */
		VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
		DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n",
		    vdcp->instance, start, end);
		mutex_exit(&vdcp->lock);
		return (EINVAL);
	}

	/*
	 * Verify that the sequence number is what vdc expects.
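	 *
	 * Spelled out (this is the check that vdc_verify_seq_num()
	 * performs), a reply is only accepted when
	 *
	 *	seq_num_reply < dring_msg->seq_num <= seq_num
	 *
	 * where seq_num_reply is the last reply processed and seq_num is
	 * the last sequence number generated by vdc.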
	 */
	switch (vdc_verify_seq_num(vdcp, dring_msg)) {
	case VDC_SEQ_NUM_TODO:
		break;	/* keep processing this message */
	case VDC_SEQ_NUM_SKIP:
		mutex_exit(&vdcp->lock);
		return (0);
	case VDC_SEQ_NUM_INVALID:
		/*
		 * Update the I/O statistics to indicate that an error occurred.
		 * No need to update the wait/run queues as no specific read or
		 * write request is being completed in response to this 'msg'.
		 */
		VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
		DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance);
		mutex_exit(&vdcp->lock);
		return (ENXIO);
	}

	if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) {
		/*
		 * Update the I/O statistics to indicate that an error occurred.
		 *
		 * We need to update the run queue if a read or write request
		 * is being NACKed - otherwise there will appear to be an
		 * indefinite outstanding request and statistics reported by
		 * iostat(1M) will be incorrect. The transaction will be
		 * resubmitted from the backup DRing following the reset
		 * and the wait/run queues will be entered again.
		 */
		ldep = &vdcp->local_dring[idx];
		op = ldep->operation;
		if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) {
			DTRACE_IO1(done, buf_t *, ldep->cb_arg);
			VD_KSTAT_RUNQ_EXIT(vdcp);
		}
		VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
		VDC_DUMP_DRING_MSG(dring_msg);
		DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance);
		mutex_exit(&vdcp->lock);
		return (EIO);

	} else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) {
		/*
		 * Update the I/O statistics to indicate that an error occurred.
		 * No need to update the wait/run queues as no specific read or
		 * write request is being completed in response to this 'msg'.
		 */
		VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs);
		mutex_exit(&vdcp->lock);
		return (EPROTO);
	}

	DMSG(vdcp, 1, ": start %d end %d\n", start, end);
	ASSERT(start == end);

	ldep = &vdcp->local_dring[idx];

	DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n",
	    ldep->dep->hdr.dstate, ldep->cb_type);

	if (ldep->dep->hdr.dstate == VIO_DESC_DONE) {
		struct buf *bufp;

		switch (ldep->cb_type) {
		case CB_SYNC:
			ASSERT(vdcp->sync_op_pending);

			status = vdc_depopulate_descriptor(vdcp, idx);
			vdcp->sync_op_status = status;
			vdcp->sync_op_pending = B_FALSE;
			cv_signal(&vdcp->sync_pending_cv);
			break;

		case CB_STRATEGY:
			bufp = ldep->cb_arg;
			ASSERT(bufp != NULL);
			bufp->b_resid =
			    bufp->b_bcount - ldep->dep->payload.nbytes;
			status = ldep->dep->payload.status; /* Future:ntoh */
			if (status != 0) {
				DMSG(vdcp, 1, "strategy status=%d\n", status);
				VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
				bioerror(bufp, status);
			}

			(void) vdc_depopulate_descriptor(vdcp, idx);

			DMSG(vdcp, 1,
			    "strategy complete req=%ld bytes resp=%ld bytes\n",
			    bufp->b_bcount, ldep->dep->payload.nbytes);

			if (status != 0 && vdcp->failfast_interval != 0) {
				/*
				 * The I/O has failed and failfast is enabled.
				 * We need the failfast thread to check if the
				 * failure is due to a reservation conflict.
				 */
				(void) vdc_failfast_io_queue(vdcp, bufp);
			} else {
				if (status == 0) {
					op = (bufp->b_flags & B_READ) ?
					    VD_OP_BREAD : VD_OP_BWRITE;
					VD_UPDATE_IO_STATS(vdcp, op,
					    ldep->dep->payload.nbytes);
				}
				VD_KSTAT_RUNQ_EXIT(vdcp);
				DTRACE_IO1(done, buf_t *, bufp);
				biodone(bufp);
			}
			break;

		default:
			ASSERT(0);
		}
	}

	/* let the arrival signal propagate */
	mutex_exit(&vdcp->lock);

	/* probe gives the count of how many entries were processed */
	DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp);

	return (0);
}


/*
 * Function:
 *	vdc_handle_ver_msg()
 *
 * Description:
 *	Handle a version negotiation (VIO_VER_INFO) message received
 *	from the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	ver_msg	- LDC message sent by vDisk server
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg)
{
	int status = 0;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) {
		return (EPROTO);
	}

	if (ver_msg->dev_class != VDEV_DISK_SERVER) {
		return (EINVAL);
	}

	switch (ver_msg->tag.vio_subtype) {
	case VIO_SUBTYPE_ACK:
		/*
		 * We check to see if the version returned is indeed supported
		 * (the server may have also adjusted the minor number
		 * downwards; if so, 'ver_msg' contains the actual version
		 * agreed).
		 */
		if (vdc_is_supported_version(ver_msg)) {
			vdc->ver.major = ver_msg->ver_major;
			vdc->ver.minor = ver_msg->ver_minor;
			ASSERT(vdc->ver.major > 0);
		} else {
			status = EPROTO;
		}
		break;

	case VIO_SUBTYPE_NACK:
		/*
		 * Call vdc_is_supported_version(), which will return the
		 * next supported version (if any) in 'ver_msg'.
		 */
		(void) vdc_is_supported_version(ver_msg);
		if (ver_msg->ver_major > 0) {
			size_t len = sizeof (*ver_msg);

			ASSERT(vdc->ver.major > 0);

			/* reset the necessary fields and resend */
			ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO;
			ver_msg->dev_class = VDEV_DISK;

			status = vdc_send(vdc, (caddr_t)ver_msg, &len);
			DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n",
			    vdc->instance, status);
			if (len != sizeof (*ver_msg))
				status = EBADMSG;
		} else {
			DMSG(vdc, 0, "[%d] No common version with vDisk server",
			    vdc->instance);
			status = ENOTSUP;
		}

		break;
	case VIO_SUBTYPE_INFO:
		/*
		 * Handle the case where vds starts the handshake
		 * (for now only vdc is the instigator).
		 */
		status = ENOTSUP;
		break;

	default:
		status = EINVAL;
		break;
	}

	return (status);
}

/*
 * Function:
 *	vdc_handle_attr_msg()
 *
 * Description:
 *	Handle an attribute exchange (VIO_ATTR_INFO) message received
 *	from the vDisk server.
 *
 * Arguments:
 *	vdc		- soft state pointer for this instance of the device
 *			  driver.
 *	attr_msg	- LDC message sent by vDisk server
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg)
{
	int status = 0;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) {
		return (EPROTO);
	}

	switch (attr_msg->tag.vio_subtype) {
	case VIO_SUBTYPE_ACK:
		/*
		 * We now verify the attributes sent by vds.
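		 * (Note derived from the checks below: a size of
		 * VD_SIZE_UNKNOWN is not rejected but converted to 0, so
		 * the handshake can still complete for a disk whose backend
		 * size cannot be determined at this point.)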
		 */
		if (attr_msg->vdisk_size == 0) {
			DMSG(vdc, 0, "[%d] Invalid disk size from vds",
			    vdc->instance);
			status = EINVAL;
			break;
		}

		if (attr_msg->max_xfer_sz == 0) {
			DMSG(vdc, 0, "[%d] Invalid transfer size from vds",
			    vdc->instance);
			status = EINVAL;
			break;
		}

		if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) {
			DMSG(vdc, 0, "[%d] Unknown disk size from vds",
			    vdc->instance);
			attr_msg->vdisk_size = 0;
		}

		/* update disk, block and transfer sizes */
		vdc_update_size(vdc, attr_msg->vdisk_size,
		    attr_msg->vdisk_block_size, attr_msg->max_xfer_sz);
		vdc->vdisk_type = attr_msg->vdisk_type;
		vdc->operations = attr_msg->operations;
		if (vio_ver_is_supported(vdc->ver, 1, 1))
			vdc->vdisk_media = attr_msg->vdisk_media;
		else
			vdc->vdisk_media = 0;

		DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n",
		    vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz);
		DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n",
		    vdc->instance, vdc->block_size,
		    attr_msg->vdisk_block_size);

		if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) ||
		    (attr_msg->vdisk_size > INT64_MAX) ||
		    (attr_msg->operations == 0) ||
		    (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) {
			DMSG(vdc, 0, "[%d] Invalid attributes from vds",
			    vdc->instance);
			status = EINVAL;
			break;
		}

		/*
		 * Now that we have received all attributes we can create a
		 * fake geometry for the disk.
		 */
		vdc_create_fake_geometry(vdc);
		break;

	case VIO_SUBTYPE_NACK:
		/*
		 * vds could not handle the attributes we sent, so we
		 * stop negotiating.
		 */
		status = EPROTO;
		break;

	case VIO_SUBTYPE_INFO:
		/*
		 * Handle the case where vds starts the handshake
		 * (for now, vdc is the only supported instigator).
		 */
		status = ENOTSUP;
		break;

	default:
		status = ENOTSUP;
		break;
	}

	return (status);
}

/*
 * Function:
 *	vdc_handle_dring_reg_msg()
 *
 * Description:
 *	Handle a descriptor ring registration (VIO_DRING_REG) message
 *	received from the vDisk server.
 *
 * Arguments:
 *	vdc		- soft state pointer for this instance of the driver.
 *	dring_msg	- LDC message sent by vDisk server
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg)
{
	int status = 0;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) {
		return (EPROTO);
	}

	switch (dring_msg->tag.vio_subtype) {
	case VIO_SUBTYPE_ACK:
		/* save the received dring_ident */
		vdc->dring_ident = dring_msg->dring_ident;
		DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n",
		    vdc->instance, vdc->dring_ident);
		break;

	case VIO_SUBTYPE_NACK:
		/*
		 * vds could not handle the DRing info we sent, so we
		 * stop negotiating.
		 */
		DMSG(vdc, 0, "[%d] server could not register DRing\n",
		    vdc->instance);
		status = EPROTO;
		break;

	case VIO_SUBTYPE_INFO:
		/*
		 * Handle the case where vds starts the handshake
		 * (for now only vdc is the instigator).
		 */
		status = ENOTSUP;
		break;
	default:
		status = ENOTSUP;
	}

	return (status);
}

/*
 * Function:
 *	vdc_verify_seq_num()
 *
 * Description:
 *	This function verifies that the sequence number sent back by the vDisk
 *	server with the latest message is what is expected (i.e. it is greater
 *	than the last seq num sent by the vDisk server and less than or equal
 *	to the last seq num generated by vdc).
 *
 *	It then checks the request ID to see if any requests need processing
 *	in the DRing.
 *
 * Arguments:
 *	vdc		- soft state pointer for this instance of the driver.
 *	dring_msg	- pointer to the LDC message sent by vds
 *
 * Return Code:
 *	VDC_SEQ_NUM_TODO	- Message needs to be processed
 *	VDC_SEQ_NUM_SKIP	- Message has already been processed
 *	VDC_SEQ_NUM_INVALID	- The seq numbers are so out of sync that
 *				  vdc cannot deal with them
 */
static int
vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg)
{
	ASSERT(vdc != NULL);
	ASSERT(dring_msg != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	/*
	 * Check to see if the messages were responded to in the correct
	 * order by vds.
	 */
	if ((dring_msg->seq_num <= vdc->seq_num_reply) ||
	    (dring_msg->seq_num > vdc->seq_num)) {
		DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: "
		    "%lu > expected <= %lu (last proc req %lu sent %lu)\n",
		    vdc->instance, dring_msg->seq_num,
		    vdc->seq_num_reply, vdc->seq_num,
		    vdc->req_id_proc, vdc->req_id);
		return (VDC_SEQ_NUM_INVALID);
	}
	vdc->seq_num_reply = dring_msg->seq_num;

	if (vdc->req_id_proc < vdc->req_id)
		return (VDC_SEQ_NUM_TODO);
	else
		return (VDC_SEQ_NUM_SKIP);
}


/*
 * Function:
 *	vdc_is_supported_version()
 *
 * Description:
 *	This routine checks if the major/minor version numbers specified in
 *	'ver_msg' are supported. If not, it finds the next version that is
 *	in the supported version list 'vdc_version[]' and sets the fields in
 *	'ver_msg' to those values.
 *
 * Arguments:
 *	ver_msg	- LDC message sent by vDisk server
 *
 * Return Code:
 *	B_TRUE	- Success
 *	B_FALSE	- Version not supported
 */
static boolean_t
vdc_is_supported_version(vio_ver_msg_t *ver_msg)
{
	int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]);

	for (int i = 0; i < vdc_num_versions; i++) {
		ASSERT(vdc_version[i].major > 0);
		ASSERT((i == 0) ||
		    (vdc_version[i].major < vdc_version[i-1].major));

		/*
		 * If the major versions match, adjust the minor version, if
		 * necessary, down to the highest value supported by this
		 * client.
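		 * For instance (illustrative numbers): if this client
		 * supports at most version 2.5 and the server ACKs with
		 * 2.7, the minor number is adjusted down and 2.5 is used.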
		 * The server should support all minor versions lower
		 * than the value it sent.
		 */
		if (ver_msg->ver_major == vdc_version[i].major) {
			if (ver_msg->ver_minor > vdc_version[i].minor) {
				DMSGX(0,
				    "Adjusting minor version from %u to %u",
				    ver_msg->ver_minor, vdc_version[i].minor);
				ver_msg->ver_minor = vdc_version[i].minor;
			}
			return (B_TRUE);
		}

		/*
		 * If the message contains a higher major version number, set
		 * the message's major/minor versions to the current values
		 * and return false, so this message will get resent with
		 * these values, and the server will potentially try again
		 * with the same or a lower version.
		 */
		if (ver_msg->ver_major > vdc_version[i].major) {
			ver_msg->ver_major = vdc_version[i].major;
			ver_msg->ver_minor = vdc_version[i].minor;
			DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n",
			    ver_msg->ver_major, ver_msg->ver_minor);

			return (B_FALSE);
		}

		/*
		 * Otherwise, the message's major version is less than the
		 * current major version, so continue the loop to the next
		 * (lower) supported version.
		 */
	}

	/*
	 * No common version was found; "ground" the version pair in the
	 * message to terminate negotiation.
	 */
	ver_msg->ver_major = 0;
	ver_msg->ver_minor = 0;

	return (B_FALSE);
}

/* -------------------------------------------------------------------------- */

/*
 * DKIO(7I) support
 */

typedef struct vdc_dk_arg {
	struct dk_callback	dkc;
	int			mode;
	dev_t			dev;
	vdc_t			*vdc;
} vdc_dk_arg_t;

/*
 * Function:
 *	vdc_dkio_flush_cb()
 *
 * Description:
 *	This routine is a callback for DKIOCFLUSHWRITECACHE which can be called
 *	by kernel code.
 *
 * Arguments:
 *	arg	- a pointer to a vdc_dk_arg_t structure.
 */
void
vdc_dkio_flush_cb(void *arg)
{
	struct vdc_dk_arg	*dk_arg = (struct vdc_dk_arg *)arg;
	struct dk_callback	*dkc = NULL;
	vdc_t			*vdc = NULL;
	int			rv;

	if (dk_arg == NULL) {
		cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n");
		return;
	}
	dkc = &dk_arg->dkc;
	vdc = dk_arg->vdc;
	ASSERT(vdc != NULL);

	rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0,
	    VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);
	if (rv != 0) {
		DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n",
		    vdc->instance, rv,
		    ddi_model_convert_from(dk_arg->mode & FMODELS));
	}

	/*
	 * Trigger the callback to notify the caller that the ioctl call has
	 * been completed.
	 */
	if ((dk_arg->mode & FKIOCTL) &&
	    (dkc != NULL) &&
	    (dkc->dkc_callback != NULL)) {
		ASSERT(dkc->dkc_cookie != NULL);
		(*dkc->dkc_callback)(dkc->dkc_cookie, rv);
	}

	/* Indicate that one less DKIO write flush is outstanding */
	mutex_enter(&vdc->lock);
	vdc->dkio_flush_pending--;
	ASSERT(vdc->dkio_flush_pending >= 0);
	mutex_exit(&vdc->lock);

	/* free the mem that was allocated when the callback was dispatched */
	kmem_free(arg, sizeof (vdc_dk_arg_t));
}

/*
 * Function:
 *	vdc_dkio_gapart()
 *
 * Description:
 *	This function implements the DKIOCGAPART ioctl.
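 *
 *	Each VTOC partition is reported as a (start cylinder, block count)
 *	pair. As a worked example (the values here are illustrative only):
 *	with a fake geometry of nhead=4 and nsect=128, a partition starting
 *	at block 1024 is reported at cylinder 1024 / (4 * 128) = 2.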
5141 * 5142 * Arguments: 5143 * vdc - soft state pointer 5144 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 5145 * flag - ioctl flags 5146 */ 5147 static int 5148 vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag) 5149 { 5150 struct dk_geom *geom; 5151 struct vtoc *vtoc; 5152 union { 5153 struct dk_map map[NDKMAP]; 5154 struct dk_map32 map32[NDKMAP]; 5155 } data; 5156 int i, rv, size; 5157 5158 mutex_enter(&vdc->lock); 5159 5160 if ((rv = vdc_validate_geometry(vdc)) != 0) { 5161 mutex_exit(&vdc->lock); 5162 return (rv); 5163 } 5164 5165 vtoc = vdc->vtoc; 5166 geom = vdc->geom; 5167 5168 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5169 5170 for (i = 0; i < vtoc->v_nparts; i++) { 5171 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 5172 (geom->dkg_nhead * geom->dkg_nsect); 5173 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 5174 } 5175 size = NDKMAP * sizeof (struct dk_map32); 5176 5177 } else { 5178 5179 for (i = 0; i < vtoc->v_nparts; i++) { 5180 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 5181 (geom->dkg_nhead * geom->dkg_nsect); 5182 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 5183 } 5184 size = NDKMAP * sizeof (struct dk_map); 5185 5186 } 5187 5188 mutex_exit(&vdc->lock); 5189 5190 if (ddi_copyout(&data, arg, size, flag) != 0) 5191 return (EFAULT); 5192 5193 return (0); 5194 } 5195 5196 /* 5197 * Function: 5198 * vdc_dkio_partition() 5199 * 5200 * Description: 5201 * This function implements the DKIOCPARTITION ioctl. 5202 * 5203 * Arguments: 5204 * vdc - soft state pointer 5205 * arg - a pointer to a struct partition64 structure 5206 * flag - ioctl flags 5207 */ 5208 static int 5209 vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag) 5210 { 5211 struct partition64 p64; 5212 efi_gpt_t *gpt; 5213 efi_gpe_t *gpe; 5214 vd_efi_dev_t edev; 5215 uint_t partno; 5216 int rv; 5217 5218 if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) { 5219 return (EFAULT); 5220 } 5221 5222 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 5223 5224 if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) { 5225 return (rv); 5226 } 5227 5228 partno = p64.p_partno; 5229 5230 if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) { 5231 vd_efi_free(&edev, gpt, gpe); 5232 return (ESRCH); 5233 } 5234 5235 bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type, 5236 sizeof (struct uuid)); 5237 p64.p_start = gpe[partno].efi_gpe_StartingLBA; 5238 p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1; 5239 5240 if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) { 5241 vd_efi_free(&edev, gpt, gpe); 5242 return (EFAULT); 5243 } 5244 5245 vd_efi_free(&edev, gpt, gpe); 5246 return (0); 5247 } 5248 5249 /* 5250 * Function: 5251 * vdc_dioctl_rwcmd() 5252 * 5253 * Description: 5254 * This function implements the DIOCTL_RWCMD ioctl. This ioctl is used 5255 * for DKC_DIRECT disks to read or write at an absolute disk offset. 
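 *
 *	For instance (hypothetical values), a DADKIO_RWCMD_READ with
 *	blkaddr=16 and buflen=8192 is turned into a uio starting at byte
 *	offset 16 * DEV_BSIZE and handed to physio() with b_private set to
 *	VD_SLICE_NONE, so that vdc_strategy() treats the address as
 *	absolute rather than relative to a slice.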
5256 * 5257 * Arguments: 5258 * dev - device 5259 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 5260 * flag - ioctl flags 5261 */ 5262 static int 5263 vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag) 5264 { 5265 struct dadkio_rwcmd32 rwcmd32; 5266 struct dadkio_rwcmd rwcmd; 5267 struct iovec aiov; 5268 struct uio auio; 5269 int rw, status; 5270 struct buf *buf; 5271 5272 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5273 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 5274 sizeof (struct dadkio_rwcmd32), flag)) { 5275 return (EFAULT); 5276 } 5277 rwcmd.cmd = rwcmd32.cmd; 5278 rwcmd.flags = rwcmd32.flags; 5279 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 5280 rwcmd.buflen = rwcmd32.buflen; 5281 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 5282 } else { 5283 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 5284 sizeof (struct dadkio_rwcmd), flag)) { 5285 return (EFAULT); 5286 } 5287 } 5288 5289 switch (rwcmd.cmd) { 5290 case DADKIO_RWCMD_READ: 5291 rw = B_READ; 5292 break; 5293 case DADKIO_RWCMD_WRITE: 5294 rw = B_WRITE; 5295 break; 5296 default: 5297 return (EINVAL); 5298 } 5299 5300 bzero((caddr_t)&aiov, sizeof (struct iovec)); 5301 aiov.iov_base = rwcmd.bufaddr; 5302 aiov.iov_len = rwcmd.buflen; 5303 5304 bzero((caddr_t)&auio, sizeof (struct uio)); 5305 auio.uio_iov = &aiov; 5306 auio.uio_iovcnt = 1; 5307 auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE; 5308 auio.uio_resid = rwcmd.buflen; 5309 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 5310 5311 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 5312 bioinit(buf); 5313 /* 5314 * We use the private field of buf to specify that this is an 5315 * I/O using an absolute offset. 5316 */ 5317 buf->b_private = (void *)VD_SLICE_NONE; 5318 5319 status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio); 5320 5321 biofini(buf); 5322 kmem_free(buf, sizeof (buf_t)); 5323 5324 return (status); 5325 } 5326 5327 /* 5328 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated 5329 * buffer is returned in alloc_len. 5330 */ 5331 static vd_scsi_t * 5332 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, 5333 int *alloc_len) 5334 { 5335 vd_scsi_t *vd_scsi; 5336 int vd_scsi_len = VD_SCSI_SIZE; 5337 5338 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); 5339 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); 5340 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); 5341 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); 5342 5343 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 5344 5345 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); 5346 5347 vd_scsi->cdb_len = cdb_len; 5348 vd_scsi->sense_len = sense_len; 5349 vd_scsi->datain_len = datain_len; 5350 vd_scsi->dataout_len = dataout_len; 5351 5352 *alloc_len = vd_scsi_len; 5353 5354 return (vd_scsi); 5355 } 5356 5357 /* 5358 * Convert the status of a SCSI command to a Solaris return code. 5359 * 5360 * Arguments: 5361 * vd_scsi - The SCSI operation buffer. 5362 * log_error - indicate if an error message should be logged. 5363 * 5364 * Note that our SCSI error messages are rather primitive for the moment 5365 * and could be improved by decoding some data like the SCSI command and 5366 * the sense key. 5367 * 5368 * Return value: 5369 * 0 - Status is good. 5370 * EACCES - Status reports a reservation conflict. 5371 * ENOTSUP - Status reports a check condition and sense key 5372 * reports an illegal request. 5373 * EIO - Any other status. 
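 *
 * For example, a command completing with a reservation conflict status
 * because the disk is reserved by another host is reported as EACCES to
 * the caller of the corresponding ioctl.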
 */
static int
vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error)
{
	int rv;
	char path_str[MAXPATHLEN];
	char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN];
	union scsi_cdb *cdb;
	struct scsi_extended_sense *sense;

	if (vd_scsi->cmd_status == STATUS_GOOD)
		/* no error */
		return (0);

	/* when the tunable vdc_scsi_log_error is true we log all errors */
	if (vdc_scsi_log_error)
		log_error = B_TRUE;

	if (log_error) {
		cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x\n",
		    ddi_pathname(vdc->dip, path_str), vdc->instance,
		    GETCMD(VD_SCSI_DATA_CDB(vd_scsi)));
	}

	/* default returned value */
	rv = EIO;

	switch (vd_scsi->cmd_status) {

	case STATUS_CHECK:
	case STATUS_TERMINATED:
		if (log_error)
			cmn_err(CE_CONT, "\tCheck Condition Error\n");

		/* check sense buffer */
		if (vd_scsi->sense_len == 0 ||
		    vd_scsi->sense_status != STATUS_GOOD) {
			if (log_error)
				cmn_err(CE_CONT, "\tNo Sense Data Available\n");
			break;
		}

		sense = VD_SCSI_DATA_SENSE(vd_scsi);

		if (log_error) {
			cmn_err(CE_CONT, "\tSense Key: 0x%x\n"
			    "\tASC: 0x%x, ASCQ: 0x%x\n",
			    scsi_sense_key((uint8_t *)sense),
			    scsi_sense_asc((uint8_t *)sense),
			    scsi_sense_ascq((uint8_t *)sense));
		}

		if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST)
			rv = ENOTSUP;
		break;

	case STATUS_BUSY:
		if (log_error)
			cmn_err(CE_NOTE, "\tDevice Busy\n");
		break;

	case STATUS_RESERVATION_CONFLICT:
		/*
		 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then
		 * a reservation conflict could be due to various reasons,
		 * such as incorrect keys, or the host not being registered
		 * or reserved. So we should not panic in that case.
		 */
		cdb = VD_SCSI_DATA_CDB(vd_scsi);
		if (vdc->failfast_interval != 0 &&
		    cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN &&
		    cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) {
			/* failfast is enabled so we have to panic */
			(void) snprintf(panic_str, sizeof (panic_str),
			    VDC_RESV_CONFLICT_FMT_STR "%s",
			    ddi_pathname(vdc->dip, path_str));
			panic(panic_str);
		}
		if (log_error)
			cmn_err(CE_NOTE, "\tReservation Conflict\n");
		rv = EACCES;
		break;

	case STATUS_QFULL:
		if (log_error)
			cmn_err(CE_NOTE, "\tQueue Full\n");
		break;

	case STATUS_MET:
	case STATUS_INTERMEDIATE:
	case STATUS_SCSI2:
	case STATUS_INTERMEDIATE_MET:
	case STATUS_ACA_ACTIVE:
		if (log_error)
			cmn_err(CE_CONT,
			    "\tUnexpected SCSI status received: 0x%x\n",
			    vd_scsi->cmd_status);
		break;

	default:
		if (log_error)
			cmn_err(CE_CONT,
			    "\tInvalid SCSI status received: 0x%x\n",
			    vd_scsi->cmd_status);
		break;
	}

	return (rv);
}

/*
 * Implement the USCSICMD uscsi(7I) ioctl. This ioctl is converted to
 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI
 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is
 * converted to a VD_OP_RESET operation.
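 *
 * For example (an illustrative mapping): a request with USCSI_RESET_ALL
 * set becomes a single VD_OP_RESET with no data payload, while an
 * ordinary CDB is wrapped in a vd_scsi_t built by vdc_scsi_alloc() and
 * submitted with vdc_do_sync_op().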
5489 */ 5490 static int 5491 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) 5492 { 5493 struct uscsi_cmd uscsi; 5494 struct uscsi_cmd32 uscsi32; 5495 vd_scsi_t *vd_scsi; 5496 int vd_scsi_len; 5497 union scsi_cdb *cdb; 5498 struct scsi_extended_sense *sense; 5499 char *datain, *dataout; 5500 size_t cdb_len, datain_len, dataout_len, sense_len; 5501 int rv; 5502 5503 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5504 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), 5505 mode) != 0) 5506 return (EFAULT); 5507 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); 5508 } else { 5509 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), 5510 mode) != 0) 5511 return (EFAULT); 5512 } 5513 5514 /* a uscsi reset is converted to a VD_OP_RESET operation */ 5515 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | 5516 USCSI_RESET_ALL)) { 5517 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, 5518 (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); 5519 return (rv); 5520 } 5521 5522 /* cdb buffer length */ 5523 cdb_len = uscsi.uscsi_cdblen; 5524 5525 /* data in and out buffers length */ 5526 if (uscsi.uscsi_flags & USCSI_READ) { 5527 datain_len = uscsi.uscsi_buflen; 5528 dataout_len = 0; 5529 } else { 5530 datain_len = 0; 5531 dataout_len = uscsi.uscsi_buflen; 5532 } 5533 5534 /* sense buffer length */ 5535 if (uscsi.uscsi_flags & USCSI_RQENABLE) 5536 sense_len = uscsi.uscsi_rqlen; 5537 else 5538 sense_len = 0; 5539 5540 /* allocate buffer for the VD_SCSICMD_OP operation */ 5541 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5542 &vd_scsi_len); 5543 5544 /* 5545 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, 5546 * but basically they prevent a SCSI command from being retried in case 5547 * of an error. 
5548 */ 5549 if ((uscsi.uscsi_flags & USCSI_ISOLATE) || 5550 (uscsi.uscsi_flags & USCSI_DIAGNOSE)) 5551 vd_scsi->options |= VD_SCSI_OPT_NORETRY; 5552 5553 /* set task attribute */ 5554 if (uscsi.uscsi_flags & USCSI_NOTAG) { 5555 vd_scsi->task_attribute = 0; 5556 } else { 5557 if (uscsi.uscsi_flags & USCSI_HEAD) 5558 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5559 else if (uscsi.uscsi_flags & USCSI_HTAG) 5560 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; 5561 else if (uscsi.uscsi_flags & USCSI_OTAG) 5562 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; 5563 else 5564 vd_scsi->task_attribute = 0; 5565 } 5566 5567 /* set timeout */ 5568 vd_scsi->timeout = uscsi.uscsi_timeout; 5569 5570 /* copy-in cdb data */ 5571 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5572 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { 5573 rv = EFAULT; 5574 goto done; 5575 } 5576 5577 /* keep a pointer to the sense buffer */ 5578 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5579 5580 /* keep a pointer to the data-in buffer */ 5581 datain = (char *)VD_SCSI_DATA_IN(vd_scsi); 5582 5583 /* copy-in request data to the data-out buffer */ 5584 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); 5585 if (!(uscsi.uscsi_flags & USCSI_READ)) { 5586 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, 5587 mode)) { 5588 rv = EFAULT; 5589 goto done; 5590 } 5591 } 5592 5593 /* submit the request */ 5594 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5595 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5596 5597 if (rv != 0) 5598 goto done; 5599 5600 /* update scsi status */ 5601 uscsi.uscsi_status = vd_scsi->cmd_status; 5602 5603 /* update sense data */ 5604 if ((uscsi.uscsi_flags & USCSI_RQENABLE) && 5605 (uscsi.uscsi_status == STATUS_CHECK || 5606 uscsi.uscsi_status == STATUS_TERMINATED)) { 5607 5608 uscsi.uscsi_rqstatus = vd_scsi->sense_status; 5609 5610 if (uscsi.uscsi_rqstatus == STATUS_GOOD) { 5611 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - 5612 vd_scsi->sense_len; 5613 if (ddi_copyout(sense, uscsi.uscsi_rqbuf, 5614 vd_scsi->sense_len, mode) != 0) { 5615 rv = EFAULT; 5616 goto done; 5617 } 5618 } 5619 } 5620 5621 /* update request data */ 5622 if (uscsi.uscsi_status == STATUS_GOOD) { 5623 if (uscsi.uscsi_flags & USCSI_READ) { 5624 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5625 vd_scsi->datain_len; 5626 if (ddi_copyout(datain, uscsi.uscsi_bufaddr, 5627 vd_scsi->datain_len, mode) != 0) { 5628 rv = EFAULT; 5629 goto done; 5630 } 5631 } else { 5632 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5633 vd_scsi->dataout_len; 5634 } 5635 } 5636 5637 /* copy-out result */ 5638 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5639 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); 5640 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), 5641 mode) != 0) { 5642 rv = EFAULT; 5643 goto done; 5644 } 5645 } else { 5646 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), 5647 mode) != 0) { 5648 rv = EFAULT; 5649 goto done; 5650 } 5651 } 5652 5653 /* get the return code from the SCSI command status */ 5654 rv = vdc_scsi_status(vdc, vd_scsi, 5655 !(uscsi.uscsi_flags & USCSI_SILENT)); 5656 5657 done: 5658 kmem_free(vd_scsi, vd_scsi_len); 5659 return (rv); 5660 } 5661 5662 /* 5663 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. 5664 * 5665 * Arguments: 5666 * cmd - SCSI PERSISTENT IN command 5667 * len - length of the SCSI input buffer 5668 * vd_scsi_len - return the length of the allocated buffer 5669 * 5670 * Returned Value: 5671 * a pointer to the allocated VD_OP_SCSICMD buffer. 
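 *
 * Usage sketch (this mirrors what vdc_mhd_inkeys() does below, with an
 * illustrative list size of 3):
 *
 *	vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS,
 *	    sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) +
 *	    3 * sizeof (mhioc_resv_key_t), &vd_scsi_len);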
 */
static vd_scsi_t *
vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len)
{
	int cdb_len, sense_len, datain_len, dataout_len;
	vd_scsi_t *vd_scsi;
	union scsi_cdb *cdb;

	cdb_len = CDB_GROUP1;
	sense_len = sizeof (struct scsi_extended_sense);
	datain_len = len;
	dataout_len = 0;

	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
	    vd_scsi_len);

	cdb = VD_SCSI_DATA_CDB(vd_scsi);

	/* set cdb */
	cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN;
	cdb->cdb_opaque[1] = cmd;
	FORMG1COUNT(cdb, datain_len);

	vd_scsi->timeout = vdc_scsi_timeout;

	return (vd_scsi);
}

/*
 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command.
 *
 * Arguments:
 *	cmd		- SCSI PERSISTENT OUT command
 *	len		- length of the SCSI output buffer
 *	vd_scsi_len	- return the length of the allocated buffer
 *
 * Returned Value:
 *	a pointer to the allocated VD_OP_SCSICMD buffer.
 */
static vd_scsi_t *
vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len)
{
	int cdb_len, sense_len, datain_len, dataout_len;
	vd_scsi_t *vd_scsi;
	union scsi_cdb *cdb;

	cdb_len = CDB_GROUP1;
	sense_len = sizeof (struct scsi_extended_sense);
	datain_len = 0;
	dataout_len = len;

	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
	    vd_scsi_len);

	cdb = VD_SCSI_DATA_CDB(vd_scsi);

	/* set cdb */
	cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT;
	cdb->cdb_opaque[1] = cmd;
	FORMG1COUNT(cdb, dataout_len);

	vd_scsi->timeout = vdc_scsi_timeout;

	return (vd_scsi);
}

/*
 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted
 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk
 * server with a VD_OP_SCSICMD operation.
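 *
 * Note (describing the copyout logic below): if the caller's listsize is
 * smaller than the number of keys returned, only listsize keys are copied
 * out, but listlen still reports the full count so that the caller can
 * reallocate a bigger list and retry.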
5742 */ 5743 static int 5744 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5745 { 5746 vd_scsi_t *vd_scsi; 5747 mhioc_inkeys_t inkeys; 5748 mhioc_key_list_t klist; 5749 struct mhioc_inkeys32 inkeys32; 5750 struct mhioc_key_list32 klist32; 5751 sd_prin_readkeys_t *scsi_keys; 5752 void *user_keys; 5753 int vd_scsi_len; 5754 int listsize, listlen, rv; 5755 5756 /* copyin arguments */ 5757 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5758 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5759 if (rv != 0) 5760 return (EFAULT); 5761 5762 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5763 sizeof (klist32), mode); 5764 if (rv != 0) 5765 return (EFAULT); 5766 5767 listsize = klist32.listsize; 5768 } else { 5769 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5770 if (rv != 0) 5771 return (EFAULT); 5772 5773 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5774 if (rv != 0) 5775 return (EFAULT); 5776 5777 listsize = klist.listsize; 5778 } 5779 5780 /* build SCSI VD_OP request */ 5781 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5782 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5783 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5784 5785 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5786 5787 /* submit the request */ 5788 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5789 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5790 5791 if (rv != 0) 5792 goto done; 5793 5794 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5795 5796 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5797 inkeys32.generation = scsi_keys->generation; 5798 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5799 if (rv != 0) { 5800 rv = EFAULT; 5801 goto done; 5802 } 5803 5804 klist32.listlen = listlen; 5805 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5806 sizeof (klist32), mode); 5807 if (rv != 0) { 5808 rv = EFAULT; 5809 goto done; 5810 } 5811 5812 user_keys = (caddr_t)(uintptr_t)klist32.list; 5813 } else { 5814 inkeys.generation = scsi_keys->generation; 5815 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5816 if (rv != 0) { 5817 rv = EFAULT; 5818 goto done; 5819 } 5820 5821 klist.listlen = listlen; 5822 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5823 if (rv != 0) { 5824 rv = EFAULT; 5825 goto done; 5826 } 5827 5828 user_keys = klist.list; 5829 } 5830 5831 /* copy out keys */ 5832 if (listlen > 0 && listsize > 0) { 5833 if (listsize < listlen) 5834 listlen = listsize; 5835 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5836 listlen * MHIOC_RESV_KEY_SIZE, mode); 5837 if (rv != 0) 5838 rv = EFAULT; 5839 } 5840 5841 if (rv == 0) 5842 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5843 5844 done: 5845 kmem_free(vd_scsi, vd_scsi_len); 5846 5847 return (rv); 5848 } 5849 5850 /* 5851 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5852 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5853 * the vdisk server with a VD_OP_SCSICMD operation. 
5854 */ 5855 static int 5856 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5857 { 5858 vd_scsi_t *vd_scsi; 5859 mhioc_inresvs_t inresv; 5860 mhioc_resv_desc_list_t rlist; 5861 struct mhioc_inresvs32 inresv32; 5862 struct mhioc_resv_desc_list32 rlist32; 5863 mhioc_resv_desc_t mhd_resv; 5864 sd_prin_readresv_t *scsi_resv; 5865 sd_readresv_desc_t *resv; 5866 mhioc_resv_desc_t *user_resv; 5867 int vd_scsi_len; 5868 int listsize, listlen, i, rv; 5869 5870 /* copyin arguments */ 5871 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5872 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5873 if (rv != 0) 5874 return (EFAULT); 5875 5876 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5877 sizeof (rlist32), mode); 5878 if (rv != 0) 5879 return (EFAULT); 5880 5881 listsize = rlist32.listsize; 5882 } else { 5883 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5884 if (rv != 0) 5885 return (EFAULT); 5886 5887 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5888 if (rv != 0) 5889 return (EFAULT); 5890 5891 listsize = rlist.listsize; 5892 } 5893 5894 /* build SCSI VD_OP request */ 5895 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5896 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5897 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5898 5899 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5900 5901 /* submit the request */ 5902 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5903 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5904 5905 if (rv != 0) 5906 goto done; 5907 5908 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5909 5910 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5911 inresv32.generation = scsi_resv->generation; 5912 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5913 if (rv != 0) { 5914 rv = EFAULT; 5915 goto done; 5916 } 5917 5918 rlist32.listlen = listlen; 5919 rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, 5920 sizeof (rlist32), mode); 5921 if (rv != 0) { 5922 rv = EFAULT; 5923 goto done; 5924 } 5925 5926 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 5927 } else { 5928 inresv.generation = scsi_resv->generation; 5929 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 5930 if (rv != 0) { 5931 rv = EFAULT; 5932 goto done; 5933 } 5934 5935 rlist.listlen = listlen; 5936 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 5937 if (rv != 0) { 5938 rv = EFAULT; 5939 goto done; 5940 } 5941 5942 user_resv = rlist.list; 5943 } 5944 5945 /* copy out reservations */ 5946 if (listsize > 0 && listlen > 0) { 5947 if (listsize < listlen) 5948 listlen = listsize; 5949 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 5950 5951 for (i = 0; i < listlen; i++) { 5952 mhd_resv.type = resv->type; 5953 mhd_resv.scope = resv->scope; 5954 mhd_resv.scope_specific_addr = 5955 BE_32(resv->scope_specific_addr); 5956 bcopy(&resv->resvkey, &mhd_resv.key, 5957 MHIOC_RESV_KEY_SIZE); 5958 5959 rv = ddi_copyout(&mhd_resv, user_resv, 5960 sizeof (mhd_resv), mode); 5961 if (rv != 0) { 5962 rv = EFAULT; 5963 goto done; 5964 } 5965 resv++; 5966 user_resv++; 5967 } 5968 } 5969 5970 if (rv == 0) 5971 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5972 5973 done: 5974 kmem_free(vd_scsi, vd_scsi_len); 5975 return (rv); 5976 } 5977 5978 /* 5979 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 5980 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 5981 * server with a VD_OP_SCSICMD operation. 
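 *
 * Sketch of the parameter mapping performed below:
 *	mhioc_register_t.oldkey		-> sd_prout_t.res_key
 *	mhioc_register_t.newkey		-> sd_prout_t.service_key
 *	mhioc_register_t.aptpl		-> sd_prout_t.aptpl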
5982 */ 5983 static int 5984 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 5985 { 5986 vd_scsi_t *vd_scsi; 5987 sd_prout_t *scsi_prout; 5988 mhioc_register_t mhd_reg; 5989 int vd_scsi_len, rv; 5990 5991 /* copyin arguments */ 5992 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 5993 if (rv != 0) 5994 return (EFAULT); 5995 5996 /* build SCSI VD_OP request */ 5997 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 5998 sizeof (sd_prout_t), &vd_scsi_len); 5999 6000 /* set parameters */ 6001 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6002 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6003 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 6004 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 6005 6006 /* submit the request */ 6007 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6008 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6009 6010 if (rv == 0) 6011 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6012 6013 kmem_free(vd_scsi, vd_scsi_len); 6014 return (rv); 6015 } 6016 6017 /* 6018 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 6019 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 6020 * server with a VD_OP_SCSICMD operation. 6021 */ 6022 static int 6023 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 6024 { 6025 union scsi_cdb *cdb; 6026 vd_scsi_t *vd_scsi; 6027 sd_prout_t *scsi_prout; 6028 mhioc_resv_desc_t mhd_resv; 6029 int vd_scsi_len, rv; 6030 6031 /* copyin arguments */ 6032 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 6033 if (rv != 0) 6034 return (EFAULT); 6035 6036 /* build SCSI VD_OP request */ 6037 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 6038 sizeof (sd_prout_t), &vd_scsi_len); 6039 6040 /* set parameters */ 6041 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6042 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6043 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6044 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 6045 cdb->cdb_opaque[2] = mhd_resv.type; 6046 6047 /* submit the request */ 6048 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6049 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6050 6051 if (rv == 0) 6052 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6053 6054 kmem_free(vd_scsi, vd_scsi_len); 6055 return (rv); 6056 } 6057 6058 /* 6059 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 6060 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 6061 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
6062 */ 6063 static int 6064 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode) 6065 { 6066 union scsi_cdb *cdb; 6067 vd_scsi_t *vd_scsi; 6068 sd_prout_t *scsi_prout; 6069 mhioc_preemptandabort_t mhd_preempt; 6070 int vd_scsi_len, rv; 6071 6072 /* copyin arguments */ 6073 rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode); 6074 if (rv != 0) 6075 return (EFAULT); 6076 6077 /* build SCSI VD_OP request */ 6078 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT, 6079 sizeof (sd_prout_t), &vd_scsi_len); 6080 6081 /* set parameters */ 6082 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 6083 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6084 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6085 bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key, 6086 MHIOC_RESV_KEY_SIZE); 6087 bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key, 6088 MHIOC_RESV_KEY_SIZE); 6089 scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr; 6090 cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type; 6091 6092 /* submit the request */ 6093 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6094 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6095 6096 if (rv == 0) 6097 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6098 6099 kmem_free(vd_scsi, vd_scsi_len); 6100 return (rv); 6101 } 6102 6103 /* 6104 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl 6105 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY 6106 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation. 6107 */ 6108 static int 6109 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode) 6110 { 6111 vd_scsi_t *vd_scsi; 6112 sd_prout_t *scsi_prout; 6113 mhioc_registerandignorekey_t mhd_regi; 6114 int vd_scsi_len, rv; 6115 6116 /* copyin arguments */ 6117 rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode); 6118 if (rv != 0) 6119 return (EFAULT); 6120 6121 /* build SCSI VD_OP request */ 6122 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY, 6123 sizeof (sd_prout_t), &vd_scsi_len); 6124 6125 /* set parameters */ 6126 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6127 bcopy(mhd_regi.newkey.key, scsi_prout->service_key, 6128 MHIOC_RESV_KEY_SIZE); 6129 scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl; 6130 6131 /* submit the request */ 6132 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6133 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6134 6135 if (rv == 0) 6136 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6137 6138 kmem_free(vd_scsi, vd_scsi_len); 6139 return (rv); 6140 } 6141 6142 /* 6143 * This function is used by the failfast mechanism to send a SCSI command 6144 * to check for reservation conflict. 6145 */ 6146 static int 6147 vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) 6148 { 6149 int cdb_len, sense_len, vd_scsi_len; 6150 vd_scsi_t *vd_scsi; 6151 union scsi_cdb *cdb; 6152 int rv; 6153 6154 ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1); 6155 6156 if (scmd == SCMD_WRITE_G1) 6157 cdb_len = CDB_GROUP1; 6158 else 6159 cdb_len = CDB_GROUP0; 6160 6161 sense_len = sizeof (struct scsi_extended_sense); 6162 6163 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len); 6164 6165 /* set cdb */ 6166 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6167 cdb->scc_cmd = scmd; 6168 6169 vd_scsi->timeout = vdc_scsi_timeout; 6170 6171 /* 6172 * Submit the request. 
	 * The last argument has to be B_FALSE so that
	 * vdc_do_sync_op does not loop checking for a reservation conflict
	 * if the operation returns an error.
	 */
	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE);

	if (rv == 0)
		(void) vdc_scsi_status(vdc, vd_scsi, B_FALSE);

	kmem_free(vd_scsi, vd_scsi_len);
	return (rv);
}

/*
 * This function is used by the failfast mechanism to check for reservation
 * conflict. It sends SCSI commands which will fail with a reservation
 * conflict error if the system does not have access to the disk, and such
 * an error will panic the system.
 *
 * Returned Code:
 *	0	- disk is accessible without reservation conflict error
 *	!= 0	- unable to check if disk is accessible
 */
int
vdc_failfast_check_resv(vdc_t *vdc)
{
	int failure = 0;

	/*
	 * Send a TEST UNIT READY command. The command will panic
	 * the system if it fails with a reservation conflict.
	 */
	if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0)
		failure++;

	/*
	 * With SPC-3 compliant devices TEST UNIT READY will succeed on
	 * a reserved device, so we also do a zero-byte WRITE(10) in
	 * order to provoke a Reservation Conflict status on those newer
	 * devices.
	 */
	if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0)
		failure++;

	return (failure);
}

/*
 * Add a pending I/O to the failfast I/O queue. An I/O is added to this
 * queue when it has failed and failfast is enabled. Then we have to check
 * if it has failed because of a reservation conflict, in which case we
 * have to panic the system.
 *
 * Async I/Os should be queued with their block I/O data transfer structure
 * (buf). Sync I/Os should be queued with buf = NULL.
 */
static vdc_io_t *
vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf)
{
	vdc_io_t *vio;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP);
	vio->vio_next = vdc->failfast_io_queue;
	vio->vio_buf = buf;
	vio->vio_qtime = ddi_get_lbolt();

	vdc->failfast_io_queue = vio;

	/* notify the failfast thread that a new I/O is queued */
	cv_signal(&vdc->failfast_cv);

	return (vio);
}

/*
 * Remove and complete I/Os in the failfast I/O queue which have been
 * queued before the indicated deadline. A deadline of 0 means that all
 * I/Os have to be unqueued and marked as completed.
 */
static void
vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline)
{
	vdc_io_t *vio, *vio_tmp;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vio_tmp = NULL;
	vio = vdc->failfast_io_queue;

	if (deadline != 0) {
		/*
		 * Skip any I/O queued after the deadline. The failfast
		 * I/O queue is ordered starting with the last I/O added
		 * to the queue.
		 */
		while (vio != NULL && vio->vio_qtime > deadline) {
			vio_tmp = vio;
			vio = vio->vio_next;
		}
	}

	if (vio == NULL)
		/* nothing to unqueue */
		return;

	/* update the queue */
	if (vio_tmp == NULL)
		vdc->failfast_io_queue = NULL;
	else
		vio_tmp->vio_next = NULL;

	/*
	 * Complete unqueued I/O.
Async I/O have a block I/O data transfer
 * structure (buf) and they are completed by calling biodone(). Sync
 * I/O do not have a buf and they are completed by setting the
 * vio_qtime to zero and signaling failfast_io_cv. In that case, the
 * thread waiting for the I/O to complete is responsible for freeing
 * the vio structure.
 */
	while (vio != NULL) {
		vio_tmp = vio->vio_next;
		if (vio->vio_buf != NULL) {
			VD_KSTAT_RUNQ_EXIT(vdc);
			DTRACE_IO1(done, buf_t *, vio->vio_buf);
			biodone(vio->vio_buf);
			kmem_free(vio, sizeof (vdc_io_t));
		} else {
			vio->vio_qtime = 0;
		}
		vio = vio_tmp;
	}

	cv_broadcast(&vdc->failfast_io_cv);
}

/*
 * Failfast Thread.
 *
 * While failfast is enabled, the failfast thread sends TEST UNIT READY
 * and zero-size WRITE(10) SCSI commands on a regular basis to check that
 * we still have access to the disk. If a command fails with a RESERVATION
 * CONFLICT error then the system will immediately panic.
 *
 * The failfast thread is also woken up when an I/O has failed. It then
 * checks the access to the disk to ensure that the I/O failure was not
 * due to a reservation conflict.
 *
 * There is one failfast thread for each virtual disk for which failfast is
 * enabled. We could have only one thread sending requests for all disks but
 * this would need vdc to send asynchronous requests and to have callbacks to
 * process replies.
 */
static void
vdc_failfast_thread(void *arg)
{
	int status;
	vdc_t *vdc = (vdc_t *)arg;
	clock_t timeout, starttime;

	mutex_enter(&vdc->lock);

	while (vdc->failfast_interval != 0) {

		starttime = ddi_get_lbolt();

		mutex_exit(&vdc->lock);

		/* check for reservation conflict */
		status = vdc_failfast_check_resv(vdc);

		mutex_enter(&vdc->lock);
		/*
		 * We have dropped the lock to send the SCSI command so we have
		 * to check that failfast is still enabled.
		 */
		if (vdc->failfast_interval == 0)
			break;

		/*
		 * If we have successfully checked the disk access and there
		 * was no reservation conflict then we can complete any I/O
		 * queued before the last check.
		 */
		if (status == 0)
			vdc_failfast_io_unqueue(vdc, starttime);

		/* proceed again if some I/O are still in the queue */
		if (vdc->failfast_io_queue != NULL)
			continue;

		timeout = ddi_get_lbolt() +
		    drv_usectohz(vdc->failfast_interval);
		(void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout);
	}

	/*
	 * Failfast is being stopped so we can complete any queued I/O.
	 */
	vdc_failfast_io_unqueue(vdc, 0);
	vdc->failfast_thread = NULL;
	mutex_exit(&vdc->lock);
	thread_exit();
}

/*
 * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
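 * The ioctl argument is the failfast timeout in milliseconds; a non-zero
 * value enables failfast and starts the failfast thread if it is not
 * already running, while a value of zero disables failfast.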
 */
static int
vdc_failfast(vdc_t *vdc, caddr_t arg, int mode)
{
	unsigned int mh_time;

	if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode))
		return (EFAULT);

	mutex_enter(&vdc->lock);
	if (mh_time != 0 && vdc->failfast_thread == NULL) {
		vdc->failfast_thread = thread_create(NULL, 0,
		    vdc_failfast_thread, vdc, 0, &p0, TS_RUN,
		    v.v_maxsyspri - 2);
	}

	vdc->failfast_interval = mh_time * 1000;
	cv_signal(&vdc->failfast_cv);
	mutex_exit(&vdc->lock);

	return (0);
}

/*
 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are
 * converted to VD_OP_SET_ACCESS operations.
 */
static int
vdc_access_set(vdc_t *vdc, uint64_t flags, int mode)
{
	int rv;

	/* submit ownership command request */
	rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags,
	    sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
	    VIO_both_dir, B_TRUE);

	return (rv);
}

/*
 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a
 * VD_OP_GET_ACCESS operation.
 */
static int
vdc_access_get(vdc_t *vdc, uint64_t *status, int mode)
{
	int rv;

	/* submit ownership command request */
	rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status,
	    sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
	    VIO_both_dir, B_TRUE);

	return (rv);
}

/*
 * Disk Ownership Thread.
 *
 * When we have taken the ownership of a disk, this thread waits to be
 * notified when the LDC channel is reset so that it can recover the
 * ownership.
 *
 * Note that the thread handling the LDC reset (vdc_process_msg_thread())
 * cannot be used to do the ownership recovery because it has to be
 * running to handle the reply message to the ownership operation.
 */
static void
vdc_ownership_thread(void *arg)
{
	vdc_t *vdc = (vdc_t *)arg;
	clock_t timeout;
	uint64_t status;

	mutex_enter(&vdc->ownership_lock);
	mutex_enter(&vdc->lock);

	while (vdc->ownership & VDC_OWNERSHIP_WANTED) {

		if ((vdc->ownership & VDC_OWNERSHIP_RESET) ||
		    !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) {
			/*
			 * There was a reset so the ownership has been lost,
			 * try to recover. We do this without using the preempt
			 * option so that we don't steal the ownership from
			 * someone who has preempted us.
			 */
			DMSG(vdc, 0, "[%d] Ownership lost, recovering",
			    vdc->instance);

			vdc->ownership &= ~(VDC_OWNERSHIP_RESET |
			    VDC_OWNERSHIP_GRANTED);

			mutex_exit(&vdc->lock);

			status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
			    VD_ACCESS_SET_PRESERVE, FKIOCTL);

			mutex_enter(&vdc->lock);

			if (status == 0) {
				DMSG(vdc, 0, "[%d] Ownership recovered",
				    vdc->instance);
				vdc->ownership |= VDC_OWNERSHIP_GRANTED;
			} else {
				DMSG(vdc, 0, "[%d] Failed to recover ownership",
				    vdc->instance);
			}

		}

		/*
		 * If we have the ownership then we just wait for an event
		 * to happen (LDC reset), otherwise we will retry to recover
		 * after a delay.
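		 * A timeout of zero means that we block in cv_wait() until
		 * we are signaled; otherwise we wake up after
		 * vdc_ownership_delay microseconds and retry the recovery.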
6497 */ 6498 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) 6499 timeout = 0; 6500 else 6501 timeout = ddi_get_lbolt() + 6502 drv_usectohz(vdc_ownership_delay); 6503 6504 /* Release the ownership_lock and wait on the vdc lock */ 6505 mutex_exit(&vdc->ownership_lock); 6506 6507 if (timeout == 0) 6508 (void) cv_wait(&vdc->ownership_cv, &vdc->lock); 6509 else 6510 (void) cv_timedwait(&vdc->ownership_cv, 6511 &vdc->lock, timeout); 6512 6513 mutex_exit(&vdc->lock); 6514 6515 mutex_enter(&vdc->ownership_lock); 6516 mutex_enter(&vdc->lock); 6517 } 6518 6519 vdc->ownership_thread = NULL; 6520 mutex_exit(&vdc->lock); 6521 mutex_exit(&vdc->ownership_lock); 6522 6523 thread_exit(); 6524 } 6525 6526 static void 6527 vdc_ownership_update(vdc_t *vdc, int ownership_flags) 6528 { 6529 ASSERT(MUTEX_HELD(&vdc->ownership_lock)); 6530 6531 mutex_enter(&vdc->lock); 6532 vdc->ownership = ownership_flags; 6533 if ((vdc->ownership & VDC_OWNERSHIP_WANTED) && 6534 vdc->ownership_thread == NULL) { 6535 /* start ownership thread */ 6536 vdc->ownership_thread = thread_create(NULL, 0, 6537 vdc_ownership_thread, vdc, 0, &p0, TS_RUN, 6538 v.v_maxsyspri - 2); 6539 } else { 6540 /* notify the ownership thread */ 6541 cv_signal(&vdc->ownership_cv); 6542 } 6543 mutex_exit(&vdc->lock); 6544 } 6545 6546 /* 6547 * Get the size and the block size of a virtual disk from the vdisk server. 6548 */ 6549 static int 6550 vdc_get_capacity(vdc_t *vdc, size_t *dsk_size, size_t *blk_size) 6551 { 6552 int rv = 0; 6553 size_t alloc_len; 6554 vd_capacity_t *vd_cap; 6555 6556 ASSERT(MUTEX_NOT_HELD(&vdc->lock)); 6557 6558 alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t)); 6559 6560 vd_cap = kmem_zalloc(alloc_len, KM_SLEEP); 6561 6562 rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len, 6563 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE); 6564 6565 *dsk_size = vd_cap->vdisk_size; 6566 *blk_size = vd_cap->vdisk_block_size; 6567 6568 kmem_free(vd_cap, alloc_len); 6569 return (rv); 6570 } 6571 6572 /* 6573 * Check the disk capacity. Disk size information is updated if size has 6574 * changed. 6575 * 6576 * Return 0 if the disk capacity is available, or non-zero if it is not. 6577 */ 6578 static int 6579 vdc_check_capacity(vdc_t *vdc) 6580 { 6581 size_t dsk_size, blk_size; 6582 int rv; 6583 6584 if ((rv = vdc_get_capacity(vdc, &dsk_size, &blk_size)) != 0) 6585 return (rv); 6586 6587 if (dsk_size == VD_SIZE_UNKNOWN || dsk_size == 0) 6588 return (EINVAL); 6589 6590 mutex_enter(&vdc->lock); 6591 vdc_update_size(vdc, dsk_size, blk_size, vdc->max_xfer_sz); 6592 mutex_exit(&vdc->lock); 6593 6594 return (0); 6595 } 6596 6597 /* 6598 * This structure is used in the DKIO(7I) array below. 
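 * An entry with op set to 0 identifies an ioctl which is not converted
 * to a VD operation but is instead handled specially by vd_process_ioctl().
 * An nbytes value of 0 means that there is no fixed-size structure to
 * copy; for the EFI ioctls the size is computed from the ioctl argument.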
 */
typedef struct vdc_dk_ioctl {
	uint8_t		op;		/* VD_OP_XXX value */
	int		cmd;		/* Solaris ioctl operation number */
	size_t		nbytes;		/* size of structure to be copied */

	/* function to convert between vDisk and Solaris structure formats */
	int	(*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg,
	    int mode, int dir);
} vdc_dk_ioctl_t;

/*
 * Subset of DKIO(7I) operations currently supported
 */
static vdc_dk_ioctl_t	dk_ioctl[] = {
	{VD_OP_FLUSH,		DKIOCFLUSHWRITECACHE,	0,
		vdc_null_copy_func},
	{VD_OP_GET_WCE,		DKIOCGETWCE,		sizeof (int),
		vdc_get_wce_convert},
	{VD_OP_SET_WCE,		DKIOCSETWCE,		sizeof (int),
		vdc_set_wce_convert},
	{VD_OP_GET_VTOC,	DKIOCGVTOC,		sizeof (vd_vtoc_t),
		vdc_get_vtoc_convert},
	{VD_OP_SET_VTOC,	DKIOCSVTOC,		sizeof (vd_vtoc_t),
		vdc_set_vtoc_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCGGEOM,		sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCG_PHYGEOM,		sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCG_VIRTGEOM,	sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_SET_DISKGEOM,	DKIOCSGEOM,		sizeof (vd_geom_t),
		vdc_set_geom_convert},
	{VD_OP_GET_EFI,		DKIOCGETEFI,		0,
		vdc_get_efi_convert},
	{VD_OP_SET_EFI,		DKIOCSETEFI,		0,
		vdc_set_efi_convert},

	/* DIOCTL_RWCMD is converted to a read or a write */
	{0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL},

	/* mhd(7I) non-shared multihost disks ioctls */
	{0, MHIOCTKOWN,				0, vdc_null_copy_func},
	{0, MHIOCRELEASE,			0, vdc_null_copy_func},
	{0, MHIOCSTATUS,			0, vdc_null_copy_func},
	{0, MHIOCQRESERVE,			0, vdc_null_copy_func},

	/* mhd(7I) shared multihost disks ioctls */
	{0, MHIOCGRP_INKEYS,			0, vdc_null_copy_func},
	{0, MHIOCGRP_INRESV,			0, vdc_null_copy_func},
	{0, MHIOCGRP_REGISTER,			0, vdc_null_copy_func},
	{0, MHIOCGRP_RESERVE,			0, vdc_null_copy_func},
	{0, MHIOCGRP_PREEMPTANDABORT,		0, vdc_null_copy_func},
	{0, MHIOCGRP_REGISTERANDIGNOREKEY,	0, vdc_null_copy_func},

	/* mhd(7I) failfast ioctl */
	{0, MHIOCENFAILFAST,			0, vdc_null_copy_func},

	/*
	 * These particular ioctls are not sent to the server - vdc fakes up
	 * the necessary info.
	 */
	{0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func},
	{0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func},
	{0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func},
	{0, DKIOCPARTITION, 0, vdc_null_copy_func},
	{0, DKIOCGAPART, 0, vdc_null_copy_func},
	{0, DKIOCREMOVABLE, 0, vdc_null_copy_func},
	{0, CDROMREADOFFSET, 0, vdc_null_copy_func}
};

/*
 * This function handles ioctl requests from the vd_efi_alloc_and_read()
 * function and forwards them to the vdisk.
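 * The request is issued on slice 0 of the vdisk using a fabricated dev_t,
 * and with the FKIOCTL flag set since the caller runs in kernel context.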
6673 */ 6674 static int 6675 vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg) 6676 { 6677 vdc_t *vdc = (vdc_t *)vdisk; 6678 dev_t dev; 6679 int rval; 6680 6681 dev = makedevice(ddi_driver_major(vdc->dip), 6682 VD_MAKE_DEV(vdc->instance, 0)); 6683 6684 return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval)); 6685 } 6686 6687 /* 6688 * Function: 6689 * vd_process_ioctl() 6690 * 6691 * Description: 6692 * This routine processes disk specific ioctl calls 6693 * 6694 * Arguments: 6695 * dev - the device number 6696 * cmd - the operation [dkio(7I)] to be processed 6697 * arg - pointer to user provided structure 6698 * (contains data to be set or reference parameter for get) 6699 * mode - bit flag, indicating open settings, 32/64 bit type, etc 6700 * rvalp - pointer to return value for calling process. 6701 * 6702 * Return Code: 6703 * 0 6704 * EFAULT 6705 * ENXIO 6706 * EIO 6707 * ENOTSUP 6708 */ 6709 static int 6710 vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp) 6711 { 6712 int instance = VDCUNIT(dev); 6713 vdc_t *vdc = NULL; 6714 int rv = -1; 6715 int idx = 0; /* index into dk_ioctl[] */ 6716 size_t len = 0; /* #bytes to send to vds */ 6717 size_t alloc_len = 0; /* #bytes to allocate mem for */ 6718 caddr_t mem_p = NULL; 6719 size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0])); 6720 vdc_dk_ioctl_t *iop; 6721 6722 vdc = ddi_get_soft_state(vdc_state, instance); 6723 if (vdc == NULL) { 6724 cmn_err(CE_NOTE, "![%d] Could not get soft state structure", 6725 instance); 6726 return (ENXIO); 6727 } 6728 6729 DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n", 6730 instance, cmd, dev, ddi_model_convert_from(mode & FMODELS)); 6731 6732 if (rvalp != NULL) { 6733 /* the return value of the ioctl is 0 by default */ 6734 *rvalp = 0; 6735 } 6736 6737 /* 6738 * Validate the ioctl operation to be performed. 6739 * 6740 * If we have looped through the array without finding a match then we 6741 * don't support this ioctl. 
6742 */ 6743 for (idx = 0; idx < nioctls; idx++) { 6744 if (cmd == dk_ioctl[idx].cmd) 6745 break; 6746 } 6747 6748 if (idx >= nioctls) { 6749 DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n", 6750 vdc->instance, cmd); 6751 return (ENOTSUP); 6752 } 6753 6754 iop = &(dk_ioctl[idx]); 6755 6756 if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) { 6757 /* size is not fixed for EFI ioctls, it depends on ioctl arg */ 6758 dk_efi_t dk_efi; 6759 6760 rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode); 6761 if (rv != 0) 6762 return (EFAULT); 6763 6764 len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length; 6765 } else { 6766 len = iop->nbytes; 6767 } 6768 6769 /* check if the ioctl is applicable */ 6770 switch (cmd) { 6771 case CDROMREADOFFSET: 6772 case DKIOCREMOVABLE: 6773 return (ENOTTY); 6774 6775 case USCSICMD: 6776 case MHIOCTKOWN: 6777 case MHIOCSTATUS: 6778 case MHIOCQRESERVE: 6779 case MHIOCRELEASE: 6780 case MHIOCGRP_INKEYS: 6781 case MHIOCGRP_INRESV: 6782 case MHIOCGRP_REGISTER: 6783 case MHIOCGRP_RESERVE: 6784 case MHIOCGRP_PREEMPTANDABORT: 6785 case MHIOCGRP_REGISTERANDIGNOREKEY: 6786 case MHIOCENFAILFAST: 6787 if (vdc->cinfo == NULL) 6788 return (ENXIO); 6789 if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS) 6790 return (ENOTTY); 6791 break; 6792 6793 case DIOCTL_RWCMD: 6794 if (vdc->cinfo == NULL) 6795 return (ENXIO); 6796 if (vdc->cinfo->dki_ctype != DKC_DIRECT) 6797 return (ENOTTY); 6798 break; 6799 6800 case DKIOCINFO: 6801 if (vdc->cinfo == NULL) 6802 return (ENXIO); 6803 break; 6804 6805 case DKIOCGMEDIAINFO: 6806 if (vdc->minfo == NULL) 6807 return (ENXIO); 6808 if (vdc_check_capacity(vdc) != 0) 6809 /* disk capacity is not available */ 6810 return (EIO); 6811 break; 6812 } 6813 6814 /* 6815 * Deal with ioctls which require a processing different than 6816 * converting ioctl arguments and sending a corresponding 6817 * VD operation. 6818 */ 6819 switch (cmd) { 6820 6821 case USCSICMD: 6822 { 6823 return (vdc_uscsi_cmd(vdc, arg, mode)); 6824 } 6825 6826 case MHIOCTKOWN: 6827 { 6828 mutex_enter(&vdc->ownership_lock); 6829 /* 6830 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership 6831 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset 6832 * while we are processing the ioctl. 6833 */ 6834 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED); 6835 6836 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6837 VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode); 6838 if (rv == 0) { 6839 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED | 6840 VDC_OWNERSHIP_GRANTED); 6841 } else { 6842 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6843 } 6844 mutex_exit(&vdc->ownership_lock); 6845 return (rv); 6846 } 6847 6848 case MHIOCRELEASE: 6849 { 6850 mutex_enter(&vdc->ownership_lock); 6851 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode); 6852 if (rv == 0) { 6853 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6854 } 6855 mutex_exit(&vdc->ownership_lock); 6856 return (rv); 6857 } 6858 6859 case MHIOCSTATUS: 6860 { 6861 uint64_t status; 6862 6863 rv = vdc_access_get(vdc, &status, mode); 6864 if (rv == 0 && rvalp != NULL) 6865 *rvalp = (status & VD_ACCESS_ALLOWED)? 
0 : 1; 6866 return (rv); 6867 } 6868 6869 case MHIOCQRESERVE: 6870 { 6871 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode); 6872 return (rv); 6873 } 6874 6875 case MHIOCGRP_INKEYS: 6876 { 6877 return (vdc_mhd_inkeys(vdc, arg, mode)); 6878 } 6879 6880 case MHIOCGRP_INRESV: 6881 { 6882 return (vdc_mhd_inresv(vdc, arg, mode)); 6883 } 6884 6885 case MHIOCGRP_REGISTER: 6886 { 6887 return (vdc_mhd_register(vdc, arg, mode)); 6888 } 6889 6890 case MHIOCGRP_RESERVE: 6891 { 6892 return (vdc_mhd_reserve(vdc, arg, mode)); 6893 } 6894 6895 case MHIOCGRP_PREEMPTANDABORT: 6896 { 6897 return (vdc_mhd_preemptabort(vdc, arg, mode)); 6898 } 6899 6900 case MHIOCGRP_REGISTERANDIGNOREKEY: 6901 { 6902 return (vdc_mhd_registerignore(vdc, arg, mode)); 6903 } 6904 6905 case MHIOCENFAILFAST: 6906 { 6907 rv = vdc_failfast(vdc, arg, mode); 6908 return (rv); 6909 } 6910 6911 case DIOCTL_RWCMD: 6912 { 6913 return (vdc_dioctl_rwcmd(dev, arg, mode)); 6914 } 6915 6916 case DKIOCGAPART: 6917 { 6918 return (vdc_dkio_gapart(vdc, arg, mode)); 6919 } 6920 6921 case DKIOCPARTITION: 6922 { 6923 return (vdc_dkio_partition(vdc, arg, mode)); 6924 } 6925 6926 case DKIOCINFO: 6927 { 6928 struct dk_cinfo cinfo; 6929 6930 bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo)); 6931 cinfo.dki_partition = VDCPART(dev); 6932 6933 rv = ddi_copyout(&cinfo, (void *)arg, 6934 sizeof (struct dk_cinfo), mode); 6935 if (rv != 0) 6936 return (EFAULT); 6937 6938 return (0); 6939 } 6940 6941 case DKIOCGMEDIAINFO: 6942 { 6943 ASSERT(vdc->vdisk_size != 0); 6944 ASSERT(vdc->minfo->dki_capacity != 0); 6945 rv = ddi_copyout(vdc->minfo, (void *)arg, 6946 sizeof (struct dk_minfo), mode); 6947 if (rv != 0) 6948 return (EFAULT); 6949 6950 return (0); 6951 } 6952 6953 case DKIOCFLUSHWRITECACHE: 6954 { 6955 struct dk_callback *dkc = 6956 (struct dk_callback *)(uintptr_t)arg; 6957 vdc_dk_arg_t *dkarg = NULL; 6958 6959 DMSG(vdc, 1, "[%d] Flush W$: mode %x\n", 6960 instance, mode); 6961 6962 /* 6963 * If arg is NULL, then there is no callback function 6964 * registered and the call operates synchronously; we 6965 * break and continue with the rest of the function and 6966 * wait for vds to return (i.e. after the request to 6967 * vds returns successfully, all writes completed prior 6968 * to the ioctl will have been flushed from the disk 6969 * write cache to persistent media. 6970 * 6971 * If a callback function is registered, we dispatch 6972 * the request on a task queue and return immediately. 6973 * The callback will deal with informing the calling 6974 * thread that the flush request is completed. 6975 */ 6976 if (dkc == NULL) 6977 break; 6978 6979 /* 6980 * the asynchronous callback is only supported if 6981 * invoked from within the kernel 6982 */ 6983 if ((mode & FKIOCTL) == 0) 6984 return (ENOTSUP); 6985 6986 dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP); 6987 6988 dkarg->mode = mode; 6989 dkarg->dev = dev; 6990 bcopy(dkc, &dkarg->dkc, sizeof (*dkc)); 6991 6992 mutex_enter(&vdc->lock); 6993 vdc->dkio_flush_pending++; 6994 dkarg->vdc = vdc; 6995 mutex_exit(&vdc->lock); 6996 6997 /* put the request on a task queue */ 6998 rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb, 6999 (void *)dkarg, DDI_SLEEP); 7000 if (rv == NULL) { 7001 /* clean up if dispatch fails */ 7002 mutex_enter(&vdc->lock); 7003 vdc->dkio_flush_pending--; 7004 mutex_exit(&vdc->lock); 7005 kmem_free(dkarg, sizeof (vdc_dk_arg_t)); 7006 } 7007 7008 return (rv == NULL ? 
ENOMEM : 0); 7009 } 7010 } 7011 7012 /* catch programming error in vdc - should be a VD_OP_XXX ioctl */ 7013 ASSERT(iop->op != 0); 7014 7015 /* check if the vDisk server handles the operation for this vDisk */ 7016 if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) { 7017 DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n", 7018 vdc->instance, iop->op); 7019 return (ENOTSUP); 7020 } 7021 7022 /* LDC requires that the memory being mapped is 8-byte aligned */ 7023 alloc_len = P2ROUNDUP(len, sizeof (uint64_t)); 7024 DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n", 7025 instance, len, alloc_len); 7026 7027 if (alloc_len > 0) 7028 mem_p = kmem_zalloc(alloc_len, KM_SLEEP); 7029 7030 /* 7031 * Call the conversion function for this ioctl which, if necessary, 7032 * converts from the Solaris format to the format ARC'ed 7033 * as part of the vDisk protocol (FWARC 2006/195) 7034 */ 7035 ASSERT(iop->convert != NULL); 7036 rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN); 7037 if (rv != 0) { 7038 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7039 instance, rv, cmd); 7040 if (mem_p != NULL) 7041 kmem_free(mem_p, alloc_len); 7042 return (rv); 7043 } 7044 7045 /* 7046 * send request to vds to service the ioctl. 7047 */ 7048 rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len, 7049 VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode, 7050 VIO_both_dir, B_TRUE); 7051 7052 if (rv != 0) { 7053 /* 7054 * This is not necessarily an error. The ioctl could 7055 * be returning a value such as ENOTTY to indicate 7056 * that the ioctl is not applicable. 7057 */ 7058 DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n", 7059 instance, rv, cmd); 7060 if (mem_p != NULL) 7061 kmem_free(mem_p, alloc_len); 7062 7063 return (rv); 7064 } 7065 7066 /* 7067 * Call the conversion function (if it exists) for this ioctl 7068 * which converts from the format ARC'ed as part of the vDisk 7069 * protocol (FWARC 2006/195) back to a format understood by 7070 * the rest of Solaris. 
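	 * For set operations (e.g. DKIOCSVTOC or DKIOCSETEFI) the conversion
	 * function may also revalidate the disk geometry since the operation
	 * can have changed the disk label.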
	 */
	rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT);
	if (rv != 0) {
		DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
		    instance, rv, cmd);
		if (mem_p != NULL)
			kmem_free(mem_p, alloc_len);
		return (rv);
	}

	if (mem_p != NULL)
		kmem_free(mem_p, alloc_len);

	return (rv);
}

/*
 * Function:
 *	vdc_null_copy_func()
 *
 * Description:
 *	This is an empty conversion function used by ioctl calls which
 *	do not need to convert the data being passed in/out to userland
 */
static int
vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))
	_NOTE(ARGUNUSED(from))
	_NOTE(ARGUNUSED(to))
	_NOTE(ARGUNUSED(mode))
	_NOTE(ARGUNUSED(dir))

	return (0);
}

static int
vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	if (dir == VD_COPYIN)
		return (0);	/* nothing to do */

	if (ddi_copyout(from, to, sizeof (int), mode) != 0)
		return (EFAULT);

	return (0);
}

static int
vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	if (dir == VD_COPYOUT)
		return (0);	/* nothing to do */

	if (ddi_copyin(from, to, sizeof (int), mode) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Function:
 *	vdc_get_vtoc_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCGVTOC
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 *	In the struct vtoc definition, the timestamp field is marked as not
 *	supported so it is not part of the vDisk protocol (FWARC 2006/195).
 *	However SVM uses that field to check that it can write into the VTOC,
 *	so we fake up the info of that field.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- the buffer containing the data to be copied from
 *	to	- the buffer to be copied to
 *	mode	- flags passed to ioctl() call
 *	dir	- the "direction" of the copy - VD_COPYIN or VD_COPYOUT
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- incorrect buffer passed in.
 *	EFAULT	- ddi_copyout routine encountered an error.
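 *
 * Note: for a 32-bit (ILP32) caller, the VTOC is converted to a
 * struct vtoc32 before being copied out.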
 */
static int
vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	int i;
	void *tmp_memp;
	struct vtoc vt;
	struct vtoc32 vt32;
	int copy_len = 0;
	int rv = 0;

	if (dir != VD_COPYOUT)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32)
		copy_len = sizeof (struct vtoc32);
	else
		copy_len = sizeof (struct vtoc);

	VD_VTOC2VTOC((vd_vtoc_t *)from, &vt);

	/* fake the VTOC timestamp field */
	for (i = 0; i < V_NUMPAR; i++) {
		vt.timestamp[i] = vdc->vtoc->timestamp[i];
	}

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		/* LINTED E_ASSIGN_NARROW_CONV */
		vtoctovtoc32(vt, vt32);
		tmp_memp = &vt32;
	} else {
		tmp_memp = &vt;
	}
	rv = ddi_copyout(tmp_memp, to, copy_len, mode);
	if (rv != 0)
		rv = EFAULT;

	return (rv);
}

/*
 * Function:
 *	vdc_set_vtoc_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCSVTOC
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyin of data failed
 */
static int
vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	void *tmp_mem = NULL, *uvtoc;
	struct vtoc vt;
	struct vtoc *vtp = &vt;
	vd_vtoc_t vtvd;
	int copy_len = 0;
	int i, rv = 0;

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (dir == VD_COPYIN)
		uvtoc = from;
	else
		uvtoc = to;

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32)
		copy_len = sizeof (struct vtoc32);
	else
		copy_len = sizeof (struct vtoc);

	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);

	rv = ddi_copyin(uvtoc, tmp_mem, copy_len, mode);
	if (rv != 0) {
		kmem_free(tmp_mem, copy_len);
		return (EFAULT);
	}

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		vtoc32tovtoc((*(struct vtoc32 *)tmp_mem), vt);
	} else {
		vtp = tmp_mem;
	}

	if (dir == VD_COPYOUT) {
		/*
		 * The disk label may have changed. Revalidate the disk
		 * geometry. This will also update the device nodes.
		 */
		vdc_validate(vdc);

		/*
		 * We also need to keep track of the timestamp fields.
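		 * The timestamp field is not part of the vDisk protocol, so
		 * we cache it locally in order to fake it up when a
		 * DKIOCGVTOC is processed (see vdc_get_vtoc_convert()).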
		 */
		for (i = 0; i < V_NUMPAR; i++) {
			vdc->vtoc->timestamp[i] = vtp->timestamp[i];
		}

		kmem_free(tmp_mem, copy_len);
		return (0);
	}

	VTOC2VD_VTOC(vtp, &vtvd);
	bcopy(&vtvd, to, sizeof (vd_vtoc_t));
	kmem_free(tmp_mem, copy_len);

	return (0);
}

/*
 * Function:
 *	vdc_get_geom_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCGGEOM,
 *	DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format
 *	defined in FWARC 2006/195
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyout of data failed
 */
static int
vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	struct dk_geom geom;
	int copy_len = sizeof (struct dk_geom);
	int rv = 0;

	if (dir != VD_COPYOUT)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom);
	rv = ddi_copyout(&geom, to, copy_len, mode);
	if (rv != 0)
		rv = EFAULT;

	return (rv);
}

/*
 * Function:
 *	vdc_set_geom_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCSGEOM
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyin of data failed
 */
static int
vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	vd_geom_t vdgeom;
	void *tmp_mem = NULL;
	int copy_len = sizeof (struct dk_geom);
	int rv = 0;

	if (dir != VD_COPYIN)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);

	rv = ddi_copyin(from, tmp_mem, copy_len, mode);
	if (rv != 0) {
		kmem_free(tmp_mem, copy_len);
		return (EFAULT);
	}
	DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom);
	bcopy(&vdgeom, to, sizeof (vdgeom));
	kmem_free(tmp_mem, copy_len);

	return (0);
}

static int
vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	vd_efi_t *vd_efi;
	dk_efi_t dk_efi;
	int rv = 0;
	void *uaddr;

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (dir == VD_COPYIN) {

		vd_efi = (vd_efi_t *)to;

		rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		vd_efi->lba = dk_efi.dki_lba;
		vd_efi->length = dk_efi.dki_length;
		bzero(vd_efi->data, vd_efi->length);

	} else {

		rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		uaddr = dk_efi.dki_data;

		dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);
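		/*
		 * Convert the EFI data returned by the vdisk server into the
		 * buffer just allocated, then copy it out to the user address
		 * saved in uaddr.
		 */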
		VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi);

		rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length,
		    mode);

		kmem_free(dk_efi.dki_data, dk_efi.dki_length);

		if (rv != 0)
			return (EFAULT);
	}

	return (0);
}

static int
vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	dk_efi_t dk_efi;
	void *uaddr;

	if (dir == VD_COPYOUT) {
		/*
		 * The disk label may have changed. Revalidate the disk
		 * geometry. This will also update the device nodes.
		 */
		vdc_validate(vdc);
		return (0);
	}

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0)
		return (EFAULT);

	uaddr = dk_efi.dki_data;

	dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);

	if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) {
		kmem_free(dk_efi.dki_data, dk_efi.dki_length);
		return (EFAULT);
	}

	DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to);

	kmem_free(dk_efi.dki_data, dk_efi.dki_length);

	return (0);
}


/* -------------------------------------------------------------------------- */

/*
 * Function:
 *	vdc_create_fake_geometry()
 *
 * Description:
 *	This routine fakes up the disk info needed for some DKIO ioctls such
 *	as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do]
 *
 *	Note: This function must not be called until the vDisk attributes have
 *	been exchanged as part of the handshake with the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	none.
 */
static void
vdc_create_fake_geometry(vdc_t *vdc)
{
	ASSERT(vdc != NULL);
	ASSERT(vdc->max_xfer_sz != 0);

	/*
	 * DKIOCINFO support
	 */
	if (vdc->cinfo == NULL)
		vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP);

	(void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME);
	(void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME);
	/* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */
	vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz;

	/*
	 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD
	 * operation is supported, otherwise the controller type is DKC_DIRECT.
	 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the
	 * controller type is always DKC_DIRECT in that case.
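	 *
	 * The controller type matters because vd_process_ioctl() checks it
	 * to decide if an ioctl is applicable: the mhd(7I) ioctls require
	 * DKC_SCSI_CCS and DIOCTL_RWCMD requires DKC_DIRECT.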
7511 * 7512 * If the virtual disk is backed by a physical CD/DVD device or 7513 * an ISO image, modify the controller type to indicate this 7514 */ 7515 switch (vdc->vdisk_media) { 7516 case VD_MEDIA_CD: 7517 case VD_MEDIA_DVD: 7518 vdc->cinfo->dki_ctype = DKC_CDROM; 7519 break; 7520 case VD_MEDIA_FIXED: 7521 if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD)) 7522 vdc->cinfo->dki_ctype = DKC_SCSI_CCS; 7523 else 7524 vdc->cinfo->dki_ctype = DKC_DIRECT; 7525 break; 7526 default: 7527 /* in the case of v1.0 we default to a fixed disk */ 7528 vdc->cinfo->dki_ctype = DKC_DIRECT; 7529 break; 7530 } 7531 vdc->cinfo->dki_flags = DKI_FMTVOL; 7532 vdc->cinfo->dki_cnum = 0; 7533 vdc->cinfo->dki_addr = 0; 7534 vdc->cinfo->dki_space = 0; 7535 vdc->cinfo->dki_prio = 0; 7536 vdc->cinfo->dki_vec = 0; 7537 vdc->cinfo->dki_unit = vdc->instance; 7538 vdc->cinfo->dki_slave = 0; 7539 /* 7540 * The partition number will be created on the fly depending on the 7541 * actual slice (i.e. minor node) that is used to request the data. 7542 */ 7543 vdc->cinfo->dki_partition = 0; 7544 7545 /* 7546 * DKIOCGMEDIAINFO support 7547 */ 7548 if (vdc->minfo == NULL) 7549 vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); 7550 7551 if (vio_ver_is_supported(vdc->ver, 1, 1)) { 7552 vdc->minfo->dki_media_type = 7553 VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media); 7554 } else { 7555 vdc->minfo->dki_media_type = DK_FIXED_DISK; 7556 } 7557 7558 vdc->minfo->dki_capacity = vdc->vdisk_size; 7559 vdc->minfo->dki_lbsize = vdc->block_size; 7560 } 7561 7562 static ushort_t 7563 vdc_lbl2cksum(struct dk_label *label) 7564 { 7565 int count; 7566 ushort_t sum, *sp; 7567 7568 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 7569 sp = (ushort_t *)label; 7570 sum = 0; 7571 while (count--) { 7572 sum ^= *sp++; 7573 } 7574 7575 return (sum); 7576 } 7577 7578 static void 7579 vdc_update_size(vdc_t *vdc, size_t dsk_size, size_t blk_size, size_t xfr_size) 7580 { 7581 vd_err_stats_t *stp; 7582 7583 ASSERT(MUTEX_HELD(&vdc->lock)); 7584 ASSERT(xfr_size != 0); 7585 7586 /* 7587 * If the disk size is unknown or sizes are unchanged then don't 7588 * update anything. 7589 */ 7590 if (dsk_size == VD_SIZE_UNKNOWN || dsk_size == 0 || 7591 (blk_size == vdc->block_size && dsk_size == vdc->vdisk_size && 7592 xfr_size == vdc->max_xfer_sz)) 7593 return; 7594 7595 /* 7596 * We don't know at compile time what the vDisk server will think 7597 * are good values but we apply a large (arbitrary) upper bound to 7598 * prevent memory exhaustion in vdc if it was allocating a DRing 7599 * based of huge values sent by the server. We probably will never 7600 * exceed this except if the message was garbage. 7601 */ 7602 if ((xfr_size * blk_size) > (PAGESIZE * DEV_BSIZE)) { 7603 DMSG(vdc, 0, "[%d] vds block transfer size too big;" 7604 " using max supported by vdc", vdc->instance); 7605 xfr_size = maxphys / DEV_BSIZE; 7606 dsk_size = (dsk_size * blk_size) / DEV_BSIZE; 7607 blk_size = DEV_BSIZE; 7608 } 7609 7610 vdc->max_xfer_sz = xfr_size; 7611 vdc->block_size = blk_size; 7612 vdc->vdisk_size = dsk_size; 7613 7614 stp = (vd_err_stats_t *)vdc->err_stats->ks_data; 7615 stp->vd_capacity.value.ui64 = dsk_size * blk_size; 7616 7617 vdc->minfo->dki_capacity = dsk_size; 7618 vdc->minfo->dki_lbsize = (uint_t)blk_size; 7619 } 7620 7621 /* 7622 * Function: 7623 * vdc_validate_geometry 7624 * 7625 * Description: 7626 * This routine discovers the label and geometry of the disk. It stores 7627 * the disk label and related information in the vdc structure. 
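 * The label is discovered by first trying the DKIOCGGEOM and DKIOCGVTOC
 * ioctls; if the disk does not support a VTOC (ENOTSUP) then we try to
 * read an EFI label instead.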
If it
 * fails to validate the geometry or to discover the disk label then the
 * label is marked as unknown (VD_DISK_LABEL_UNK).
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- success.
 *	EINVAL	- unknown disk label.
 *	ENOTSUP	- geometry not applicable (EFI label).
 *	EIO	- error accessing the disk.
 */
static int
vdc_validate_geometry(vdc_t *vdc)
{
	buf_t	*buf;	/* BREAD requests need to be in a buf_t structure */
	dev_t	dev;
	int	rv, rval;
	struct dk_label label;
	struct dk_geom geom;
	struct vtoc vtoc;
	efi_gpt_t *gpt;
	efi_gpe_t *gpe;
	vd_efi_dev_t edev;

	ASSERT(vdc != NULL);
	ASSERT(vdc->vtoc != NULL && vdc->geom != NULL);
	ASSERT(MUTEX_HELD(&vdc->lock));

	mutex_exit(&vdc->lock);
	/*
	 * Check the disk capacity in case it has changed. If that fails then
	 * we proceed and we will be using the disk size we currently have.
	 */
	(void) vdc_check_capacity(vdc);
	dev = makedevice(ddi_driver_major(vdc->dip),
	    VD_MAKE_DEV(vdc->instance, 0));

	rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval);
	if (rv == 0)
		rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc,
		    FKIOCTL, &rval);

	if (rv == ENOTSUP) {
		/*
		 * If the device does not support VTOC then we try
		 * to read an EFI label.
		 *
		 * We need to know the block size and the disk size to
		 * be able to read an EFI label.
		 */
		if (vdc->vdisk_size == 0) {
			mutex_enter(&vdc->lock);
			vdc_store_label_unk(vdc);
			return (EIO);
		}

		VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl);

		rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe);

		if (rv) {
			DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)",
			    vdc->instance, rv);
			mutex_enter(&vdc->lock);
			vdc_store_label_unk(vdc);
			return (EIO);
		}

		mutex_enter(&vdc->lock);
		vdc_store_label_efi(vdc, gpt, gpe);
		vd_efi_free(&edev, gpt, gpe);
		return (ENOTSUP);
	}

	if (rv != 0) {
		DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)",
		    vdc->instance, rv);
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		if (rv != EINVAL)
			rv = EIO;
		return (rv);
	}

	/* check that geometry and vtoc are valid */
	if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 ||
	    vtoc.v_sanity != VTOC_SANE) {
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	/*
	 * We have a disk and a valid VTOC. However this does not mean
	 * that the disk currently has a VTOC label. The returned VTOC may
	 * be a default VTOC to be used for configuring the disk (this is
	 * what is done for disk images). So we read the label from the
	 * beginning of the disk to ensure we really have a VTOC label.
	 *
	 * FUTURE: This could be the default way for reading the VTOC
	 * from the disk as opposed to sending the VD_OP_GET_VTOC
	 * to the server. This will be the default if vdc is implemented
	 * on top of cmlb.
	 */

	/*
	 * A single-slice disk does not support reads using an absolute disk
	 * offset so we just rely on the DKIOCGVTOC ioctl in that case.
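	 *
	 * For a full disk, the label read from block 0 is validated by
	 * checking its magic number (DKL_MAGIC) and its checksum
	 * (see vdc_lbl2cksum()).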
7737 */ 7738 if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) { 7739 mutex_enter(&vdc->lock); 7740 if (vtoc.v_nparts != 1) { 7741 vdc_store_label_unk(vdc); 7742 return (EINVAL); 7743 } 7744 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7745 return (0); 7746 } 7747 7748 if (vtoc.v_nparts != V_NUMPAR) { 7749 mutex_enter(&vdc->lock); 7750 vdc_store_label_unk(vdc); 7751 return (EINVAL); 7752 } 7753 7754 /* 7755 * Read disk label from start of disk 7756 */ 7757 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 7758 bioinit(buf); 7759 buf->b_un.b_addr = (caddr_t)&label; 7760 buf->b_bcount = DK_LABEL_SIZE; 7761 buf->b_flags = B_BUSY | B_READ; 7762 buf->b_dev = cmpdev(dev); 7763 rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label, 7764 DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir); 7765 if (rv) { 7766 DMSG(vdc, 1, "[%d] Failed to read disk block 0\n", 7767 vdc->instance); 7768 } else { 7769 rv = biowait(buf); 7770 biofini(buf); 7771 } 7772 kmem_free(buf, sizeof (buf_t)); 7773 7774 if (rv != 0 || label.dkl_magic != DKL_MAGIC || 7775 label.dkl_cksum != vdc_lbl2cksum(&label)) { 7776 DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n", 7777 vdc->instance); 7778 mutex_enter(&vdc->lock); 7779 vdc_store_label_unk(vdc); 7780 return (EINVAL); 7781 } 7782 7783 mutex_enter(&vdc->lock); 7784 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7785 return (0); 7786 } 7787 7788 /* 7789 * Function: 7790 * vdc_validate 7791 * 7792 * Description: 7793 * This routine discovers the label of the disk and create the 7794 * appropriate device nodes if the label has changed. 7795 * 7796 * Arguments: 7797 * vdc - soft state pointer for this instance of the device driver. 7798 * 7799 * Return Code: 7800 * none. 7801 */ 7802 static void 7803 vdc_validate(vdc_t *vdc) 7804 { 7805 vd_disk_label_t old_label; 7806 vd_slice_t old_slice[V_NUMPAR]; 7807 int rv; 7808 7809 ASSERT(!MUTEX_HELD(&vdc->lock)); 7810 7811 mutex_enter(&vdc->lock); 7812 7813 /* save the current label and vtoc */ 7814 old_label = vdc->vdisk_label; 7815 bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR); 7816 7817 /* check the geometry */ 7818 (void) vdc_validate_geometry(vdc); 7819 7820 /* if the disk label has changed, update device nodes */ 7821 if (vdc->vdisk_label != old_label) { 7822 7823 if (vdc->vdisk_label == VD_DISK_LABEL_EFI) 7824 rv = vdc_create_device_nodes_efi(vdc); 7825 else 7826 rv = vdc_create_device_nodes_vtoc(vdc); 7827 7828 if (rv != 0) { 7829 DMSG(vdc, 0, "![%d] Failed to update device nodes", 7830 vdc->instance); 7831 } 7832 } 7833 7834 mutex_exit(&vdc->lock); 7835 } 7836 7837 static void 7838 vdc_validate_task(void *arg) 7839 { 7840 vdc_t *vdc = (vdc_t *)arg; 7841 7842 vdc_validate(vdc); 7843 7844 mutex_enter(&vdc->lock); 7845 ASSERT(vdc->validate_pending > 0); 7846 vdc->validate_pending--; 7847 mutex_exit(&vdc->lock); 7848 } 7849 7850 /* 7851 * Function: 7852 * vdc_setup_devid() 7853 * 7854 * Description: 7855 * This routine discovers the devid of a vDisk. It requests the devid of 7856 * the underlying device from the vDisk server, builds an encapsulated 7857 * devid based on the retrieved devid and registers that new devid to 7858 * the vDisk. 7859 * 7860 * Arguments: 7861 * vdc - soft state pointer for this instance of the device driver. 
 *
 * Return Code:
 *	0	- A devid was successfully registered for the vDisk
 */
static int
vdc_setup_devid(vdc_t *vdc)
{
	int rv;
	vd_devid_t *vd_devid;
	size_t bufsize, bufid_len;

	/*
	 * At first sight, we don't know the size of the devid that the
	 * server will return but this size will be encoded into the
	 * reply. So we do a first request using a default size then we
	 * check if this size was large enough. If not then we do a second
	 * request with the correct size returned by the server. Note that
	 * ldc requires size to be 8-byte aligned.
	 */
	bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN),
	    sizeof (uint64_t));
	vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
	bufid_len = bufsize - sizeof (vd_efi_t) - 1;

	rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
	    bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);

	DMSG(vdc, 2, "sync_op returned %d\n", rv);

	if (rv) {
		kmem_free(vd_devid, bufsize);
		return (rv);
	}

	if (vd_devid->length > bufid_len) {
		size_t len = vd_devid->length;

		/*
		 * The returned devid is larger than the buffer used. Try again
		 * with a buffer with the right size. Note that the returned
		 * length must be saved before freeing the first buffer.
		 */
		kmem_free(vd_devid, bufsize);
		bufsize = P2ROUNDUP(VD_DEVID_SIZE(len), sizeof (uint64_t));
		vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
		bufid_len = bufsize - sizeof (vd_efi_t) - 1;

		rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID,
		    (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0,
		    VIO_both_dir, B_TRUE);

		if (rv) {
			kmem_free(vd_devid, bufsize);
			return (rv);
		}
	}

	/*
	 * The virtual disk should have the same device id as the one
	 * associated with the physical disk it is mapped on, otherwise
	 * sharing a disk between an LDom and a non-LDom may not work (for
	 * example for a shared SVM disk set).
	 *
	 * The DDI framework does not allow creating a device id with any
	 * type so we first create a device id of type DEVID_ENCAP and then
	 * we restore the original type of the physical device.
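	 * The type is restored with DEVID_FORMTYPE() once the encapsulated
	 * devid has been built by ddi_devid_init().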
	 */

	DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length);

	/* build an encapsulated devid based on the returned devid */
	if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length,
	    vd_devid->id, &vdc->devid) != DDI_SUCCESS) {
		DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance);
		kmem_free(vd_devid, bufsize);
		return (1);
	}

	DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type);

	ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS);

	kmem_free(vd_devid, bufsize);

	if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) {
		DMSG(vdc, 1, "[%d] Failed to register devid\n", vdc->instance);
		return (1);
	}

	return (0);
}

static void
vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe)
{
	int i, nparts;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vdc->vdisk_label = VD_DISK_LABEL_EFI;
	bzero(vdc->vtoc, sizeof (struct vtoc));
	bzero(vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);

	nparts = gpt->efi_gpt_NumberOfPartitionEntries;

	for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) {

		if (gpe[i].efi_gpe_StartingLBA == 0 ||
		    gpe[i].efi_gpe_EndingLBA == 0) {
			continue;
		}

		vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA;
		vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA -
		    gpe[i].efi_gpe_StartingLBA + 1;
	}

	ASSERT(vdc->vdisk_size != 0);
	vdc->slice[VD_EFI_WD_SLICE].start = 0;
	vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size;
}

static void
vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct vtoc *vtoc)
{
	int i;

	ASSERT(MUTEX_HELD(&vdc->lock));
	ASSERT(vdc->block_size == vtoc->v_sectorsz);

	vdc->vdisk_label = VD_DISK_LABEL_VTOC;
	bcopy(vtoc, vdc->vtoc, sizeof (struct vtoc));
	bcopy(geom, vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);

	for (i = 0; i < vtoc->v_nparts; i++) {
		vdc->slice[i].start = vtoc->v_part[i].p_start;
		vdc->slice[i].nblocks = vtoc->v_part[i].p_size;
	}
}

static void
vdc_store_label_unk(vdc_t *vdc)
{
	ASSERT(MUTEX_HELD(&vdc->lock));

	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	bzero(vdc->vtoc, sizeof (struct vtoc));
	bzero(vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);
}