/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * LDoms virtual disk client (vdc) device driver
 *
 * This driver runs on a guest logical domain and communicates with the virtual
 * disk server (vds) driver running on the service domain which is exporting
 * virtualized "disks" to the guest logical domain.
 *
 * The driver can be divided into four sections:
 *
 * 1) generic device driver housekeeping
 *	_init, _fini, attach, detach, ops structures, etc.
 *
 * 2) communication channel setup
 *	Setup the communications link over the LDC channel that vdc uses to
 *	talk to the vDisk server. Initialise the descriptor ring which
 *	allows the LDC clients to transfer data via memory mappings.
 *
 * 3) Support exported to upper layers (filesystems, etc)
 *	The upper layers call into vdc via strategy(9E) and DKIO(7I)
 *	ioctl calls. vdc will copy the data to be written to the descriptor
 *	ring or map the buffer to store the data read by the vDisk
 *	server into the descriptor ring. It then sends a message to the
 *	vDisk server requesting it to complete the operation.
 *
 * 4) Handling responses from vDisk server.
 *	The vDisk server will ACK some or all of the messages vdc sends to it
 *	(this is configured during the handshake). Upon receipt of an ACK
 *	vdc will check the descriptor ring and signal to the upper layer
 *	code waiting on the IO.
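 *
 * Requests are issued to the vDisk server either asynchronously, as with
 * strategy(9E) where the ACK handler completes the buf(9S) via biodone(9F),
 * or synchronously via vdc_do_sync_op(), which blocks the caller until the
 * corresponding response has been processed.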
 */

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kstat.h>
#include <sys/mach_descrip.h>
#include <sys/modctl.h>
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/vtoc.h>
#include <sys/archsystm.h>
#include <sys/sysmacros.h>

#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/scsi/impl/services.h>
#include <sys/scsi/targets/sddef.h>

#include <sys/ldoms.h>
#include <sys/ldc.h>
#include <sys/vio_common.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vdsk_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdc.h>

#define	VD_OLDVTOC_LIMIT	0x7fffffff

/*
 * function prototypes
 */

/* standard driver functions */
static int	vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
static int	vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int	vdc_strategy(struct buf *buf);
static int	vdc_print(dev_t dev, char *str);
static int	vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
static int	vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
		    cred_t *credp, int *rvalp);
static int	vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
static int	vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);

static int	vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
		    void *arg, void **resultp);
static int	vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int	vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int	vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
		    int mod_flags, char *name, caddr_t valuep, int *lengthp);

/* setup */
static void	vdc_min(struct buf *bufp);
static int	vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
static int	vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_start_ldc_connection(vdc_t *vdc);
static int	vdc_create_device_nodes(vdc_t *vdc);
static int	vdc_create_device_nodes_efi(vdc_t *vdc);
static int	vdc_create_device_nodes_vtoc(vdc_t *vdc);
static void	vdc_create_io_kstats(vdc_t *vdc);
static void	vdc_create_err_kstats(vdc_t *vdc);
static void	vdc_set_err_kstats(vdc_t *vdc);
static int	vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
		    mde_cookie_t *vd_nodep);
static int	vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep);
static void	vdc_fini_ports(vdc_t *vdc);
static void	vdc_switch_server(vdc_t *vdcp);
static int	vdc_do_ldc_up(vdc_t *vdc);
static void	vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_init_descriptor_ring(vdc_t *vdc);
static void	vdc_destroy_descriptor_ring(vdc_t *vdc);
static int	vdc_setup_devid(vdc_t *vdc);
static void	vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *);
static void	vdc_store_label_vtoc(vdc_t *, struct dk_geom *,
		    struct extvtoc *);
static void	vdc_store_label_unk(vdc_t *vdc);
static boolean_t vdc_is_opened(vdc_t *vdc);
static void	vdc_update_size(vdc_t *vdc, size_t, size_t, size_t);

/* handshake with vds */
static int	vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
static int	vdc_ver_negotiation(vdc_t *vdcp);
static int	vdc_init_attr_negotiation(vdc_t *vdc);
static int	vdc_attr_negotiation(vdc_t *vdcp);
static int	vdc_init_dring_negotiate(vdc_t *vdc);
static int	vdc_dring_negotiation(vdc_t *vdcp);
static int	vdc_send_rdx(vdc_t *vdcp);
static int	vdc_rdx_exchange(vdc_t *vdcp);
static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg);

/* processing incoming messages from vDisk server */
static void	vdc_process_msg_thread(vdc_t *vdc);
static int	vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp);

static uint_t	vdc_handle_cb(uint64_t event, caddr_t arg);
static int	vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg);
static int	vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg);
static int	vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
static int	vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
static int	vdc_send_request(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
static int	vdc_populate_descriptor(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
		    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
		    void *cb_arg, vio_desc_direction_t dir, boolean_t);

static int	vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
static int	vdc_drain_response(vdc_t *vdcp, vio_cb_type_t cb_type,
		    struct buf *buf);
static int	vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
static int	vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
static int	vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);

/* dkio */
static int	vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode,
		    int *rvalp);
static int	vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg);
static void	vdc_create_fake_geometry(vdc_t *vdc);
static int	vdc_validate_geometry(vdc_t *vdc);
static void	vdc_validate(vdc_t *vdc);
static void	vdc_validate_task(void *arg);
static int	vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_extvtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_extvtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);

static void	vdc_ownership_update(vdc_t *vdc, int ownership_flags);
static int	vdc_access_set(vdc_t *vdc, uint64_t flags, int mode);
static vdc_io_t	*vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf);
static int	vdc_failfast_check_resv(vdc_t *vdc);

/*
 * Module variables
 */

/*
 * Tunable variables to control how long vdc waits before timing out on
 * various operations
 */
static int	vdc_hshake_retries = 3;

static int	vdc_timeout = 0;	/* units: seconds */
static int	vdc_ldcup_timeout = 1;	/* units: seconds */

static uint64_t vdc_hz_min_ldc_delay;
static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC;
static uint64_t vdc_hz_max_ldc_delay;
static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC;

static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC;
static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC;

/* values for dumping - need to run in a tighter loop */
static uint64_t	vdc_usec_timeout_dump = 100 * MILLISEC;	/* 0.1s units: usec */
static int	vdc_dump_retries = 100;

static uint16_t	vdc_scsi_timeout = 60;	/* 60s units: seconds */

static uint64_t vdc_ownership_delay = 6 * MICROSEC;	/* 6s units: usec */

/* Count of the number of vdc instances attached */
static volatile uint32_t	vdc_instance_count = 0;

/* Tunable to log all SCSI errors */
static boolean_t vdc_scsi_log_error = B_FALSE;

/* Soft state pointer */
static void	*vdc_state;

/*
 * Controlling the verbosity of the error/debug messages
 *
 * vdc_msglevel - controls level of messages
 * vdc_matchinst - 64-bit variable where each bit corresponds
 *                 to the vdc instance to which vdc_msglevel applies.
 */
int		vdc_msglevel = 0x0;
uint64_t	vdc_matchinst = 0ull;

/*
 * Supported vDisk protocol version pairs.
 *
 * The first array entry is the latest and preferred version.
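 *
 * For example, if support for a hypothetical version 1.2 were added, it
 * would be listed as {1, 2} ahead of {1, 1}, so that the newer version is
 * offered to the server first and the older one is kept as a fallback.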
 */
static const vio_ver_t	vdc_version[] = {{1, 1}};

static struct cb_ops vdc_cb_ops = {
	vdc_open,	/* cb_open */
	vdc_close,	/* cb_close */
	vdc_strategy,	/* cb_strategy */
	vdc_print,	/* cb_print */
	vdc_dump,	/* cb_dump */
	vdc_read,	/* cb_read */
	vdc_write,	/* cb_write */
	vdc_ioctl,	/* cb_ioctl */
	nodev,		/* cb_devmap */
	nodev,		/* cb_mmap */
	nodev,		/* cb_segmap */
	nochpoll,	/* cb_chpoll */
	vdc_prop_op,	/* cb_prop_op */
	NULL,		/* cb_str */
	D_MP | D_64BIT,	/* cb_flag */
	CB_REV,		/* cb_rev */
	vdc_aread,	/* cb_aread */
	vdc_awrite	/* cb_awrite */
};

static struct dev_ops vdc_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	vdc_getinfo,	/* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	vdc_attach,	/* devo_attach */
	vdc_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	&vdc_cb_ops,	/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	nulldev,	/* devo_power */
	ddi_quiesce_not_needed,	/* devo_quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"virtual disk client",
	&vdc_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

/* -------------------------------------------------------------------------- */

/*
 * Device Driver housekeeping and setup
 */

int
_init(void)
{
	int	status;

	if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
		return (status);
	if ((status = mod_install(&modlinkage)) != 0)
		ddi_soft_state_fini(&vdc_state);
	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int	status;

	if ((status = mod_remove(&modlinkage)) != 0)
		return (status);
	ddi_soft_state_fini(&vdc_state);
	return (0);
}

static int
vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
	_NOTE(ARGUNUSED(dip))

	int	instance = VDCUNIT((dev_t)arg);
	vdc_t	*vdc = NULL;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
			*resultp = NULL;
			return (DDI_FAILURE);
		}
		*resultp = vdc->dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);
	default:
		*resultp = NULL;
		return (DDI_FAILURE);
	}
}

static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	kt_did_t	failfast_tid, ownership_tid;
	int		instance;
	int		rv;
	vdc_server_t	*srvr;
	vdc_t		*vdc = NULL;

	switch (cmd) {
	case DDI_DETACH:
		/* the real work happens below */
		break;
	case DDI_SUSPEND:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	ASSERT(cmd == DDI_DETACH);
	instance = ddi_get_instance(dip);
	DMSGX(1, "[%d] Entered\n", instance);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * This function is called when vdc is detached or if it has failed to
	 * attach. In that case, the attach may have failed before the vdisk
	 * type has been set, so we can't call vdc_is_opened().
	 * However, as the attach has failed, we know that the vdisk is not
	 * opened and we can safely detach.
	 */
	if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
		DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
		return (DDI_FAILURE);
	}

	if (vdc->dkio_flush_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
		    instance, vdc->dkio_flush_pending);
		return (DDI_FAILURE);
	}

	if (vdc->validate_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding validate request\n",
		    instance, vdc->validate_pending);
		return (DDI_FAILURE);
	}

	DMSG(vdc, 0, "[%d] proceeding...\n", instance);

	/* If we took ownership, release ownership */
	mutex_enter(&vdc->ownership_lock);
	if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
	}
	mutex_exit(&vdc->ownership_lock);

	/* mark instance as detaching */
	vdc->lifecycle	= VDC_LC_DETACHING;

	/*
	 * Try and disable callbacks to prevent another handshake. We have to
	 * disable callbacks for all servers.
	 */
	for (srvr = vdc->server_list; srvr != NULL; srvr = srvr->next) {
		rv = ldc_set_cb_mode(srvr->ldc_handle, LDC_CB_DISABLE);
		DMSG(vdc, 0, "callback disabled (ldc=%lu, rv=%d)\n",
		    srvr->ldc_id, rv);
	}

	if (vdc->initialized & VDC_THREAD) {
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET)) {
			vdc->read_state = VDC_READ_RESET;
			cv_signal(&vdc->read_cv);
		}

		mutex_exit(&vdc->read_lock);

		/* wake up any thread waiting for connection to come online */
		mutex_enter(&vdc->lock);
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			DMSG(vdc, 0,
			    "[%d] write reset - move to resetting state...\n",
			    instance);
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		}
		mutex_exit(&vdc->lock);

		/* now wait until state transitions to VDC_STATE_DETACH */
		thread_join(vdc->msg_proc_thr->t_did);
		ASSERT(vdc->state == VDC_STATE_DETACH);
		DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
		    vdc->instance);
	}

	mutex_enter(&vdc->lock);

	if (vdc->initialized & VDC_DRING)
		vdc_destroy_descriptor_ring(vdc);

	vdc_fini_ports(vdc);

	if (vdc->failfast_thread) {
		failfast_tid = vdc->failfast_thread->t_did;
		vdc->failfast_interval = 0;
		cv_signal(&vdc->failfast_cv);
	} else {
		failfast_tid = 0;
	}

	if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
		ownership_tid = vdc->ownership_thread->t_did;
		vdc->ownership = VDC_OWNERSHIP_NONE;
		cv_signal(&vdc->ownership_cv);
	} else {
		ownership_tid = 0;
	}

	mutex_exit(&vdc->lock);

	if (failfast_tid != 0)
		thread_join(failfast_tid);

	if (ownership_tid != 0)
		thread_join(ownership_tid);

	if (vdc->initialized & VDC_MINOR)
		ddi_remove_minor_node(dip, NULL);

	if (vdc->io_stats) {
		kstat_delete(vdc->io_stats);
		vdc->io_stats = NULL;
	}

	if (vdc->err_stats) {
		kstat_delete(vdc->err_stats);
		vdc->err_stats = NULL;
	}

	if (vdc->initialized & VDC_LOCKS) {
		mutex_destroy(&vdc->lock);
		mutex_destroy(&vdc->read_lock);
		mutex_destroy(&vdc->ownership_lock);
		cv_destroy(&vdc->initwait_cv);
		cv_destroy(&vdc->dring_free_cv);
		cv_destroy(&vdc->membind_cv);
		cv_destroy(&vdc->sync_pending_cv);
		cv_destroy(&vdc->sync_blocked_cv);
		cv_destroy(&vdc->read_cv);
		cv_destroy(&vdc->running_cv);
		cv_destroy(&vdc->ownership_cv);
		cv_destroy(&vdc->failfast_cv);
		cv_destroy(&vdc->failfast_io_cv);
	}

	if (vdc->minfo)
		kmem_free(vdc->minfo, sizeof (struct dk_minfo));

	if (vdc->cinfo)
		kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));

	if (vdc->vtoc)
		kmem_free(vdc->vtoc, sizeof (struct extvtoc));

	if (vdc->geom)
		kmem_free(vdc->geom, sizeof (struct dk_geom));

	if (vdc->devid) {
		ddi_devid_unregister(dip);
		ddi_devid_free(vdc->devid);
	}

	if (vdc->initialized & VDC_SOFT_STATE)
		ddi_soft_state_free(vdc_state, instance);

	DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);

	return (DDI_SUCCESS);
}


static int
vdc_do_attach(dev_info_t *dip)
{
	int		instance;
	vdc_t		*vdc = NULL;
	int		status;
	md_t		*mdp;
	mde_cookie_t	vd_node;

	ASSERT(dip != NULL);

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
		    instance);
		return (DDI_FAILURE);
	}

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * We assign the value to initialized in this case to zero out the
	 * variable and then set bits in it to indicate what has been done
	 */
	vdc->initialized = VDC_SOFT_STATE;

	vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
	vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);

	vdc->dip	= dip;
	vdc->instance	= instance;
	vdc->vdisk_type	= VD_DISK_TYPE_UNK;
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->state	= VDC_STATE_INIT;
	vdc->lifecycle	= VDC_LC_ATTACHING;
	vdc->session_id = 0;
	vdc->block_size = DEV_BSIZE;
	vdc->max_xfer_sz = maxphys / DEV_BSIZE;

	/*
	 * We assume, for now, that the vDisk server will export 'read'
	 * operations to us at a minimum (this is needed because of checks
	 * in vdc for supported operations early in the handshake process).
	 * The vDisk server will return ENOTSUP if this is not the case.
	 * The value will be overwritten during the attribute exchange with
	 * the bitmask of operations exported by the server.
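	 *
	 * vdc_open() relies on this bitmask as well: opening a device with
	 * FWRITE fails with EROFS unless VD_OP_BWRITE is present in
	 * vdc->operations (see the VD_OP_SUPPORTED() check there).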
	 */
	vdc->operations = VD_OP_MASK_READ;

	vdc->vtoc	= NULL;
	vdc->geom	= NULL;
	vdc->cinfo	= NULL;
	vdc->minfo	= NULL;

	mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);

	vdc->threads_pending = 0;
	vdc->sync_op_pending = B_FALSE;
	vdc->sync_op_blocked = B_FALSE;
	cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);

	/* init blocking msg read functionality */
	mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
	vdc->read_state = VDC_READ_IDLE;

	vdc->initialized |= VDC_LOCKS;

	/* get device and port MD node for this disk instance */
	if (vdc_get_md_node(dip, &mdp, &vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Could not get machine description node",
		    instance);
		return (DDI_FAILURE);
	}

	if (vdc_init_ports(vdc, mdp, vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Error initialising ports", instance);
		return (DDI_FAILURE);
	}

	(void) md_fini_handle(mdp);

	/* Create the kstats for saving the I/O statistics used by iostat(1M) */
	vdc_create_io_kstats(vdc);
	vdc_create_err_kstats(vdc);

	/* Initialize remaining structures before starting the msg thread */
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->vtoc = kmem_zalloc(sizeof (struct extvtoc), KM_SLEEP);
	vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
	vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	/* initialize the thread responsible for managing state with server */
	vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
	    vdc, 0, &p0, TS_RUN, minclsyspri);
	if (vdc->msg_proc_thr == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
		    instance);
		return (DDI_FAILURE);
	}

	vdc->initialized |= VDC_THREAD;

	atomic_inc_32(&vdc_instance_count);

	/*
	 * Check the disk label. This will send requests and do the handshake.
	 * We don't really care about the disk label now. What we really need
	 * is the handshake to be done so that we know the type of the disk
	 * (slice or full disk) and the appropriate device nodes can be
	 * created.
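	 *
	 * The return value is deliberately ignored below: a disk with an
	 * unknown or unreadable label can still finish attaching, and its
	 * geometry is validated again when the device is opened.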
	 */

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	mutex_exit(&vdc->lock);

	/*
	 * Now that we have the device info we can create the device nodes
	 */
	status = vdc_create_device_nodes(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device nodes",
		    instance);
		goto return_status;
	}

	/*
	 * Setup devid
	 */
	if (vdc_setup_devid(vdc)) {
		DMSG(vdc, 0, "[%d] No device id available\n", instance);
	}

	/*
	 * Fill in the fields of the error statistics kstat that were not
	 * available when creating the kstat
	 */
	vdc_set_err_kstats(vdc);

	ddi_report_dev(dip);
	vdc->lifecycle	= VDC_LC_ONLINE;
	DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);

return_status:
	DMSG(vdc, 0, "[%d] Attach completed\n", instance);
	return (status);
}

static int
vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	status;

	switch (cmd) {
	case DDI_ATTACH:
		if ((status = vdc_do_attach(dip)) != 0)
			(void) vdc_detach(dip, DDI_DETACH);
		return (status);
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}

static int
vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr)
{
	int		status = 0;
	ldc_status_t	ldc_state;
	ldc_attr_t	ldc_attr;

	ASSERT(vdc != NULL);
	ASSERT(srvr != NULL);

	ldc_attr.devclass = LDC_DEV_BLK;
	ldc_attr.instance = vdc->instance;
	ldc_attr.mode = LDC_MODE_UNRELIABLE;	/* unreliable transport */
	ldc_attr.mtu = VD_LDC_MTU;

	if ((srvr->state & VDC_LDC_INIT) == 0) {
		status = ldc_init(srvr->ldc_id, &ldc_attr,
		    &srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			return (status);
		}
		srvr->state |= VDC_LDC_INIT;
	}
	status = ldc_status(srvr->ldc_handle, &ldc_state);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
		    vdc->instance, status);
		goto init_exit;
	}
	srvr->ldc_state = ldc_state;

	if ((srvr->state & VDC_LDC_CB) == 0) {
		status = ldc_reg_callback(srvr->ldc_handle, vdc_handle_cb,
		    (caddr_t)srvr);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
			    vdc->instance, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_CB;
	}

	/*
	 * At this stage we have initialised LDC; we will now try to open
	 * the connection.
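	 *
	 * ldc_open() is only needed when the channel is still in the
	 * LDC_INIT state; bringing the link the rest of the way up is
	 * handled separately, via vdc_do_ldc_up().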
	 */
	if (srvr->ldc_state == LDC_INIT) {
		status = ldc_open(srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_OPEN;
	}

init_exit:
	if (status) {
		vdc_terminate_ldc(vdc, srvr);
	}

	return (status);
}

static int
vdc_start_ldc_connection(vdc_t *vdc)
{
	int	status = 0;

	ASSERT(vdc != NULL);

	ASSERT(MUTEX_HELD(&vdc->lock));

	status = vdc_do_ldc_up(vdc);

	DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);

	return (status);
}

static int
vdc_stop_ldc_connection(vdc_t *vdcp)
{
	int	status;

	ASSERT(vdcp != NULL);

	ASSERT(MUTEX_HELD(&vdcp->lock));

	DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
	    vdcp->state);

	status = ldc_down(vdcp->curr_server->ldc_handle);
	DMSG(vdcp, 0, "ldc_down() = %d\n", status);

	vdcp->initialized &= ~VDC_HANDSHAKE;
	DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);

	return (status);
}

static void
vdc_create_io_kstats(vdc_t *vdc)
{
	if (vdc->io_stats != NULL) {
		DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance);
		return;
	}

	vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL,
	    "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (vdc->io_stats != NULL) {
		vdc->io_stats->ks_lock = &vdc->lock;
		kstat_install(vdc->io_stats);
	} else {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics"
		    " will not be gathered", vdc->instance);
	}
}

static void
vdc_create_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;
	char	kstatmodule_err[KSTAT_STRLEN];
	char	kstatname[KSTAT_STRLEN];
	int	ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t));
	int	instance = vdc->instance;

	if (vdc->err_stats != NULL) {
		DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance);
		return;
	}

	(void) snprintf(kstatmodule_err, sizeof (kstatmodule_err),
	    "%serr", VDC_DRIVER_NAME);
	(void) snprintf(kstatname, sizeof (kstatname),
	    "%s%d,err", VDC_DRIVER_NAME, instance);

	vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname,
	    "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

	if (vdc->err_stats == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics"
		    " will not be gathered", instance);
		return;
	}

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	kstat_named_init(&stp->vd_softerrs, "Soft Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_transerrs, "Transport Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_protoerrs, "Protocol Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_vid, "Vendor",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_pid, "Product",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_capacity, "Size",
	    KSTAT_DATA_ULONGLONG);

	vdc->err_stats->ks_update = nulldev;

	kstat_install(vdc->err_stats);
}

static void
vdc_set_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;

	if (vdc->err_stats == NULL)
		return;

	mutex_enter(&vdc->lock);

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	ASSERT(stp != NULL);

	stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->block_size;
	(void) strcpy(stp->vd_vid.value.c, "SUN");
	(void) strcpy(stp->vd_pid.value.c, "VDSK");

	mutex_exit(&vdc->lock);
}

static int
vdc_create_device_nodes_efi(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "h");
	ddi_remove_minor_node(vdc->dip, "h,raw");

	if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

static int
vdc_create_device_nodes_vtoc(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "wd");
	ddi_remove_minor_node(vdc->dip, "wd,raw");

	if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices. It is called as part of the attach(9E) of the instance
 *	during the handshake with vds after vds has sent the attributes
 *	to vdc.
 *
 *	If the device is of type VD_DISK_TYPE_SLICE then minor node 2 is
 *	used, in keeping with the Solaris convention that slice 2 refers
 *	to a whole disk. Slices start at 'a'.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	0		- Success
 *	EIO		- Failed to create node
 *	EINVAL		- Unknown type of disk exported
 */
static int
vdc_create_device_nodes(vdc_t *vdc)
{
	char		name[sizeof ("s,raw")];
	dev_info_t	*dip = NULL;
	int		instance, status;
	int		num_slices = 1;
	int		i;

	ASSERT(vdc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		return (EINVAL);
	}

	/*
	 * Minor nodes are different for EFI disks: EFI disks do not have
	 * a minor node 'h' for the minor number corresponding to slice
	 * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
	 * representing the whole disk.
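	 *
	 * For example, a full disk ends up with block nodes 'a' through
	 * 'h' (plus the matching ',raw' character nodes), except that an
	 * EFI-labelled disk gets 'wd' and 'wd,raw' in place of 'h' and
	 * 'h,raw'.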
	 */
	for (i = 0; i < num_slices; i++) {

		if (i == VD_EFI_WD_SLICE) {
			if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
				status = vdc_create_device_nodes_efi(vdc);
			else
				status = vdc_create_device_nodes_vtoc(vdc);
			if (status != 0)
				return (status);
			continue;
		}

		(void) snprintf(name, sizeof (name), "%c", 'a' + i);
		if (ddi_create_minor_node(dip, name, S_IFBLK,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
			    instance, name);
			return (EIO);
		}

		/* if any device node is created we set this flag */
		vdc->initialized |= VDC_MINOR;

		(void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");

		if (ddi_create_minor_node(dip, name, S_IFCHR,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
			    instance, name);
			return (EIO);
		}
	}

	return (0);
}

/*
 * Driver prop_op(9e) entry point function. Return the number of blocks for
 * the partition in question or forward the request to the property facilities.
 */
static int
vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
    char *name, caddr_t valuep, int *lengthp)
{
	int		instance = ddi_get_instance(dip);
	vdc_t		*vdc;
	uint64_t	nblocks;
	uint_t		blksize;

	vdc = ddi_get_soft_state(vdc_state, instance);

	if (dev == DDI_DEV_T_ANY || vdc == NULL) {
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
		mutex_exit(&vdc->lock);
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}
	nblocks = vdc->slice[VDCPART(dev)].nblocks;
	blksize = vdc->block_size;
	mutex_exit(&vdc->lock);

	return (ddi_prop_op_nblocks_blksize(dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, nblocks, blksize));
}

/*
 * Function:
 *	vdc_is_opened
 *
 * Description:
 *	This function checks if any slice of a given virtual disk is
 *	currently opened.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	B_TRUE		- at least one slice is opened.
 *	B_FALSE		- no slice is opened.
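 *
 *	Note: in this file it is called from vdc_detach() without vdc->lock
 *	held, so the result is a snapshot of the open counts rather than a
 *	synchronized check.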
 */
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
	int i, nslices;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		nslices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		nslices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		ASSERT(0);
	}

	/* check if there's any layered open */
	for (i = 0; i < nslices; i++) {
		if (vdc->open_lyr[i] > 0)
			return (B_TRUE);
	}

	/* check if there is any other kind of open */
	for (i = 0; i < OTYPCNT; i++) {
		if (vdc->open[i] != 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}

static int
vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;
	int i;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	/* check if slice is already exclusively opened */
	if (vdc->open_excl & slicemask)
		return (EBUSY);

	/* if open exclusive, check if slice is already opened */
	if (flag & FEXCL) {
		if (vdc->open_lyr[slice] > 0)
			return (EBUSY);
		for (i = 0; i < OTYPCNT; i++) {
			if (vdc->open[i] & slicemask)
				return (EBUSY);
		}
		vdc->open_excl |= slicemask;
	}

	/* mark slice as opened */
	if (otyp == OTYP_LYR) {
		vdc->open_lyr[slice]++;
	} else {
		vdc->open[otyp] |= slicemask;
	}

	return (0);
}

static void
vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	if (otyp == OTYP_LYR) {
		ASSERT(vdc->open_lyr[slice] > 0);
		vdc->open_lyr[slice]--;
	} else {
		vdc->open[otyp] &= ~slicemask;
	}

	if (flag & FEXCL)
		vdc->open_excl &= ~slicemask;
}

static int
vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance, nodelay;
	int	slice, status = 0;
	vdc_t	*vdc;

	ASSERT(dev != NULL);
	instance = VDCUNIT(*dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
	    getminor(*dev), flag, otyp);

	slice = VDCPART(*dev);

	nodelay = flag & (FNDELAY | FNONBLOCK);

	if ((flag & FWRITE) && (!nodelay) &&
	    !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE))) {
		return (EROFS);
	}

	mutex_enter(&vdc->lock);

	status = vdc_mark_opened(vdc, slice, flag, otyp);

	if (status != 0) {
		mutex_exit(&vdc->lock);
		return (status);
	}

	if (nodelay) {

		/* don't resubmit a validate request if there's already one */
		if (vdc->validate_pending > 0) {
			mutex_exit(&vdc->lock);
			return (0);
		}

		/* call vdc_validate() asynchronously to avoid blocking */
		if (taskq_dispatch(system_taskq, vdc_validate_task,
		    (void *)vdc, TQ_NOSLEEP) == NULL) {
			vdc_mark_closed(vdc, slice, flag, otyp);
			mutex_exit(&vdc->lock);
			return (ENXIO);
		}

		vdc->validate_pending++;
		mutex_exit(&vdc->lock);
		return (0);
	}

	mutex_exit(&vdc->lock);

	vdc_validate(vdc);

	mutex_enter(&vdc->lock);

	if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
	    vdc->slice[slice].nblocks == 0) {
		vdc_mark_closed(vdc, slice, flag, otyp);
		status = EIO;
	}

	mutex_exit(&vdc->lock);

	return (status);
}

static int
vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance;
	int	slice;
	int	rv, rval;
	vdc_t	*vdc;

	instance = VDCUNIT(dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);

	slice = VDCPART(dev);

	/*
	 * Attempt to flush the W$ on a close operation. If this is
	 * not a supported IOCTL command or the backing device is read-only,
	 * do not fail the close operation.
	 */
	rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);

	if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
		DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
		    instance, rv);
		return (EIO);
	}

	mutex_enter(&vdc->lock);
	vdc_mark_closed(vdc, slice, flag, otyp);
	mutex_exit(&vdc->lock);

	return (0);
}

static int
vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	_NOTE(ARGUNUSED(credp))

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
}

static int
vdc_print(dev_t dev, char *str)
{
	cmn_err(CE_NOTE, "vdc%d: %s", VDCUNIT(dev), str);
	return (0);
}

static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	int	rv;
	size_t	nbytes = nblk * DEV_BSIZE;
	int	instance = VDCUNIT(dev);
	vdc_t	*vdc = NULL;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
	    instance, nbytes, blkno, (void *)addr);
	rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
	    VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir);
	if (rv) {
		DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
		return (rv);
	}

	if (ddi_in_panic())
		(void) vdc_drain_response(vdc, CB_STRATEGY, NULL);

	DMSG(vdc, 0, "[%d] End\n", instance);

	return (0);
}

/* -------------------------------------------------------------------------- */

/*
 * Disk access routines
 *
 */

/*
 * vdc_strategy()
 *
 * Return Value:
 *	0:	As per strategy(9E), the strategy() function must return 0
 *		[ bioerror(9f) sets b_flags to the proper error code ]
 */
static int
vdc_strategy(struct buf *buf)
{
	int	rv = -1;
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(buf->b_edev);
	int	op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE;
	int	slice;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		bioerror(buf, ENXIO);
		biodone(buf);
		return (0);
	}

	DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
	    instance, (buf->b_flags & B_READ) ? "Read" : "Write",
	    buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr);

	bp_mapin(buf);

	if ((long)buf->b_private == VD_SLICE_NONE) {
		/* I/O using an absolute disk offset */
		slice = VD_SLICE_NONE;
	} else {
		slice = VDCPART(buf->b_edev);
	}

	rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr,
	    buf->b_bcount, slice, buf->b_lblkno,
	    CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir :
	    VIO_write_dir);

	/*
	 * If the request was successfully sent, the strategy call returns and
	 * the ACK handler calls the bioxxx functions when the vDisk server is
	 * done; otherwise we handle the error here.
	 */
	if (rv) {
		DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv);
		bioerror(buf, rv);
		biodone(buf);
	} else if (ddi_in_panic()) {
		rv = vdc_drain_response(vdc, CB_STRATEGY, buf);
		if (rv != 0) {
			bioerror(buf, EIO);
			biodone(buf);
		}
	}

	return (0);
}

/*
 * Function:
 *	vdc_min
 *
 * Description:
 *	Routine to limit the size of a data transfer. Used in
 *	conjunction with physio(9F).
 *
 * Arguments:
 *	bp - pointer to the indicated buf(9S) struct.
 *
 */
static void
vdc_min(struct buf *bufp)
{
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(bufp->b_edev);

	vdc = ddi_get_soft_state(vdc_state, instance);
	VERIFY(vdc != NULL);

	if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) {
		bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size;
	}
}

static int
vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio));
}

static int
vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio));
}

static int
vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio));
}

static int
vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio));
}


/* -------------------------------------------------------------------------- */

/*
 * Handshake support
 */


/*
 * Function:
 *	vdc_init_ver_negotiation()
 *
 * Description:
 *	Send a version negotiation (VIO_VER_INFO) message proposing the
 *	given version to the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	ver	- version to propose to the server.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
{
	vio_ver_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status = -1;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);

	/*
	 * set the Session ID to a unique value
	 * (the lower 32 bits of the clock tick)
	 */
	vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
	DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id);

	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_VER_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	pkt.dev_class = VDEV_DISK;
	pkt.ver_major = ver.major;
	pkt.ver_minor = ver.minor;

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
	    vdc->instance, status);
	if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vio_ver_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_ver_negotiation()
 *
 * Description:
 *	Perform the version negotiation step of the handshake: send the
 *	preferred version and process the server's response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_ver_negotiation(vdc_t *vdcp)
{
	vio_msg_t vio_msg;
	int status;

	if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0]))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Ver negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
}

/*
 * Function:
 *	vdc_init_attr_negotiation()
 *
 * Description:
 *	Send an attribute (VIO_ATTR_INFO) message describing this client's
 *	transfer parameters to the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_attr_negotiation(vdc_t *vdc)
{
	vd_attr_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] entered\n", vdc->instance);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.max_xfer_sz = vdc->max_xfer_sz;
	pkt.vdisk_block_size = vdc->block_size;
	pkt.xfer_mode = VIO_DRING_MODE_V1_0;
	pkt.operations = 0;	/* server will set bits of valid operations */
	pkt.vdisk_type = 0;	/* server will set to valid device type */
	pkt.vdisk_media = 0;	/* server will set to valid media type */
	pkt.vdisk_size = 0;	/* server will set to valid size */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);

	if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vd_attr_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_attr_negotiation()
 *
 * Description:
 *	Perform the attribute exchange step of the handshake: send this
 *	client's attributes and process the server's response.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_attr_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_attr_negotiation(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Attr negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_init_dring_negotiate()
 *
 * Description:
 *	Initialise the descriptor ring and send a registration
 *	(VIO_DRING_REG) message for it to the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_dring_negotiate(vdc_t *vdc)
{
	vio_dring_reg_msg_t	pkt;
	size_t			msglen = sizeof (pkt);
	int			status = -1;
	int			retry;
	int			nretries = 10;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	for (retry = 0; retry < nretries; retry++) {
		status = vdc_init_descriptor_ring(vdc);
		if (status != EAGAIN)
			break;
		drv_usecwait(vdc_min_timeout_ldc);
	}

	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
		    vdc->instance, status);
		return (status);
	}

	DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
	    vdc->instance, status);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_DRING_REG;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.dring_ident = 0;
	pkt.num_descriptors = vdc->dring_len;
	pkt.descriptor_size = vdc->dring_entry_size;
	pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
	pkt.ncookies = vdc->dring_cookie_count;
	pkt.cookie[0] = vdc->dring_cookie[0];	/* for now just one cookie */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
		    vdc->instance, status);
	}

	return (status);
}


/*
 * Function:
 *	vdc_dring_negotiation()
 *
 * Description:
 *	Perform the descriptor ring registration step of the handshake.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_dring_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_dring_negotiate(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Dring negotiation response,"
		    " rv(%d)", vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_dring_reg_msg(vdcp,
	    (vio_dring_reg_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_send_rdx()
 *
 * Description:
 *	Send an RDX message to the vDisk server to indicate that this
 *	client is ready to transfer data.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_send_rdx(vdc_t *vdcp)
{
	vio_msg_t	msg;
	size_t		msglen = sizeof (vio_msg_t);
	int		status;

	/*
	 * Send an RDX message to vds to indicate we are ready
	 * to send data
	 */
	msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	msg.tag.vio_subtype_env = VIO_RDX;
	msg.tag.vio_sid = vdcp->session_id;
	status = vdc_send(vdcp, (caddr_t)&msg, &msglen);
	if (status != 0) {
		DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)",
		    vdcp->instance, status);
	}

	return (status);
}

/*
 * Function:
 *	vdc_handle_rdx()
 *
 * Description:
 *	Process the RDX ACK received from the vDisk server; this completes
 *	the handshake.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	msgp	- received msg
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp)
{
	_NOTE(ARGUNUSED(vdcp))
	_NOTE(ARGUNUSED(msgp))

	ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL);
	ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK);
	ASSERT(msgp->tag.vio_subtype_env == VIO_RDX);

	DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance);

	return (0);
}

/*
 * Function:
 *	vdc_rdx_exchange()
 *
 * Description:
 *	Send the RDX message and wait for the server's ACK, completing
 *	the handshake.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_rdx_exchange(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_send_rdx(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) {
		DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg));
}


/* -------------------------------------------------------------------------- */

/*
 * LDC helper routines
 */

static int
vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp)
{
	int	status;
	uint64_t delay_time;
	size_t	len;

	/*
	 * Until we get a blocking ldc read we have to retry until the entire
	 * LDC message has arrived before ldc_read() will return that message.
	 * If ldc_read() succeeds but returns a zero length message then that
	 * means that the LDC queue is empty and we have to wait for a
	 * notification from the LDC callback which will set the read_state to
	 * VDC_READ_PENDING. Note we also bail out if the channel is reset or
	 * goes away.
	 */
	delay_time = vdc_ldc_read_init_delay;

	for (;;) {

		len = *nbytesp;
		/*
		 * vdc->curr_server is protected by vdc->lock but to avoid
		 * contention we don't take the lock here. We can do this
		 * safely because vdc_recv() is only called from thread
		 * process_msg_thread() which is also the only thread that
		 * can change vdc->curr_server.
		 */
		status = ldc_read(vdc->curr_server->ldc_handle,
		    (caddr_t)msgp, &len);

		if (status == EAGAIN) {
			delay_time *= 2;
			if (delay_time >= vdc_ldc_read_max_delay)
				delay_time = vdc_ldc_read_max_delay;
			delay(delay_time);
			continue;
		}

		if (status != 0) {
			DMSG(vdc, 0, "ldc_read returned %d\n", status);
			break;
		}

		if (len != 0) {
			*nbytesp = len;
			break;
		}

		mutex_enter(&vdc->read_lock);

		while (vdc->read_state != VDC_READ_PENDING) {

			/* detect if the connection has been reset */
			if (vdc->read_state == VDC_READ_RESET) {
				mutex_exit(&vdc->read_lock);
				return (ECONNRESET);
			}

			vdc->read_state = VDC_READ_WAITING;
			cv_wait(&vdc->read_cv, &vdc->read_lock);
		}

		vdc->read_state = VDC_READ_IDLE;
		mutex_exit(&vdc->read_lock);

		delay_time = vdc_ldc_read_init_delay;
	}

	return (status);
}



#ifdef DEBUG
void
vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg)
{
	char *ms, *ss, *ses;
	switch (msg->tag.vio_msgtype) {
#define	Q(_s)	case _s : ms = #_s; break;
	Q(VIO_TYPE_CTRL)
	Q(VIO_TYPE_DATA)
	Q(VIO_TYPE_ERR)
#undef Q
	default: ms = "unknown"; break;
	}

	switch (msg->tag.vio_subtype) {
#define	Q(_s)	case _s : ss = #_s; break;
	Q(VIO_SUBTYPE_INFO)
	Q(VIO_SUBTYPE_ACK)
	Q(VIO_SUBTYPE_NACK)
#undef Q
	default: ss = "unknown"; break;
	}

	switch (msg->tag.vio_subtype_env) {
#define	Q(_s)	case _s : ses = #_s; break;
	Q(VIO_VER_INFO)
	Q(VIO_ATTR_INFO)
	Q(VIO_DRING_REG)
	Q(VIO_DRING_UNREG)
	Q(VIO_RDX)
	Q(VIO_PKT_DATA)
	Q(VIO_DESC_DATA)
	Q(VIO_DRING_DATA)
#undef Q
	default: ses = "unknown"; break;
	}

	DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, ms, ss, ses);
}
#endif

/*
 * Function:
 *	vdc_send()
 *
 * Description:
 *	The function encapsulates the call to write a message using LDC.
 *	If LDC indicates that the call failed due to the queue being full,
 *	we retry the ldc_write(), otherwise we return the error returned
 *	by LDC.
 *
 * Arguments:
 *	vdc    - soft state pointer for this instance of the device driver;
 *		 the message is written to the current server's channel.
 *	pkt    - address of LDC message to be sent
 *	msglen - the size of the message being sent. When the function
 *		 returns, this contains the number of bytes written.
 *
 * Return Code:
 *	0          - Success.
 *	ECONNRESET - The connection was reset while sending.
 *	xxx        - other error codes returned by ldc_write().
 */
static int
vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen)
{
	size_t	size = 0;
	int	status = 0;
	clock_t delay_ticks;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));
	ASSERT(msglen != NULL);
	ASSERT(*msglen != 0);

#ifdef DEBUG
	vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt);
#endif
	/*
	 * Wait indefinitely to send if channel
	 * is busy, but bail out if we succeed or
	 * if the channel closes or is reset.
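	 *
	 * The retry uses a geometric backoff: the n-th retry sleeps for
	 * min(vdc_hz_min_ldc_delay * 2^n, vdc_hz_max_ldc_delay) clock
	 * ticks before calling ldc_write() again.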
2116 */ 2117 delay_ticks = vdc_hz_min_ldc_delay; 2118 do { 2119 size = *msglen; 2120 status = ldc_write(vdc->curr_server->ldc_handle, pkt, &size); 2121 if (status == EWOULDBLOCK) { 2122 delay(delay_ticks); 2123 /* geometric backoff */ 2124 delay_ticks *= 2; 2125 if (delay_ticks > vdc_hz_max_ldc_delay) 2126 delay_ticks = vdc_hz_max_ldc_delay; 2127 } 2128 } while (status == EWOULDBLOCK); 2129 2130 /* if LDC had serious issues --- reset vdc state */ 2131 if (status == EIO || status == ECONNRESET) { 2132 /* LDC had serious issues --- reset vdc state */ 2133 mutex_enter(&vdc->read_lock); 2134 if ((vdc->read_state == VDC_READ_WAITING) || 2135 (vdc->read_state == VDC_READ_RESET)) 2136 cv_signal(&vdc->read_cv); 2137 vdc->read_state = VDC_READ_RESET; 2138 mutex_exit(&vdc->read_lock); 2139 2140 /* wake up any waiters in the reset thread */ 2141 if (vdc->state == VDC_STATE_INIT_WAITING) { 2142 DMSG(vdc, 0, "[%d] write reset - " 2143 "vdc is resetting ..\n", vdc->instance); 2144 vdc->state = VDC_STATE_RESETTING; 2145 cv_signal(&vdc->initwait_cv); 2146 } 2147 2148 return (ECONNRESET); 2149 } 2150 2151 /* return the last size written */ 2152 *msglen = size; 2153 2154 return (status); 2155 } 2156 2157 /* 2158 * Function: 2159 * vdc_get_md_node 2160 * 2161 * Description: 2162 * Get the MD, the device node for the given disk instance. The 2163 * caller is responsible for cleaning up the reference to the 2164 * returned MD (mdpp) by calling md_fini_handle(). 2165 * 2166 * Arguments: 2167 * dip - dev info pointer for this instance of the device driver. 2168 * mdpp - the returned MD. 2169 * vd_nodep - the returned device node. 2170 * 2171 * Return Code: 2172 * 0 - Success. 2173 * ENOENT - Expected node or property did not exist. 2174 * ENXIO - Unexpected error communicating with MD framework 2175 */ 2176 static int 2177 vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep) 2178 { 2179 int status = ENOENT; 2180 char *node_name = NULL; 2181 md_t *mdp = NULL; 2182 int num_nodes; 2183 int num_vdevs; 2184 mde_cookie_t rootnode; 2185 mde_cookie_t *listp = NULL; 2186 boolean_t found_inst = B_FALSE; 2187 int listsz; 2188 int idx; 2189 uint64_t md_inst; 2190 int obp_inst; 2191 int instance = ddi_get_instance(dip); 2192 2193 /* 2194 * Get the OBP instance number for comparison with the MD instance 2195 * 2196 * The "cfg-handle" property of a vdc node in an MD contains the MD's 2197 * notion of "instance", or unique identifier, for that node; OBP 2198 * stores the value of the "cfg-handle" MD property as the value of 2199 * the "reg" property on the node in the device tree it builds from 2200 * the MD and passes to Solaris. Thus, we look up the devinfo node's 2201 * "reg" property value to uniquely identify this device instance. 2202 * If the "reg" property cannot be found, the device tree state is 2203 * presumably so broken that there is no point in continuing. 2204 */ 2205 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) { 2206 cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG); 2207 return (ENOENT); 2208 } 2209 obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2210 OBP_REG, -1); 2211 DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst); 2212 2213 /* 2214 * We now walk the MD nodes to find the node for this vdisk. 
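 * Matching is a two-step filter: md_scan_dag() collects every
 * "virtual-device" node in the MD, and we then keep the disk node
 * whose "cfg-handle" property equals the OBP "reg" value read above.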
2215 */ 2216 if ((mdp = md_get_handle()) == NULL) { 2217 cmn_err(CE_WARN, "unable to init machine description"); 2218 return (ENXIO); 2219 } 2220 2221 num_nodes = md_node_count(mdp); 2222 ASSERT(num_nodes > 0); 2223 2224 listsz = num_nodes * sizeof (mde_cookie_t); 2225 2226 /* allocate memory for nodes */ 2227 listp = kmem_zalloc(listsz, KM_SLEEP); 2228 2229 rootnode = md_root_node(mdp); 2230 ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); 2231 2232 /* 2233 * Search for all the virtual devices, we will then check to see which 2234 * ones are disk nodes. 2235 */ 2236 num_vdevs = md_scan_dag(mdp, rootnode, 2237 md_find_name(mdp, VDC_MD_VDEV_NAME), 2238 md_find_name(mdp, "fwd"), listp); 2239 2240 if (num_vdevs <= 0) { 2241 cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); 2242 status = ENOENT; 2243 goto done; 2244 } 2245 2246 DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs); 2247 for (idx = 0; idx < num_vdevs; idx++) { 2248 status = md_get_prop_str(mdp, listp[idx], "name", &node_name); 2249 if ((status != 0) || (node_name == NULL)) { 2250 cmn_err(CE_NOTE, "Unable to get name of node type '%s'" 2251 ": err %d", VDC_MD_VDEV_NAME, status); 2252 continue; 2253 } 2254 2255 DMSGX(1, "[%d] Found node '%s'\n", instance, node_name); 2256 if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { 2257 status = md_get_prop_val(mdp, listp[idx], 2258 VDC_MD_CFG_HDL, &md_inst); 2259 DMSGX(1, "[%d] vdc inst in MD=%lx\n", 2260 instance, md_inst); 2261 if ((status == 0) && (md_inst == obp_inst)) { 2262 found_inst = B_TRUE; 2263 break; 2264 } 2265 } 2266 } 2267 2268 if (!found_inst) { 2269 DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME); 2270 status = ENOENT; 2271 goto done; 2272 } 2273 DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst); 2274 2275 *vd_nodep = listp[idx]; 2276 *mdpp = mdp; 2277 done: 2278 kmem_free(listp, listsz); 2279 return (status); 2280 } 2281 2282 /* 2283 * Function: 2284 * vdc_init_ports 2285 * 2286 * Description: 2287 * Initialize all the ports for this vdisk instance. 2288 * 2289 * Arguments: 2290 * vdc - soft state pointer for this instance of the device driver. 2291 * mdp - md pointer 2292 * vd_nodep - device md node. 2293 * 2294 * Return Code: 2295 * 0 - Success. 2296 * ENOENT - Expected node or property did not exist. 2297 */ 2298 static int 2299 vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep) 2300 { 2301 int status = 0; 2302 int idx; 2303 int num_nodes; 2304 int num_vports; 2305 int num_chans; 2306 int listsz; 2307 mde_cookie_t vd_port; 2308 mde_cookie_t *chanp = NULL; 2309 mde_cookie_t *portp = NULL; 2310 vdc_server_t *srvr; 2311 vdc_server_t *prev_srvr = NULL; 2312 2313 /* 2314 * We now walk the MD nodes to find the port nodes for this vdisk. 
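 * Each port node found yields a vdc_server_t recording the port id,
 * the optional connection timeout and the LDC id of the port's first
 * channel; the entries are linked onto vdc->server_list and the first
 * one becomes the current server.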
2315 */ 2316 num_nodes = md_node_count(mdp); 2317 ASSERT(num_nodes > 0); 2318 2319 listsz = num_nodes * sizeof (mde_cookie_t); 2320 2321 /* allocate memory for nodes */ 2322 portp = kmem_zalloc(listsz, KM_SLEEP); 2323 chanp = kmem_zalloc(listsz, KM_SLEEP); 2324 2325 num_vports = md_scan_dag(mdp, vd_nodep, 2326 md_find_name(mdp, VDC_MD_PORT_NAME), 2327 md_find_name(mdp, "fwd"), portp); 2328 if (num_vports == 0) { 2329 DMSGX(0, "Found no '%s' node for '%s' port\n", 2330 VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2331 status = ENOENT; 2332 goto done; 2333 } 2334 2335 DMSGX(1, "Found %d '%s' node(s) for '%s' port\n", 2336 num_vports, VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2337 2338 vdc->num_servers = 0; 2339 for (idx = 0; idx < num_vports; idx++) { 2340 2341 /* initialize this port */ 2342 vd_port = portp[idx]; 2343 srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP); 2344 srvr->vdcp = vdc; 2345 2346 /* get port id */ 2347 if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) { 2348 cmn_err(CE_NOTE, "vDisk port '%s' property not found", 2349 VDC_MD_ID); 2350 kmem_free(srvr, sizeof (vdc_server_t)); 2351 continue; 2352 } 2353 2354 /* set the connection timeout */ 2355 if (md_get_prop_val(mdp, vd_port, VDC_MD_TIMEOUT, 2356 &srvr->ctimeout) != 0) { 2357 srvr->ctimeout = 0; 2358 } 2359 2360 /* get the ldc id */ 2361 num_chans = md_scan_dag(mdp, vd_port, 2362 md_find_name(mdp, VDC_MD_CHAN_NAME), 2363 md_find_name(mdp, "fwd"), chanp); 2364 2365 /* expecting at least one channel */ 2366 if (num_chans <= 0) { 2367 cmn_err(CE_NOTE, "No '%s' node for '%s' port", 2368 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); 2369 kmem_free(srvr, sizeof (vdc_server_t)); 2370 continue; 2371 } else if (num_chans != 1) { 2372 DMSGX(0, "Expected 1 '%s' node for '%s' port, " 2373 "found %d\n", VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, 2374 num_chans); 2375 } 2376 2377 /* 2378 * We use the first channel found (index 0), irrespective of how 2379 * many are there in total. 2380 */ 2381 if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, 2382 &srvr->ldc_id) != 0) { 2383 cmn_err(CE_NOTE, "Channel '%s' property not found", 2384 VDC_MD_ID); 2385 kmem_free(srvr, sizeof (vdc_server_t)); 2386 continue; 2387 } 2388 2389 /* 2390 * now initialise LDC channel which will be used to 2391 * communicate with this server 2392 */ 2393 if (vdc_do_ldc_init(vdc, srvr) != 0) { 2394 kmem_free(srvr, sizeof (vdc_server_t)); 2395 continue; 2396 } 2397 2398 /* add server to list */ 2399 if (prev_srvr) 2400 prev_srvr->next = srvr; 2401 else 2402 vdc->server_list = srvr; 2403 2404 prev_srvr = srvr; 2405 2406 /* inc numbers of servers */ 2407 vdc->num_servers++; 2408 } 2409 2410 /* 2411 * Adjust the max number of handshake retries to match 2412 * the number of vdisk servers. 2413 */ 2414 if (vdc_hshake_retries < vdc->num_servers) 2415 vdc_hshake_retries = vdc->num_servers; 2416 2417 /* pick first server as current server */ 2418 if (vdc->server_list != NULL) { 2419 vdc->curr_server = vdc->server_list; 2420 status = 0; 2421 } else { 2422 status = ENOENT; 2423 } 2424 2425 done: 2426 kmem_free(chanp, listsz); 2427 kmem_free(portp, listsz); 2428 return (status); 2429 } 2430 2431 2432 /* 2433 * Function: 2434 * vdc_do_ldc_up 2435 * 2436 * Description: 2437 * Bring the channel for the current server up. 2438 * 2439 * Arguments: 2440 * vdc - soft state pointer for this instance of the device driver. 2441 * 2442 * Return Code: 2443 * 0 - Success. 
2444 * EINVAL - Driver is detaching / LDC error 2445 * ECONNREFUSED - Other end is not listening 2446 */ 2447 static int 2448 vdc_do_ldc_up(vdc_t *vdc) 2449 { 2450 int status; 2451 ldc_status_t ldc_state; 2452 2453 ASSERT(MUTEX_HELD(&vdc->lock)); 2454 2455 DMSG(vdc, 0, "[%d] Bringing up channel %lx\n", 2456 vdc->instance, vdc->curr_server->ldc_id); 2457 2458 if (vdc->lifecycle == VDC_LC_DETACHING) 2459 return (EINVAL); 2460 2461 if ((status = ldc_up(vdc->curr_server->ldc_handle)) != 0) { 2462 switch (status) { 2463 case ECONNREFUSED: /* listener not ready at other end */ 2464 DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n", 2465 vdc->instance, vdc->curr_server->ldc_id, status); 2466 status = 0; 2467 break; 2468 default: 2469 DMSG(vdc, 0, "[%d] Failed to bring up LDC: " 2470 "channel=%ld, err=%d", vdc->instance, 2471 vdc->curr_server->ldc_id, status); 2472 break; 2473 } 2474 } 2475 2476 if (ldc_status(vdc->curr_server->ldc_handle, &ldc_state) == 0) { 2477 vdc->curr_server->ldc_state = ldc_state; 2478 if (ldc_state == LDC_UP) { 2479 DMSG(vdc, 0, "[%d] LDC channel already up\n", 2480 vdc->instance); 2481 vdc->seq_num = 1; 2482 vdc->seq_num_reply = 0; 2483 } 2484 } 2485 2486 return (status); 2487 } 2488 2489 /* 2490 * Function: 2491 * vdc_terminate_ldc() 2492 * 2493 * Description: 2494 * 2495 * Arguments: 2496 * vdc - soft state pointer for this instance of the device driver. 2497 * srvr - vdc per-server info structure 2498 * 2499 * Return Code: 2500 * None 2501 */ 2502 static void 2503 vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr) 2504 { 2505 int instance = ddi_get_instance(vdc->dip); 2506 2507 if (srvr->state & VDC_LDC_OPEN) { 2508 DMSG(vdc, 0, "[%d] ldc_close()\n", instance); 2509 (void) ldc_close(srvr->ldc_handle); 2510 } 2511 if (srvr->state & VDC_LDC_CB) { 2512 DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); 2513 (void) ldc_unreg_callback(srvr->ldc_handle); 2514 } 2515 if (srvr->state & VDC_LDC_INIT) { 2516 DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); 2517 (void) ldc_fini(srvr->ldc_handle); 2518 srvr->ldc_handle = NULL; 2519 } 2520 2521 srvr->state &= ~(VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN); 2522 } 2523 2524 /* 2525 * Function: 2526 * vdc_fini_ports() 2527 * 2528 * Description: 2529 * Finalize all ports by closing the channel associated with each 2530 * port and also freeing the server structure. 2531 * 2532 * Arguments: 2533 * vdc - soft state pointer for this instance of the device driver. 2534 * 2535 * Return Code: 2536 * None 2537 */ 2538 static void 2539 vdc_fini_ports(vdc_t *vdc) 2540 { 2541 int instance = ddi_get_instance(vdc->dip); 2542 vdc_server_t *srvr, *prev_srvr; 2543 2544 ASSERT(vdc != NULL); 2545 ASSERT(mutex_owned(&vdc->lock)); 2546 2547 DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized); 2548 2549 srvr = vdc->server_list; 2550 2551 while (srvr) { 2552 2553 vdc_terminate_ldc(vdc, srvr); 2554 2555 /* next server */ 2556 prev_srvr = srvr; 2557 srvr = srvr->next; 2558 2559 /* free server */ 2560 kmem_free(prev_srvr, sizeof (vdc_server_t)); 2561 } 2562 2563 vdc->server_list = NULL; 2564 } 2565 2566 /* -------------------------------------------------------------------------- */ 2567 2568 /* 2569 * Descriptor Ring helper routines 2570 */ 2571 2572 /* 2573 * Function: 2574 * vdc_init_descriptor_ring() 2575 * 2576 * Description: 2577 * 2578 * Arguments: 2579 * vdc - soft state pointer for this instance of the device driver. 
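 *
 * Notes:
 *	Each descriptor ring entry reserves room for
 *	    max(maxphys, max_xfer_sz * block_size) / PAGESIZE
 *	memory cookies so that a single entry can always map the largest
 *	possible request without splitting it across entries.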
2580 * 2581 * Return Code: 2582 * 0 - Success 2583 */ 2584 static int 2585 vdc_init_descriptor_ring(vdc_t *vdc) 2586 { 2587 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2588 int status = 0; 2589 int i; 2590 2591 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2592 2593 ASSERT(vdc != NULL); 2594 ASSERT(mutex_owned(&vdc->lock)); 2595 2596 /* ensure we have enough room to store max sized block */ 2597 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2598 2599 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2600 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2601 /* 2602 * Calculate the maximum block size we can transmit using one 2603 * Descriptor Ring entry from the attributes returned by the 2604 * vDisk server. This is subject to a minimum of 'maxphys' 2605 * as we do not have the capability to split requests over 2606 * multiple DRing entries. 2607 */ 2608 if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) { 2609 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2610 vdc->instance); 2611 vdc->dring_max_cookies = maxphys / PAGESIZE; 2612 } else { 2613 vdc->dring_max_cookies = 2614 (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE; 2615 } 2616 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2617 (sizeof (ldc_mem_cookie_t) * 2618 (vdc->dring_max_cookies - 1))); 2619 vdc->dring_len = VD_DRING_LEN; 2620 2621 status = ldc_mem_dring_create(vdc->dring_len, 2622 vdc->dring_entry_size, &vdc->dring_hdl); 2623 if ((vdc->dring_hdl == NULL) || (status != 0)) { 2624 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2625 vdc->instance); 2626 return (status); 2627 } 2628 vdc->initialized |= VDC_DRING_INIT; 2629 } 2630 2631 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2632 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2633 vdc->dring_cookie = 2634 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2635 2636 status = ldc_mem_dring_bind(vdc->curr_server->ldc_handle, 2637 vdc->dring_hdl, 2638 LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2639 &vdc->dring_cookie[0], 2640 &vdc->dring_cookie_count); 2641 if (status != 0) { 2642 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2643 "(%lx) to channel (%lx) status=%d\n", 2644 vdc->instance, vdc->dring_hdl, 2645 vdc->curr_server->ldc_handle, status); 2646 return (status); 2647 } 2648 ASSERT(vdc->dring_cookie_count == 1); 2649 vdc->initialized |= VDC_DRING_BOUND; 2650 } 2651 2652 status = ldc_mem_dring_info(vdc->dring_hdl, &vdc->dring_mem_info); 2653 if (status != 0) { 2654 DMSG(vdc, 0, 2655 "[%d] Failed to get info for descriptor ring (%lx)\n", 2656 vdc->instance, vdc->dring_hdl); 2657 return (status); 2658 } 2659 2660 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2661 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2662 2663 /* Allocate the local copy of this dring */ 2664 vdc->local_dring = 2665 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 2666 KM_SLEEP); 2667 vdc->initialized |= VDC_DRING_LOCAL; 2668 } 2669 2670 /* 2671 * Mark all DRing entries as free and initialize the private 2672 * descriptor's memory handles. If any entry is initialized, 2673 * we need to free it later so we set the bit in 'initialized' 2674 * at the start. 
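	 * (vdc_destroy_descriptor_ring() keys off these 'initialized'
	 * bits to undo exactly the setup steps that completed.)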
2675 */ 2676 vdc->initialized |= VDC_DRING_ENTRY; 2677 for (i = 0; i < vdc->dring_len; i++) { 2678 dep = VDC_GET_DRING_ENTRY_PTR(vdc, i); 2679 dep->hdr.dstate = VIO_DESC_FREE; 2680 2681 status = ldc_mem_alloc_handle(vdc->curr_server->ldc_handle, 2682 &vdc->local_dring[i].desc_mhdl); 2683 if (status != 0) { 2684 DMSG(vdc, 0, "![%d] Failed to alloc mem handle for" 2685 " descriptor %d", vdc->instance, i); 2686 return (status); 2687 } 2688 vdc->local_dring[i].is_free = B_TRUE; 2689 vdc->local_dring[i].dep = dep; 2690 } 2691 2692 /* Initialize the starting index */ 2693 vdc->dring_curr_idx = 0; 2694 2695 return (status); 2696 } 2697 2698 /* 2699 * Function: 2700 * vdc_destroy_descriptor_ring() 2701 * 2702 * Description: 2703 * 2704 * Arguments: 2705 * vdc - soft state pointer for this instance of the device driver. 2706 * 2707 * Return Code: 2708 * None 2709 */ 2710 static void 2711 vdc_destroy_descriptor_ring(vdc_t *vdc) 2712 { 2713 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 2714 ldc_mem_handle_t mhdl = NULL; 2715 ldc_mem_info_t minfo; 2716 int status = -1; 2717 int i; /* loop */ 2718 2719 ASSERT(vdc != NULL); 2720 ASSERT(mutex_owned(&vdc->lock)); 2721 2722 DMSG(vdc, 0, "[%d] Entered\n", vdc->instance); 2723 2724 if (vdc->initialized & VDC_DRING_ENTRY) { 2725 DMSG(vdc, 0, 2726 "[%d] Removing Local DRing entries\n", vdc->instance); 2727 for (i = 0; i < vdc->dring_len; i++) { 2728 ldep = &vdc->local_dring[i]; 2729 mhdl = ldep->desc_mhdl; 2730 2731 if (mhdl == NULL) 2732 continue; 2733 2734 if ((status = ldc_mem_info(mhdl, &minfo)) != 0) { 2735 DMSG(vdc, 0, 2736 "ldc_mem_info returned an error: %d\n", 2737 status); 2738 2739 /* 2740 * This must mean that the mem handle 2741 * is not valid. Clear it out so that 2742 * no one tries to use it. 2743 */ 2744 ldep->desc_mhdl = NULL; 2745 continue; 2746 } 2747 2748 if (minfo.status == LDC_BOUND) { 2749 (void) ldc_mem_unbind_handle(mhdl); 2750 } 2751 2752 (void) ldc_mem_free_handle(mhdl); 2753 2754 ldep->desc_mhdl = NULL; 2755 } 2756 vdc->initialized &= ~VDC_DRING_ENTRY; 2757 } 2758 2759 if (vdc->initialized & VDC_DRING_LOCAL) { 2760 DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance); 2761 kmem_free(vdc->local_dring, 2762 vdc->dring_len * sizeof (vdc_local_desc_t)); 2763 vdc->initialized &= ~VDC_DRING_LOCAL; 2764 } 2765 2766 if (vdc->initialized & VDC_DRING_BOUND) { 2767 DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance); 2768 status = ldc_mem_dring_unbind(vdc->dring_hdl); 2769 if (status == 0) { 2770 vdc->initialized &= ~VDC_DRING_BOUND; 2771 } else { 2772 DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx", 2773 vdc->instance, status, vdc->dring_hdl); 2774 } 2775 kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t)); 2776 } 2777 2778 if (vdc->initialized & VDC_DRING_INIT) { 2779 DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance); 2780 status = ldc_mem_dring_destroy(vdc->dring_hdl); 2781 if (status == 0) { 2782 vdc->dring_hdl = NULL; 2783 bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t)); 2784 vdc->initialized &= ~VDC_DRING_INIT; 2785 } else { 2786 DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)", 2787 vdc->instance, status, vdc->dring_hdl); 2788 } 2789 } 2790 } 2791 2792 /* 2793 * Function: 2794 * vdc_map_to_shared_dring() 2795 * 2796 * Description: 2797 * Copy contents of the local descriptor to the shared 2798 * memory descriptor. 2799 * 2800 * Arguments: 2801 * vdcp - soft state pointer for this instance of the device driver. 
2802 * idx - descriptor ring index 2803 * 2804 * Return Code: 2805 * None 2806 */ 2807 static int 2808 vdc_map_to_shared_dring(vdc_t *vdcp, int idx) 2809 { 2810 vdc_local_desc_t *ldep; 2811 vd_dring_entry_t *dep; 2812 int rv; 2813 2814 ldep = &(vdcp->local_dring[idx]); 2815 2816 /* for now leave in the old pop_mem_hdl stuff */ 2817 if (ldep->nbytes > 0) { 2818 rv = vdc_populate_mem_hdl(vdcp, ldep); 2819 if (rv) { 2820 DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n", 2821 vdcp->instance); 2822 return (rv); 2823 } 2824 } 2825 2826 /* 2827 * fill in the data details into the DRing 2828 */ 2829 dep = ldep->dep; 2830 ASSERT(dep != NULL); 2831 2832 dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp); 2833 dep->payload.operation = ldep->operation; 2834 dep->payload.addr = ldep->offset; 2835 dep->payload.nbytes = ldep->nbytes; 2836 dep->payload.status = (uint32_t)-1; /* vds will set valid value */ 2837 dep->payload.slice = ldep->slice; 2838 dep->hdr.dstate = VIO_DESC_READY; 2839 dep->hdr.ack = 1; /* request an ACK for every message */ 2840 2841 return (0); 2842 } 2843 2844 /* 2845 * Function: 2846 * vdc_send_request 2847 * 2848 * Description: 2849 * This routine writes the data to be transmitted to vds into the 2850 * descriptor, notifies vds that the ring has been updated and 2851 * then waits for the request to be processed. 2852 * 2853 * Arguments: 2854 * vdcp - the soft state pointer 2855 * operation - operation we want vds to perform (VD_OP_XXX) 2856 * addr - address of data buf to be read/written. 2857 * nbytes - number of bytes to read/write 2858 * slice - the disk slice this request is for 2859 * offset - relative disk offset 2860 * cb_type - type of call - STRATEGY or SYNC 2861 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2862 * . mode for ioctl(9e) 2863 * . LP64 diskaddr_t (block I/O) 2864 * dir - direction of operation (READ/WRITE/BOTH) 2865 * 2866 * Return Codes: 2867 * 0 2868 * ENXIO 2869 */ 2870 static int 2871 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr, 2872 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2873 void *cb_arg, vio_desc_direction_t dir) 2874 { 2875 int rv = 0; 2876 2877 ASSERT(vdcp != NULL); 2878 ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR); 2879 2880 mutex_enter(&vdcp->lock); 2881 2882 /* 2883 * If this is a block read/write operation we update the I/O statistics 2884 * to indicate that the request is being put on the waitq to be 2885 * serviced. 2886 * 2887 * We do it here (a common routine for both synchronous and strategy 2888 * calls) for performance reasons - we are already holding vdc->lock 2889 * so there is no extra locking overhead. We would have to explicitly 2890 * grab the 'lock' mutex to update the stats if we were to do this 2891 * higher up the stack in vdc_strategy() et. al. 2892 */ 2893 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2894 DTRACE_IO1(start, buf_t *, cb_arg); 2895 VD_KSTAT_WAITQ_ENTER(vdcp); 2896 } 2897 2898 do { 2899 while (vdcp->state != VDC_STATE_RUNNING) { 2900 2901 /* return error if detaching */ 2902 if (vdcp->state == VDC_STATE_DETACH) { 2903 rv = ENXIO; 2904 goto done; 2905 } 2906 2907 /* fail request if connection timeout is reached */ 2908 if (vdcp->ctimeout_reached) { 2909 rv = EIO; 2910 goto done; 2911 } 2912 2913 /* 2914 * If we are panicking and the disk is not ready then 2915 * we can't send any request because we can't complete 2916 * the handshake now. 
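			 * (Requests already in flight when the panic
			 * started are completed by polling instead; see
			 * vdc_drain_response().)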
2917 */ 2918 if (ddi_in_panic()) { 2919 rv = EIO; 2920 goto done; 2921 } 2922 2923 cv_wait(&vdcp->running_cv, &vdcp->lock); 2924 } 2925 2926 } while (vdc_populate_descriptor(vdcp, operation, addr, 2927 nbytes, slice, offset, cb_type, cb_arg, dir)); 2928 2929 done: 2930 /* 2931 * If this is a block read/write we update the I/O statistics kstat 2932 * to indicate that this request has been placed on the queue for 2933 * processing (i.e sent to the vDisk server) - iostat(1M) will 2934 * report the time waiting for the vDisk server under the %b column 2935 * In the case of an error we simply take it off the wait queue. 2936 */ 2937 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2938 if (rv == 0) { 2939 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 2940 DTRACE_PROBE1(send, buf_t *, cb_arg); 2941 } else { 2942 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 2943 VD_KSTAT_WAITQ_EXIT(vdcp); 2944 DTRACE_IO1(done, buf_t *, cb_arg); 2945 } 2946 } 2947 2948 mutex_exit(&vdcp->lock); 2949 2950 return (rv); 2951 } 2952 2953 2954 /* 2955 * Function: 2956 * vdc_populate_descriptor 2957 * 2958 * Description: 2959 * This routine writes the data to be transmitted to vds into the 2960 * descriptor, notifies vds that the ring has been updated and 2961 * then waits for the request to be processed. 2962 * 2963 * Arguments: 2964 * vdcp - the soft state pointer 2965 * operation - operation we want vds to perform (VD_OP_XXX) 2966 * addr - address of data buf to be read/written. 2967 * nbytes - number of bytes to read/write 2968 * slice - the disk slice this request is for 2969 * offset - relative disk offset 2970 * cb_type - type of call - STRATEGY or SYNC 2971 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2972 * . mode for ioctl(9e) 2973 * . LP64 diskaddr_t (block I/O) 2974 * dir - direction of operation (READ/WRITE/BOTH) 2975 * 2976 * Return Codes: 2977 * 0 2978 * EAGAIN 2979 * ECONNRESET 2980 * ENXIO 2981 */ 2982 static int 2983 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 2984 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2985 void *cb_arg, vio_desc_direction_t dir) 2986 { 2987 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 2988 int idx; /* Index of DRing entry used */ 2989 int next_idx; 2990 vio_dring_msg_t dmsg; 2991 size_t msglen; 2992 int rv; 2993 2994 ASSERT(MUTEX_HELD(&vdcp->lock)); 2995 vdcp->threads_pending++; 2996 loop: 2997 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 2998 2999 /* Get next available D-Ring entry */ 3000 idx = vdcp->dring_curr_idx; 3001 local_dep = &(vdcp->local_dring[idx]); 3002 3003 if (!local_dep->is_free) { 3004 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 3005 vdcp->instance); 3006 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 3007 if (vdcp->state == VDC_STATE_RUNNING || 3008 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3009 goto loop; 3010 } 3011 vdcp->threads_pending--; 3012 return (ECONNRESET); 3013 } 3014 3015 next_idx = idx + 1; 3016 if (next_idx >= vdcp->dring_len) 3017 next_idx = 0; 3018 vdcp->dring_curr_idx = next_idx; 3019 3020 ASSERT(local_dep->is_free); 3021 3022 local_dep->operation = operation; 3023 local_dep->addr = addr; 3024 local_dep->nbytes = nbytes; 3025 local_dep->slice = slice; 3026 local_dep->offset = offset; 3027 local_dep->cb_type = cb_type; 3028 local_dep->cb_arg = cb_arg; 3029 local_dep->dir = dir; 3030 3031 local_dep->is_free = B_FALSE; 3032 3033 rv = vdc_map_to_shared_dring(vdcp, idx); 3034 if (rv) { 3035 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 
3036 vdcp->instance); 3037 /* free the descriptor */ 3038 local_dep->is_free = B_TRUE; 3039 vdcp->dring_curr_idx = idx; 3040 cv_wait(&vdcp->membind_cv, &vdcp->lock); 3041 if (vdcp->state == VDC_STATE_RUNNING || 3042 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3043 goto loop; 3044 } 3045 vdcp->threads_pending--; 3046 return (ECONNRESET); 3047 } 3048 3049 /* 3050 * Send a msg with the DRing details to vds 3051 */ 3052 VIO_INIT_DRING_DATA_TAG(dmsg); 3053 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 3054 dmsg.dring_ident = vdcp->dring_ident; 3055 dmsg.start_idx = idx; 3056 dmsg.end_idx = idx; 3057 vdcp->seq_num++; 3058 3059 DTRACE_PROBE2(populate, int, vdcp->instance, 3060 vdc_local_desc_t *, local_dep); 3061 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 3062 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 3063 3064 /* 3065 * note we're still holding the lock here to 3066 * make sure the message goes out in order !!!... 3067 */ 3068 msglen = sizeof (dmsg); 3069 rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen); 3070 switch (rv) { 3071 case ECONNRESET: 3072 /* 3073 * vdc_send initiates the reset on failure. 3074 * Since the transaction has already been put 3075 * on the local dring, it will automatically get 3076 * retried when the channel is reset. Given that, 3077 * it is ok to just return success even though the 3078 * send failed. 3079 */ 3080 rv = 0; 3081 break; 3082 3083 case 0: /* EOK */ 3084 DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv); 3085 break; 3086 3087 default: 3088 goto cleanup_and_exit; 3089 } 3090 3091 vdcp->threads_pending--; 3092 return (rv); 3093 3094 cleanup_and_exit: 3095 DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv); 3096 return (ENXIO); 3097 } 3098 3099 /* 3100 * Function: 3101 * vdc_do_sync_op 3102 * 3103 * Description: 3104 * Wrapper around vdc_populate_descriptor that blocks until the 3105 * response to the message is available. 3106 * 3107 * Arguments: 3108 * vdcp - the soft state pointer 3109 * operation - operation we want vds to perform (VD_OP_XXX) 3110 * addr - address of data buf to be read/written. 3111 * nbytes - number of bytes to read/write 3112 * slice - the disk slice this request is for 3113 * offset - relative disk offset 3114 * cb_type - type of call - STRATEGY or SYNC 3115 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3116 * . mode for ioctl(9e) 3117 * . LP64 diskaddr_t (block I/O) 3118 * dir - direction of operation (READ/WRITE/BOTH) 3119 * rconflict - check for reservation conflict in case of failure 3120 * 3121 * rconflict should be set to B_TRUE by most callers. Callers invoking the 3122 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 3123 * result of a successful operation with vd_scsi_status(). 3124 * 3125 * Return Codes: 3126 * 0 3127 * EAGAIN 3128 * EFAULT 3129 * ENXIO 3130 * EIO 3131 */ 3132 static int 3133 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 3134 int slice, diskaddr_t offset, int cb_type, void *cb_arg, 3135 vio_desc_direction_t dir, boolean_t rconflict) 3136 { 3137 int status; 3138 vdc_io_t *vio; 3139 boolean_t check_resv_conflict = B_FALSE; 3140 3141 ASSERT(cb_type == CB_SYNC); 3142 3143 /* 3144 * Grab the lock, if blocked wait until the server 3145 * response causes us to wake up again. 
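	 *
	 * In outline, sync operations serialize as follows:
	 *
	 *	sync_op_blocked - gates entry; one sync op runs at a time
	 *	sync_op_pending - set while our request is outstanding
	 *	sync_blocked_cv - signalled to admit the next waiting thread
	 *	sync_pending_cv - signalled when our response has arrived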
	 */
	mutex_enter(&vdcp->lock);
	vdcp->sync_op_cnt++;
	while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH) {
		if (ddi_in_panic()) {
			/* don't block if we are panicking */
			vdcp->sync_op_cnt--;
			mutex_exit(&vdcp->lock);
			return (EIO);
		} else {
			cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock);
		}
	}

	if (vdcp->state == VDC_STATE_DETACH) {
		cv_broadcast(&vdcp->sync_blocked_cv);
		vdcp->sync_op_cnt--;
		mutex_exit(&vdcp->lock);
		return (ENXIO);
	}

	/* now block any other threads entering after us */
	vdcp->sync_op_blocked = B_TRUE;
	vdcp->sync_op_pending = B_TRUE;
	mutex_exit(&vdcp->lock);

	status = vdc_send_request(vdcp, operation, addr,
	    nbytes, slice, offset, cb_type, cb_arg, dir);

	mutex_enter(&vdcp->lock);

	if (status != 0) {
		vdcp->sync_op_pending = B_FALSE;
	} else if (ddi_in_panic()) {
		if (vdc_drain_response(vdcp, CB_SYNC, NULL) == 0) {
			status = vdcp->sync_op_status;
		} else {
			vdcp->sync_op_pending = B_FALSE;
			status = EIO;
		}
	} else {
		/*
		 * Block until our transaction completes; anyone else
		 * waiting then gets to go next.
		 */
		while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH)
			cv_wait(&vdcp->sync_pending_cv, &vdcp->lock);

		DMSG(vdcp, 2, ": operation returned %d\n",
		    vdcp->sync_op_status);
		if (vdcp->state == VDC_STATE_DETACH) {
			vdcp->sync_op_pending = B_FALSE;
			status = ENXIO;
		} else {
			status = vdcp->sync_op_status;
			if (status != 0 && vdcp->failfast_interval != 0) {
				/*
				 * Operation has failed and failfast is enabled.
				 * We need to check if the failure is due to a
				 * reservation conflict if this was requested.
				 */
				check_resv_conflict = rconflict;
			}
		}
	}

	vdcp->sync_op_status = 0;
	vdcp->sync_op_blocked = B_FALSE;
	vdcp->sync_op_cnt--;

	/* signal the next waiting thread */
	cv_signal(&vdcp->sync_blocked_cv);

	/*
	 * We have to check for reservation conflict after unblocking sync
	 * operations because some sync operations will be used to do this
	 * check.
	 */
	if (check_resv_conflict) {
		vio = vdc_failfast_io_queue(vdcp, NULL);
		while (vio->vio_qtime != 0)
			cv_wait(&vdcp->failfast_io_cv, &vdcp->lock);
		kmem_free(vio, sizeof (vdc_io_t));
	}

	mutex_exit(&vdcp->lock);

	return (status);
}


/*
 * Function:
 *	vdc_drain_response()
 *
 * Description:
 *	When a guest is panicking, the completion of requests needs to be
 *	handled differently because interrupts are disabled and vdc
 *	will not get messages. We have to poll for the messages instead.
 *
 *	Note: since we are panicking we don't implement the io:::done
 *	DTrace probe or update the I/O statistics kstats.
 *
 * Arguments:
 *	vdc     - soft state pointer for this instance of the device driver.
 *	cb_type - the type of request we want to drain. If type is CB_SYNC
 *	          then we drain all responses until we find a CB_SYNC request.
 *	          If the type is CB_STRATEGY then the behavior depends on the
 *	          value of the buf argument.
 *	buf     - if the cb_type argument is CB_SYNC then the buf argument
 *	          must be NULL.
If the cb_type argument is CB_STRATEGY and 3258 * if buf is NULL then we drain all responses, otherwise we 3259 * poll until we receive a ACK/NACK for the specific I/O 3260 * described by buf. 3261 * 3262 * Return Code: 3263 * 0 - Success. If we were expecting a response to a particular 3264 * CB_SYNC or CB_STRATEGY request then this means that a 3265 * response has been received. 3266 */ 3267 static int 3268 vdc_drain_response(vdc_t *vdc, vio_cb_type_t cb_type, struct buf *buf) 3269 { 3270 int rv, idx, retries; 3271 size_t msglen; 3272 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3273 vio_dring_msg_t dmsg; 3274 struct buf *mbuf; 3275 boolean_t ack; 3276 3277 ASSERT(cb_type == CB_STRATEGY || cb_type == CB_SYNC); 3278 3279 mutex_enter(&vdc->lock); 3280 3281 retries = 0; 3282 for (;;) { 3283 msglen = sizeof (dmsg); 3284 rv = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)&dmsg, 3285 &msglen); 3286 if (rv) { 3287 rv = EINVAL; 3288 break; 3289 } 3290 3291 /* 3292 * if there are no packets wait and check again 3293 */ 3294 if ((rv == 0) && (msglen == 0)) { 3295 if (retries++ > vdc_dump_retries) { 3296 rv = EAGAIN; 3297 break; 3298 } 3299 3300 drv_usecwait(vdc_usec_timeout_dump); 3301 continue; 3302 } 3303 3304 /* 3305 * Ignore all messages that are not ACKs/NACKs to 3306 * DRing requests. 3307 */ 3308 if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) || 3309 (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) { 3310 DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n", 3311 dmsg.tag.vio_msgtype, 3312 dmsg.tag.vio_subtype, 3313 dmsg.tag.vio_subtype_env); 3314 continue; 3315 } 3316 3317 /* 3318 * Record if the packet was ACK'ed or not. If the packet was not 3319 * ACK'ed then we will just mark the request as failed; we don't 3320 * want to reset the connection at this point. 3321 */ 3322 switch (dmsg.tag.vio_subtype) { 3323 case VIO_SUBTYPE_ACK: 3324 ack = B_TRUE; 3325 break; 3326 case VIO_SUBTYPE_NACK: 3327 ack = B_FALSE; 3328 break; 3329 default: 3330 continue; 3331 } 3332 3333 idx = dmsg.start_idx; 3334 if (idx >= vdc->dring_len) { 3335 DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n", 3336 vdc->instance, idx); 3337 continue; 3338 } 3339 ldep = &vdc->local_dring[idx]; 3340 if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { 3341 DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n", 3342 vdc->instance, idx, ldep->dep->hdr.dstate); 3343 continue; 3344 } 3345 3346 switch (ldep->cb_type) { 3347 3348 case CB_STRATEGY: 3349 mbuf = ldep->cb_arg; 3350 if (mbuf != NULL) { 3351 mbuf->b_resid = mbuf->b_bcount - 3352 ldep->dep->payload.nbytes; 3353 bioerror(mbuf, 3354 ack ? ldep->dep->payload.status : EIO); 3355 biodone(mbuf); 3356 } 3357 rv = vdc_depopulate_descriptor(vdc, idx); 3358 if (buf != NULL && buf == mbuf) { 3359 rv = 0; 3360 goto done; 3361 } 3362 break; 3363 3364 case CB_SYNC: 3365 rv = vdc_depopulate_descriptor(vdc, idx); 3366 vdc->sync_op_status = ack ? rv : EIO; 3367 vdc->sync_op_pending = B_FALSE; 3368 cv_signal(&vdc->sync_pending_cv); 3369 if (cb_type == CB_SYNC) { 3370 rv = 0; 3371 goto done; 3372 } 3373 break; 3374 } 3375 3376 /* if this is the last descriptor - break out of loop */ 3377 if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) { 3378 /* 3379 * If we were expecting a response for a particular 3380 * request then we return with an error otherwise we 3381 * have successfully completed the drain. 3382 */ 3383 rv = (buf != NULL || cb_type == CB_SYNC)? 
ESRCH: 0; 3384 break; 3385 } 3386 } 3387 3388 done: 3389 mutex_exit(&vdc->lock); 3390 DMSG(vdc, 0, "End idx=%d\n", idx); 3391 3392 return (rv); 3393 } 3394 3395 3396 /* 3397 * Function: 3398 * vdc_depopulate_descriptor() 3399 * 3400 * Description: 3401 * 3402 * Arguments: 3403 * vdc - soft state pointer for this instance of the device driver. 3404 * idx - Index of the Descriptor Ring entry being modified 3405 * 3406 * Return Code: 3407 * 0 - Success 3408 */ 3409 static int 3410 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) 3411 { 3412 vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ 3413 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3414 int status = ENXIO; 3415 int rv = 0; 3416 3417 ASSERT(vdc != NULL); 3418 ASSERT(idx < vdc->dring_len); 3419 ldep = &vdc->local_dring[idx]; 3420 ASSERT(ldep != NULL); 3421 ASSERT(MUTEX_HELD(&vdc->lock)); 3422 3423 DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep); 3424 DMSG(vdc, 2, ": idx = %d\n", idx); 3425 3426 dep = ldep->dep; 3427 ASSERT(dep != NULL); 3428 ASSERT((dep->hdr.dstate == VIO_DESC_DONE) || 3429 (dep->payload.status == ECANCELED)); 3430 3431 VDC_MARK_DRING_ENTRY_FREE(vdc, idx); 3432 3433 ldep->is_free = B_TRUE; 3434 status = dep->payload.status; 3435 DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status); 3436 3437 /* 3438 * If no buffers were used to transfer information to the server when 3439 * populating the descriptor then no memory handles need to be unbound 3440 * and we can return now. 3441 */ 3442 if (ldep->nbytes == 0) { 3443 cv_signal(&vdc->dring_free_cv); 3444 return (status); 3445 } 3446 3447 /* 3448 * If the upper layer passed in a misaligned address we copied the 3449 * data into an aligned buffer before sending it to LDC - we now 3450 * copy it back to the original buffer. 3451 */ 3452 if (ldep->align_addr) { 3453 ASSERT(ldep->addr != NULL); 3454 3455 if (dep->payload.nbytes > 0) 3456 bcopy(ldep->align_addr, ldep->addr, 3457 dep->payload.nbytes); 3458 kmem_free(ldep->align_addr, 3459 sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8)); 3460 ldep->align_addr = NULL; 3461 } 3462 3463 rv = ldc_mem_unbind_handle(ldep->desc_mhdl); 3464 if (rv != 0) { 3465 DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)", 3466 vdc->instance, ldep->desc_mhdl, idx, rv); 3467 /* 3468 * The error returned by the vDisk server is more informative 3469 * and thus has a higher priority but if it isn't set we ensure 3470 * that this function returns an error. 3471 */ 3472 if (status == 0) 3473 status = EINVAL; 3474 } 3475 3476 cv_signal(&vdc->membind_cv); 3477 cv_signal(&vdc->dring_free_cv); 3478 3479 return (status); 3480 } 3481 3482 /* 3483 * Function: 3484 * vdc_populate_mem_hdl() 3485 * 3486 * Description: 3487 * 3488 * Arguments: 3489 * vdc - soft state pointer for this instance of the device driver. 
 *	ldep - pointer to the local descriptor ring entry; its addr,
 *	       nbytes, dir and operation fields describe the buffer
 *	       being mapped in.
 *
 * Return Code:
 *	0 - Success
 */
static int
vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep)
{
	vd_dring_entry_t	*dep = NULL;
	ldc_mem_handle_t	mhdl;
	caddr_t			vaddr;
	size_t			nbytes;
	uint8_t			perm = LDC_MEM_RW;
	uint8_t			maptype;
	int			rv = 0;
	int			i;

	ASSERT(vdcp != NULL);

	dep = ldep->dep;
	mhdl = ldep->desc_mhdl;

	switch (ldep->dir) {
	case VIO_read_dir:
		perm = LDC_MEM_W;
		break;

	case VIO_write_dir:
		perm = LDC_MEM_R;
		break;

	case VIO_both_dir:
		perm = LDC_MEM_RW;
		break;

	default:
		ASSERT(0);	/* catch bad programming in vdc */
	}

	/*
	 * LDC expects any addresses passed in to be 8-byte aligned. We need
	 * to copy the contents of any misaligned buffers to a newly allocated
	 * buffer and bind it instead (and copy the contents back to the
	 * original buffer passed in when depopulating the descriptor)
	 */
	vaddr = ldep->addr;
	nbytes = ldep->nbytes;
	if (((uint64_t)vaddr & 0x7) != 0) {
		ASSERT(ldep->align_addr == NULL);
		ldep->align_addr =
		    kmem_alloc(sizeof (caddr_t) *
		    P2ROUNDUP(nbytes, 8), KM_SLEEP);
		DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating "
		    "(buf=%p nb=%ld op=%d)\n",
		    vdcp->instance, (void *)vaddr, (void *)ldep->align_addr,
		    nbytes, ldep->operation);
		if (perm != LDC_MEM_W)
			bcopy(vaddr, ldep->align_addr, nbytes);
		vaddr = ldep->align_addr;
	}

	maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP;
	rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
	    maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies);
	DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n",
	    vdcp->instance, dep->payload.ncookies);
	if (rv != 0) {
		DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle "
		    "(mhdl=%p, buf=%p, err=%d)\n",
		    vdcp->instance, (void *)mhdl, (void *)vaddr, rv);
		if (ldep->align_addr) {
			kmem_free(ldep->align_addr,
			    sizeof (caddr_t) * P2ROUNDUP(nbytes, 8));
			ldep->align_addr = NULL;
		}
		return (EAGAIN);
	}

	/*
	 * Get the other cookies (if any).
	 */
	for (i = 1; i < dep->payload.ncookies; i++) {
		rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]);
		if (rv != 0) {
			(void) ldc_mem_unbind_handle(mhdl);
			DMSG(vdcp, 0, "?[%d] Failed to get next cookie "
			    "(mhdl=%lx cnum=%d), err=%d",
			    vdcp->instance, mhdl, i, rv);
			if (ldep->align_addr) {
				/* free size must match the allocation above */
				kmem_free(ldep->align_addr,
				    sizeof (caddr_t) *
				    P2ROUNDUP(ldep->nbytes, 8));
				ldep->align_addr = NULL;
			}
			return (EAGAIN);
		}
	}

	return (rv);
}

/*
 * Interrupt handlers for messages from LDC
 */

/*
 * Function:
 *	vdc_handle_cb()
 *
 * Description:
 *	Callback invoked by the LDC framework whenever an event (up,
 *	read, reset or down) occurs on one of this instance's channels.
 *
 * Arguments:
 *	event - Type of event (LDC_EVT_xxx) that triggered the callback
 *	arg   - the vdc_server_t for the channel on which the event
 *	        occurred.
 *
 * Return Code:
 *	LDC_SUCCESS - Success
 */
static uint_t
vdc_handle_cb(uint64_t event, caddr_t arg)
{
	ldc_status_t ldc_state;
	int rv = 0;
	vdc_server_t *srvr = (vdc_server_t *)(void *)arg;
	vdc_t	*vdc = srvr->vdcp;

	ASSERT(vdc != NULL);

	DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num);

	/* If callback is not for the current server, ignore it */
	mutex_enter(&vdc->lock);

	if (vdc->curr_server != srvr) {
		DMSG(vdc, 0, "[%d] Ignoring event 0x%lx for port@%ld\n",
		    vdc->instance, event, srvr->id);
		mutex_exit(&vdc->lock);
		return (LDC_SUCCESS);
	}

	/*
	 * Depending on the type of event that triggered this callback,
	 * we modify the handshake state or read the data.
	 *
	 * NOTE: not done as a switch() as event could be triggered by
	 * a state change and a read request. Also the ordering of the
	 * check for the event types is deliberate.
	 */
	if (event & LDC_EVT_UP) {
		DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance);

		/* get LDC state */
		rv = ldc_status(srvr->ldc_handle, &ldc_state);
		if (rv != 0) {
			DMSG(vdc, 0, "[%d] Couldn't get LDC status %d",
			    vdc->instance, rv);
			mutex_exit(&vdc->lock);
			return (LDC_SUCCESS);
		}
		if (srvr->ldc_state != LDC_UP &&
		    ldc_state == LDC_UP) {
			/*
			 * Reset the transaction sequence numbers when
			 * LDC comes up. We then kick off the handshake
			 * negotiation with the vDisk server.
			 */
			vdc->seq_num = 1;
			vdc->seq_num_reply = 0;
			srvr->ldc_state = ldc_state;
			cv_signal(&vdc->initwait_cv);
		}
	}

	if (event & LDC_EVT_READ) {
		DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance);
		mutex_enter(&vdc->read_lock);
		cv_signal(&vdc->read_cv);
		vdc->read_state = VDC_READ_PENDING;
		mutex_exit(&vdc->read_lock);
		mutex_exit(&vdc->lock);

		/* that's all we have to do - no need to handle DOWN/RESET */
		return (LDC_SUCCESS);
	}

	if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) {

		DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance);

		/*
		 * Need to wake up any readers so they will
		 * detect that a reset has occurred.
		 */
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET))
			cv_signal(&vdc->read_cv);
		vdc->read_state = VDC_READ_RESET;
		mutex_exit(&vdc->read_lock);

		/* wake up any threads waiting for connection to come up */
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		}

	}

	mutex_exit(&vdc->lock);

	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ))
		DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received",
		    vdc->instance, event);

	return (LDC_SUCCESS);
}

/*
 * Function:
 *	vdc_wait_for_response()
 *
 * Description:
 *	Block waiting for a response from the server. If there is no data
 *	available, the thread blocks on read_cv, which is signalled by the
 *	LDC callback when an LDC_EVT_READ event occurs.
 *
 * Arguments:
 *	vdcp - soft state pointer for this instance of the device driver.
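 *	msgp - address of the buffer in which the received message is
 *	       returned to the caller.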
3720 * 3721 * Return Code: 3722 * 0 - Success 3723 */ 3724 static int 3725 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp) 3726 { 3727 size_t nbytes = sizeof (*msgp); 3728 int status; 3729 3730 ASSERT(vdcp != NULL); 3731 3732 DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance); 3733 3734 status = vdc_recv(vdcp, msgp, &nbytes); 3735 DMSG(vdcp, 3, "vdc_read() done.. status=0x%x size=0x%x\n", 3736 status, (int)nbytes); 3737 if (status) { 3738 DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n", 3739 vdcp->instance, status); 3740 return (status); 3741 } 3742 3743 if (nbytes < sizeof (vio_msg_tag_t)) { 3744 DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n", 3745 vdcp->instance, sizeof (vio_msg_tag_t), nbytes); 3746 return (ENOMSG); 3747 } 3748 3749 DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance, 3750 msgp->tag.vio_msgtype, 3751 msgp->tag.vio_subtype, 3752 msgp->tag.vio_subtype_env); 3753 3754 /* 3755 * Verify the Session ID of the message 3756 * 3757 * Every message after the Version has been negotiated should 3758 * have the correct session ID set. 3759 */ 3760 if ((msgp->tag.vio_sid != vdcp->session_id) && 3761 (msgp->tag.vio_subtype_env != VIO_VER_INFO)) { 3762 DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, " 3763 "expected 0x%lx [seq num %lx @ %d]", 3764 vdcp->instance, msgp->tag.vio_sid, 3765 vdcp->session_id, 3766 ((vio_dring_msg_t *)msgp)->seq_num, 3767 ((vio_dring_msg_t *)msgp)->start_idx); 3768 return (ENOMSG); 3769 } 3770 return (0); 3771 } 3772 3773 3774 /* 3775 * Function: 3776 * vdc_resubmit_backup_dring() 3777 * 3778 * Description: 3779 * Resubmit each descriptor in the backed up dring to 3780 * vDisk server. The Dring was backed up during connection 3781 * reset. 3782 * 3783 * Arguments: 3784 * vdcp - soft state pointer for this instance of the device driver. 3785 * 3786 * Return Code: 3787 * 0 - Success 3788 */ 3789 static int 3790 vdc_resubmit_backup_dring(vdc_t *vdcp) 3791 { 3792 int processed = 0; 3793 int count; 3794 int b_idx; 3795 int rv = 0; 3796 int dring_size; 3797 int op; 3798 vio_msg_t vio_msg; 3799 vdc_local_desc_t *curr_ldep; 3800 3801 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3802 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3803 3804 if (vdcp->local_dring_backup == NULL) { 3805 /* the pending requests have already been processed */ 3806 return (0); 3807 } 3808 3809 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3810 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3811 3812 /* 3813 * Walk the backup copy of the local descriptor ring and 3814 * resubmit all the outstanding transactions. 3815 */ 3816 b_idx = vdcp->local_dring_backup_tail; 3817 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3818 3819 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3820 3821 /* only resubmit outstanding transactions */ 3822 if (!curr_ldep->is_free) { 3823 /* 3824 * If we are retrying a block read/write operation we 3825 * need to update the I/O statistics to indicate that 3826 * the request is being put back on the waitq to be 3827 * serviced (it will have been taken off after the 3828 * error was reported). 
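			 * The waitq/runq kstat transitions below mirror
			 * the ones made for the original submission in
			 * vdc_send_request().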
3829 */ 3830 mutex_enter(&vdcp->lock); 3831 op = curr_ldep->operation; 3832 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3833 DTRACE_IO1(start, buf_t *, curr_ldep->cb_arg); 3834 VD_KSTAT_WAITQ_ENTER(vdcp); 3835 } 3836 3837 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3838 rv = vdc_populate_descriptor(vdcp, op, 3839 curr_ldep->addr, curr_ldep->nbytes, 3840 curr_ldep->slice, curr_ldep->offset, 3841 curr_ldep->cb_type, curr_ldep->cb_arg, 3842 curr_ldep->dir); 3843 3844 if (rv) { 3845 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3846 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3847 VD_KSTAT_WAITQ_EXIT(vdcp); 3848 DTRACE_IO1(done, buf_t *, 3849 curr_ldep->cb_arg); 3850 } 3851 DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", 3852 vdcp->instance, b_idx); 3853 mutex_exit(&vdcp->lock); 3854 goto done; 3855 } 3856 3857 /* 3858 * If this is a block read/write we update the I/O 3859 * statistics kstat to indicate that the request 3860 * has been sent back to the vDisk server and should 3861 * now be put on the run queue. 3862 */ 3863 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3864 DTRACE_PROBE1(send, buf_t *, curr_ldep->cb_arg); 3865 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 3866 } 3867 mutex_exit(&vdcp->lock); 3868 3869 /* Wait for the response message. */ 3870 DMSG(vdcp, 1, "waiting for response to idx=%x\n", 3871 b_idx); 3872 rv = vdc_wait_for_response(vdcp, &vio_msg); 3873 if (rv) { 3874 /* 3875 * If this is a block read/write we update 3876 * the I/O statistics kstat to take it 3877 * off the run queue. 3878 */ 3879 mutex_enter(&vdcp->lock); 3880 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3881 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3882 VD_KSTAT_RUNQ_EXIT(vdcp); 3883 DTRACE_IO1(done, buf_t *, 3884 curr_ldep->cb_arg); 3885 } 3886 DMSG(vdcp, 1, "[%d] wait_for_response " 3887 "returned err=%d\n", vdcp->instance, 3888 rv); 3889 mutex_exit(&vdcp->lock); 3890 goto done; 3891 } 3892 3893 DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx); 3894 rv = vdc_process_data_msg(vdcp, &vio_msg); 3895 if (rv) { 3896 DMSG(vdcp, 1, "[%d] process_data_msg " 3897 "returned err=%d\n", vdcp->instance, 3898 rv); 3899 goto done; 3900 } 3901 /* 3902 * Mark this entry as free so that we will not resubmit 3903 * this "done" request again, if we were to use the same 3904 * backup_dring again in future. This could happen when 3905 * a reset happens while processing the backup_dring. 3906 */ 3907 curr_ldep->is_free = B_TRUE; 3908 processed++; 3909 } 3910 3911 /* get the next element to submit */ 3912 if (++b_idx >= vdcp->local_dring_backup_len) 3913 b_idx = 0; 3914 } 3915 3916 /* all done - now clear up pending dring copy */ 3917 dring_size = vdcp->local_dring_backup_len * 3918 sizeof (vdcp->local_dring_backup[0]); 3919 3920 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3921 3922 vdcp->local_dring_backup = NULL; 3923 3924 done: 3925 DTRACE_PROBE2(processed, int, processed, vdc_t *, vdcp); 3926 3927 return (rv); 3928 } 3929 3930 /* 3931 * Function: 3932 * vdc_cancel_backup_dring 3933 * 3934 * Description: 3935 * Cancel each descriptor in the backed up dring to vDisk server. 3936 * The Dring was backed up during connection reset. 3937 * 3938 * Arguments: 3939 * vdcp - soft state pointer for this instance of the device driver. 
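 *
 * Notes:
 *	The caller must hold vdcp->lock. Waiting CB_SYNC callers are
 *	woken with sync_op_status set to EIO, and CB_STRATEGY buffers
 *	are failed with bioerror(bufp, EIO) and completed via biodone().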
3940 * 3941 * Return Code: 3942 * None 3943 */ 3944 void 3945 vdc_cancel_backup_dring(vdc_t *vdcp) 3946 { 3947 vdc_local_desc_t *ldep; 3948 struct buf *bufp; 3949 int count; 3950 int b_idx; 3951 int dring_size; 3952 int cancelled = 0; 3953 3954 ASSERT(MUTEX_HELD(&vdcp->lock)); 3955 ASSERT(vdcp->state == VDC_STATE_INIT || 3956 vdcp->state == VDC_STATE_INIT_WAITING || 3957 vdcp->state == VDC_STATE_NEGOTIATE || 3958 vdcp->state == VDC_STATE_RESETTING); 3959 3960 if (vdcp->local_dring_backup == NULL) { 3961 /* the pending requests have already been processed */ 3962 return; 3963 } 3964 3965 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n", 3966 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3967 3968 /* 3969 * Walk the backup copy of the local descriptor ring and 3970 * cancel all the outstanding transactions. 3971 */ 3972 b_idx = vdcp->local_dring_backup_tail; 3973 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3974 3975 ldep = &(vdcp->local_dring_backup[b_idx]); 3976 3977 /* only cancel outstanding transactions */ 3978 if (!ldep->is_free) { 3979 3980 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx); 3981 cancelled++; 3982 3983 /* 3984 * All requests have already been cleared from the 3985 * local descriptor ring and the LDC channel has been 3986 * reset so we will never get any reply for these 3987 * requests. Now we just have to notify threads waiting 3988 * for replies that the request has failed. 3989 */ 3990 switch (ldep->cb_type) { 3991 case CB_SYNC: 3992 ASSERT(vdcp->sync_op_pending); 3993 vdcp->sync_op_status = EIO; 3994 vdcp->sync_op_pending = B_FALSE; 3995 cv_signal(&vdcp->sync_pending_cv); 3996 break; 3997 3998 case CB_STRATEGY: 3999 bufp = ldep->cb_arg; 4000 ASSERT(bufp != NULL); 4001 bufp->b_resid = bufp->b_bcount; 4002 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4003 VD_KSTAT_RUNQ_EXIT(vdcp); 4004 DTRACE_IO1(done, buf_t *, bufp); 4005 bioerror(bufp, EIO); 4006 biodone(bufp); 4007 break; 4008 4009 default: 4010 ASSERT(0); 4011 } 4012 4013 } 4014 4015 /* get the next element to cancel */ 4016 if (++b_idx >= vdcp->local_dring_backup_len) 4017 b_idx = 0; 4018 } 4019 4020 /* all done - now clear up pending dring copy */ 4021 dring_size = vdcp->local_dring_backup_len * 4022 sizeof (vdcp->local_dring_backup[0]); 4023 4024 (void) kmem_free(vdcp->local_dring_backup, dring_size); 4025 4026 vdcp->local_dring_backup = NULL; 4027 4028 DTRACE_PROBE2(cancelled, int, cancelled, vdc_t *, vdcp); 4029 } 4030 4031 /* 4032 * Function: 4033 * vdc_connection_timeout 4034 * 4035 * Description: 4036 * This function is invoked if the timeout set to establish the connection 4037 * with vds expires. This will happen if we spend too much time in the 4038 * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. Then we will 4039 * cancel any pending request and mark them as failed. 4040 * 4041 * If the timeout does not expire, it will be cancelled when we reach the 4042 * VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can 4043 * be invoked while we are in the VDC_STATE_HANDLE_PENDING or 4044 * VDC_STATE_RESETTING state in which case we do nothing because the 4045 * timeout is being cancelled. 4046 * 4047 * Arguments: 4048 * arg - argument of the timeout function actually a soft state 4049 * pointer for the instance of the device driver. 
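 *
 * Notes:
 *	The timeout is armed from the VDC_STATE_INIT state of
 *	vdc_process_msg_thread() (see the 'ctimeout' handling there); it
 *	only fires if the connection could not be established within the
 *	configured delay.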
4050 * 4051 * Return Code: 4052 * None 4053 */ 4054 void 4055 vdc_connection_timeout(void *arg) 4056 { 4057 vdc_t *vdcp = (vdc_t *)arg; 4058 4059 mutex_enter(&vdcp->lock); 4060 4061 if (vdcp->state == VDC_STATE_HANDLE_PENDING || 4062 vdcp->state == VDC_STATE_DETACH) { 4063 /* 4064 * The connection has just been re-established or 4065 * we are detaching. 4066 */ 4067 vdcp->ctimeout_reached = B_FALSE; 4068 mutex_exit(&vdcp->lock); 4069 return; 4070 } 4071 4072 vdcp->ctimeout_reached = B_TRUE; 4073 4074 /* notify requests waiting for sending */ 4075 cv_broadcast(&vdcp->running_cv); 4076 4077 /* cancel requests waiting for a result */ 4078 vdc_cancel_backup_dring(vdcp); 4079 4080 mutex_exit(&vdcp->lock); 4081 4082 cmn_err(CE_NOTE, "[%d] connection to service domain timeout", 4083 vdcp->instance); 4084 } 4085 4086 /* 4087 * Function: 4088 * vdc_backup_local_dring() 4089 * 4090 * Description: 4091 * Back up the current dring in the event of a reset. The dring 4092 * transactions will be resubmitted to the server when the 4093 * connection is restored. 4094 * 4095 * Arguments: 4096 * vdcp - soft state pointer for this instance of the device driver. 4097 * 4098 * Return Code: 4099 * None 4100 */ 4101 static void 4102 vdc_backup_local_dring(vdc_t *vdcp) 4103 { 4104 int dring_size; 4105 4106 ASSERT(MUTEX_HELD(&vdcp->lock)); 4107 ASSERT(vdcp->state == VDC_STATE_RESETTING); 4108 4109 /* 4110 * If the backup dring is still around, it means 4111 * that the last restore did not complete. However, 4112 * since we never got back into the running state, 4113 * the backup copy we have is still valid. 4114 */ 4115 if (vdcp->local_dring_backup != NULL) { 4116 DMSG(vdcp, 1, "reusing local descriptor ring backup " 4117 "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len, 4118 vdcp->local_dring_backup_tail); 4119 return; 4120 } 4121 4122 /* 4123 * The backup dring can be NULL and the local dring may not be 4124 * initialized. This can happen if we had a reset while establishing 4125 * a new connection but after the connection has timed out. In that 4126 * case the backup dring is NULL because the pending requests have 4127 * already been cancelled, and the local dring is uninitialized because 4128 * the reset occurred before it was set up. 4129 */ 4130 if (!(vdcp->initialized & VDC_DRING_LOCAL)) 4131 return; 4132 4133 DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, " 4134 "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx); 4135 4136 dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]); 4137 4138 vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP); 4139 bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size); 4140 4141 vdcp->local_dring_backup_tail = vdcp->dring_curr_idx; 4142 vdcp->local_dring_backup_len = vdcp->dring_len; 4143 } 4144 4145 static void 4146 vdc_switch_server(vdc_t *vdcp) 4147 { 4148 int rv; 4149 vdc_server_t *curr_server, *new_server; 4150 4151 ASSERT(MUTEX_HELD(&vdcp->lock)); 4152 4153 /* if there is only one server, return */ 4154 if (vdcp->num_servers == 1) { 4155 return; 4156 } 4157 4158 /* Get current and next server */ 4159 curr_server = vdcp->curr_server; 4160 new_server = 4161 (curr_server->next) ?
curr_server->next : vdcp->server_list; 4162 ASSERT(curr_server != new_server); 4163 4164 /* bring current server's channel down */ 4165 rv = ldc_down(curr_server->ldc_handle); 4166 if (rv) { 4167 DMSG(vdcp, 0, "[%d] Cannot bring channel down, port %ld\n", 4168 vdcp->instance, curr_server->id); 4169 return; 4170 } 4171 4172 /* switch the server */ 4173 vdcp->curr_server = new_server; 4174 4175 DMSG(vdcp, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n", 4176 vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id); 4177 } 4178 4179 /* -------------------------------------------------------------------------- */ 4180 4181 /* 4182 * The following functions process the incoming messages from vds 4183 */ 4184 4185 /* 4186 * Function: 4187 * vdc_process_msg_thread() 4188 * 4189 * Description: 4190 * 4191 * Main VDC message processing thread. Each vDisk instance runs its 4192 * own copy of this thread. The thread drives all the handshakes and 4193 * data exchanges with the server, and it also handles all channel 4194 * resets. 4195 * 4196 * Arguments: 4197 * vdcp - soft state pointer for this instance of the device driver. 4198 * 4199 * Return Code: 4200 * None 4201 */ 4202 static void 4203 vdc_process_msg_thread(vdc_t *vdcp) 4204 { 4205 int status; 4206 int ctimeout; 4207 timeout_id_t tmid = 0; 4208 clock_t ldcup_timeout = 0; 4209 4210 mutex_enter(&vdcp->lock); 4211 4212 for (;;) { 4213 4214 #define Q(_s) (vdcp->state == _s) ? #_s : 4215 DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state, 4216 Q(VDC_STATE_INIT) 4217 Q(VDC_STATE_INIT_WAITING) 4218 Q(VDC_STATE_NEGOTIATE) 4219 Q(VDC_STATE_HANDLE_PENDING) 4220 Q(VDC_STATE_RUNNING) 4221 Q(VDC_STATE_RESETTING) 4222 Q(VDC_STATE_DETACH) 4223 "UNKNOWN"); 4224 4225 switch (vdcp->state) { 4226 case VDC_STATE_INIT: 4227 4228 /* 4229 * If requested, start a timeout to check if the 4230 * connection with vds is established in the 4231 * specified delay. If the timeout expires, we 4232 * will cancel any pending request. 4233 * 4234 * If a reset has occurred while establishing 4235 * the connection, we already have a timeout armed 4236 * and in that case we don't need to arm a new one. 4237 * 4238 * The same rule applies when there are multiple vds 4239 * servers. If either a connection cannot be established 4240 * or the handshake times out, the connection thread will 4241 * try another server. The 'ctimeout' will report 4242 * back an error after it expires irrespective of 4243 * whether the vdisk is trying to connect to just 4244 * one or multiple servers. 4245 */ 4246 ctimeout = (vdc_timeout != 0)?
4247 vdc_timeout : vdcp->curr_server->ctimeout; 4248 4249 if (ctimeout != 0 && tmid == 0) { 4250 tmid = timeout(vdc_connection_timeout, vdcp, 4251 ctimeout * drv_usectohz(MICROSEC)); 4252 } 4253 4254 /* Check if we are re-initializing repeatedly */ 4255 if (vdcp->hshake_cnt > vdc_hshake_retries && 4256 vdcp->lifecycle != VDC_LC_ONLINE) { 4257 4258 DMSG(vdcp, 0, "[%d] too many handshakes,cnt=%d", 4259 vdcp->instance, vdcp->hshake_cnt); 4260 cmn_err(CE_NOTE, "[%d] disk access failed.\n", 4261 vdcp->instance); 4262 vdcp->state = VDC_STATE_DETACH; 4263 break; 4264 } 4265 4266 /* Switch to STATE_DETACH if drv is detaching */ 4267 if (vdcp->lifecycle == VDC_LC_DETACHING) { 4268 vdcp->state = VDC_STATE_DETACH; 4269 break; 4270 } 4271 4272 /* Switch server */ 4273 if (vdcp->hshake_cnt > 0) 4274 vdc_switch_server(vdcp); 4275 vdcp->hshake_cnt++; 4276 4277 /* Bring up connection with vds via LDC */ 4278 status = vdc_start_ldc_connection(vdcp); 4279 if (status != EINVAL) { 4280 vdcp->state = VDC_STATE_INIT_WAITING; 4281 } 4282 break; 4283 4284 case VDC_STATE_INIT_WAITING: 4285 4286 /* if channel is UP, start negotiation */ 4287 if (vdcp->curr_server->ldc_state == LDC_UP) { 4288 vdcp->state = VDC_STATE_NEGOTIATE; 4289 break; 4290 } 4291 4292 /* check if only one server exists */ 4293 if (vdcp->num_servers == 1) { 4294 cv_wait(&vdcp->initwait_cv, &vdcp->lock); 4295 } else { 4296 /* 4297 * wait for LDC_UP, if it times out, switch 4298 * to another server. 4299 */ 4300 ldcup_timeout = ddi_get_lbolt() + 4301 (vdc_ldcup_timeout * 4302 drv_usectohz(MICROSEC)); 4303 status = cv_timedwait(&vdcp->initwait_cv, 4304 &vdcp->lock, ldcup_timeout); 4305 if (status == -1 && 4306 vdcp->state == VDC_STATE_INIT_WAITING && 4307 vdcp->curr_server->ldc_state != LDC_UP) { 4308 /* timed out & still waiting */ 4309 vdcp->state = VDC_STATE_INIT; 4310 break; 4311 } 4312 } 4313 4314 if (vdcp->state != VDC_STATE_INIT_WAITING) { 4315 DMSG(vdcp, 0, 4316 "state moved to %d out from under us...\n", 4317 vdcp->state); 4318 } 4319 break; 4320 4321 case VDC_STATE_NEGOTIATE: 4322 switch (status = vdc_ver_negotiation(vdcp)) { 4323 case 0: 4324 break; 4325 default: 4326 DMSG(vdcp, 0, "ver negotiate failed (%d)..\n", 4327 status); 4328 goto reset; 4329 } 4330 4331 switch (status = vdc_attr_negotiation(vdcp)) { 4332 case 0: 4333 break; 4334 default: 4335 DMSG(vdcp, 0, "attr negotiate failed (%d)..\n", 4336 status); 4337 goto reset; 4338 } 4339 4340 switch (status = vdc_dring_negotiation(vdcp)) { 4341 case 0: 4342 break; 4343 default: 4344 DMSG(vdcp, 0, "dring negotiate failed (%d)..\n", 4345 status); 4346 goto reset; 4347 } 4348 4349 switch (status = vdc_rdx_exchange(vdcp)) { 4350 case 0: 4351 vdcp->state = VDC_STATE_HANDLE_PENDING; 4352 goto done; 4353 default: 4354 DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n", 4355 status); 4356 goto reset; 4357 } 4358 reset: 4359 DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n", 4360 status); 4361 vdcp->state = VDC_STATE_RESETTING; 4362 vdcp->self_reset = B_TRUE; 4363 done: 4364 DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", 4365 vdcp->state); 4366 break; 4367 4368 case VDC_STATE_HANDLE_PENDING: 4369 4370 if (vdcp->ctimeout_reached) { 4371 /* 4372 * The connection timeout had been reached so 4373 * pending requests have been cancelled. Now 4374 * that the connection is back we can reset 4375 * the timeout. 
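 * Note that the connection timeout has already fired at this point
 * (that is how ctimeout_reached got set), so there is no callout left
 * to cancel; we only need to clear 'tmid'.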
4376 */ 4377 ASSERT(vdcp->local_dring_backup == NULL); 4378 ASSERT(tmid != 0); 4379 tmid = 0; 4380 vdcp->ctimeout_reached = B_FALSE; 4381 vdcp->state = VDC_STATE_RUNNING; 4382 DMSG(vdcp, 0, "[%d] connection to service " 4383 "domain is up", vdcp->instance); 4384 break; 4385 } 4386 4387 mutex_exit(&vdcp->lock); 4388 if (tmid != 0) { 4389 (void) untimeout(tmid); 4390 tmid = 0; 4391 } 4392 status = vdc_resubmit_backup_dring(vdcp); 4393 mutex_enter(&vdcp->lock); 4394 4395 if (status) 4396 vdcp->state = VDC_STATE_RESETTING; 4397 else 4398 vdcp->state = VDC_STATE_RUNNING; 4399 4400 break; 4401 4402 /* enter running state */ 4403 case VDC_STATE_RUNNING: 4404 /* 4405 * Signal anyone waiting for the connection 4406 * to come on line. 4407 */ 4408 vdcp->hshake_cnt = 0; 4409 cv_broadcast(&vdcp->running_cv); 4410 4411 /* failfast has to be checked after a reset */ 4412 cv_signal(&vdcp->failfast_cv); 4413 4414 /* ownership is lost during reset */ 4415 if (vdcp->ownership & VDC_OWNERSHIP_WANTED) 4416 vdcp->ownership |= VDC_OWNERSHIP_RESET; 4417 cv_signal(&vdcp->ownership_cv); 4418 4419 cmn_err(CE_CONT, "?vdisk@%d is online using " 4420 "ldc@%ld,%ld\n", vdcp->instance, 4421 vdcp->curr_server->ldc_id, vdcp->curr_server->id); 4422 4423 mutex_exit(&vdcp->lock); 4424 4425 for (;;) { 4426 vio_msg_t msg; 4427 status = vdc_wait_for_response(vdcp, &msg); 4428 if (status) break; 4429 4430 DMSG(vdcp, 1, "[%d] new pkt(s) available\n", 4431 vdcp->instance); 4432 status = vdc_process_data_msg(vdcp, &msg); 4433 if (status) { 4434 DMSG(vdcp, 1, "[%d] process_data_msg " 4435 "returned err=%d\n", vdcp->instance, 4436 status); 4437 break; 4438 } 4439 4440 } 4441 4442 mutex_enter(&vdcp->lock); 4443 4444 cmn_err(CE_CONT, "?vdisk@%d is offline\n", 4445 vdcp->instance); 4446 4447 vdcp->state = VDC_STATE_RESETTING; 4448 vdcp->self_reset = B_TRUE; 4449 break; 4450 4451 case VDC_STATE_RESETTING: 4452 /* 4453 * When we reach this state, we either come from the 4454 * VDC_STATE_RUNNING state and we can have pending 4455 * requests but no timeout is armed; or we come from 4456 * the VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE or 4457 * VDC_STATE_HANDLE_PENDING state and there is no pending 4458 * request or pending requests have already been copied 4459 * into the backup dring. So we can safely keep the 4460 * connection timeout armed while we are in this state. 4461 */ 4462 4463 DMSG(vdcp, 0, "Initiating channel reset " 4464 "(pending = %d)\n", (int)vdcp->threads_pending); 4465 4466 if (vdcp->self_reset) { 4467 DMSG(vdcp, 0, 4468 "[%d] calling stop_ldc_connection.\n", 4469 vdcp->instance); 4470 status = vdc_stop_ldc_connection(vdcp); 4471 vdcp->self_reset = B_FALSE; 4472 } 4473 4474 /* 4475 * Wait for all threads currently waiting 4476 * for a free dring entry.
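 * We broadcast on the condition variables those threads block on,
 * then briefly drop the lock and delay so that the waiters have a
 * chance to run and notice the reset before we tear the ring down.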
4477 */ 4478 while (vdcp->threads_pending) { 4479 cv_broadcast(&vdcp->membind_cv); 4480 cv_broadcast(&vdcp->dring_free_cv); 4481 mutex_exit(&vdcp->lock); 4482 /* give the waiters enough time to wake up */ 4483 delay(vdc_hz_min_ldc_delay); 4484 mutex_enter(&vdcp->lock); 4485 } 4486 4487 ASSERT(vdcp->threads_pending == 0); 4488 4489 /* Sanity check that no thread is receiving */ 4490 ASSERT(vdcp->read_state != VDC_READ_WAITING); 4491 4492 vdcp->read_state = VDC_READ_IDLE; 4493 4494 vdc_backup_local_dring(vdcp); 4495 4496 /* cleanup the old d-ring */ 4497 vdc_destroy_descriptor_ring(vdcp); 4498 4499 /* go and start again */ 4500 vdcp->state = VDC_STATE_INIT; 4501 4502 break; 4503 4504 case VDC_STATE_DETACH: 4505 DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n", 4506 vdcp->instance); 4507 4508 /* cancel any pending timeout */ 4509 mutex_exit(&vdcp->lock); 4510 if (tmid != 0) { 4511 (void) untimeout(tmid); 4512 tmid = 0; 4513 } 4514 mutex_enter(&vdcp->lock); 4515 4516 /* 4517 * Signal anyone waiting for connection 4518 * to come online 4519 */ 4520 cv_broadcast(&vdcp->running_cv); 4521 4522 while (vdcp->sync_op_pending) { 4523 cv_signal(&vdcp->sync_pending_cv); 4524 cv_signal(&vdcp->sync_blocked_cv); 4525 mutex_exit(&vdcp->lock); 4526 /* give the waiters enough time to wake up */ 4527 delay(vdc_hz_min_ldc_delay); 4528 mutex_enter(&vdcp->lock); 4529 } 4530 4531 mutex_exit(&vdcp->lock); 4532 4533 DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n", 4534 vdcp->instance); 4535 thread_exit(); 4536 break; 4537 } 4538 } 4539 } 4540 4541 4542 /* 4543 * Function: 4544 * vdc_process_data_msg() 4545 * 4546 * Description: 4547 * This function is called by the message processing thread each time 4548 * a message with a msgtype of VIO_TYPE_DATA is received. It will either 4549 * be an ACK or NACK from vds[1] which vdc handles as follows. 4550 * ACK - wake up the waiting thread 4551 * NACK - resend any messages necessary 4552 * 4553 * [1] Although the message format allows it, vds should not send a 4554 * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for 4555 * some bizarre reason it does, vdc will reset the connection. 4556 * 4557 * Arguments: 4558 * vdcp - soft state pointer for this instance of the device driver. 4559 * msg - the LDC message sent by vds 4560 * 4561 * Return Code: 4562 * 0 - Success. 4563 * > 0 - error value returned by LDC 4564 */ 4565 static int 4566 vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg) 4567 { 4568 int status = 0; 4569 vio_dring_msg_t *dring_msg; 4570 vdc_local_desc_t *ldep = NULL; 4571 int start, end; 4572 int idx; 4573 int op; 4574 4575 dring_msg = (vio_dring_msg_t *)msg; 4576 4577 ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA); 4578 ASSERT(vdcp != NULL); 4579 4580 mutex_enter(&vdcp->lock); 4581 4582 /* 4583 * Check to see if the message has bogus data 4584 */ 4585 idx = start = dring_msg->start_idx; 4586 end = dring_msg->end_idx; 4587 if ((start >= vdcp->dring_len) || 4588 (end >= vdcp->dring_len) || (end < -1)) { 4589 /* 4590 * Update the I/O statistics to indicate that an error occurred. 4591 * No need to update the wait/run queues as no specific read or 4592 * write request is being completed in response to this 'msg'. 4593 */ 4594 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4595 DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n", 4596 vdcp->instance, start, end); 4597 mutex_exit(&vdcp->lock); 4598 return (EINVAL); 4599 } 4600 4601 /* 4602 * Verify that the sequence number is what vdc expects.
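 * For example, if vdc last generated sequence number 10 and the last
 * reply seen from vds carried sequence number 7, then only replies
 * with sequence numbers 8, 9 or 10 are acceptable; anything else means
 * the two ends have lost synchronization (see vdc_verify_seq_num()
 * below for the exact check).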
4603 */ 4604 switch (vdc_verify_seq_num(vdcp, dring_msg)) { 4605 case VDC_SEQ_NUM_TODO: 4606 break; /* keep processing this message */ 4607 case VDC_SEQ_NUM_SKIP: 4608 mutex_exit(&vdcp->lock); 4609 return (0); 4610 case VDC_SEQ_NUM_INVALID: 4611 /* 4612 * Update the I/O statistics to indicate that an error occurred. 4613 * No need to update the wait/run queues as no specific read or 4614 * write request is being completed in response to this 'msg'. 4615 */ 4616 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4617 DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance); 4618 mutex_exit(&vdcp->lock); 4619 return (ENXIO); 4620 } 4621 4622 if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) { 4623 /* 4624 * Update the I/O statistics to indicate that an error occurred. 4625 * 4626 * We need to update the run queue if a read or write request 4627 * is being NACKed - otherwise there will appear to be an 4628 * indefinite outstanding request and statistics reported by 4629 * iostat(1M) will be incorrect. The transaction will be 4630 * resubmitted from the backup DRing following the reset 4631 * and the wait/run queues will be entered again. 4632 */ 4633 ldep = &vdcp->local_dring[idx]; 4634 op = ldep->operation; 4635 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 4636 DTRACE_IO1(done, buf_t *, ldep->cb_arg); 4637 VD_KSTAT_RUNQ_EXIT(vdcp); 4638 } 4639 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4640 VDC_DUMP_DRING_MSG(dring_msg); 4641 DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance); 4642 mutex_exit(&vdcp->lock); 4643 return (EIO); 4644 4645 } else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) { 4646 /* 4647 * Update the I/O statistics to indicate that an error occurred. 4648 * No need to update the wait/run queues as no specific read or 4649 * write request is being completed in response to this 'msg'. 4650 */ 4651 VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs); 4652 mutex_exit(&vdcp->lock); 4653 return (EPROTO); 4654 } 4655 4656 DMSG(vdcp, 1, ": start %d end %d\n", start, end); 4657 ASSERT(start == end); 4658 4659 ldep = &vdcp->local_dring[idx]; 4660 4661 DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n", 4662 ldep->dep->hdr.dstate, ldep->cb_type); 4663 4664 if (ldep->dep->hdr.dstate == VIO_DESC_DONE) { 4665 struct buf *bufp; 4666 4667 switch (ldep->cb_type) { 4668 case CB_SYNC: 4669 ASSERT(vdcp->sync_op_pending); 4670 4671 status = vdc_depopulate_descriptor(vdcp, idx); 4672 vdcp->sync_op_status = status; 4673 vdcp->sync_op_pending = B_FALSE; 4674 cv_signal(&vdcp->sync_pending_cv); 4675 break; 4676 4677 case CB_STRATEGY: 4678 bufp = ldep->cb_arg; 4679 ASSERT(bufp != NULL); 4680 bufp->b_resid = 4681 bufp->b_bcount - ldep->dep->payload.nbytes; 4682 status = ldep->dep->payload.status; /* Future:ntoh */ 4683 if (status != 0) { 4684 DMSG(vdcp, 1, "strategy status=%d\n", status); 4685 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4686 bioerror(bufp, status); 4687 } 4688 4689 (void) vdc_depopulate_descriptor(vdcp, idx); 4690 4691 DMSG(vdcp, 1, 4692 "strategy complete req=%ld bytes resp=%ld bytes\n", 4693 bufp->b_bcount, ldep->dep->payload.nbytes); 4694 4695 if (status != 0 && vdcp->failfast_interval != 0) { 4696 /* 4697 * The I/O has failed and failfast is enabled. 4698 * We need the failfast thread to check if the 4699 * failure is due to a reservation conflict. 4700 */ 4701 (void) vdc_failfast_io_queue(vdcp, bufp); 4702 } else { 4703 if (status == 0) { 4704 op = (bufp->b_flags & B_READ) ?
4705 VD_OP_BREAD : VD_OP_BWRITE; 4706 VD_UPDATE_IO_STATS(vdcp, op, 4707 ldep->dep->payload.nbytes); 4708 } 4709 VD_KSTAT_RUNQ_EXIT(vdcp); 4710 DTRACE_IO1(done, buf_t *, bufp); 4711 biodone(bufp); 4712 } 4713 break; 4714 4715 default: 4716 ASSERT(0); 4717 } 4718 } 4719 4720 /* let the arrival signal propagate */ 4721 mutex_exit(&vdcp->lock); 4722 4723 /* probe gives the count of how many entries were processed */ 4724 DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp); 4725 4726 return (0); 4727 } 4728 4729 4730 /* 4731 * Function: 4732 * vdc_handle_ver_msg() 4733 * 4734 * Description: 4735 * Handle a version negotiation (VIO_VER_INFO) message sent by the 4736 * vDisk server. 4737 * 4738 * Arguments: 4739 * vdc - soft state pointer for this instance of the device driver. 4740 * ver_msg - LDC message sent by vDisk server 4741 * 4742 * Return Code: 4743 * 0 - Success 4744 */ 4745 static int 4746 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg) 4747 { 4748 int status = 0; 4749 4750 ASSERT(vdc != NULL); 4751 ASSERT(mutex_owned(&vdc->lock)); 4752 4753 if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) { 4754 return (EPROTO); 4755 } 4756 4757 if (ver_msg->dev_class != VDEV_DISK_SERVER) { 4758 return (EINVAL); 4759 } 4760 4761 switch (ver_msg->tag.vio_subtype) { 4762 case VIO_SUBTYPE_ACK: 4763 /* 4764 * We check to see if the version returned is indeed supported 4765 * (the server may have adjusted the minor number downwards, 4766 * in which case 'ver_msg' contains the actual version agreed). 4767 */ 4768 if (vdc_is_supported_version(ver_msg)) { 4769 vdc->ver.major = ver_msg->ver_major; 4770 vdc->ver.minor = ver_msg->ver_minor; 4771 ASSERT(vdc->ver.major > 0); 4772 } else { 4773 status = EPROTO; 4774 } 4775 break; 4776 4777 case VIO_SUBTYPE_NACK: 4778 /* 4779 * call vdc_is_supported_version() which will return the next 4780 * supported version (if any) in 'ver_msg' 4781 */ 4782 (void) vdc_is_supported_version(ver_msg); 4783 if (ver_msg->ver_major > 0) { 4784 size_t len = sizeof (*ver_msg); 4785 4786 ASSERT(vdc->ver.major > 0); 4787 4788 /* reset the necessary fields and resend */ 4789 ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO; 4790 ver_msg->dev_class = VDEV_DISK; 4791 4792 status = vdc_send(vdc, (caddr_t)ver_msg, &len); 4793 DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n", 4794 vdc->instance, status); 4795 if (len != sizeof (*ver_msg)) 4796 status = EBADMSG; 4797 } else { 4798 DMSG(vdc, 0, "[%d] No common version with vDisk server", 4799 vdc->instance); 4800 status = ENOTSUP; 4801 } 4802 4803 break; 4804 case VIO_SUBTYPE_INFO: 4805 /* 4806 * Handle the case where vds starts the handshake 4807 * (for now only vdc is the instigator) 4808 */ 4809 status = ENOTSUP; 4810 break; 4811 4812 default: 4813 status = EINVAL; 4814 break; 4815 } 4816 4817 return (status); 4818 } 4819 4820 /* 4821 * Function: 4822 * vdc_handle_attr_msg() 4823 * 4824 * Description: 4825 * Handle an attribute negotiation (VIO_ATTR_INFO) message sent by the 4826 * vDisk server. 4827 * 4828 * Arguments: 4829 * vdc - soft state pointer for this instance of the device driver. 4830 * attr_msg - LDC message sent by vDisk server 4831 * 4832 * Return Code: 4833 * 0 - Success 4834 */ 4835 static int 4836 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg) 4837 { 4838 int status = 0; 4839 4840 ASSERT(vdc != NULL); 4841 ASSERT(mutex_owned(&vdc->lock)); 4842 4843 if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) { 4844 return (EPROTO); 4845 } 4846 4847 switch (attr_msg->tag.vio_subtype) { 4848 case VIO_SUBTYPE_ACK: 4849 /* 4850 * We now verify the attributes sent by vds.
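 * A usable response must have a non-zero disk size and a non-zero
 * maximum transfer size. A size of VD_SIZE_UNKNOWN is tolerated but
 * treated as a size of zero until the server reports the actual size.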
4847 */ 4848 if (attr_msg->vdisk_size == 0) { 4849 DMSG(vdc, 0, "[%d] Invalid disk size from vds", 4850 vdc->instance); 4851 status = EINVAL; 4852 break; 4853 } 4854 4855 if (attr_msg->max_xfer_sz == 0) { 4856 DMSG(vdc, 0, "[%d] Invalid transfer size from vds", 4857 vdc->instance); 4858 status = EINVAL; 4859 break; 4860 } 4861 4862 if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) { 4863 DMSG(vdc, 0, "[%d] Unknown disk size from vds", 4864 vdc->instance); 4865 attr_msg->vdisk_size = 0; 4866 } 4867 /* update disk, block and transfer sizes */ 4868 vdc_update_size(vdc, attr_msg->vdisk_size, 4869 attr_msg->vdisk_block_size, attr_msg->max_xfer_sz); 4870 vdc->vdisk_type = attr_msg->vdisk_type; 4871 vdc->operations = attr_msg->operations; 4872 if (vio_ver_is_supported(vdc->ver, 1, 1)) 4873 vdc->vdisk_media = attr_msg->vdisk_media; 4874 else 4875 vdc->vdisk_media = 0; 4876 4877 DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n", 4878 vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz); 4879 DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n", 4880 vdc->instance, vdc->block_size, 4881 attr_msg->vdisk_block_size); 4882 4883 if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) || 4884 (attr_msg->vdisk_size > INT64_MAX) || 4885 (attr_msg->operations == 0) || 4886 (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) { 4887 DMSG(vdc, 0, "[%d] Invalid attributes from vds", 4888 vdc->instance); 4889 status = EINVAL; 4890 break; 4891 } 4892 4893 /* 4894 * Now that we have received all attributes we can create a 4895 * fake geometry for the disk. 4896 */ 4897 vdc_create_fake_geometry(vdc); 4898 break; 4899 4900 case VIO_SUBTYPE_NACK: 4901 /* 4902 * vds could not handle the attributes we sent so we 4903 * stop negotiating. 4904 */ 4905 status = EPROTO; 4906 break; 4907 4908 case VIO_SUBTYPE_INFO: 4909 /* 4910 * Handle the case where vds starts the handshake 4911 * (for now, vdc is the only supported instigator) 4912 */ 4913 status = ENOTSUP; 4914 break; 4915 4916 default: 4917 status = ENOTSUP; 4918 break; 4919 } 4920 4921 return (status); 4922 } 4923 4924 /* 4925 * Function: 4926 * vdc_handle_dring_reg_msg() 4927 * 4928 * Description: 4929 * Handle a descriptor ring registration (VIO_DRING_REG) message sent 4930 * by the vDisk server. 4931 * 4932 * Arguments: 4933 * vdc - soft state pointer for this instance of the driver. 4934 * dring_msg - LDC message sent by vDisk server 4935 * 4936 * Return Code: 4937 * 0 - Success 4938 */ 4939 static int 4940 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg) 4941 { 4942 int status = 0; 4943 4944 ASSERT(vdc != NULL); 4945 ASSERT(mutex_owned(&vdc->lock)); 4946 4947 if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) { 4948 return (EPROTO); 4949 } 4950 4951 switch (dring_msg->tag.vio_subtype) { 4952 case VIO_SUBTYPE_ACK: 4953 /* save the received dring_ident */ 4954 vdc->dring_ident = dring_msg->dring_ident; 4955 DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n", 4956 vdc->instance, vdc->dring_ident); 4957 break; 4958 4959 case VIO_SUBTYPE_NACK: 4960 /* 4961 * vds could not handle the DRing info we sent so we 4962 * stop negotiating.
4963 */ 4964 DMSG(vdc, 0, "[%d] server could not register DRing\n", 4965 vdc->instance); 4966 status = EPROTO; 4967 break; 4968 4969 case VIO_SUBTYPE_INFO: 4970 /* 4971 * Handle the case where vds starts the handshake 4972 * (for now only vdc is the instigator) 4973 */ 4974 status = ENOTSUP; 4975 break; 4976 default: 4977 status = ENOTSUP; 4978 } 4979 4980 return (status); 4981 } 4982 4983 /* 4984 * Function: 4985 * vdc_verify_seq_num() 4986 * 4987 * Description: 4988 * This function verifies that the sequence number sent back by the vDisk 4989 * server with the latest message is what is expected (i.e. it is greater 4990 * than the last seq num sent by the vDisk server and less than or equal 4991 * to the last seq num generated by vdc). 4992 * 4993 * It then checks the request ID to see if any requests need processing 4994 * in the DRing. 4995 * 4996 * Arguments: 4997 * vdc - soft state pointer for this instance of the driver. 4998 * dring_msg - pointer to the LDC message sent by vds 4999 * 5000 * Return Code: 5001 * VDC_SEQ_NUM_TODO - Message needs to be processed 5002 * VDC_SEQ_NUM_SKIP - Message has already been processed 5003 * VDC_SEQ_NUM_INVALID - The seq numbers are so out of sync, 5004 * vdc cannot deal with them 5005 */ 5006 static int 5007 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg) 5008 { 5009 ASSERT(vdc != NULL); 5010 ASSERT(dring_msg != NULL); 5011 ASSERT(mutex_owned(&vdc->lock)); 5012 5013 /* 5014 * Check to see if the messages were responded to in the correct 5015 * order by vds. 5016 */ 5017 if ((dring_msg->seq_num <= vdc->seq_num_reply) || 5018 (dring_msg->seq_num > vdc->seq_num)) { 5019 DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: " 5020 "%lu > expected <= %lu (last proc req %lu sent %lu)\n", 5021 vdc->instance, dring_msg->seq_num, 5022 vdc->seq_num_reply, vdc->seq_num, 5023 vdc->req_id_proc, vdc->req_id); 5024 return (VDC_SEQ_NUM_INVALID); 5025 } 5026 vdc->seq_num_reply = dring_msg->seq_num; 5027 5028 if (vdc->req_id_proc < vdc->req_id) 5029 return (VDC_SEQ_NUM_TODO); 5030 else 5031 return (VDC_SEQ_NUM_SKIP); 5032 } 5033 5034 5035 /* 5036 * Function: 5037 * vdc_is_supported_version() 5038 * 5039 * Description: 5040 * This routine checks if the major/minor version numbers specified in 5041 * 'ver_msg' are supported. If not it finds the next version that is 5042 * in the supported version list 'vdc_version[]' and sets the fields in 5043 * 'ver_msg' to those values 5044 * 5045 * Arguments: 5046 * ver_msg - LDC message sent by vDisk server 5047 * 5048 * Return Code: 5049 * B_TRUE - Success 5050 * B_FALSE - Version not supported 5051 */ 5052 static boolean_t 5053 vdc_is_supported_version(vio_ver_msg_t *ver_msg) 5054 { 5055 int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]); 5056 5057 for (int i = 0; i < vdc_num_versions; i++) { 5058 ASSERT(vdc_version[i].major > 0); 5059 ASSERT((i == 0) || 5060 (vdc_version[i].major < vdc_version[i-1].major)); 5061 5062 /* 5063 * If the major versions match, adjust the minor version, if 5064 * necessary, down to the highest value supported by this 5065 * client.
The server should support all minor versions lower 5066 * than the value it sent. 5067 */ 5068 if (ver_msg->ver_major == vdc_version[i].major) { 5069 if (ver_msg->ver_minor > vdc_version[i].minor) { 5070 DMSGX(0, 5071 "Adjusting minor version from %u to %u", 5072 ver_msg->ver_minor, vdc_version[i].minor); 5073 ver_msg->ver_minor = vdc_version[i].minor; 5074 } 5075 return (B_TRUE); 5076 } 5077 5078 /* 5079 * If the message contains a higher major version number, set 5080 * the message's major/minor versions to the current values 5081 * and return false, so this message will get resent with 5082 * these values, and the server will potentially try again 5083 * with the same or a lower version 5084 */ 5085 if (ver_msg->ver_major > vdc_version[i].major) { 5086 ver_msg->ver_major = vdc_version[i].major; 5087 ver_msg->ver_minor = vdc_version[i].minor; 5088 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n", 5089 ver_msg->ver_major, ver_msg->ver_minor); 5090 5091 return (B_FALSE); 5092 } 5093 5094 /* 5095 * Otherwise, the message's major version is less than the 5096 * current major version, so continue the loop to the next 5097 * (lower) supported version 5098 */ 5099 } 5100 5101 /* 5102 * No common version was found; "ground" the version pair in the 5103 * message to terminate negotiation 5104 */ 5105 ver_msg->ver_major = 0; 5106 ver_msg->ver_minor = 0; 5107 5108 return (B_FALSE); 5109 } 5110 /* -------------------------------------------------------------------------- */ 5111 5112 /* 5113 * DKIO(7I) support 5114 */ 5115 5116 typedef struct vdc_dk_arg { 5117 struct dk_callback dkc; 5118 int mode; 5119 dev_t dev; 5120 vdc_t *vdc; 5121 } vdc_dk_arg_t; 5122 5123 /* 5124 * Function: 5125 * vdc_dkio_flush_cb() 5126 * 5127 * Description: 5128 * This routine is a callback for DKIOCFLUSHWRITECACHE which can be called 5129 * by kernel code. 5130 * 5131 * Arguments: 5132 * arg - a pointer to a vdc_dk_arg_t structure. 5133 */ 5134 void 5135 vdc_dkio_flush_cb(void *arg) 5136 { 5137 struct vdc_dk_arg *dk_arg = (struct vdc_dk_arg *)arg; 5138 struct dk_callback *dkc = NULL; 5139 vdc_t *vdc = NULL; 5140 int rv; 5141 5142 if (dk_arg == NULL) { 5143 cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n"); 5144 return; 5145 } 5146 dkc = &dk_arg->dkc; 5147 vdc = dk_arg->vdc; 5148 ASSERT(vdc != NULL); 5149 5150 rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, 5151 VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 5152 if (rv != 0) { 5153 DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n", 5154 vdc->instance, rv, 5155 ddi_model_convert_from(dk_arg->mode & FMODELS)); 5156 } 5157 5158 /* 5159 * Trigger the callback to notify the caller that the ioctl call has 5160 * been completed. 5161 */ 5162 if ((dk_arg->mode & FKIOCTL) && 5163 (dkc != NULL) && 5164 (dkc->dkc_callback != NULL)) { 5165 ASSERT(dkc->dkc_cookie != NULL); 5166 (*dkc->dkc_callback)(dkc->dkc_cookie, rv); 5167 } 5168 5169 /* Indicate that one less DKIO write flush is outstanding */ 5170 mutex_enter(&vdc->lock); 5171 vdc->dkio_flush_pending--; 5172 ASSERT(vdc->dkio_flush_pending >= 0); 5173 mutex_exit(&vdc->lock); 5174 5175 /* free the mem that was allocated when the callback was dispatched */ 5176 kmem_free(arg, sizeof (vdc_dk_arg_t)); 5177 } 5178 5179 /* 5180 * Function: 5181 * vdc_dkio_gapart() 5182 * 5183 * Description: 5184 * This function implements the DKIOCGAPART ioctl.
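 * The partition map is computed from the cached vtoc and the fake
 * geometry; for each partition the starting cylinder is derived as
 *
 *	dkl_cylno = p_start / (dkg_nhead * dkg_nsect)
 *
 * and dkl_nblk is simply the partition size in blocks.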
5183 * 5184 * Arguments: 5185 * vdc - soft state pointer 5186 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 5187 * flag - ioctl flags 5188 */ 5189 static int 5190 vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag) 5191 { 5192 struct dk_geom *geom; 5193 struct extvtoc *vtoc; 5194 union { 5195 struct dk_map map[NDKMAP]; 5196 struct dk_map32 map32[NDKMAP]; 5197 } data; 5198 int i, rv, size; 5199 5200 mutex_enter(&vdc->lock); 5201 5202 if ((rv = vdc_validate_geometry(vdc)) != 0) { 5203 mutex_exit(&vdc->lock); 5204 return (rv); 5205 } 5206 5207 if (vdc->vdisk_size > VD_OLDVTOC_LIMIT) { 5208 mutex_exit(&vdc->lock); 5209 return (EOVERFLOW); 5210 } 5211 5212 vtoc = vdc->vtoc; 5213 geom = vdc->geom; 5214 5215 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5216 5217 for (i = 0; i < vtoc->v_nparts; i++) { 5218 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 5219 (geom->dkg_nhead * geom->dkg_nsect); 5220 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 5221 } 5222 size = NDKMAP * sizeof (struct dk_map32); 5223 5224 } else { 5225 5226 for (i = 0; i < vtoc->v_nparts; i++) { 5227 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 5228 (geom->dkg_nhead * geom->dkg_nsect); 5229 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 5230 } 5231 size = NDKMAP * sizeof (struct dk_map); 5232 5233 } 5234 5235 mutex_exit(&vdc->lock); 5236 5237 if (ddi_copyout(&data, arg, size, flag) != 0) 5238 return (EFAULT); 5239 5240 return (0); 5241 } 5242 5243 /* 5244 * Function: 5245 * vdc_dkio_partition() 5246 * 5247 * Description: 5248 * This function implements the DKIOCPARTITION ioctl. 5249 * 5250 * Arguments: 5251 * vdc - soft state pointer 5252 * arg - a pointer to a struct partition64 structure 5253 * flag - ioctl flags 5254 */ 5255 static int 5256 vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag) 5257 { 5258 struct partition64 p64; 5259 efi_gpt_t *gpt; 5260 efi_gpe_t *gpe; 5261 vd_efi_dev_t edev; 5262 uint_t partno; 5263 int rv; 5264 5265 if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) { 5266 return (EFAULT); 5267 } 5268 5269 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 5270 5271 if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) { 5272 return (rv); 5273 } 5274 5275 partno = p64.p_partno; 5276 5277 if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) { 5278 vd_efi_free(&edev, gpt, gpe); 5279 return (ESRCH); 5280 } 5281 5282 bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type, 5283 sizeof (struct uuid)); 5284 p64.p_start = gpe[partno].efi_gpe_StartingLBA; 5285 p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1; 5286 5287 if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) { 5288 vd_efi_free(&edev, gpt, gpe); 5289 return (EFAULT); 5290 } 5291 5292 vd_efi_free(&edev, gpt, gpe); 5293 return (0); 5294 } 5295 5296 /* 5297 * Function: 5298 * vdc_dioctl_rwcmd() 5299 * 5300 * Description: 5301 * This function implements the DIOCTL_RWCMD ioctl. This ioctl is used 5302 * for DKC_DIRECT disks to read or write at an absolute disk offset. 
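 * The command is wrapped into a uio and issued through physio(9F),
 * with b_private set to VD_SLICE_NONE so that vdc_strategy() treats
 * the block address as an absolute disk offset rather than an offset
 * within a slice.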
5303 * 5304 * Arguments: 5305 * dev - device 5306 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 5307 * flag - ioctl flags 5308 */ 5309 static int 5310 vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag) 5311 { 5312 struct dadkio_rwcmd32 rwcmd32; 5313 struct dadkio_rwcmd rwcmd; 5314 struct iovec aiov; 5315 struct uio auio; 5316 int rw, status; 5317 struct buf *buf; 5318 5319 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5320 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 5321 sizeof (struct dadkio_rwcmd32), flag)) { 5322 return (EFAULT); 5323 } 5324 rwcmd.cmd = rwcmd32.cmd; 5325 rwcmd.flags = rwcmd32.flags; 5326 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 5327 rwcmd.buflen = rwcmd32.buflen; 5328 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 5329 } else { 5330 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 5331 sizeof (struct dadkio_rwcmd), flag)) { 5332 return (EFAULT); 5333 } 5334 } 5335 5336 switch (rwcmd.cmd) { 5337 case DADKIO_RWCMD_READ: 5338 rw = B_READ; 5339 break; 5340 case DADKIO_RWCMD_WRITE: 5341 rw = B_WRITE; 5342 break; 5343 default: 5344 return (EINVAL); 5345 } 5346 5347 bzero((caddr_t)&aiov, sizeof (struct iovec)); 5348 aiov.iov_base = rwcmd.bufaddr; 5349 aiov.iov_len = rwcmd.buflen; 5350 5351 bzero((caddr_t)&auio, sizeof (struct uio)); 5352 auio.uio_iov = &aiov; 5353 auio.uio_iovcnt = 1; 5354 auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE; 5355 auio.uio_resid = rwcmd.buflen; 5356 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 5357 5358 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 5359 bioinit(buf); 5360 /* 5361 * We use the private field of buf to specify that this is an 5362 * I/O using an absolute offset. 5363 */ 5364 buf->b_private = (void *)VD_SLICE_NONE; 5365 5366 status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio); 5367 5368 biofini(buf); 5369 kmem_free(buf, sizeof (buf_t)); 5370 5371 return (status); 5372 } 5373 5374 /* 5375 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated 5376 * buffer is returned in alloc_len. 5377 */ 5378 static vd_scsi_t * 5379 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, 5380 int *alloc_len) 5381 { 5382 vd_scsi_t *vd_scsi; 5383 int vd_scsi_len = VD_SCSI_SIZE; 5384 5385 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); 5386 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); 5387 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); 5388 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); 5389 5390 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 5391 5392 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); 5393 5394 vd_scsi->cdb_len = cdb_len; 5395 vd_scsi->sense_len = sense_len; 5396 vd_scsi->datain_len = datain_len; 5397 vd_scsi->dataout_len = dataout_len; 5398 5399 *alloc_len = vd_scsi_len; 5400 5401 return (vd_scsi); 5402 } 5403 5404 /* 5405 * Convert the status of a SCSI command to a Solaris return code. 5406 * 5407 * Arguments: 5408 * vd_scsi - The SCSI operation buffer. 5409 * log_error - indicate if an error message should be logged. 5410 * 5411 * Note that our SCSI error messages are rather primitive for the moment 5412 * and could be improved by decoding some data like the SCSI command and 5413 * the sense key. 5414 * 5415 * Return value: 5416 * 0 - Status is good. 5417 * EACCES - Status reports a reservation conflict. 5418 * ENOTSUP - Status reports a check condition and sense key 5419 * reports an illegal request. 5420 * EIO - Any other status. 
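 *
 * Note that if failfast is enabled, a reservation conflict status on
 * anything other than a PERSISTENT RESERVE IN/OUT command does not
 * return at all: it panics the system.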
5421 */ 5422 static int 5423 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error) 5424 { 5425 int rv; 5426 char path_str[MAXPATHLEN]; 5427 char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN]; 5428 union scsi_cdb *cdb; 5429 struct scsi_extended_sense *sense; 5430 5431 if (vd_scsi->cmd_status == STATUS_GOOD) 5432 /* no error */ 5433 return (0); 5434 5435 /* when the tunable vdc_scsi_log_error is true we log all errors */ 5436 if (vdc_scsi_log_error) 5437 log_error = B_TRUE; 5438 5439 if (log_error) { 5440 cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x\n", 5441 ddi_pathname(vdc->dip, path_str), vdc->instance, 5442 GETCMD(VD_SCSI_DATA_CDB(vd_scsi))); 5443 } 5444 5445 /* default returned value */ 5446 rv = EIO; 5447 5448 switch (vd_scsi->cmd_status) { 5449 5450 case STATUS_CHECK: 5451 case STATUS_TERMINATED: 5452 if (log_error) 5453 cmn_err(CE_CONT, "\tCheck Condition Error\n"); 5454 5455 /* check sense buffer */ 5456 if (vd_scsi->sense_len == 0 || 5457 vd_scsi->sense_status != STATUS_GOOD) { 5458 if (log_error) 5459 cmn_err(CE_CONT, "\tNo Sense Data Available\n"); 5460 break; 5461 } 5462 5463 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5464 5465 if (log_error) { 5466 cmn_err(CE_CONT, "\tSense Key: 0x%x\n" 5467 "\tASC: 0x%x, ASCQ: 0x%x\n", 5468 scsi_sense_key((uint8_t *)sense), 5469 scsi_sense_asc((uint8_t *)sense), 5470 scsi_sense_ascq((uint8_t *)sense)); 5471 } 5472 5473 if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST) 5474 rv = ENOTSUP; 5475 break; 5476 5477 case STATUS_BUSY: 5478 if (log_error) 5479 cmn_err(CE_NOTE, "\tDevice Busy\n"); 5480 break; 5481 5482 case STATUS_RESERVATION_CONFLICT: 5483 /* 5484 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then 5485 * the reservation conflict could be due to various reasons 5486 * like incorrect keys, or the host not being registered or 5487 * reserved, so we should not panic in that case. 5488 */ 5489 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5490 if (vdc->failfast_interval != 0 && 5491 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN && 5492 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) { 5493 /* failfast is enabled so we have to panic */ 5494 (void) snprintf(panic_str, sizeof (panic_str), 5495 VDC_RESV_CONFLICT_FMT_STR "%s", 5496 ddi_pathname(vdc->dip, path_str)); 5497 panic(panic_str); 5498 } 5499 if (log_error) 5500 cmn_err(CE_NOTE, "\tReservation Conflict\n"); 5501 rv = EACCES; 5502 break; 5503 5504 case STATUS_QFULL: 5505 if (log_error) 5506 cmn_err(CE_NOTE, "\tQueue Full\n"); 5507 break; 5508 5509 case STATUS_MET: 5510 case STATUS_INTERMEDIATE: 5511 case STATUS_SCSI2: 5512 case STATUS_INTERMEDIATE_MET: 5513 case STATUS_ACA_ACTIVE: 5514 if (log_error) 5515 cmn_err(CE_CONT, 5516 "\tUnexpected SCSI status received: 0x%x\n", 5517 vd_scsi->cmd_status); 5518 break; 5519 5520 default: 5521 if (log_error) 5522 cmn_err(CE_CONT, 5523 "\tInvalid SCSI status received: 0x%x\n", 5524 vd_scsi->cmd_status); 5525 break; 5526 } 5527 5528 return (rv); 5529 } 5530 5531 /* 5532 * Implement the USCSICMD uscsi(7I) ioctl. This ioctl is converted to 5533 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI 5534 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is 5535 * converted to a VD_OP_RESET operation.
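 *
 * The vd_scsi buffer built for the operation is laid out as sketched
 * below (see vdc_scsi_alloc() above); each area is padded up to an
 * 8-byte boundary:
 *
 *	+------------------+
 *	| vd_scsi_t header |
 *	+------------------+
 *	| CDB              |
 *	+------------------+
 *	| sense buffer     |
 *	+------------------+
 *	| data-in buffer   |
 *	+------------------+
 *	| data-out buffer  |
 *	+------------------+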
5536 */ 5537 static int 5538 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) 5539 { 5540 struct uscsi_cmd uscsi; 5541 struct uscsi_cmd32 uscsi32; 5542 vd_scsi_t *vd_scsi; 5543 int vd_scsi_len; 5544 union scsi_cdb *cdb; 5545 struct scsi_extended_sense *sense; 5546 char *datain, *dataout; 5547 size_t cdb_len, datain_len, dataout_len, sense_len; 5548 int rv; 5549 5550 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5551 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), 5552 mode) != 0) 5553 return (EFAULT); 5554 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); 5555 } else { 5556 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), 5557 mode) != 0) 5558 return (EFAULT); 5559 } 5560 5561 /* a uscsi reset is converted to a VD_OP_RESET operation */ 5562 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | 5563 USCSI_RESET_ALL)) { 5564 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, 5565 (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); 5566 return (rv); 5567 } 5568 5569 /* cdb buffer length */ 5570 cdb_len = uscsi.uscsi_cdblen; 5571 5572 /* data in and out buffers length */ 5573 if (uscsi.uscsi_flags & USCSI_READ) { 5574 datain_len = uscsi.uscsi_buflen; 5575 dataout_len = 0; 5576 } else { 5577 datain_len = 0; 5578 dataout_len = uscsi.uscsi_buflen; 5579 } 5580 5581 /* sense buffer length */ 5582 if (uscsi.uscsi_flags & USCSI_RQENABLE) 5583 sense_len = uscsi.uscsi_rqlen; 5584 else 5585 sense_len = 0; 5586 5587 /* allocate buffer for the VD_SCSICMD_OP operation */ 5588 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5589 &vd_scsi_len); 5590 5591 /* 5592 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, 5593 * but basically they prevent a SCSI command from being retried in case 5594 * of an error. 
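 * In that case we set VD_SCSI_OPT_NORETRY so that the vdisk server
 * fails the command back immediately rather than retrying it.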
5595 */ 5596 if ((uscsi.uscsi_flags & USCSI_ISOLATE) || 5597 (uscsi.uscsi_flags & USCSI_DIAGNOSE)) 5598 vd_scsi->options |= VD_SCSI_OPT_NORETRY; 5599 5600 /* set task attribute */ 5601 if (uscsi.uscsi_flags & USCSI_NOTAG) { 5602 vd_scsi->task_attribute = 0; 5603 } else { 5604 if (uscsi.uscsi_flags & USCSI_HEAD) 5605 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5606 else if (uscsi.uscsi_flags & USCSI_HTAG) 5607 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; 5608 else if (uscsi.uscsi_flags & USCSI_OTAG) 5609 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; 5610 else 5611 vd_scsi->task_attribute = 0; 5612 } 5613 5614 /* set timeout */ 5615 vd_scsi->timeout = uscsi.uscsi_timeout; 5616 5617 /* copy-in cdb data */ 5618 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5619 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { 5620 rv = EFAULT; 5621 goto done; 5622 } 5623 5624 /* keep a pointer to the sense buffer */ 5625 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5626 5627 /* keep a pointer to the data-in buffer */ 5628 datain = (char *)VD_SCSI_DATA_IN(vd_scsi); 5629 5630 /* copy-in request data to the data-out buffer */ 5631 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); 5632 if (!(uscsi.uscsi_flags & USCSI_READ)) { 5633 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, 5634 mode)) { 5635 rv = EFAULT; 5636 goto done; 5637 } 5638 } 5639 5640 /* submit the request */ 5641 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5642 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5643 5644 if (rv != 0) 5645 goto done; 5646 5647 /* update scsi status */ 5648 uscsi.uscsi_status = vd_scsi->cmd_status; 5649 5650 /* update sense data */ 5651 if ((uscsi.uscsi_flags & USCSI_RQENABLE) && 5652 (uscsi.uscsi_status == STATUS_CHECK || 5653 uscsi.uscsi_status == STATUS_TERMINATED)) { 5654 5655 uscsi.uscsi_rqstatus = vd_scsi->sense_status; 5656 5657 if (uscsi.uscsi_rqstatus == STATUS_GOOD) { 5658 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - 5659 vd_scsi->sense_len; 5660 if (ddi_copyout(sense, uscsi.uscsi_rqbuf, 5661 vd_scsi->sense_len, mode) != 0) { 5662 rv = EFAULT; 5663 goto done; 5664 } 5665 } 5666 } 5667 5668 /* update request data */ 5669 if (uscsi.uscsi_status == STATUS_GOOD) { 5670 if (uscsi.uscsi_flags & USCSI_READ) { 5671 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5672 vd_scsi->datain_len; 5673 if (ddi_copyout(datain, uscsi.uscsi_bufaddr, 5674 vd_scsi->datain_len, mode) != 0) { 5675 rv = EFAULT; 5676 goto done; 5677 } 5678 } else { 5679 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5680 vd_scsi->dataout_len; 5681 } 5682 } 5683 5684 /* copy-out result */ 5685 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5686 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); 5687 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), 5688 mode) != 0) { 5689 rv = EFAULT; 5690 goto done; 5691 } 5692 } else { 5693 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), 5694 mode) != 0) { 5695 rv = EFAULT; 5696 goto done; 5697 } 5698 } 5699 5700 /* get the return code from the SCSI command status */ 5701 rv = vdc_scsi_status(vdc, vd_scsi, 5702 !(uscsi.uscsi_flags & USCSI_SILENT)); 5703 5704 done: 5705 kmem_free(vd_scsi, vd_scsi_len); 5706 return (rv); 5707 } 5708 5709 /* 5710 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. 5711 * 5712 * Arguments: 5713 * cmd - SCSI PERSISTENT IN command 5714 * len - length of the SCSI input buffer 5715 * vd_scsi_len - return the length of the allocated buffer 5716 * 5717 * Returned Value: 5718 * a pointer to the allocated VD_OP_SCSICMD buffer. 
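 *
 * For example, the MHIOCGRP_INKEYS handler below uses this routine to
 * build a READ KEYS request whose data-in buffer is sized to hold the
 * caller's key list.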
5719 */ 5720 static vd_scsi_t * 5721 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len) 5722 { 5723 int cdb_len, sense_len, datain_len, dataout_len; 5724 vd_scsi_t *vd_scsi; 5725 union scsi_cdb *cdb; 5726 5727 cdb_len = CDB_GROUP1; 5728 sense_len = sizeof (struct scsi_extended_sense); 5729 datain_len = len; 5730 dataout_len = 0; 5731 5732 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5733 vd_scsi_len); 5734 5735 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5736 5737 /* set cdb */ 5738 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN; 5739 cdb->cdb_opaque[1] = cmd; 5740 FORMG1COUNT(cdb, datain_len); 5741 5742 vd_scsi->timeout = vdc_scsi_timeout; 5743 5744 return (vd_scsi); 5745 } 5746 5747 /* 5748 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command. 5749 * 5750 * Arguments: 5751 * cmd - SCSI PERSISTENT OUT command 5752 * len - length of the SCSI output buffer 5753 * vd_scsi_len - return the length of the allocated buffer 5754 * 5755 * Returned Code: 5756 * a pointer to the allocated VD_OP_SCSICMD buffer. 5757 */ 5758 static vd_scsi_t * 5759 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len) 5760 { 5761 int cdb_len, sense_len, datain_len, dataout_len; 5762 vd_scsi_t *vd_scsi; 5763 union scsi_cdb *cdb; 5764 5765 cdb_len = CDB_GROUP1; 5766 sense_len = sizeof (struct scsi_extended_sense); 5767 datain_len = 0; 5768 dataout_len = len; 5769 5770 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5771 vd_scsi_len); 5772 5773 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5774 5775 /* set cdb */ 5776 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT; 5777 cdb->cdb_opaque[1] = cmd; 5778 FORMG1COUNT(cdb, dataout_len); 5779 5780 vd_scsi->timeout = vdc_scsi_timeout; 5781 5782 return (vd_scsi); 5783 } 5784 5785 /* 5786 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted 5787 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk 5788 * server with a VD_OP_SCSICMD operation. 
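 * The key list returned by the server is copied out to the caller,
 * truncated to the caller-supplied listsize if the server reports
 * more keys than the caller has room for.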
5789 */ 5790 static int 5791 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5792 { 5793 vd_scsi_t *vd_scsi; 5794 mhioc_inkeys_t inkeys; 5795 mhioc_key_list_t klist; 5796 struct mhioc_inkeys32 inkeys32; 5797 struct mhioc_key_list32 klist32; 5798 sd_prin_readkeys_t *scsi_keys; 5799 void *user_keys; 5800 int vd_scsi_len; 5801 int listsize, listlen, rv; 5802 5803 /* copyin arguments */ 5804 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5805 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5806 if (rv != 0) 5807 return (EFAULT); 5808 5809 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5810 sizeof (klist32), mode); 5811 if (rv != 0) 5812 return (EFAULT); 5813 5814 listsize = klist32.listsize; 5815 } else { 5816 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5817 if (rv != 0) 5818 return (EFAULT); 5819 5820 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5821 if (rv != 0) 5822 return (EFAULT); 5823 5824 listsize = klist.listsize; 5825 } 5826 5827 /* build SCSI VD_OP request */ 5828 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5829 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5830 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5831 5832 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5833 5834 /* submit the request */ 5835 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5836 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5837 5838 if (rv != 0) 5839 goto done; 5840 5841 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5842 5843 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5844 inkeys32.generation = scsi_keys->generation; 5845 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5846 if (rv != 0) { 5847 rv = EFAULT; 5848 goto done; 5849 } 5850 5851 klist32.listlen = listlen; 5852 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5853 sizeof (klist32), mode); 5854 if (rv != 0) { 5855 rv = EFAULT; 5856 goto done; 5857 } 5858 5859 user_keys = (caddr_t)(uintptr_t)klist32.list; 5860 } else { 5861 inkeys.generation = scsi_keys->generation; 5862 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5863 if (rv != 0) { 5864 rv = EFAULT; 5865 goto done; 5866 } 5867 5868 klist.listlen = listlen; 5869 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5870 if (rv != 0) { 5871 rv = EFAULT; 5872 goto done; 5873 } 5874 5875 user_keys = klist.list; 5876 } 5877 5878 /* copy out keys */ 5879 if (listlen > 0 && listsize > 0) { 5880 if (listsize < listlen) 5881 listlen = listsize; 5882 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5883 listlen * MHIOC_RESV_KEY_SIZE, mode); 5884 if (rv != 0) 5885 rv = EFAULT; 5886 } 5887 5888 if (rv == 0) 5889 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5890 5891 done: 5892 kmem_free(vd_scsi, vd_scsi_len); 5893 5894 return (rv); 5895 } 5896 5897 /* 5898 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5899 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5900 * the vdisk server with a VD_OP_SCSICMD operation. 
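 * Each SCSI-3 reservation descriptor returned by the server is
 * converted to a mhioc_resv_desc_t (with the scope specific address
 * converted via BE_32()) before being copied out to the caller.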
5901 */ 5902 static int 5903 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5904 { 5905 vd_scsi_t *vd_scsi; 5906 mhioc_inresvs_t inresv; 5907 mhioc_resv_desc_list_t rlist; 5908 struct mhioc_inresvs32 inresv32; 5909 struct mhioc_resv_desc_list32 rlist32; 5910 mhioc_resv_desc_t mhd_resv; 5911 sd_prin_readresv_t *scsi_resv; 5912 sd_readresv_desc_t *resv; 5913 mhioc_resv_desc_t *user_resv; 5914 int vd_scsi_len; 5915 int listsize, listlen, i, rv; 5916 5917 /* copyin arguments */ 5918 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5919 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5920 if (rv != 0) 5921 return (EFAULT); 5922 5923 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5924 sizeof (rlist32), mode); 5925 if (rv != 0) 5926 return (EFAULT); 5927 5928 listsize = rlist32.listsize; 5929 } else { 5930 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5931 if (rv != 0) 5932 return (EFAULT); 5933 5934 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5935 if (rv != 0) 5936 return (EFAULT); 5937 5938 listsize = rlist.listsize; 5939 } 5940 5941 /* build SCSI VD_OP request */ 5942 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5943 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5944 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5945 5946 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5947 5948 /* submit the request */ 5949 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5950 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5951 5952 if (rv != 0) 5953 goto done; 5954 5955 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5956 5957 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5958 inresv32.generation = scsi_resv->generation; 5959 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5960 if (rv != 0) { 5961 rv = EFAULT; 5962 goto done; 5963 } 5964 5965 rlist32.listlen = listlen; 5966 rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, 5967 sizeof (rlist32), mode); 5968 if (rv != 0) { 5969 rv = EFAULT; 5970 goto done; 5971 } 5972 5973 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 5974 } else { 5975 inresv.generation = scsi_resv->generation; 5976 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 5977 if (rv != 0) { 5978 rv = EFAULT; 5979 goto done; 5980 } 5981 5982 rlist.listlen = listlen; 5983 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 5984 if (rv != 0) { 5985 rv = EFAULT; 5986 goto done; 5987 } 5988 5989 user_resv = rlist.list; 5990 } 5991 5992 /* copy out reservations */ 5993 if (listsize > 0 && listlen > 0) { 5994 if (listsize < listlen) 5995 listlen = listsize; 5996 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 5997 5998 for (i = 0; i < listlen; i++) { 5999 mhd_resv.type = resv->type; 6000 mhd_resv.scope = resv->scope; 6001 mhd_resv.scope_specific_addr = 6002 BE_32(resv->scope_specific_addr); 6003 bcopy(&resv->resvkey, &mhd_resv.key, 6004 MHIOC_RESV_KEY_SIZE); 6005 6006 rv = ddi_copyout(&mhd_resv, user_resv, 6007 sizeof (mhd_resv), mode); 6008 if (rv != 0) { 6009 rv = EFAULT; 6010 goto done; 6011 } 6012 resv++; 6013 user_resv++; 6014 } 6015 } 6016 6017 if (rv == 0) 6018 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6019 6020 done: 6021 kmem_free(vd_scsi, vd_scsi_len); 6022 return (rv); 6023 } 6024 6025 /* 6026 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 6027 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 6028 * server with a VD_OP_SCSICMD operation. 
6029 */ 6030 static int 6031 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 6032 { 6033 vd_scsi_t *vd_scsi; 6034 sd_prout_t *scsi_prout; 6035 mhioc_register_t mhd_reg; 6036 int vd_scsi_len, rv; 6037 6038 /* copyin arguments */ 6039 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 6040 if (rv != 0) 6041 return (EFAULT); 6042 6043 /* build SCSI VD_OP request */ 6044 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 6045 sizeof (sd_prout_t), &vd_scsi_len); 6046 6047 /* set parameters */ 6048 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6049 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6050 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 6051 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 6052 6053 /* submit the request */ 6054 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6055 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6056 6057 if (rv == 0) 6058 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6059 6060 kmem_free(vd_scsi, vd_scsi_len); 6061 return (rv); 6062 } 6063 6064 /* 6065 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 6066 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 6067 * server with a VD_OP_SCSICMD operation. 6068 */ 6069 static int 6070 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 6071 { 6072 union scsi_cdb *cdb; 6073 vd_scsi_t *vd_scsi; 6074 sd_prout_t *scsi_prout; 6075 mhioc_resv_desc_t mhd_resv; 6076 int vd_scsi_len, rv; 6077 6078 /* copyin arguments */ 6079 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 6080 if (rv != 0) 6081 return (EFAULT); 6082 6083 /* build SCSI VD_OP request */ 6084 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 6085 sizeof (sd_prout_t), &vd_scsi_len); 6086 6087 /* set parameters */ 6088 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6089 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6090 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6091 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 6092 cdb->cdb_opaque[2] = mhd_resv.type; 6093 6094 /* submit the request */ 6095 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6096 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6097 6098 if (rv == 0) 6099 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6100 6101 kmem_free(vd_scsi, vd_scsi_len); 6102 return (rv); 6103 } 6104 6105 /* 6106 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 6107 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 6108 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
static int
vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode)
{
	union scsi_cdb *cdb;
	vd_scsi_t *vd_scsi;
	sd_prout_t *scsi_prout;
	mhioc_preemptandabort_t mhd_preempt;
	int vd_scsi_len, rv;

	/* copyin arguments */
	rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode);
	if (rv != 0)
		return (EFAULT);

	/* build SCSI VD_OP request */
	vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT,
	    sizeof (sd_prout_t), &vd_scsi_len);

	/* set parameters */
	vd_scsi->task_attribute = VD_SCSI_TASK_ACA;
	cdb = VD_SCSI_DATA_CDB(vd_scsi);
	scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi);
	bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key,
	    MHIOC_RESV_KEY_SIZE);
	bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key,
	    MHIOC_RESV_KEY_SIZE);
	scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr;
	cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type;

	/* submit the request */
	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);

	if (rv == 0)
		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);

	kmem_free(vd_scsi, vd_scsi_len);
	return (rv);
}

/*
 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl
 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY
 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation.
 */
static int
vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode)
{
	vd_scsi_t *vd_scsi;
	sd_prout_t *scsi_prout;
	mhioc_registerandignorekey_t mhd_regi;
	int vd_scsi_len, rv;

	/* copyin arguments */
	rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode);
	if (rv != 0)
		return (EFAULT);

	/* build SCSI VD_OP request */
	vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY,
	    sizeof (sd_prout_t), &vd_scsi_len);

	/* set parameters */
	scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi);
	bcopy(mhd_regi.newkey.key, scsi_prout->service_key,
	    MHIOC_RESV_KEY_SIZE);
	scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl;

	/* submit the request */
	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);

	if (rv == 0)
		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);

	kmem_free(vd_scsi, vd_scsi_len);
	return (rv);
}

/*
 * This function is used by the failfast mechanism to send a SCSI command
 * to check for reservation conflict.
 */
static int
vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd)
{
	int cdb_len, sense_len, vd_scsi_len;
	vd_scsi_t *vd_scsi;
	union scsi_cdb *cdb;
	int rv;

	ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1);

	if (scmd == SCMD_WRITE_G1)
		cdb_len = CDB_GROUP1;
	else
		cdb_len = CDB_GROUP0;

	sense_len = sizeof (struct scsi_extended_sense);

	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len);

	/* set cdb */
	cdb = VD_SCSI_DATA_CDB(vd_scsi);
	cdb->scc_cmd = scmd;

	vd_scsi->timeout = vdc_scsi_timeout;
	/*
	 * Submit the request. The last argument has to be B_FALSE so that
	 * vdc_do_sync_op does not loop checking for reservation conflict if
	 * the operation returns an error.
	 */
	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE);

	if (rv == 0)
		(void) vdc_scsi_status(vdc, vd_scsi, B_FALSE);

	kmem_free(vd_scsi, vd_scsi_len);
	return (rv);
}

/*
 * This function is used by the failfast mechanism to check for reservation
 * conflict. It sends SCSI commands which will fail with a reservation
 * conflict error if the system does not have access to the disk; such a
 * failure panics the system.
 *
 * Returned Code:
 *	0	- disk is accessible without reservation conflict error
 *	!= 0	- unable to check if disk is accessible
 */
int
vdc_failfast_check_resv(vdc_t *vdc)
{
	int failure = 0;

	/*
	 * Send a TEST UNIT READY command. The command will panic
	 * the system if it fails with a reservation conflict.
	 */
	if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0)
		failure++;

	/*
	 * With SPC-3 compliant devices TEST UNIT READY will succeed on
	 * a reserved device, so we also do a zero-byte WRITE(10) in
	 * order to provoke a Reservation Conflict status on those newer
	 * devices.
	 */
	if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0)
		failure++;

	return (failure);
}

/*
 * Add a pending I/O to the failfast I/O queue. An I/O is added to this
 * queue when it has failed and failfast is enabled. Then we have to check
 * if it has failed because of a reservation conflict, in which case we
 * have to panic the system.
 *
 * Async I/O should be queued with their block I/O data transfer structure
 * (buf). Sync I/O should be queued with buf = NULL.
 */
static vdc_io_t *
vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf)
{
	vdc_io_t *vio;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP);
	vio->vio_next = vdc->failfast_io_queue;
	vio->vio_buf = buf;
	vio->vio_qtime = ddi_get_lbolt();

	vdc->failfast_io_queue = vio;

	/* notify the failfast thread that a new I/O is queued */
	cv_signal(&vdc->failfast_cv);

	return (vio);
}
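/*
 * Example (illustrative sketch, not driver code): the waiter side of the
 * protocol for a synchronous I/O queued with vdc_failfast_io_queue(). The
 * failfast thread marks a sync I/O (buf == NULL) as completed by clearing
 * vio_qtime and broadcasting failfast_io_cv, and the waiter is then
 * responsible for freeing the vio structure. This sketch assumes the
 * caller holds vdc->lock, as vdc_failfast_io_queue() requires.
 *
 *	vdc_io_t *vio = vdc_failfast_io_queue(vdc, NULL);
 *	while (vio->vio_qtime != 0)
 *		cv_wait(&vdc->failfast_io_cv, &vdc->lock);
 *	kmem_free(vio, sizeof (vdc_io_t));
 */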
/*
 * Remove and complete I/O in the failfast I/O queue which have been
 * queued at or before the indicated deadline. A deadline of 0 means
 * that all queued I/O have to be unqueued and marked as completed.
 */
static void
vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline)
{
	vdc_io_t *vio, *vio_tmp;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vio_tmp = NULL;
	vio = vdc->failfast_io_queue;

	if (deadline != 0) {
		/*
		 * Skip any io queued after the deadline. The failfast
		 * I/O queue is ordered starting with the last I/O added
		 * to the queue.
		 */
		while (vio != NULL && vio->vio_qtime > deadline) {
			vio_tmp = vio;
			vio = vio->vio_next;
		}
	}

	if (vio == NULL)
		/* nothing to unqueue */
		return;

	/* update the queue */
	if (vio_tmp == NULL)
		vdc->failfast_io_queue = NULL;
	else
		vio_tmp->vio_next = NULL;

	/*
	 * Complete unqueued I/O. Async I/O have a block I/O data transfer
	 * structure (buf) and they are completed by calling biodone(). Sync
	 * I/O do not have a buf and they are completed by setting the
	 * vio_qtime to zero and signaling failfast_io_cv. In that case, the
	 * thread waiting for the I/O to complete is responsible for freeing
	 * the vio structure.
	 */
	while (vio != NULL) {
		vio_tmp = vio->vio_next;
		if (vio->vio_buf != NULL) {
			VD_KSTAT_RUNQ_EXIT(vdc);
			DTRACE_IO1(done, buf_t *, vio->vio_buf);
			biodone(vio->vio_buf);
			kmem_free(vio, sizeof (vdc_io_t));
		} else {
			vio->vio_qtime = 0;
		}
		vio = vio_tmp;
	}

	cv_broadcast(&vdc->failfast_io_cv);
}

/*
 * Failfast Thread.
 *
 * While failfast is enabled, the failfast thread sends a TEST UNIT READY
 * and a zero-size WRITE(10) SCSI command on a regular basis to check that
 * we still have access to the disk. If a command fails with a RESERVATION
 * CONFLICT error then the system will immediately panic.
 *
 * The failfast thread is also woken up when an I/O has failed. It then
 * checks the access to the disk to ensure that the I/O failure was not
 * due to a reservation conflict.
 *
 * There is one failfast thread for each virtual disk for which failfast is
 * enabled. We could have only one thread sending requests for all disks but
 * this would need vdc to send asynchronous requests and to have callbacks to
 * process replies.
 */
static void
vdc_failfast_thread(void *arg)
{
	int status;
	vdc_t *vdc = (vdc_t *)arg;
	clock_t timeout, starttime;

	mutex_enter(&vdc->lock);

	while (vdc->failfast_interval != 0) {

		starttime = ddi_get_lbolt();

		mutex_exit(&vdc->lock);

		/* check for reservation conflict */
		status = vdc_failfast_check_resv(vdc);

		mutex_enter(&vdc->lock);
		/*
		 * We have dropped the lock to send the SCSI command so we have
		 * to check that failfast is still enabled.
		 */
		if (vdc->failfast_interval == 0)
			break;

		/*
		 * If we have successfully checked the disk access and there
		 * was no reservation conflict then we can complete any I/O
		 * queued before the last check.
		 */
		if (status == 0)
			vdc_failfast_io_unqueue(vdc, starttime);

		/* proceed again if some I/O are still in the queue */
		if (vdc->failfast_io_queue != NULL)
			continue;

		timeout = ddi_get_lbolt() +
		    drv_usectohz(vdc->failfast_interval);
		(void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout);
	}

	/*
	 * Failfast is being stopped so we can complete any queued I/O.
	 */
	vdc_failfast_io_unqueue(vdc, 0);
	vdc->failfast_thread = NULL;
	mutex_exit(&vdc->lock);
	thread_exit();
}
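/*
 * Example (illustrative sketch, not driver code): how a userland multihost
 * application arms the failfast mechanism through mhd(7I), which reaches
 * vdc_failfast() below. The argument is the probe interval in milliseconds
 * and 0 disables failfast; the device path is hypothetical.
 *
 *	int fd = open("/dev/rdsk/c0d0s2", O_RDWR);
 *	unsigned int mh_time = 1000;
 *	if (ioctl(fd, MHIOCENFAILFAST, &mh_time) != 0)
 *		perror("MHIOCENFAILFAST");
 */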
/*
 * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
 */
static int
vdc_failfast(vdc_t *vdc, caddr_t arg, int mode)
{
	unsigned int mh_time;

	if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode))
		return (EFAULT);

	mutex_enter(&vdc->lock);
	if (mh_time != 0 && vdc->failfast_thread == NULL) {
		vdc->failfast_thread = thread_create(NULL, 0,
		    vdc_failfast_thread, vdc, 0, &p0, TS_RUN,
		    v.v_maxsyspri - 2);
	}

	vdc->failfast_interval = mh_time * 1000;
	cv_signal(&vdc->failfast_cv);
	mutex_exit(&vdc->lock);

	return (0);
}

/*
 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are
 * converted to VD_OP_SET_ACCESS operations.
 */
static int
vdc_access_set(vdc_t *vdc, uint64_t flags, int mode)
{
	int rv;

	/* submit ownership command request */
	rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags,
	    sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
	    VIO_both_dir, B_TRUE);

	return (rv);
}

/*
 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a
 * VD_OP_GET_ACCESS operation.
 */
static int
vdc_access_get(vdc_t *vdc, uint64_t *status, int mode)
{
	int rv;

	/* submit ownership command request */
	rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status,
	    sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
	    VIO_both_dir, B_TRUE);

	return (rv);
}
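/*
 * Example (illustrative sketch, not driver code): userland use of the
 * access operations above, as defined by mhd(7I). MHIOCTKOWN reaches
 * vdc_access_set() and MHIOCSTATUS reaches vdc_access_get(); on success
 * MHIOCSTATUS returns 0 when this host has access to the disk and 1 when
 * it does not. The file descriptor is assumed to be an open vdisk device.
 *
 *	if (ioctl(fd, MHIOCTKOWN, NULL) != 0)
 *		perror("MHIOCTKOWN");
 *	int rv = ioctl(fd, MHIOCSTATUS, NULL);
 */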
/*
 * Disk Ownership Thread.
 *
 * When we have taken the ownership of a disk, this thread waits to be
 * notified when the LDC channel is reset so that it can recover the
 * ownership.
 *
 * Note that the thread handling the LDC reset (vdc_process_msg_thread())
 * cannot be used to do the ownership recovery because it has to be
 * running to handle the reply message to the ownership operation.
 */
static void
vdc_ownership_thread(void *arg)
{
	vdc_t *vdc = (vdc_t *)arg;
	clock_t timeout;
	uint64_t status;

	mutex_enter(&vdc->ownership_lock);
	mutex_enter(&vdc->lock);

	while (vdc->ownership & VDC_OWNERSHIP_WANTED) {

		if ((vdc->ownership & VDC_OWNERSHIP_RESET) ||
		    !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) {
			/*
			 * There was a reset so the ownership has been lost,
			 * try to recover. We do this without using the preempt
			 * option so that we don't steal the ownership from
			 * someone who has preempted us.
			 */
			DMSG(vdc, 0, "[%d] Ownership lost, recovering",
			    vdc->instance);

			vdc->ownership &= ~(VDC_OWNERSHIP_RESET |
			    VDC_OWNERSHIP_GRANTED);

			mutex_exit(&vdc->lock);

			status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
			    VD_ACCESS_SET_PRESERVE, FKIOCTL);

			mutex_enter(&vdc->lock);

			if (status == 0) {
				DMSG(vdc, 0, "[%d] Ownership recovered",
				    vdc->instance);
				vdc->ownership |= VDC_OWNERSHIP_GRANTED;
			} else {
				DMSG(vdc, 0, "[%d] Failed to recover ownership",
				    vdc->instance);
			}

		}

		/*
		 * If we have the ownership then we just wait for an event
		 * to happen (LDC reset), otherwise we will retry to recover
		 * after a delay.
		 */
		if (vdc->ownership & VDC_OWNERSHIP_GRANTED)
			timeout = 0;
		else
			timeout = ddi_get_lbolt() +
			    drv_usectohz(vdc_ownership_delay);

		/* Release the ownership_lock and wait on the vdc lock */
		mutex_exit(&vdc->ownership_lock);

		if (timeout == 0)
			(void) cv_wait(&vdc->ownership_cv, &vdc->lock);
		else
			(void) cv_timedwait(&vdc->ownership_cv,
			    &vdc->lock, timeout);

		mutex_exit(&vdc->lock);

		mutex_enter(&vdc->ownership_lock);
		mutex_enter(&vdc->lock);
	}

	vdc->ownership_thread = NULL;
	mutex_exit(&vdc->lock);
	mutex_exit(&vdc->ownership_lock);

	thread_exit();
}

static void
vdc_ownership_update(vdc_t *vdc, int ownership_flags)
{
	ASSERT(MUTEX_HELD(&vdc->ownership_lock));

	mutex_enter(&vdc->lock);
	vdc->ownership = ownership_flags;
	if ((vdc->ownership & VDC_OWNERSHIP_WANTED) &&
	    vdc->ownership_thread == NULL) {
		/* start ownership thread */
		vdc->ownership_thread = thread_create(NULL, 0,
		    vdc_ownership_thread, vdc, 0, &p0, TS_RUN,
		    v.v_maxsyspri - 2);
	} else {
		/* notify the ownership thread */
		cv_signal(&vdc->ownership_cv);
	}
	mutex_exit(&vdc->lock);
}

/*
 * Get the size and the block size of a virtual disk from the vdisk server.
 */
static int
vdc_get_capacity(vdc_t *vdc, size_t *dsk_size, size_t *blk_size)
{
	int rv = 0;
	size_t alloc_len;
	vd_capacity_t *vd_cap;

	ASSERT(MUTEX_NOT_HELD(&vdc->lock));

	alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t));

	vd_cap = kmem_zalloc(alloc_len, KM_SLEEP);

	rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE);

	*dsk_size = vd_cap->vdisk_size;
	*blk_size = vd_cap->vdisk_block_size;

	kmem_free(vd_cap, alloc_len);
	return (rv);
}

/*
 * Check the disk capacity. Disk size information is updated if size has
 * changed.
 *
 * Return 0 if the disk capacity is available, or non-zero if it is not.
 */
static int
vdc_check_capacity(vdc_t *vdc)
{
	size_t dsk_size, blk_size;
	int rv;

	/*
	 * If the vdisk does not support the VD_OP_GET_CAPACITY operation
	 * then the disk capacity has been retrieved during the handshake
	 * and there's nothing more to do here.
	 */
	if (!VD_OP_SUPPORTED(vdc->operations, VD_OP_GET_CAPACITY))
		return (0);

	if ((rv = vdc_get_capacity(vdc, &dsk_size, &blk_size)) != 0)
		return (rv);

	if (dsk_size == VD_SIZE_UNKNOWN || dsk_size == 0)
		return (EINVAL);

	mutex_enter(&vdc->lock);
	vdc_update_size(vdc, dsk_size, blk_size, vdc->max_xfer_sz);
	mutex_exit(&vdc->lock);

	return (0);
}

/*
 * This structure is used in the DKIO(7I) array below.
 */
typedef struct vdc_dk_ioctl {
	uint8_t		op;		/* VD_OP_XXX value */
	int		cmd;		/* Solaris ioctl operation number */
	size_t		nbytes;		/* size of structure to be copied */

	/* function to convert between vDisk and Solaris structure formats */
	int	(*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg,
	    int mode, int dir);
} vdc_dk_ioctl_t;

/*
 * Subset of DKIO(7I) operations currently supported
 */
static vdc_dk_ioctl_t	dk_ioctl[] = {
	{VD_OP_FLUSH,		DKIOCFLUSHWRITECACHE,	0,
		vdc_null_copy_func},
	{VD_OP_GET_WCE,		DKIOCGETWCE,		sizeof (int),
		vdc_get_wce_convert},
	{VD_OP_SET_WCE,		DKIOCSETWCE,		sizeof (int),
		vdc_set_wce_convert},
	{VD_OP_GET_VTOC,	DKIOCGVTOC,		sizeof (vd_vtoc_t),
		vdc_get_vtoc_convert},
	{VD_OP_SET_VTOC,	DKIOCSVTOC,		sizeof (vd_vtoc_t),
		vdc_set_vtoc_convert},
	{VD_OP_GET_VTOC,	DKIOCGEXTVTOC,		sizeof (vd_vtoc_t),
		vdc_get_extvtoc_convert},
	{VD_OP_SET_VTOC,	DKIOCSEXTVTOC,		sizeof (vd_vtoc_t),
		vdc_set_extvtoc_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCGGEOM,		sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCG_PHYGEOM,		sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_GET_DISKGEOM,	DKIOCG_VIRTGEOM,	sizeof (vd_geom_t),
		vdc_get_geom_convert},
	{VD_OP_SET_DISKGEOM,	DKIOCSGEOM,		sizeof (vd_geom_t),
		vdc_set_geom_convert},
	{VD_OP_GET_EFI,		DKIOCGETEFI,		0,
		vdc_get_efi_convert},
	{VD_OP_SET_EFI,		DKIOCSETEFI,		0,
		vdc_set_efi_convert},

	/* DIOCTL_RWCMD is converted to a read or a write */
	{0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL},

	/* mhd(7I) non-shared multihost disks ioctls */
	{0, MHIOCTKOWN,				0, vdc_null_copy_func},
	{0, MHIOCRELEASE,			0, vdc_null_copy_func},
	{0, MHIOCSTATUS,			0, vdc_null_copy_func},
	{0, MHIOCQRESERVE,			0, vdc_null_copy_func},

	/* mhd(7I) shared multihost disks ioctls */
	{0, MHIOCGRP_INKEYS,			0, vdc_null_copy_func},
	{0, MHIOCGRP_INRESV,			0, vdc_null_copy_func},
	{0, MHIOCGRP_REGISTER,			0, vdc_null_copy_func},
	{0, MHIOCGRP_RESERVE,			0, vdc_null_copy_func},
	{0, MHIOCGRP_PREEMPTANDABORT,		0, vdc_null_copy_func},
	{0, MHIOCGRP_REGISTERANDIGNOREKEY,	0, vdc_null_copy_func},

	/* mhd(7I) failfast ioctl */
	{0, MHIOCENFAILFAST,			0, vdc_null_copy_func},

	/*
	 * These particular ioctls are not sent to the server - vdc fakes up
	 * the necessary info.
	 */
	{0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func},
	{0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func},
	{0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func},
	{0, DKIOCPARTITION, 0, vdc_null_copy_func },
	{0, DKIOCGAPART, 0, vdc_null_copy_func },
	{0, DKIOCREMOVABLE, 0, vdc_null_copy_func},
	{0, CDROMREADOFFSET, 0, vdc_null_copy_func}
};

/*
 * This function handles ioctl requests from the vd_efi_alloc_and_read()
 * function and forwards them to the vdisk.
 */
static int
vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg)
{
	vdc_t *vdc = (vdc_t *)vdisk;
	dev_t dev;
	int rval;

	dev = makedevice(ddi_driver_major(vdc->dip),
	    VD_MAKE_DEV(vdc->instance, 0));

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval));
}

/*
 * Function:
 *	vd_process_ioctl()
 *
 * Description:
 *	This routine processes disk-specific ioctl calls
 *
 * Arguments:
 *	dev	- the device number
 *	cmd	- the operation [dkio(7I)] to be processed
 *	arg	- pointer to user provided structure
 *		  (contains data to be set or reference parameter for get)
 *	mode	- bit flag, indicating open settings, 32/64 bit type, etc
 *	rvalp	- pointer to return value for calling process.
 *
 * Return Code:
 *	0
 *	EFAULT
 *	ENXIO
 *	EIO
 *	ENOTSUP
 */
static int
vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp)
{
	int		instance = VDCUNIT(dev);
	vdc_t		*vdc = NULL;
	int		rv = -1;
	int		idx = 0;	/* index into dk_ioctl[] */
	size_t		len = 0;	/* #bytes to send to vds */
	size_t		alloc_len = 0;	/* #bytes to allocate mem for */
	caddr_t		mem_p = NULL;
	size_t		nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0]));
	vdc_dk_ioctl_t	*iop;

	vdc = ddi_get_soft_state(vdc_state, instance);
	if (vdc == NULL) {
		cmn_err(CE_NOTE, "![%d] Could not get soft state structure",
		    instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n",
	    instance, cmd, dev, ddi_model_convert_from(mode & FMODELS));

	if (rvalp != NULL) {
		/* the return value of the ioctl is 0 by default */
		*rvalp = 0;
	}

	/*
	 * Validate the ioctl operation to be performed.
	 *
	 * If we have looped through the array without finding a match then we
	 * don't support this ioctl.
	 */
	for (idx = 0; idx < nioctls; idx++) {
		if (cmd == dk_ioctl[idx].cmd)
			break;
	}

	if (idx >= nioctls) {
		DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n",
		    vdc->instance, cmd);
		return (ENOTSUP);
	}

	iop = &(dk_ioctl[idx]);

	if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) {
		/* size is not fixed for EFI ioctls, it depends on ioctl arg */
		dk_efi_t	dk_efi;

		rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length;
	} else {
		len = iop->nbytes;
	}

	/* check if the ioctl is applicable */
	switch (cmd) {
	case CDROMREADOFFSET:
	case DKIOCREMOVABLE:
		return (ENOTTY);

	case USCSICMD:
	case MHIOCTKOWN:
	case MHIOCSTATUS:
	case MHIOCQRESERVE:
	case MHIOCRELEASE:
	case MHIOCGRP_INKEYS:
	case MHIOCGRP_INRESV:
	case MHIOCGRP_REGISTER:
	case MHIOCGRP_RESERVE:
	case MHIOCGRP_PREEMPTANDABORT:
	case MHIOCGRP_REGISTERANDIGNOREKEY:
	case MHIOCENFAILFAST:
		if (vdc->cinfo == NULL)
			return (ENXIO);
		if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS)
			return (ENOTTY);
		break;

	case DIOCTL_RWCMD:
		if (vdc->cinfo == NULL)
			return (ENXIO);
		if (vdc->cinfo->dki_ctype != DKC_DIRECT)
			return (ENOTTY);
		break;

	case DKIOCINFO:
		if (vdc->cinfo == NULL)
			return (ENXIO);
		break;

	case DKIOCGMEDIAINFO:
		if (vdc->minfo == NULL)
			return (ENXIO);
		if (vdc_check_capacity(vdc) != 0)
			/* disk capacity is not available */
			return (EIO);
		break;
	}

	/*
	 * Deal with ioctls which require processing different from
	 * converting ioctl arguments and sending a corresponding
	 * VD operation.
	 */
	switch (cmd) {

	case USCSICMD:
	{
		return (vdc_uscsi_cmd(vdc, arg, mode));
	}

	case MHIOCTKOWN:
	{
		mutex_enter(&vdc->ownership_lock);
		/*
		 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership
		 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset
		 * while we are processing the ioctl.
		 */
		vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED);

		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
		    VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED |
			    VDC_OWNERSHIP_GRANTED);
		} else {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
		mutex_exit(&vdc->ownership_lock);
		return (rv);
	}

	case MHIOCRELEASE:
	{
		mutex_enter(&vdc->ownership_lock);
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
		mutex_exit(&vdc->ownership_lock);
		return (rv);
	}

	case MHIOCSTATUS:
	{
		uint64_t status;

		rv = vdc_access_get(vdc, &status, mode);
		if (rv == 0 && rvalp != NULL)
			*rvalp = (status & VD_ACCESS_ALLOWED)? 0 : 1;
		return (rv);
	}
	case MHIOCQRESERVE:
	{
		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode);
		return (rv);
	}

	case MHIOCGRP_INKEYS:
	{
		return (vdc_mhd_inkeys(vdc, arg, mode));
	}

	case MHIOCGRP_INRESV:
	{
		return (vdc_mhd_inresv(vdc, arg, mode));
	}

	case MHIOCGRP_REGISTER:
	{
		return (vdc_mhd_register(vdc, arg, mode));
	}

	case MHIOCGRP_RESERVE:
	{
		return (vdc_mhd_reserve(vdc, arg, mode));
	}

	case MHIOCGRP_PREEMPTANDABORT:
	{
		return (vdc_mhd_preemptabort(vdc, arg, mode));
	}

	case MHIOCGRP_REGISTERANDIGNOREKEY:
	{
		return (vdc_mhd_registerignore(vdc, arg, mode));
	}

	case MHIOCENFAILFAST:
	{
		rv = vdc_failfast(vdc, arg, mode);
		return (rv);
	}

	case DIOCTL_RWCMD:
	{
		return (vdc_dioctl_rwcmd(dev, arg, mode));
	}

	case DKIOCGAPART:
	{
		return (vdc_dkio_gapart(vdc, arg, mode));
	}

	case DKIOCPARTITION:
	{
		return (vdc_dkio_partition(vdc, arg, mode));
	}

	case DKIOCINFO:
	{
		struct dk_cinfo cinfo;

		bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo));
		cinfo.dki_partition = VDCPART(dev);

		rv = ddi_copyout(&cinfo, (void *)arg,
		    sizeof (struct dk_cinfo), mode);
		if (rv != 0)
			return (EFAULT);

		return (0);
	}

	case DKIOCGMEDIAINFO:
	{
		ASSERT(vdc->vdisk_size != 0);
		ASSERT(vdc->minfo->dki_capacity != 0);
		rv = ddi_copyout(vdc->minfo, (void *)arg,
		    sizeof (struct dk_minfo), mode);
		if (rv != 0)
			return (EFAULT);

		return (0);
	}

	case DKIOCFLUSHWRITECACHE:
	{
		struct dk_callback *dkc =
		    (struct dk_callback *)(uintptr_t)arg;
		vdc_dk_arg_t	*dkarg = NULL;

		DMSG(vdc, 1, "[%d] Flush W$: mode %x\n",
		    instance, mode);

		/*
		 * If arg is NULL, then there is no callback function
		 * registered and the call operates synchronously; we
		 * break and continue with the rest of the function and
		 * wait for vds to return (i.e. after the request to
		 * vds returns successfully, all writes completed prior
		 * to the ioctl will have been flushed from the disk
		 * write cache to persistent media).
		 *
		 * If a callback function is registered, we dispatch
		 * the request on a task queue and return immediately.
		 * The callback will deal with informing the calling
		 * thread that the flush request is completed.
		 */
		if (dkc == NULL)
			break;

		/*
		 * the asynchronous callback is only supported if
		 * invoked from within the kernel
		 */
		if ((mode & FKIOCTL) == 0)
			return (ENOTSUP);

		dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP);

		dkarg->mode = mode;
		dkarg->dev = dev;
		bcopy(dkc, &dkarg->dkc, sizeof (*dkc));

		mutex_enter(&vdc->lock);
		vdc->dkio_flush_pending++;
		dkarg->vdc = vdc;
		mutex_exit(&vdc->lock);

		/* put the request on a task queue */
		rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb,
		    (void *)dkarg, DDI_SLEEP);
		if (rv == NULL) {
			/* clean up if dispatch fails */
			mutex_enter(&vdc->lock);
			vdc->dkio_flush_pending--;
			mutex_exit(&vdc->lock);
			kmem_free(dkarg, sizeof (vdc_dk_arg_t));
		}

		return (rv == NULL ? ENOMEM : 0);
	}
	}

	/* catch programming error in vdc - should be a VD_OP_XXX ioctl */
	ASSERT(iop->op != 0);

	/* check if the vDisk server handles the operation for this vDisk */
	if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) {
		DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n",
		    vdc->instance, iop->op);
		return (ENOTSUP);
	}

	/* LDC requires that the memory being mapped is 8-byte aligned */
	alloc_len = P2ROUNDUP(len, sizeof (uint64_t));
	DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n",
	    instance, len, alloc_len);

	if (alloc_len > 0)
		mem_p = kmem_zalloc(alloc_len, KM_SLEEP);

	/*
	 * Call the conversion function for this ioctl which, if necessary,
	 * converts from the Solaris format to the format ARC'ed
	 * as part of the vDisk protocol (FWARC 2006/195)
	 */
	ASSERT(iop->convert != NULL);
	rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN);
	if (rv != 0) {
		DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
		    instance, rv, cmd);
		if (mem_p != NULL)
			kmem_free(mem_p, alloc_len);
		return (rv);
	}

	/*
	 * send request to vds to service the ioctl.
	 */
	rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len,
	    VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode,
	    VIO_both_dir, B_TRUE);

	if (rv != 0) {
		/*
		 * This is not necessarily an error. The ioctl could
		 * be returning a value such as ENOTTY to indicate
		 * that the ioctl is not applicable.
		 */
		DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n",
		    instance, rv, cmd);
		if (mem_p != NULL)
			kmem_free(mem_p, alloc_len);

		return (rv);
	}

	/*
	 * Call the conversion function (if it exists) for this ioctl
	 * which converts from the format ARC'ed as part of the vDisk
	 * protocol (FWARC 2006/195) back to a format understood by
	 * the rest of Solaris.
	 */
	rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT);
	if (rv != 0) {
		DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
		    instance, rv, cmd);
		if (mem_p != NULL)
			kmem_free(mem_p, alloc_len);
		return (rv);
	}

	if (mem_p != NULL)
		kmem_free(mem_p, alloc_len);

	return (rv);
}

/*
 * Function:
 *	vdc_null_copy_func()
 *
 * Description:
 *	This is an empty conversion function used by ioctl calls which
 *	do not need to convert the data being passed in/out to userland
 */
static int
vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))
	_NOTE(ARGUNUSED(from))
	_NOTE(ARGUNUSED(to))
	_NOTE(ARGUNUSED(mode))
	_NOTE(ARGUNUSED(dir))

	return (0);
}

static int
vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	if (dir == VD_COPYIN)
		return (0);	/* nothing to do */

	if (ddi_copyout(from, to, sizeof (int), mode) != 0)
		return (EFAULT);

	return (0);
}

static int
vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	if (dir == VD_COPYOUT)
		return (0);	/* nothing to do */

	if (ddi_copyin(from, to, sizeof (int), mode) != 0)
		return (EFAULT);

	return (0);
}
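/*
 * Example (illustrative sketch, not driver code): the calling convention
 * that every convert function referenced by dk_ioctl[] follows.
 * vd_process_ioctl() invokes the function twice: first with dir ==
 * VD_COPYIN to convert the Solaris ioctl argument into the vDisk format
 * before the request is sent, then with dir == VD_COPYOUT to convert the
 * reply back. The function name here is hypothetical.
 *
 *	static int
 *	my_ioctl_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
 *	{
 *		if (dir == VD_COPYIN) {
 *			(ddi_copyin() the user argument "from" and store
 *			its vDisk form into the kernel buffer "to")
 *		} else {
 *			(read the vDisk reply from "from" and
 *			ddi_copyout() it to the user buffer "to")
 *		}
 *		return (0);	(or EFAULT/ENXIO on failure)
 *	}
 */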
/*
 * Function:
 *	vdc_get_vtoc_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCGVTOC
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 *	In the struct vtoc definition, the timestamp field is marked as not
 *	supported so it is not part of the vDisk protocol (FWARC 2006/195).
 *	However SVM uses that field to check that it can write into the VTOC,
 *	so we fake up the info of that field.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- the buffer containing the data to be copied from
 *	to	- the buffer to be copied to
 *	mode	- flags passed to ioctl() call
 *	dir	- the "direction" of the copy - VD_COPYIN or VD_COPYOUT
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- incorrect buffer passed in.
 *	EFAULT	- ddi_copyout routine encountered an error.
 */
static int
vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	int i;
	struct vtoc vtoc;
	struct vtoc32 vtoc32;
	struct extvtoc evtoc;
	int rv;

	if (dir != VD_COPYOUT)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (vdc->vdisk_size > VD_OLDVTOC_LIMIT)
		return (EOVERFLOW);

	VD_VTOC2VTOC((vd_vtoc_t *)from, &evtoc);

	/* fake the VTOC timestamp field */
	for (i = 0; i < V_NUMPAR; i++) {
		evtoc.timestamp[i] = vdc->vtoc->timestamp[i];
	}

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		/* LINTED E_ASSIGN_NARROW_CONV */
		extvtoctovtoc32(evtoc, vtoc32);
		rv = ddi_copyout(&vtoc32, to, sizeof (vtoc32), mode);
		if (rv != 0)
			rv = EFAULT;
	} else {
		extvtoctovtoc(evtoc, vtoc);
		rv = ddi_copyout(&vtoc, to, sizeof (vtoc), mode);
		if (rv != 0)
			rv = EFAULT;
	}

	return (rv);
}
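/*
 * Example (illustrative sketch, not driver code): the userland view of the
 * conversion above. A 32-bit caller of DKIOCGVTOC receives a struct vtoc32
 * while a 64-bit caller receives a struct vtoc, which is why the convert
 * function checks the data model with ddi_model_convert_from(). The file
 * descriptor is assumed to be an open vdisk slice.
 *
 *	struct vtoc vtoc;
 *	if (ioctl(fd, DKIOCGVTOC, &vtoc) == 0)
 *		(void) printf("slice 0: start %lu size %lu\n",
 *		    (ulong_t)vtoc.v_part[0].p_start,
 *		    (ulong_t)vtoc.v_part[0].p_size);
 */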
/*
 * Function:
 *	vdc_set_vtoc_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCSVTOC
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyin of data failed
 */
static int
vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	void *uvtoc;
	struct vtoc vtoc;
	struct vtoc32 vtoc32;
	struct extvtoc evtoc;
	int i, rv;

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (vdc->vdisk_size > VD_OLDVTOC_LIMIT)
		return (EOVERFLOW);

	uvtoc = (dir == VD_COPYIN)? from : to;

	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		rv = ddi_copyin(uvtoc, &vtoc32, sizeof (vtoc32), mode);
		if (rv != 0)
			return (EFAULT);
		vtoc32toextvtoc(vtoc32, evtoc);
	} else {
		rv = ddi_copyin(uvtoc, &vtoc, sizeof (vtoc), mode);
		if (rv != 0)
			return (EFAULT);
		vtoctoextvtoc(vtoc, evtoc);
	}

	if (dir == VD_COPYOUT) {
		/*
		 * The disk label may have changed. Revalidate the disk
		 * geometry. This will also update the device nodes.
		 */
		vdc_validate(vdc);

		/*
		 * We also need to keep track of the timestamp fields.
		 */
		for (i = 0; i < V_NUMPAR; i++) {
			vdc->vtoc->timestamp[i] = evtoc.timestamp[i];
		}

	} else {
		VTOC2VD_VTOC(&evtoc, (vd_vtoc_t *)to);
	}

	return (0);
}

static int
vdc_get_extvtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	int i, rv;
	struct extvtoc evtoc;

	if (dir != VD_COPYOUT)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	VD_VTOC2VTOC((vd_vtoc_t *)from, &evtoc);

	/* fake the VTOC timestamp field */
	for (i = 0; i < V_NUMPAR; i++) {
		evtoc.timestamp[i] = vdc->vtoc->timestamp[i];
	}

	rv = ddi_copyout(&evtoc, to, sizeof (struct extvtoc), mode);
	if (rv != 0)
		rv = EFAULT;

	return (rv);
}

static int
vdc_set_extvtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	void *uvtoc;
	struct extvtoc evtoc;
	int i, rv;

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	uvtoc = (dir == VD_COPYIN)? from : to;

	rv = ddi_copyin(uvtoc, &evtoc, sizeof (struct extvtoc), mode);
	if (rv != 0)
		return (EFAULT);

	if (dir == VD_COPYOUT) {
		/*
		 * The disk label may have changed. Revalidate the disk
		 * geometry. This will also update the device nodes.
		 */
		vdc_validate(vdc);

		/*
		 * We also need to keep track of the timestamp fields.
		 */
		for (i = 0; i < V_NUMPAR; i++) {
			vdc->vtoc->timestamp[i] = evtoc.timestamp[i];
		}

	} else {
		VTOC2VD_VTOC(&evtoc, (vd_vtoc_t *)to);
	}

	return (0);
}

/*
 * Function:
 *	vdc_get_geom_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCGGEOM,
 *	DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format
 *	defined in FWARC 2006/195
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyout of data failed
 */
static int
vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	struct dk_geom geom;
	int copy_len = sizeof (struct dk_geom);
	int rv = 0;

	if (dir != VD_COPYOUT)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom);
	rv = ddi_copyout(&geom, to, copy_len, mode);
	if (rv != 0)
		rv = EFAULT;

	return (rv);
}
/*
 * Function:
 *	vdc_set_geom_convert()
 *
 * Description:
 *	This routine performs the necessary conversions from the DKIOCSGEOM
 *	Solaris structure to the format defined in FWARC 2006/195.
 *
 * Arguments:
 *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
 *	dir	- direction of copy (in or out)
 *
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
 *	EFAULT	- ddi_copyin of data failed
 */
static int
vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	vd_geom_t vdgeom;
	void *tmp_mem = NULL;
	int copy_len = sizeof (struct dk_geom);
	int rv = 0;

	if (dir != VD_COPYIN)
		return (0);	/* nothing to do */

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);

	rv = ddi_copyin(from, tmp_mem, copy_len, mode);
	if (rv != 0) {
		kmem_free(tmp_mem, copy_len);
		return (EFAULT);
	}
	DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom);
	bcopy(&vdgeom, to, sizeof (vdgeom));
	kmem_free(tmp_mem, copy_len);

	return (0);
}

static int
vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	vd_efi_t *vd_efi;
	dk_efi_t dk_efi;
	int rv = 0;
	void *uaddr;

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (dir == VD_COPYIN) {

		vd_efi = (vd_efi_t *)to;

		rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		vd_efi->lba = dk_efi.dki_lba;
		vd_efi->length = dk_efi.dki_length;
		bzero(vd_efi->data, vd_efi->length);

	} else {

		rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode);
		if (rv != 0)
			return (EFAULT);

		uaddr = dk_efi.dki_data;

		dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);

		VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi);

		rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length,
		    mode);
		if (rv != 0)
			rv = EFAULT;

		/* free the staging buffer even if the copyout failed */
		kmem_free(dk_efi.dki_data, dk_efi.dki_length);
	}

	return (rv);
}
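/*
 * Example (illustrative sketch, not driver code): the userland contract
 * served by the DKIOCGETEFI conversion above. The caller supplies both the
 * buffer (dki_data) and its size (dki_length); the driver saves the user
 * pointer and stages the data through a kernel buffer, as done in the
 * VD_COPYOUT branch. Variable names are hypothetical.
 *
 *	dk_efi_t efi;
 *	char buf[512];
 *
 *	efi.dki_lba = 1;	(LBA of the GPT header)
 *	efi.dki_length = sizeof (buf);
 *	efi.dki_data = (efi_gpt_t *)buf;
 *	if (ioctl(fd, DKIOCGETEFI, &efi) != 0)
 *		perror("DKIOCGETEFI");
 */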
static int
vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	dk_efi_t dk_efi;
	void *uaddr;

	if (dir == VD_COPYOUT) {
		/*
		 * The disk label may have changed. Revalidate the disk
		 * geometry. This will also update the device nodes.
		 */
		vdc_validate(vdc);
		return (0);
	}

	if ((from == NULL) || (to == NULL))
		return (ENXIO);

	if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0)
		return (EFAULT);

	uaddr = dk_efi.dki_data;

	dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);

	if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) {
		/* do not leak the staging buffer on error */
		kmem_free(dk_efi.dki_data, dk_efi.dki_length);
		return (EFAULT);
	}

	DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to);

	kmem_free(dk_efi.dki_data, dk_efi.dki_length);

	return (0);
}


/* -------------------------------------------------------------------------- */

/*
 * Function:
 *	vdc_create_fake_geometry()
 *
 * Description:
 *	This routine fakes up the disk info needed for some DKIO ioctls such
 *	as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do]
 *
 *	Note: This function must not be called until the vDisk attributes have
 *	been exchanged as part of the handshake with the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	none.
 */
static void
vdc_create_fake_geometry(vdc_t *vdc)
{
	ASSERT(vdc != NULL);
	ASSERT(vdc->max_xfer_sz != 0);

	/*
	 * DKIOCINFO support
	 */
	if (vdc->cinfo == NULL)
		vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP);

	(void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME);
	(void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME);
	/* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */
	vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz;

	/*
	 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD
	 * operation is supported, otherwise the controller type is DKC_DIRECT.
	 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the
	 * controller type is always DKC_DIRECT in that case.
	 *
	 * If the virtual disk is backed by a physical CD/DVD device or
	 * an ISO image, modify the controller type to indicate this.
	 */
	switch (vdc->vdisk_media) {
	case VD_MEDIA_CD:
	case VD_MEDIA_DVD:
		vdc->cinfo->dki_ctype = DKC_CDROM;
		break;
	case VD_MEDIA_FIXED:
		if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD))
			vdc->cinfo->dki_ctype = DKC_SCSI_CCS;
		else
			vdc->cinfo->dki_ctype = DKC_DIRECT;
		break;
	default:
		/* in the case of v1.0 we default to a fixed disk */
		vdc->cinfo->dki_ctype = DKC_DIRECT;
		break;
	}
	vdc->cinfo->dki_flags = DKI_FMTVOL;
	vdc->cinfo->dki_cnum = 0;
	vdc->cinfo->dki_addr = 0;
	vdc->cinfo->dki_space = 0;
	vdc->cinfo->dki_prio = 0;
	vdc->cinfo->dki_vec = 0;
	vdc->cinfo->dki_unit = vdc->instance;
	vdc->cinfo->dki_slave = 0;
	/*
	 * The partition number will be created on the fly depending on the
	 * actual slice (i.e. minor node) that is used to request the data.
	 */
	vdc->cinfo->dki_partition = 0;

	/*
	 * DKIOCGMEDIAINFO support
	 */
	if (vdc->minfo == NULL)
		vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	if (vio_ver_is_supported(vdc->ver, 1, 1)) {
		vdc->minfo->dki_media_type =
		    VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media);
	} else {
		vdc->minfo->dki_media_type = DK_FIXED_DISK;
	}

	vdc->minfo->dki_capacity = vdc->vdisk_size;
	vdc->minfo->dki_lbsize = vdc->block_size;
}

static ushort_t
vdc_lbl2cksum(struct dk_label *label)
{
	int	count;
	ushort_t sum, *sp;

	count = (sizeof (struct dk_label)) / (sizeof (short)) - 1;
	sp = (ushort_t *)label;
	sum = 0;
	while (count--) {
		sum ^= *sp++;
	}

	return (sum);
}

static void
vdc_update_size(vdc_t *vdc, size_t dsk_size, size_t blk_size, size_t xfr_size)
{
	vd_err_stats_t *stp;

	ASSERT(MUTEX_HELD(&vdc->lock));
	ASSERT(xfr_size != 0);

	/*
	 * If the disk size is unknown or sizes are unchanged then don't
	 * update anything.
	 */
	if (dsk_size == VD_SIZE_UNKNOWN || dsk_size == 0 ||
	    (blk_size == vdc->block_size && dsk_size == vdc->vdisk_size &&
	    xfr_size == vdc->max_xfer_sz))
		return;

	/*
	 * We don't know at compile time what the vDisk server will think
	 * are good values but we apply a large (arbitrary) upper bound to
	 * prevent memory exhaustion in vdc if it was allocating a DRing
	 * based on huge values sent by the server. We probably will never
	 * exceed this except if the message was garbage.
	 */
	if ((xfr_size * blk_size) > (PAGESIZE * DEV_BSIZE)) {
		DMSG(vdc, 0, "[%d] vds block transfer size too big;"
		    " using max supported by vdc", vdc->instance);
		xfr_size = maxphys / DEV_BSIZE;
		dsk_size = (dsk_size * blk_size) / DEV_BSIZE;
		blk_size = DEV_BSIZE;
	}

	vdc->max_xfer_sz = xfr_size;
	vdc->block_size = blk_size;
	vdc->vdisk_size = dsk_size;

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	stp->vd_capacity.value.ui64 = dsk_size * blk_size;

	vdc->minfo->dki_capacity = dsk_size;
	vdc->minfo->dki_lbsize = (uint_t)blk_size;
}
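/*
 * Example (illustrative sketch, not driver code): vdc_lbl2cksum() above
 * XORs every 16-bit word of the label except the last one, which holds
 * dkl_cksum itself. A label read from disk is therefore considered valid
 * when the stored checksum matches the recomputed one, which is exactly
 * the check made by vdc_validate_geometry() below:
 *
 *	if (label.dkl_magic == DKL_MAGIC &&
 *	    label.dkl_cksum == vdc_lbl2cksum(&label))
 *		(the VTOC label is intact)
 */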
/*
 * Function:
 *	vdc_validate_geometry
 *
 * Description:
 *	This routine discovers the label and geometry of the disk. It stores
 *	the disk label and related information in the vdc structure. If it
 *	fails to validate the geometry or to discover the disk label then
 *	the label is marked as unknown (VD_DISK_LABEL_UNK).
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- success.
 *	EINVAL	- unknown disk label.
 *	ENOTSUP	- geometry not applicable (EFI label).
 *	EIO	- error accessing the disk.
 */
static int
vdc_validate_geometry(vdc_t *vdc)
{
	buf_t	*buf;	/* BREAD requests need to be in a buf_t structure */
	dev_t	dev;
	int	rv, rval;
	struct dk_label label;
	struct dk_geom geom;
	struct extvtoc vtoc;
	efi_gpt_t *gpt;
	efi_gpe_t *gpe;
	vd_efi_dev_t edev;

	ASSERT(vdc != NULL);
	ASSERT(vdc->vtoc != NULL && vdc->geom != NULL);
	ASSERT(MUTEX_HELD(&vdc->lock));

	mutex_exit(&vdc->lock);
	/*
	 * Check the disk capacity in case it has changed. If that fails then
	 * we proceed and we will be using the disk size we currently have.
	 */
	(void) vdc_check_capacity(vdc);
	dev = makedevice(ddi_driver_major(vdc->dip),
	    VD_MAKE_DEV(vdc->instance, 0));

	rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval);
	if (rv == 0)
		rv = vd_process_ioctl(dev, DKIOCGEXTVTOC, (caddr_t)&vtoc,
		    FKIOCTL, &rval);

	if (rv == ENOTSUP) {
		/*
		 * If the device does not support VTOC then we try
		 * to read an EFI label.
		 *
		 * We need to know the block size and the disk size to
		 * be able to read an EFI label.
		 */
		if (vdc->vdisk_size == 0) {
			mutex_enter(&vdc->lock);
			vdc_store_label_unk(vdc);
			return (EIO);
		}

		VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl);

		rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe);

		if (rv) {
			DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)",
			    vdc->instance, rv);
			mutex_enter(&vdc->lock);
			vdc_store_label_unk(vdc);
			return (EIO);
		}

		mutex_enter(&vdc->lock);
		vdc_store_label_efi(vdc, gpt, gpe);
		vd_efi_free(&edev, gpt, gpe);
		return (ENOTSUP);
	}

	if (rv != 0) {
		DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)",
		    vdc->instance, rv);
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		if (rv != EINVAL)
			rv = EIO;
		return (rv);
	}

	/* check that geometry and vtoc are valid */
	if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 ||
	    vtoc.v_sanity != VTOC_SANE) {
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	/*
	 * We have a disk and a valid VTOC. However, this does not mean
	 * that the disk currently has a VTOC label. The returned VTOC may
	 * be a default VTOC to be used for configuring the disk (this is
	 * what is done for disk images). So we read the label from the
	 * beginning of the disk to ensure we really have a VTOC label.
	 *
	 * FUTURE: This could be the default way for reading the VTOC
	 * from the disk as opposed to sending the VD_OP_GET_VTOC
	 * to the server. This will be the default if vdc is implemented
	 * on top of cmlb.
	 */

	/*
	 * A single-slice disk does not support reads using an absolute disk
	 * offset so we just rely on the DKIOCGVTOC ioctl in that case.
	 */
	if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) {
		mutex_enter(&vdc->lock);
		if (vtoc.v_nparts != 1) {
			vdc_store_label_unk(vdc);
			return (EINVAL);
		}
		vdc_store_label_vtoc(vdc, &geom, &vtoc);
		return (0);
	}

	if (vtoc.v_nparts != V_NUMPAR) {
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	/*
	 * Most CD/DVDs do not have a disk label and the label is
	 * generated by the disk driver. So the on-disk label check
	 * below may fail and we return now to avoid this problem.
	 */
	if (vdc->vdisk_media == VD_MEDIA_CD ||
	    vdc->vdisk_media == VD_MEDIA_DVD) {
		mutex_enter(&vdc->lock);
		vdc_store_label_vtoc(vdc, &geom, &vtoc);
		return (0);
	}

	/*
	 * Read disk label from start of disk
	 */
	buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
	bioinit(buf);
	buf->b_un.b_addr = (caddr_t)&label;
	buf->b_bcount = DK_LABEL_SIZE;
	buf->b_flags = B_BUSY | B_READ;
	buf->b_dev = cmpdev(dev);
	rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label,
	    DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir);
	if (rv) {
		DMSG(vdc, 1, "[%d] Failed to read disk block 0\n",
		    vdc->instance);
	} else if (ddi_in_panic()) {
		rv = vdc_drain_response(vdc, CB_STRATEGY, buf);
		if (rv == 0) {
			rv = geterror(buf);
		}
	} else {
		rv = biowait(buf);
	}
	biofini(buf);
	kmem_free(buf, sizeof (buf_t));

	if (rv != 0 || label.dkl_magic != DKL_MAGIC ||
	    label.dkl_cksum != vdc_lbl2cksum(&label)) {
		DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n",
		    vdc->instance);
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	mutex_enter(&vdc->lock);
	vdc_store_label_vtoc(vdc, &geom, &vtoc);
	return (0);
}

/*
 * Function:
 *	vdc_validate
 *
 * Description:
 *	This routine discovers the label of the disk and creates the
 *	appropriate device nodes if the label has changed.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	none.
 */
static void
vdc_validate(vdc_t *vdc)
{
	vd_disk_label_t old_label;
	vd_slice_t old_slice[V_NUMPAR];
	int rv;

	ASSERT(!MUTEX_HELD(&vdc->lock));

	mutex_enter(&vdc->lock);

	/* save the current label and vtoc */
	old_label = vdc->vdisk_label;
	bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR);

	/* check the geometry */
	(void) vdc_validate_geometry(vdc);

	/* if the disk label has changed, update device nodes */
	if (vdc->vdisk_label != old_label) {

		if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
			rv = vdc_create_device_nodes_efi(vdc);
		else
			rv = vdc_create_device_nodes_vtoc(vdc);

		if (rv != 0) {
			DMSG(vdc, 0, "![%d] Failed to update device nodes",
			    vdc->instance);
		}
	}

	mutex_exit(&vdc->lock);
}

static void
vdc_validate_task(void *arg)
{
	vdc_t *vdc = (vdc_t *)arg;

	vdc_validate(vdc);

	mutex_enter(&vdc->lock);
	ASSERT(vdc->validate_pending > 0);
	vdc->validate_pending--;
	mutex_exit(&vdc->lock);
}
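/*
 * Example (illustrative sketch, assuming the dispatch pattern suggested by
 * the ASSERT in vdc_validate_task(); this is not a verbatim excerpt of the
 * driver): label revalidation is meant to run from a task queue so that it
 * does not block the caller, with validate_pending counting in-flight
 * tasks.
 *
 *	mutex_enter(&vdc->lock);
 *	vdc->validate_pending++;
 *	mutex_exit(&vdc->lock);
 *	(void) taskq_dispatch(system_taskq, vdc_validate_task,
 *	    (void *)vdc, TQ_SLEEP);
 */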
/*
 * Function:
 *	vdc_setup_devid()
 *
 * Description:
 *	This routine discovers the devid of a vDisk. It requests the devid of
 *	the underlying device from the vDisk server, builds an encapsulated
 *	devid based on the retrieved devid and registers that new devid to
 *	the vDisk.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- A devid was successfully registered for the vDisk
 */
static int
vdc_setup_devid(vdc_t *vdc)
{
	int rv;
	vd_devid_t *vd_devid;
	size_t bufsize, bufid_len;

	/*
	 * At first sight, we don't know the size of the devid that the
	 * server will return but this size will be encoded into the
	 * reply. So we do a first request using a default size, then we
	 * check if this size was large enough. If not then we do a second
	 * request with the correct size returned by the server. Note that
	 * ldc requires size to be 8-byte aligned.
	 */
	bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN),
	    sizeof (uint64_t));
	vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
	bufid_len = bufsize - sizeof (vd_efi_t) - 1;

	rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
	    bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);

	DMSG(vdc, 2, "sync_op returned %d\n", rv);

	if (rv) {
		kmem_free(vd_devid, bufsize);
		return (rv);
	}

	if (vd_devid->length > bufid_len) {
		size_t new_bufsize;

		/*
		 * The returned devid is larger than the buffer used. Try
		 * again with a buffer of the right size. Compute the new
		 * size before freeing the old buffer so we do not read
		 * freed memory.
		 */
		new_bufsize = P2ROUNDUP(VD_DEVID_SIZE(vd_devid->length),
		    sizeof (uint64_t));
		kmem_free(vd_devid, bufsize);
		bufsize = new_bufsize;
		vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
		bufid_len = bufsize - sizeof (vd_efi_t) - 1;

		rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID,
		    (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0,
		    VIO_both_dir, B_TRUE);

		if (rv) {
			kmem_free(vd_devid, bufsize);
			return (rv);
		}
	}

	/*
	 * The virtual disk should have the same device id as the one
	 * associated with the physical disk it is mapped on, otherwise
	 * sharing a disk between an LDom and a non-LDom may not work (for
	 * example for a shared SVM disk set).
	 *
	 * The DDI framework does not allow creating a device id with any
	 * type so we first create a device id of type DEVID_ENCAP and then
	 * we restore the original type of the physical device.
	 */
	DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length);

	/* build an encapsulated devid based on the returned devid */
	if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length,
	    vd_devid->id, &vdc->devid) != DDI_SUCCESS) {
		DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance);
		kmem_free(vd_devid, bufsize);
		return (1);
	}

	DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type);

	ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS);

	kmem_free(vd_devid, bufsize);

	if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) {
		DMSG(vdc, 1, "[%d] Failed to register devid\n", vdc->instance);
		return (1);
	}

	return (0);
}

static void
vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe)
{
	int i, nparts;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vdc->vdisk_label = VD_DISK_LABEL_EFI;
	bzero(vdc->vtoc, sizeof (struct extvtoc));
	bzero(vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);

	nparts = gpt->efi_gpt_NumberOfPartitionEntries;

	for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) {

		if (gpe[i].efi_gpe_StartingLBA == 0 &&
		    gpe[i].efi_gpe_EndingLBA == 0) {
			continue;
		}

		vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA;
		vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA -
		    gpe[i].efi_gpe_StartingLBA + 1;
	}

	ASSERT(vdc->vdisk_size != 0);
	vdc->slice[VD_EFI_WD_SLICE].start = 0;
	vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size;
}

static void
vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct extvtoc *vtoc)
{
	int i;

	ASSERT(MUTEX_HELD(&vdc->lock));
	ASSERT(vdc->block_size == vtoc->v_sectorsz);

	vdc->vdisk_label = VD_DISK_LABEL_VTOC;
	bcopy(vtoc, vdc->vtoc, sizeof (struct extvtoc));
	bcopy(geom, vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);

	for (i = 0; i < vtoc->v_nparts; i++) {
		vdc->slice[i].start = vtoc->v_part[i].p_start;
		vdc->slice[i].nblocks = vtoc->v_part[i].p_size;
	}
}

static void
vdc_store_label_unk(vdc_t *vdc)
{
	ASSERT(MUTEX_HELD(&vdc->lock));

	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	bzero(vdc->vtoc, sizeof (struct extvtoc));
	bzero(vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);
}