1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * LDoms virtual disk client (vdc) device driver 31 * 32 * This driver runs on a guest logical domain and communicates with the virtual 33 * disk server (vds) driver running on the service domain which is exporting 34 * virtualized "disks" to the guest logical domain. 35 * 36 * The driver can be divided into four sections: 37 * 38 * 1) generic device driver housekeeping 39 * _init, _fini, attach, detach, ops structures, etc. 40 * 41 * 2) communication channel setup 42 * Setup the communications link over the LDC channel that vdc uses to 43 * talk to the vDisk server. Initialise the descriptor ring which 44 * allows the LDC clients to transfer data via memory mappings. 45 * 46 * 3) Support exported to upper layers (filesystems, etc) 47 * The upper layers call into vdc via strategy(9E) and DKIO(7I) 48 * ioctl calls. 
vdc will copy the data to be written to the descriptor 49 * ring or maps the buffer to store the data read by the vDisk 50 * server into the descriptor ring. It then sends a message to the 51 * vDisk server requesting it to complete the operation. 52 * 53 * 4) Handling responses from vDisk server. 54 * The vDisk server will ACK some or all of the messages vdc sends to it 55 * (this is configured during the handshake). Upon receipt of an ACK 56 * vdc will check the descriptor ring and signal to the upper layer 57 * code waiting on the IO. 58 */ 59 60 #include <sys/atomic.h> 61 #include <sys/conf.h> 62 #include <sys/disp.h> 63 #include <sys/ddi.h> 64 #include <sys/dkio.h> 65 #include <sys/efi_partition.h> 66 #include <sys/fcntl.h> 67 #include <sys/file.h> 68 #include <sys/kstat.h> 69 #include <sys/mach_descrip.h> 70 #include <sys/modctl.h> 71 #include <sys/mdeg.h> 72 #include <sys/note.h> 73 #include <sys/open.h> 74 #include <sys/sdt.h> 75 #include <sys/stat.h> 76 #include <sys/sunddi.h> 77 #include <sys/types.h> 78 #include <sys/promif.h> 79 #include <sys/var.h> 80 #include <sys/vtoc.h> 81 #include <sys/archsystm.h> 82 #include <sys/sysmacros.h> 83 84 #include <sys/cdio.h> 85 #include <sys/dktp/fdisk.h> 86 #include <sys/dktp/dadkio.h> 87 #include <sys/mhd.h> 88 #include <sys/scsi/generic/sense.h> 89 #include <sys/scsi/impl/uscsi.h> 90 #include <sys/scsi/impl/services.h> 91 #include <sys/scsi/targets/sddef.h> 92 93 #include <sys/ldoms.h> 94 #include <sys/ldc.h> 95 #include <sys/vio_common.h> 96 #include <sys/vio_mailbox.h> 97 #include <sys/vio_util.h> 98 #include <sys/vdsk_common.h> 99 #include <sys/vdsk_mailbox.h> 100 #include <sys/vdc.h> 101 102 /* 103 * function prototypes 104 */ 105 106 /* standard driver functions */ 107 static int vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred); 108 static int vdc_close(dev_t dev, int flag, int otyp, cred_t *cred); 109 static int vdc_strategy(struct buf *buf); 110 static int vdc_print(dev_t dev, char *str); 111 static 
int vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk); 112 static int vdc_read(dev_t dev, struct uio *uio, cred_t *cred); 113 static int vdc_write(dev_t dev, struct uio *uio, cred_t *cred); 114 static int vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, 115 cred_t *credp, int *rvalp); 116 static int vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred); 117 static int vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred); 118 119 static int vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, 120 void *arg, void **resultp); 121 static int vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 122 static int vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 123 static int vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, 124 int mod_flags, char *name, caddr_t valuep, int *lengthp); 125 126 /* setup */ 127 static void vdc_min(struct buf *bufp); 128 static int vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen); 129 static int vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr); 130 static int vdc_start_ldc_connection(vdc_t *vdc); 131 static int vdc_create_device_nodes(vdc_t *vdc); 132 static int vdc_create_device_nodes_efi(vdc_t *vdc); 133 static int vdc_create_device_nodes_vtoc(vdc_t *vdc); 134 static void vdc_create_io_kstats(vdc_t *vdc); 135 static void vdc_create_err_kstats(vdc_t *vdc); 136 static void vdc_set_err_kstats(vdc_t *vdc); 137 static int vdc_get_md_node(dev_info_t *dip, md_t **mdpp, 138 mde_cookie_t *vd_nodep); 139 static int vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep); 140 static void vdc_fini_ports(vdc_t *vdc); 141 static void vdc_switch_server(vdc_t *vdcp); 142 static int vdc_do_ldc_up(vdc_t *vdc); 143 static void vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr); 144 static int vdc_init_descriptor_ring(vdc_t *vdc); 145 static void vdc_destroy_descriptor_ring(vdc_t *vdc); 146 static int vdc_setup_devid(vdc_t *vdc); 147 static void vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *); 148 static void 
vdc_store_label_vtoc(vdc_t *, struct dk_geom *, struct vtoc *); 149 static void vdc_store_label_unk(vdc_t *vdc); 150 static boolean_t vdc_is_opened(vdc_t *vdc); 151 152 /* handshake with vds */ 153 static int vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver); 154 static int vdc_ver_negotiation(vdc_t *vdcp); 155 static int vdc_init_attr_negotiation(vdc_t *vdc); 156 static int vdc_attr_negotiation(vdc_t *vdcp); 157 static int vdc_init_dring_negotiate(vdc_t *vdc); 158 static int vdc_dring_negotiation(vdc_t *vdcp); 159 static int vdc_send_rdx(vdc_t *vdcp); 160 static int vdc_rdx_exchange(vdc_t *vdcp); 161 static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg); 162 163 /* processing incoming messages from vDisk server */ 164 static void vdc_process_msg_thread(vdc_t *vdc); 165 static int vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp); 166 167 static uint_t vdc_handle_cb(uint64_t event, caddr_t arg); 168 static int vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg); 169 static int vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg); 170 static int vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg); 171 static int vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg); 172 static int vdc_send_request(vdc_t *vdcp, int operation, 173 caddr_t addr, size_t nbytes, int slice, diskaddr_t offset, 174 int cb_type, void *cb_arg, vio_desc_direction_t dir); 175 static int vdc_map_to_shared_dring(vdc_t *vdcp, int idx); 176 static int vdc_populate_descriptor(vdc_t *vdcp, int operation, 177 caddr_t addr, size_t nbytes, int slice, diskaddr_t offset, 178 int cb_type, void *cb_arg, vio_desc_direction_t dir); 179 static int vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, 180 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 181 void *cb_arg, vio_desc_direction_t dir, boolean_t); 182 183 static int vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp); 184 static int vdc_drain_response(vdc_t *vdcp, struct buf *buf); 185 static int 
vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx); 186 static int vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep); 187 static int vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg); 188 189 /* dkio */ 190 static int vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, 191 int *rvalp); 192 static int vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg); 193 static void vdc_create_fake_geometry(vdc_t *vdc); 194 static int vdc_validate_geometry(vdc_t *vdc); 195 static void vdc_validate(vdc_t *vdc); 196 static void vdc_validate_task(void *arg); 197 static int vdc_null_copy_func(vdc_t *vdc, void *from, void *to, 198 int mode, int dir); 199 static int vdc_get_wce_convert(vdc_t *vdc, void *from, void *to, 200 int mode, int dir); 201 static int vdc_set_wce_convert(vdc_t *vdc, void *from, void *to, 202 int mode, int dir); 203 static int vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, 204 int mode, int dir); 205 static int vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, 206 int mode, int dir); 207 static int vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, 208 int mode, int dir); 209 static int vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, 210 int mode, int dir); 211 static int vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, 212 int mode, int dir); 213 static int vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, 214 int mode, int dir); 215 216 static void vdc_ownership_update(vdc_t *vdc, int ownership_flags); 217 static int vdc_access_set(vdc_t *vdc, uint64_t flags, int mode); 218 static vdc_io_t *vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf); 219 static int vdc_failfast_check_resv(vdc_t *vdc); 220 221 /* 222 * Module variables 223 */ 224 225 /* 226 * Tunable variables to control how long vdc waits before timing out on 227 * various operations 228 */ 229 static int vdc_hshake_retries = 3; 230 231 static int vdc_timeout = 0; /* units: seconds */ 232 static int vdc_ldcup_timeout = 1; /* units: seconds */ 
233 234 static uint64_t vdc_hz_min_ldc_delay; 235 static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC; 236 static uint64_t vdc_hz_max_ldc_delay; 237 static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC; 238 239 static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC; 240 static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC; 241 242 /* values for dumping - need to run in a tighter loop */ 243 static uint64_t vdc_usec_timeout_dump = 100 * MILLISEC; /* 0.1s units: ns */ 244 static int vdc_dump_retries = 100; 245 246 static uint16_t vdc_scsi_timeout = 60; /* 60s units: seconds */ 247 248 static uint64_t vdc_ownership_delay = 6 * MICROSEC; /* 6s units: usec */ 249 250 /* Count of the number of vdc instances attached */ 251 static volatile uint32_t vdc_instance_count = 0; 252 253 /* Tunable to log all SCSI errors */ 254 static boolean_t vdc_scsi_log_error = B_FALSE; 255 256 /* Soft state pointer */ 257 static void *vdc_state; 258 259 /* 260 * Controlling the verbosity of the error/debug messages 261 * 262 * vdc_msglevel - controls level of messages 263 * vdc_matchinst - 64-bit variable where each bit corresponds 264 * to the vdc instance the vdc_msglevel applies. 265 */ 266 int vdc_msglevel = 0x0; 267 uint64_t vdc_matchinst = 0ull; 268 269 /* 270 * Supported vDisk protocol version pairs. 271 * 272 * The first array entry is the latest and preferred version. 
273 */ 274 static const vio_ver_t vdc_version[] = {{1, 1}}; 275 276 static struct cb_ops vdc_cb_ops = { 277 vdc_open, /* cb_open */ 278 vdc_close, /* cb_close */ 279 vdc_strategy, /* cb_strategy */ 280 vdc_print, /* cb_print */ 281 vdc_dump, /* cb_dump */ 282 vdc_read, /* cb_read */ 283 vdc_write, /* cb_write */ 284 vdc_ioctl, /* cb_ioctl */ 285 nodev, /* cb_devmap */ 286 nodev, /* cb_mmap */ 287 nodev, /* cb_segmap */ 288 nochpoll, /* cb_chpoll */ 289 vdc_prop_op, /* cb_prop_op */ 290 NULL, /* cb_str */ 291 D_MP | D_64BIT, /* cb_flag */ 292 CB_REV, /* cb_rev */ 293 vdc_aread, /* cb_aread */ 294 vdc_awrite /* cb_awrite */ 295 }; 296 297 static struct dev_ops vdc_ops = { 298 DEVO_REV, /* devo_rev */ 299 0, /* devo_refcnt */ 300 vdc_getinfo, /* devo_getinfo */ 301 nulldev, /* devo_identify */ 302 nulldev, /* devo_probe */ 303 vdc_attach, /* devo_attach */ 304 vdc_detach, /* devo_detach */ 305 nodev, /* devo_reset */ 306 &vdc_cb_ops, /* devo_cb_ops */ 307 NULL, /* devo_bus_ops */ 308 nulldev /* devo_power */ 309 }; 310 311 static struct modldrv modldrv = { 312 &mod_driverops, 313 "virtual disk client", 314 &vdc_ops, 315 }; 316 317 static struct modlinkage modlinkage = { 318 MODREV_1, 319 &modldrv, 320 NULL 321 }; 322 323 /* -------------------------------------------------------------------------- */ 324 325 /* 326 * Device Driver housekeeping and setup 327 */ 328 329 int 330 _init(void) 331 { 332 int status; 333 334 if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0) 335 return (status); 336 if ((status = mod_install(&modlinkage)) != 0) 337 ddi_soft_state_fini(&vdc_state); 338 return (status); 339 } 340 341 int 342 _info(struct modinfo *modinfop) 343 { 344 return (mod_info(&modlinkage, modinfop)); 345 } 346 347 int 348 _fini(void) 349 { 350 int status; 351 352 if ((status = mod_remove(&modlinkage)) != 0) 353 return (status); 354 ddi_soft_state_fini(&vdc_state); 355 return (0); 356 } 357 358 static int 359 vdc_getinfo(dev_info_t *dip, 
ddi_info_cmd_t cmd, void *arg, void **resultp) 360 { 361 _NOTE(ARGUNUSED(dip)) 362 363 int instance = VDCUNIT((dev_t)arg); 364 vdc_t *vdc = NULL; 365 366 switch (cmd) { 367 case DDI_INFO_DEVT2DEVINFO: 368 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 369 *resultp = NULL; 370 return (DDI_FAILURE); 371 } 372 *resultp = vdc->dip; 373 return (DDI_SUCCESS); 374 case DDI_INFO_DEVT2INSTANCE: 375 *resultp = (void *)(uintptr_t)instance; 376 return (DDI_SUCCESS); 377 default: 378 *resultp = NULL; 379 return (DDI_FAILURE); 380 } 381 } 382 383 static int 384 vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 385 { 386 kt_did_t failfast_tid, ownership_tid; 387 int instance; 388 int rv; 389 vdc_server_t *srvr; 390 vdc_t *vdc = NULL; 391 392 switch (cmd) { 393 case DDI_DETACH: 394 /* the real work happens below */ 395 break; 396 case DDI_SUSPEND: 397 /* nothing to do for this non-device */ 398 return (DDI_SUCCESS); 399 default: 400 return (DDI_FAILURE); 401 } 402 403 ASSERT(cmd == DDI_DETACH); 404 instance = ddi_get_instance(dip); 405 DMSGX(1, "[%d] Entered\n", instance); 406 407 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 408 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 409 return (DDI_FAILURE); 410 } 411 412 /* 413 * This function is called when vdc is detached or if it has failed to 414 * attach. In that case, the attach may have fail before the vdisk type 415 * has been set so we can't call vdc_is_opened(). However as the attach 416 * has failed, we know that the vdisk is not opened and we can safely 417 * detach. 
418 */ 419 if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) { 420 DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance); 421 return (DDI_FAILURE); 422 } 423 424 if (vdc->dkio_flush_pending) { 425 DMSG(vdc, 0, 426 "[%d] Cannot detach: %d outstanding DKIO flushes\n", 427 instance, vdc->dkio_flush_pending); 428 return (DDI_FAILURE); 429 } 430 431 if (vdc->validate_pending) { 432 DMSG(vdc, 0, 433 "[%d] Cannot detach: %d outstanding validate request\n", 434 instance, vdc->validate_pending); 435 return (DDI_FAILURE); 436 } 437 438 DMSG(vdc, 0, "[%d] proceeding...\n", instance); 439 440 /* If we took ownership, release ownership */ 441 mutex_enter(&vdc->ownership_lock); 442 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) { 443 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL); 444 if (rv == 0) { 445 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 446 } 447 } 448 mutex_exit(&vdc->ownership_lock); 449 450 /* mark instance as detaching */ 451 vdc->lifecycle = VDC_LC_DETACHING; 452 453 /* 454 * Try and disable callbacks to prevent another handshake. We have to 455 * disable callbacks for all servers. 
456 */ 457 for (srvr = vdc->server_list; srvr != NULL; srvr = srvr->next) { 458 rv = ldc_set_cb_mode(srvr->ldc_handle, LDC_CB_DISABLE); 459 DMSG(vdc, 0, "callback disabled (ldc=%lu, rv=%d)\n", 460 srvr->ldc_id, rv); 461 } 462 463 if (vdc->initialized & VDC_THREAD) { 464 mutex_enter(&vdc->read_lock); 465 if ((vdc->read_state == VDC_READ_WAITING) || 466 (vdc->read_state == VDC_READ_RESET)) { 467 vdc->read_state = VDC_READ_RESET; 468 cv_signal(&vdc->read_cv); 469 } 470 471 mutex_exit(&vdc->read_lock); 472 473 /* wake up any thread waiting for connection to come online */ 474 mutex_enter(&vdc->lock); 475 if (vdc->state == VDC_STATE_INIT_WAITING) { 476 DMSG(vdc, 0, 477 "[%d] write reset - move to resetting state...\n", 478 instance); 479 vdc->state = VDC_STATE_RESETTING; 480 cv_signal(&vdc->initwait_cv); 481 } 482 mutex_exit(&vdc->lock); 483 484 /* now wait until state transitions to VDC_STATE_DETACH */ 485 thread_join(vdc->msg_proc_thr->t_did); 486 ASSERT(vdc->state == VDC_STATE_DETACH); 487 DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n", 488 vdc->instance); 489 } 490 491 mutex_enter(&vdc->lock); 492 493 if (vdc->initialized & VDC_DRING) 494 vdc_destroy_descriptor_ring(vdc); 495 496 vdc_fini_ports(vdc); 497 498 if (vdc->failfast_thread) { 499 failfast_tid = vdc->failfast_thread->t_did; 500 vdc->failfast_interval = 0; 501 cv_signal(&vdc->failfast_cv); 502 } else { 503 failfast_tid = 0; 504 } 505 506 if (vdc->ownership & VDC_OWNERSHIP_WANTED) { 507 ownership_tid = vdc->ownership_thread->t_did; 508 vdc->ownership = VDC_OWNERSHIP_NONE; 509 cv_signal(&vdc->ownership_cv); 510 } else { 511 ownership_tid = 0; 512 } 513 514 mutex_exit(&vdc->lock); 515 516 if (failfast_tid != 0) 517 thread_join(failfast_tid); 518 519 if (ownership_tid != 0) 520 thread_join(ownership_tid); 521 522 if (vdc->initialized & VDC_MINOR) 523 ddi_remove_minor_node(dip, NULL); 524 525 if (vdc->io_stats) { 526 kstat_delete(vdc->io_stats); 527 vdc->io_stats = NULL; 528 } 529 530 if (vdc->err_stats) { 
531 kstat_delete(vdc->err_stats); 532 vdc->err_stats = NULL; 533 } 534 535 if (vdc->initialized & VDC_LOCKS) { 536 mutex_destroy(&vdc->lock); 537 mutex_destroy(&vdc->read_lock); 538 mutex_destroy(&vdc->ownership_lock); 539 cv_destroy(&vdc->initwait_cv); 540 cv_destroy(&vdc->dring_free_cv); 541 cv_destroy(&vdc->membind_cv); 542 cv_destroy(&vdc->sync_pending_cv); 543 cv_destroy(&vdc->sync_blocked_cv); 544 cv_destroy(&vdc->read_cv); 545 cv_destroy(&vdc->running_cv); 546 cv_destroy(&vdc->ownership_cv); 547 cv_destroy(&vdc->failfast_cv); 548 cv_destroy(&vdc->failfast_io_cv); 549 } 550 551 if (vdc->minfo) 552 kmem_free(vdc->minfo, sizeof (struct dk_minfo)); 553 554 if (vdc->cinfo) 555 kmem_free(vdc->cinfo, sizeof (struct dk_cinfo)); 556 557 if (vdc->vtoc) 558 kmem_free(vdc->vtoc, sizeof (struct vtoc)); 559 560 if (vdc->geom) 561 kmem_free(vdc->geom, sizeof (struct dk_geom)); 562 563 if (vdc->devid) { 564 ddi_devid_unregister(dip); 565 ddi_devid_free(vdc->devid); 566 } 567 568 if (vdc->initialized & VDC_SOFT_STATE) 569 ddi_soft_state_free(vdc_state, instance); 570 571 DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc); 572 573 return (DDI_SUCCESS); 574 } 575 576 577 static int 578 vdc_do_attach(dev_info_t *dip) 579 { 580 int instance; 581 vdc_t *vdc = NULL; 582 int status; 583 md_t *mdp; 584 mde_cookie_t vd_node; 585 586 ASSERT(dip != NULL); 587 588 instance = ddi_get_instance(dip); 589 if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) { 590 cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure", 591 instance); 592 return (DDI_FAILURE); 593 } 594 595 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 596 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 597 return (DDI_FAILURE); 598 } 599 600 /* 601 * We assign the value to initialized in this case to zero out the 602 * variable and then set bits in it to indicate what has been done 603 */ 604 vdc->initialized = VDC_SOFT_STATE; 605 606 vdc_hz_min_ldc_delay = 
drv_usectohz(vdc_min_timeout_ldc); 607 vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc); 608 609 vdc->dip = dip; 610 vdc->instance = instance; 611 vdc->vdisk_type = VD_DISK_TYPE_UNK; 612 vdc->vdisk_label = VD_DISK_LABEL_UNK; 613 vdc->state = VDC_STATE_INIT; 614 vdc->lifecycle = VDC_LC_ATTACHING; 615 vdc->session_id = 0; 616 vdc->block_size = DEV_BSIZE; 617 vdc->max_xfer_sz = maxphys / DEV_BSIZE; 618 619 /* 620 * We assume, for now, that the vDisk server will export 'read' 621 * operations to us at a minimum (this is needed because of checks 622 * in vdc for supported operations early in the handshake process). 623 * The vDisk server will return ENOTSUP if this is not the case. 624 * The value will be overwritten during the attribute exchange with 625 * the bitmask of operations exported by server. 626 */ 627 vdc->operations = VD_OP_MASK_READ; 628 629 vdc->vtoc = NULL; 630 vdc->geom = NULL; 631 vdc->cinfo = NULL; 632 vdc->minfo = NULL; 633 634 mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL); 635 cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL); 636 cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL); 637 cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL); 638 cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL); 639 640 vdc->threads_pending = 0; 641 vdc->sync_op_pending = B_FALSE; 642 vdc->sync_op_blocked = B_FALSE; 643 cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL); 644 cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL); 645 646 mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL); 647 cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL); 648 cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL); 649 cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL); 650 651 /* init blocking msg read functionality */ 652 mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL); 653 cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL); 654 vdc->read_state = VDC_READ_IDLE; 655 656 vdc->initialized |= VDC_LOCKS; 657 658 /* get device and port MD node for this disk instance */ 
659 if (vdc_get_md_node(dip, &mdp, &vd_node) != 0) { 660 cmn_err(CE_NOTE, "[%d] Could not get machine description node", 661 instance); 662 return (DDI_FAILURE); 663 } 664 665 if (vdc_init_ports(vdc, mdp, vd_node) != 0) { 666 cmn_err(CE_NOTE, "[%d] Error initialising ports", instance); 667 return (DDI_FAILURE); 668 } 669 670 (void) md_fini_handle(mdp); 671 672 /* initialize the thread responsible for managing state with server */ 673 vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread, 674 vdc, 0, &p0, TS_RUN, minclsyspri); 675 if (vdc->msg_proc_thr == NULL) { 676 cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread", 677 instance); 678 return (DDI_FAILURE); 679 } 680 681 vdc->initialized |= VDC_THREAD; 682 683 /* Create the kstats for saving the I/O statistics used by iostat(1M) */ 684 vdc_create_io_kstats(vdc); 685 vdc_create_err_kstats(vdc); 686 687 atomic_inc_32(&vdc_instance_count); 688 689 /* 690 * Check the disk label. This will send requests and do the handshake. 691 * We don't really care about the disk label now. What we really need is 692 * the handshake do be done so that we know the type of the disk (slice 693 * or full disk) and the appropriate device nodes can be created. 
694 */ 695 vdc->vdisk_label = VD_DISK_LABEL_UNK; 696 vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP); 697 vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP); 698 vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); 699 700 mutex_enter(&vdc->lock); 701 (void) vdc_validate_geometry(vdc); 702 mutex_exit(&vdc->lock); 703 704 /* 705 * Now that we have the device info we can create the device nodes 706 */ 707 status = vdc_create_device_nodes(vdc); 708 if (status) { 709 DMSG(vdc, 0, "[%d] Failed to create device nodes", 710 instance); 711 goto return_status; 712 } 713 714 /* 715 * Setup devid 716 */ 717 if (vdc_setup_devid(vdc)) { 718 DMSG(vdc, 0, "[%d] No device id available\n", instance); 719 } 720 721 /* 722 * Fill in the fields of the error statistics kstat that were not 723 * available when creating the kstat 724 */ 725 vdc_set_err_kstats(vdc); 726 727 ddi_report_dev(dip); 728 vdc->lifecycle = VDC_LC_ONLINE; 729 DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance); 730 731 return_status: 732 DMSG(vdc, 0, "[%d] Attach completed\n", instance); 733 return (status); 734 } 735 736 static int 737 vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 738 { 739 int status; 740 741 switch (cmd) { 742 case DDI_ATTACH: 743 if ((status = vdc_do_attach(dip)) != 0) 744 (void) vdc_detach(dip, DDI_DETACH); 745 return (status); 746 case DDI_RESUME: 747 /* nothing to do for this non-device */ 748 return (DDI_SUCCESS); 749 default: 750 return (DDI_FAILURE); 751 } 752 } 753 754 static int 755 vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr) 756 { 757 int status = 0; 758 ldc_status_t ldc_state; 759 ldc_attr_t ldc_attr; 760 761 ASSERT(vdc != NULL); 762 ASSERT(srvr != NULL); 763 764 ldc_attr.devclass = LDC_DEV_BLK; 765 ldc_attr.instance = vdc->instance; 766 ldc_attr.mode = LDC_MODE_UNRELIABLE; /* unreliable transport */ 767 ldc_attr.mtu = VD_LDC_MTU; 768 769 if ((srvr->state & VDC_LDC_INIT) == 0) { 770 status = ldc_init(srvr->ldc_id, &ldc_attr, 771 
&srvr->ldc_handle); 772 if (status != 0) { 773 DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d", 774 vdc->instance, srvr->ldc_id, status); 775 return (status); 776 } 777 srvr->state |= VDC_LDC_INIT; 778 } 779 status = ldc_status(srvr->ldc_handle, &ldc_state); 780 if (status != 0) { 781 DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]", 782 vdc->instance, status); 783 goto init_exit; 784 } 785 srvr->ldc_state = ldc_state; 786 787 if ((srvr->state & VDC_LDC_CB) == 0) { 788 status = ldc_reg_callback(srvr->ldc_handle, vdc_handle_cb, 789 (caddr_t)srvr); 790 if (status != 0) { 791 DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)", 792 vdc->instance, status); 793 goto init_exit; 794 } 795 srvr->state |= VDC_LDC_CB; 796 } 797 798 /* 799 * At this stage we have initialised LDC, we will now try and open 800 * the connection. 801 */ 802 if (srvr->ldc_state == LDC_INIT) { 803 status = ldc_open(srvr->ldc_handle); 804 if (status != 0) { 805 DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d", 806 vdc->instance, srvr->ldc_id, status); 807 goto init_exit; 808 } 809 srvr->state |= VDC_LDC_OPEN; 810 } 811 812 init_exit: 813 if (status) { 814 vdc_terminate_ldc(vdc, srvr); 815 } 816 817 return (status); 818 } 819 820 static int 821 vdc_start_ldc_connection(vdc_t *vdc) 822 { 823 int status = 0; 824 825 ASSERT(vdc != NULL); 826 827 ASSERT(MUTEX_HELD(&vdc->lock)); 828 829 status = vdc_do_ldc_up(vdc); 830 831 DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance); 832 833 return (status); 834 } 835 836 static int 837 vdc_stop_ldc_connection(vdc_t *vdcp) 838 { 839 int status; 840 841 ASSERT(vdcp != NULL); 842 843 ASSERT(MUTEX_HELD(&vdcp->lock)); 844 845 DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n", 846 vdcp->state); 847 848 status = ldc_down(vdcp->curr_server->ldc_handle); 849 DMSG(vdcp, 0, "ldc_down() = %d\n", status); 850 851 vdcp->initialized &= ~VDC_HANDSHAKE; 852 DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized); 853 854 return (status); 855 } 
856 857 static void 858 vdc_create_io_kstats(vdc_t *vdc) 859 { 860 if (vdc->io_stats != NULL) { 861 DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance); 862 return; 863 } 864 865 vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL, 866 "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT); 867 if (vdc->io_stats != NULL) { 868 vdc->io_stats->ks_lock = &vdc->lock; 869 kstat_install(vdc->io_stats); 870 } else { 871 cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics" 872 " will not be gathered", vdc->instance); 873 } 874 } 875 876 static void 877 vdc_create_err_kstats(vdc_t *vdc) 878 { 879 vd_err_stats_t *stp; 880 char kstatmodule_err[KSTAT_STRLEN]; 881 char kstatname[KSTAT_STRLEN]; 882 int ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t)); 883 int instance = vdc->instance; 884 885 if (vdc->err_stats != NULL) { 886 DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance); 887 return; 888 } 889 890 (void) snprintf(kstatmodule_err, sizeof (kstatmodule_err), 891 "%serr", VDC_DRIVER_NAME); 892 (void) snprintf(kstatname, sizeof (kstatname), 893 "%s%d,err", VDC_DRIVER_NAME, instance); 894 895 vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname, 896 "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT); 897 898 if (vdc->err_stats == NULL) { 899 cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics" 900 " will not be gathered", instance); 901 return; 902 } 903 904 stp = (vd_err_stats_t *)vdc->err_stats->ks_data; 905 kstat_named_init(&stp->vd_softerrs, "Soft Errors", 906 KSTAT_DATA_UINT32); 907 kstat_named_init(&stp->vd_transerrs, "Transport Errors", 908 KSTAT_DATA_UINT32); 909 kstat_named_init(&stp->vd_protoerrs, "Protocol Errors", 910 KSTAT_DATA_UINT32); 911 kstat_named_init(&stp->vd_vid, "Vendor", 912 KSTAT_DATA_CHAR); 913 kstat_named_init(&stp->vd_pid, "Product", 914 KSTAT_DATA_CHAR); 915 kstat_named_init(&stp->vd_capacity, "Size", 916 KSTAT_DATA_ULONGLONG); 917 918 vdc->err_stats->ks_update = 
nulldev; 919 920 kstat_install(vdc->err_stats); 921 } 922 923 static void 924 vdc_set_err_kstats(vdc_t *vdc) 925 { 926 vd_err_stats_t *stp; 927 928 if (vdc->err_stats == NULL) 929 return; 930 931 mutex_enter(&vdc->lock); 932 933 stp = (vd_err_stats_t *)vdc->err_stats->ks_data; 934 ASSERT(stp != NULL); 935 936 stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->block_size; 937 (void) strcpy(stp->vd_vid.value.c, "SUN"); 938 (void) strcpy(stp->vd_pid.value.c, "VDSK"); 939 940 mutex_exit(&vdc->lock); 941 } 942 943 static int 944 vdc_create_device_nodes_efi(vdc_t *vdc) 945 { 946 ddi_remove_minor_node(vdc->dip, "h"); 947 ddi_remove_minor_node(vdc->dip, "h,raw"); 948 949 if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK, 950 VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE), 951 DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 952 cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'", 953 vdc->instance); 954 return (EIO); 955 } 956 957 /* if any device node is created we set this flag */ 958 vdc->initialized |= VDC_MINOR; 959 960 if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR, 961 VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE), 962 DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 963 cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'", 964 vdc->instance); 965 return (EIO); 966 } 967 968 return (0); 969 } 970 971 static int 972 vdc_create_device_nodes_vtoc(vdc_t *vdc) 973 { 974 ddi_remove_minor_node(vdc->dip, "wd"); 975 ddi_remove_minor_node(vdc->dip, "wd,raw"); 976 977 if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK, 978 VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE), 979 DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 980 cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'", 981 vdc->instance); 982 return (EIO); 983 } 984 985 /* if any device node is created we set this flag */ 986 vdc->initialized |= VDC_MINOR; 987 988 if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR, 989 VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE), 990 DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 991 cmn_err(CE_NOTE, "[%d] Couldn't add block node 
'h,raw'", 992 vdc->instance); 993 return (EIO); 994 } 995 996 return (0); 997 } 998 999 /* 1000 * Function: 1001 * vdc_create_device_nodes 1002 * 1003 * Description: 1004 * This function creates the block and character device nodes under 1005 * /devices. It is called as part of the attach(9E) of the instance 1006 * during the handshake with vds after vds has sent the attributes 1007 * to vdc. 1008 * 1009 * If the device is of type VD_DISK_TYPE_SLICE then the minor node 1010 * of 2 is used in keeping with the Solaris convention that slice 2 1011 * refers to a whole disk. Slices start at 'a' 1012 * 1013 * Parameters: 1014 * vdc - soft state pointer 1015 * 1016 * Return Values 1017 * 0 - Success 1018 * EIO - Failed to create node 1019 * EINVAL - Unknown type of disk exported 1020 */ 1021 static int 1022 vdc_create_device_nodes(vdc_t *vdc) 1023 { 1024 char name[sizeof ("s,raw")]; 1025 dev_info_t *dip = NULL; 1026 int instance, status; 1027 int num_slices = 1; 1028 int i; 1029 1030 ASSERT(vdc != NULL); 1031 1032 instance = vdc->instance; 1033 dip = vdc->dip; 1034 1035 switch (vdc->vdisk_type) { 1036 case VD_DISK_TYPE_DISK: 1037 num_slices = V_NUMPAR; 1038 break; 1039 case VD_DISK_TYPE_SLICE: 1040 num_slices = 1; 1041 break; 1042 case VD_DISK_TYPE_UNK: 1043 default: 1044 return (EINVAL); 1045 } 1046 1047 /* 1048 * Minor nodes are different for EFI disks: EFI disks do not have 1049 * a minor node 'g' for the minor number corresponding to slice 1050 * VD_EFI_WD_SLICE (slice 7) instead they have a minor node 'wd' 1051 * representing the whole disk. 
1052 */ 1053 for (i = 0; i < num_slices; i++) { 1054 1055 if (i == VD_EFI_WD_SLICE) { 1056 if (vdc->vdisk_label == VD_DISK_LABEL_EFI) 1057 status = vdc_create_device_nodes_efi(vdc); 1058 else 1059 status = vdc_create_device_nodes_vtoc(vdc); 1060 if (status != 0) 1061 return (status); 1062 continue; 1063 } 1064 1065 (void) snprintf(name, sizeof (name), "%c", 'a' + i); 1066 if (ddi_create_minor_node(dip, name, S_IFBLK, 1067 VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 1068 cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'", 1069 instance, name); 1070 return (EIO); 1071 } 1072 1073 /* if any device node is created we set this flag */ 1074 vdc->initialized |= VDC_MINOR; 1075 1076 (void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw"); 1077 1078 if (ddi_create_minor_node(dip, name, S_IFCHR, 1079 VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 1080 cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'", 1081 instance, name); 1082 return (EIO); 1083 } 1084 } 1085 1086 return (0); 1087 } 1088 1089 /* 1090 * Driver prop_op(9e) entry point function. Return the number of blocks for 1091 * the partition in question or forward the request to the property facilities. 
1092 */ 1093 static int 1094 vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags, 1095 char *name, caddr_t valuep, int *lengthp) 1096 { 1097 int instance = ddi_get_instance(dip); 1098 vdc_t *vdc; 1099 uint64_t nblocks; 1100 uint_t blksize; 1101 1102 vdc = ddi_get_soft_state(vdc_state, instance); 1103 1104 if (dev == DDI_DEV_T_ANY || vdc == NULL) { 1105 return (ddi_prop_op(dev, dip, prop_op, mod_flags, 1106 name, valuep, lengthp)); 1107 } 1108 1109 mutex_enter(&vdc->lock); 1110 (void) vdc_validate_geometry(vdc); 1111 if (vdc->vdisk_label == VD_DISK_LABEL_UNK) { 1112 mutex_exit(&vdc->lock); 1113 return (ddi_prop_op(dev, dip, prop_op, mod_flags, 1114 name, valuep, lengthp)); 1115 } 1116 nblocks = vdc->slice[VDCPART(dev)].nblocks; 1117 blksize = vdc->block_size; 1118 mutex_exit(&vdc->lock); 1119 1120 return (ddi_prop_op_nblocks_blksize(dev, dip, prop_op, mod_flags, 1121 name, valuep, lengthp, nblocks, blksize)); 1122 } 1123 1124 /* 1125 * Function: 1126 * vdc_is_opened 1127 * 1128 * Description: 1129 * This function checks if any slice of a given virtual disk is 1130 * currently opened. 1131 * 1132 * Parameters: 1133 * vdc - soft state pointer 1134 * 1135 * Return Values 1136 * B_TRUE - at least one slice is opened. 1137 * B_FALSE - no slice is opened. 
1138 */ 1139 static boolean_t 1140 vdc_is_opened(vdc_t *vdc) 1141 { 1142 int i, nslices; 1143 1144 switch (vdc->vdisk_type) { 1145 case VD_DISK_TYPE_DISK: 1146 nslices = V_NUMPAR; 1147 break; 1148 case VD_DISK_TYPE_SLICE: 1149 nslices = 1; 1150 break; 1151 case VD_DISK_TYPE_UNK: 1152 default: 1153 ASSERT(0); 1154 } 1155 1156 /* check if there's any layered open */ 1157 for (i = 0; i < nslices; i++) { 1158 if (vdc->open_lyr[i] > 0) 1159 return (B_TRUE); 1160 } 1161 1162 /* check if there is any other kind of open */ 1163 for (i = 0; i < OTYPCNT; i++) { 1164 if (vdc->open[i] != 0) 1165 return (B_TRUE); 1166 } 1167 1168 return (B_FALSE); 1169 } 1170 1171 static int 1172 vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp) 1173 { 1174 uint8_t slicemask; 1175 int i; 1176 1177 ASSERT(otyp < OTYPCNT); 1178 ASSERT(slice < V_NUMPAR); 1179 ASSERT(MUTEX_HELD(&vdc->lock)); 1180 1181 slicemask = 1 << slice; 1182 1183 /* check if slice is already exclusively opened */ 1184 if (vdc->open_excl & slicemask) 1185 return (EBUSY); 1186 1187 /* if open exclusive, check if slice is already opened */ 1188 if (flag & FEXCL) { 1189 if (vdc->open_lyr[slice] > 0) 1190 return (EBUSY); 1191 for (i = 0; i < OTYPCNT; i++) { 1192 if (vdc->open[i] & slicemask) 1193 return (EBUSY); 1194 } 1195 vdc->open_excl |= slicemask; 1196 } 1197 1198 /* mark slice as opened */ 1199 if (otyp == OTYP_LYR) { 1200 vdc->open_lyr[slice]++; 1201 } else { 1202 vdc->open[otyp] |= slicemask; 1203 } 1204 1205 return (0); 1206 } 1207 1208 static void 1209 vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp) 1210 { 1211 uint8_t slicemask; 1212 1213 ASSERT(otyp < OTYPCNT); 1214 ASSERT(slice < V_NUMPAR); 1215 ASSERT(MUTEX_HELD(&vdc->lock)); 1216 1217 slicemask = 1 << slice; 1218 1219 if (otyp == OTYP_LYR) { 1220 ASSERT(vdc->open_lyr[slice] > 0); 1221 vdc->open_lyr[slice]--; 1222 } else { 1223 vdc->open[otyp] &= ~slicemask; 1224 } 1225 1226 if (flag & FEXCL) 1227 vdc->open_excl &= ~slicemask; 1228 } 1229 1230 
static int 1231 vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred) 1232 { 1233 _NOTE(ARGUNUSED(cred)) 1234 1235 int instance, nodelay; 1236 int slice, status = 0; 1237 vdc_t *vdc; 1238 1239 ASSERT(dev != NULL); 1240 instance = VDCUNIT(*dev); 1241 1242 if (otyp >= OTYPCNT) 1243 return (EINVAL); 1244 1245 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 1246 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 1247 return (ENXIO); 1248 } 1249 1250 DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n", 1251 getminor(*dev), flag, otyp); 1252 1253 slice = VDCPART(*dev); 1254 1255 nodelay = flag & (FNDELAY | FNONBLOCK); 1256 1257 if ((flag & FWRITE) && (!nodelay) && 1258 !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE))) { 1259 return (EROFS); 1260 } 1261 1262 mutex_enter(&vdc->lock); 1263 1264 status = vdc_mark_opened(vdc, slice, flag, otyp); 1265 1266 if (status != 0) { 1267 mutex_exit(&vdc->lock); 1268 return (status); 1269 } 1270 1271 if (nodelay) { 1272 1273 /* don't resubmit a validate request if there's already one */ 1274 if (vdc->validate_pending > 0) { 1275 mutex_exit(&vdc->lock); 1276 return (0); 1277 } 1278 1279 /* call vdc_validate() asynchronously to avoid blocking */ 1280 if (taskq_dispatch(system_taskq, vdc_validate_task, 1281 (void *)vdc, TQ_NOSLEEP) == NULL) { 1282 vdc_mark_closed(vdc, slice, flag, otyp); 1283 mutex_exit(&vdc->lock); 1284 return (ENXIO); 1285 } 1286 1287 vdc->validate_pending++; 1288 mutex_exit(&vdc->lock); 1289 return (0); 1290 } 1291 1292 mutex_exit(&vdc->lock); 1293 1294 vdc_validate(vdc); 1295 1296 mutex_enter(&vdc->lock); 1297 1298 if (vdc->vdisk_label == VD_DISK_LABEL_UNK || 1299 vdc->slice[slice].nblocks == 0) { 1300 vdc_mark_closed(vdc, slice, flag, otyp); 1301 status = EIO; 1302 } 1303 1304 mutex_exit(&vdc->lock); 1305 1306 return (status); 1307 } 1308 1309 static int 1310 vdc_close(dev_t dev, int flag, int otyp, cred_t *cred) 1311 { 1312 _NOTE(ARGUNUSED(cred)) 1313 1314 int instance; 1315 int slice; 
1316 int rv, rval; 1317 vdc_t *vdc; 1318 1319 instance = VDCUNIT(dev); 1320 1321 if (otyp >= OTYPCNT) 1322 return (EINVAL); 1323 1324 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 1325 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 1326 return (ENXIO); 1327 } 1328 1329 DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp); 1330 1331 slice = VDCPART(dev); 1332 1333 /* 1334 * Attempt to flush the W$ on a close operation. If this is 1335 * not a supported IOCTL command or the backing device is read-only 1336 * do not fail the close operation. 1337 */ 1338 rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval); 1339 1340 if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) { 1341 DMSG(vdc, 0, "[%d] flush failed with error %d on close\n", 1342 instance, rv); 1343 return (EIO); 1344 } 1345 1346 mutex_enter(&vdc->lock); 1347 vdc_mark_closed(vdc, slice, flag, otyp); 1348 mutex_exit(&vdc->lock); 1349 1350 return (0); 1351 } 1352 1353 static int 1354 vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) 1355 { 1356 _NOTE(ARGUNUSED(credp)) 1357 1358 return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp)); 1359 } 1360 1361 static int 1362 vdc_print(dev_t dev, char *str) 1363 { 1364 cmn_err(CE_NOTE, "vdc%d: %s", VDCUNIT(dev), str); 1365 return (0); 1366 } 1367 1368 static int 1369 vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 1370 { 1371 int rv; 1372 size_t nbytes = nblk * DEV_BSIZE; 1373 int instance = VDCUNIT(dev); 1374 vdc_t *vdc = NULL; 1375 1376 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 1377 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 1378 return (ENXIO); 1379 } 1380 1381 DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n", 1382 instance, nbytes, blkno, (void *)addr); 1383 rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes, 1384 VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir); 1385 if (rv) { 1386 
DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv); 1387 return (rv); 1388 } 1389 1390 if (ddi_in_panic()) 1391 (void) vdc_drain_response(vdc, NULL); 1392 1393 DMSG(vdc, 0, "[%d] End\n", instance); 1394 1395 return (0); 1396 } 1397 1398 /* -------------------------------------------------------------------------- */ 1399 1400 /* 1401 * Disk access routines 1402 * 1403 */ 1404 1405 /* 1406 * vdc_strategy() 1407 * 1408 * Return Value: 1409 * 0: As per strategy(9E), the strategy() function must return 0 1410 * [ bioerror(9f) sets b_flags to the proper error code ] 1411 */ 1412 static int 1413 vdc_strategy(struct buf *buf) 1414 { 1415 int rv = -1; 1416 vdc_t *vdc = NULL; 1417 int instance = VDCUNIT(buf->b_edev); 1418 int op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE; 1419 int slice; 1420 1421 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 1422 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 1423 bioerror(buf, ENXIO); 1424 biodone(buf); 1425 return (0); 1426 } 1427 1428 DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n", 1429 instance, (buf->b_flags & B_READ) ? "Read" : "Write", 1430 buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr); 1431 1432 bp_mapin(buf); 1433 1434 if ((long)buf->b_private == VD_SLICE_NONE) { 1435 /* I/O using an absolute disk offset */ 1436 slice = VD_SLICE_NONE; 1437 } else { 1438 slice = VDCPART(buf->b_edev); 1439 } 1440 1441 rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr, 1442 buf->b_bcount, slice, buf->b_lblkno, 1443 CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir : 1444 VIO_write_dir); 1445 1446 /* 1447 * If the request was successfully sent, the strategy call returns and 1448 * the ACK handler calls the bioxxx functions when the vDisk server is 1449 * done otherwise we handle the error here. 
1450 */ 1451 if (rv) { 1452 DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv); 1453 bioerror(buf, rv); 1454 biodone(buf); 1455 } else if (ddi_in_panic()) { 1456 (void) vdc_drain_response(vdc, buf); 1457 } 1458 1459 return (0); 1460 } 1461 1462 /* 1463 * Function: 1464 * vdc_min 1465 * 1466 * Description: 1467 * Routine to limit the size of a data transfer. Used in 1468 * conjunction with physio(9F). 1469 * 1470 * Arguments: 1471 * bp - pointer to the indicated buf(9S) struct. 1472 * 1473 */ 1474 static void 1475 vdc_min(struct buf *bufp) 1476 { 1477 vdc_t *vdc = NULL; 1478 int instance = VDCUNIT(bufp->b_edev); 1479 1480 vdc = ddi_get_soft_state(vdc_state, instance); 1481 VERIFY(vdc != NULL); 1482 1483 if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) { 1484 bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size; 1485 } 1486 } 1487 1488 static int 1489 vdc_read(dev_t dev, struct uio *uio, cred_t *cred) 1490 { 1491 _NOTE(ARGUNUSED(cred)) 1492 1493 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1494 return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio)); 1495 } 1496 1497 static int 1498 vdc_write(dev_t dev, struct uio *uio, cred_t *cred) 1499 { 1500 _NOTE(ARGUNUSED(cred)) 1501 1502 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1503 return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio)); 1504 } 1505 1506 static int 1507 vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred) 1508 { 1509 _NOTE(ARGUNUSED(cred)) 1510 1511 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1512 return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio)); 1513 } 1514 1515 static int 1516 vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred) 1517 { 1518 _NOTE(ARGUNUSED(cred)) 1519 1520 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1521 return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio)); 1522 } 1523 1524 1525 /* -------------------------------------------------------------------------- */ 1526 1527 /* 1528 * Handshake support 1529 */ 1530 1531 1532 /* 1533 * 
Function: 1534 * vdc_init_ver_negotiation() 1535 * 1536 * Description: 1537 * 1538 * Arguments: 1539 * vdc - soft state pointer for this instance of the device driver. 1540 * 1541 * Return Code: 1542 * 0 - Success 1543 */ 1544 static int 1545 vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver) 1546 { 1547 vio_ver_msg_t pkt; 1548 size_t msglen = sizeof (pkt); 1549 int status = -1; 1550 1551 ASSERT(vdc != NULL); 1552 ASSERT(mutex_owned(&vdc->lock)); 1553 1554 DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance); 1555 1556 /* 1557 * set the Session ID to a unique value 1558 * (the lower 32 bits of the clock tick) 1559 */ 1560 vdc->session_id = ((uint32_t)gettick() & 0xffffffff); 1561 DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id); 1562 1563 pkt.tag.vio_msgtype = VIO_TYPE_CTRL; 1564 pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 1565 pkt.tag.vio_subtype_env = VIO_VER_INFO; 1566 pkt.tag.vio_sid = vdc->session_id; 1567 pkt.dev_class = VDEV_DISK; 1568 pkt.ver_major = ver.major; 1569 pkt.ver_minor = ver.minor; 1570 1571 status = vdc_send(vdc, (caddr_t)&pkt, &msglen); 1572 DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n", 1573 vdc->instance, status); 1574 if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) { 1575 DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: " 1576 "id(%lx) rv(%d) size(%ld)", vdc->instance, 1577 vdc->curr_server->ldc_handle, status, msglen); 1578 if (msglen != sizeof (vio_ver_msg_t)) 1579 status = ENOMSG; 1580 } 1581 1582 return (status); 1583 } 1584 1585 /* 1586 * Function: 1587 * vdc_ver_negotiation() 1588 * 1589 * Description: 1590 * 1591 * Arguments: 1592 * vdcp - soft state pointer for this instance of the device driver. 
1593 * 1594 * Return Code: 1595 * 0 - Success 1596 */ 1597 static int 1598 vdc_ver_negotiation(vdc_t *vdcp) 1599 { 1600 vio_msg_t vio_msg; 1601 int status; 1602 1603 if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0])) 1604 return (status); 1605 1606 /* release lock and wait for response */ 1607 mutex_exit(&vdcp->lock); 1608 status = vdc_wait_for_response(vdcp, &vio_msg); 1609 mutex_enter(&vdcp->lock); 1610 if (status) { 1611 DMSG(vdcp, 0, 1612 "[%d] Failed waiting for Ver negotiation response, rv(%d)", 1613 vdcp->instance, status); 1614 return (status); 1615 } 1616 1617 /* check type and sub_type ... */ 1618 if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL || 1619 vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) { 1620 DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n", 1621 vdcp->instance); 1622 return (EPROTO); 1623 } 1624 1625 return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg)); 1626 } 1627 1628 /* 1629 * Function: 1630 * vdc_init_attr_negotiation() 1631 * 1632 * Description: 1633 * 1634 * Arguments: 1635 * vdc - soft state pointer for this instance of the device driver. 
1636 * 1637 * Return Code: 1638 * 0 - Success 1639 */ 1640 static int 1641 vdc_init_attr_negotiation(vdc_t *vdc) 1642 { 1643 vd_attr_msg_t pkt; 1644 size_t msglen = sizeof (pkt); 1645 int status; 1646 1647 ASSERT(vdc != NULL); 1648 ASSERT(mutex_owned(&vdc->lock)); 1649 1650 DMSG(vdc, 0, "[%d] entered\n", vdc->instance); 1651 1652 /* fill in tag */ 1653 pkt.tag.vio_msgtype = VIO_TYPE_CTRL; 1654 pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 1655 pkt.tag.vio_subtype_env = VIO_ATTR_INFO; 1656 pkt.tag.vio_sid = vdc->session_id; 1657 /* fill in payload */ 1658 pkt.max_xfer_sz = vdc->max_xfer_sz; 1659 pkt.vdisk_block_size = vdc->block_size; 1660 pkt.xfer_mode = VIO_DRING_MODE_V1_0; 1661 pkt.operations = 0; /* server will set bits of valid operations */ 1662 pkt.vdisk_type = 0; /* server will set to valid device type */ 1663 pkt.vdisk_media = 0; /* server will set to valid media type */ 1664 pkt.vdisk_size = 0; /* server will set to valid size */ 1665 1666 status = vdc_send(vdc, (caddr_t)&pkt, &msglen); 1667 DMSG(vdc, 0, "Attr info sent (status = %d)\n", status); 1668 1669 if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) { 1670 DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: " 1671 "id(%lx) rv(%d) size(%ld)", vdc->instance, 1672 vdc->curr_server->ldc_handle, status, msglen); 1673 if (msglen != sizeof (vio_ver_msg_t)) 1674 status = ENOMSG; 1675 } 1676 1677 return (status); 1678 } 1679 1680 /* 1681 * Function: 1682 * vdc_attr_negotiation() 1683 * 1684 * Description: 1685 * 1686 * Arguments: 1687 * vdc - soft state pointer for this instance of the device driver. 
1688 * 1689 * Return Code: 1690 * 0 - Success 1691 */ 1692 static int 1693 vdc_attr_negotiation(vdc_t *vdcp) 1694 { 1695 int status; 1696 vio_msg_t vio_msg; 1697 1698 if (status = vdc_init_attr_negotiation(vdcp)) 1699 return (status); 1700 1701 /* release lock and wait for response */ 1702 mutex_exit(&vdcp->lock); 1703 status = vdc_wait_for_response(vdcp, &vio_msg); 1704 mutex_enter(&vdcp->lock); 1705 if (status) { 1706 DMSG(vdcp, 0, 1707 "[%d] Failed waiting for Attr negotiation response, rv(%d)", 1708 vdcp->instance, status); 1709 return (status); 1710 } 1711 1712 /* check type and sub_type ... */ 1713 if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL || 1714 vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) { 1715 DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n", 1716 vdcp->instance); 1717 return (EPROTO); 1718 } 1719 1720 return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg)); 1721 } 1722 1723 1724 /* 1725 * Function: 1726 * vdc_init_dring_negotiate() 1727 * 1728 * Description: 1729 * 1730 * Arguments: 1731 * vdc - soft state pointer for this instance of the device driver. 
1732 * 1733 * Return Code: 1734 * 0 - Success 1735 */ 1736 static int 1737 vdc_init_dring_negotiate(vdc_t *vdc) 1738 { 1739 vio_dring_reg_msg_t pkt; 1740 size_t msglen = sizeof (pkt); 1741 int status = -1; 1742 int retry; 1743 int nretries = 10; 1744 1745 ASSERT(vdc != NULL); 1746 ASSERT(mutex_owned(&vdc->lock)); 1747 1748 for (retry = 0; retry < nretries; retry++) { 1749 status = vdc_init_descriptor_ring(vdc); 1750 if (status != EAGAIN) 1751 break; 1752 drv_usecwait(vdc_min_timeout_ldc); 1753 } 1754 1755 if (status != 0) { 1756 DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n", 1757 vdc->instance, status); 1758 return (status); 1759 } 1760 1761 DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n", 1762 vdc->instance, status); 1763 1764 /* fill in tag */ 1765 pkt.tag.vio_msgtype = VIO_TYPE_CTRL; 1766 pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 1767 pkt.tag.vio_subtype_env = VIO_DRING_REG; 1768 pkt.tag.vio_sid = vdc->session_id; 1769 /* fill in payload */ 1770 pkt.dring_ident = 0; 1771 pkt.num_descriptors = vdc->dring_len; 1772 pkt.descriptor_size = vdc->dring_entry_size; 1773 pkt.options = (VIO_TX_DRING | VIO_RX_DRING); 1774 pkt.ncookies = vdc->dring_cookie_count; 1775 pkt.cookie[0] = vdc->dring_cookie[0]; /* for now just one cookie */ 1776 1777 status = vdc_send(vdc, (caddr_t)&pkt, &msglen); 1778 if (status != 0) { 1779 DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)", 1780 vdc->instance, status); 1781 } 1782 1783 return (status); 1784 } 1785 1786 1787 /* 1788 * Function: 1789 * vdc_dring_negotiation() 1790 * 1791 * Description: 1792 * 1793 * Arguments: 1794 * vdc - soft state pointer for this instance of the device driver. 
1795 * 1796 * Return Code: 1797 * 0 - Success 1798 */ 1799 static int 1800 vdc_dring_negotiation(vdc_t *vdcp) 1801 { 1802 int status; 1803 vio_msg_t vio_msg; 1804 1805 if (status = vdc_init_dring_negotiate(vdcp)) 1806 return (status); 1807 1808 /* release lock and wait for response */ 1809 mutex_exit(&vdcp->lock); 1810 status = vdc_wait_for_response(vdcp, &vio_msg); 1811 mutex_enter(&vdcp->lock); 1812 if (status) { 1813 DMSG(vdcp, 0, 1814 "[%d] Failed waiting for Dring negotiation response," 1815 " rv(%d)", vdcp->instance, status); 1816 return (status); 1817 } 1818 1819 /* check type and sub_type ... */ 1820 if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL || 1821 vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) { 1822 DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n", 1823 vdcp->instance); 1824 return (EPROTO); 1825 } 1826 1827 return (vdc_handle_dring_reg_msg(vdcp, 1828 (vio_dring_reg_msg_t *)&vio_msg)); 1829 } 1830 1831 1832 /* 1833 * Function: 1834 * vdc_send_rdx() 1835 * 1836 * Description: 1837 * 1838 * Arguments: 1839 * vdc - soft state pointer for this instance of the device driver. 1840 * 1841 * Return Code: 1842 * 0 - Success 1843 */ 1844 static int 1845 vdc_send_rdx(vdc_t *vdcp) 1846 { 1847 vio_msg_t msg; 1848 size_t msglen = sizeof (vio_msg_t); 1849 int status; 1850 1851 /* 1852 * Send an RDX message to vds to indicate we are ready 1853 * to send data 1854 */ 1855 msg.tag.vio_msgtype = VIO_TYPE_CTRL; 1856 msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 1857 msg.tag.vio_subtype_env = VIO_RDX; 1858 msg.tag.vio_sid = vdcp->session_id; 1859 status = vdc_send(vdcp, (caddr_t)&msg, &msglen); 1860 if (status != 0) { 1861 DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)", 1862 vdcp->instance, status); 1863 } 1864 1865 return (status); 1866 } 1867 1868 /* 1869 * Function: 1870 * vdc_handle_rdx() 1871 * 1872 * Description: 1873 * 1874 * Arguments: 1875 * vdc - soft state pointer for this instance of the device driver. 
1876 * msgp - received msg 1877 * 1878 * Return Code: 1879 * 0 - Success 1880 */ 1881 static int 1882 vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp) 1883 { 1884 _NOTE(ARGUNUSED(vdcp)) 1885 _NOTE(ARGUNUSED(msgp)) 1886 1887 ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL); 1888 ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK); 1889 ASSERT(msgp->tag.vio_subtype_env == VIO_RDX); 1890 1891 DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance); 1892 1893 return (0); 1894 } 1895 1896 /* 1897 * Function: 1898 * vdc_rdx_exchange() 1899 * 1900 * Description: 1901 * 1902 * Arguments: 1903 * vdc - soft state pointer for this instance of the device driver. 1904 * 1905 * Return Code: 1906 * 0 - Success 1907 */ 1908 static int 1909 vdc_rdx_exchange(vdc_t *vdcp) 1910 { 1911 int status; 1912 vio_msg_t vio_msg; 1913 1914 if (status = vdc_send_rdx(vdcp)) 1915 return (status); 1916 1917 /* release lock and wait for response */ 1918 mutex_exit(&vdcp->lock); 1919 status = vdc_wait_for_response(vdcp, &vio_msg); 1920 mutex_enter(&vdcp->lock); 1921 if (status) { 1922 DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)", 1923 vdcp->instance, status); 1924 return (status); 1925 } 1926 1927 /* check type and sub_type ... 
*/ 1928 if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL || 1929 vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) { 1930 DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance); 1931 return (EPROTO); 1932 } 1933 1934 return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg)); 1935 } 1936 1937 1938 /* -------------------------------------------------------------------------- */ 1939 1940 /* 1941 * LDC helper routines 1942 */ 1943 1944 static int 1945 vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp) 1946 { 1947 int status; 1948 boolean_t q_has_pkts = B_FALSE; 1949 uint64_t delay_time; 1950 size_t len; 1951 1952 mutex_enter(&vdc->read_lock); 1953 1954 if (vdc->read_state == VDC_READ_IDLE) 1955 vdc->read_state = VDC_READ_WAITING; 1956 1957 while (vdc->read_state != VDC_READ_PENDING) { 1958 1959 /* detect if the connection has been reset */ 1960 if (vdc->read_state == VDC_READ_RESET) { 1961 status = ECONNRESET; 1962 goto done; 1963 } 1964 1965 cv_wait(&vdc->read_cv, &vdc->read_lock); 1966 } 1967 1968 /* 1969 * Until we get a blocking ldc read we have to retry 1970 * until the entire LDC message has arrived before 1971 * ldc_read() will succeed. Note we also bail out if 1972 * the channel is reset or goes away. 1973 */ 1974 delay_time = vdc_ldc_read_init_delay; 1975 loop: 1976 len = *nbytesp; 1977 status = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)msgp, &len); 1978 switch (status) { 1979 case EAGAIN: 1980 delay_time *= 2; 1981 if (delay_time >= vdc_ldc_read_max_delay) 1982 delay_time = vdc_ldc_read_max_delay; 1983 delay(delay_time); 1984 goto loop; 1985 1986 case 0: 1987 if (len == 0) { 1988 DMSG(vdc, 1, "[%d] ldc_read returned 0 bytes with " 1989 "no error!\n", vdc->instance); 1990 goto loop; 1991 } 1992 1993 *nbytesp = len; 1994 1995 /* 1996 * If there are pending messages, leave the 1997 * read state as pending. Otherwise, set the state 1998 * back to idle. 
1999 */ 2000 status = ldc_chkq(vdc->curr_server->ldc_handle, &q_has_pkts); 2001 if (status == 0 && !q_has_pkts) 2002 vdc->read_state = VDC_READ_IDLE; 2003 2004 break; 2005 default: 2006 DMSG(vdc, 0, "ldc_read returned %d\n", status); 2007 break; 2008 } 2009 2010 done: 2011 mutex_exit(&vdc->read_lock); 2012 2013 return (status); 2014 } 2015 2016 2017 2018 #ifdef DEBUG 2019 void 2020 vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg) 2021 { 2022 char *ms, *ss, *ses; 2023 switch (msg->tag.vio_msgtype) { 2024 #define Q(_s) case _s : ms = #_s; break; 2025 Q(VIO_TYPE_CTRL) 2026 Q(VIO_TYPE_DATA) 2027 Q(VIO_TYPE_ERR) 2028 #undef Q 2029 default: ms = "unknown"; break; 2030 } 2031 2032 switch (msg->tag.vio_subtype) { 2033 #define Q(_s) case _s : ss = #_s; break; 2034 Q(VIO_SUBTYPE_INFO) 2035 Q(VIO_SUBTYPE_ACK) 2036 Q(VIO_SUBTYPE_NACK) 2037 #undef Q 2038 default: ss = "unknown"; break; 2039 } 2040 2041 switch (msg->tag.vio_subtype_env) { 2042 #define Q(_s) case _s : ses = #_s; break; 2043 Q(VIO_VER_INFO) 2044 Q(VIO_ATTR_INFO) 2045 Q(VIO_DRING_REG) 2046 Q(VIO_DRING_UNREG) 2047 Q(VIO_RDX) 2048 Q(VIO_PKT_DATA) 2049 Q(VIO_DESC_DATA) 2050 Q(VIO_DRING_DATA) 2051 #undef Q 2052 default: ses = "unknown"; break; 2053 } 2054 2055 DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n", 2056 msg->tag.vio_msgtype, msg->tag.vio_subtype, 2057 msg->tag.vio_subtype_env, ms, ss, ses); 2058 } 2059 #endif 2060 2061 /* 2062 * Function: 2063 * vdc_send() 2064 * 2065 * Description: 2066 * The function encapsulates the call to write a message using LDC. 2067 * If LDC indicates that the call failed due to the queue being full, 2068 * we retry the ldc_write(), otherwise we return the error returned by LDC. 2069 * 2070 * Arguments: 2071 * ldc_handle - LDC handle for the channel this instance of vdc uses 2072 * pkt - address of LDC message to be sent 2073 * msglen - the size of the message being sent. When the function 2074 * returns, this contains the number of bytes written. 
2075 * 2076 * Return Code: 2077 * 0 - Success. 2078 * EINVAL - pkt or msglen were NULL 2079 * ECONNRESET - The connection was not up. 2080 * EWOULDBLOCK - LDC queue is full 2081 * xxx - other error codes returned by ldc_write 2082 */ 2083 static int 2084 vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen) 2085 { 2086 size_t size = 0; 2087 int status = 0; 2088 clock_t delay_ticks; 2089 2090 ASSERT(vdc != NULL); 2091 ASSERT(mutex_owned(&vdc->lock)); 2092 ASSERT(msglen != NULL); 2093 ASSERT(*msglen != 0); 2094 2095 #ifdef DEBUG 2096 vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt); 2097 #endif 2098 /* 2099 * Wait indefinitely to send if channel 2100 * is busy, but bail out if we succeed or 2101 * if the channel closes or is reset. 2102 */ 2103 delay_ticks = vdc_hz_min_ldc_delay; 2104 do { 2105 size = *msglen; 2106 status = ldc_write(vdc->curr_server->ldc_handle, pkt, &size); 2107 if (status == EWOULDBLOCK) { 2108 delay(delay_ticks); 2109 /* geometric backoff */ 2110 delay_ticks *= 2; 2111 if (delay_ticks > vdc_hz_max_ldc_delay) 2112 delay_ticks = vdc_hz_max_ldc_delay; 2113 } 2114 } while (status == EWOULDBLOCK); 2115 2116 /* if LDC had serious issues --- reset vdc state */ 2117 if (status == EIO || status == ECONNRESET) { 2118 /* LDC had serious issues --- reset vdc state */ 2119 mutex_enter(&vdc->read_lock); 2120 if ((vdc->read_state == VDC_READ_WAITING) || 2121 (vdc->read_state == VDC_READ_RESET)) 2122 cv_signal(&vdc->read_cv); 2123 vdc->read_state = VDC_READ_RESET; 2124 mutex_exit(&vdc->read_lock); 2125 2126 /* wake up any waiters in the reset thread */ 2127 if (vdc->state == VDC_STATE_INIT_WAITING) { 2128 DMSG(vdc, 0, "[%d] write reset - " 2129 "vdc is resetting ..\n", vdc->instance); 2130 vdc->state = VDC_STATE_RESETTING; 2131 cv_signal(&vdc->initwait_cv); 2132 } 2133 2134 return (ECONNRESET); 2135 } 2136 2137 /* return the last size written */ 2138 *msglen = size; 2139 2140 return (status); 2141 } 2142 2143 /* 2144 * Function: 2145 * vdc_get_md_node 2146 * 2147 * 
Description: 2148 * Get the MD, the device node for the given disk instance. The 2149 * caller is responsible for cleaning up the reference to the 2150 * returned MD (mdpp) by calling md_fini_handle(). 2151 * 2152 * Arguments: 2153 * dip - dev info pointer for this instance of the device driver. 2154 * mdpp - the returned MD. 2155 * vd_nodep - the returned device node. 2156 * 2157 * Return Code: 2158 * 0 - Success. 2159 * ENOENT - Expected node or property did not exist. 2160 * ENXIO - Unexpected error communicating with MD framework 2161 */ 2162 static int 2163 vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep) 2164 { 2165 int status = ENOENT; 2166 char *node_name = NULL; 2167 md_t *mdp = NULL; 2168 int num_nodes; 2169 int num_vdevs; 2170 mde_cookie_t rootnode; 2171 mde_cookie_t *listp = NULL; 2172 boolean_t found_inst = B_FALSE; 2173 int listsz; 2174 int idx; 2175 uint64_t md_inst; 2176 int obp_inst; 2177 int instance = ddi_get_instance(dip); 2178 2179 /* 2180 * Get the OBP instance number for comparison with the MD instance 2181 * 2182 * The "cfg-handle" property of a vdc node in an MD contains the MD's 2183 * notion of "instance", or unique identifier, for that node; OBP 2184 * stores the value of the "cfg-handle" MD property as the value of 2185 * the "reg" property on the node in the device tree it builds from 2186 * the MD and passes to Solaris. Thus, we look up the devinfo node's 2187 * "reg" property value to uniquely identify this device instance. 2188 * If the "reg" property cannot be found, the device tree state is 2189 * presumably so broken that there is no point in continuing. 
2190 */ 2191 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) { 2192 cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG); 2193 return (ENOENT); 2194 } 2195 obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2196 OBP_REG, -1); 2197 DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst); 2198 2199 /* 2200 * We now walk the MD nodes to find the node for this vdisk. 2201 */ 2202 if ((mdp = md_get_handle()) == NULL) { 2203 cmn_err(CE_WARN, "unable to init machine description"); 2204 return (ENXIO); 2205 } 2206 2207 num_nodes = md_node_count(mdp); 2208 ASSERT(num_nodes > 0); 2209 2210 listsz = num_nodes * sizeof (mde_cookie_t); 2211 2212 /* allocate memory for nodes */ 2213 listp = kmem_zalloc(listsz, KM_SLEEP); 2214 2215 rootnode = md_root_node(mdp); 2216 ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); 2217 2218 /* 2219 * Search for all the virtual devices, we will then check to see which 2220 * ones are disk nodes. 2221 */ 2222 num_vdevs = md_scan_dag(mdp, rootnode, 2223 md_find_name(mdp, VDC_MD_VDEV_NAME), 2224 md_find_name(mdp, "fwd"), listp); 2225 2226 if (num_vdevs <= 0) { 2227 cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); 2228 status = ENOENT; 2229 goto done; 2230 } 2231 2232 DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs); 2233 for (idx = 0; idx < num_vdevs; idx++) { 2234 status = md_get_prop_str(mdp, listp[idx], "name", &node_name); 2235 if ((status != 0) || (node_name == NULL)) { 2236 cmn_err(CE_NOTE, "Unable to get name of node type '%s'" 2237 ": err %d", VDC_MD_VDEV_NAME, status); 2238 continue; 2239 } 2240 2241 DMSGX(1, "[%d] Found node '%s'\n", instance, node_name); 2242 if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { 2243 status = md_get_prop_val(mdp, listp[idx], 2244 VDC_MD_CFG_HDL, &md_inst); 2245 DMSGX(1, "[%d] vdc inst in MD=%lx\n", 2246 instance, md_inst); 2247 if ((status == 0) && (md_inst == obp_inst)) { 2248 found_inst = B_TRUE; 2249 break; 2250 } 2251 } 2252 } 2253 2254 if (!found_inst) { 2255 
DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME); 2256 status = ENOENT; 2257 goto done; 2258 } 2259 DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst); 2260 2261 *vd_nodep = listp[idx]; 2262 *mdpp = mdp; 2263 done: 2264 kmem_free(listp, listsz); 2265 return (status); 2266 } 2267 2268 /* 2269 * Function: 2270 * vdc_init_ports 2271 * 2272 * Description: 2273 * Initialize all the ports for this vdisk instance. 2274 * 2275 * Arguments: 2276 * vdc - soft state pointer for this instance of the device driver. 2277 * mdp - md pointer 2278 * vd_nodep - device md node. 2279 * 2280 * Return Code: 2281 * 0 - Success. 2282 * ENOENT - Expected node or property did not exist. 2283 */ 2284 static int 2285 vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep) 2286 { 2287 int status = 0; 2288 int idx; 2289 int num_nodes; 2290 int num_vports; 2291 int num_chans; 2292 int listsz; 2293 mde_cookie_t vd_port; 2294 mde_cookie_t *chanp = NULL; 2295 mde_cookie_t *portp = NULL; 2296 vdc_server_t *srvr; 2297 vdc_server_t *prev_srvr = NULL; 2298 2299 /* 2300 * We now walk the MD nodes to find the port nodes for this vdisk. 
2301 */ 2302 num_nodes = md_node_count(mdp); 2303 ASSERT(num_nodes > 0); 2304 2305 listsz = num_nodes * sizeof (mde_cookie_t); 2306 2307 /* allocate memory for nodes */ 2308 portp = kmem_zalloc(listsz, KM_SLEEP); 2309 chanp = kmem_zalloc(listsz, KM_SLEEP); 2310 2311 num_vports = md_scan_dag(mdp, vd_nodep, 2312 md_find_name(mdp, VDC_MD_PORT_NAME), 2313 md_find_name(mdp, "fwd"), portp); 2314 if (num_vports == 0) { 2315 DMSGX(0, "Found no '%s' node for '%s' port\n", 2316 VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2317 status = ENOENT; 2318 goto done; 2319 } 2320 2321 DMSGX(1, "Found %d '%s' node(s) for '%s' port\n", 2322 num_vports, VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2323 2324 vdc->num_servers = 0; 2325 for (idx = 0; idx < num_vports; idx++) { 2326 2327 /* initialize this port */ 2328 vd_port = portp[idx]; 2329 srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP); 2330 srvr->vdcp = vdc; 2331 2332 /* get port id */ 2333 if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) { 2334 cmn_err(CE_NOTE, "vDisk port '%s' property not found", 2335 VDC_MD_ID); 2336 kmem_free(srvr, sizeof (vdc_server_t)); 2337 continue; 2338 } 2339 2340 /* set the connection timeout */ 2341 if (md_get_prop_val(mdp, vd_port, VDC_MD_TIMEOUT, 2342 &srvr->ctimeout) != 0) { 2343 srvr->ctimeout = 0; 2344 } 2345 2346 /* get the ldc id */ 2347 num_chans = md_scan_dag(mdp, vd_port, 2348 md_find_name(mdp, VDC_MD_CHAN_NAME), 2349 md_find_name(mdp, "fwd"), chanp); 2350 2351 /* expecting at least one channel */ 2352 if (num_chans <= 0) { 2353 cmn_err(CE_NOTE, "No '%s' node for '%s' port", 2354 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); 2355 kmem_free(srvr, sizeof (vdc_server_t)); 2356 continue; 2357 } else if (num_chans != 1) { 2358 DMSGX(0, "Expected 1 '%s' node for '%s' port, " 2359 "found %d\n", VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, 2360 num_chans); 2361 } 2362 2363 /* 2364 * We use the first channel found (index 0), irrespective of how 2365 * many are there in total. 
2366 */ 2367 if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, 2368 &srvr->ldc_id) != 0) { 2369 cmn_err(CE_NOTE, "Channel '%s' property not found", 2370 VDC_MD_ID); 2371 kmem_free(srvr, sizeof (vdc_server_t)); 2372 continue; 2373 } 2374 2375 /* 2376 * now initialise LDC channel which will be used to 2377 * communicate with this server 2378 */ 2379 if (vdc_do_ldc_init(vdc, srvr) != 0) { 2380 kmem_free(srvr, sizeof (vdc_server_t)); 2381 continue; 2382 } 2383 2384 /* add server to list */ 2385 if (prev_srvr) 2386 prev_srvr->next = srvr; 2387 else 2388 vdc->server_list = srvr; 2389 2390 prev_srvr = srvr; 2391 2392 /* inc numbers of servers */ 2393 vdc->num_servers++; 2394 } 2395 2396 /* 2397 * Adjust the max number of handshake retries to match 2398 * the number of vdisk servers. 2399 */ 2400 if (vdc_hshake_retries < vdc->num_servers) 2401 vdc_hshake_retries = vdc->num_servers; 2402 2403 /* pick first server as current server */ 2404 if (vdc->server_list != NULL) { 2405 vdc->curr_server = vdc->server_list; 2406 status = 0; 2407 } else { 2408 status = ENOENT; 2409 } 2410 2411 done: 2412 kmem_free(chanp, listsz); 2413 kmem_free(portp, listsz); 2414 return (status); 2415 } 2416 2417 2418 /* 2419 * Function: 2420 * vdc_do_ldc_up 2421 * 2422 * Description: 2423 * Bring the channel for the current server up. 2424 * 2425 * Arguments: 2426 * vdc - soft state pointer for this instance of the device driver. 2427 * 2428 * Return Code: 2429 * 0 - Success. 
2430 * EINVAL - Driver is detaching / LDC error 2431 * ECONNREFUSED - Other end is not listening 2432 */ 2433 static int 2434 vdc_do_ldc_up(vdc_t *vdc) 2435 { 2436 int status; 2437 ldc_status_t ldc_state; 2438 2439 ASSERT(MUTEX_HELD(&vdc->lock)); 2440 2441 DMSG(vdc, 0, "[%d] Bringing up channel %lx\n", 2442 vdc->instance, vdc->curr_server->ldc_id); 2443 2444 if (vdc->lifecycle == VDC_LC_DETACHING) 2445 return (EINVAL); 2446 2447 if ((status = ldc_up(vdc->curr_server->ldc_handle)) != 0) { 2448 switch (status) { 2449 case ECONNREFUSED: /* listener not ready at other end */ 2450 DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n", 2451 vdc->instance, vdc->curr_server->ldc_id, status); 2452 status = 0; 2453 break; 2454 default: 2455 DMSG(vdc, 0, "[%d] Failed to bring up LDC: " 2456 "channel=%ld, err=%d", vdc->instance, 2457 vdc->curr_server->ldc_id, status); 2458 break; 2459 } 2460 } 2461 2462 if (ldc_status(vdc->curr_server->ldc_handle, &ldc_state) == 0) { 2463 vdc->curr_server->ldc_state = ldc_state; 2464 if (ldc_state == LDC_UP) { 2465 DMSG(vdc, 0, "[%d] LDC channel already up\n", 2466 vdc->instance); 2467 vdc->seq_num = 1; 2468 vdc->seq_num_reply = 0; 2469 } 2470 } 2471 2472 return (status); 2473 } 2474 2475 /* 2476 * Function: 2477 * vdc_terminate_ldc() 2478 * 2479 * Description: 2480 * 2481 * Arguments: 2482 * vdc - soft state pointer for this instance of the device driver. 
2483 * srvr - vdc per-server info structure 2484 * 2485 * Return Code: 2486 * None 2487 */ 2488 static void 2489 vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr) 2490 { 2491 int instance = ddi_get_instance(vdc->dip); 2492 2493 if (srvr->state & VDC_LDC_OPEN) { 2494 DMSG(vdc, 0, "[%d] ldc_close()\n", instance); 2495 (void) ldc_close(srvr->ldc_handle); 2496 } 2497 if (srvr->state & VDC_LDC_CB) { 2498 DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); 2499 (void) ldc_unreg_callback(srvr->ldc_handle); 2500 } 2501 if (srvr->state & VDC_LDC_INIT) { 2502 DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); 2503 (void) ldc_fini(srvr->ldc_handle); 2504 srvr->ldc_handle = NULL; 2505 } 2506 2507 srvr->state &= ~(VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN); 2508 } 2509 2510 /* 2511 * Function: 2512 * vdc_fini_ports() 2513 * 2514 * Description: 2515 * Finalize all ports by closing the channel associated with each 2516 * port and also freeing the server structure. 2517 * 2518 * Arguments: 2519 * vdc - soft state pointer for this instance of the device driver. 
2520 * 2521 * Return Code: 2522 * None 2523 */ 2524 static void 2525 vdc_fini_ports(vdc_t *vdc) 2526 { 2527 int instance = ddi_get_instance(vdc->dip); 2528 vdc_server_t *srvr, *prev_srvr; 2529 2530 ASSERT(vdc != NULL); 2531 ASSERT(mutex_owned(&vdc->lock)); 2532 2533 DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized); 2534 2535 srvr = vdc->server_list; 2536 2537 while (srvr) { 2538 2539 vdc_terminate_ldc(vdc, srvr); 2540 2541 /* next server */ 2542 prev_srvr = srvr; 2543 srvr = srvr->next; 2544 2545 /* free server */ 2546 kmem_free(prev_srvr, sizeof (vdc_server_t)); 2547 } 2548 2549 vdc->server_list = NULL; 2550 } 2551 2552 /* -------------------------------------------------------------------------- */ 2553 2554 /* 2555 * Descriptor Ring helper routines 2556 */ 2557 2558 /* 2559 * Function: 2560 * vdc_init_descriptor_ring() 2561 * 2562 * Description: 2563 * 2564 * Arguments: 2565 * vdc - soft state pointer for this instance of the device driver. 2566 * 2567 * Return Code: 2568 * 0 - Success 2569 */ 2570 static int 2571 vdc_init_descriptor_ring(vdc_t *vdc) 2572 { 2573 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2574 int status = 0; 2575 int i; 2576 2577 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2578 2579 ASSERT(vdc != NULL); 2580 ASSERT(mutex_owned(&vdc->lock)); 2581 2582 /* ensure we have enough room to store max sized block */ 2583 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2584 2585 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2586 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2587 /* 2588 * Calculate the maximum block size we can transmit using one 2589 * Descriptor Ring entry from the attributes returned by the 2590 * vDisk server. This is subject to a minimum of 'maxphys' 2591 * as we do not have the capability to split requests over 2592 * multiple DRing entries. 
2593 */ 2594 if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) { 2595 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2596 vdc->instance); 2597 vdc->dring_max_cookies = maxphys / PAGESIZE; 2598 } else { 2599 vdc->dring_max_cookies = 2600 (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE; 2601 } 2602 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2603 (sizeof (ldc_mem_cookie_t) * 2604 (vdc->dring_max_cookies - 1))); 2605 vdc->dring_len = VD_DRING_LEN; 2606 2607 status = ldc_mem_dring_create(vdc->dring_len, 2608 vdc->dring_entry_size, &vdc->dring_hdl); 2609 if ((vdc->dring_hdl == NULL) || (status != 0)) { 2610 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2611 vdc->instance); 2612 return (status); 2613 } 2614 vdc->initialized |= VDC_DRING_INIT; 2615 } 2616 2617 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2618 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2619 vdc->dring_cookie = 2620 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2621 2622 status = ldc_mem_dring_bind(vdc->curr_server->ldc_handle, 2623 vdc->dring_hdl, 2624 LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2625 &vdc->dring_cookie[0], 2626 &vdc->dring_cookie_count); 2627 if (status != 0) { 2628 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2629 "(%lx) to channel (%lx) status=%d\n", 2630 vdc->instance, vdc->dring_hdl, 2631 vdc->curr_server->ldc_handle, status); 2632 return (status); 2633 } 2634 ASSERT(vdc->dring_cookie_count == 1); 2635 vdc->initialized |= VDC_DRING_BOUND; 2636 } 2637 2638 status = ldc_mem_dring_info(vdc->dring_hdl, &vdc->dring_mem_info); 2639 if (status != 0) { 2640 DMSG(vdc, 0, 2641 "[%d] Failed to get info for descriptor ring (%lx)\n", 2642 vdc->instance, vdc->dring_hdl); 2643 return (status); 2644 } 2645 2646 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2647 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2648 2649 /* Allocate the local copy of this dring */ 2650 vdc->local_dring = 2651 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 
2652 KM_SLEEP); 2653 vdc->initialized |= VDC_DRING_LOCAL; 2654 } 2655 2656 /* 2657 * Mark all DRing entries as free and initialize the private 2658 * descriptor's memory handles. If any entry is initialized, 2659 * we need to free it later so we set the bit in 'initialized' 2660 * at the start. 2661 */ 2662 vdc->initialized |= VDC_DRING_ENTRY; 2663 for (i = 0; i < vdc->dring_len; i++) { 2664 dep = VDC_GET_DRING_ENTRY_PTR(vdc, i); 2665 dep->hdr.dstate = VIO_DESC_FREE; 2666 2667 status = ldc_mem_alloc_handle(vdc->curr_server->ldc_handle, 2668 &vdc->local_dring[i].desc_mhdl); 2669 if (status != 0) { 2670 DMSG(vdc, 0, "![%d] Failed to alloc mem handle for" 2671 " descriptor %d", vdc->instance, i); 2672 return (status); 2673 } 2674 vdc->local_dring[i].is_free = B_TRUE; 2675 vdc->local_dring[i].dep = dep; 2676 } 2677 2678 /* Initialize the starting index */ 2679 vdc->dring_curr_idx = 0; 2680 2681 return (status); 2682 } 2683 2684 /* 2685 * Function: 2686 * vdc_destroy_descriptor_ring() 2687 * 2688 * Description: 2689 * 2690 * Arguments: 2691 * vdc - soft state pointer for this instance of the device driver. 
2692 * 2693 * Return Code: 2694 * None 2695 */ 2696 static void 2697 vdc_destroy_descriptor_ring(vdc_t *vdc) 2698 { 2699 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 2700 ldc_mem_handle_t mhdl = NULL; 2701 ldc_mem_info_t minfo; 2702 int status = -1; 2703 int i; /* loop */ 2704 2705 ASSERT(vdc != NULL); 2706 ASSERT(mutex_owned(&vdc->lock)); 2707 2708 DMSG(vdc, 0, "[%d] Entered\n", vdc->instance); 2709 2710 if (vdc->initialized & VDC_DRING_ENTRY) { 2711 DMSG(vdc, 0, 2712 "[%d] Removing Local DRing entries\n", vdc->instance); 2713 for (i = 0; i < vdc->dring_len; i++) { 2714 ldep = &vdc->local_dring[i]; 2715 mhdl = ldep->desc_mhdl; 2716 2717 if (mhdl == NULL) 2718 continue; 2719 2720 if ((status = ldc_mem_info(mhdl, &minfo)) != 0) { 2721 DMSG(vdc, 0, 2722 "ldc_mem_info returned an error: %d\n", 2723 status); 2724 2725 /* 2726 * This must mean that the mem handle 2727 * is not valid. Clear it out so that 2728 * no one tries to use it. 2729 */ 2730 ldep->desc_mhdl = NULL; 2731 continue; 2732 } 2733 2734 if (minfo.status == LDC_BOUND) { 2735 (void) ldc_mem_unbind_handle(mhdl); 2736 } 2737 2738 (void) ldc_mem_free_handle(mhdl); 2739 2740 ldep->desc_mhdl = NULL; 2741 } 2742 vdc->initialized &= ~VDC_DRING_ENTRY; 2743 } 2744 2745 if (vdc->initialized & VDC_DRING_LOCAL) { 2746 DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance); 2747 kmem_free(vdc->local_dring, 2748 vdc->dring_len * sizeof (vdc_local_desc_t)); 2749 vdc->initialized &= ~VDC_DRING_LOCAL; 2750 } 2751 2752 if (vdc->initialized & VDC_DRING_BOUND) { 2753 DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance); 2754 status = ldc_mem_dring_unbind(vdc->dring_hdl); 2755 if (status == 0) { 2756 vdc->initialized &= ~VDC_DRING_BOUND; 2757 } else { 2758 DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx", 2759 vdc->instance, status, vdc->dring_hdl); 2760 } 2761 kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t)); 2762 } 2763 2764 if (vdc->initialized & VDC_DRING_INIT) { 2765 DMSG(vdc, 0, "[%d] 
Destroying DRing\n", vdc->instance); 2766 status = ldc_mem_dring_destroy(vdc->dring_hdl); 2767 if (status == 0) { 2768 vdc->dring_hdl = NULL; 2769 bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t)); 2770 vdc->initialized &= ~VDC_DRING_INIT; 2771 } else { 2772 DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)", 2773 vdc->instance, status, vdc->dring_hdl); 2774 } 2775 } 2776 } 2777 2778 /* 2779 * Function: 2780 * vdc_map_to_shared_dring() 2781 * 2782 * Description: 2783 * Copy contents of the local descriptor to the shared 2784 * memory descriptor. 2785 * 2786 * Arguments: 2787 * vdcp - soft state pointer for this instance of the device driver. 2788 * idx - descriptor ring index 2789 * 2790 * Return Code: 2791 * None 2792 */ 2793 static int 2794 vdc_map_to_shared_dring(vdc_t *vdcp, int idx) 2795 { 2796 vdc_local_desc_t *ldep; 2797 vd_dring_entry_t *dep; 2798 int rv; 2799 2800 ldep = &(vdcp->local_dring[idx]); 2801 2802 /* for now leave in the old pop_mem_hdl stuff */ 2803 if (ldep->nbytes > 0) { 2804 rv = vdc_populate_mem_hdl(vdcp, ldep); 2805 if (rv) { 2806 DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n", 2807 vdcp->instance); 2808 return (rv); 2809 } 2810 } 2811 2812 /* 2813 * fill in the data details into the DRing 2814 */ 2815 dep = ldep->dep; 2816 ASSERT(dep != NULL); 2817 2818 dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp); 2819 dep->payload.operation = ldep->operation; 2820 dep->payload.addr = ldep->offset; 2821 dep->payload.nbytes = ldep->nbytes; 2822 dep->payload.status = (uint32_t)-1; /* vds will set valid value */ 2823 dep->payload.slice = ldep->slice; 2824 dep->hdr.dstate = VIO_DESC_READY; 2825 dep->hdr.ack = 1; /* request an ACK for every message */ 2826 2827 return (0); 2828 } 2829 2830 /* 2831 * Function: 2832 * vdc_send_request 2833 * 2834 * Description: 2835 * This routine writes the data to be transmitted to vds into the 2836 * descriptor, notifies vds that the ring has been updated and 2837 * then waits for the request to be processed. 
2838 * 2839 * Arguments: 2840 * vdcp - the soft state pointer 2841 * operation - operation we want vds to perform (VD_OP_XXX) 2842 * addr - address of data buf to be read/written. 2843 * nbytes - number of bytes to read/write 2844 * slice - the disk slice this request is for 2845 * offset - relative disk offset 2846 * cb_type - type of call - STRATEGY or SYNC 2847 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2848 * . mode for ioctl(9e) 2849 * . LP64 diskaddr_t (block I/O) 2850 * dir - direction of operation (READ/WRITE/BOTH) 2851 * 2852 * Return Codes: 2853 * 0 2854 * ENXIO 2855 */ 2856 static int 2857 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr, 2858 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2859 void *cb_arg, vio_desc_direction_t dir) 2860 { 2861 int rv = 0; 2862 2863 ASSERT(vdcp != NULL); 2864 ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR); 2865 2866 mutex_enter(&vdcp->lock); 2867 2868 /* 2869 * If this is a block read/write operation we update the I/O statistics 2870 * to indicate that the request is being put on the waitq to be 2871 * serviced. 2872 * 2873 * We do it here (a common routine for both synchronous and strategy 2874 * calls) for performance reasons - we are already holding vdc->lock 2875 * so there is no extra locking overhead. We would have to explicitly 2876 * grab the 'lock' mutex to update the stats if we were to do this 2877 * higher up the stack in vdc_strategy() et. al. 
2878 */ 2879 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2880 DTRACE_IO1(start, buf_t *, cb_arg); 2881 VD_KSTAT_WAITQ_ENTER(vdcp); 2882 } 2883 2884 do { 2885 while (vdcp->state != VDC_STATE_RUNNING) { 2886 2887 /* return error if detaching */ 2888 if (vdcp->state == VDC_STATE_DETACH) { 2889 rv = ENXIO; 2890 goto done; 2891 } 2892 2893 /* fail request if connection timeout is reached */ 2894 if (vdcp->ctimeout_reached) { 2895 rv = EIO; 2896 goto done; 2897 } 2898 2899 /* 2900 * If we are panicking and the disk is not ready then 2901 * we can't send any request because we can't complete 2902 * the handshake now. 2903 */ 2904 if (ddi_in_panic()) { 2905 rv = EIO; 2906 goto done; 2907 } 2908 2909 cv_wait(&vdcp->running_cv, &vdcp->lock); 2910 } 2911 2912 } while (vdc_populate_descriptor(vdcp, operation, addr, 2913 nbytes, slice, offset, cb_type, cb_arg, dir)); 2914 2915 done: 2916 /* 2917 * If this is a block read/write we update the I/O statistics kstat 2918 * to indicate that this request has been placed on the queue for 2919 * processing (i.e sent to the vDisk server) - iostat(1M) will 2920 * report the time waiting for the vDisk server under the %b column 2921 * In the case of an error we simply take it off the wait queue. 2922 */ 2923 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2924 if (rv == 0) { 2925 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 2926 DTRACE_PROBE1(send, buf_t *, cb_arg); 2927 } else { 2928 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 2929 VD_KSTAT_WAITQ_EXIT(vdcp); 2930 DTRACE_IO1(done, buf_t *, cb_arg); 2931 } 2932 } 2933 2934 mutex_exit(&vdcp->lock); 2935 2936 return (rv); 2937 } 2938 2939 2940 /* 2941 * Function: 2942 * vdc_populate_descriptor 2943 * 2944 * Description: 2945 * This routine writes the data to be transmitted to vds into the 2946 * descriptor, notifies vds that the ring has been updated and 2947 * then waits for the request to be processed. 
2948 * 2949 * Arguments: 2950 * vdcp - the soft state pointer 2951 * operation - operation we want vds to perform (VD_OP_XXX) 2952 * addr - address of data buf to be read/written. 2953 * nbytes - number of bytes to read/write 2954 * slice - the disk slice this request is for 2955 * offset - relative disk offset 2956 * cb_type - type of call - STRATEGY or SYNC 2957 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2958 * . mode for ioctl(9e) 2959 * . LP64 diskaddr_t (block I/O) 2960 * dir - direction of operation (READ/WRITE/BOTH) 2961 * 2962 * Return Codes: 2963 * 0 2964 * EAGAIN 2965 * ECONNRESET 2966 * ENXIO 2967 */ 2968 static int 2969 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 2970 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2971 void *cb_arg, vio_desc_direction_t dir) 2972 { 2973 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 2974 int idx; /* Index of DRing entry used */ 2975 int next_idx; 2976 vio_dring_msg_t dmsg; 2977 size_t msglen; 2978 int rv; 2979 2980 ASSERT(MUTEX_HELD(&vdcp->lock)); 2981 vdcp->threads_pending++; 2982 loop: 2983 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 2984 2985 /* Get next available D-Ring entry */ 2986 idx = vdcp->dring_curr_idx; 2987 local_dep = &(vdcp->local_dring[idx]); 2988 2989 if (!local_dep->is_free) { 2990 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 2991 vdcp->instance); 2992 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 2993 if (vdcp->state == VDC_STATE_RUNNING || 2994 vdcp->state == VDC_STATE_HANDLE_PENDING) { 2995 goto loop; 2996 } 2997 vdcp->threads_pending--; 2998 return (ECONNRESET); 2999 } 3000 3001 next_idx = idx + 1; 3002 if (next_idx >= vdcp->dring_len) 3003 next_idx = 0; 3004 vdcp->dring_curr_idx = next_idx; 3005 3006 ASSERT(local_dep->is_free); 3007 3008 local_dep->operation = operation; 3009 local_dep->addr = addr; 3010 local_dep->nbytes = nbytes; 3011 local_dep->slice = slice; 3012 local_dep->offset = offset; 
3013 local_dep->cb_type = cb_type; 3014 local_dep->cb_arg = cb_arg; 3015 local_dep->dir = dir; 3016 3017 local_dep->is_free = B_FALSE; 3018 3019 rv = vdc_map_to_shared_dring(vdcp, idx); 3020 if (rv) { 3021 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 3022 vdcp->instance); 3023 /* free the descriptor */ 3024 local_dep->is_free = B_TRUE; 3025 vdcp->dring_curr_idx = idx; 3026 cv_wait(&vdcp->membind_cv, &vdcp->lock); 3027 if (vdcp->state == VDC_STATE_RUNNING || 3028 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3029 goto loop; 3030 } 3031 vdcp->threads_pending--; 3032 return (ECONNRESET); 3033 } 3034 3035 /* 3036 * Send a msg with the DRing details to vds 3037 */ 3038 VIO_INIT_DRING_DATA_TAG(dmsg); 3039 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 3040 dmsg.dring_ident = vdcp->dring_ident; 3041 dmsg.start_idx = idx; 3042 dmsg.end_idx = idx; 3043 vdcp->seq_num++; 3044 3045 DTRACE_PROBE2(populate, int, vdcp->instance, 3046 vdc_local_desc_t *, local_dep); 3047 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 3048 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 3049 3050 /* 3051 * note we're still holding the lock here to 3052 * make sure the message goes out in order !!!... 3053 */ 3054 msglen = sizeof (dmsg); 3055 rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen); 3056 switch (rv) { 3057 case ECONNRESET: 3058 /* 3059 * vdc_send initiates the reset on failure. 3060 * Since the transaction has already been put 3061 * on the local dring, it will automatically get 3062 * retried when the channel is reset. Given that, 3063 * it is ok to just return success even though the 3064 * send failed. 
3065 */ 3066 rv = 0; 3067 break; 3068 3069 case 0: /* EOK */ 3070 DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv); 3071 break; 3072 3073 default: 3074 goto cleanup_and_exit; 3075 } 3076 3077 vdcp->threads_pending--; 3078 return (rv); 3079 3080 cleanup_and_exit: 3081 DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv); 3082 return (ENXIO); 3083 } 3084 3085 /* 3086 * Function: 3087 * vdc_do_sync_op 3088 * 3089 * Description: 3090 * Wrapper around vdc_populate_descriptor that blocks until the 3091 * response to the message is available. 3092 * 3093 * Arguments: 3094 * vdcp - the soft state pointer 3095 * operation - operation we want vds to perform (VD_OP_XXX) 3096 * addr - address of data buf to be read/written. 3097 * nbytes - number of bytes to read/write 3098 * slice - the disk slice this request is for 3099 * offset - relative disk offset 3100 * cb_type - type of call - STRATEGY or SYNC 3101 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3102 * . mode for ioctl(9e) 3103 * . LP64 diskaddr_t (block I/O) 3104 * dir - direction of operation (READ/WRITE/BOTH) 3105 * rconflict - check for reservation conflict in case of failure 3106 * 3107 * rconflict should be set to B_TRUE by most callers. Callers invoking the 3108 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 3109 * result of a successful operation with vd_scsi_status(). 3110 * 3111 * Return Codes: 3112 * 0 3113 * EAGAIN 3114 * EFAULT 3115 * ENXIO 3116 * EIO 3117 */ 3118 static int 3119 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 3120 int slice, diskaddr_t offset, int cb_type, void *cb_arg, 3121 vio_desc_direction_t dir, boolean_t rconflict) 3122 { 3123 int status; 3124 vdc_io_t *vio; 3125 boolean_t check_resv_conflict = B_FALSE; 3126 3127 ASSERT(cb_type == CB_SYNC); 3128 3129 /* 3130 * Grab the lock, if blocked wait until the server 3131 * response causes us to wake up again. 
3132 */ 3133 mutex_enter(&vdcp->lock); 3134 vdcp->sync_op_cnt++; 3135 while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH) 3136 cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock); 3137 3138 if (vdcp->state == VDC_STATE_DETACH) { 3139 cv_broadcast(&vdcp->sync_blocked_cv); 3140 vdcp->sync_op_cnt--; 3141 mutex_exit(&vdcp->lock); 3142 return (ENXIO); 3143 } 3144 3145 /* now block anyone other thread entering after us */ 3146 vdcp->sync_op_blocked = B_TRUE; 3147 vdcp->sync_op_pending = B_TRUE; 3148 mutex_exit(&vdcp->lock); 3149 3150 status = vdc_send_request(vdcp, operation, addr, 3151 nbytes, slice, offset, cb_type, cb_arg, dir); 3152 3153 mutex_enter(&vdcp->lock); 3154 3155 if (status != 0) { 3156 vdcp->sync_op_pending = B_FALSE; 3157 } else { 3158 /* 3159 * block until our transaction completes. 3160 * Also anyone else waiting also gets to go next. 3161 */ 3162 while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH) 3163 cv_wait(&vdcp->sync_pending_cv, &vdcp->lock); 3164 3165 DMSG(vdcp, 2, ": operation returned %d\n", 3166 vdcp->sync_op_status); 3167 if (vdcp->state == VDC_STATE_DETACH) { 3168 vdcp->sync_op_pending = B_FALSE; 3169 status = ENXIO; 3170 } else { 3171 status = vdcp->sync_op_status; 3172 if (status != 0 && vdcp->failfast_interval != 0) { 3173 /* 3174 * Operation has failed and failfast is enabled. 3175 * We need to check if the failure is due to a 3176 * reservation conflict if this was requested. 3177 */ 3178 check_resv_conflict = rconflict; 3179 } 3180 3181 } 3182 } 3183 3184 vdcp->sync_op_status = 0; 3185 vdcp->sync_op_blocked = B_FALSE; 3186 vdcp->sync_op_cnt--; 3187 3188 /* signal the next waiting thread */ 3189 cv_signal(&vdcp->sync_blocked_cv); 3190 3191 /* 3192 * We have to check for reservation conflict after unblocking sync 3193 * operations because some sync operations will be used to do this 3194 * check. 
3195 */ 3196 if (check_resv_conflict) { 3197 vio = vdc_failfast_io_queue(vdcp, NULL); 3198 while (vio->vio_qtime != 0) 3199 cv_wait(&vdcp->failfast_io_cv, &vdcp->lock); 3200 kmem_free(vio, sizeof (vdc_io_t)); 3201 } 3202 3203 mutex_exit(&vdcp->lock); 3204 3205 return (status); 3206 } 3207 3208 3209 /* 3210 * Function: 3211 * vdc_drain_response() 3212 * 3213 * Description: 3214 * When a guest is panicking, the completion of requests needs to be 3215 * handled differently because interrupts are disabled and vdc 3216 * will not get messages. We have to poll for the messages instead. 3217 * 3218 * Note: since we are panicking we don't implement the io:::done 3219 * DTrace probe or update the I/O statistics kstats. 3220 * 3221 * Arguments: 3222 * vdc - soft state pointer for this instance of the device driver. 3223 * buf - if buf is NULL then we drain all responses, otherwise we 3224 * poll until we receive a ACK/NACK for the specific I/O 3225 * described by buf. 3226 * 3227 * Return Code: 3228 * 0 - Success 3229 */ 3230 static int 3231 vdc_drain_response(vdc_t *vdc, struct buf *buf) 3232 { 3233 int rv, idx, retries; 3234 size_t msglen; 3235 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3236 vio_dring_msg_t dmsg; 3237 struct buf *mbuf; 3238 3239 mutex_enter(&vdc->lock); 3240 3241 retries = 0; 3242 for (;;) { 3243 msglen = sizeof (dmsg); 3244 rv = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)&dmsg, 3245 &msglen); 3246 if (rv) { 3247 rv = EINVAL; 3248 break; 3249 } 3250 3251 /* 3252 * if there are no packets wait and check again 3253 */ 3254 if ((rv == 0) && (msglen == 0)) { 3255 if (retries++ > vdc_dump_retries) { 3256 rv = EAGAIN; 3257 break; 3258 } 3259 3260 drv_usecwait(vdc_usec_timeout_dump); 3261 continue; 3262 } 3263 3264 /* 3265 * Ignore all messages that are not ACKs/NACKs to 3266 * DRing requests. 
3267 */ 3268 if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) || 3269 (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) { 3270 DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n", 3271 dmsg.tag.vio_msgtype, 3272 dmsg.tag.vio_subtype, 3273 dmsg.tag.vio_subtype_env); 3274 continue; 3275 } 3276 3277 /* 3278 * set the appropriate return value for the current request. 3279 */ 3280 switch (dmsg.tag.vio_subtype) { 3281 case VIO_SUBTYPE_ACK: 3282 rv = 0; 3283 break; 3284 case VIO_SUBTYPE_NACK: 3285 rv = EAGAIN; 3286 break; 3287 default: 3288 continue; 3289 } 3290 3291 idx = dmsg.start_idx; 3292 if (idx >= vdc->dring_len) { 3293 DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n", 3294 vdc->instance, idx); 3295 continue; 3296 } 3297 ldep = &vdc->local_dring[idx]; 3298 if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { 3299 DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n", 3300 vdc->instance, idx, ldep->dep->hdr.dstate); 3301 continue; 3302 } 3303 3304 if (buf != NULL && ldep->cb_type == CB_STRATEGY) { 3305 mbuf = ldep->cb_arg; 3306 mbuf->b_resid = mbuf->b_bcount - 3307 ldep->dep->payload.nbytes; 3308 bioerror(mbuf, (rv == EAGAIN)? EIO: 3309 ldep->dep->payload.status); 3310 biodone(mbuf); 3311 } else { 3312 mbuf = NULL; 3313 } 3314 3315 DMSG(vdc, 1, "[%d] Depopulating idx=%d state=%d\n", 3316 vdc->instance, idx, ldep->dep->hdr.dstate); 3317 3318 rv = vdc_depopulate_descriptor(vdc, idx); 3319 if (rv) { 3320 DMSG(vdc, 0, 3321 "[%d] Entry @ %d - depopulate failed ..\n", 3322 vdc->instance, idx); 3323 } 3324 3325 /* we have received an ACK/NACK for the specified buffer */ 3326 if (buf != NULL && buf == mbuf) { 3327 rv = 0; 3328 break; 3329 } 3330 3331 /* if this is the last descriptor - break out of loop */ 3332 if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) { 3333 if (buf != NULL) { 3334 /* 3335 * We never got a response for the specified 3336 * buffer so we fail the I/O. 
3337 */ 3338 bioerror(buf, EIO); 3339 biodone(buf); 3340 } 3341 break; 3342 } 3343 } 3344 3345 mutex_exit(&vdc->lock); 3346 DMSG(vdc, 0, "End idx=%d\n", idx); 3347 3348 return (rv); 3349 } 3350 3351 3352 /* 3353 * Function: 3354 * vdc_depopulate_descriptor() 3355 * 3356 * Description: 3357 * 3358 * Arguments: 3359 * vdc - soft state pointer for this instance of the device driver. 3360 * idx - Index of the Descriptor Ring entry being modified 3361 * 3362 * Return Code: 3363 * 0 - Success 3364 */ 3365 static int 3366 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) 3367 { 3368 vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ 3369 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3370 int status = ENXIO; 3371 int rv = 0; 3372 3373 ASSERT(vdc != NULL); 3374 ASSERT(idx < vdc->dring_len); 3375 ldep = &vdc->local_dring[idx]; 3376 ASSERT(ldep != NULL); 3377 ASSERT(MUTEX_HELD(&vdc->lock)); 3378 3379 DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep); 3380 DMSG(vdc, 2, ": idx = %d\n", idx); 3381 3382 dep = ldep->dep; 3383 ASSERT(dep != NULL); 3384 ASSERT((dep->hdr.dstate == VIO_DESC_DONE) || 3385 (dep->payload.status == ECANCELED)); 3386 3387 VDC_MARK_DRING_ENTRY_FREE(vdc, idx); 3388 3389 ldep->is_free = B_TRUE; 3390 status = dep->payload.status; 3391 DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status); 3392 3393 /* 3394 * If no buffers were used to transfer information to the server when 3395 * populating the descriptor then no memory handles need to be unbound 3396 * and we can return now. 3397 */ 3398 if (ldep->nbytes == 0) { 3399 cv_signal(&vdc->dring_free_cv); 3400 return (status); 3401 } 3402 3403 /* 3404 * If the upper layer passed in a misaligned address we copied the 3405 * data into an aligned buffer before sending it to LDC - we now 3406 * copy it back to the original buffer. 
3407 */ 3408 if (ldep->align_addr) { 3409 ASSERT(ldep->addr != NULL); 3410 3411 if (dep->payload.nbytes > 0) 3412 bcopy(ldep->align_addr, ldep->addr, 3413 dep->payload.nbytes); 3414 kmem_free(ldep->align_addr, 3415 sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8)); 3416 ldep->align_addr = NULL; 3417 } 3418 3419 rv = ldc_mem_unbind_handle(ldep->desc_mhdl); 3420 if (rv != 0) { 3421 DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)", 3422 vdc->instance, ldep->desc_mhdl, idx, rv); 3423 /* 3424 * The error returned by the vDisk server is more informative 3425 * and thus has a higher priority but if it isn't set we ensure 3426 * that this function returns an error. 3427 */ 3428 if (status == 0) 3429 status = EINVAL; 3430 } 3431 3432 cv_signal(&vdc->membind_cv); 3433 cv_signal(&vdc->dring_free_cv); 3434 3435 return (status); 3436 } 3437 3438 /* 3439 * Function: 3440 * vdc_populate_mem_hdl() 3441 * 3442 * Description: 3443 * 3444 * Arguments: 3445 * vdc - soft state pointer for this instance of the device driver. 
3446 * idx - Index of the Descriptor Ring entry being modified 3447 * addr - virtual address being mapped in 3448 * nybtes - number of bytes in 'addr' 3449 * operation - the vDisk operation being performed (VD_OP_xxx) 3450 * 3451 * Return Code: 3452 * 0 - Success 3453 */ 3454 static int 3455 vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep) 3456 { 3457 vd_dring_entry_t *dep = NULL; 3458 ldc_mem_handle_t mhdl; 3459 caddr_t vaddr; 3460 size_t nbytes; 3461 uint8_t perm = LDC_MEM_RW; 3462 uint8_t maptype; 3463 int rv = 0; 3464 int i; 3465 3466 ASSERT(vdcp != NULL); 3467 3468 dep = ldep->dep; 3469 mhdl = ldep->desc_mhdl; 3470 3471 switch (ldep->dir) { 3472 case VIO_read_dir: 3473 perm = LDC_MEM_W; 3474 break; 3475 3476 case VIO_write_dir: 3477 perm = LDC_MEM_R; 3478 break; 3479 3480 case VIO_both_dir: 3481 perm = LDC_MEM_RW; 3482 break; 3483 3484 default: 3485 ASSERT(0); /* catch bad programming in vdc */ 3486 } 3487 3488 /* 3489 * LDC expects any addresses passed in to be 8-byte aligned. 
We need 3490 * to copy the contents of any misaligned buffers to a newly allocated 3491 * buffer and bind it instead (and copy the the contents back to the 3492 * original buffer passed in when depopulating the descriptor) 3493 */ 3494 vaddr = ldep->addr; 3495 nbytes = ldep->nbytes; 3496 if (((uint64_t)vaddr & 0x7) != 0) { 3497 ASSERT(ldep->align_addr == NULL); 3498 ldep->align_addr = 3499 kmem_alloc(sizeof (caddr_t) * 3500 P2ROUNDUP(nbytes, 8), KM_SLEEP); 3501 DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating " 3502 "(buf=%p nb=%ld op=%d)\n", 3503 vdcp->instance, (void *)vaddr, (void *)ldep->align_addr, 3504 nbytes, ldep->operation); 3505 if (perm != LDC_MEM_W) 3506 bcopy(vaddr, ldep->align_addr, nbytes); 3507 vaddr = ldep->align_addr; 3508 } 3509 3510 maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP; 3511 rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8), 3512 maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies); 3513 DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n", 3514 vdcp->instance, dep->payload.ncookies); 3515 if (rv != 0) { 3516 DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle " 3517 "(mhdl=%p, buf=%p, err=%d)\n", 3518 vdcp->instance, (void *)mhdl, (void *)vaddr, rv); 3519 if (ldep->align_addr) { 3520 kmem_free(ldep->align_addr, 3521 sizeof (caddr_t) * P2ROUNDUP(nbytes, 8)); 3522 ldep->align_addr = NULL; 3523 } 3524 return (EAGAIN); 3525 } 3526 3527 /* 3528 * Get the other cookies (if any). 
3529 */ 3530 for (i = 1; i < dep->payload.ncookies; i++) { 3531 rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]); 3532 if (rv != 0) { 3533 (void) ldc_mem_unbind_handle(mhdl); 3534 DMSG(vdcp, 0, "?[%d] Failed to get next cookie " 3535 "(mhdl=%lx cnum=%d), err=%d", 3536 vdcp->instance, mhdl, i, rv); 3537 if (ldep->align_addr) { 3538 kmem_free(ldep->align_addr, 3539 sizeof (caddr_t) * ldep->nbytes); 3540 ldep->align_addr = NULL; 3541 } 3542 return (EAGAIN); 3543 } 3544 } 3545 3546 return (rv); 3547 } 3548 3549 /* 3550 * Interrupt handlers for messages from LDC 3551 */ 3552 3553 /* 3554 * Function: 3555 * vdc_handle_cb() 3556 * 3557 * Description: 3558 * 3559 * Arguments: 3560 * event - Type of event (LDC_EVT_xxx) that triggered the callback 3561 * arg - soft state pointer for this instance of the device driver. 3562 * 3563 * Return Code: 3564 * 0 - Success 3565 */ 3566 static uint_t 3567 vdc_handle_cb(uint64_t event, caddr_t arg) 3568 { 3569 ldc_status_t ldc_state; 3570 int rv = 0; 3571 vdc_server_t *srvr = (vdc_server_t *)(void *)arg; 3572 vdc_t *vdc = srvr->vdcp; 3573 3574 ASSERT(vdc != NULL); 3575 3576 DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num); 3577 3578 /* If callback is not for the current server, ignore it */ 3579 mutex_enter(&vdc->lock); 3580 3581 if (vdc->curr_server != srvr) { 3582 DMSG(vdc, 0, "[%d] Ignoring event 0x%lx for port@%ld\n", 3583 vdc->instance, event, srvr->id); 3584 mutex_exit(&vdc->lock); 3585 return (LDC_SUCCESS); 3586 } 3587 3588 /* 3589 * Depending on the type of event that triggered this callback, 3590 * we modify the handshake state or read the data. 3591 * 3592 * NOTE: not done as a switch() as event could be triggered by 3593 * a state change and a read request. Also the ordering of the 3594 * check for the event types is deliberate. 
3595 */ 3596 if (event & LDC_EVT_UP) { 3597 DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance); 3598 3599 /* get LDC state */ 3600 rv = ldc_status(srvr->ldc_handle, &ldc_state); 3601 if (rv != 0) { 3602 DMSG(vdc, 0, "[%d] Couldn't get LDC status %d", 3603 vdc->instance, rv); 3604 mutex_exit(&vdc->lock); 3605 return (LDC_SUCCESS); 3606 } 3607 if (srvr->ldc_state != LDC_UP && 3608 ldc_state == LDC_UP) { 3609 /* 3610 * Reset the transaction sequence numbers when 3611 * LDC comes up. We then kick off the handshake 3612 * negotiation with the vDisk server. 3613 */ 3614 vdc->seq_num = 1; 3615 vdc->seq_num_reply = 0; 3616 srvr->ldc_state = ldc_state; 3617 cv_signal(&vdc->initwait_cv); 3618 } 3619 } 3620 3621 if (event & LDC_EVT_READ) { 3622 DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance); 3623 mutex_enter(&vdc->read_lock); 3624 cv_signal(&vdc->read_cv); 3625 vdc->read_state = VDC_READ_PENDING; 3626 mutex_exit(&vdc->read_lock); 3627 mutex_exit(&vdc->lock); 3628 3629 /* that's all we have to do - no need to handle DOWN/RESET */ 3630 return (LDC_SUCCESS); 3631 } 3632 3633 if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) { 3634 3635 DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance); 3636 3637 /* 3638 * Need to wake up any readers so they will 3639 * detect that a reset has occurred. 
3640 */ 3641 mutex_enter(&vdc->read_lock); 3642 if ((vdc->read_state == VDC_READ_WAITING) || 3643 (vdc->read_state == VDC_READ_RESET)) 3644 cv_signal(&vdc->read_cv); 3645 vdc->read_state = VDC_READ_RESET; 3646 mutex_exit(&vdc->read_lock); 3647 3648 /* wake up any threads waiting for connection to come up */ 3649 if (vdc->state == VDC_STATE_INIT_WAITING) { 3650 vdc->state = VDC_STATE_RESETTING; 3651 cv_signal(&vdc->initwait_cv); 3652 } 3653 3654 } 3655 3656 mutex_exit(&vdc->lock); 3657 3658 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) 3659 DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received", 3660 vdc->instance, event); 3661 3662 return (LDC_SUCCESS); 3663 } 3664 3665 /* 3666 * Function: 3667 * vdc_wait_for_response() 3668 * 3669 * Description: 3670 * Block waiting for a response from the server. If there is 3671 * no data the thread block on the read_cv that is signalled 3672 * by the callback when an EVT_READ occurs. 3673 * 3674 * Arguments: 3675 * vdcp - soft state pointer for this instance of the device driver. 3676 * 3677 * Return Code: 3678 * 0 - Success 3679 */ 3680 static int 3681 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp) 3682 { 3683 size_t nbytes = sizeof (*msgp); 3684 int status; 3685 3686 ASSERT(vdcp != NULL); 3687 3688 DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance); 3689 3690 status = vdc_recv(vdcp, msgp, &nbytes); 3691 DMSG(vdcp, 3, "vdc_read() done.. 
status=0x%x size=0x%x\n", 3692 status, (int)nbytes); 3693 if (status) { 3694 DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n", 3695 vdcp->instance, status); 3696 return (status); 3697 } 3698 3699 if (nbytes < sizeof (vio_msg_tag_t)) { 3700 DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n", 3701 vdcp->instance, sizeof (vio_msg_tag_t), nbytes); 3702 return (ENOMSG); 3703 } 3704 3705 DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance, 3706 msgp->tag.vio_msgtype, 3707 msgp->tag.vio_subtype, 3708 msgp->tag.vio_subtype_env); 3709 3710 /* 3711 * Verify the Session ID of the message 3712 * 3713 * Every message after the Version has been negotiated should 3714 * have the correct session ID set. 3715 */ 3716 if ((msgp->tag.vio_sid != vdcp->session_id) && 3717 (msgp->tag.vio_subtype_env != VIO_VER_INFO)) { 3718 DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, " 3719 "expected 0x%lx [seq num %lx @ %d]", 3720 vdcp->instance, msgp->tag.vio_sid, 3721 vdcp->session_id, 3722 ((vio_dring_msg_t *)msgp)->seq_num, 3723 ((vio_dring_msg_t *)msgp)->start_idx); 3724 return (ENOMSG); 3725 } 3726 return (0); 3727 } 3728 3729 3730 /* 3731 * Function: 3732 * vdc_resubmit_backup_dring() 3733 * 3734 * Description: 3735 * Resubmit each descriptor in the backed up dring to 3736 * vDisk server. The Dring was backed up during connection 3737 * reset. 3738 * 3739 * Arguments: 3740 * vdcp - soft state pointer for this instance of the device driver. 
3741 * 3742 * Return Code: 3743 * 0 - Success 3744 */ 3745 static int 3746 vdc_resubmit_backup_dring(vdc_t *vdcp) 3747 { 3748 int processed = 0; 3749 int count; 3750 int b_idx; 3751 int rv = 0; 3752 int dring_size; 3753 int op; 3754 vio_msg_t vio_msg; 3755 vdc_local_desc_t *curr_ldep; 3756 3757 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3758 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3759 3760 if (vdcp->local_dring_backup == NULL) { 3761 /* the pending requests have already been processed */ 3762 return (0); 3763 } 3764 3765 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3766 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3767 3768 /* 3769 * Walk the backup copy of the local descriptor ring and 3770 * resubmit all the outstanding transactions. 3771 */ 3772 b_idx = vdcp->local_dring_backup_tail; 3773 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3774 3775 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3776 3777 /* only resubmit outstanding transactions */ 3778 if (!curr_ldep->is_free) { 3779 /* 3780 * If we are retrying a block read/write operation we 3781 * need to update the I/O statistics to indicate that 3782 * the request is being put back on the waitq to be 3783 * serviced (it will have been taken off after the 3784 * error was reported). 
3785 */ 3786 mutex_enter(&vdcp->lock); 3787 op = curr_ldep->operation; 3788 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3789 DTRACE_IO1(start, buf_t *, curr_ldep->cb_arg); 3790 VD_KSTAT_WAITQ_ENTER(vdcp); 3791 } 3792 3793 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3794 rv = vdc_populate_descriptor(vdcp, op, 3795 curr_ldep->addr, curr_ldep->nbytes, 3796 curr_ldep->slice, curr_ldep->offset, 3797 curr_ldep->cb_type, curr_ldep->cb_arg, 3798 curr_ldep->dir); 3799 3800 if (rv) { 3801 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3802 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3803 VD_KSTAT_WAITQ_EXIT(vdcp); 3804 DTRACE_IO1(done, buf_t *, 3805 curr_ldep->cb_arg); 3806 } 3807 DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", 3808 vdcp->instance, b_idx); 3809 mutex_exit(&vdcp->lock); 3810 goto done; 3811 } 3812 3813 /* 3814 * If this is a block read/write we update the I/O 3815 * statistics kstat to indicate that the request 3816 * has been sent back to the vDisk server and should 3817 * now be put on the run queue. 3818 */ 3819 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3820 DTRACE_PROBE1(send, buf_t *, curr_ldep->cb_arg); 3821 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 3822 } 3823 mutex_exit(&vdcp->lock); 3824 3825 /* Wait for the response message. */ 3826 DMSG(vdcp, 1, "waiting for response to idx=%x\n", 3827 b_idx); 3828 rv = vdc_wait_for_response(vdcp, &vio_msg); 3829 if (rv) { 3830 /* 3831 * If this is a block read/write we update 3832 * the I/O statistics kstat to take it 3833 * off the run queue. 
3834 */ 3835 mutex_enter(&vdcp->lock); 3836 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3837 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3838 VD_KSTAT_RUNQ_EXIT(vdcp); 3839 DTRACE_IO1(done, buf_t *, 3840 curr_ldep->cb_arg); 3841 } 3842 DMSG(vdcp, 1, "[%d] wait_for_response " 3843 "returned err=%d\n", vdcp->instance, 3844 rv); 3845 mutex_exit(&vdcp->lock); 3846 goto done; 3847 } 3848 3849 DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx); 3850 rv = vdc_process_data_msg(vdcp, &vio_msg); 3851 if (rv) { 3852 DMSG(vdcp, 1, "[%d] process_data_msg " 3853 "returned err=%d\n", vdcp->instance, 3854 rv); 3855 goto done; 3856 } 3857 /* 3858 * Mark this entry as free so that we will not resubmit 3859 * this "done" request again, if we were to use the same 3860 * backup_dring again in future. This could happen when 3861 * a reset happens while processing the backup_dring. 3862 */ 3863 curr_ldep->is_free = B_TRUE; 3864 processed++; 3865 } 3866 3867 /* get the next element to submit */ 3868 if (++b_idx >= vdcp->local_dring_backup_len) 3869 b_idx = 0; 3870 } 3871 3872 /* all done - now clear up pending dring copy */ 3873 dring_size = vdcp->local_dring_backup_len * 3874 sizeof (vdcp->local_dring_backup[0]); 3875 3876 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3877 3878 vdcp->local_dring_backup = NULL; 3879 3880 done: 3881 DTRACE_PROBE2(processed, int, processed, vdc_t *, vdcp); 3882 3883 return (rv); 3884 } 3885 3886 /* 3887 * Function: 3888 * vdc_cancel_backup_dring 3889 * 3890 * Description: 3891 * Cancel each descriptor in the backed up dring to vDisk server. 3892 * The Dring was backed up during connection reset. 3893 * 3894 * Arguments: 3895 * vdcp - soft state pointer for this instance of the device driver. 
3896 * 3897 * Return Code: 3898 * None 3899 */ 3900 void 3901 vdc_cancel_backup_dring(vdc_t *vdcp) 3902 { 3903 vdc_local_desc_t *ldep; 3904 struct buf *bufp; 3905 int count; 3906 int b_idx; 3907 int dring_size; 3908 int cancelled = 0; 3909 3910 ASSERT(MUTEX_HELD(&vdcp->lock)); 3911 ASSERT(vdcp->state == VDC_STATE_INIT || 3912 vdcp->state == VDC_STATE_INIT_WAITING || 3913 vdcp->state == VDC_STATE_NEGOTIATE || 3914 vdcp->state == VDC_STATE_RESETTING); 3915 3916 if (vdcp->local_dring_backup == NULL) { 3917 /* the pending requests have already been processed */ 3918 return; 3919 } 3920 3921 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n", 3922 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3923 3924 /* 3925 * Walk the backup copy of the local descriptor ring and 3926 * cancel all the outstanding transactions. 3927 */ 3928 b_idx = vdcp->local_dring_backup_tail; 3929 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3930 3931 ldep = &(vdcp->local_dring_backup[b_idx]); 3932 3933 /* only cancel outstanding transactions */ 3934 if (!ldep->is_free) { 3935 3936 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx); 3937 cancelled++; 3938 3939 /* 3940 * All requests have already been cleared from the 3941 * local descriptor ring and the LDC channel has been 3942 * reset so we will never get any reply for these 3943 * requests. Now we just have to notify threads waiting 3944 * for replies that the request has failed. 
3945 */ 3946 switch (ldep->cb_type) { 3947 case CB_SYNC: 3948 ASSERT(vdcp->sync_op_pending); 3949 vdcp->sync_op_status = EIO; 3950 vdcp->sync_op_pending = B_FALSE; 3951 cv_signal(&vdcp->sync_pending_cv); 3952 break; 3953 3954 case CB_STRATEGY: 3955 bufp = ldep->cb_arg; 3956 ASSERT(bufp != NULL); 3957 bufp->b_resid = bufp->b_bcount; 3958 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 3959 VD_KSTAT_RUNQ_EXIT(vdcp); 3960 DTRACE_IO1(done, buf_t *, bufp); 3961 bioerror(bufp, EIO); 3962 biodone(bufp); 3963 break; 3964 3965 default: 3966 ASSERT(0); 3967 } 3968 3969 } 3970 3971 /* get the next element to cancel */ 3972 if (++b_idx >= vdcp->local_dring_backup_len) 3973 b_idx = 0; 3974 } 3975 3976 /* all done - now clear up pending dring copy */ 3977 dring_size = vdcp->local_dring_backup_len * 3978 sizeof (vdcp->local_dring_backup[0]); 3979 3980 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3981 3982 vdcp->local_dring_backup = NULL; 3983 3984 DTRACE_PROBE2(cancelled, int, cancelled, vdc_t *, vdcp); 3985 } 3986 3987 /* 3988 * Function: 3989 * vdc_connection_timeout 3990 * 3991 * Description: 3992 * This function is invoked if the timeout set to establish the connection 3993 * with vds expires. This will happen if we spend too much time in the 3994 * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. Then we will 3995 * cancel any pending request and mark them as failed. 3996 * 3997 * If the timeout does not expire, it will be cancelled when we reach the 3998 * VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can 3999 * be invoked while we are in the VDC_STATE_HANDLE_PENDING or 4000 * VDC_STATE_RESETTING state in which case we do nothing because the 4001 * timeout is being cancelled. 4002 * 4003 * Arguments: 4004 * arg - argument of the timeout function actually a soft state 4005 * pointer for the instance of the device driver. 
 *
 * Return Code:
 *	None
 */
void
vdc_connection_timeout(void *arg)
{
	vdc_t		*vdcp = (vdc_t *)arg;

	mutex_enter(&vdcp->lock);

	if (vdcp->state == VDC_STATE_HANDLE_PENDING ||
	    vdcp->state == VDC_STATE_DETACH) {
		/*
		 * The connection has just been re-established or
		 * we are detaching.
		 */
		vdcp->ctimeout_reached = B_FALSE;
		mutex_exit(&vdcp->lock);
		return;
	}

	/* remember that the timeout fired */
	vdcp->ctimeout_reached = B_TRUE;

	/* notify requests waiting for sending */
	cv_broadcast(&vdcp->running_cv);

	/* cancel requests waiting for a result */
	vdc_cancel_backup_dring(vdcp);

	mutex_exit(&vdcp->lock);

	cmn_err(CE_NOTE, "[%d] connection to service domain timeout",
	    vdcp->instance);
}

/*
 * Function:
 *	vdc_backup_local_dring()
 *
 * Description:
 *	Backup the current dring in the event of a reset. The Dring
 *	transactions will be resubmitted to the server when the
 *	connection is restored.
 *
 *	The caller must hold vdcp->lock and the instance must be in
 *	VDC_STATE_RESETTING. Nothing is done if a backup already exists
 *	or if the local dring has not been initialized.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	NONE
 */
static void
vdc_backup_local_dring(vdc_t *vdcp)
{
	int dring_size;

	ASSERT(MUTEX_HELD(&vdcp->lock));
	ASSERT(vdcp->state == VDC_STATE_RESETTING);

	/*
	 * If the backup dring is still around, it means
	 * that the last restore did not complete. However,
	 * since we never got back into the running state,
	 * the backup copy we have is still valid.
	 */
	if (vdcp->local_dring_backup != NULL) {
		DMSG(vdcp, 1, "reusing local descriptor ring backup "
		    "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len,
		    vdcp->local_dring_backup_tail);
		return;
	}

	/*
	 * The backup dring can be NULL and the local dring may not be
	 * initialized. This can happen if we had a reset while establishing
	 * a new connection but after the connection has timed out. In that
	 * case the backup dring is NULL because the requests have been
	 * cancelled and the request occurred before the local dring is
	 * initialized.
	 */
	if (!(vdcp->initialized & VDC_DRING_LOCAL))
		return;

	DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, "
	    "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx);

	dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]);

	/* take a full copy of the local dring */
	vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP);
	bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size);

	/* record where the walk of the backup has to start and its length */
	vdcp->local_dring_backup_tail = vdcp->dring_curr_idx;
	vdcp->local_dring_backup_len = vdcp->dring_len;
}

/*
 * Function:
 *	vdc_switch_server()
 *
 * Description:
 *	Make the next server in the server list (wrapping around to the head
 *	of the list) the current server, after bringing the LDC channel of
 *	the current server down. Nothing is done if there is only one server
 *	or if the channel cannot be brought down.
 *
 *	The caller must hold vdcp->lock.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	None
 */
static void
vdc_switch_server(vdc_t *vdcp)
{
	int		rv;
	vdc_server_t	*curr_server, *new_server;

	ASSERT(MUTEX_HELD(&vdcp->lock));

	/* if there is only one server return back */
	if (vdcp->num_servers == 1) {
		return;
	}

	/* Get current and next server */
	curr_server = vdcp->curr_server;
	new_server =
	    (curr_server->next) ? curr_server->next : vdcp->server_list;
	ASSERT(curr_server != new_server);

	/* bring current server's channel down */
	rv = ldc_down(curr_server->ldc_handle);
	if (rv) {
		DMSG(vdcp, 0, "[%d] Cannot bring channel down, port %ld\n",
		    vdcp->instance, curr_server->id);
		return;
	}

	/* switch the server */
	vdcp->curr_server = new_server;

	DMSG(vdcp, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n",
	    vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id);
}

/* -------------------------------------------------------------------------- */

/*
 * The following functions process the incoming messages from vds
 */

/*
 * Function:
 *	vdc_process_msg_thread()
 *
 * Description:
 *
 *	Main VDC message processing thread. Each vDisk instance
 *	consists of a copy of this thread. This thread triggers
 *	all the handshakes and data exchange with the server. It
 *	also handles all channel resets
 *
 *	The thread loops over vdcp->state with vdcp->lock held (the lock
 *	is dropped around blocking calls):
 *
 *	VDC_STATE_INIT		 - arm the connection timeout if requested,
 *				   give up or detach if required, otherwise
 *				   start the LDC connection.
 *	VDC_STATE_INIT_WAITING	 - wait for the channel to come up.
 *	VDC_STATE_NEGOTIATE	 - version, attribute, dring and RDX
 *				   handshake.
 *	VDC_STATE_HANDLE_PENDING - resubmit the backed up requests.
 *	VDC_STATE_RUNNING	 - receive and process messages until an
 *				   error occurs.
 *	VDC_STATE_RESETTING	 - stop the connection, back up and destroy
 *				   the dring, go back to VDC_STATE_INIT.
 *	VDC_STATE_DETACH	 - cancel the timeout, wake up waiters and
 *				   exit the thread.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	None
 */
static void
vdc_process_msg_thread(vdc_t *vdcp)
{
	int	status;
	int	ctimeout;
	timeout_id_t tmid = 0;		/* id of the armed connection timeout */
	clock_t ldcup_timeout = 0;

	mutex_enter(&vdcp->lock);

	for (;;) {

/* expands to a chain of ?: that maps the current state to its name */
#define	Q(_s)	(vdcp->state == _s) ? #_s :
		DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state,
		    Q(VDC_STATE_INIT)
		    Q(VDC_STATE_INIT_WAITING)
		    Q(VDC_STATE_NEGOTIATE)
		    Q(VDC_STATE_HANDLE_PENDING)
		    Q(VDC_STATE_RUNNING)
		    Q(VDC_STATE_RESETTING)
		    Q(VDC_STATE_DETACH)
		    "UNKNOWN");

		switch (vdcp->state) {
		case VDC_STATE_INIT:

			/*
			 * If requested, start a timeout to check if the
			 * connection with vds is established in the
			 * specified delay. If the timeout expires, we
			 * will cancel any pending request.
			 *
			 * If some reset have occurred while establishing
			 * the connection, we already have a timeout armed
			 * and in that case we don't need to arm a new one.
			 *
			 * The same rule applies when there are multiple vds'.
			 * If either a connection cannot be established or
			 * the handshake times out, the connection thread will
			 * try another server. The 'ctimeout' will report
			 * back an error after it expires irrespective of
			 * whether the vdisk is trying to connect to just
			 * one or multiple servers.
			 */
			/* the global vdc_timeout overrides the server value */
			ctimeout = (vdc_timeout != 0)?
			    vdc_timeout : vdcp->curr_server->ctimeout;

			if (ctimeout != 0 && tmid == 0) {
				tmid = timeout(vdc_connection_timeout, vdcp,
				    ctimeout * drv_usectohz(MICROSEC));
			}

			/* Check if we are re-initializing repeatedly */
			if (vdcp->hshake_cnt > vdc_hshake_retries &&
			    vdcp->lifecycle != VDC_LC_ONLINE) {

				DMSG(vdcp, 0, "[%d] too many handshakes,cnt=%d",
				    vdcp->instance, vdcp->hshake_cnt);
				cmn_err(CE_NOTE, "[%d] disk access failed.\n",
				    vdcp->instance);
				vdcp->state = VDC_STATE_DETACH;
				break;
			}

			/* Switch to STATE_DETACH if drv is detaching */
			if (vdcp->lifecycle == VDC_LC_DETACHING) {
				vdcp->state = VDC_STATE_DETACH;
				break;
			}

			/* Switch server */
			if (vdcp->hshake_cnt > 0)
				vdc_switch_server(vdcp);
			vdcp->hshake_cnt++;

			/* Bring up connection with vds via LDC */
			status = vdc_start_ldc_connection(vdcp);
			/* on EINVAL stay in VDC_STATE_INIT and try again */
			if (status != EINVAL) {
				vdcp->state = VDC_STATE_INIT_WAITING;
			}
			break;

		case VDC_STATE_INIT_WAITING:

			/* if channel is UP, start negotiation */
			if (vdcp->curr_server->ldc_state == LDC_UP) {
				vdcp->state = VDC_STATE_NEGOTIATE;
				break;
			}

			/* check if only one server exists */
			if (vdcp->num_servers == 1) {
				cv_wait(&vdcp->initwait_cv, &vdcp->lock);
			} else {
				/*
				 * wait for LDC_UP, if it times out, switch
				 * to another server.
				 */
				ldcup_timeout = ddi_get_lbolt() +
				    (vdc_ldcup_timeout *
				    drv_usectohz(MICROSEC));
				status = cv_timedwait(&vdcp->initwait_cv,
				    &vdcp->lock, ldcup_timeout);
				if (status == -1 &&
				    vdcp->state == VDC_STATE_INIT_WAITING &&
				    vdcp->curr_server->ldc_state != LDC_UP) {
					/* timed out & still waiting */
					vdcp->state = VDC_STATE_INIT;
					break;
				}
			}

			if (vdcp->state != VDC_STATE_INIT_WAITING) {
				DMSG(vdcp, 0,
				    "state moved to %d out from under us...\n",
				    vdcp->state);
			}
			break;

		case VDC_STATE_NEGOTIATE:
			/*
			 * Run the four handshake phases in order; the first
			 * one that fails jumps to 'reset' below.
			 */
			switch (status = vdc_ver_negotiation(vdcp)) {
			case 0:
				break;
			default:
				DMSG(vdcp, 0, "ver negotiate failed (%d)..\n",
				    status);
				goto reset;
			}

			switch (status = vdc_attr_negotiation(vdcp)) {
			case 0:
				break;
			default:
				DMSG(vdcp, 0, "attr negotiate failed (%d)..\n",
				    status);
				goto reset;
			}

			switch (status = vdc_dring_negotiation(vdcp)) {
			case 0:
				break;
			default:
				DMSG(vdcp, 0, "dring negotiate failed (%d)..\n",
				    status);
				goto reset;
			}

			switch (status = vdc_rdx_exchange(vdcp)) {
			case 0:
				vdcp->state = VDC_STATE_HANDLE_PENDING;
				goto done;
			default:
				DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n",
				    status);
				goto reset;
			}
reset:
			DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n",
			    status);
			vdcp->state = VDC_STATE_RESETTING;
			vdcp->self_reset = B_TRUE;
done:
			/* reached on both success and failure */
			DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n",
			    vdcp->state);
			break;

		case VDC_STATE_HANDLE_PENDING:

			if (vdcp->ctimeout_reached) {
				/*
				 * The connection timeout had been reached so
				 * pending requests have been cancelled. Now
				 * that the connection is back we can reset
				 * the timeout.
				 */
				ASSERT(vdcp->local_dring_backup == NULL);
				ASSERT(tmid != 0);
				tmid = 0;
				vdcp->ctimeout_reached = B_FALSE;
				vdcp->state = VDC_STATE_RUNNING;
				DMSG(vdcp, 0, "[%d] connection to service "
				    "domain is up", vdcp->instance);
				break;
			}

			/*
			 * The lock is dropped before cancelling the timeout
			 * and resubmitting: vdc_connection_timeout() takes
			 * vdcp->lock and vdc_resubmit_backup_dring() asserts
			 * that the lock is not held.
			 */
			mutex_exit(&vdcp->lock);
			if (tmid != 0) {
				(void) untimeout(tmid);
				tmid = 0;
			}
			status = vdc_resubmit_backup_dring(vdcp);
			mutex_enter(&vdcp->lock);

			if (status)
				vdcp->state = VDC_STATE_RESETTING;
			else
				vdcp->state = VDC_STATE_RUNNING;

			break;

		/* enter running state */
		case VDC_STATE_RUNNING:
			/*
			 * Signal anyone waiting for the connection
			 * to come on line.
			 */
			vdcp->hshake_cnt = 0;
			cv_broadcast(&vdcp->running_cv);

			/* failfast has to be checked after reset */
			cv_signal(&vdcp->failfast_cv);

			/* ownership is lost during reset */
			if (vdcp->ownership & VDC_OWNERSHIP_WANTED)
				vdcp->ownership |= VDC_OWNERSHIP_RESET;
			cv_signal(&vdcp->ownership_cv);

			cmn_err(CE_CONT, "?vdisk@%d is online using "
			    "ldc@%ld,%ld\n", vdcp->instance,
			    vdcp->curr_server->ldc_id, vdcp->curr_server->id);

			mutex_exit(&vdcp->lock);

			/*
			 * Receive and process messages until either step
			 * reports an error.
			 */
			for (;;) {
				vio_msg_t msg;
				status = vdc_wait_for_response(vdcp, &msg);
				if (status) break;

				DMSG(vdcp, 1, "[%d] new pkt(s) available\n",
				    vdcp->instance);
				status = vdc_process_data_msg(vdcp, &msg);
				if (status) {
					DMSG(vdcp, 1, "[%d] process_data_msg "
					    "returned err=%d\n", vdcp->instance,
					    status);
					break;
				}

			}

			mutex_enter(&vdcp->lock);

			cmn_err(CE_CONT, "?vdisk@%d is offline\n",
			    vdcp->instance);

			/* any error in the running state leads to a reset */
			vdcp->state = VDC_STATE_RESETTING;
			vdcp->self_reset = B_TRUE;
			break;

		case VDC_STATE_RESETTING:
			/*
			 * When we reach this state, we either come from the
			 * VDC_STATE_RUNNING state and we can have pending
			 * request but no timeout is armed; or we come from
			 * the VDC_STATE_INIT_WAITING, VDC_NEGOTIATE or
			 * VDC_HANDLE_PENDING state and there is no pending
			 * request or pending requests have already been copied
			 * into the backup dring. So we can safely keep the
			 * connection timeout armed while we are in this state.
			 */

			DMSG(vdcp, 0, "Initiating channel reset "
			    "(pending = %d)\n", (int)vdcp->threads_pending);

			/* only stop the connection if we initiated the reset */
			if (vdcp->self_reset) {
				DMSG(vdcp, 0,
				    "[%d] calling stop_ldc_connection.\n",
				    vdcp->instance);
				status = vdc_stop_ldc_connection(vdcp);
				vdcp->self_reset = B_FALSE;
			}

			/*
			 * Wait for all threads currently waiting
			 * for a free dring entry to use.
			 */
			while (vdcp->threads_pending) {
				cv_broadcast(&vdcp->membind_cv);
				cv_broadcast(&vdcp->dring_free_cv);
				mutex_exit(&vdcp->lock);
				/* give the waiters enough time to wake up */
				delay(vdc_hz_min_ldc_delay);
				mutex_enter(&vdcp->lock);
			}

			ASSERT(vdcp->threads_pending == 0);

			/* Sanity check that no thread is receiving */
			ASSERT(vdcp->read_state != VDC_READ_WAITING);

			vdcp->read_state = VDC_READ_IDLE;

			/* save outstanding requests before the dring goes */
			vdc_backup_local_dring(vdcp);

			/* cleanup the old d-ring */
			vdc_destroy_descriptor_ring(vdcp);

			/* go and start again */
			vdcp->state = VDC_STATE_INIT;

			break;

		case VDC_STATE_DETACH:
			DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n",
			    vdcp->instance);

			/* cancel any pending timeout */
			mutex_exit(&vdcp->lock);
			if (tmid != 0) {
				(void) untimeout(tmid);
				tmid = 0;
			}
			mutex_enter(&vdcp->lock);

			/*
			 * Signal anyone waiting for connection
			 * to come online
			 */
			cv_broadcast(&vdcp->running_cv);

			/* keep waking sync waiters until none is pending */
			while (vdcp->sync_op_pending) {
				cv_signal(&vdcp->sync_pending_cv);
				cv_signal(&vdcp->sync_blocked_cv);
				mutex_exit(&vdcp->lock);
				/* give the waiters enough time to wake up */
				delay(vdc_hz_min_ldc_delay);
				mutex_enter(&vdcp->lock);
			}

			mutex_exit(&vdcp->lock);

			DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n",
			    vdcp->instance);
			thread_exit();
			break;
		}
	}
}


/*
 * Function:
 *	vdc_process_data_msg()
 *
 * Description:
 *	This function is called by the message processing thread each time
 *	a message with a msgtype of VIO_TYPE_DATA is received. It will either
 *	be an ACK or NACK from vds[1] which vdc handles as follows.
 *		ACK	- wake up the waiting thread
 *		NACK	- resend any messages necessary
 *
 *	[1] Although the message format allows it, vds should not send a
 *	    VIO_SUBTYPE_INFO message to vdc asking it to read data; if for
 *	    some bizarre reason it does, vdc will reset the connection.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	msg	- the LDC message sent by vds
 *
 * Return Code:
 *	0	- Success.
 *	> 0	- error value returned by LDC
 */
static int
vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg)
{
	int			status = 0;
	vio_dring_msg_t		*dring_msg;
	vdc_local_desc_t	*ldep = NULL;
	int			start, end;
	int			idx;
	int			op;

	dring_msg = (vio_dring_msg_t *)msg;

	ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA);
	ASSERT(vdcp != NULL);

	mutex_enter(&vdcp->lock);

	/*
	 * Check to see if the message has bogus data
	 */
	idx = start = dring_msg->start_idx;
	end = dring_msg->end_idx;
	if ((start >= vdcp->dring_len) ||
	    (end >= vdcp->dring_len) || (end < -1)) {
		/*
		 * Update the I/O statistics to indicate that an error ocurred.
		 * No need to update the wait/run queues as no specific read or
		 * write request is being completed in response to this 'msg'.
4549 */ 4550 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4551 DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n", 4552 vdcp->instance, start, end); 4553 mutex_exit(&vdcp->lock); 4554 return (EINVAL); 4555 } 4556 4557 /* 4558 * Verify that the sequence number is what vdc expects. 4559 */ 4560 switch (vdc_verify_seq_num(vdcp, dring_msg)) { 4561 case VDC_SEQ_NUM_TODO: 4562 break; /* keep processing this message */ 4563 case VDC_SEQ_NUM_SKIP: 4564 mutex_exit(&vdcp->lock); 4565 return (0); 4566 case VDC_SEQ_NUM_INVALID: 4567 /* 4568 * Update the I/O statistics to indicate that an error ocurred. 4569 * No need to update the wait/run queues as no specific read or 4570 * write request is being completed in response to this 'msg'. 4571 */ 4572 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4573 DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance); 4574 mutex_exit(&vdcp->lock); 4575 return (ENXIO); 4576 } 4577 4578 if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) { 4579 /* 4580 * Update the I/O statistics to indicate that an error ocurred. 4581 * 4582 * We need to update the run queue if a read or write request 4583 * is being NACKed - otherwise there will appear to be an 4584 * indefinite outstanding request and statistics reported by 4585 * iostat(1M) will be incorrect. The transaction will be 4586 * resubmitted from the backup DRing following the reset 4587 * and the wait/run queues will be entered again. 4588 */ 4589 ldep = &vdcp->local_dring[idx]; 4590 op = ldep->operation; 4591 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 4592 DTRACE_IO1(done, buf_t *, ldep->cb_arg); 4593 VD_KSTAT_RUNQ_EXIT(vdcp); 4594 } 4595 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4596 VDC_DUMP_DRING_MSG(dring_msg); 4597 DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance); 4598 mutex_exit(&vdcp->lock); 4599 return (EIO); 4600 4601 } else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) { 4602 /* 4603 * Update the I/O statistics to indicate that an error occurred. 
4604 * No need to update the wait/run queues as no specific read or 4605 * write request is being completed in response to this 'msg'. 4606 */ 4607 VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs); 4608 mutex_exit(&vdcp->lock); 4609 return (EPROTO); 4610 } 4611 4612 DMSG(vdcp, 1, ": start %d end %d\n", start, end); 4613 ASSERT(start == end); 4614 4615 ldep = &vdcp->local_dring[idx]; 4616 4617 DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n", 4618 ldep->dep->hdr.dstate, ldep->cb_type); 4619 4620 if (ldep->dep->hdr.dstate == VIO_DESC_DONE) { 4621 struct buf *bufp; 4622 4623 switch (ldep->cb_type) { 4624 case CB_SYNC: 4625 ASSERT(vdcp->sync_op_pending); 4626 4627 status = vdc_depopulate_descriptor(vdcp, idx); 4628 vdcp->sync_op_status = status; 4629 vdcp->sync_op_pending = B_FALSE; 4630 cv_signal(&vdcp->sync_pending_cv); 4631 break; 4632 4633 case CB_STRATEGY: 4634 bufp = ldep->cb_arg; 4635 ASSERT(bufp != NULL); 4636 bufp->b_resid = 4637 bufp->b_bcount - ldep->dep->payload.nbytes; 4638 status = ldep->dep->payload.status; /* Future:ntoh */ 4639 if (status != 0) { 4640 DMSG(vdcp, 1, "strategy status=%d\n", status); 4641 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4642 bioerror(bufp, status); 4643 } 4644 4645 (void) vdc_depopulate_descriptor(vdcp, idx); 4646 4647 DMSG(vdcp, 1, 4648 "strategy complete req=%ld bytes resp=%ld bytes\n", 4649 bufp->b_bcount, ldep->dep->payload.nbytes); 4650 4651 if (status != 0 && vdcp->failfast_interval != 0) { 4652 /* 4653 * The I/O has failed and failfast is enabled. 4654 * We need the failfast thread to check if the 4655 * failure is due to a reservation conflict. 4656 */ 4657 (void) vdc_failfast_io_queue(vdcp, bufp); 4658 } else { 4659 if (status == 0) { 4660 op = (bufp->b_flags & B_READ) ? 
4661 VD_OP_BREAD : VD_OP_BWRITE; 4662 VD_UPDATE_IO_STATS(vdcp, op, 4663 ldep->dep->payload.nbytes); 4664 } 4665 VD_KSTAT_RUNQ_EXIT(vdcp); 4666 DTRACE_IO1(done, buf_t *, bufp); 4667 biodone(bufp); 4668 } 4669 break; 4670 4671 default: 4672 ASSERT(0); 4673 } 4674 } 4675 4676 /* let the arrival signal propogate */ 4677 mutex_exit(&vdcp->lock); 4678 4679 /* probe gives the count of how many entries were processed */ 4680 DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp); 4681 4682 return (0); 4683 } 4684 4685 4686 /* 4687 * Function: 4688 * vdc_handle_ver_msg() 4689 * 4690 * Description: 4691 * 4692 * Arguments: 4693 * vdc - soft state pointer for this instance of the device driver. 4694 * ver_msg - LDC message sent by vDisk server 4695 * 4696 * Return Code: 4697 * 0 - Success 4698 */ 4699 static int 4700 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg) 4701 { 4702 int status = 0; 4703 4704 ASSERT(vdc != NULL); 4705 ASSERT(mutex_owned(&vdc->lock)); 4706 4707 if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) { 4708 return (EPROTO); 4709 } 4710 4711 if (ver_msg->dev_class != VDEV_DISK_SERVER) { 4712 return (EINVAL); 4713 } 4714 4715 switch (ver_msg->tag.vio_subtype) { 4716 case VIO_SUBTYPE_ACK: 4717 /* 4718 * We check to see if the version returned is indeed supported 4719 * (The server may have also adjusted the minor number downwards 4720 * and if so 'ver_msg' will contain the actual version agreed) 4721 */ 4722 if (vdc_is_supported_version(ver_msg)) { 4723 vdc->ver.major = ver_msg->ver_major; 4724 vdc->ver.minor = ver_msg->ver_minor; 4725 ASSERT(vdc->ver.major > 0); 4726 } else { 4727 status = EPROTO; 4728 } 4729 break; 4730 4731 case VIO_SUBTYPE_NACK: 4732 /* 4733 * call vdc_is_supported_version() which will return the next 4734 * supported version (if any) in 'ver_msg' 4735 */ 4736 (void) vdc_is_supported_version(ver_msg); 4737 if (ver_msg->ver_major > 0) { 4738 size_t len = sizeof (*ver_msg); 4739 4740 ASSERT(vdc->ver.major > 0); 4741 4742 /* reset the 
necessary fields and resend */ 4743 ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO; 4744 ver_msg->dev_class = VDEV_DISK; 4745 4746 status = vdc_send(vdc, (caddr_t)ver_msg, &len); 4747 DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n", 4748 vdc->instance, status); 4749 if (len != sizeof (*ver_msg)) 4750 status = EBADMSG; 4751 } else { 4752 DMSG(vdc, 0, "[%d] No common version with vDisk server", 4753 vdc->instance); 4754 status = ENOTSUP; 4755 } 4756 4757 break; 4758 case VIO_SUBTYPE_INFO: 4759 /* 4760 * Handle the case where vds starts handshake 4761 * (for now only vdc is the instigator) 4762 */ 4763 status = ENOTSUP; 4764 break; 4765 4766 default: 4767 status = EINVAL; 4768 break; 4769 } 4770 4771 return (status); 4772 } 4773 4774 /* 4775 * Function: 4776 * vdc_handle_attr_msg() 4777 * 4778 * Description: 4779 * 4780 * Arguments: 4781 * vdc - soft state pointer for this instance of the device driver. 4782 * attr_msg - LDC message sent by vDisk server 4783 * 4784 * Return Code: 4785 * 0 - Success 4786 */ 4787 static int 4788 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg) 4789 { 4790 int status = 0; 4791 4792 ASSERT(vdc != NULL); 4793 ASSERT(mutex_owned(&vdc->lock)); 4794 4795 if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) { 4796 return (EPROTO); 4797 } 4798 4799 switch (attr_msg->tag.vio_subtype) { 4800 case VIO_SUBTYPE_ACK: 4801 /* 4802 * We now verify the attributes sent by vds. 4803 */ 4804 if (attr_msg->vdisk_size == 0) { 4805 DMSG(vdc, 0, "[%d] Invalid disk size from vds", 4806 vdc->instance); 4807 status = EINVAL; 4808 break; 4809 } 4810 4811 if (attr_msg->max_xfer_sz == 0) { 4812 DMSG(vdc, 0, "[%d] Invalid transfer size from vds", 4813 vdc->instance); 4814 status = EINVAL; 4815 break; 4816 } 4817 4818 if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) { 4819 DMSG(vdc, 0, "[%d] Unknown disk size from vds", 4820 vdc->instance); 4821 attr_msg->vdisk_size = 0; 4822 } 4823 4824 /* 4825 * If the disk size is already set check that it hasn't changed. 
4826 */ 4827 if ((vdc->vdisk_size != 0) && (attr_msg->vdisk_size != 0) && 4828 (vdc->vdisk_size != attr_msg->vdisk_size)) { 4829 DMSG(vdc, 0, "[%d] Different disk size from vds " 4830 "(old=0x%lx - new=0x%lx", vdc->instance, 4831 vdc->vdisk_size, attr_msg->vdisk_size) 4832 status = EINVAL; 4833 break; 4834 } 4835 4836 vdc->vdisk_size = attr_msg->vdisk_size; 4837 vdc->vdisk_type = attr_msg->vdisk_type; 4838 vdc->operations = attr_msg->operations; 4839 if (vio_ver_is_supported(vdc->ver, 1, 1)) 4840 vdc->vdisk_media = attr_msg->vdisk_media; 4841 else 4842 vdc->vdisk_media = 0; 4843 4844 DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n", 4845 vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz); 4846 DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n", 4847 vdc->instance, vdc->block_size, 4848 attr_msg->vdisk_block_size); 4849 4850 /* 4851 * We don't know at compile time what the vDisk server will 4852 * think are good values but we apply a large (arbitrary) 4853 * upper bound to prevent memory exhaustion in vdc if it was 4854 * allocating a DRing based of huge values sent by the server. 4855 * We probably will never exceed this except if the message 4856 * was garbage. 4857 */ 4858 if ((attr_msg->max_xfer_sz * attr_msg->vdisk_block_size) <= 4859 (PAGESIZE * DEV_BSIZE)) { 4860 vdc->max_xfer_sz = attr_msg->max_xfer_sz; 4861 vdc->block_size = attr_msg->vdisk_block_size; 4862 } else { 4863 DMSG(vdc, 0, "[%d] vds block transfer size too big;" 4864 " using max supported by vdc", vdc->instance); 4865 } 4866 4867 if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) || 4868 (attr_msg->vdisk_size > INT64_MAX) || 4869 (attr_msg->operations == 0) || 4870 (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) { 4871 DMSG(vdc, 0, "[%d] Invalid attributes from vds", 4872 vdc->instance); 4873 status = EINVAL; 4874 break; 4875 } 4876 4877 /* 4878 * Now that we have received all attributes we can create a 4879 * fake geometry for the disk. 
4880 */ 4881 vdc_create_fake_geometry(vdc); 4882 break; 4883 4884 case VIO_SUBTYPE_NACK: 4885 /* 4886 * vds could not handle the attributes we sent so we 4887 * stop negotiating. 4888 */ 4889 status = EPROTO; 4890 break; 4891 4892 case VIO_SUBTYPE_INFO: 4893 /* 4894 * Handle the case where vds starts the handshake 4895 * (for now; vdc is the only supported instigatior) 4896 */ 4897 status = ENOTSUP; 4898 break; 4899 4900 default: 4901 status = ENOTSUP; 4902 break; 4903 } 4904 4905 return (status); 4906 } 4907 4908 /* 4909 * Function: 4910 * vdc_handle_dring_reg_msg() 4911 * 4912 * Description: 4913 * 4914 * Arguments: 4915 * vdc - soft state pointer for this instance of the driver. 4916 * dring_msg - LDC message sent by vDisk server 4917 * 4918 * Return Code: 4919 * 0 - Success 4920 */ 4921 static int 4922 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg) 4923 { 4924 int status = 0; 4925 4926 ASSERT(vdc != NULL); 4927 ASSERT(mutex_owned(&vdc->lock)); 4928 4929 if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) { 4930 return (EPROTO); 4931 } 4932 4933 switch (dring_msg->tag.vio_subtype) { 4934 case VIO_SUBTYPE_ACK: 4935 /* save the received dring_ident */ 4936 vdc->dring_ident = dring_msg->dring_ident; 4937 DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n", 4938 vdc->instance, vdc->dring_ident); 4939 break; 4940 4941 case VIO_SUBTYPE_NACK: 4942 /* 4943 * vds could not handle the DRing info we sent so we 4944 * stop negotiating. 
4945 */ 4946 DMSG(vdc, 0, "[%d] server could not register DRing\n", 4947 vdc->instance); 4948 status = EPROTO; 4949 break; 4950 4951 case VIO_SUBTYPE_INFO: 4952 /* 4953 * Handle the case where vds starts handshake 4954 * (for now only vdc is the instigatior) 4955 */ 4956 status = ENOTSUP; 4957 break; 4958 default: 4959 status = ENOTSUP; 4960 } 4961 4962 return (status); 4963 } 4964 4965 /* 4966 * Function: 4967 * vdc_verify_seq_num() 4968 * 4969 * Description: 4970 * This functions verifies that the sequence number sent back by the vDisk 4971 * server with the latest message is what is expected (i.e. it is greater 4972 * than the last seq num sent by the vDisk server and less than or equal 4973 * to the last seq num generated by vdc). 4974 * 4975 * It then checks the request ID to see if any requests need processing 4976 * in the DRing. 4977 * 4978 * Arguments: 4979 * vdc - soft state pointer for this instance of the driver. 4980 * dring_msg - pointer to the LDC message sent by vds 4981 * 4982 * Return Code: 4983 * VDC_SEQ_NUM_TODO - Message needs to be processed 4984 * VDC_SEQ_NUM_SKIP - Message has already been processed 4985 * VDC_SEQ_NUM_INVALID - The seq numbers are so out of sync, 4986 * vdc cannot deal with them 4987 */ 4988 static int 4989 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg) 4990 { 4991 ASSERT(vdc != NULL); 4992 ASSERT(dring_msg != NULL); 4993 ASSERT(mutex_owned(&vdc->lock)); 4994 4995 /* 4996 * Check to see if the messages were responded to in the correct 4997 * order by vds. 
4998 */ 4999 if ((dring_msg->seq_num <= vdc->seq_num_reply) || 5000 (dring_msg->seq_num > vdc->seq_num)) { 5001 DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: " 5002 "%lu > expected <= %lu (last proc req %lu sent %lu)\n", 5003 vdc->instance, dring_msg->seq_num, 5004 vdc->seq_num_reply, vdc->seq_num, 5005 vdc->req_id_proc, vdc->req_id); 5006 return (VDC_SEQ_NUM_INVALID); 5007 } 5008 vdc->seq_num_reply = dring_msg->seq_num; 5009 5010 if (vdc->req_id_proc < vdc->req_id) 5011 return (VDC_SEQ_NUM_TODO); 5012 else 5013 return (VDC_SEQ_NUM_SKIP); 5014 } 5015 5016 5017 /* 5018 * Function: 5019 * vdc_is_supported_version() 5020 * 5021 * Description: 5022 * This routine checks if the major/minor version numbers specified in 5023 * 'ver_msg' are supported. If not it finds the next version that is 5024 * in the supported version list 'vdc_version[]' and sets the fields in 5025 * 'ver_msg' to those values 5026 * 5027 * Arguments: 5028 * ver_msg - LDC message sent by vDisk server 5029 * 5030 * Return Code: 5031 * B_TRUE - Success 5032 * B_FALSE - Version not supported 5033 */ 5034 static boolean_t 5035 vdc_is_supported_version(vio_ver_msg_t *ver_msg) 5036 { 5037 int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]); 5038 5039 for (int i = 0; i < vdc_num_versions; i++) { 5040 ASSERT(vdc_version[i].major > 0); 5041 ASSERT((i == 0) || 5042 (vdc_version[i].major < vdc_version[i-1].major)); 5043 5044 /* 5045 * If the major versions match, adjust the minor version, if 5046 * necessary, down to the highest value supported by this 5047 * client. 
The server should support all minor versions lower 5048 * than the value it sent 5049 */ 5050 if (ver_msg->ver_major == vdc_version[i].major) { 5051 if (ver_msg->ver_minor > vdc_version[i].minor) { 5052 DMSGX(0, 5053 "Adjusting minor version from %u to %u", 5054 ver_msg->ver_minor, vdc_version[i].minor); 5055 ver_msg->ver_minor = vdc_version[i].minor; 5056 } 5057 return (B_TRUE); 5058 } 5059 5060 /* 5061 * If the message contains a higher major version number, set 5062 * the message's major/minor versions to the current values 5063 * and return false, so this message will get resent with 5064 * these values, and the server will potentially try again 5065 * with the same or a lower version 5066 */ 5067 if (ver_msg->ver_major > vdc_version[i].major) { 5068 ver_msg->ver_major = vdc_version[i].major; 5069 ver_msg->ver_minor = vdc_version[i].minor; 5070 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n", 5071 ver_msg->ver_major, ver_msg->ver_minor); 5072 5073 return (B_FALSE); 5074 } 5075 5076 /* 5077 * Otherwise, the message's major version is less than the 5078 * current major version, so continue the loop to the next 5079 * (lower) supported version 5080 */ 5081 } 5082 5083 /* 5084 * No common version was found; "ground" the version pair in the 5085 * message to terminate negotiation 5086 */ 5087 ver_msg->ver_major = 0; 5088 ver_msg->ver_minor = 0; 5089 5090 return (B_FALSE); 5091 } 5092 /* -------------------------------------------------------------------------- */ 5093 5094 /* 5095 * DKIO(7) support 5096 */ 5097 5098 typedef struct vdc_dk_arg { 5099 struct dk_callback dkc; 5100 int mode; 5101 dev_t dev; 5102 vdc_t *vdc; 5103 } vdc_dk_arg_t; 5104 5105 /* 5106 * Function: 5107 * vdc_dkio_flush_cb() 5108 * 5109 * Description: 5110 * This routine is a callback for DKIOCFLUSHWRITECACHE which can be called 5111 * by kernel code. 5112 * 5113 * Arguments: 5114 * arg - a pointer to a vdc_dk_arg_t structure. 
5115 */ 5116 void 5117 vdc_dkio_flush_cb(void *arg) 5118 { 5119 struct vdc_dk_arg *dk_arg = (struct vdc_dk_arg *)arg; 5120 struct dk_callback *dkc = NULL; 5121 vdc_t *vdc = NULL; 5122 int rv; 5123 5124 if (dk_arg == NULL) { 5125 cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n"); 5126 return; 5127 } 5128 dkc = &dk_arg->dkc; 5129 vdc = dk_arg->vdc; 5130 ASSERT(vdc != NULL); 5131 5132 rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, 5133 VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 5134 if (rv != 0) { 5135 DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n", 5136 vdc->instance, rv, 5137 ddi_model_convert_from(dk_arg->mode & FMODELS)); 5138 } 5139 5140 /* 5141 * Trigger the call back to notify the caller the the ioctl call has 5142 * been completed. 5143 */ 5144 if ((dk_arg->mode & FKIOCTL) && 5145 (dkc != NULL) && 5146 (dkc->dkc_callback != NULL)) { 5147 ASSERT(dkc->dkc_cookie != NULL); 5148 (*dkc->dkc_callback)(dkc->dkc_cookie, rv); 5149 } 5150 5151 /* Indicate that one less DKIO write flush is outstanding */ 5152 mutex_enter(&vdc->lock); 5153 vdc->dkio_flush_pending--; 5154 ASSERT(vdc->dkio_flush_pending >= 0); 5155 mutex_exit(&vdc->lock); 5156 5157 /* free the mem that was allocated when the callback was dispatched */ 5158 kmem_free(arg, sizeof (vdc_dk_arg_t)); 5159 } 5160 5161 /* 5162 * Function: 5163 * vdc_dkio_gapart() 5164 * 5165 * Description: 5166 * This function implements the DKIOCGAPART ioctl. 
5167 * 5168 * Arguments: 5169 * vdc - soft state pointer 5170 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 5171 * flag - ioctl flags 5172 */ 5173 static int 5174 vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag) 5175 { 5176 struct dk_geom *geom; 5177 struct vtoc *vtoc; 5178 union { 5179 struct dk_map map[NDKMAP]; 5180 struct dk_map32 map32[NDKMAP]; 5181 } data; 5182 int i, rv, size; 5183 5184 mutex_enter(&vdc->lock); 5185 5186 if ((rv = vdc_validate_geometry(vdc)) != 0) { 5187 mutex_exit(&vdc->lock); 5188 return (rv); 5189 } 5190 5191 vtoc = vdc->vtoc; 5192 geom = vdc->geom; 5193 5194 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5195 5196 for (i = 0; i < vtoc->v_nparts; i++) { 5197 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 5198 (geom->dkg_nhead * geom->dkg_nsect); 5199 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 5200 } 5201 size = NDKMAP * sizeof (struct dk_map32); 5202 5203 } else { 5204 5205 for (i = 0; i < vtoc->v_nparts; i++) { 5206 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 5207 (geom->dkg_nhead * geom->dkg_nsect); 5208 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 5209 } 5210 size = NDKMAP * sizeof (struct dk_map); 5211 5212 } 5213 5214 mutex_exit(&vdc->lock); 5215 5216 if (ddi_copyout(&data, arg, size, flag) != 0) 5217 return (EFAULT); 5218 5219 return (0); 5220 } 5221 5222 /* 5223 * Function: 5224 * vdc_dkio_partition() 5225 * 5226 * Description: 5227 * This function implements the DKIOCPARTITION ioctl. 
5228 * 5229 * Arguments: 5230 * vdc - soft state pointer 5231 * arg - a pointer to a struct partition64 structure 5232 * flag - ioctl flags 5233 */ 5234 static int 5235 vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag) 5236 { 5237 struct partition64 p64; 5238 efi_gpt_t *gpt; 5239 efi_gpe_t *gpe; 5240 vd_efi_dev_t edev; 5241 uint_t partno; 5242 int rv; 5243 5244 if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) { 5245 return (EFAULT); 5246 } 5247 5248 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 5249 5250 if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) { 5251 return (rv); 5252 } 5253 5254 partno = p64.p_partno; 5255 5256 if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) { 5257 vd_efi_free(&edev, gpt, gpe); 5258 return (ESRCH); 5259 } 5260 5261 bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type, 5262 sizeof (struct uuid)); 5263 p64.p_start = gpe[partno].efi_gpe_StartingLBA; 5264 p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1; 5265 5266 if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) { 5267 vd_efi_free(&edev, gpt, gpe); 5268 return (EFAULT); 5269 } 5270 5271 vd_efi_free(&edev, gpt, gpe); 5272 return (0); 5273 } 5274 5275 /* 5276 * Function: 5277 * vdc_dioctl_rwcmd() 5278 * 5279 * Description: 5280 * This function implements the DIOCTL_RWCMD ioctl. This ioctl is used 5281 * for DKC_DIRECT disks to read or write at an absolute disk offset. 
5282 * 5283 * Arguments: 5284 * dev - device 5285 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 5286 * flag - ioctl flags 5287 */ 5288 static int 5289 vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag) 5290 { 5291 struct dadkio_rwcmd32 rwcmd32; 5292 struct dadkio_rwcmd rwcmd; 5293 struct iovec aiov; 5294 struct uio auio; 5295 int rw, status; 5296 struct buf *buf; 5297 5298 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5299 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 5300 sizeof (struct dadkio_rwcmd32), flag)) { 5301 return (EFAULT); 5302 } 5303 rwcmd.cmd = rwcmd32.cmd; 5304 rwcmd.flags = rwcmd32.flags; 5305 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 5306 rwcmd.buflen = rwcmd32.buflen; 5307 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 5308 } else { 5309 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 5310 sizeof (struct dadkio_rwcmd), flag)) { 5311 return (EFAULT); 5312 } 5313 } 5314 5315 switch (rwcmd.cmd) { 5316 case DADKIO_RWCMD_READ: 5317 rw = B_READ; 5318 break; 5319 case DADKIO_RWCMD_WRITE: 5320 rw = B_WRITE; 5321 break; 5322 default: 5323 return (EINVAL); 5324 } 5325 5326 bzero((caddr_t)&aiov, sizeof (struct iovec)); 5327 aiov.iov_base = rwcmd.bufaddr; 5328 aiov.iov_len = rwcmd.buflen; 5329 5330 bzero((caddr_t)&auio, sizeof (struct uio)); 5331 auio.uio_iov = &aiov; 5332 auio.uio_iovcnt = 1; 5333 auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE; 5334 auio.uio_resid = rwcmd.buflen; 5335 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 5336 5337 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 5338 bioinit(buf); 5339 /* 5340 * We use the private field of buf to specify that this is an 5341 * I/O using an absolute offset. 5342 */ 5343 buf->b_private = (void *)VD_SLICE_NONE; 5344 5345 status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio); 5346 5347 biofini(buf); 5348 kmem_free(buf, sizeof (buf_t)); 5349 5350 return (status); 5351 } 5352 5353 /* 5354 * Allocate a buffer for a VD_OP_SCSICMD operation. 
The size of the allocated 5355 * buffer is returned in alloc_len. 5356 */ 5357 static vd_scsi_t * 5358 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, 5359 int *alloc_len) 5360 { 5361 vd_scsi_t *vd_scsi; 5362 int vd_scsi_len = VD_SCSI_SIZE; 5363 5364 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); 5365 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); 5366 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); 5367 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); 5368 5369 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 5370 5371 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); 5372 5373 vd_scsi->cdb_len = cdb_len; 5374 vd_scsi->sense_len = sense_len; 5375 vd_scsi->datain_len = datain_len; 5376 vd_scsi->dataout_len = dataout_len; 5377 5378 *alloc_len = vd_scsi_len; 5379 5380 return (vd_scsi); 5381 } 5382 5383 /* 5384 * Convert the status of a SCSI command to a Solaris return code. 5385 * 5386 * Arguments: 5387 * vd_scsi - The SCSI operation buffer. 5388 * log_error - indicate if an error message should be logged. 5389 * 5390 * Note that our SCSI error messages are rather primitive for the moment 5391 * and could be improved by decoding some data like the SCSI command and 5392 * the sense key. 5393 * 5394 * Return value: 5395 * 0 - Status is good. 5396 * EACCES - Status reports a reservation conflict. 5397 * ENOTSUP - Status reports a check condition and sense key 5398 * reports an illegal request. 5399 * EIO - Any other status. 
5400 */ 5401 static int 5402 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error) 5403 { 5404 int rv; 5405 char path_str[MAXPATHLEN]; 5406 char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN]; 5407 union scsi_cdb *cdb; 5408 struct scsi_extended_sense *sense; 5409 5410 if (vd_scsi->cmd_status == STATUS_GOOD) 5411 /* no error */ 5412 return (0); 5413 5414 /* when the tunable vdc_scsi_log_error is true we log all errors */ 5415 if (vdc_scsi_log_error) 5416 log_error = B_TRUE; 5417 5418 if (log_error) { 5419 cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x)\n", 5420 ddi_pathname(vdc->dip, path_str), vdc->instance, 5421 GETCMD(VD_SCSI_DATA_CDB(vd_scsi))); 5422 } 5423 5424 /* default returned value */ 5425 rv = EIO; 5426 5427 switch (vd_scsi->cmd_status) { 5428 5429 case STATUS_CHECK: 5430 case STATUS_TERMINATED: 5431 if (log_error) 5432 cmn_err(CE_CONT, "\tCheck Condition Error\n"); 5433 5434 /* check sense buffer */ 5435 if (vd_scsi->sense_len == 0 || 5436 vd_scsi->sense_status != STATUS_GOOD) { 5437 if (log_error) 5438 cmn_err(CE_CONT, "\tNo Sense Data Available\n"); 5439 break; 5440 } 5441 5442 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5443 5444 if (log_error) { 5445 cmn_err(CE_CONT, "\tSense Key: 0x%x\n" 5446 "\tASC: 0x%x, ASCQ: 0x%x\n", 5447 scsi_sense_key((uint8_t *)sense), 5448 scsi_sense_asc((uint8_t *)sense), 5449 scsi_sense_ascq((uint8_t *)sense)); 5450 } 5451 5452 if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST) 5453 rv = ENOTSUP; 5454 break; 5455 5456 case STATUS_BUSY: 5457 if (log_error) 5458 cmn_err(CE_NOTE, "\tDevice Busy\n"); 5459 break; 5460 5461 case STATUS_RESERVATION_CONFLICT: 5462 /* 5463 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then 5464 * reservation conflict could be due to various reasons like 5465 * incorrect keys, not registered or not reserved etc. So, 5466 * we should not panic in that case. 
5467 */ 5468 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5469 if (vdc->failfast_interval != 0 && 5470 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN && 5471 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) { 5472 /* failfast is enabled so we have to panic */ 5473 (void) snprintf(panic_str, sizeof (panic_str), 5474 VDC_RESV_CONFLICT_FMT_STR "%s", 5475 ddi_pathname(vdc->dip, path_str)); 5476 panic(panic_str); 5477 } 5478 if (log_error) 5479 cmn_err(CE_NOTE, "\tReservation Conflict\n"); 5480 rv = EACCES; 5481 break; 5482 5483 case STATUS_QFULL: 5484 if (log_error) 5485 cmn_err(CE_NOTE, "\tQueue Full\n"); 5486 break; 5487 5488 case STATUS_MET: 5489 case STATUS_INTERMEDIATE: 5490 case STATUS_SCSI2: 5491 case STATUS_INTERMEDIATE_MET: 5492 case STATUS_ACA_ACTIVE: 5493 if (log_error) 5494 cmn_err(CE_CONT, 5495 "\tUnexpected SCSI status received: 0x%x\n", 5496 vd_scsi->cmd_status); 5497 break; 5498 5499 default: 5500 if (log_error) 5501 cmn_err(CE_CONT, 5502 "\tInvalid SCSI status received: 0x%x\n", 5503 vd_scsi->cmd_status); 5504 break; 5505 } 5506 5507 return (rv); 5508 } 5509 5510 /* 5511 * Implemented the USCSICMD uscsi(7I) ioctl. This ioctl is converted to 5512 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI 5513 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is 5514 * converted to a VD_OP_RESET operation. 
*
 * Arguments:
 *	vdc	- soft state pointer
 *	arg	- a pointer to a uscsi_cmd or uscsi_cmd32 structure
 *	mode	- ioctl flags (data model and FKIOCTL)
 *
 * Return value:
 *	EFAULT	- a copyin or copyout of the command or its buffers failed
 *	other	- for a reset, the value returned by vdc_do_sync_op();
 *		  otherwise a non-zero value from vdc_do_sync_op() or the
 *		  value vdc_scsi_status() derives from the SCSI status
 */
static int
vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode)
{
	struct uscsi_cmd 	uscsi;
	struct uscsi_cmd32	uscsi32;
	vd_scsi_t 		*vd_scsi;
	int 			vd_scsi_len;
	union scsi_cdb		*cdb;
	struct scsi_extended_sense *sense;
	char 			*datain, *dataout;
	size_t			cdb_len, datain_len, dataout_len, sense_len;
	int 			rv;

	/* copy in the request, converting it if the caller is ILP32 */
	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32),
		    mode) != 0)
			return (EFAULT);
		uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi));
	} else {
		if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd),
		    mode) != 0)
			return (EFAULT);
	}

	/*
	 * a uscsi reset is converted to a VD_OP_RESET operation; nothing has
	 * been allocated yet so we can return directly
	 */
	if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN |
	    USCSI_RESET_ALL)) {
		rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC,
		    (void *)(uint64_t)mode, VIO_both_dir, B_TRUE);
		return (rv);
	}

	/* cdb buffer length */
	cdb_len = uscsi.uscsi_cdblen;

	/* data in and out buffers length; only one direction is used */
	if (uscsi.uscsi_flags & USCSI_READ) {
		datain_len = uscsi.uscsi_buflen;
		dataout_len = 0;
	} else {
		datain_len = 0;
		dataout_len = uscsi.uscsi_buflen;
	}

	/* sense buffer length; sense data is only requested on demand */
	if (uscsi.uscsi_flags & USCSI_RQENABLE)
		sense_len = uscsi.uscsi_rqlen;
	else
		sense_len = 0;

	/*
	 * allocate buffer for the VD_SCSICMD_OP operation; from here on every
	 * exit must go through 'done' so that the buffer is freed
	 */
	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
	    &vd_scsi_len);

	/*
	 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague,
	 * but basically they prevent a SCSI command from being retried in case
	 * of an error.
	 */
	if ((uscsi.uscsi_flags & USCSI_ISOLATE) ||
	    (uscsi.uscsi_flags & USCSI_DIAGNOSE))
		vd_scsi->options |= VD_SCSI_OPT_NORETRY;

	/* set task attribute; USCSI_NOTAG overrides any tag flag */
	if (uscsi.uscsi_flags & USCSI_NOTAG) {
		vd_scsi->task_attribute = 0;
	} else {
		if (uscsi.uscsi_flags & USCSI_HEAD)
			vd_scsi->task_attribute = VD_SCSI_TASK_ACA;
		else if (uscsi.uscsi_flags & USCSI_HTAG)
			vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE;
		else if (uscsi.uscsi_flags & USCSI_OTAG)
			vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED;
		else
			vd_scsi->task_attribute = 0;
	}

	/* set timeout */
	vd_scsi->timeout = uscsi.uscsi_timeout;

	/* copy-in cdb data */
	cdb = VD_SCSI_DATA_CDB(vd_scsi);
	if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) {
		rv = EFAULT;
		goto done;
	}

	/* keep a pointer to the sense buffer */
	sense = VD_SCSI_DATA_SENSE(vd_scsi);

	/* keep a pointer to the data-in buffer */
	datain = (char *)VD_SCSI_DATA_IN(vd_scsi);

	/* copy-in request data to the data-out buffer */
	dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi);
	if (!(uscsi.uscsi_flags & USCSI_READ)) {
		if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len,
		    mode)) {
			rv = EFAULT;
			goto done;
		}
	}

	/* submit the request */
	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);

	/* a failed operation leaves the caller's structure untouched */
	if (rv != 0)
		goto done;

	/* update scsi status */
	uscsi.uscsi_status = vd_scsi->cmd_status;

	/* update sense data */
	if ((uscsi.uscsi_flags & USCSI_RQENABLE) &&
	    (uscsi.uscsi_status == STATUS_CHECK ||
	    uscsi.uscsi_status == STATUS_TERMINATED)) {

		uscsi.uscsi_rqstatus = vd_scsi->sense_status;

		if (uscsi.uscsi_rqstatus == STATUS_GOOD) {
			/* residual = requested length - length in vd_scsi */
			uscsi.uscsi_rqresid = uscsi.uscsi_rqlen -
			    vd_scsi->sense_len;
			if (ddi_copyout(sense, uscsi.uscsi_rqbuf,
			    vd_scsi->sense_len, mode) != 0) {
				rv = EFAULT;
				goto done;
			}
		}
	}

	/* update request data */
	if (uscsi.uscsi_status == STATUS_GOOD) {
		if (uscsi.uscsi_flags & USCSI_READ) {
			uscsi.uscsi_resid = uscsi.uscsi_buflen -
			    vd_scsi->datain_len;
			if (ddi_copyout(datain, uscsi.uscsi_bufaddr,
			    vd_scsi->datain_len, mode) != 0) {
				rv = EFAULT;
				goto done;
			}
		} else {
			uscsi.uscsi_resid = uscsi.uscsi_buflen -
			    vd_scsi->dataout_len;
		}
	}

	/* copy-out result, in the layout of the caller's data model */
	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
		uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32));
		if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32),
		    mode) != 0) {
			rv = EFAULT;
			goto done;
		}
	} else {
		if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd),
		    mode) != 0) {
			rv = EFAULT;
			goto done;
		}
	}

	/*
	 * get the return code from the SCSI command status; errors are logged
	 * unless the caller set USCSI_SILENT
	 */
	rv = vdc_scsi_status(vdc, vd_scsi,
	    !(uscsi.uscsi_flags & USCSI_SILENT));

done:
	/* common exit: release the operation buffer */
	kmem_free(vd_scsi, vd_scsi_len);
	return (rv);
}

/*
 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command.
 *
 * Arguments:
 *	cmd		- SCSI PERSISTENT IN command
 *	len		- length of the SCSI input buffer
 *	vd_scsi_len	- return the length of the allocated buffer
 *
 * Returned Value:
 *	a pointer to the allocated VD_OP_SCSICMD buffer.
5698 */ 5699 static vd_scsi_t * 5700 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len) 5701 { 5702 int cdb_len, sense_len, datain_len, dataout_len; 5703 vd_scsi_t *vd_scsi; 5704 union scsi_cdb *cdb; 5705 5706 cdb_len = CDB_GROUP1; 5707 sense_len = sizeof (struct scsi_extended_sense); 5708 datain_len = len; 5709 dataout_len = 0; 5710 5711 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5712 vd_scsi_len); 5713 5714 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5715 5716 /* set cdb */ 5717 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN; 5718 cdb->cdb_opaque[1] = cmd; 5719 FORMG1COUNT(cdb, datain_len); 5720 5721 vd_scsi->timeout = vdc_scsi_timeout; 5722 5723 return (vd_scsi); 5724 } 5725 5726 /* 5727 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command. 5728 * 5729 * Arguments: 5730 * cmd - SCSI PERSISTENT OUT command 5731 * len - length of the SCSI output buffer 5732 * vd_scsi_len - return the length of the allocated buffer 5733 * 5734 * Returned Code: 5735 * a pointer to the allocated VD_OP_SCSICMD buffer. 5736 */ 5737 static vd_scsi_t * 5738 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len) 5739 { 5740 int cdb_len, sense_len, datain_len, dataout_len; 5741 vd_scsi_t *vd_scsi; 5742 union scsi_cdb *cdb; 5743 5744 cdb_len = CDB_GROUP1; 5745 sense_len = sizeof (struct scsi_extended_sense); 5746 datain_len = 0; 5747 dataout_len = len; 5748 5749 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5750 vd_scsi_len); 5751 5752 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5753 5754 /* set cdb */ 5755 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT; 5756 cdb->cdb_opaque[1] = cmd; 5757 FORMG1COUNT(cdb, dataout_len); 5758 5759 vd_scsi->timeout = vdc_scsi_timeout; 5760 5761 return (vd_scsi); 5762 } 5763 5764 /* 5765 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted 5766 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk 5767 * server with a VD_OP_SCSICMD operation. 
5768 */ 5769 static int 5770 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5771 { 5772 vd_scsi_t *vd_scsi; 5773 mhioc_inkeys_t inkeys; 5774 mhioc_key_list_t klist; 5775 struct mhioc_inkeys32 inkeys32; 5776 struct mhioc_key_list32 klist32; 5777 sd_prin_readkeys_t *scsi_keys; 5778 void *user_keys; 5779 int vd_scsi_len; 5780 int listsize, listlen, rv; 5781 5782 /* copyin arguments */ 5783 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5784 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5785 if (rv != 0) 5786 return (EFAULT); 5787 5788 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5789 sizeof (klist32), mode); 5790 if (rv != 0) 5791 return (EFAULT); 5792 5793 listsize = klist32.listsize; 5794 } else { 5795 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5796 if (rv != 0) 5797 return (EFAULT); 5798 5799 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5800 if (rv != 0) 5801 return (EFAULT); 5802 5803 listsize = klist.listsize; 5804 } 5805 5806 /* build SCSI VD_OP request */ 5807 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5808 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5809 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5810 5811 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5812 5813 /* submit the request */ 5814 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5815 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5816 5817 if (rv != 0) 5818 goto done; 5819 5820 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5821 5822 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5823 inkeys32.generation = scsi_keys->generation; 5824 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5825 if (rv != 0) { 5826 rv = EFAULT; 5827 goto done; 5828 } 5829 5830 klist32.listlen = listlen; 5831 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5832 sizeof (klist32), mode); 5833 if (rv != 0) { 5834 rv = EFAULT; 5835 goto done; 
5836 } 5837 5838 user_keys = (caddr_t)(uintptr_t)klist32.list; 5839 } else { 5840 inkeys.generation = scsi_keys->generation; 5841 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5842 if (rv != 0) { 5843 rv = EFAULT; 5844 goto done; 5845 } 5846 5847 klist.listlen = listlen; 5848 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5849 if (rv != 0) { 5850 rv = EFAULT; 5851 goto done; 5852 } 5853 5854 user_keys = klist.list; 5855 } 5856 5857 /* copy out keys */ 5858 if (listlen > 0 && listsize > 0) { 5859 if (listsize < listlen) 5860 listlen = listsize; 5861 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5862 listlen * MHIOC_RESV_KEY_SIZE, mode); 5863 if (rv != 0) 5864 rv = EFAULT; 5865 } 5866 5867 if (rv == 0) 5868 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5869 5870 done: 5871 kmem_free(vd_scsi, vd_scsi_len); 5872 5873 return (rv); 5874 } 5875 5876 /* 5877 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5878 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5879 * the vdisk server with a VD_OP_SCSICMD operation. 
5880 */ 5881 static int 5882 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5883 { 5884 vd_scsi_t *vd_scsi; 5885 mhioc_inresvs_t inresv; 5886 mhioc_resv_desc_list_t rlist; 5887 struct mhioc_inresvs32 inresv32; 5888 struct mhioc_resv_desc_list32 rlist32; 5889 mhioc_resv_desc_t mhd_resv; 5890 sd_prin_readresv_t *scsi_resv; 5891 sd_readresv_desc_t *resv; 5892 mhioc_resv_desc_t *user_resv; 5893 int vd_scsi_len; 5894 int listsize, listlen, i, rv; 5895 5896 /* copyin arguments */ 5897 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5898 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5899 if (rv != 0) 5900 return (EFAULT); 5901 5902 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5903 sizeof (rlist32), mode); 5904 if (rv != 0) 5905 return (EFAULT); 5906 5907 listsize = rlist32.listsize; 5908 } else { 5909 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5910 if (rv != 0) 5911 return (EFAULT); 5912 5913 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5914 if (rv != 0) 5915 return (EFAULT); 5916 5917 listsize = rlist.listsize; 5918 } 5919 5920 /* build SCSI VD_OP request */ 5921 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5922 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5923 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5924 5925 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5926 5927 /* submit the request */ 5928 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5929 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5930 5931 if (rv != 0) 5932 goto done; 5933 5934 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5935 5936 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5937 inresv32.generation = scsi_resv->generation; 5938 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5939 if (rv != 0) { 5940 rv = EFAULT; 5941 goto done; 5942 } 5943 5944 rlist32.listlen = listlen; 5945 rv = ddi_copyout(&rlist32, 
(caddr_t)(uintptr_t)inresv32.li, 5946 sizeof (rlist32), mode); 5947 if (rv != 0) { 5948 rv = EFAULT; 5949 goto done; 5950 } 5951 5952 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 5953 } else { 5954 inresv.generation = scsi_resv->generation; 5955 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 5956 if (rv != 0) { 5957 rv = EFAULT; 5958 goto done; 5959 } 5960 5961 rlist.listlen = listlen; 5962 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 5963 if (rv != 0) { 5964 rv = EFAULT; 5965 goto done; 5966 } 5967 5968 user_resv = rlist.list; 5969 } 5970 5971 /* copy out reservations */ 5972 if (listsize > 0 && listlen > 0) { 5973 if (listsize < listlen) 5974 listlen = listsize; 5975 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 5976 5977 for (i = 0; i < listlen; i++) { 5978 mhd_resv.type = resv->type; 5979 mhd_resv.scope = resv->scope; 5980 mhd_resv.scope_specific_addr = 5981 BE_32(resv->scope_specific_addr); 5982 bcopy(&resv->resvkey, &mhd_resv.key, 5983 MHIOC_RESV_KEY_SIZE); 5984 5985 rv = ddi_copyout(&mhd_resv, user_resv, 5986 sizeof (mhd_resv), mode); 5987 if (rv != 0) { 5988 rv = EFAULT; 5989 goto done; 5990 } 5991 resv++; 5992 user_resv++; 5993 } 5994 } 5995 5996 if (rv == 0) 5997 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5998 5999 done: 6000 kmem_free(vd_scsi, vd_scsi_len); 6001 return (rv); 6002 } 6003 6004 /* 6005 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 6006 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 6007 * server with a VD_OP_SCSICMD operation. 
6008 */ 6009 static int 6010 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 6011 { 6012 vd_scsi_t *vd_scsi; 6013 sd_prout_t *scsi_prout; 6014 mhioc_register_t mhd_reg; 6015 int vd_scsi_len, rv; 6016 6017 /* copyin arguments */ 6018 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 6019 if (rv != 0) 6020 return (EFAULT); 6021 6022 /* build SCSI VD_OP request */ 6023 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 6024 sizeof (sd_prout_t), &vd_scsi_len); 6025 6026 /* set parameters */ 6027 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6028 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6029 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 6030 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 6031 6032 /* submit the request */ 6033 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6034 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6035 6036 if (rv == 0) 6037 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6038 6039 kmem_free(vd_scsi, vd_scsi_len); 6040 return (rv); 6041 } 6042 6043 /* 6044 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 6045 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 6046 * server with a VD_OP_SCSICMD operation. 
6047 */ 6048 static int 6049 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 6050 { 6051 union scsi_cdb *cdb; 6052 vd_scsi_t *vd_scsi; 6053 sd_prout_t *scsi_prout; 6054 mhioc_resv_desc_t mhd_resv; 6055 int vd_scsi_len, rv; 6056 6057 /* copyin arguments */ 6058 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 6059 if (rv != 0) 6060 return (EFAULT); 6061 6062 /* build SCSI VD_OP request */ 6063 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 6064 sizeof (sd_prout_t), &vd_scsi_len); 6065 6066 /* set parameters */ 6067 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6068 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6069 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6070 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 6071 cdb->cdb_opaque[2] = mhd_resv.type; 6072 6073 /* submit the request */ 6074 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6075 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6076 6077 if (rv == 0) 6078 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6079 6080 kmem_free(vd_scsi, vd_scsi_len); 6081 return (rv); 6082 } 6083 6084 /* 6085 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 6086 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 6087 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
6088 */ 6089 static int 6090 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode) 6091 { 6092 union scsi_cdb *cdb; 6093 vd_scsi_t *vd_scsi; 6094 sd_prout_t *scsi_prout; 6095 mhioc_preemptandabort_t mhd_preempt; 6096 int vd_scsi_len, rv; 6097 6098 /* copyin arguments */ 6099 rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode); 6100 if (rv != 0) 6101 return (EFAULT); 6102 6103 /* build SCSI VD_OP request */ 6104 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT, 6105 sizeof (sd_prout_t), &vd_scsi_len); 6106 6107 /* set parameters */ 6108 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 6109 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6110 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6111 bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key, 6112 MHIOC_RESV_KEY_SIZE); 6113 bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key, 6114 MHIOC_RESV_KEY_SIZE); 6115 scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr; 6116 cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type; 6117 6118 /* submit the request */ 6119 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6120 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6121 6122 if (rv == 0) 6123 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6124 6125 kmem_free(vd_scsi, vd_scsi_len); 6126 return (rv); 6127 } 6128 6129 /* 6130 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl 6131 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY 6132 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation. 
6133 */ 6134 static int 6135 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode) 6136 { 6137 vd_scsi_t *vd_scsi; 6138 sd_prout_t *scsi_prout; 6139 mhioc_registerandignorekey_t mhd_regi; 6140 int vd_scsi_len, rv; 6141 6142 /* copyin arguments */ 6143 rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode); 6144 if (rv != 0) 6145 return (EFAULT); 6146 6147 /* build SCSI VD_OP request */ 6148 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY, 6149 sizeof (sd_prout_t), &vd_scsi_len); 6150 6151 /* set parameters */ 6152 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6153 bcopy(mhd_regi.newkey.key, scsi_prout->service_key, 6154 MHIOC_RESV_KEY_SIZE); 6155 scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl; 6156 6157 /* submit the request */ 6158 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6159 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6160 6161 if (rv == 0) 6162 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6163 6164 kmem_free(vd_scsi, vd_scsi_len); 6165 return (rv); 6166 } 6167 6168 /* 6169 * This function is used by the failfast mechanism to send a SCSI command 6170 * to check for reservation conflict. 6171 */ 6172 static int 6173 vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) 6174 { 6175 int cdb_len, sense_len, vd_scsi_len; 6176 vd_scsi_t *vd_scsi; 6177 union scsi_cdb *cdb; 6178 int rv; 6179 6180 ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1); 6181 6182 if (scmd == SCMD_WRITE_G1) 6183 cdb_len = CDB_GROUP1; 6184 else 6185 cdb_len = CDB_GROUP0; 6186 6187 sense_len = sizeof (struct scsi_extended_sense); 6188 6189 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len); 6190 6191 /* set cdb */ 6192 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6193 cdb->scc_cmd = scmd; 6194 6195 vd_scsi->timeout = vdc_scsi_timeout; 6196 6197 /* 6198 * Submit the request. 
The last argument has to be B_FALSE so that 6199 * vdc_do_sync_op does not loop checking for reservation conflict if 6200 * the operation returns an error. 6201 */ 6202 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6203 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE); 6204 6205 if (rv == 0) 6206 (void) vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6207 6208 kmem_free(vd_scsi, vd_scsi_len); 6209 return (rv); 6210 } 6211 6212 /* 6213 * This function is used by the failfast mechanism to check for reservation 6214 * conflict. It sends some SCSI commands which will fail with a reservation 6215 * conflict error if the system does not have access to the disk and this 6216 * will panic the system. 6217 * 6218 * Returned Code: 6219 * 0 - disk is accessible without reservation conflict error 6220 * != 0 - unable to check if disk is accessible 6221 */ 6222 int 6223 vdc_failfast_check_resv(vdc_t *vdc) 6224 { 6225 int failure = 0; 6226 6227 /* 6228 * Send a TEST UNIT READY command. The command will panic 6229 * the system if it fails with a reservation conflict. 6230 */ 6231 if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0) 6232 failure++; 6233 6234 /* 6235 * With SPC-3 compliant devices TEST UNIT READY will succeed on 6236 * a reserved device, so we also do a WRITE(10) of zero byte in 6237 * order to provoke a Reservation Conflict status on those newer 6238 * devices. 6239 */ 6240 if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0) 6241 failure++; 6242 6243 return (failure); 6244 } 6245 6246 /* 6247 * Add a pending I/O to the failfast I/O queue. An I/O is added to this 6248 * queue when it has failed and failfast is enabled. Then we have to check 6249 * if it has failed because of a reservation conflict in which case we have 6250 * to panic the system. 6251 * 6252 * Async I/O should be queued with their block I/O data transfer structure 6253 * (buf). Sync I/O should be queued with buf = NULL. 
6254 */ 6255 static vdc_io_t * 6256 vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf) 6257 { 6258 vdc_io_t *vio; 6259 6260 ASSERT(MUTEX_HELD(&vdc->lock)); 6261 6262 vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP); 6263 vio->vio_next = vdc->failfast_io_queue; 6264 vio->vio_buf = buf; 6265 vio->vio_qtime = ddi_get_lbolt(); 6266 6267 vdc->failfast_io_queue = vio; 6268 6269 /* notify the failfast thread that a new I/O is queued */ 6270 cv_signal(&vdc->failfast_cv); 6271 6272 return (vio); 6273 } 6274 6275 /* 6276 * Remove and complete I/O in the failfast I/O queue which have been 6277 * added after the indicated deadline. A deadline of 0 means that all 6278 * I/O have to be unqueued and marked as completed. 6279 */ 6280 static void 6281 vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline) 6282 { 6283 vdc_io_t *vio, *vio_tmp; 6284 6285 ASSERT(MUTEX_HELD(&vdc->lock)); 6286 6287 vio_tmp = NULL; 6288 vio = vdc->failfast_io_queue; 6289 6290 if (deadline != 0) { 6291 /* 6292 * Skip any io queued after the deadline. The failfast 6293 * I/O queue is ordered starting with the last I/O added 6294 * to the queue. 6295 */ 6296 while (vio != NULL && vio->vio_qtime > deadline) { 6297 vio_tmp = vio; 6298 vio = vio->vio_next; 6299 } 6300 } 6301 6302 if (vio == NULL) 6303 /* nothing to unqueue */ 6304 return; 6305 6306 /* update the queue */ 6307 if (vio_tmp == NULL) 6308 vdc->failfast_io_queue = NULL; 6309 else 6310 vio_tmp->vio_next = NULL; 6311 6312 /* 6313 * Complete unqueued I/O. Async I/O have a block I/O data transfer 6314 * structure (buf) and they are completed by calling biodone(). Sync 6315 * I/O do not have a buf and they are completed by setting the 6316 * vio_qtime to zero and signaling failfast_io_cv. In that case, the 6317 * thread waiting for the I/O to complete is responsible for freeing 6318 * the vio structure. 
6319 */ 6320 while (vio != NULL) { 6321 vio_tmp = vio->vio_next; 6322 if (vio->vio_buf != NULL) { 6323 VD_KSTAT_RUNQ_EXIT(vdc); 6324 DTRACE_IO1(done, buf_t *, vio->vio_buf); 6325 biodone(vio->vio_buf); 6326 kmem_free(vio, sizeof (vdc_io_t)); 6327 } else { 6328 vio->vio_qtime = 0; 6329 } 6330 vio = vio_tmp; 6331 } 6332 6333 cv_broadcast(&vdc->failfast_io_cv); 6334 } 6335 6336 /* 6337 * Failfast Thread. 6338 * 6339 * While failfast is enabled, the failfast thread sends a TEST UNIT READY 6340 * and a zero size WRITE(10) SCSI commands on a regular basis to check that 6341 * we still have access to the disk. If a command fails with a RESERVATION 6342 * CONFLICT error then the system will immediatly panic. 6343 * 6344 * The failfast thread is also woken up when an I/O has failed. It then check 6345 * the access to the disk to ensure that the I/O failure was not due to a 6346 * reservation conflict. 6347 * 6348 * There is one failfast thread for each virtual disk for which failfast is 6349 * enabled. We could have only one thread sending requests for all disks but 6350 * this would need vdc to send asynchronous requests and to have callbacks to 6351 * process replies. 6352 */ 6353 static void 6354 vdc_failfast_thread(void *arg) 6355 { 6356 int status; 6357 vdc_t *vdc = (vdc_t *)arg; 6358 clock_t timeout, starttime; 6359 6360 mutex_enter(&vdc->lock); 6361 6362 while (vdc->failfast_interval != 0) { 6363 6364 starttime = ddi_get_lbolt(); 6365 6366 mutex_exit(&vdc->lock); 6367 6368 /* check for reservation conflict */ 6369 status = vdc_failfast_check_resv(vdc); 6370 6371 mutex_enter(&vdc->lock); 6372 /* 6373 * We have dropped the lock to send the SCSI command so we have 6374 * to check that failfast is still enabled. 6375 */ 6376 if (vdc->failfast_interval == 0) 6377 break; 6378 6379 /* 6380 * If we have successfully check the disk access and there was 6381 * no reservation conflict then we can complete any I/O queued 6382 * before the last check. 
6383 */ 6384 if (status == 0) 6385 vdc_failfast_io_unqueue(vdc, starttime); 6386 6387 /* proceed again if some I/O are still in the queue */ 6388 if (vdc->failfast_io_queue != NULL) 6389 continue; 6390 6391 timeout = ddi_get_lbolt() + 6392 drv_usectohz(vdc->failfast_interval); 6393 (void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout); 6394 } 6395 6396 /* 6397 * Failfast is being stop so we can complete any queued I/O. 6398 */ 6399 vdc_failfast_io_unqueue(vdc, 0); 6400 vdc->failfast_thread = NULL; 6401 mutex_exit(&vdc->lock); 6402 thread_exit(); 6403 } 6404 6405 /* 6406 * Implement the MHIOCENFAILFAST mhd(7i) ioctl. 6407 */ 6408 static int 6409 vdc_failfast(vdc_t *vdc, caddr_t arg, int mode) 6410 { 6411 unsigned int mh_time; 6412 6413 if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode)) 6414 return (EFAULT); 6415 6416 mutex_enter(&vdc->lock); 6417 if (mh_time != 0 && vdc->failfast_thread == NULL) { 6418 vdc->failfast_thread = thread_create(NULL, 0, 6419 vdc_failfast_thread, vdc, 0, &p0, TS_RUN, 6420 v.v_maxsyspri - 2); 6421 } 6422 6423 vdc->failfast_interval = mh_time * 1000; 6424 cv_signal(&vdc->failfast_cv); 6425 mutex_exit(&vdc->lock); 6426 6427 return (0); 6428 } 6429 6430 /* 6431 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are 6432 * converted to VD_OP_SET_ACCESS operations. 6433 */ 6434 static int 6435 vdc_access_set(vdc_t *vdc, uint64_t flags, int mode) 6436 { 6437 int rv; 6438 6439 /* submit owership command request */ 6440 rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags, 6441 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6442 VIO_both_dir, B_TRUE); 6443 6444 return (rv); 6445 } 6446 6447 /* 6448 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a 6449 * VD_OP_GET_ACCESS operation. 
6450 */ 6451 static int 6452 vdc_access_get(vdc_t *vdc, uint64_t *status, int mode) 6453 { 6454 int rv; 6455 6456 /* submit owership command request */ 6457 rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status, 6458 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6459 VIO_both_dir, B_TRUE); 6460 6461 return (rv); 6462 } 6463 6464 /* 6465 * Disk Ownership Thread. 6466 * 6467 * When we have taken the ownership of a disk, this thread waits to be 6468 * notified when the LDC channel is reset so that it can recover the 6469 * ownership. 6470 * 6471 * Note that the thread handling the LDC reset (vdc_process_msg_thread()) 6472 * can not be used to do the ownership recovery because it has to be 6473 * running to handle the reply message to the ownership operation. 6474 */ 6475 static void 6476 vdc_ownership_thread(void *arg) 6477 { 6478 vdc_t *vdc = (vdc_t *)arg; 6479 clock_t timeout; 6480 uint64_t status; 6481 6482 mutex_enter(&vdc->ownership_lock); 6483 mutex_enter(&vdc->lock); 6484 6485 while (vdc->ownership & VDC_OWNERSHIP_WANTED) { 6486 6487 if ((vdc->ownership & VDC_OWNERSHIP_RESET) || 6488 !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) { 6489 /* 6490 * There was a reset so the ownership has been lost, 6491 * try to recover. We do this without using the preempt 6492 * option so that we don't steal the ownership from 6493 * someone who has preempted us. 
6494 */ 6495 DMSG(vdc, 0, "[%d] Ownership lost, recovering", 6496 vdc->instance); 6497 6498 vdc->ownership &= ~(VDC_OWNERSHIP_RESET | 6499 VDC_OWNERSHIP_GRANTED); 6500 6501 mutex_exit(&vdc->lock); 6502 6503 status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6504 VD_ACCESS_SET_PRESERVE, FKIOCTL); 6505 6506 mutex_enter(&vdc->lock); 6507 6508 if (status == 0) { 6509 DMSG(vdc, 0, "[%d] Ownership recovered", 6510 vdc->instance); 6511 vdc->ownership |= VDC_OWNERSHIP_GRANTED; 6512 } else { 6513 DMSG(vdc, 0, "[%d] Fail to recover ownership", 6514 vdc->instance); 6515 } 6516 6517 } 6518 6519 /* 6520 * If we have the ownership then we just wait for an event 6521 * to happen (LDC reset), otherwise we will retry to recover 6522 * after a delay. 6523 */ 6524 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) 6525 timeout = 0; 6526 else 6527 timeout = ddi_get_lbolt() + 6528 drv_usectohz(vdc_ownership_delay); 6529 6530 /* Release the ownership_lock and wait on the vdc lock */ 6531 mutex_exit(&vdc->ownership_lock); 6532 6533 if (timeout == 0) 6534 (void) cv_wait(&vdc->ownership_cv, &vdc->lock); 6535 else 6536 (void) cv_timedwait(&vdc->ownership_cv, 6537 &vdc->lock, timeout); 6538 6539 mutex_exit(&vdc->lock); 6540 6541 mutex_enter(&vdc->ownership_lock); 6542 mutex_enter(&vdc->lock); 6543 } 6544 6545 vdc->ownership_thread = NULL; 6546 mutex_exit(&vdc->lock); 6547 mutex_exit(&vdc->ownership_lock); 6548 6549 thread_exit(); 6550 } 6551 6552 static void 6553 vdc_ownership_update(vdc_t *vdc, int ownership_flags) 6554 { 6555 ASSERT(MUTEX_HELD(&vdc->ownership_lock)); 6556 6557 mutex_enter(&vdc->lock); 6558 vdc->ownership = ownership_flags; 6559 if ((vdc->ownership & VDC_OWNERSHIP_WANTED) && 6560 vdc->ownership_thread == NULL) { 6561 /* start ownership thread */ 6562 vdc->ownership_thread = thread_create(NULL, 0, 6563 vdc_ownership_thread, vdc, 0, &p0, TS_RUN, 6564 v.v_maxsyspri - 2); 6565 } else { 6566 /* notify the ownership thread */ 6567 cv_signal(&vdc->ownership_cv); 6568 } 6569 
mutex_exit(&vdc->lock); 6570 } 6571 6572 /* 6573 * Get the size and the block size of a virtual disk from the vdisk server. 6574 * We need to use this operation when the vdisk_size attribute was not 6575 * available during the handshake with the vdisk server. 6576 */ 6577 static int 6578 vdc_check_capacity(vdc_t *vdc) 6579 { 6580 int rv = 0; 6581 size_t alloc_len; 6582 vd_capacity_t *vd_cap; 6583 6584 if (vdc->vdisk_size != 0) 6585 return (0); 6586 6587 alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t)); 6588 6589 vd_cap = kmem_zalloc(alloc_len, KM_SLEEP); 6590 6591 rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len, 6592 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE); 6593 6594 if (rv == 0) { 6595 if (vd_cap->vdisk_block_size != vdc->block_size || 6596 vd_cap->vdisk_size == VD_SIZE_UNKNOWN || 6597 vd_cap->vdisk_size == 0) 6598 rv = EINVAL; 6599 else 6600 vdc->vdisk_size = vd_cap->vdisk_size; 6601 } 6602 6603 kmem_free(vd_cap, alloc_len); 6604 return (rv); 6605 } 6606 6607 /* 6608 * This structure is used in the DKIO(7I) array below. 
6609 */ 6610 typedef struct vdc_dk_ioctl { 6611 uint8_t op; /* VD_OP_XXX value */ 6612 int cmd; /* Solaris ioctl operation number */ 6613 size_t nbytes; /* size of structure to be copied */ 6614 6615 /* function to convert between vDisk and Solaris structure formats */ 6616 int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg, 6617 int mode, int dir); 6618 } vdc_dk_ioctl_t; 6619 6620 /* 6621 * Subset of DKIO(7I) operations currently supported 6622 */ 6623 static vdc_dk_ioctl_t dk_ioctl[] = { 6624 {VD_OP_FLUSH, DKIOCFLUSHWRITECACHE, 0, 6625 vdc_null_copy_func}, 6626 {VD_OP_GET_WCE, DKIOCGETWCE, sizeof (int), 6627 vdc_get_wce_convert}, 6628 {VD_OP_SET_WCE, DKIOCSETWCE, sizeof (int), 6629 vdc_set_wce_convert}, 6630 {VD_OP_GET_VTOC, DKIOCGVTOC, sizeof (vd_vtoc_t), 6631 vdc_get_vtoc_convert}, 6632 {VD_OP_SET_VTOC, DKIOCSVTOC, sizeof (vd_vtoc_t), 6633 vdc_set_vtoc_convert}, 6634 {VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t), 6635 vdc_get_geom_convert}, 6636 {VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, sizeof (vd_geom_t), 6637 vdc_get_geom_convert}, 6638 {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t), 6639 vdc_get_geom_convert}, 6640 {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t), 6641 vdc_set_geom_convert}, 6642 {VD_OP_GET_EFI, DKIOCGETEFI, 0, 6643 vdc_get_efi_convert}, 6644 {VD_OP_SET_EFI, DKIOCSETEFI, 0, 6645 vdc_set_efi_convert}, 6646 6647 /* DIOCTL_RWCMD is converted to a read or a write */ 6648 {0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL}, 6649 6650 /* mhd(7I) non-shared multihost disks ioctls */ 6651 {0, MHIOCTKOWN, 0, vdc_null_copy_func}, 6652 {0, MHIOCRELEASE, 0, vdc_null_copy_func}, 6653 {0, MHIOCSTATUS, 0, vdc_null_copy_func}, 6654 {0, MHIOCQRESERVE, 0, vdc_null_copy_func}, 6655 6656 /* mhd(7I) shared multihost disks ioctls */ 6657 {0, MHIOCGRP_INKEYS, 0, vdc_null_copy_func}, 6658 {0, MHIOCGRP_INRESV, 0, vdc_null_copy_func}, 6659 {0, MHIOCGRP_REGISTER, 0, vdc_null_copy_func}, 6660 {0, MHIOCGRP_RESERVE, 0, vdc_null_copy_func}, 6661 {0, 
MHIOCGRP_PREEMPTANDABORT, 0, vdc_null_copy_func}, 6662 {0, MHIOCGRP_REGISTERANDIGNOREKEY, 0, vdc_null_copy_func}, 6663 6664 /* mhd(7I) failfast ioctl */ 6665 {0, MHIOCENFAILFAST, 0, vdc_null_copy_func}, 6666 6667 /* 6668 * These particular ioctls are not sent to the server - vdc fakes up 6669 * the necessary info. 6670 */ 6671 {0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func}, 6672 {0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func}, 6673 {0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func}, 6674 {0, DKIOCPARTITION, 0, vdc_null_copy_func }, 6675 {0, DKIOCGAPART, 0, vdc_null_copy_func }, 6676 {0, DKIOCREMOVABLE, 0, vdc_null_copy_func}, 6677 {0, CDROMREADOFFSET, 0, vdc_null_copy_func} 6678 }; 6679 6680 /* 6681 * This function handles ioctl requests from the vd_efi_alloc_and_read() 6682 * function and forward them to the vdisk. 6683 */ 6684 static int 6685 vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg) 6686 { 6687 vdc_t *vdc = (vdc_t *)vdisk; 6688 dev_t dev; 6689 int rval; 6690 6691 dev = makedevice(ddi_driver_major(vdc->dip), 6692 VD_MAKE_DEV(vdc->instance, 0)); 6693 6694 return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval)); 6695 } 6696 6697 /* 6698 * Function: 6699 * vd_process_ioctl() 6700 * 6701 * Description: 6702 * This routine processes disk specific ioctl calls 6703 * 6704 * Arguments: 6705 * dev - the device number 6706 * cmd - the operation [dkio(7I)] to be processed 6707 * arg - pointer to user provided structure 6708 * (contains data to be set or reference parameter for get) 6709 * mode - bit flag, indicating open settings, 32/64 bit type, etc 6710 * rvalp - pointer to return value for calling process. 
6711 * 6712 * Return Code: 6713 * 0 6714 * EFAULT 6715 * ENXIO 6716 * EIO 6717 * ENOTSUP 6718 */ 6719 static int 6720 vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp) 6721 { 6722 int instance = VDCUNIT(dev); 6723 vdc_t *vdc = NULL; 6724 int rv = -1; 6725 int idx = 0; /* index into dk_ioctl[] */ 6726 size_t len = 0; /* #bytes to send to vds */ 6727 size_t alloc_len = 0; /* #bytes to allocate mem for */ 6728 caddr_t mem_p = NULL; 6729 size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0])); 6730 vdc_dk_ioctl_t *iop; 6731 6732 vdc = ddi_get_soft_state(vdc_state, instance); 6733 if (vdc == NULL) { 6734 cmn_err(CE_NOTE, "![%d] Could not get soft state structure", 6735 instance); 6736 return (ENXIO); 6737 } 6738 6739 DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n", 6740 instance, cmd, dev, ddi_model_convert_from(mode & FMODELS)); 6741 6742 if (rvalp != NULL) { 6743 /* the return value of the ioctl is 0 by default */ 6744 *rvalp = 0; 6745 } 6746 6747 /* 6748 * Validate the ioctl operation to be performed. 6749 * 6750 * If we have looped through the array without finding a match then we 6751 * don't support this ioctl. 
6752 */ 6753 for (idx = 0; idx < nioctls; idx++) { 6754 if (cmd == dk_ioctl[idx].cmd) 6755 break; 6756 } 6757 6758 if (idx >= nioctls) { 6759 DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n", 6760 vdc->instance, cmd); 6761 return (ENOTSUP); 6762 } 6763 6764 iop = &(dk_ioctl[idx]); 6765 6766 if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) { 6767 /* size is not fixed for EFI ioctls, it depends on ioctl arg */ 6768 dk_efi_t dk_efi; 6769 6770 rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode); 6771 if (rv != 0) 6772 return (EFAULT); 6773 6774 len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length; 6775 } else { 6776 len = iop->nbytes; 6777 } 6778 6779 /* check if the ioctl is applicable */ 6780 switch (cmd) { 6781 case CDROMREADOFFSET: 6782 case DKIOCREMOVABLE: 6783 return (ENOTTY); 6784 6785 case USCSICMD: 6786 case MHIOCTKOWN: 6787 case MHIOCSTATUS: 6788 case MHIOCQRESERVE: 6789 case MHIOCRELEASE: 6790 case MHIOCGRP_INKEYS: 6791 case MHIOCGRP_INRESV: 6792 case MHIOCGRP_REGISTER: 6793 case MHIOCGRP_RESERVE: 6794 case MHIOCGRP_PREEMPTANDABORT: 6795 case MHIOCGRP_REGISTERANDIGNOREKEY: 6796 case MHIOCENFAILFAST: 6797 if (vdc->cinfo == NULL) 6798 return (ENXIO); 6799 if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS) 6800 return (ENOTTY); 6801 break; 6802 6803 case DIOCTL_RWCMD: 6804 if (vdc->cinfo == NULL) 6805 return (ENXIO); 6806 if (vdc->cinfo->dki_ctype != DKC_DIRECT) 6807 return (ENOTTY); 6808 break; 6809 6810 case DKIOCINFO: 6811 if (vdc->cinfo == NULL) 6812 return (ENXIO); 6813 break; 6814 6815 case DKIOCGMEDIAINFO: 6816 if (vdc->minfo == NULL) 6817 return (ENXIO); 6818 if (vdc_check_capacity(vdc) != 0) 6819 /* disk capacity is not available */ 6820 return (EIO); 6821 break; 6822 } 6823 6824 /* 6825 * Deal with ioctls which require a processing different than 6826 * converting ioctl arguments and sending a corresponding 6827 * VD operation. 
6828 */ 6829 switch (cmd) { 6830 6831 case USCSICMD: 6832 { 6833 return (vdc_uscsi_cmd(vdc, arg, mode)); 6834 } 6835 6836 case MHIOCTKOWN: 6837 { 6838 mutex_enter(&vdc->ownership_lock); 6839 /* 6840 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership 6841 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset 6842 * while we are processing the ioctl. 6843 */ 6844 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED); 6845 6846 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6847 VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode); 6848 if (rv == 0) { 6849 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED | 6850 VDC_OWNERSHIP_GRANTED); 6851 } else { 6852 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6853 } 6854 mutex_exit(&vdc->ownership_lock); 6855 return (rv); 6856 } 6857 6858 case MHIOCRELEASE: 6859 { 6860 mutex_enter(&vdc->ownership_lock); 6861 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode); 6862 if (rv == 0) { 6863 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6864 } 6865 mutex_exit(&vdc->ownership_lock); 6866 return (rv); 6867 } 6868 6869 case MHIOCSTATUS: 6870 { 6871 uint64_t status; 6872 6873 rv = vdc_access_get(vdc, &status, mode); 6874 if (rv == 0 && rvalp != NULL) 6875 *rvalp = (status & VD_ACCESS_ALLOWED)? 
0 : 1; 6876 return (rv); 6877 } 6878 6879 case MHIOCQRESERVE: 6880 { 6881 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode); 6882 return (rv); 6883 } 6884 6885 case MHIOCGRP_INKEYS: 6886 { 6887 return (vdc_mhd_inkeys(vdc, arg, mode)); 6888 } 6889 6890 case MHIOCGRP_INRESV: 6891 { 6892 return (vdc_mhd_inresv(vdc, arg, mode)); 6893 } 6894 6895 case MHIOCGRP_REGISTER: 6896 { 6897 return (vdc_mhd_register(vdc, arg, mode)); 6898 } 6899 6900 case MHIOCGRP_RESERVE: 6901 { 6902 return (vdc_mhd_reserve(vdc, arg, mode)); 6903 } 6904 6905 case MHIOCGRP_PREEMPTANDABORT: 6906 { 6907 return (vdc_mhd_preemptabort(vdc, arg, mode)); 6908 } 6909 6910 case MHIOCGRP_REGISTERANDIGNOREKEY: 6911 { 6912 return (vdc_mhd_registerignore(vdc, arg, mode)); 6913 } 6914 6915 case MHIOCENFAILFAST: 6916 { 6917 rv = vdc_failfast(vdc, arg, mode); 6918 return (rv); 6919 } 6920 6921 case DIOCTL_RWCMD: 6922 { 6923 return (vdc_dioctl_rwcmd(dev, arg, mode)); 6924 } 6925 6926 case DKIOCGAPART: 6927 { 6928 return (vdc_dkio_gapart(vdc, arg, mode)); 6929 } 6930 6931 case DKIOCPARTITION: 6932 { 6933 return (vdc_dkio_partition(vdc, arg, mode)); 6934 } 6935 6936 case DKIOCINFO: 6937 { 6938 struct dk_cinfo cinfo; 6939 6940 bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo)); 6941 cinfo.dki_partition = VDCPART(dev); 6942 6943 rv = ddi_copyout(&cinfo, (void *)arg, 6944 sizeof (struct dk_cinfo), mode); 6945 if (rv != 0) 6946 return (EFAULT); 6947 6948 return (0); 6949 } 6950 6951 case DKIOCGMEDIAINFO: 6952 { 6953 ASSERT(vdc->vdisk_size != 0); 6954 if (vdc->minfo->dki_capacity == 0) 6955 vdc->minfo->dki_capacity = vdc->vdisk_size; 6956 rv = ddi_copyout(vdc->minfo, (void *)arg, 6957 sizeof (struct dk_minfo), mode); 6958 if (rv != 0) 6959 return (EFAULT); 6960 6961 return (0); 6962 } 6963 6964 case DKIOCFLUSHWRITECACHE: 6965 { 6966 struct dk_callback *dkc = 6967 (struct dk_callback *)(uintptr_t)arg; 6968 vdc_dk_arg_t *dkarg = NULL; 6969 6970 DMSG(vdc, 1, "[%d] Flush W$: mode %x\n", 6971 instance, mode); 6972 
6973 /* 6974 * If arg is NULL, then there is no callback function 6975 * registered and the call operates synchronously; we 6976 * break and continue with the rest of the function and 6977 * wait for vds to return (i.e. after the request to 6978 * vds returns successfully, all writes completed prior 6979 * to the ioctl will have been flushed from the disk 6980 * write cache to persistent media. 6981 * 6982 * If a callback function is registered, we dispatch 6983 * the request on a task queue and return immediately. 6984 * The callback will deal with informing the calling 6985 * thread that the flush request is completed. 6986 */ 6987 if (dkc == NULL) 6988 break; 6989 6990 /* 6991 * the asynchronous callback is only supported if 6992 * invoked from within the kernel 6993 */ 6994 if ((mode & FKIOCTL) == 0) 6995 return (ENOTSUP); 6996 6997 dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP); 6998 6999 dkarg->mode = mode; 7000 dkarg->dev = dev; 7001 bcopy(dkc, &dkarg->dkc, sizeof (*dkc)); 7002 7003 mutex_enter(&vdc->lock); 7004 vdc->dkio_flush_pending++; 7005 dkarg->vdc = vdc; 7006 mutex_exit(&vdc->lock); 7007 7008 /* put the request on a task queue */ 7009 rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb, 7010 (void *)dkarg, DDI_SLEEP); 7011 if (rv == NULL) { 7012 /* clean up if dispatch fails */ 7013 mutex_enter(&vdc->lock); 7014 vdc->dkio_flush_pending--; 7015 mutex_exit(&vdc->lock); 7016 kmem_free(dkarg, sizeof (vdc_dk_arg_t)); 7017 } 7018 7019 return (rv == NULL ? 
ENOMEM : 0); 7020 } 7021 } 7022 7023 /* catch programming error in vdc - should be a VD_OP_XXX ioctl */ 7024 ASSERT(iop->op != 0); 7025 7026 /* check if the vDisk server handles the operation for this vDisk */ 7027 if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) { 7028 DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n", 7029 vdc->instance, iop->op); 7030 return (ENOTSUP); 7031 } 7032 7033 /* LDC requires that the memory being mapped is 8-byte aligned */ 7034 alloc_len = P2ROUNDUP(len, sizeof (uint64_t)); 7035 DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n", 7036 instance, len, alloc_len); 7037 7038 if (alloc_len > 0) 7039 mem_p = kmem_zalloc(alloc_len, KM_SLEEP); 7040 7041 /* 7042 * Call the conversion function for this ioctl which, if necessary, 7043 * converts from the Solaris format to the format ARC'ed 7044 * as part of the vDisk protocol (FWARC 2006/195) 7045 */ 7046 ASSERT(iop->convert != NULL); 7047 rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN); 7048 if (rv != 0) { 7049 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7050 instance, rv, cmd); 7051 if (mem_p != NULL) 7052 kmem_free(mem_p, alloc_len); 7053 return (rv); 7054 } 7055 7056 /* 7057 * send request to vds to service the ioctl. 7058 */ 7059 rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len, 7060 VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode, 7061 VIO_both_dir, B_TRUE); 7062 7063 if (rv != 0) { 7064 /* 7065 * This is not necessarily an error. The ioctl could 7066 * be returning a value such as ENOTTY to indicate 7067 * that the ioctl is not applicable. 7068 */ 7069 DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n", 7070 instance, rv, cmd); 7071 if (mem_p != NULL) 7072 kmem_free(mem_p, alloc_len); 7073 7074 return (rv); 7075 } 7076 7077 /* 7078 * Call the conversion function (if it exists) for this ioctl 7079 * which converts from the format ARC'ed as part of the vDisk 7080 * protocol (FWARC 2006/195) back to a format understood by 7081 * the rest of Solaris. 
7082 */ 7083 rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT); 7084 if (rv != 0) { 7085 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7086 instance, rv, cmd); 7087 if (mem_p != NULL) 7088 kmem_free(mem_p, alloc_len); 7089 return (rv); 7090 } 7091 7092 if (mem_p != NULL) 7093 kmem_free(mem_p, alloc_len); 7094 7095 return (rv); 7096 } 7097 7098 /* 7099 * Function: 7100 * 7101 * Description: 7102 * This is an empty conversion function used by ioctl calls which 7103 * do not need to convert the data being passed in/out to userland 7104 */ 7105 static int 7106 vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir) 7107 { 7108 _NOTE(ARGUNUSED(vdc)) 7109 _NOTE(ARGUNUSED(from)) 7110 _NOTE(ARGUNUSED(to)) 7111 _NOTE(ARGUNUSED(mode)) 7112 _NOTE(ARGUNUSED(dir)) 7113 7114 return (0); 7115 } 7116 7117 static int 7118 vdc_get_wce_convert(vdc_t *vdc, void *from, void *to, 7119 int mode, int dir) 7120 { 7121 _NOTE(ARGUNUSED(vdc)) 7122 7123 if (dir == VD_COPYIN) 7124 return (0); /* nothing to do */ 7125 7126 if (ddi_copyout(from, to, sizeof (int), mode) != 0) 7127 return (EFAULT); 7128 7129 return (0); 7130 } 7131 7132 static int 7133 vdc_set_wce_convert(vdc_t *vdc, void *from, void *to, 7134 int mode, int dir) 7135 { 7136 _NOTE(ARGUNUSED(vdc)) 7137 7138 if (dir == VD_COPYOUT) 7139 return (0); /* nothing to do */ 7140 7141 if (ddi_copyin(from, to, sizeof (int), mode) != 0) 7142 return (EFAULT); 7143 7144 return (0); 7145 } 7146 7147 /* 7148 * Function: 7149 * vdc_get_vtoc_convert() 7150 * 7151 * Description: 7152 * This routine performs the necessary convertions from the DKIOCGVTOC 7153 * Solaris structure to the format defined in FWARC 2006/195. 7154 * 7155 * In the struct vtoc definition, the timestamp field is marked as not 7156 * supported so it is not part of vDisk protocol (FWARC 2006/195). 7157 * However SVM uses that field to check it can write into the VTOC, 7158 * so we fake up the info of that field. 
7159 * 7160 * Arguments: 7161 * vdc - the vDisk client 7162 * from - the buffer containing the data to be copied from 7163 * to - the buffer to be copied to 7164 * mode - flags passed to ioctl() call 7165 * dir - the "direction" of the copy - VD_COPYIN or VD_COPYOUT 7166 * 7167 * Return Code: 7168 * 0 - Success 7169 * ENXIO - incorrect buffer passed in. 7170 * EFAULT - ddi_copyout routine encountered an error. 7171 */ 7172 static int 7173 vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7174 { 7175 int i; 7176 void *tmp_mem = NULL; 7177 void *tmp_memp; 7178 struct vtoc vt; 7179 struct vtoc32 vt32; 7180 int copy_len = 0; 7181 int rv = 0; 7182 7183 if (dir != VD_COPYOUT) 7184 return (0); /* nothing to do */ 7185 7186 if ((from == NULL) || (to == NULL)) 7187 return (ENXIO); 7188 7189 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) 7190 copy_len = sizeof (struct vtoc32); 7191 else 7192 copy_len = sizeof (struct vtoc); 7193 7194 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7195 7196 VD_VTOC2VTOC((vd_vtoc_t *)from, &vt); 7197 7198 /* fake the VTOC timestamp field */ 7199 for (i = 0; i < V_NUMPAR; i++) { 7200 vt.timestamp[i] = vdc->vtoc->timestamp[i]; 7201 } 7202 7203 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7204 /* LINTED E_ASSIGN_NARROW_CONV */ 7205 vtoctovtoc32(vt, vt32); 7206 tmp_memp = &vt32; 7207 } else { 7208 tmp_memp = &vt; 7209 } 7210 rv = ddi_copyout(tmp_memp, to, copy_len, mode); 7211 if (rv != 0) 7212 rv = EFAULT; 7213 7214 kmem_free(tmp_mem, copy_len); 7215 return (rv); 7216 } 7217 7218 /* 7219 * Function: 7220 * vdc_set_vtoc_convert() 7221 * 7222 * Description: 7223 * This routine performs the necessary convertions from the DKIOCSVTOC 7224 * Solaris structure to the format defined in FWARC 2006/195. 
7225 * 7226 * Arguments: 7227 * vdc - the vDisk client 7228 * from - Buffer with data 7229 * to - Buffer where data is to be copied to 7230 * mode - flags passed to ioctl 7231 * dir - direction of copy (in or out) 7232 * 7233 * Return Code: 7234 * 0 - Success 7235 * ENXIO - Invalid buffer passed in 7236 * EFAULT - ddi_copyin of data failed 7237 */ 7238 static int 7239 vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7240 { 7241 _NOTE(ARGUNUSED(vdc)) 7242 7243 void *tmp_mem = NULL, *uvtoc; 7244 struct vtoc vt; 7245 struct vtoc *vtp = &vt; 7246 vd_vtoc_t vtvd; 7247 int copy_len = 0; 7248 int i, rv = 0; 7249 7250 if ((from == NULL) || (to == NULL)) 7251 return (ENXIO); 7252 7253 if (dir == VD_COPYIN) 7254 uvtoc = from; 7255 else 7256 uvtoc = to; 7257 7258 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) 7259 copy_len = sizeof (struct vtoc32); 7260 else 7261 copy_len = sizeof (struct vtoc); 7262 7263 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7264 7265 rv = ddi_copyin(uvtoc, tmp_mem, copy_len, mode); 7266 if (rv != 0) { 7267 kmem_free(tmp_mem, copy_len); 7268 return (EFAULT); 7269 } 7270 7271 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7272 vtoc32tovtoc((*(struct vtoc32 *)tmp_mem), vt); 7273 } else { 7274 vtp = tmp_mem; 7275 } 7276 7277 if (dir == VD_COPYOUT) { 7278 /* 7279 * The disk label may have changed. Revalidate the disk 7280 * geometry. This will also update the device nodes. 7281 */ 7282 vdc_validate(vdc); 7283 7284 /* 7285 * We also need to keep track of the timestamp fields. 
7286 */ 7287 for (i = 0; i < V_NUMPAR; i++) { 7288 vdc->vtoc->timestamp[i] = vtp->timestamp[i]; 7289 } 7290 7291 return (0); 7292 } 7293 7294 VTOC2VD_VTOC(vtp, &vtvd); 7295 bcopy(&vtvd, to, sizeof (vd_vtoc_t)); 7296 kmem_free(tmp_mem, copy_len); 7297 7298 return (0); 7299 } 7300 7301 /* 7302 * Function: 7303 * vdc_get_geom_convert() 7304 * 7305 * Description: 7306 * This routine performs the necessary convertions from the DKIOCGGEOM, 7307 * DKIOCG_PHYSGEOM and DKIOG_VIRTGEOM Solaris structures to the format 7308 * defined in FWARC 2006/195 7309 * 7310 * Arguments: 7311 * vdc - the vDisk client 7312 * from - Buffer with data 7313 * to - Buffer where data is to be copied to 7314 * mode - flags passed to ioctl 7315 * dir - direction of copy (in or out) 7316 * 7317 * Return Code: 7318 * 0 - Success 7319 * ENXIO - Invalid buffer passed in 7320 * EFAULT - ddi_copyout of data failed 7321 */ 7322 static int 7323 vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7324 { 7325 _NOTE(ARGUNUSED(vdc)) 7326 7327 struct dk_geom geom; 7328 int copy_len = sizeof (struct dk_geom); 7329 int rv = 0; 7330 7331 if (dir != VD_COPYOUT) 7332 return (0); /* nothing to do */ 7333 7334 if ((from == NULL) || (to == NULL)) 7335 return (ENXIO); 7336 7337 VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom); 7338 rv = ddi_copyout(&geom, to, copy_len, mode); 7339 if (rv != 0) 7340 rv = EFAULT; 7341 7342 return (rv); 7343 } 7344 7345 /* 7346 * Function: 7347 * vdc_set_geom_convert() 7348 * 7349 * Description: 7350 * This routine performs the necessary convertions from the DKIOCSGEOM 7351 * Solaris structure to the format defined in FWARC 2006/195. 
7352 * 7353 * Arguments: 7354 * vdc - the vDisk client 7355 * from - Buffer with data 7356 * to - Buffer where data is to be copied to 7357 * mode - flags passed to ioctl 7358 * dir - direction of copy (in or out) 7359 * 7360 * Return Code: 7361 * 0 - Success 7362 * ENXIO - Invalid buffer passed in 7363 * EFAULT - ddi_copyin of data failed 7364 */ 7365 static int 7366 vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7367 { 7368 _NOTE(ARGUNUSED(vdc)) 7369 7370 vd_geom_t vdgeom; 7371 void *tmp_mem = NULL; 7372 int copy_len = sizeof (struct dk_geom); 7373 int rv = 0; 7374 7375 if (dir != VD_COPYIN) 7376 return (0); /* nothing to do */ 7377 7378 if ((from == NULL) || (to == NULL)) 7379 return (ENXIO); 7380 7381 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7382 7383 rv = ddi_copyin(from, tmp_mem, copy_len, mode); 7384 if (rv != 0) { 7385 kmem_free(tmp_mem, copy_len); 7386 return (EFAULT); 7387 } 7388 DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom); 7389 bcopy(&vdgeom, to, sizeof (vdgeom)); 7390 kmem_free(tmp_mem, copy_len); 7391 7392 return (0); 7393 } 7394 7395 static int 7396 vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7397 { 7398 _NOTE(ARGUNUSED(vdc)) 7399 7400 vd_efi_t *vd_efi; 7401 dk_efi_t dk_efi; 7402 int rv = 0; 7403 void *uaddr; 7404 7405 if ((from == NULL) || (to == NULL)) 7406 return (ENXIO); 7407 7408 if (dir == VD_COPYIN) { 7409 7410 vd_efi = (vd_efi_t *)to; 7411 7412 rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode); 7413 if (rv != 0) 7414 return (EFAULT); 7415 7416 vd_efi->lba = dk_efi.dki_lba; 7417 vd_efi->length = dk_efi.dki_length; 7418 bzero(vd_efi->data, vd_efi->length); 7419 7420 } else { 7421 7422 rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode); 7423 if (rv != 0) 7424 return (EFAULT); 7425 7426 uaddr = dk_efi.dki_data; 7427 7428 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 7429 7430 VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi); 7431 7432 rv = ddi_copyout(dk_efi.dki_data, 
uaddr, dk_efi.dki_length, 7433 mode); 7434 if (rv != 0) 7435 return (EFAULT); 7436 7437 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7438 } 7439 7440 return (0); 7441 } 7442 7443 static int 7444 vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7445 { 7446 _NOTE(ARGUNUSED(vdc)) 7447 7448 dk_efi_t dk_efi; 7449 void *uaddr; 7450 7451 if (dir == VD_COPYOUT) { 7452 /* 7453 * The disk label may have changed. Revalidate the disk 7454 * geometry. This will also update the device nodes. 7455 */ 7456 vdc_validate(vdc); 7457 return (0); 7458 } 7459 7460 if ((from == NULL) || (to == NULL)) 7461 return (ENXIO); 7462 7463 if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0) 7464 return (EFAULT); 7465 7466 uaddr = dk_efi.dki_data; 7467 7468 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 7469 7470 if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) 7471 return (EFAULT); 7472 7473 DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to); 7474 7475 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7476 7477 return (0); 7478 } 7479 7480 7481 /* -------------------------------------------------------------------------- */ 7482 7483 /* 7484 * Function: 7485 * vdc_create_fake_geometry() 7486 * 7487 * Description: 7488 * This routine fakes up the disk info needed for some DKIO ioctls such 7489 * as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do] 7490 * 7491 * Note: This function must not be called until the vDisk attributes have 7492 * been exchanged as part of the handshake with the vDisk server. 7493 * 7494 * Arguments: 7495 * vdc - soft state pointer for this instance of the device driver. 7496 * 7497 * Return Code: 7498 * none. 
7499 */ 7500 static void 7501 vdc_create_fake_geometry(vdc_t *vdc) 7502 { 7503 ASSERT(vdc != NULL); 7504 ASSERT(vdc->max_xfer_sz != 0); 7505 7506 /* 7507 * DKIOCINFO support 7508 */ 7509 if (vdc->cinfo == NULL) 7510 vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); 7511 7512 (void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME); 7513 (void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME); 7514 /* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */ 7515 vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz; 7516 7517 /* 7518 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD 7519 * operation is supported, otherwise the controller type is DKC_DIRECT. 7520 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the 7521 * controller type is always DKC_DIRECT in that case. 7522 * 7523 * If the virtual disk is backed by a physical CD/DVD device or 7524 * an ISO image, modify the controller type to indicate this 7525 */ 7526 switch (vdc->vdisk_media) { 7527 case VD_MEDIA_CD: 7528 case VD_MEDIA_DVD: 7529 vdc->cinfo->dki_ctype = DKC_CDROM; 7530 break; 7531 case VD_MEDIA_FIXED: 7532 if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD)) 7533 vdc->cinfo->dki_ctype = DKC_SCSI_CCS; 7534 else 7535 vdc->cinfo->dki_ctype = DKC_DIRECT; 7536 break; 7537 default: 7538 /* in the case of v1.0 we default to a fixed disk */ 7539 vdc->cinfo->dki_ctype = DKC_DIRECT; 7540 break; 7541 } 7542 vdc->cinfo->dki_flags = DKI_FMTVOL; 7543 vdc->cinfo->dki_cnum = 0; 7544 vdc->cinfo->dki_addr = 0; 7545 vdc->cinfo->dki_space = 0; 7546 vdc->cinfo->dki_prio = 0; 7547 vdc->cinfo->dki_vec = 0; 7548 vdc->cinfo->dki_unit = vdc->instance; 7549 vdc->cinfo->dki_slave = 0; 7550 /* 7551 * The partition number will be created on the fly depending on the 7552 * actual slice (i.e. minor node) that is used to request the data. 
7553 */ 7554 vdc->cinfo->dki_partition = 0; 7555 7556 /* 7557 * DKIOCGMEDIAINFO support 7558 */ 7559 if (vdc->minfo == NULL) 7560 vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); 7561 7562 if (vio_ver_is_supported(vdc->ver, 1, 1)) { 7563 vdc->minfo->dki_media_type = 7564 VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media); 7565 } else { 7566 vdc->minfo->dki_media_type = DK_FIXED_DISK; 7567 } 7568 7569 vdc->minfo->dki_capacity = vdc->vdisk_size; 7570 vdc->minfo->dki_lbsize = vdc->block_size; 7571 } 7572 7573 static ushort_t 7574 vdc_lbl2cksum(struct dk_label *label) 7575 { 7576 int count; 7577 ushort_t sum, *sp; 7578 7579 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 7580 sp = (ushort_t *)label; 7581 sum = 0; 7582 while (count--) { 7583 sum ^= *sp++; 7584 } 7585 7586 return (sum); 7587 } 7588 7589 /* 7590 * Function: 7591 * vdc_validate_geometry 7592 * 7593 * Description: 7594 * This routine discovers the label and geometry of the disk. It stores 7595 * the disk label and related information in the vdc structure. If it 7596 * fails to validate the geometry or to discover the disk label then 7597 * the label is marked as unknown (VD_DISK_LABEL_UNK). 7598 * 7599 * Arguments: 7600 * vdc - soft state pointer for this instance of the device driver. 7601 * 7602 * Return Code: 7603 * 0 - success. 7604 * EINVAL - unknown disk label. 7605 * ENOTSUP - geometry not applicable (EFI label). 7606 * EIO - error accessing the disk. 
7607 */ 7608 static int 7609 vdc_validate_geometry(vdc_t *vdc) 7610 { 7611 buf_t *buf; /* BREAD requests need to be in a buf_t structure */ 7612 dev_t dev; 7613 int rv, rval; 7614 struct dk_label label; 7615 struct dk_geom geom; 7616 struct vtoc vtoc; 7617 efi_gpt_t *gpt; 7618 efi_gpe_t *gpe; 7619 vd_efi_dev_t edev; 7620 7621 ASSERT(vdc != NULL); 7622 ASSERT(vdc->vtoc != NULL && vdc->geom != NULL); 7623 ASSERT(MUTEX_HELD(&vdc->lock)); 7624 7625 mutex_exit(&vdc->lock); 7626 7627 dev = makedevice(ddi_driver_major(vdc->dip), 7628 VD_MAKE_DEV(vdc->instance, 0)); 7629 7630 rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval); 7631 if (rv == 0) 7632 rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc, 7633 FKIOCTL, &rval); 7634 7635 if (rv == ENOTSUP) { 7636 /* 7637 * If the device does not support VTOC then we try 7638 * to read an EFI label. 7639 * 7640 * We need to know the block size and the disk size to 7641 * be able to read an EFI label. 7642 */ 7643 if (vdc->vdisk_size == 0) { 7644 if ((rv = vdc_check_capacity(vdc)) != 0) { 7645 mutex_enter(&vdc->lock); 7646 vdc_store_label_unk(vdc); 7647 return (rv); 7648 } 7649 } 7650 7651 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 7652 7653 rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe); 7654 7655 if (rv) { 7656 DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)", 7657 vdc->instance, rv); 7658 mutex_enter(&vdc->lock); 7659 vdc_store_label_unk(vdc); 7660 return (EIO); 7661 } 7662 7663 mutex_enter(&vdc->lock); 7664 vdc_store_label_efi(vdc, gpt, gpe); 7665 vd_efi_free(&edev, gpt, gpe); 7666 return (ENOTSUP); 7667 } 7668 7669 if (rv != 0) { 7670 DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)", 7671 vdc->instance, rv); 7672 mutex_enter(&vdc->lock); 7673 vdc_store_label_unk(vdc); 7674 if (rv != EINVAL) 7675 rv = EIO; 7676 return (rv); 7677 } 7678 7679 /* check that geometry and vtoc are valid */ 7680 if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 || 7681 vtoc.v_sanity != VTOC_SANE) { 7682 mutex_enter(&vdc->lock); 
7683 vdc_store_label_unk(vdc); 7684 return (EINVAL); 7685 } 7686 7687 /* 7688 * We have a disk and a valid VTOC. However this does not mean 7689 * that the disk currently have a VTOC label. The returned VTOC may 7690 * be a default VTOC to be used for configuring the disk (this is 7691 * what is done for disk image). So we read the label from the 7692 * beginning of the disk to ensure we really have a VTOC label. 7693 * 7694 * FUTURE: This could be the default way for reading the VTOC 7695 * from the disk as opposed to sending the VD_OP_GET_VTOC 7696 * to the server. This will be the default if vdc is implemented 7697 * ontop of cmlb. 7698 */ 7699 7700 /* 7701 * Single slice disk does not support read using an absolute disk 7702 * offset so we just rely on the DKIOCGVTOC ioctl in that case. 7703 */ 7704 if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) { 7705 mutex_enter(&vdc->lock); 7706 if (vtoc.v_nparts != 1) { 7707 vdc_store_label_unk(vdc); 7708 return (EINVAL); 7709 } 7710 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7711 return (0); 7712 } 7713 7714 if (vtoc.v_nparts != V_NUMPAR) { 7715 mutex_enter(&vdc->lock); 7716 vdc_store_label_unk(vdc); 7717 return (EINVAL); 7718 } 7719 7720 /* 7721 * Read disk label from start of disk 7722 */ 7723 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 7724 bioinit(buf); 7725 buf->b_un.b_addr = (caddr_t)&label; 7726 buf->b_bcount = DK_LABEL_SIZE; 7727 buf->b_flags = B_BUSY | B_READ; 7728 buf->b_dev = cmpdev(dev); 7729 rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label, 7730 DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir); 7731 if (rv) { 7732 DMSG(vdc, 1, "[%d] Failed to read disk block 0\n", 7733 vdc->instance); 7734 } else { 7735 rv = biowait(buf); 7736 biofini(buf); 7737 } 7738 kmem_free(buf, sizeof (buf_t)); 7739 7740 if (rv != 0 || label.dkl_magic != DKL_MAGIC || 7741 label.dkl_cksum != vdc_lbl2cksum(&label)) { 7742 DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n", 7743 vdc->instance); 7744 mutex_enter(&vdc->lock); 
7745 vdc_store_label_unk(vdc); 7746 return (EINVAL); 7747 } 7748 7749 mutex_enter(&vdc->lock); 7750 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7751 return (0); 7752 } 7753 7754 /* 7755 * Function: 7756 * vdc_validate 7757 * 7758 * Description: 7759 * This routine discovers the label of the disk and create the 7760 * appropriate device nodes if the label has changed. 7761 * 7762 * Arguments: 7763 * vdc - soft state pointer for this instance of the device driver. 7764 * 7765 * Return Code: 7766 * none. 7767 */ 7768 static void 7769 vdc_validate(vdc_t *vdc) 7770 { 7771 vd_disk_label_t old_label; 7772 vd_slice_t old_slice[V_NUMPAR]; 7773 int rv; 7774 7775 ASSERT(!MUTEX_HELD(&vdc->lock)); 7776 7777 mutex_enter(&vdc->lock); 7778 7779 /* save the current label and vtoc */ 7780 old_label = vdc->vdisk_label; 7781 bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR); 7782 7783 /* check the geometry */ 7784 (void) vdc_validate_geometry(vdc); 7785 7786 /* if the disk label has changed, update device nodes */ 7787 if (vdc->vdisk_label != old_label) { 7788 7789 if (vdc->vdisk_label == VD_DISK_LABEL_EFI) 7790 rv = vdc_create_device_nodes_efi(vdc); 7791 else 7792 rv = vdc_create_device_nodes_vtoc(vdc); 7793 7794 if (rv != 0) { 7795 DMSG(vdc, 0, "![%d] Failed to update device nodes", 7796 vdc->instance); 7797 } 7798 } 7799 7800 mutex_exit(&vdc->lock); 7801 } 7802 7803 static void 7804 vdc_validate_task(void *arg) 7805 { 7806 vdc_t *vdc = (vdc_t *)arg; 7807 7808 vdc_validate(vdc); 7809 7810 mutex_enter(&vdc->lock); 7811 ASSERT(vdc->validate_pending > 0); 7812 vdc->validate_pending--; 7813 mutex_exit(&vdc->lock); 7814 } 7815 7816 /* 7817 * Function: 7818 * vdc_setup_devid() 7819 * 7820 * Description: 7821 * This routine discovers the devid of a vDisk. It requests the devid of 7822 * the underlying device from the vDisk server, builds an encapsulated 7823 * devid based on the retrieved devid and registers that new devid to 7824 * the vDisk. 
7825 * 7826 * Arguments: 7827 * vdc - soft state pointer for this instance of the device driver. 7828 * 7829 * Return Code: 7830 * 0 - A devid was succesfully registered for the vDisk 7831 */ 7832 static int 7833 vdc_setup_devid(vdc_t *vdc) 7834 { 7835 int rv; 7836 vd_devid_t *vd_devid; 7837 size_t bufsize, bufid_len; 7838 7839 /* 7840 * At first sight, we don't know the size of the devid that the 7841 * server will return but this size will be encoded into the 7842 * reply. So we do a first request using a default size then we 7843 * check if this size was large enough. If not then we do a second 7844 * request with the correct size returned by the server. Note that 7845 * ldc requires size to be 8-byte aligned. 7846 */ 7847 bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN), 7848 sizeof (uint64_t)); 7849 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7850 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7851 7852 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, 7853 bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 7854 7855 DMSG(vdc, 2, "sync_op returned %d\n", rv); 7856 7857 if (rv) { 7858 kmem_free(vd_devid, bufsize); 7859 return (rv); 7860 } 7861 7862 if (vd_devid->length > bufid_len) { 7863 /* 7864 * The returned devid is larger than the buffer used. Try again 7865 * with a buffer with the right size. 
7866 */ 7867 kmem_free(vd_devid, bufsize); 7868 bufsize = P2ROUNDUP(VD_DEVID_SIZE(vd_devid->length), 7869 sizeof (uint64_t)); 7870 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7871 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7872 7873 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, 7874 (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0, 7875 VIO_both_dir, B_TRUE); 7876 7877 if (rv) { 7878 kmem_free(vd_devid, bufsize); 7879 return (rv); 7880 } 7881 } 7882 7883 /* 7884 * The virtual disk should have the same device id as the one associated 7885 * with the physical disk it is mapped on, otherwise sharing a disk 7886 * between a LDom and a non-LDom may not work (for example for a shared 7887 * SVM disk set). 7888 * 7889 * The DDI framework does not allow creating a device id with any 7890 * type so we first create a device id of type DEVID_ENCAP and then 7891 * we restore the orignal type of the physical device. 7892 */ 7893 7894 DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length); 7895 7896 /* build an encapsulated devid based on the returned devid */ 7897 if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length, 7898 vd_devid->id, &vdc->devid) != DDI_SUCCESS) { 7899 DMSG(vdc, 1, "[%d] Fail to created devid\n", vdc->instance); 7900 kmem_free(vd_devid, bufsize); 7901 return (1); 7902 } 7903 7904 DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type); 7905 7906 ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS); 7907 7908 kmem_free(vd_devid, bufsize); 7909 7910 if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) { 7911 DMSG(vdc, 1, "[%d] Fail to register devid\n", vdc->instance); 7912 return (1); 7913 } 7914 7915 return (0); 7916 } 7917 7918 static void 7919 vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe) 7920 { 7921 int i, nparts; 7922 7923 ASSERT(MUTEX_HELD(&vdc->lock)); 7924 7925 vdc->vdisk_label = VD_DISK_LABEL_EFI; 7926 bzero(vdc->vtoc, sizeof (struct vtoc)); 7927 bzero(vdc->geom, sizeof (struct dk_geom)); 7928 bzero(vdc->slice, sizeof 
(vd_slice_t) * V_NUMPAR); 7929 7930 nparts = gpt->efi_gpt_NumberOfPartitionEntries; 7931 7932 for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) { 7933 7934 if (gpe[i].efi_gpe_StartingLBA == 0 || 7935 gpe[i].efi_gpe_EndingLBA == 0) { 7936 continue; 7937 } 7938 7939 vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA; 7940 vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA - 7941 gpe[i].efi_gpe_StartingLBA + 1; 7942 } 7943 7944 ASSERT(vdc->vdisk_size != 0); 7945 vdc->slice[VD_EFI_WD_SLICE].start = 0; 7946 vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size; 7947 7948 } 7949 7950 static void 7951 vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct vtoc *vtoc) 7952 { 7953 int i; 7954 7955 ASSERT(MUTEX_HELD(&vdc->lock)); 7956 ASSERT(vdc->block_size == vtoc->v_sectorsz); 7957 7958 vdc->vdisk_label = VD_DISK_LABEL_VTOC; 7959 bcopy(vtoc, vdc->vtoc, sizeof (struct vtoc)); 7960 bcopy(geom, vdc->geom, sizeof (struct dk_geom)); 7961 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7962 7963 for (i = 0; i < vtoc->v_nparts; i++) { 7964 vdc->slice[i].start = vtoc->v_part[i].p_start; 7965 vdc->slice[i].nblocks = vtoc->v_part[i].p_size; 7966 } 7967 } 7968 7969 static void 7970 vdc_store_label_unk(vdc_t *vdc) 7971 { 7972 ASSERT(MUTEX_HELD(&vdc->lock)); 7973 7974 vdc->vdisk_label = VD_DISK_LABEL_UNK; 7975 bzero(vdc->vtoc, sizeof (struct vtoc)); 7976 bzero(vdc->geom, sizeof (struct dk_geom)); 7977 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7978 } 7979