/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * LDoms virtual disk client (vdc) device driver
 *
 * This driver runs on a guest logical domain and communicates with the virtual
 * disk server (vds) driver running on the service domain which is exporting
 * virtualized "disks" to the guest logical domain.
 *
 * The driver can be divided into four sections:
 *
 * 1) generic device driver housekeeping
 *    _init, _fini, attach, detach, ops structures, etc.
 *
 * 2) communication channel setup
 *    Set up the communications link over the LDC channel that vdc uses to
 *    talk to the vDisk server. Initialise the descriptor ring which
 *    allows the LDC clients to transfer data via memory mappings.
 *
 * 3) Support exported to upper layers (filesystems, etc)
 *    The upper layers call into vdc via strategy(9E) and DKIO(7I)
 *    ioctl calls. vdc will either copy the data to be written into the
 *    descriptor ring, or map into the descriptor ring the buffer that
 *    will hold the data read by the vDisk server. It then sends a
 *    message to the vDisk server requesting it to complete the operation.
 *
 * 4) Handling responses from the vDisk server.
 *    The vDisk server will ACK some or all of the messages vdc sends to it
 *    (this is configured during the handshake). Upon receipt of an ACK
 *    vdc will check the descriptor ring and signal to the upper layer
 *    code waiting on the IO.
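 *
 * As an illustration of how these sections cooperate, consider a single
 * write(2) issued to a vdisk slice: the buf(9S) enters vdc through
 * strategy(9E) (section 3), vdc binds the buffer into a free descriptor
 * ring entry and sends a dring data message over the LDC channel that
 * was set up in section 2, and once the vDisk server ACKs that message
 * the callback handling of section 4 completes the descriptor and calls
 * biodone(9F) on the original buf.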
 */

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kstat.h>
#include <sys/mach_descrip.h>
#include <sys/modctl.h>
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/vtoc.h>
#include <sys/archsystm.h>
#include <sys/sysmacros.h>

#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/scsi/impl/services.h>
#include <sys/scsi/targets/sddef.h>

#include <sys/ldoms.h>
#include <sys/ldc.h>
#include <sys/vio_common.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vdsk_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdc.h>

/*
 * function prototypes
 */

/* standard driver functions */
static int vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
static int vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int vdc_strategy(struct buf *buf);
static int vdc_print(dev_t dev, char *str);
static int vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
static int vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
static int vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
static int vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
    cred_t *credp, int *rvalp);
static int vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
static int vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);

static int vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
    void *arg, void **resultp);
static int vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);

/* setup */
static void vdc_min(struct buf *bufp);
static int vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
static int vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr);
static int vdc_start_ldc_connection(vdc_t *vdc);
static int vdc_create_device_nodes(vdc_t *vdc);
static int vdc_create_device_nodes_efi(vdc_t *vdc);
static int vdc_create_device_nodes_vtoc(vdc_t *vdc);
static int vdc_create_device_nodes_props(vdc_t *vdc);
static void vdc_create_io_kstats(vdc_t *vdc);
static void vdc_create_err_kstats(vdc_t *vdc);
static void vdc_set_err_kstats(vdc_t *vdc);
static int vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
    mde_cookie_t *vd_nodep);
static int vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep);
static void vdc_fini_ports(vdc_t *vdc);
static void vdc_switch_server(vdc_t *vdcp);
static int vdc_do_ldc_up(vdc_t *vdc);
static void vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr);
static int vdc_init_descriptor_ring(vdc_t *vdc);
static void vdc_destroy_descriptor_ring(vdc_t *vdc);
static int vdc_setup_devid(vdc_t *vdc);
static void vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *);
static void vdc_store_label_vtoc(vdc_t *, struct dk_geom *, struct vtoc *);
static void vdc_store_label_unk(vdc_t *vdc);
static boolean_t vdc_is_opened(vdc_t *vdc);

/* handshake with vds */
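/*
 * The handshake with the vDisk server proceeds through the phases below
 * in order: version negotiation (VIO_VER_INFO), attribute exchange
 * (VIO_ATTR_INFO), descriptor ring registration (VIO_DRING_REG) and
 * finally the RDX exchange, after which the connection is ready to
 * carry data transfers.
 */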
static int vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
static int vdc_ver_negotiation(vdc_t *vdcp);
static int vdc_init_attr_negotiation(vdc_t *vdc);
static int vdc_attr_negotiation(vdc_t *vdcp);
static int vdc_init_dring_negotiate(vdc_t *vdc);
static int vdc_dring_negotiation(vdc_t *vdcp);
static int vdc_send_rdx(vdc_t *vdcp);
static int vdc_rdx_exchange(vdc_t *vdcp);
static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg);

/* processing incoming messages from vDisk server */
static void vdc_process_msg_thread(vdc_t *vdc);
static int vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp);

static uint_t vdc_handle_cb(uint64_t event, caddr_t arg);
static int vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg);
static int vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg);
static int vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
static int vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
static int vdc_send_request(vdc_t *vdcp, int operation,
    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
static int vdc_populate_descriptor(vdc_t *vdcp, int operation,
    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
    void *cb_arg, vio_desc_direction_t dir, boolean_t);

static int vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
static int vdc_drain_response(vdc_t *vdcp);
static int vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
static int vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
static int vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);

/* dkio */
static int vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode,
    int *rvalp);
static int vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg);
static void vdc_create_fake_geometry(vdc_t *vdc);
static int vdc_validate_geometry(vdc_t *vdc);
static void vdc_validate(vdc_t *vdc);
static void vdc_validate_task(void *arg);
static int vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
    int mode, int dir);
static int vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir);
static int vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir);
static int vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir);
static int vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir);
static int vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir);
static int vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir);
static int vdc_get_efi_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir);
static int vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
    int mode, int dir);

static void vdc_ownership_update(vdc_t *vdc, int ownership_flags);
static int vdc_access_set(vdc_t *vdc, uint64_t flags, int mode);
static vdc_io_t *vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf);
static int vdc_failfast_check_resv(vdc_t *vdc);

/*
 * Module variables
 */

/*
 * Tunable variables to control how long vdc waits before timing out on
 * various operations
 */
static int vdc_hshake_retries = 3;

static int vdc_timeout = 0;	/* units: seconds */
static int vdc_ldcup_timeout = 1;	/* units: seconds */

static uint64_t vdc_hz_min_ldc_delay;
static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC;
static uint64_t vdc_hz_max_ldc_delay;
static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC;

static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC;
static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC;

/* values for dumping - need to run in a tighter loop */
static uint64_t vdc_usec_timeout_dump = 100 * MILLISEC;	/* 0.1s units: usec */
static int vdc_dump_retries = 100;

static uint16_t vdc_scsi_timeout = 60;	/* 60s units: seconds */

static uint64_t vdc_ownership_delay = 6 * MICROSEC;	/* 6s units: usec */

/* Count of the number of vdc instances attached */
static volatile uint32_t vdc_instance_count = 0;

/* Tunable to log all SCSI errors */
static boolean_t vdc_scsi_log_error = B_FALSE;

/* Soft state pointer */
static void *vdc_state;

/*
 * Controlling the verbosity of the error/debug messages
 *
 * vdc_msglevel - controls level of messages
 * vdc_matchinst - 64-bit variable where each bit corresponds
 * to a vdc instance to which vdc_msglevel applies.
 */
int vdc_msglevel = 0x0;
uint64_t vdc_matchinst = 0ull;

/*
 * Supported vDisk protocol version pairs.
 *
 * The first array entry is the latest and preferred version.
 */
static const vio_ver_t vdc_version[] = {{1, 1}};

static struct cb_ops vdc_cb_ops = {
        vdc_open,       /* cb_open */
        vdc_close,      /* cb_close */
        vdc_strategy,   /* cb_strategy */
        vdc_print,      /* cb_print */
        vdc_dump,       /* cb_dump */
        vdc_read,       /* cb_read */
        vdc_write,      /* cb_write */
        vdc_ioctl,      /* cb_ioctl */
        nodev,          /* cb_devmap */
        nodev,          /* cb_mmap */
        nodev,          /* cb_segmap */
        nochpoll,       /* cb_chpoll */
        ddi_prop_op,    /* cb_prop_op */
        NULL,           /* cb_str */
        D_MP | D_64BIT, /* cb_flag */
        CB_REV,         /* cb_rev */
        vdc_aread,      /* cb_aread */
        vdc_awrite      /* cb_awrite */
};

static struct dev_ops vdc_ops = {
        DEVO_REV,       /* devo_rev */
        0,              /* devo_refcnt */
        vdc_getinfo,    /* devo_getinfo */
        nulldev,        /* devo_identify */
        nulldev,        /* devo_probe */
        vdc_attach,     /* devo_attach */
        vdc_detach,     /* devo_detach */
        nodev,          /* devo_reset */
        &vdc_cb_ops,    /* devo_cb_ops */
        NULL,           /* devo_bus_ops */
        nulldev         /* devo_power */
};

static struct modldrv modldrv = {
        &mod_driverops,
        "virtual disk client",
        &vdc_ops,
};

static struct modlinkage modlinkage = {
        MODREV_1,
        &modldrv,
        NULL
};

/* -------------------------------------------------------------------------- */

/*
 * Device Driver housekeeping and setup
 */

int
_init(void)
{
        int status;

        if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
                return (status);
        if ((status = mod_install(&modlinkage)) != 0)
                ddi_soft_state_fini(&vdc_state);
        return (status);
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
        int status;

        if ((status = mod_remove(&modlinkage)) != 0)
                return (status);
        ddi_soft_state_fini(&vdc_state);
        return (0);
}

static int
vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
        _NOTE(ARGUNUSED(dip))

        int instance = VDCUNIT((dev_t)arg);
        vdc_t *vdc = NULL;

        switch (cmd) {
        case DDI_INFO_DEVT2DEVINFO:
                if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                        *resultp = NULL;
                        return (DDI_FAILURE);
                }
                *resultp = vdc->dip;
                return (DDI_SUCCESS);
        case DDI_INFO_DEVT2INSTANCE:
                *resultp = (void *)(uintptr_t)instance;
                return (DDI_SUCCESS);
        default:
                *resultp = NULL;
                return (DDI_FAILURE);
        }
}

static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        kt_did_t failfast_tid, ownership_tid;
        int instance;
        int rv;
        vdc_t *vdc = NULL;

        switch (cmd) {
        case DDI_DETACH:
                /* the real work happens below */
                break;
        case DDI_SUSPEND:
                /* nothing to do for this non-device */
                return (DDI_SUCCESS);
        default:
                return (DDI_FAILURE);
        }

        ASSERT(cmd == DDI_DETACH);
        instance = ddi_get_instance(dip);
        DMSGX(1, "[%d] Entered\n", instance);

        if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
                return (DDI_FAILURE);
        }

        /*
         * This function is called when vdc is detached or when it has failed
         * to attach. In the latter case, the attach may have failed before
         * the vdisk type has been set, so we can't call vdc_is_opened().
         * However, as the attach has failed, we know that the vdisk is not
         * opened and we can safely detach.
         */
        if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
                DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
                return (DDI_FAILURE);
        }

        if (vdc->dkio_flush_pending) {
                DMSG(vdc, 0,
                    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
                    instance, vdc->dkio_flush_pending);
                return (DDI_FAILURE);
        }

        if (vdc->validate_pending) {
                DMSG(vdc, 0,
                    "[%d] Cannot detach: %d outstanding validate requests\n",
                    instance, vdc->validate_pending);
                return (DDI_FAILURE);
        }

        DMSG(vdc, 0, "[%d] proceeding...\n", instance);

        /* If we took ownership, release ownership */
        mutex_enter(&vdc->ownership_lock);
        if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
                rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
                if (rv == 0) {
                        vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
                }
        }
        mutex_exit(&vdc->ownership_lock);

        /* mark instance as detaching */
        vdc->lifecycle = VDC_LC_DETACHING;

        /*
         * try and disable callbacks to prevent another handshake
         */
        if (vdc->curr_server != NULL) {
                rv = ldc_set_cb_mode(vdc->curr_server->ldc_handle,
                    LDC_CB_DISABLE);
                DMSG(vdc, 0, "callback disabled (rv=%d)\n", rv);
        }

        if (vdc->initialized & VDC_THREAD) {
                mutex_enter(&vdc->read_lock);
                if ((vdc->read_state == VDC_READ_WAITING) ||
                    (vdc->read_state == VDC_READ_RESET)) {
                        vdc->read_state = VDC_READ_RESET;
                        cv_signal(&vdc->read_cv);
                }

                mutex_exit(&vdc->read_lock);

                /* wake up any thread waiting for connection to come online */
                mutex_enter(&vdc->lock);
                if (vdc->state == VDC_STATE_INIT_WAITING) {
                        DMSG(vdc, 0,
                            "[%d] write reset - move to resetting state...\n",
                            instance);
                        vdc->state = VDC_STATE_RESETTING;
                        cv_signal(&vdc->initwait_cv);
                }
                mutex_exit(&vdc->lock);

                /* now wait until state transitions to VDC_STATE_DETACH */
                thread_join(vdc->msg_proc_thr->t_did);
                ASSERT(vdc->state == VDC_STATE_DETACH);
                DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
                    vdc->instance);
        }

        mutex_enter(&vdc->lock);

        if (vdc->initialized & VDC_DRING)
                vdc_destroy_descriptor_ring(vdc);

        vdc_fini_ports(vdc);

        if (vdc->failfast_thread) {
                failfast_tid = vdc->failfast_thread->t_did;
                vdc->failfast_interval = 0;
                cv_signal(&vdc->failfast_cv);
        } else {
                failfast_tid = 0;
        }

        if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
                ownership_tid = vdc->ownership_thread->t_did;
                vdc->ownership = VDC_OWNERSHIP_NONE;
                cv_signal(&vdc->ownership_cv);
        } else {
                ownership_tid = 0;
        }

        mutex_exit(&vdc->lock);

        if (failfast_tid != 0)
                thread_join(failfast_tid);

        if (ownership_tid != 0)
                thread_join(ownership_tid);

        if (vdc->initialized & VDC_MINOR) {
                ddi_prop_remove_all(dip);
                ddi_remove_minor_node(dip, NULL);
        }

        if (vdc->io_stats) {
                kstat_delete(vdc->io_stats);
                vdc->io_stats = NULL;
        }

        if (vdc->err_stats) {
                kstat_delete(vdc->err_stats);
                vdc->err_stats = NULL;
        }

        if (vdc->initialized & VDC_LOCKS) {
                mutex_destroy(&vdc->lock);
                mutex_destroy(&vdc->read_lock);
                mutex_destroy(&vdc->ownership_lock);
                cv_destroy(&vdc->initwait_cv);
                cv_destroy(&vdc->dring_free_cv);
                cv_destroy(&vdc->membind_cv);
                cv_destroy(&vdc->sync_pending_cv);
                cv_destroy(&vdc->sync_blocked_cv);
                cv_destroy(&vdc->read_cv);
                cv_destroy(&vdc->running_cv);
                cv_destroy(&vdc->ownership_cv);
                cv_destroy(&vdc->failfast_cv);
                cv_destroy(&vdc->failfast_io_cv);
        }

        if (vdc->minfo)
                kmem_free(vdc->minfo, sizeof (struct dk_minfo));

        if (vdc->cinfo)
                kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));

        if (vdc->vtoc)
                kmem_free(vdc->vtoc, sizeof (struct vtoc));

        if (vdc->geom)
                kmem_free(vdc->geom, sizeof (struct dk_geom));

        if (vdc->devid) {
                ddi_devid_unregister(dip);
                ddi_devid_free(vdc->devid);
        }

        if (vdc->initialized & VDC_SOFT_STATE)
                ddi_soft_state_free(vdc_state, instance);

        DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);

        return (DDI_SUCCESS);
}


static int
vdc_do_attach(dev_info_t *dip)
{
        int instance;
        vdc_t *vdc = NULL;
        int status;
        md_t *mdp;
        mde_cookie_t vd_node;

        ASSERT(dip != NULL);

        instance = ddi_get_instance(dip);
        if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
                cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
                    instance);
                return (DDI_FAILURE);
        }

        if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
                return (DDI_FAILURE);
        }

        /*
         * Assigning to initialized (rather than OR'ing bits in) zeroes
         * out the variable; bits are then set in it to indicate what
         * has been done as setup proceeds.
         */
        vdc->initialized = VDC_SOFT_STATE;

        vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
        vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);

        vdc->dip = dip;
        vdc->instance = instance;
        vdc->vdisk_type = VD_DISK_TYPE_UNK;
        vdc->vdisk_label = VD_DISK_LABEL_UNK;
        vdc->state = VDC_STATE_INIT;
        vdc->lifecycle = VDC_LC_ATTACHING;
        vdc->session_id = 0;
        vdc->block_size = DEV_BSIZE;
        vdc->max_xfer_sz = maxphys / DEV_BSIZE;

        /*
         * We assume, for now, that the vDisk server will export 'read'
         * operations to us at a minimum (this is needed because of checks
         * in vdc for supported operations early in the handshake process).
         * The vDisk server will return ENOTSUP if this is not the case.
         * The value will be overwritten during the attribute exchange with
         * the bitmask of operations exported by the server.
         */
        vdc->operations = VD_OP_MASK_READ;

        vdc->vtoc = NULL;
        vdc->geom = NULL;
        vdc->cinfo = NULL;
        vdc->minfo = NULL;

        mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
        cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
        cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
        cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);

        vdc->threads_pending = 0;
        vdc->sync_op_pending = B_FALSE;
        vdc->sync_op_blocked = B_FALSE;
        cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
        cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

        mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
        cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
        cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);

        /* init blocking msg read functionality */
        mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
        vdc->read_state = VDC_READ_IDLE;

        vdc->initialized |= VDC_LOCKS;

        /* get device and port MD node for this disk instance */
        if (vdc_get_md_node(dip, &mdp, &vd_node) != 0) {
                cmn_err(CE_NOTE, "[%d] Could not get machine description node",
                    instance);
                return (DDI_FAILURE);
        }

        if (vdc_init_ports(vdc, mdp, vd_node) != 0) {
                cmn_err(CE_NOTE, "[%d] Error initialising ports", instance);
                return (DDI_FAILURE);
        }

        (void) md_fini_handle(mdp);

        /* initialize the thread responsible for managing state with server */
        vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
            vdc, 0, &p0, TS_RUN, minclsyspri);
        if (vdc->msg_proc_thr == NULL) {
                cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
                    instance);
                return (DDI_FAILURE);
        }

        vdc->initialized |= VDC_THREAD;

        /* Create the kstats for saving the I/O statistics used by iostat(1M) */
        vdc_create_io_kstats(vdc);
        vdc_create_err_kstats(vdc);

        atomic_inc_32(&vdc_instance_count);

        /*
         * Check the disk label. This will send requests and do the handshake.
         * We don't really care about the disk label now. What we really need
         * is the handshake to be done so that we know the type of the disk
         * (slice or full disk) and the appropriate device nodes can be
         * created.
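         *
         * For example, a full disk (VD_DISK_TYPE_DISK) ends up with minor
         * nodes for all V_NUMPAR slices, whereas a single exported slice
         * (VD_DISK_TYPE_SLICE) only gets the nodes for slice 'a'.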
         */
        vdc->vdisk_label = VD_DISK_LABEL_UNK;
        vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP);
        vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
        vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

        mutex_enter(&vdc->lock);
        (void) vdc_validate_geometry(vdc);
        mutex_exit(&vdc->lock);

        /*
         * Now that we have the device info we can create the
         * device nodes and properties
         */
        status = vdc_create_device_nodes(vdc);
        if (status) {
                DMSG(vdc, 0, "[%d] Failed to create device nodes",
                    instance);
                goto return_status;
        }
        status = vdc_create_device_nodes_props(vdc);
        if (status) {
                DMSG(vdc, 0, "[%d] Failed to create device nodes"
                    " properties (%d)", instance, status);
                goto return_status;
        }

        /*
         * Setup devid
         */
        if (vdc_setup_devid(vdc)) {
                DMSG(vdc, 0, "[%d] No device id available\n", instance);
        }

        /*
         * Fill in the fields of the error statistics kstat that were not
         * available when creating the kstat
         */
        vdc_set_err_kstats(vdc);

        ddi_report_dev(dip);
        vdc->lifecycle = VDC_LC_ONLINE;
        DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);

return_status:
        DMSG(vdc, 0, "[%d] Attach completed\n", instance);
        return (status);
}

static int
vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        int status;

        switch (cmd) {
        case DDI_ATTACH:
                if ((status = vdc_do_attach(dip)) != 0)
                        (void) vdc_detach(dip, DDI_DETACH);
                return (status);
        case DDI_RESUME:
                /* nothing to do for this non-device */
                return (DDI_SUCCESS);
        default:
                return (DDI_FAILURE);
        }
}

static int
vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr)
{
        int status = 0;
        ldc_status_t ldc_state;
        ldc_attr_t ldc_attr;

        ASSERT(vdc != NULL);
        ASSERT(srvr != NULL);

        ldc_attr.devclass = LDC_DEV_BLK;
        ldc_attr.instance = vdc->instance;
        ldc_attr.mode = LDC_MODE_UNRELIABLE;    /* unreliable transport */
        ldc_attr.mtu = VD_LDC_MTU;

        if ((srvr->state & VDC_LDC_INIT) == 0) {
                status = ldc_init(srvr->ldc_id, &ldc_attr,
                    &srvr->ldc_handle);
                if (status != 0) {
                        DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
                            vdc->instance, srvr->ldc_id, status);
                        return (status);
                }
                srvr->state |= VDC_LDC_INIT;
        }
        status = ldc_status(srvr->ldc_handle, &ldc_state);
        if (status != 0) {
                DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
                    vdc->instance, status);
                goto init_exit;
        }
        srvr->ldc_state = ldc_state;

        if ((srvr->state & VDC_LDC_CB) == 0) {
                status = ldc_reg_callback(srvr->ldc_handle, vdc_handle_cb,
                    (caddr_t)srvr);
                if (status != 0) {
                        DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
                            vdc->instance, status);
                        goto init_exit;
                }
                srvr->state |= VDC_LDC_CB;
        }

        /*
         * At this stage we have initialised LDC; we will now try to open
         * the connection.
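         *
         * Opening is only attempted from the LDC_INIT state; actually
         * bringing the link up is handled separately by vdc_do_ldc_up()
         * once the channel has been opened.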
         */
        if (srvr->ldc_state == LDC_INIT) {
                status = ldc_open(srvr->ldc_handle);
                if (status != 0) {
                        DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
                            vdc->instance, srvr->ldc_id, status);
                        goto init_exit;
                }
                srvr->state |= VDC_LDC_OPEN;
        }

init_exit:
        if (status) {
                vdc_terminate_ldc(vdc, srvr);
        }

        return (status);
}

static int
vdc_start_ldc_connection(vdc_t *vdc)
{
        int status = 0;

        ASSERT(vdc != NULL);

        ASSERT(MUTEX_HELD(&vdc->lock));

        status = vdc_do_ldc_up(vdc);

        DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);

        return (status);
}

static int
vdc_stop_ldc_connection(vdc_t *vdcp)
{
        int status;

        ASSERT(vdcp != NULL);

        ASSERT(MUTEX_HELD(&vdcp->lock));

        DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
            vdcp->state);

        status = ldc_down(vdcp->curr_server->ldc_handle);
        DMSG(vdcp, 0, "ldc_down() = %d\n", status);

        vdcp->initialized &= ~VDC_HANDSHAKE;
        DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);

        return (status);
}

static void
vdc_create_io_kstats(vdc_t *vdc)
{
        if (vdc->io_stats != NULL) {
                DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance);
                return;
        }

        vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL,
            "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
        if (vdc->io_stats != NULL) {
                vdc->io_stats->ks_lock = &vdc->lock;
                kstat_install(vdc->io_stats);
        } else {
                cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics"
                    " will not be gathered", vdc->instance);
        }
}

static void
vdc_create_err_kstats(vdc_t *vdc)
{
        vd_err_stats_t *stp;
        char kstatmodule_err[KSTAT_STRLEN];
        char kstatname[KSTAT_STRLEN];
        int ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t));
        int instance = vdc->instance;

        if (vdc->err_stats != NULL) {
                DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance);
                return;
        }

        (void) snprintf(kstatmodule_err, sizeof (kstatmodule_err),
            "%serr", VDC_DRIVER_NAME);
        (void) snprintf(kstatname, sizeof (kstatname),
            "%s%d,err", VDC_DRIVER_NAME, instance);

        vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname,
            "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

        if (vdc->err_stats == NULL) {
                cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics"
                    " will not be gathered", instance);
                return;
        }

        stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
        kstat_named_init(&stp->vd_softerrs, "Soft Errors",
            KSTAT_DATA_UINT32);
        kstat_named_init(&stp->vd_transerrs, "Transport Errors",
            KSTAT_DATA_UINT32);
        kstat_named_init(&stp->vd_protoerrs, "Protocol Errors",
            KSTAT_DATA_UINT32);
        kstat_named_init(&stp->vd_vid, "Vendor",
            KSTAT_DATA_CHAR);
        kstat_named_init(&stp->vd_pid, "Product",
            KSTAT_DATA_CHAR);
        kstat_named_init(&stp->vd_capacity, "Size",
            KSTAT_DATA_ULONGLONG);

        vdc->err_stats->ks_update = nulldev;

        kstat_install(vdc->err_stats);
}

static void
vdc_set_err_kstats(vdc_t *vdc)
{
        vd_err_stats_t *stp;

        if (vdc->err_stats == NULL)
                return;

        mutex_enter(&vdc->lock);

        stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
        ASSERT(stp != NULL);

        stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->block_size;
        (void) strcpy(stp->vd_vid.value.c, "SUN");
        (void) strcpy(stp->vd_pid.value.c, "VDSK");

        mutex_exit(&vdc->lock);
}

static int
vdc_create_device_nodes_efi(vdc_t *vdc)
{
        ddi_remove_minor_node(vdc->dip, "h");
        ddi_remove_minor_node(vdc->dip, "h,raw");

        if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
            VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
            DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
                cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
                    vdc->instance);
                return (EIO);
        }

        /* if any device node is created we set this flag */
        vdc->initialized |= VDC_MINOR;

        if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
            VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
            DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
                cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'",
                    vdc->instance);
                return (EIO);
        }

        return (0);
}

static int
vdc_create_device_nodes_vtoc(vdc_t *vdc)
{
        ddi_remove_minor_node(vdc->dip, "wd");
        ddi_remove_minor_node(vdc->dip, "wd,raw");

        if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
            VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
            DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
                cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
                    vdc->instance);
                return (EIO);
        }

        /* if any device node is created we set this flag */
        vdc->initialized |= VDC_MINOR;

        if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
            VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
            DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
                cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'",
                    vdc->instance);
                return (EIO);
        }

        return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices. It is called as part of the attach(9E) of the instance
 *	during the handshake with vds after vds has sent the attributes
 *	to vdc.
 *
 *	If the device is of type VD_DISK_TYPE_SLICE then a minor node of 2
 *	is used, in keeping with the Solaris convention that slice 2
 *	refers to a whole disk. Slices start at 'a'.
 *
 * Parameters:
 *	vdc - soft state pointer
 *
 * Return Values
 *	0	- Success
 *	EIO	- Failed to create node
 *	EINVAL	- Unknown type of disk exported
 */
static int
vdc_create_device_nodes(vdc_t *vdc)
{
        char name[sizeof ("s,raw")];
        dev_info_t *dip = NULL;
        int instance, status;
        int num_slices = 1;
        int i;

        ASSERT(vdc != NULL);

        instance = vdc->instance;
        dip = vdc->dip;

        switch (vdc->vdisk_type) {
        case VD_DISK_TYPE_DISK:
                num_slices = V_NUMPAR;
                break;
        case VD_DISK_TYPE_SLICE:
                num_slices = 1;
                break;
        case VD_DISK_TYPE_UNK:
        default:
                return (EINVAL);
        }

        /*
         * Minor nodes are different for EFI disks: EFI disks do not have
         * a minor node 'h' for the minor number corresponding to slice
         * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
         * representing the whole disk.
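         *
         * As a result, an EFI-labelled vdisk exposes 'wd' and 'wd,raw'
         * minor nodes for that slice, while a VTOC-labelled one exposes
         * 'h' and 'h,raw' for the same minor number.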
         */
        for (i = 0; i < num_slices; i++) {

                if (i == VD_EFI_WD_SLICE) {
                        if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
                                status = vdc_create_device_nodes_efi(vdc);
                        else
                                status = vdc_create_device_nodes_vtoc(vdc);
                        if (status != 0)
                                return (status);
                        continue;
                }

                (void) snprintf(name, sizeof (name), "%c", 'a' + i);
                if (ddi_create_minor_node(dip, name, S_IFBLK,
                    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
                        cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
                            instance, name);
                        return (EIO);
                }

                /* if any device node is created we set this flag */
                vdc->initialized |= VDC_MINOR;

                (void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");

                if (ddi_create_minor_node(dip, name, S_IFCHR,
                    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
                        cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
                            instance, name);
                        return (EIO);
                }
        }

        return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes_props
 *
 * Description:
 *	This function creates the size properties (VDC_SIZE_PROP_NAME and
 *	VDC_NBLOCKS_PROP_NAME) for the device nodes created by
 *	vdc_create_device_nodes(). It is called as part of the attach(9E)
 *	of the instance during the handshake with vds after vds has sent
 *	the attributes to vdc.
 *
 * Parameters:
 *	vdc - soft state pointer
 *
 * Return Values
 *	0	- Success
 *	EIO	- Failed to create device node property
 *	EINVAL	- Unknown type of disk exported
 */
static int
vdc_create_device_nodes_props(vdc_t *vdc)
{
        dev_info_t *dip = NULL;
        int instance;
        int num_slices = 1;
        int64_t size = 0;
        dev_t dev;
        int rv;
        int i;

        ASSERT(vdc != NULL);

        instance = vdc->instance;
        dip = vdc->dip;

        switch (vdc->vdisk_type) {
        case VD_DISK_TYPE_DISK:
                num_slices = V_NUMPAR;
                break;
        case VD_DISK_TYPE_SLICE:
                num_slices = 1;
                break;
        case VD_DISK_TYPE_UNK:
        default:
                return (EINVAL);
        }

        if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
                /* remove all properties */
                for (i = 0; i < num_slices; i++) {
                        dev = makedevice(ddi_driver_major(dip),
                            VD_MAKE_DEV(instance, i));
                        (void) ddi_prop_remove(dev, dip, VDC_SIZE_PROP_NAME);
                        (void) ddi_prop_remove(dev, dip, VDC_NBLOCKS_PROP_NAME);
                }
                return (0);
        }

        for (i = 0; i < num_slices; i++) {
                dev = makedevice(ddi_driver_major(dip),
                    VD_MAKE_DEV(instance, i));

                size = vdc->slice[i].nblocks * vdc->block_size;
                DMSG(vdc, 0, "[%d] sz %ld (%ld Mb) p_size %lx\n",
                    instance, size, size / (1024 * 1024),
                    vdc->slice[i].nblocks);

                rv = ddi_prop_update_int64(dev, dip, VDC_SIZE_PROP_NAME, size);
                if (rv != DDI_PROP_SUCCESS) {
                        cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop of [%ld]",
                            instance, VDC_SIZE_PROP_NAME, size);
                        return (EIO);
                }

                rv = ddi_prop_update_int64(dev, dip, VDC_NBLOCKS_PROP_NAME,
                    lbtodb(size));
                if (rv != DDI_PROP_SUCCESS) {
                        cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop [%llu]",
                            instance, VDC_NBLOCKS_PROP_NAME, lbtodb(size));
                        return (EIO);
                }
        }

        return (0);
}

/*
 * Function:
 *	vdc_is_opened
 *
 * Description:
 *	This function checks if any slice of a given virtual disk is
 *	currently opened.
 *
 * Parameters:
 *	vdc - soft state pointer
 *
 * Return Values
 *	B_TRUE	- at least one slice is opened.
 *	B_FALSE	- no slice is opened.
 */
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
        int i, nslices;

        switch (vdc->vdisk_type) {
        case VD_DISK_TYPE_DISK:
                nslices = V_NUMPAR;
                break;
        case VD_DISK_TYPE_SLICE:
                nslices = 1;
                break;
        case VD_DISK_TYPE_UNK:
        default:
                ASSERT(0);
        }

        /* check if there's any layered open */
        for (i = 0; i < nslices; i++) {
                if (vdc->open_lyr[i] > 0)
                        return (B_TRUE);
        }

        /* check if there is any other kind of open */
        for (i = 0; i < OTYPCNT; i++) {
                if (vdc->open[i] != 0)
                        return (B_TRUE);
        }

        return (B_FALSE);
}

static int
vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
{
        uint8_t slicemask;
        int i;

        ASSERT(otyp < OTYPCNT);
        ASSERT(slice < V_NUMPAR);
        ASSERT(MUTEX_HELD(&vdc->lock));

        slicemask = 1 << slice;

        /* check if slice is already exclusively opened */
        if (vdc->open_excl & slicemask)
                return (EBUSY);

        /* if open exclusive, check if slice is already opened */
        if (flag & FEXCL) {
                if (vdc->open_lyr[slice] > 0)
                        return (EBUSY);
                for (i = 0; i < OTYPCNT; i++) {
                        if (vdc->open[i] & slicemask)
                                return (EBUSY);
                }
                vdc->open_excl |= slicemask;
        }

        /* mark slice as opened */
        if (otyp == OTYP_LYR) {
                vdc->open_lyr[slice]++;
        } else {
                vdc->open[otyp] |= slicemask;
        }

        return (0);
}

static void
vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
{
        uint8_t slicemask;

        ASSERT(otyp < OTYPCNT);
        ASSERT(slice < V_NUMPAR);
        ASSERT(MUTEX_HELD(&vdc->lock));

        slicemask = 1 << slice;

        if (otyp == OTYP_LYR) {
                ASSERT(vdc->open_lyr[slice] > 0);
                vdc->open_lyr[slice]--;
        } else {
                vdc->open[otyp] &= ~slicemask;
        }

        if (flag & FEXCL)
                vdc->open_excl &= ~slicemask;
}

static int
vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
{
        _NOTE(ARGUNUSED(cred))

        int instance, nodelay;
        int slice, status = 0;
        vdc_t *vdc;

        ASSERT(dev != NULL);
        instance = VDCUNIT(*dev);

        if (otyp >= OTYPCNT)
                return (EINVAL);

        if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
                return (ENXIO);
        }

        DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
            getminor(*dev), flag, otyp);

        slice = VDCPART(*dev);

        nodelay = flag & (FNDELAY | FNONBLOCK);

        if ((flag & FWRITE) && (!nodelay) &&
            !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE))) {
                return (EROFS);
        }

        mutex_enter(&vdc->lock);

        status = vdc_mark_opened(vdc, slice, flag, otyp);

        if (status != 0) {
                mutex_exit(&vdc->lock);
                return (status);
        }

        if (nodelay) {

                /* don't resubmit a validate request if there's already one */
                if (vdc->validate_pending > 0) {
                        mutex_exit(&vdc->lock);
                        return (0);
                }

                /* call vdc_validate() asynchronously to avoid blocking */
                if (taskq_dispatch(system_taskq, vdc_validate_task,
                    (void *)vdc, TQ_NOSLEEP) == NULL) {
                        vdc_mark_closed(vdc, slice, flag, otyp);
                        mutex_exit(&vdc->lock);
                        return (ENXIO);
                }

                vdc->validate_pending++;
                mutex_exit(&vdc->lock);
                return (0);
        }

        mutex_exit(&vdc->lock);

        vdc_validate(vdc);

        mutex_enter(&vdc->lock);

        if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
            vdc->slice[slice].nblocks == 0) {
                vdc_mark_closed(vdc, slice, flag, otyp);
                status = EIO;
        }

        mutex_exit(&vdc->lock);

        return (status);
}

static int
vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
        _NOTE(ARGUNUSED(cred))

        int instance;
        int slice;
        int rv, rval;
        vdc_t *vdc;

        instance = VDCUNIT(dev);

        if (otyp >= OTYPCNT)
                return (EINVAL);

        if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
                return (ENXIO);
        }

        DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);

        slice = VDCPART(dev);

        /*
         * Attempt to flush the write cache (W$) on a close operation.
         * If this is not a supported IOCTL command, or the backing device
         * is read-only, do not fail the close operation.
         */
        rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);

        if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
                DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
                    instance, rv);
                return (EIO);
        }

        mutex_enter(&vdc->lock);
        vdc_mark_closed(vdc, slice, flag, otyp);
        mutex_exit(&vdc->lock);

        return (0);
}

static int
vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
        _NOTE(ARGUNUSED(credp))

        return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
}

static int
vdc_print(dev_t dev, char *str)
{
        cmn_err(CE_NOTE, "vdc%d: %s", VDCUNIT(dev), str);
        return (0);
}

static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
        int rv;
        size_t nbytes = nblk * DEV_BSIZE;
        int instance = VDCUNIT(dev);
        vdc_t *vdc = NULL;

        if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
                return (ENXIO);
        }

        DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
            instance, nbytes, blkno, (void *)addr);
        rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
            VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir);
        if (rv) {
                DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
                return (rv);
        }

        if (ddi_in_panic())
                (void) vdc_drain_response(vdc);

        DMSG(vdc, 0, "[%d] End\n", instance);

        return (0);
}

/* -------------------------------------------------------------------------- */

/*
 * Disk access routines
 *
 */

/*
 * vdc_strategy()
 *
 * Return Value:
 *	0:	As per strategy(9E), the strategy() function must return 0
 *		[ bioerror(9f) sets b_flags to the proper error code ]
 */
static int
vdc_strategy(struct buf *buf)
{
        int rv = -1;
        vdc_t *vdc = NULL;
        int instance = VDCUNIT(buf->b_edev);
        int op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE;
        int slice;

        if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
                cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
                bioerror(buf, ENXIO);
                biodone(buf);
                return (0);
        }

        DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
            instance, (buf->b_flags & B_READ) ? "Read" : "Write",
            buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr);

        bp_mapin(buf);

        if ((long)buf->b_private == VD_SLICE_NONE) {
                /* I/O using an absolute disk offset */
                slice = VD_SLICE_NONE;
        } else {
                slice = VDCPART(buf->b_edev);
        }

        rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr,
            buf->b_bcount, slice, buf->b_lblkno,
            CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir :
            VIO_write_dir);

        /*
         * If the request was successfully sent, the strategy call returns
         * and the ACK handler calls the bioxxx functions when the vDisk
         * server is done; otherwise we handle the error here.
         */
        if (rv) {
                DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv);
                bioerror(buf, rv);
                biodone(buf);
        }

        return (0);
}

/*
 * Function:
 *	vdc_min
 *
 * Description:
 *	Routine to limit the size of a data transfer. Used in
 *	conjunction with physio(9F).
 *
 * Arguments:
 *	bufp - pointer to the indicated buf(9S) struct.
 *
 */
static void
vdc_min(struct buf *bufp)
{
        vdc_t *vdc = NULL;
        int instance = VDCUNIT(bufp->b_edev);

        vdc = ddi_get_soft_state(vdc_state, instance);
        VERIFY(vdc != NULL);

        if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) {
                bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size;
        }
}

static int
vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
{
        _NOTE(ARGUNUSED(cred))

        DMSGX(1, "[%d] Entered", VDCUNIT(dev));
        return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio));
}

static int
vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
{
        _NOTE(ARGUNUSED(cred))

        DMSGX(1, "[%d] Entered", VDCUNIT(dev));
        return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio));
}

static int
vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
{
        _NOTE(ARGUNUSED(cred))

        DMSGX(1, "[%d] Entered", VDCUNIT(dev));
        return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio));
}

static int
vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
{
        _NOTE(ARGUNUSED(cred))

        DMSGX(1, "[%d] Entered", VDCUNIT(dev));
        return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio));
}


/* -------------------------------------------------------------------------- */

/*
 * Handshake support
 */


/*
 * Function:
 *	vdc_init_ver_negotiation()
 *
 * Description:
 *	Send a version negotiation message (VIO_VER_INFO) to the vDisk
 *	server, proposing the supplied protocol version.
 *
 * Arguments:
 *	vdc - soft state pointer for this instance of the device driver.
 *	ver - protocol version to propose to the vDisk server.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
{
        vio_ver_msg_t pkt;
        size_t msglen = sizeof (pkt);
        int status = -1;

        ASSERT(vdc != NULL);
        ASSERT(mutex_owned(&vdc->lock));

        DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);

        /*
         * set the Session ID to a unique value
         * (the lower 32 bits of the clock tick)
         */
        vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
        DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id);

        pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
        pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
        pkt.tag.vio_subtype_env = VIO_VER_INFO;
        pkt.tag.vio_sid = vdc->session_id;
        pkt.dev_class = VDEV_DISK;
        pkt.ver_major = ver.major;
        pkt.ver_minor = ver.minor;

        status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
        DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
            vdc->instance, status);
        if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
                DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
                    "id(%lx) rv(%d) size(%ld)", vdc->instance,
                    vdc->curr_server->ldc_handle, status, msglen);
                if (msglen != sizeof (vio_ver_msg_t))
                        status = ENOMSG;
        }

        return (status);
}

/*
 * Function:
 *	vdc_ver_negotiation()
 *
 * Description:
 *	Kick off version negotiation, then wait for and process the
 *	server's response.
 *
 * Arguments:
 *	vdcp - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_ver_negotiation(vdc_t *vdcp)
{
        vio_msg_t vio_msg;
        int status;

        if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0]))
                return (status);

        /* release lock and wait for response */
        mutex_exit(&vdcp->lock);
        status = vdc_wait_for_response(vdcp, &vio_msg);
        mutex_enter(&vdcp->lock);
        if (status) {
                DMSG(vdcp, 0,
                    "[%d] Failed waiting for Ver negotiation response, rv(%d)",
                    vdcp->instance, status);
                return (status);
        }

        /* check type and sub_type ... */
        if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
            vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
                DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
                    vdcp->instance);
                return (EPROTO);
        }

        return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
}

/*
 * Function:
 *	vdc_init_attr_negotiation()
 *
 * Description:
 *	Send an attribute exchange message (VIO_ATTR_INFO) to the vDisk
 *	server advertising this client's parameters.
 *
 * Arguments:
 *	vdc - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_attr_negotiation(vdc_t *vdc)
{
        vd_attr_msg_t pkt;
        size_t msglen = sizeof (pkt);
        int status;

        ASSERT(vdc != NULL);
        ASSERT(mutex_owned(&vdc->lock));

        DMSG(vdc, 0, "[%d] entered\n", vdc->instance);

        /* fill in tag */
        pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
        pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
        pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
        pkt.tag.vio_sid = vdc->session_id;
        /* fill in payload */
        pkt.max_xfer_sz = vdc->max_xfer_sz;
        pkt.vdisk_block_size = vdc->block_size;
        pkt.xfer_mode = VIO_DRING_MODE_V1_0;
        pkt.operations = 0;     /* server will set bits of valid operations */
        pkt.vdisk_type = 0;     /* server will set to valid device type */
        pkt.vdisk_media = 0;    /* server will set to valid media type */
        pkt.vdisk_size = 0;     /* server will set to valid size */

        status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
        DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);

        if ((status != 0) || (msglen != sizeof (pkt))) {
                DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
                    "id(%lx) rv(%d) size(%ld)", vdc->instance,
                    vdc->curr_server->ldc_handle, status, msglen);
                if (msglen != sizeof (pkt))
                        status = ENOMSG;
        }

        return (status);
}

/*
 * Function:
 *	vdc_attr_negotiation()
 *
 * Description:
 *	Kick off the attribute exchange, then wait for and process the
 *	server's response.
 *
 * Arguments:
 *	vdcp - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_attr_negotiation(vdc_t *vdcp)
{
        int status;
        vio_msg_t vio_msg;

        if (status = vdc_init_attr_negotiation(vdcp))
                return (status);

        /* release lock and wait for response */
        mutex_exit(&vdcp->lock);
        status = vdc_wait_for_response(vdcp, &vio_msg);
        mutex_enter(&vdcp->lock);
        if (status) {
                DMSG(vdcp, 0,
                    "[%d] Failed waiting for Attr negotiation response, rv(%d)",
                    vdcp->instance, status);
                return (status);
        }

        /* check type and sub_type ... */
        if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
            vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
                DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
                    vdcp->instance);
                return (EPROTO);
        }

        return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_init_dring_negotiate()
 *
 * Description:
 *	Set up the local descriptor ring and send a registration message
 *	(VIO_DRING_REG) for it to the vDisk server.
 *
 * Arguments:
 *	vdc - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_dring_negotiate(vdc_t *vdc)
{
        vio_dring_reg_msg_t pkt;
        size_t msglen = sizeof (pkt);
        int status = -1;
        int retry;
        int nretries = 10;

        ASSERT(vdc != NULL);
        ASSERT(mutex_owned(&vdc->lock));

        for (retry = 0; retry < nretries; retry++) {
                status = vdc_init_descriptor_ring(vdc);
                if (status != EAGAIN)
                        break;
                drv_usecwait(vdc_min_timeout_ldc);
        }

        if (status != 0) {
                DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
                    vdc->instance, status);
                return (status);
        }

        DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
            vdc->instance, status);

        /* fill in tag */
        pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
        pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
        pkt.tag.vio_subtype_env = VIO_DRING_REG;
        pkt.tag.vio_sid = vdc->session_id;
        /* fill in payload */
        pkt.dring_ident = 0;
        pkt.num_descriptors = vdc->dring_len;
        pkt.descriptor_size = vdc->dring_entry_size;
        pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
        pkt.ncookies = vdc->dring_cookie_count;
        pkt.cookie[0] = vdc->dring_cookie[0];   /* for now just one cookie */

        status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
        if (status != 0) {
                DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
                    vdc->instance, status);
        }

        return (status);
}


/*
 * Function:
 *	vdc_dring_negotiation()
 *
 * Description:
 *	Kick off descriptor ring registration, then wait for and process
 *	the server's response.
 *
 * Arguments:
 *	vdcp - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_dring_negotiation(vdc_t *vdcp)
{
        int status;
        vio_msg_t vio_msg;

        if (status = vdc_init_dring_negotiate(vdcp))
                return (status);

        /* release lock and wait for response */
        mutex_exit(&vdcp->lock);
        status = vdc_wait_for_response(vdcp, &vio_msg);
        mutex_enter(&vdcp->lock);
        if (status) {
                DMSG(vdcp, 0,
                    "[%d] Failed waiting for Dring negotiation response,"
                    " rv(%d)", vdcp->instance, status);
                return (status);
        }

        /* check type and sub_type ... */
        if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
            vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
                DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
                    vdcp->instance);
                return (EPROTO);
        }

        return (vdc_handle_dring_reg_msg(vdcp,
            (vio_dring_reg_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_send_rdx()
 *
 * Description:
 *	Send an RDX message to the vDisk server to indicate that vdc is
 *	ready to transfer data.
 *
 * Arguments:
 *	vdcp - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_send_rdx(vdc_t *vdcp)
{
        vio_msg_t msg;
        size_t msglen = sizeof (vio_msg_t);
        int status;

        /*
         * Send an RDX message to vds to indicate we are ready
         * to send data
         */
        msg.tag.vio_msgtype = VIO_TYPE_CTRL;
        msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
        msg.tag.vio_subtype_env = VIO_RDX;
        msg.tag.vio_sid = vdcp->session_id;
        status = vdc_send(vdcp, (caddr_t)&msg, &msglen);
        if (status != 0) {
                DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)",
                    vdcp->instance, status);
        }

        return (status);
}

/*
 * Function:
 *	vdc_handle_rdx()
 *
 * Description:
 *	Process the RDX ACK received from the vDisk server.
 *
 * Arguments:
 *	vdcp - soft state pointer for this instance of the device driver.
 *	msgp - received msg
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp)
{
        _NOTE(ARGUNUSED(vdcp))
        _NOTE(ARGUNUSED(msgp))

        ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL);
        ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK);
        ASSERT(msgp->tag.vio_subtype_env == VIO_RDX);

        DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance);

        return (0);
}

/*
 * Function:
 *	vdc_rdx_exchange()
 *
 * Description:
 *	Perform the RDX exchange, then wait for and process the server's
 *	response.
 *
 * Arguments:
 *	vdcp - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_rdx_exchange(vdc_t *vdcp)
{
        int status;
        vio_msg_t vio_msg;

        if (status = vdc_send_rdx(vdcp))
                return (status);

        /* release lock and wait for response */
        mutex_exit(&vdcp->lock);
        status = vdc_wait_for_response(vdcp, &vio_msg);
        mutex_enter(&vdcp->lock);
        if (status) {
                DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)",
                    vdcp->instance, status);
                return (status);
        }

        /* check type and sub_type ... */
        if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
            vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) {
                DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance);
                return (EPROTO);
        }

        return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg));
}


/* -------------------------------------------------------------------------- */

/*
 * LDC helper routines
 */

static int
vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp)
{
        int status;
        boolean_t q_has_pkts = B_FALSE;
        uint64_t delay_time;
        size_t len;

        mutex_enter(&vdc->read_lock);

        if (vdc->read_state == VDC_READ_IDLE)
                vdc->read_state = VDC_READ_WAITING;

        while (vdc->read_state != VDC_READ_PENDING) {

                /* detect if the connection has been reset */
                if (vdc->read_state == VDC_READ_RESET) {
                        status = ECONNRESET;
                        goto done;
                }

                cv_wait(&vdc->read_cv, &vdc->read_lock);
        }

        /*
         * Until we get a blocking ldc read we have to retry
         * until the entire LDC message has arrived before
         * ldc_read() will succeed. Note we also bail out if
         * the channel is reset or goes away.
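         *
         * The retry delay below doubles after each EAGAIN returned by
         * ldc_read(), starting at vdc_ldc_read_init_delay and capped at
         * vdc_ldc_read_max_delay.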
2027 	 */
2028 	delay_time = vdc_ldc_read_init_delay;
2029 loop:
2030 	len = *nbytesp;
2031 	status = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)msgp, &len);
2032 	switch (status) {
2033 	case EAGAIN:
2034 		delay_time *= 2;
2035 		if (delay_time >= vdc_ldc_read_max_delay)
2036 			delay_time = vdc_ldc_read_max_delay;
2037 		delay(delay_time);
2038 		goto loop;
2039 
2040 	case 0:
2041 		if (len == 0) {
2042 			DMSG(vdc, 1, "[%d] ldc_read returned 0 bytes with "
2043 			    "no error!\n", vdc->instance);
2044 			goto loop;
2045 		}
2046 
2047 		*nbytesp = len;
2048 
2049 		/*
2050 		 * If there are pending messages, leave the
2051 		 * read state as pending. Otherwise, set the state
2052 		 * back to idle.
2053 		 */
2054 		status = ldc_chkq(vdc->curr_server->ldc_handle, &q_has_pkts);
2055 		if (status == 0 && !q_has_pkts)
2056 			vdc->read_state = VDC_READ_IDLE;
2057 
2058 		break;
2059 	default:
2060 		DMSG(vdc, 0, "ldc_read returned %d\n", status);
2061 		break;
2062 	}
2063 
2064 done:
2065 	mutex_exit(&vdc->read_lock);
2066 
2067 	return (status);
2068 }
2069 
2070 
2071 
2072 #ifdef DEBUG
2073 void
2074 vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg)
2075 {
2076 	char *ms, *ss, *ses;
2077 	switch (msg->tag.vio_msgtype) {
2078 #define	Q(_s)	case _s : ms = #_s; break;
2079 	Q(VIO_TYPE_CTRL)
2080 	Q(VIO_TYPE_DATA)
2081 	Q(VIO_TYPE_ERR)
2082 #undef Q
2083 	default: ms = "unknown"; break;
2084 	}
2085 
2086 	switch (msg->tag.vio_subtype) {
2087 #define	Q(_s)	case _s : ss = #_s; break;
2088 	Q(VIO_SUBTYPE_INFO)
2089 	Q(VIO_SUBTYPE_ACK)
2090 	Q(VIO_SUBTYPE_NACK)
2091 #undef Q
2092 	default: ss = "unknown"; break;
2093 	}
2094 
2095 	switch (msg->tag.vio_subtype_env) {
2096 #define	Q(_s)	case _s : ses = #_s; break;
2097 	Q(VIO_VER_INFO)
2098 	Q(VIO_ATTR_INFO)
2099 	Q(VIO_DRING_REG)
2100 	Q(VIO_DRING_UNREG)
2101 	Q(VIO_RDX)
2102 	Q(VIO_PKT_DATA)
2103 	Q(VIO_DESC_DATA)
2104 	Q(VIO_DRING_DATA)
2105 #undef Q
2106 	default: ses = "unknown"; break;
2107 	}
2108 
2109 	DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n",
2110 	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
2111 	    msg->tag.vio_subtype_env, ms, ss, ses);
2112 }
2113 #endif
2114 
2115 /*
2116  * Function:
2117  *	vdc_send()
2118  *
2119  * Description:
2120  *	The function encapsulates the call to write a message using LDC.
2121  *	If LDC indicates that the call failed due to the queue being full,
2122  *	we retry the ldc_write(), otherwise we return the error returned by LDC.
2123  *
2124  * Arguments:
2125  *	vdc	- soft state pointer; the message is written to the LDC
2126  *		  channel of the current server (vdc->curr_server).
2127  *	pkt	- address of LDC message to be sent
2128  *	msglen	- the size of the message being sent; on return it holds the
 *		  number of bytes written.
2129  *
2130  * Return Code:
2131  *	0		- Success.
2132  *	EINVAL		- pkt or msglen were NULL
2133  *	ECONNRESET	- The connection was not up.
2134  *	EWOULDBLOCK	- LDC queue is full
2135  *	xxx		- other error codes returned by ldc_write
2136  */
2137 static int
2138 vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen)
2139 {
2140 	size_t	size = 0;
2141 	int status = 0;
2142 	clock_t delay_ticks;
2143 
2144 	ASSERT(vdc != NULL);
2145 	ASSERT(mutex_owned(&vdc->lock));
2146 	ASSERT(msglen != NULL);
2147 	ASSERT(*msglen != 0);
2148 
2149 #ifdef DEBUG
2150 	vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt);
2151 #endif
2152 	/*
2153 	 * Wait indefinitely to send if channel
2154 	 * is busy, but bail out if we succeed or
2155 	 * if the channel closes or is reset.
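	 *
	 * (Editorial note: the loop below mirrors the read-side policy in
	 * vdc_recv() - on every EWOULDBLOCK it sleeps for delay_ticks and
	 * then doubles the delay, bounded above by vdc_hz_max_ldc_delay.)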
2156 */ 2157 delay_ticks = vdc_hz_min_ldc_delay; 2158 do { 2159 size = *msglen; 2160 status = ldc_write(vdc->curr_server->ldc_handle, pkt, &size); 2161 if (status == EWOULDBLOCK) { 2162 delay(delay_ticks); 2163 /* geometric backoff */ 2164 delay_ticks *= 2; 2165 if (delay_ticks > vdc_hz_max_ldc_delay) 2166 delay_ticks = vdc_hz_max_ldc_delay; 2167 } 2168 } while (status == EWOULDBLOCK); 2169 2170 /* if LDC had serious issues --- reset vdc state */ 2171 if (status == EIO || status == ECONNRESET) { 2172 /* LDC had serious issues --- reset vdc state */ 2173 mutex_enter(&vdc->read_lock); 2174 if ((vdc->read_state == VDC_READ_WAITING) || 2175 (vdc->read_state == VDC_READ_RESET)) 2176 cv_signal(&vdc->read_cv); 2177 vdc->read_state = VDC_READ_RESET; 2178 mutex_exit(&vdc->read_lock); 2179 2180 /* wake up any waiters in the reset thread */ 2181 if (vdc->state == VDC_STATE_INIT_WAITING) { 2182 DMSG(vdc, 0, "[%d] write reset - " 2183 "vdc is resetting ..\n", vdc->instance); 2184 vdc->state = VDC_STATE_RESETTING; 2185 cv_signal(&vdc->initwait_cv); 2186 } 2187 2188 return (ECONNRESET); 2189 } 2190 2191 /* return the last size written */ 2192 *msglen = size; 2193 2194 return (status); 2195 } 2196 2197 /* 2198 * Function: 2199 * vdc_get_md_node 2200 * 2201 * Description: 2202 * Get the MD, the device node for the given disk instance. The 2203 * caller is responsible for cleaning up the reference to the 2204 * returned MD (mdpp) by calling md_fini_handle(). 2205 * 2206 * Arguments: 2207 * dip - dev info pointer for this instance of the device driver. 2208 * mdpp - the returned MD. 2209 * vd_nodep - the returned device node. 2210 * 2211 * Return Code: 2212 * 0 - Success. 2213 * ENOENT - Expected node or property did not exist. 2214 * ENXIO - Unexpected error communicating with MD framework 2215 */ 2216 static int 2217 vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep) 2218 { 2219 int status = ENOENT; 2220 char *node_name = NULL; 2221 md_t *mdp = NULL; 2222 int num_nodes; 2223 int num_vdevs; 2224 mde_cookie_t rootnode; 2225 mde_cookie_t *listp = NULL; 2226 boolean_t found_inst = B_FALSE; 2227 int listsz; 2228 int idx; 2229 uint64_t md_inst; 2230 int obp_inst; 2231 int instance = ddi_get_instance(dip); 2232 2233 /* 2234 * Get the OBP instance number for comparison with the MD instance 2235 * 2236 * The "cfg-handle" property of a vdc node in an MD contains the MD's 2237 * notion of "instance", or unique identifier, for that node; OBP 2238 * stores the value of the "cfg-handle" MD property as the value of 2239 * the "reg" property on the node in the device tree it builds from 2240 * the MD and passes to Solaris. Thus, we look up the devinfo node's 2241 * "reg" property value to uniquely identify this device instance. 2242 * If the "reg" property cannot be found, the device tree state is 2243 * presumably so broken that there is no point in continuing. 2244 */ 2245 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) { 2246 cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG); 2247 return (ENOENT); 2248 } 2249 obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2250 OBP_REG, -1); 2251 DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst); 2252 2253 /* 2254 * We now walk the MD nodes to find the node for this vdisk. 
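	 *
	 * In outline, an editorial sketch of the search performed below:
	 *
	 *	for each node N found by md_scan_dag(mdp, rootnode,
	 *	    VDC_MD_VDEV_NAME, "fwd"):
	 *		if the "name" property of N is VDC_MD_DISK_NAME and
	 *		    its VDC_MD_CFG_HDL property matches the OBP "reg"
	 *		    value, then N is the MD node for this instance.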
2255 */ 2256 if ((mdp = md_get_handle()) == NULL) { 2257 cmn_err(CE_WARN, "unable to init machine description"); 2258 return (ENXIO); 2259 } 2260 2261 num_nodes = md_node_count(mdp); 2262 ASSERT(num_nodes > 0); 2263 2264 listsz = num_nodes * sizeof (mde_cookie_t); 2265 2266 /* allocate memory for nodes */ 2267 listp = kmem_zalloc(listsz, KM_SLEEP); 2268 2269 rootnode = md_root_node(mdp); 2270 ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); 2271 2272 /* 2273 * Search for all the virtual devices, we will then check to see which 2274 * ones are disk nodes. 2275 */ 2276 num_vdevs = md_scan_dag(mdp, rootnode, 2277 md_find_name(mdp, VDC_MD_VDEV_NAME), 2278 md_find_name(mdp, "fwd"), listp); 2279 2280 if (num_vdevs <= 0) { 2281 cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); 2282 status = ENOENT; 2283 goto done; 2284 } 2285 2286 DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs); 2287 for (idx = 0; idx < num_vdevs; idx++) { 2288 status = md_get_prop_str(mdp, listp[idx], "name", &node_name); 2289 if ((status != 0) || (node_name == NULL)) { 2290 cmn_err(CE_NOTE, "Unable to get name of node type '%s'" 2291 ": err %d", VDC_MD_VDEV_NAME, status); 2292 continue; 2293 } 2294 2295 DMSGX(1, "[%d] Found node '%s'\n", instance, node_name); 2296 if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { 2297 status = md_get_prop_val(mdp, listp[idx], 2298 VDC_MD_CFG_HDL, &md_inst); 2299 DMSGX(1, "[%d] vdc inst in MD=%lx\n", 2300 instance, md_inst); 2301 if ((status == 0) && (md_inst == obp_inst)) { 2302 found_inst = B_TRUE; 2303 break; 2304 } 2305 } 2306 } 2307 2308 if (!found_inst) { 2309 DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME); 2310 status = ENOENT; 2311 goto done; 2312 } 2313 DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst); 2314 2315 *vd_nodep = listp[idx]; 2316 *mdpp = mdp; 2317 done: 2318 kmem_free(listp, listsz); 2319 return (status); 2320 } 2321 2322 /* 2323 * Function: 2324 * vdc_init_ports 2325 * 2326 * Description: 2327 * Initialize all the ports for this vdisk instance. 2328 * 2329 * Arguments: 2330 * vdc - soft state pointer for this instance of the device driver. 2331 * mdp - md pointer 2332 * vd_nodep - device md node. 2333 * 2334 * Return Code: 2335 * 0 - Success. 2336 * ENOENT - Expected node or property did not exist. 2337 */ 2338 static int 2339 vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep) 2340 { 2341 int status = 0; 2342 int idx; 2343 int num_nodes; 2344 int num_vports; 2345 int num_chans; 2346 int listsz; 2347 mde_cookie_t vd_port; 2348 mde_cookie_t *chanp = NULL; 2349 mde_cookie_t *portp = NULL; 2350 vdc_server_t *srvr; 2351 vdc_server_t *prev_srvr = NULL; 2352 2353 /* 2354 * We now walk the MD nodes to find the port nodes for this vdisk. 
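	 *
	 * (Editorial note: for each port node found below we read its
	 * VDC_MD_ID property, an optional VDC_MD_TIMEOUT, and the id of
	 * the first channel-endpoint node under it; one vdc_server_t is
	 * allocated per usable port and linked onto vdc->server_list.)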
2355 	 */
2356 	num_nodes = md_node_count(mdp);
2357 	ASSERT(num_nodes > 0);
2358 
2359 	listsz = num_nodes * sizeof (mde_cookie_t);
2360 
2361 	/* allocate memory for nodes */
2362 	portp = kmem_zalloc(listsz, KM_SLEEP);
2363 	chanp = kmem_zalloc(listsz, KM_SLEEP);
2364 
2365 	num_vports = md_scan_dag(mdp, vd_nodep,
2366 	    md_find_name(mdp, VDC_MD_PORT_NAME),
2367 	    md_find_name(mdp, "fwd"), portp);
2368 	if (num_vports == 0) {
2369 		DMSGX(0, "Found no '%s' node for '%s' port\n",
2370 		    VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME);
2371 		status = ENOENT;
2372 		goto done;
2373 	}
2374 
2375 	DMSGX(1, "Found %d '%s' node(s) for '%s' port\n",
2376 	    num_vports, VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME);
2377 
2378 	vdc->num_servers = 0;
2379 	for (idx = 0; idx < num_vports; idx++) {
2380 
2381 		/* initialize this port */
2382 		vd_port = portp[idx];
2383 		srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP);
2384 		srvr->vdcp = vdc;
2385 
2386 		/* get port id */
2387 		if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) {
2388 			cmn_err(CE_NOTE, "vDisk port '%s' property not found",
2389 			    VDC_MD_ID);
2390 			kmem_free(srvr, sizeof (vdc_server_t));
2391 			continue;
2392 		}
2393 
2394 		/* set the connection timeout */
2395 		if (md_get_prop_val(mdp, vd_port, VDC_MD_TIMEOUT,
2396 		    &srvr->ctimeout) != 0) {
2397 			srvr->ctimeout = 0;
2398 		}
2399 
2400 		/* get the ldc id */
2401 		num_chans = md_scan_dag(mdp, vd_port,
2402 		    md_find_name(mdp, VDC_MD_CHAN_NAME),
2403 		    md_find_name(mdp, "fwd"), chanp);
2404 
2405 		/* expecting at least one channel */
2406 		if (num_chans <= 0) {
2407 			cmn_err(CE_NOTE, "No '%s' node for '%s' port",
2408 			    VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME);
2409 			kmem_free(srvr, sizeof (vdc_server_t));
2410 			continue;
2411 		} else if (num_chans != 1) {
2412 			DMSGX(0, "Expected 1 '%s' node for '%s' port, "
2413 			    "found %d\n", VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME,
2414 			    num_chans);
2415 		}
2416 
2417 		/*
2418 		 * We use the first channel found (index 0), irrespective of
2419 		 * how many there are in total.
2420 		 */
2421 		if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID,
2422 		    &srvr->ldc_id) != 0) {
2423 			cmn_err(CE_NOTE, "Channel '%s' property not found",
2424 			    VDC_MD_ID);
2425 			kmem_free(srvr, sizeof (vdc_server_t));
2426 			continue;
2427 		}
2428 
2429 		/*
2430 		 * now initialise the LDC channel which will be used to
2431 		 * communicate with this server
2432 		 */
2433 		if (vdc_do_ldc_init(vdc, srvr) != 0) {
2434 			kmem_free(srvr, sizeof (vdc_server_t));
2435 			continue;
2436 		}
2437 
2438 		/*
2439 		 * Add the server to the tail of the list; prev_srvr must be
2440 		 * advanced on every iteration so that later servers are
2441 		 * linked after this one rather than overwriting its slot.
2442 		 */
2443 		if (prev_srvr != NULL)
2444 			prev_srvr->next = srvr;
2445 		else
2446 			vdc->server_list = srvr;
2447 		prev_srvr = srvr;
2448 
2449 		/* increment the number of servers */
2450 		vdc->num_servers++;
2451 	}
2452 
2453 	/*
2454 	 * Adjust the max number of handshake retries to match
2455 	 * the number of vdisk servers.
2456 	 */
2457 	if (vdc_hshake_retries < vdc->num_servers)
2458 		vdc_hshake_retries = vdc->num_servers;
2459 
2460 	/* pick first server as current server */
2461 	if (vdc->server_list != NULL) {
2462 		vdc->curr_server = vdc->server_list;
2463 		status = 0;
2464 	} else {
2465 		status = ENOENT;
2466 	}
2467 
2468 done:
2469 	kmem_free(chanp, listsz);
2470 	kmem_free(portp, listsz);
2471 	return (status);
2472 }
2473 
2474 
2475 /*
2476  * Function:
2477  *	vdc_do_ldc_up
2478  *
2479  * Description:
2480  *	Bring the channel for the current server up.
2481  *
2482  * Arguments:
 *	vdc - soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0 - Success.
2484 * EINVAL - Driver is detaching / LDC error 2485 * ECONNREFUSED - Other end is not listening 2486 */ 2487 static int 2488 vdc_do_ldc_up(vdc_t *vdc) 2489 { 2490 int status; 2491 ldc_status_t ldc_state; 2492 2493 ASSERT(MUTEX_HELD(&vdc->lock)); 2494 2495 DMSG(vdc, 0, "[%d] Bringing up channel %lx\n", 2496 vdc->instance, vdc->curr_server->ldc_id); 2497 2498 if (vdc->lifecycle == VDC_LC_DETACHING) 2499 return (EINVAL); 2500 2501 if ((status = ldc_up(vdc->curr_server->ldc_handle)) != 0) { 2502 switch (status) { 2503 case ECONNREFUSED: /* listener not ready at other end */ 2504 DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n", 2505 vdc->instance, vdc->curr_server->ldc_id, status); 2506 status = 0; 2507 break; 2508 default: 2509 DMSG(vdc, 0, "[%d] Failed to bring up LDC: " 2510 "channel=%ld, err=%d", vdc->instance, 2511 vdc->curr_server->ldc_id, status); 2512 break; 2513 } 2514 } 2515 2516 if (ldc_status(vdc->curr_server->ldc_handle, &ldc_state) == 0) { 2517 vdc->curr_server->ldc_state = ldc_state; 2518 if (ldc_state == LDC_UP) { 2519 DMSG(vdc, 0, "[%d] LDC channel already up\n", 2520 vdc->instance); 2521 vdc->seq_num = 1; 2522 vdc->seq_num_reply = 0; 2523 } 2524 } 2525 2526 return (status); 2527 } 2528 2529 /* 2530 * Function: 2531 * vdc_terminate_ldc() 2532 * 2533 * Description: 2534 * 2535 * Arguments: 2536 * vdc - soft state pointer for this instance of the device driver. 2537 * srvr - vdc per-server info structure 2538 * 2539 * Return Code: 2540 * None 2541 */ 2542 static void 2543 vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr) 2544 { 2545 int instance = ddi_get_instance(vdc->dip); 2546 2547 if (srvr->state & VDC_LDC_OPEN) { 2548 DMSG(vdc, 0, "[%d] ldc_close()\n", instance); 2549 (void) ldc_close(srvr->ldc_handle); 2550 } 2551 if (srvr->state & VDC_LDC_CB) { 2552 DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); 2553 (void) ldc_unreg_callback(srvr->ldc_handle); 2554 } 2555 if (srvr->state & VDC_LDC_INIT) { 2556 DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); 2557 (void) ldc_fini(srvr->ldc_handle); 2558 srvr->ldc_handle = NULL; 2559 } 2560 2561 srvr->state &= ~(VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN); 2562 } 2563 2564 /* 2565 * Function: 2566 * vdc_fini_ports() 2567 * 2568 * Description: 2569 * Finalize all ports by closing the channel associated with each 2570 * port and also freeing the server structure. 2571 * 2572 * Arguments: 2573 * vdc - soft state pointer for this instance of the device driver. 2574 * 2575 * Return Code: 2576 * None 2577 */ 2578 static void 2579 vdc_fini_ports(vdc_t *vdc) 2580 { 2581 int instance = ddi_get_instance(vdc->dip); 2582 vdc_server_t *srvr, *prev_srvr; 2583 2584 ASSERT(vdc != NULL); 2585 ASSERT(mutex_owned(&vdc->lock)); 2586 2587 DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized); 2588 2589 srvr = vdc->server_list; 2590 2591 while (srvr) { 2592 2593 vdc_terminate_ldc(vdc, srvr); 2594 2595 /* next server */ 2596 prev_srvr = srvr; 2597 srvr = srvr->next; 2598 2599 /* free server */ 2600 kmem_free(prev_srvr, sizeof (vdc_server_t)); 2601 } 2602 2603 vdc->server_list = NULL; 2604 } 2605 2606 /* -------------------------------------------------------------------------- */ 2607 2608 /* 2609 * Descriptor Ring helper routines 2610 */ 2611 2612 /* 2613 * Function: 2614 * vdc_init_descriptor_ring() 2615 * 2616 * Description: 2617 * 2618 * Arguments: 2619 * vdc - soft state pointer for this instance of the device driver. 
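 *
 *	Sizing note (a worked example with illustrative values, not
 *	normative): with an 8K page size and a 1M maxphys, the minimum
 *	ring geometry gives dring_max_cookies = 1M / 8K = 128, so each
 *	descriptor has room for 127 LDC cookies beyond the one embedded
 *	in vd_dring_entry_t.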
2620 * 2621 * Return Code: 2622 * 0 - Success 2623 */ 2624 static int 2625 vdc_init_descriptor_ring(vdc_t *vdc) 2626 { 2627 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2628 int status = 0; 2629 int i; 2630 2631 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2632 2633 ASSERT(vdc != NULL); 2634 ASSERT(mutex_owned(&vdc->lock)); 2635 2636 /* ensure we have enough room to store max sized block */ 2637 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2638 2639 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2640 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2641 /* 2642 * Calculate the maximum block size we can transmit using one 2643 * Descriptor Ring entry from the attributes returned by the 2644 * vDisk server. This is subject to a minimum of 'maxphys' 2645 * as we do not have the capability to split requests over 2646 * multiple DRing entries. 2647 */ 2648 if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) { 2649 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2650 vdc->instance); 2651 vdc->dring_max_cookies = maxphys / PAGESIZE; 2652 } else { 2653 vdc->dring_max_cookies = 2654 (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE; 2655 } 2656 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2657 (sizeof (ldc_mem_cookie_t) * 2658 (vdc->dring_max_cookies - 1))); 2659 vdc->dring_len = VD_DRING_LEN; 2660 2661 status = ldc_mem_dring_create(vdc->dring_len, 2662 vdc->dring_entry_size, &vdc->dring_hdl); 2663 if ((vdc->dring_hdl == NULL) || (status != 0)) { 2664 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2665 vdc->instance); 2666 return (status); 2667 } 2668 vdc->initialized |= VDC_DRING_INIT; 2669 } 2670 2671 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2672 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2673 vdc->dring_cookie = 2674 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2675 2676 status = ldc_mem_dring_bind(vdc->curr_server->ldc_handle, 2677 vdc->dring_hdl, 2678 LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2679 &vdc->dring_cookie[0], 2680 &vdc->dring_cookie_count); 2681 if (status != 0) { 2682 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2683 "(%lx) to channel (%lx) status=%d\n", 2684 vdc->instance, vdc->dring_hdl, 2685 vdc->curr_server->ldc_handle, status); 2686 return (status); 2687 } 2688 ASSERT(vdc->dring_cookie_count == 1); 2689 vdc->initialized |= VDC_DRING_BOUND; 2690 } 2691 2692 status = ldc_mem_dring_info(vdc->dring_hdl, &vdc->dring_mem_info); 2693 if (status != 0) { 2694 DMSG(vdc, 0, 2695 "[%d] Failed to get info for descriptor ring (%lx)\n", 2696 vdc->instance, vdc->dring_hdl); 2697 return (status); 2698 } 2699 2700 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2701 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2702 2703 /* Allocate the local copy of this dring */ 2704 vdc->local_dring = 2705 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 2706 KM_SLEEP); 2707 vdc->initialized |= VDC_DRING_LOCAL; 2708 } 2709 2710 /* 2711 * Mark all DRing entries as free and initialize the private 2712 * descriptor's memory handles. If any entry is initialized, 2713 * we need to free it later so we set the bit in 'initialized' 2714 * at the start. 
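	 *
	 * (Editorial note: each entry also gets a private LDC memory
	 * handle here via ldc_mem_alloc_handle(); per-request buffers
	 * are bound to that handle later, in vdc_populate_mem_hdl().)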
2715 */ 2716 vdc->initialized |= VDC_DRING_ENTRY; 2717 for (i = 0; i < vdc->dring_len; i++) { 2718 dep = VDC_GET_DRING_ENTRY_PTR(vdc, i); 2719 dep->hdr.dstate = VIO_DESC_FREE; 2720 2721 status = ldc_mem_alloc_handle(vdc->curr_server->ldc_handle, 2722 &vdc->local_dring[i].desc_mhdl); 2723 if (status != 0) { 2724 DMSG(vdc, 0, "![%d] Failed to alloc mem handle for" 2725 " descriptor %d", vdc->instance, i); 2726 return (status); 2727 } 2728 vdc->local_dring[i].is_free = B_TRUE; 2729 vdc->local_dring[i].dep = dep; 2730 } 2731 2732 /* Initialize the starting index */ 2733 vdc->dring_curr_idx = 0; 2734 2735 return (status); 2736 } 2737 2738 /* 2739 * Function: 2740 * vdc_destroy_descriptor_ring() 2741 * 2742 * Description: 2743 * 2744 * Arguments: 2745 * vdc - soft state pointer for this instance of the device driver. 2746 * 2747 * Return Code: 2748 * None 2749 */ 2750 static void 2751 vdc_destroy_descriptor_ring(vdc_t *vdc) 2752 { 2753 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 2754 ldc_mem_handle_t mhdl = NULL; 2755 ldc_mem_info_t minfo; 2756 int status = -1; 2757 int i; /* loop */ 2758 2759 ASSERT(vdc != NULL); 2760 ASSERT(mutex_owned(&vdc->lock)); 2761 2762 DMSG(vdc, 0, "[%d] Entered\n", vdc->instance); 2763 2764 if (vdc->initialized & VDC_DRING_ENTRY) { 2765 DMSG(vdc, 0, 2766 "[%d] Removing Local DRing entries\n", vdc->instance); 2767 for (i = 0; i < vdc->dring_len; i++) { 2768 ldep = &vdc->local_dring[i]; 2769 mhdl = ldep->desc_mhdl; 2770 2771 if (mhdl == NULL) 2772 continue; 2773 2774 if ((status = ldc_mem_info(mhdl, &minfo)) != 0) { 2775 DMSG(vdc, 0, 2776 "ldc_mem_info returned an error: %d\n", 2777 status); 2778 2779 /* 2780 * This must mean that the mem handle 2781 * is not valid. Clear it out so that 2782 * no one tries to use it. 2783 */ 2784 ldep->desc_mhdl = NULL; 2785 continue; 2786 } 2787 2788 if (minfo.status == LDC_BOUND) { 2789 (void) ldc_mem_unbind_handle(mhdl); 2790 } 2791 2792 (void) ldc_mem_free_handle(mhdl); 2793 2794 ldep->desc_mhdl = NULL; 2795 } 2796 vdc->initialized &= ~VDC_DRING_ENTRY; 2797 } 2798 2799 if (vdc->initialized & VDC_DRING_LOCAL) { 2800 DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance); 2801 kmem_free(vdc->local_dring, 2802 vdc->dring_len * sizeof (vdc_local_desc_t)); 2803 vdc->initialized &= ~VDC_DRING_LOCAL; 2804 } 2805 2806 if (vdc->initialized & VDC_DRING_BOUND) { 2807 DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance); 2808 status = ldc_mem_dring_unbind(vdc->dring_hdl); 2809 if (status == 0) { 2810 vdc->initialized &= ~VDC_DRING_BOUND; 2811 } else { 2812 DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx", 2813 vdc->instance, status, vdc->dring_hdl); 2814 } 2815 kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t)); 2816 } 2817 2818 if (vdc->initialized & VDC_DRING_INIT) { 2819 DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance); 2820 status = ldc_mem_dring_destroy(vdc->dring_hdl); 2821 if (status == 0) { 2822 vdc->dring_hdl = NULL; 2823 bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t)); 2824 vdc->initialized &= ~VDC_DRING_INIT; 2825 } else { 2826 DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)", 2827 vdc->instance, status, vdc->dring_hdl); 2828 } 2829 } 2830 } 2831 2832 /* 2833 * Function: 2834 * vdc_map_to_shared_dring() 2835 * 2836 * Description: 2837 * Copy contents of the local descriptor to the shared 2838 * memory descriptor. 2839 * 2840 * Arguments: 2841 * vdcp - soft state pointer for this instance of the device driver. 
2842 * idx - descriptor ring index 2843 * 2844 * Return Code: 2845 * None 2846 */ 2847 static int 2848 vdc_map_to_shared_dring(vdc_t *vdcp, int idx) 2849 { 2850 vdc_local_desc_t *ldep; 2851 vd_dring_entry_t *dep; 2852 int rv; 2853 2854 ldep = &(vdcp->local_dring[idx]); 2855 2856 /* for now leave in the old pop_mem_hdl stuff */ 2857 if (ldep->nbytes > 0) { 2858 rv = vdc_populate_mem_hdl(vdcp, ldep); 2859 if (rv) { 2860 DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n", 2861 vdcp->instance); 2862 return (rv); 2863 } 2864 } 2865 2866 /* 2867 * fill in the data details into the DRing 2868 */ 2869 dep = ldep->dep; 2870 ASSERT(dep != NULL); 2871 2872 dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp); 2873 dep->payload.operation = ldep->operation; 2874 dep->payload.addr = ldep->offset; 2875 dep->payload.nbytes = ldep->nbytes; 2876 dep->payload.status = (uint32_t)-1; /* vds will set valid value */ 2877 dep->payload.slice = ldep->slice; 2878 dep->hdr.dstate = VIO_DESC_READY; 2879 dep->hdr.ack = 1; /* request an ACK for every message */ 2880 2881 return (0); 2882 } 2883 2884 /* 2885 * Function: 2886 * vdc_send_request 2887 * 2888 * Description: 2889 * This routine writes the data to be transmitted to vds into the 2890 * descriptor, notifies vds that the ring has been updated and 2891 * then waits for the request to be processed. 2892 * 2893 * Arguments: 2894 * vdcp - the soft state pointer 2895 * operation - operation we want vds to perform (VD_OP_XXX) 2896 * addr - address of data buf to be read/written. 2897 * nbytes - number of bytes to read/write 2898 * slice - the disk slice this request is for 2899 * offset - relative disk offset 2900 * cb_type - type of call - STRATEGY or SYNC 2901 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2902 * . mode for ioctl(9e) 2903 * . LP64 diskaddr_t (block I/O) 2904 * dir - direction of operation (READ/WRITE/BOTH) 2905 * 2906 * Return Codes: 2907 * 0 2908 * ENXIO 2909 */ 2910 static int 2911 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr, 2912 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2913 void *cb_arg, vio_desc_direction_t dir) 2914 { 2915 int rv = 0; 2916 2917 ASSERT(vdcp != NULL); 2918 ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR); 2919 2920 mutex_enter(&vdcp->lock); 2921 2922 /* 2923 * If this is a block read/write operation we update the I/O statistics 2924 * to indicate that the request is being put on the waitq to be 2925 * serviced. 2926 * 2927 * We do it here (a common routine for both synchronous and strategy 2928 * calls) for performance reasons - we are already holding vdc->lock 2929 * so there is no extra locking overhead. We would have to explicitly 2930 * grab the 'lock' mutex to update the stats if we were to do this 2931 * higher up the stack in vdc_strategy() et. al. 2932 */ 2933 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2934 DTRACE_IO1(start, buf_t *, cb_arg); 2935 VD_KSTAT_WAITQ_ENTER(vdcp); 2936 } 2937 2938 do { 2939 while (vdcp->state != VDC_STATE_RUNNING) { 2940 2941 /* return error if detaching */ 2942 if (vdcp->state == VDC_STATE_DETACH) { 2943 rv = ENXIO; 2944 goto done; 2945 } 2946 2947 /* fail request if connection timeout is reached */ 2948 if (vdcp->ctimeout_reached) { 2949 rv = EIO; 2950 goto done; 2951 } 2952 2953 /* 2954 * If we are panicking and the disk is not ready then 2955 * we can't send any request because we can't complete 2956 * the handshake now. 
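		 *
		 * (In the panic path, responses for requests that do get
		 * sent are reaped by polling in vdc_drain_response()
		 * rather than via the LDC callback, since interrupts are
		 * disabled.)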
2957 */ 2958 if (ddi_in_panic()) { 2959 rv = EIO; 2960 goto done; 2961 } 2962 2963 cv_wait(&vdcp->running_cv, &vdcp->lock); 2964 } 2965 2966 } while (vdc_populate_descriptor(vdcp, operation, addr, 2967 nbytes, slice, offset, cb_type, cb_arg, dir)); 2968 2969 done: 2970 /* 2971 * If this is a block read/write we update the I/O statistics kstat 2972 * to indicate that this request has been placed on the queue for 2973 * processing (i.e sent to the vDisk server) - iostat(1M) will 2974 * report the time waiting for the vDisk server under the %b column 2975 * In the case of an error we simply take it off the wait queue. 2976 */ 2977 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2978 if (rv == 0) { 2979 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 2980 DTRACE_PROBE1(send, buf_t *, cb_arg); 2981 } else { 2982 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 2983 VD_KSTAT_WAITQ_EXIT(vdcp); 2984 DTRACE_IO1(done, buf_t *, cb_arg); 2985 } 2986 } 2987 2988 mutex_exit(&vdcp->lock); 2989 2990 return (rv); 2991 } 2992 2993 2994 /* 2995 * Function: 2996 * vdc_populate_descriptor 2997 * 2998 * Description: 2999 * This routine writes the data to be transmitted to vds into the 3000 * descriptor, notifies vds that the ring has been updated and 3001 * then waits for the request to be processed. 3002 * 3003 * Arguments: 3004 * vdcp - the soft state pointer 3005 * operation - operation we want vds to perform (VD_OP_XXX) 3006 * addr - address of data buf to be read/written. 3007 * nbytes - number of bytes to read/write 3008 * slice - the disk slice this request is for 3009 * offset - relative disk offset 3010 * cb_type - type of call - STRATEGY or SYNC 3011 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3012 * . mode for ioctl(9e) 3013 * . LP64 diskaddr_t (block I/O) 3014 * dir - direction of operation (READ/WRITE/BOTH) 3015 * 3016 * Return Codes: 3017 * 0 3018 * EAGAIN 3019 * ECONNRESET 3020 * ENXIO 3021 */ 3022 static int 3023 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 3024 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 3025 void *cb_arg, vio_desc_direction_t dir) 3026 { 3027 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 3028 int idx; /* Index of DRing entry used */ 3029 int next_idx; 3030 vio_dring_msg_t dmsg; 3031 size_t msglen; 3032 int rv; 3033 3034 ASSERT(MUTEX_HELD(&vdcp->lock)); 3035 vdcp->threads_pending++; 3036 loop: 3037 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 3038 3039 /* Get next available D-Ring entry */ 3040 idx = vdcp->dring_curr_idx; 3041 local_dep = &(vdcp->local_dring[idx]); 3042 3043 if (!local_dep->is_free) { 3044 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 3045 vdcp->instance); 3046 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 3047 if (vdcp->state == VDC_STATE_RUNNING || 3048 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3049 goto loop; 3050 } 3051 vdcp->threads_pending--; 3052 return (ECONNRESET); 3053 } 3054 3055 next_idx = idx + 1; 3056 if (next_idx >= vdcp->dring_len) 3057 next_idx = 0; 3058 vdcp->dring_curr_idx = next_idx; 3059 3060 ASSERT(local_dep->is_free); 3061 3062 local_dep->operation = operation; 3063 local_dep->addr = addr; 3064 local_dep->nbytes = nbytes; 3065 local_dep->slice = slice; 3066 local_dep->offset = offset; 3067 local_dep->cb_type = cb_type; 3068 local_dep->cb_arg = cb_arg; 3069 local_dep->dir = dir; 3070 3071 local_dep->is_free = B_FALSE; 3072 3073 rv = vdc_map_to_shared_dring(vdcp, idx); 3074 if (rv) { 3075 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 
3076 vdcp->instance); 3077 /* free the descriptor */ 3078 local_dep->is_free = B_TRUE; 3079 vdcp->dring_curr_idx = idx; 3080 cv_wait(&vdcp->membind_cv, &vdcp->lock); 3081 if (vdcp->state == VDC_STATE_RUNNING || 3082 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3083 goto loop; 3084 } 3085 vdcp->threads_pending--; 3086 return (ECONNRESET); 3087 } 3088 3089 /* 3090 * Send a msg with the DRing details to vds 3091 */ 3092 VIO_INIT_DRING_DATA_TAG(dmsg); 3093 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 3094 dmsg.dring_ident = vdcp->dring_ident; 3095 dmsg.start_idx = idx; 3096 dmsg.end_idx = idx; 3097 vdcp->seq_num++; 3098 3099 DTRACE_PROBE2(populate, int, vdcp->instance, 3100 vdc_local_desc_t *, local_dep); 3101 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 3102 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 3103 3104 /* 3105 * note we're still holding the lock here to 3106 * make sure the message goes out in order !!!... 3107 */ 3108 msglen = sizeof (dmsg); 3109 rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen); 3110 switch (rv) { 3111 case ECONNRESET: 3112 /* 3113 * vdc_send initiates the reset on failure. 3114 * Since the transaction has already been put 3115 * on the local dring, it will automatically get 3116 * retried when the channel is reset. Given that, 3117 * it is ok to just return success even though the 3118 * send failed. 3119 */ 3120 rv = 0; 3121 break; 3122 3123 case 0: /* EOK */ 3124 DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv); 3125 break; 3126 3127 default: 3128 goto cleanup_and_exit; 3129 } 3130 3131 vdcp->threads_pending--; 3132 return (rv); 3133 3134 cleanup_and_exit: 3135 DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv); 3136 return (ENXIO); 3137 } 3138 3139 /* 3140 * Function: 3141 * vdc_do_sync_op 3142 * 3143 * Description: 3144 * Wrapper around vdc_populate_descriptor that blocks until the 3145 * response to the message is available. 3146 * 3147 * Arguments: 3148 * vdcp - the soft state pointer 3149 * operation - operation we want vds to perform (VD_OP_XXX) 3150 * addr - address of data buf to be read/written. 3151 * nbytes - number of bytes to read/write 3152 * slice - the disk slice this request is for 3153 * offset - relative disk offset 3154 * cb_type - type of call - STRATEGY or SYNC 3155 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3156 * . mode for ioctl(9e) 3157 * . LP64 diskaddr_t (block I/O) 3158 * dir - direction of operation (READ/WRITE/BOTH) 3159 * rconflict - check for reservation conflict in case of failure 3160 * 3161 * rconflict should be set to B_TRUE by most callers. Callers invoking the 3162 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 3163 * result of a successful operation with vd_scsi_status(). 3164 * 3165 * Return Codes: 3166 * 0 3167 * EAGAIN 3168 * EFAULT 3169 * ENXIO 3170 * EIO 3171 */ 3172 static int 3173 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 3174 int slice, diskaddr_t offset, int cb_type, void *cb_arg, 3175 vio_desc_direction_t dir, boolean_t rconflict) 3176 { 3177 int status; 3178 vdc_io_t *vio; 3179 boolean_t check_resv_conflict = B_FALSE; 3180 3181 ASSERT(cb_type == CB_SYNC); 3182 3183 /* 3184 * Grab the lock, if blocked wait until the server 3185 * response causes us to wake up again. 
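	 *
	 * The serialization protocol, in outline (editorial sketch):
	 *
	 *	sync_op_cnt++;			count ourselves as a waiter
	 *	while (sync_op_blocked)		only one sync op at a time
	 *		cv_wait(sync_blocked_cv);
	 *	sync_op_blocked = B_TRUE;	block threads arriving later
	 *	... submit, then wait for sync_op_pending to clear ...
	 *	sync_op_blocked = B_FALSE;
	 *	cv_signal(sync_blocked_cv);	let the next waiter go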
3186 	 */
3187 	mutex_enter(&vdcp->lock);
3188 	vdcp->sync_op_cnt++;
3189 	while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH)
3190 		cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock);
3191 
3192 	if (vdcp->state == VDC_STATE_DETACH) {
3193 		cv_broadcast(&vdcp->sync_blocked_cv);
3194 		vdcp->sync_op_cnt--;
3195 		mutex_exit(&vdcp->lock);
3196 		return (ENXIO);
3197 	}
3198 
3199 	/* now block any other thread entering after us */
3200 	vdcp->sync_op_blocked = B_TRUE;
3201 	vdcp->sync_op_pending = B_TRUE;
3202 	mutex_exit(&vdcp->lock);
3203 
3204 	status = vdc_send_request(vdcp, operation, addr,
3205 	    nbytes, slice, offset, cb_type, cb_arg, dir);
3206 
3207 	mutex_enter(&vdcp->lock);
3208 
3209 	if (status != 0) {
3210 		vdcp->sync_op_pending = B_FALSE;
3211 	} else {
3212 		/*
3213 		 * Block until our transaction completes.
3214 		 * Anyone else waiting then gets to go next.
3215 		 */
3216 		while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH)
3217 			cv_wait(&vdcp->sync_pending_cv, &vdcp->lock);
3218 
3219 		DMSG(vdcp, 2, ": operation returned %d\n",
3220 		    vdcp->sync_op_status);
3221 		if (vdcp->state == VDC_STATE_DETACH) {
3222 			vdcp->sync_op_pending = B_FALSE;
3223 			status = ENXIO;
3224 		} else {
3225 			status = vdcp->sync_op_status;
3226 			if (status != 0 && vdcp->failfast_interval != 0) {
3227 				/*
3228 				 * Operation has failed and failfast is enabled.
3229 				 * We need to check if the failure is due to a
3230 				 * reservation conflict if this was requested.
3231 				 */
3232 				check_resv_conflict = rconflict;
3233 			}
3234 
3235 		}
3236 	}
3237 
3238 	vdcp->sync_op_status = 0;
3239 	vdcp->sync_op_blocked = B_FALSE;
3240 	vdcp->sync_op_cnt--;
3241 
3242 	/* signal the next waiting thread */
3243 	cv_signal(&vdcp->sync_blocked_cv);
3244 
3245 	/*
3246 	 * We have to check for reservation conflict after unblocking sync
3247 	 * operations because some sync operations will be used to do this
3248 	 * check.
3249 	 */
3250 	if (check_resv_conflict) {
3251 		vio = vdc_failfast_io_queue(vdcp, NULL);
3252 		while (vio->vio_qtime != 0)
3253 			cv_wait(&vdcp->failfast_io_cv, &vdcp->lock);
3254 		kmem_free(vio, sizeof (vdc_io_t));
3255 	}
3256 
3257 	mutex_exit(&vdcp->lock);
3258 
3259 	return (status);
3260 }
3261 
3262 
3263 /*
3264  * Function:
3265  *	vdc_drain_response()
3266  *
3267  * Description:
3268  *	When a guest is panicking, the completion of requests needs to be
3269  *	handled differently because interrupts are disabled and vdc
3270  *	will not get messages. We have to poll for the messages instead.
3271  *
3272  *	Note: since we don't have a buf_t available we cannot implement
3273  *	the io:::done DTrace probe in this specific case.
3274  *
3275  * Arguments:
3276  *	vdc	- soft state pointer for this instance of the device driver.
3277  *
3278  * Return Code:
3279  *	0	- Success
3280  */
3281 static int
3282 vdc_drain_response(vdc_t *vdc)
3283 {
3284 	int rv, idx, retries;
3285 	size_t	msglen;
3286 	vdc_local_desc_t *ldep = NULL;	/* Local Dring Entry Pointer */
3287 	vio_dring_msg_t dmsg;
3288 
3289 	mutex_enter(&vdc->lock);
3290 
3291 	retries = 0;
3292 	for (;;) {
3293 		msglen = sizeof (dmsg);
3294 		rv = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)&dmsg,
3295 		    &msglen);
3296 		if (rv) {
3297 			rv = EINVAL;
3298 			break;
3299 		}
3300 
3301 		/*
3302 		 * If there are no packets, wait and check again.
3303 		 */
3304 		if ((rv == 0) && (msglen == 0)) {
3305 			if (retries++ > vdc_dump_retries) {
3306 				rv = EAGAIN;
3307 				break;
3308 			}
3309 
3310 			drv_usecwait(vdc_usec_timeout_dump);
3311 			continue;
3312 		}
3313 
3314 		/*
3315 		 * Ignore all messages that are not ACKs/NACKs to
3316 		 * DRing requests.
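		 *
		 * (Control messages can safely be dropped here: the guest
		 * is panicking and single-threaded, and only VIO_DRING_DATA
		 * ACKs or NACKs can complete the descriptors being drained.)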
3317 */ 3318 if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) || 3319 (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) { 3320 DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n", 3321 dmsg.tag.vio_msgtype, 3322 dmsg.tag.vio_subtype, 3323 dmsg.tag.vio_subtype_env); 3324 continue; 3325 } 3326 3327 /* 3328 * set the appropriate return value for the current request. 3329 */ 3330 switch (dmsg.tag.vio_subtype) { 3331 case VIO_SUBTYPE_ACK: 3332 rv = 0; 3333 break; 3334 case VIO_SUBTYPE_NACK: 3335 rv = EAGAIN; 3336 break; 3337 default: 3338 continue; 3339 } 3340 3341 idx = dmsg.start_idx; 3342 if (idx >= vdc->dring_len) { 3343 DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n", 3344 vdc->instance, idx); 3345 continue; 3346 } 3347 ldep = &vdc->local_dring[idx]; 3348 if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { 3349 DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n", 3350 vdc->instance, idx, ldep->dep->hdr.dstate); 3351 continue; 3352 } 3353 3354 DMSG(vdc, 1, "[%d] Depopulating idx=%d state=%d\n", 3355 vdc->instance, idx, ldep->dep->hdr.dstate); 3356 3357 rv = vdc_depopulate_descriptor(vdc, idx); 3358 if (rv) { 3359 DMSG(vdc, 0, 3360 "[%d] Entry @ %d - depopulate failed ..\n", 3361 vdc->instance, idx); 3362 } 3363 3364 /* if this is the last descriptor - break out of loop */ 3365 if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) 3366 break; 3367 } 3368 3369 mutex_exit(&vdc->lock); 3370 DMSG(vdc, 0, "End idx=%d\n", idx); 3371 3372 return (rv); 3373 } 3374 3375 3376 /* 3377 * Function: 3378 * vdc_depopulate_descriptor() 3379 * 3380 * Description: 3381 * 3382 * Arguments: 3383 * vdc - soft state pointer for this instance of the device driver. 3384 * idx - Index of the Descriptor Ring entry being modified 3385 * 3386 * Return Code: 3387 * 0 - Success 3388 */ 3389 static int 3390 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) 3391 { 3392 vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ 3393 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3394 int status = ENXIO; 3395 int rv = 0; 3396 3397 ASSERT(vdc != NULL); 3398 ASSERT(idx < vdc->dring_len); 3399 ldep = &vdc->local_dring[idx]; 3400 ASSERT(ldep != NULL); 3401 ASSERT(MUTEX_HELD(&vdc->lock)); 3402 3403 DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep); 3404 DMSG(vdc, 2, ": idx = %d\n", idx); 3405 3406 dep = ldep->dep; 3407 ASSERT(dep != NULL); 3408 ASSERT((dep->hdr.dstate == VIO_DESC_DONE) || 3409 (dep->payload.status == ECANCELED)); 3410 3411 VDC_MARK_DRING_ENTRY_FREE(vdc, idx); 3412 3413 ldep->is_free = B_TRUE; 3414 status = dep->payload.status; 3415 DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status); 3416 3417 /* 3418 * If no buffers were used to transfer information to the server when 3419 * populating the descriptor then no memory handles need to be unbound 3420 * and we can return now. 3421 */ 3422 if (ldep->nbytes == 0) { 3423 cv_signal(&vdc->dring_free_cv); 3424 return (status); 3425 } 3426 3427 /* 3428 * If the upper layer passed in a misaligned address we copied the 3429 * data into an aligned buffer before sending it to LDC - we now 3430 * copy it back to the original buffer. 
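	 *
	 * (The aligned bounce buffer is set up in vdc_populate_mem_hdl()
	 * whenever the caller's buffer is not 8-byte aligned; it is freed
	 * below once any copy-back has completed.)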
3431 	 */
3432 	if (ldep->align_addr) {
3433 		ASSERT(ldep->addr != NULL);
3434 
3435 		if (dep->payload.nbytes > 0)
3436 			bcopy(ldep->align_addr, ldep->addr,
3437 			    dep->payload.nbytes);
3438 		kmem_free(ldep->align_addr,
3439 		    sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8));
3440 		ldep->align_addr = NULL;
3441 	}
3442 
3443 	rv = ldc_mem_unbind_handle(ldep->desc_mhdl);
3444 	if (rv != 0) {
3445 		DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)",
3446 		    vdc->instance, ldep->desc_mhdl, idx, rv);
3447 		/*
3448 		 * The error returned by the vDisk server is more informative
3449 		 * and thus has a higher priority but if it isn't set we ensure
3450 		 * that this function returns an error.
3451 		 */
3452 		if (status == 0)
3453 			status = EINVAL;
3454 	}
3455 
3456 	cv_signal(&vdc->membind_cv);
3457 	cv_signal(&vdc->dring_free_cv);
3458 
3459 	return (status);
3460 }
3461 
3462 /*
3463  * Function:
3464  *	vdc_populate_mem_hdl()
3465  *
3466  * Description:
3467  *	Bind the data buffer of a request to the LDC memory handle of its
 *	descriptor ring entry, bouncing misaligned buffers through an
 *	8-byte aligned copy first.
3468  *
3469  * Arguments:
3470  *	vdcp	- soft state pointer for this instance of the device driver.
3471  *	ldep	- local descriptor ring entry for this request; it carries
3472  *		  the buffer address, byte count, transfer direction
3473  *		  (VIO_read_dir/VIO_write_dir/VIO_both_dir) and the LDC
 *		  memory handle to bind.
3474  *
3475  * Return Code:
3476  *	0	- Success
3477  */
3478 static int
3479 vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep)
3480 {
3481 	vd_dring_entry_t	*dep = NULL;
3482 	ldc_mem_handle_t	mhdl;
3483 	caddr_t			vaddr;
3484 	size_t			nbytes;
3485 	uint8_t			perm = LDC_MEM_RW;
3486 	uint8_t			maptype;
3487 	int			rv = 0;
3488 	int			i;
3489 
3490 	ASSERT(vdcp != NULL);
3491 
3492 	dep = ldep->dep;
3493 	mhdl = ldep->desc_mhdl;
3494 
3495 	switch (ldep->dir) {
3496 	case VIO_read_dir:
3497 		perm = LDC_MEM_W;
3498 		break;
3499 
3500 	case VIO_write_dir:
3501 		perm = LDC_MEM_R;
3502 		break;
3503 
3504 	case VIO_both_dir:
3505 		perm = LDC_MEM_RW;
3506 		break;
3507 
3508 	default:
3509 		ASSERT(0);	/* catch bad programming in vdc */
3510 	}
3511 
3512 	/*
3513 	 * LDC expects any addresses passed in to be 8-byte aligned. We need
3514 	 * to copy the contents of any misaligned buffers to a newly allocated
3515 	 * buffer and bind it instead (and copy the contents back to the
3516 	 * original buffer passed in when depopulating the descriptor).
3517 	 */
3518 	vaddr = ldep->addr;
3519 	nbytes = ldep->nbytes;
3520 	if (((uint64_t)vaddr & 0x7) != 0) {
3521 		ASSERT(ldep->align_addr == NULL);
3522 		ldep->align_addr =
3523 		    kmem_alloc(sizeof (caddr_t) *
3524 		    P2ROUNDUP(nbytes, 8), KM_SLEEP);
3525 		DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating "
3526 		    "(buf=%p nb=%ld op=%d)\n",
3527 		    vdcp->instance, (void *)vaddr, (void *)ldep->align_addr,
3528 		    nbytes, ldep->operation);
3529 		if (perm != LDC_MEM_W)
3530 			bcopy(vaddr, ldep->align_addr, nbytes);
3531 		vaddr = ldep->align_addr;
3532 	}
3533 
3534 	maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP;
3535 	rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
3536 	    maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies);
3537 	DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n",
3538 	    vdcp->instance, dep->payload.ncookies);
3539 	if (rv != 0) {
3540 		DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle "
3541 		    "(mhdl=%p, buf=%p, err=%d)\n",
3542 		    vdcp->instance, (void *)mhdl, (void *)vaddr, rv);
3543 		if (ldep->align_addr) {
3544 			kmem_free(ldep->align_addr,
3545 			    sizeof (caddr_t) * P2ROUNDUP(nbytes, 8));
3546 			ldep->align_addr = NULL;
3547 		}
3548 		return (EAGAIN);
3549 	}
3550 
3551 	/*
3552 	 * Get the other cookies (if any).
3553 	 */
3554 	for (i = 1; i < dep->payload.ncookies; i++) {
3555 		rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]);
3556 		if (rv != 0) {
3557 			(void) ldc_mem_unbind_handle(mhdl);
3558 			DMSG(vdcp, 0, "?[%d] Failed to get next cookie "
3559 			    "(mhdl=%lx cnum=%d), err=%d",
3560 			    vdcp->instance, mhdl, i, rv);
3561 			if (ldep->align_addr) {
3562 				/* must free with the same size as the alloc */
3563 				kmem_free(ldep->align_addr,
3564 				    sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes,
3565 				    8));
3566 				ldep->align_addr = NULL;
3567 			}
3568 			return (EAGAIN);
3569 		}
3570 	}
3571 
3572 	return (rv);
3573 }
3574 
3575 /*
3576  * Interrupt handlers for messages from LDC
3577  */
3578 
3579 /*
3580  * Function:
3581  *	vdc_handle_cb()
3582  *
3583  * Description:
3584  *	LDC channel event callback. Handles UP (restart the handshake),
 *	READ (signal the reader thread) and RESET/DOWN (initiate connection
 *	reset handling) events for the current server's channel.
3585  *
3586  * Arguments:
3587  *	event	- Type of event (LDC_EVT_xxx) that triggered the callback
3588  *	arg	- soft state pointer for this instance of the device driver.
3589  *
3590  * Return Code:
3591  *	0	- Success
3592  */
3593 static uint_t
3594 vdc_handle_cb(uint64_t event, caddr_t arg)
3595 {
3596 	ldc_status_t ldc_state;
3597 	int rv = 0;
3598 	vdc_server_t *srvr = (vdc_server_t *)(void *)arg;
3599 	vdc_t	*vdc = srvr->vdcp;
3600 
3601 	ASSERT(vdc != NULL);
3602 
3603 	DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num);
3604 
3605 	/* If callback is not for the current server, ignore it */
3606 	mutex_enter(&vdc->lock);
3607 
3608 	if (vdc->curr_server != srvr) {
3609 		DMSG(vdc, 0, "[%d] Ignoring event 0x%lx for port@%ld\n",
3610 		    vdc->instance, event, srvr->id);
3611 		mutex_exit(&vdc->lock);
3612 		return (LDC_SUCCESS);
3613 	}
3614 
3615 	/*
3616 	 * Depending on the type of event that triggered this callback,
3617 	 * we modify the handshake state or read the data.
3618 	 *
3619 	 * NOTE: not done as a switch() as event could be triggered by
3620 	 * a state change and a read request. Also the ordering of the
3621 	 * check for the event types is deliberate.
3622 	 */
3623 	if (event & LDC_EVT_UP) {
3624 		DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance);
3625 
3626 		/* get LDC state */
3627 		rv = ldc_status(srvr->ldc_handle, &ldc_state);
3628 		if (rv != 0) {
3629 			DMSG(vdc, 0, "[%d] Couldn't get LDC status %d",
3630 			    vdc->instance, rv);
3631 			mutex_exit(&vdc->lock);
3632 			return (LDC_SUCCESS);
3633 		}
3634 		if (srvr->ldc_state != LDC_UP &&
3635 		    ldc_state == LDC_UP) {
3636 			/*
3637 			 * Reset the transaction sequence numbers when
3638 			 * LDC comes up. We then kick off the handshake
3639 			 * negotiation with the vDisk server.
3640 			 */
3641 			vdc->seq_num = 1;
3642 			vdc->seq_num_reply = 0;
3643 			srvr->ldc_state = ldc_state;
3644 			cv_signal(&vdc->initwait_cv);
3645 		}
3646 	}
3647 
3648 	if (event & LDC_EVT_READ) {
3649 		DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance);
3650 		mutex_enter(&vdc->read_lock);
3651 		cv_signal(&vdc->read_cv);
3652 		vdc->read_state = VDC_READ_PENDING;
3653 		mutex_exit(&vdc->read_lock);
3654 		mutex_exit(&vdc->lock);
3655 
3656 		/* that's all we have to do - no need to handle DOWN/RESET */
3657 		return (LDC_SUCCESS);
3658 	}
3659 
3660 	if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) {
3661 
3662 		DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance);
3663 
		/*
		 * Need to wake up any readers so they will
		 * detect that a reset has occurred.
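		 *
		 * (Readers block in vdc_recv() on read_cv; moving
		 * read_state to VDC_READ_RESET makes vdc_recv() return
		 * ECONNRESET to its caller.)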
3664 */ 3665 mutex_enter(&vdc->read_lock); 3666 if ((vdc->read_state == VDC_READ_WAITING) || 3667 (vdc->read_state == VDC_READ_RESET)) 3668 cv_signal(&vdc->read_cv); 3669 vdc->read_state = VDC_READ_RESET; 3670 mutex_exit(&vdc->read_lock); 3671 3672 /* wake up any threads waiting for connection to come up */ 3673 if (vdc->state == VDC_STATE_INIT_WAITING) { 3674 vdc->state = VDC_STATE_RESETTING; 3675 cv_signal(&vdc->initwait_cv); 3676 } 3677 3678 } 3679 3680 mutex_exit(&vdc->lock); 3681 3682 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) 3683 DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received", 3684 vdc->instance, event); 3685 3686 return (LDC_SUCCESS); 3687 } 3688 3689 /* 3690 * Function: 3691 * vdc_wait_for_response() 3692 * 3693 * Description: 3694 * Block waiting for a response from the server. If there is 3695 * no data the thread block on the read_cv that is signalled 3696 * by the callback when an EVT_READ occurs. 3697 * 3698 * Arguments: 3699 * vdcp - soft state pointer for this instance of the device driver. 3700 * 3701 * Return Code: 3702 * 0 - Success 3703 */ 3704 static int 3705 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp) 3706 { 3707 size_t nbytes = sizeof (*msgp); 3708 int status; 3709 3710 ASSERT(vdcp != NULL); 3711 3712 DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance); 3713 3714 status = vdc_recv(vdcp, msgp, &nbytes); 3715 DMSG(vdcp, 3, "vdc_read() done.. status=0x%x size=0x%x\n", 3716 status, (int)nbytes); 3717 if (status) { 3718 DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n", 3719 vdcp->instance, status); 3720 return (status); 3721 } 3722 3723 if (nbytes < sizeof (vio_msg_tag_t)) { 3724 DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n", 3725 vdcp->instance, sizeof (vio_msg_tag_t), nbytes); 3726 return (ENOMSG); 3727 } 3728 3729 DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance, 3730 msgp->tag.vio_msgtype, 3731 msgp->tag.vio_subtype, 3732 msgp->tag.vio_subtype_env); 3733 3734 /* 3735 * Verify the Session ID of the message 3736 * 3737 * Every message after the Version has been negotiated should 3738 * have the correct session ID set. 3739 */ 3740 if ((msgp->tag.vio_sid != vdcp->session_id) && 3741 (msgp->tag.vio_subtype_env != VIO_VER_INFO)) { 3742 DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, " 3743 "expected 0x%lx [seq num %lx @ %d]", 3744 vdcp->instance, msgp->tag.vio_sid, 3745 vdcp->session_id, 3746 ((vio_dring_msg_t *)msgp)->seq_num, 3747 ((vio_dring_msg_t *)msgp)->start_idx); 3748 return (ENOMSG); 3749 } 3750 return (0); 3751 } 3752 3753 3754 /* 3755 * Function: 3756 * vdc_resubmit_backup_dring() 3757 * 3758 * Description: 3759 * Resubmit each descriptor in the backed up dring to 3760 * vDisk server. The Dring was backed up during connection 3761 * reset. 3762 * 3763 * Arguments: 3764 * vdcp - soft state pointer for this instance of the device driver. 
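 *
 *	Note that recovery is deliberately synchronous: each backed-up
 *	descriptor is resubmitted, and its response awaited and processed,
 *	before the next one is replayed.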
3765 * 3766 * Return Code: 3767 * 0 - Success 3768 */ 3769 static int 3770 vdc_resubmit_backup_dring(vdc_t *vdcp) 3771 { 3772 int processed = 0; 3773 int count; 3774 int b_idx; 3775 int rv = 0; 3776 int dring_size; 3777 int op; 3778 vio_msg_t vio_msg; 3779 vdc_local_desc_t *curr_ldep; 3780 3781 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3782 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3783 3784 if (vdcp->local_dring_backup == NULL) { 3785 /* the pending requests have already been processed */ 3786 return (0); 3787 } 3788 3789 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3790 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3791 3792 /* 3793 * Walk the backup copy of the local descriptor ring and 3794 * resubmit all the outstanding transactions. 3795 */ 3796 b_idx = vdcp->local_dring_backup_tail; 3797 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3798 3799 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3800 3801 /* only resubmit outstanding transactions */ 3802 if (!curr_ldep->is_free) { 3803 /* 3804 * If we are retrying a block read/write operation we 3805 * need to update the I/O statistics to indicate that 3806 * the request is being put back on the waitq to be 3807 * serviced (it will have been taken off after the 3808 * error was reported). 3809 */ 3810 mutex_enter(&vdcp->lock); 3811 op = curr_ldep->operation; 3812 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3813 DTRACE_IO1(start, buf_t *, curr_ldep->cb_arg); 3814 VD_KSTAT_WAITQ_ENTER(vdcp); 3815 } 3816 3817 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3818 rv = vdc_populate_descriptor(vdcp, op, 3819 curr_ldep->addr, curr_ldep->nbytes, 3820 curr_ldep->slice, curr_ldep->offset, 3821 curr_ldep->cb_type, curr_ldep->cb_arg, 3822 curr_ldep->dir); 3823 3824 if (rv) { 3825 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3826 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3827 VD_KSTAT_WAITQ_EXIT(vdcp); 3828 DTRACE_IO1(done, buf_t *, 3829 curr_ldep->cb_arg); 3830 } 3831 DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", 3832 vdcp->instance, b_idx); 3833 mutex_exit(&vdcp->lock); 3834 goto done; 3835 } 3836 3837 /* 3838 * If this is a block read/write we update the I/O 3839 * statistics kstat to indicate that the request 3840 * has been sent back to the vDisk server and should 3841 * now be put on the run queue. 3842 */ 3843 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3844 DTRACE_PROBE1(send, buf_t *, curr_ldep->cb_arg); 3845 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 3846 } 3847 mutex_exit(&vdcp->lock); 3848 3849 /* Wait for the response message. */ 3850 DMSG(vdcp, 1, "waiting for response to idx=%x\n", 3851 b_idx); 3852 rv = vdc_wait_for_response(vdcp, &vio_msg); 3853 if (rv) { 3854 /* 3855 * If this is a block read/write we update 3856 * the I/O statistics kstat to take it 3857 * off the run queue. 
3858 			 */
3858 */ 3859 mutex_enter(&vdcp->lock); 3860 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3861 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3862 VD_KSTAT_RUNQ_EXIT(vdcp); 3863 DTRACE_IO1(done, buf_t *, 3864 curr_ldep->cb_arg); 3865 } 3866 DMSG(vdcp, 1, "[%d] wait_for_response " 3867 "returned err=%d\n", vdcp->instance, 3868 rv); 3869 mutex_exit(&vdcp->lock); 3870 goto done; 3871 } 3872 3873 DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx); 3874 rv = vdc_process_data_msg(vdcp, &vio_msg); 3875 if (rv) { 3876 DMSG(vdcp, 1, "[%d] process_data_msg " 3877 "returned err=%d\n", vdcp->instance, 3878 rv); 3879 goto done; 3880 } 3881 processed++; 3882 } 3883 3884 /* get the next element to submit */ 3885 if (++b_idx >= vdcp->local_dring_backup_len) 3886 b_idx = 0; 3887 } 3888 3889 /* all done - now clear up pending dring copy */ 3890 dring_size = vdcp->local_dring_backup_len * 3891 sizeof (vdcp->local_dring_backup[0]); 3892 3893 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3894 3895 vdcp->local_dring_backup = NULL; 3896 3897 done: 3898 DTRACE_PROBE2(processed, int, processed, vdc_t *, vdcp); 3899 3900 return (rv); 3901 } 3902 3903 /* 3904 * Function: 3905 * vdc_cancel_backup_dring 3906 * 3907 * Description: 3908 * Cancel each descriptor in the backed up dring to vDisk server. 3909 * The Dring was backed up during connection reset. 3910 * 3911 * Arguments: 3912 * vdcp - soft state pointer for this instance of the device driver. 3913 * 3914 * Return Code: 3915 * None 3916 */ 3917 void 3918 vdc_cancel_backup_dring(vdc_t *vdcp) 3919 { 3920 vdc_local_desc_t *ldep; 3921 struct buf *bufp; 3922 int count; 3923 int b_idx; 3924 int dring_size; 3925 int cancelled = 0; 3926 3927 ASSERT(MUTEX_HELD(&vdcp->lock)); 3928 ASSERT(vdcp->state == VDC_STATE_INIT || 3929 vdcp->state == VDC_STATE_INIT_WAITING || 3930 vdcp->state == VDC_STATE_NEGOTIATE || 3931 vdcp->state == VDC_STATE_RESETTING); 3932 3933 if (vdcp->local_dring_backup == NULL) { 3934 /* the pending requests have already been processed */ 3935 return; 3936 } 3937 3938 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n", 3939 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3940 3941 /* 3942 * Walk the backup copy of the local descriptor ring and 3943 * cancel all the outstanding transactions. 3944 */ 3945 b_idx = vdcp->local_dring_backup_tail; 3946 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3947 3948 ldep = &(vdcp->local_dring_backup[b_idx]); 3949 3950 /* only cancel outstanding transactions */ 3951 if (!ldep->is_free) { 3952 3953 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx); 3954 cancelled++; 3955 3956 /* 3957 * All requests have already been cleared from the 3958 * local descriptor ring and the LDC channel has been 3959 * reset so we will never get any reply for these 3960 * requests. Now we just have to notify threads waiting 3961 * for replies that the request has failed. 
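			 *
			 * (CB_SYNC waiters are failed by setting
			 * sync_op_status to EIO and signalling
			 * sync_pending_cv; CB_STRATEGY buffers are completed
			 * with bioerror(EIO) and biodone(), as the switch
			 * below shows.)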
3962 			 */
3963 			switch (ldep->cb_type) {
3964 			case CB_SYNC:
3965 				ASSERT(vdcp->sync_op_pending);
3966 				vdcp->sync_op_status = EIO;
3967 				vdcp->sync_op_pending = B_FALSE;
3968 				cv_signal(&vdcp->sync_pending_cv);
3969 				break;
3970 
3971 			case CB_STRATEGY:
3972 				bufp = ldep->cb_arg;
3973 				ASSERT(bufp != NULL);
3974 				bufp->b_resid = bufp->b_bcount;
3975 				VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
3976 				VD_KSTAT_RUNQ_EXIT(vdcp);
3977 				DTRACE_IO1(done, buf_t *, bufp);
3978 				bioerror(bufp, EIO);
3979 				biodone(bufp);
3980 				break;
3981 
3982 			default:
3983 				ASSERT(0);
3984 			}
3985 
3986 		}
3987 
3988 		/* get the next element to cancel */
3989 		if (++b_idx >= vdcp->local_dring_backup_len)
3990 			b_idx = 0;
3991 	}
3992 
3993 	/* all done - now clear up pending dring copy */
3994 	dring_size = vdcp->local_dring_backup_len *
3995 	    sizeof (vdcp->local_dring_backup[0]);
3996 
3997 	(void) kmem_free(vdcp->local_dring_backup, dring_size);
3998 
3999 	vdcp->local_dring_backup = NULL;
4000 
4001 	DTRACE_PROBE2(cancelled, int, cancelled, vdc_t *, vdcp);
4002 }
4003 
4004 /*
4005  * Function:
4006  *	vdc_connection_timeout
4007  *
4008  * Description:
4009  *	This function is invoked if the timeout set to establish the connection
4010  *	with vds expires. This will happen if we spend too much time in the
4011  *	VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. In that case we
4012  *	cancel any pending requests and mark them as failed.
4013  *
4014  *	If the timeout does not expire, it will be cancelled when we reach the
4015  *	VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can
4016  *	be invoked while we are in the VDC_STATE_HANDLE_PENDING or
4017  *	VDC_STATE_RESETTING state, in which case we do nothing because the
4018  *	timeout is being cancelled.
4019  *
4020  * Arguments:
4021  *	arg	- argument of the timeout function; actually a soft state
4022  *		  pointer for the instance of the device driver.
4023  *
4024  * Return Code:
4025  *	None
4026  */
4027 void
4028 vdc_connection_timeout(void *arg)
4029 {
4030 	vdc_t	*vdcp = (vdc_t *)arg;
4031 
4032 	mutex_enter(&vdcp->lock);
4033 
4034 	if (vdcp->state == VDC_STATE_HANDLE_PENDING ||
4035 	    vdcp->state == VDC_STATE_DETACH) {
4036 		/*
4037 		 * The connection has just been re-established or
4038 		 * we are detaching.
4039 		 */
4040 		vdcp->ctimeout_reached = B_FALSE;
4041 		mutex_exit(&vdcp->lock);
4042 		return;
4043 	}
4044 
4045 	vdcp->ctimeout_reached = B_TRUE;
4046 
4047 	/* notify requests waiting for sending */
4048 	cv_broadcast(&vdcp->running_cv);
4049 
4050 	/* cancel requests waiting for a result */
4051 	vdc_cancel_backup_dring(vdcp);
4052 
4053 	mutex_exit(&vdcp->lock);
4054 
4055 	cmn_err(CE_NOTE, "[%d] connection to service domain timeout",
4056 	    vdcp->instance);
4057 }
4058 
4059 /*
4060  * Function:
4061  *	vdc_backup_local_dring()
4062  *
4063  * Description:
4064  *	Backup the current dring in the event of a reset. The Dring
4065  *	transactions will be resubmitted to the server when the
4066  *	connection is restored.
4067  *
4068  * Arguments:
4069  *	vdcp	- soft state pointer for this instance of the device driver.
4070  *
4071  * Return Code:
4072  *	None
4073  */
4074 static void
4075 vdc_backup_local_dring(vdc_t *vdcp)
4076 {
4077 	int dring_size;
4078 
4079 	ASSERT(MUTEX_HELD(&vdcp->lock));
4080 	ASSERT(vdcp->state == VDC_STATE_RESETTING);
4081 
4082 	/*
4083 	 * If the backup dring is still around, it means
4084 	 * that the last restore did not complete. However,
4085 	 * since we never got back into the running state,
4086 	 * the backup copy we have is still valid.
4087 	 */
4087 */ 4088 if (vdcp->local_dring_backup != NULL) { 4089 DMSG(vdcp, 1, "reusing local descriptor ring backup " 4090 "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len, 4091 vdcp->local_dring_backup_tail); 4092 return; 4093 } 4094 4095 /* 4096 * The backup dring can be NULL and the local dring may not be 4097 * initialized. This can happen if we had a reset while establishing 4098 * a new connection but after the connection has timed out. In that 4099 * case the backup dring is NULL because the requests have been 4100 * cancelled and the reset occurred before the local dring was 4101 * initialized. 4102 */ 4103 if (!(vdcp->initialized & VDC_DRING_LOCAL)) 4104 return; 4105 4106 DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, " 4107 "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx); 4108 4109 dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]); 4110 4111 vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP); 4112 bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size); 4113 4114 vdcp->local_dring_backup_tail = vdcp->dring_curr_idx; 4115 vdcp->local_dring_backup_len = vdcp->dring_len; 4116 } 4117 4118 static void 4119 vdc_switch_server(vdc_t *vdcp) 4120 { 4121 int rv; 4122 vdc_server_t *curr_server, *new_server; 4123 4124 ASSERT(MUTEX_HELD(&vdcp->lock)); 4125 4126 /* if there is only one server, there is nothing to do */ 4127 if (vdcp->num_servers == 1) { 4128 return; 4129 } 4130 4131 /* Get current and next server */ 4132 curr_server = vdcp->curr_server; 4133 new_server = 4134 (curr_server->next) ? curr_server->next : vdcp->server_list; 4135 ASSERT(curr_server != new_server); 4136 4137 /* bring current server's channel down */ 4138 rv = ldc_down(curr_server->ldc_handle); 4139 if (rv) { 4140 DMSG(vdcp, 0, "[%d] Cannot bring channel down, port %ld\n", 4141 vdcp->instance, curr_server->id); 4142 return; 4143 } 4144 4145 /* switch the server */ 4146 vdcp->curr_server = new_server; 4147 4148 cmn_err(CE_NOTE, "Successfully failed over from VDS on port@%ld to " 4149 "VDS on port@%ld.\n", curr_server->id, new_server->id); 4150 DMSG(vdcp, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n", 4151 vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id); 4152 } 4153 4154 /* -------------------------------------------------------------------------- */ 4155 4156 /* 4157 * The following functions process the incoming messages from vds 4158 */ 4159 4160 /* 4161 * Function: 4162 * vdc_process_msg_thread() 4163 * 4164 * Description: 4165 * 4166 * Main VDC message processing thread. Each vDisk instance has its 4167 * own copy of this thread. This thread triggers all the handshakes 4168 * and data exchange with the server. It also handles all channel 4169 * resets. 4170 * 4171 * Arguments: 4172 * vdc - soft state pointer for this instance of the device driver. 4173 * 4174 * Return Code: 4175 * None 4176 */ 4177 static void 4178 vdc_process_msg_thread(vdc_t *vdcp) 4179 { 4180 int status; 4181 int ctimeout; 4182 timeout_id_t tmid = 0; 4183 clock_t ldcup_timeout = 0; 4184 4185 mutex_enter(&vdcp->lock); 4186 4187 for (;;) { 4188 4189 #define Q(_s) (vdcp->state == _s) ?
#_s : 4190 DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state, 4191 Q(VDC_STATE_INIT) 4192 Q(VDC_STATE_INIT_WAITING) 4193 Q(VDC_STATE_NEGOTIATE) 4194 Q(VDC_STATE_HANDLE_PENDING) 4195 Q(VDC_STATE_RUNNING) 4196 Q(VDC_STATE_RESETTING) 4197 Q(VDC_STATE_DETACH) 4198 "UNKNOWN"); 4199 4200 switch (vdcp->state) { 4201 case VDC_STATE_INIT: 4202 4203 /* 4204 * If requested, start a timeout to check if the 4205 * connection with vds is established in the 4206 * specified delay. If the timeout expires, we 4207 * will cancel any pending request. 4208 * 4209 * If a reset has occurred while establishing 4210 * the connection, we already have a timeout armed 4211 * and in that case we don't need to arm a new one. 4212 * 4213 * The same rule applies when there are multiple vds servers. 4214 * If either a connection cannot be established or 4215 * the handshake times out, the connection thread will 4216 * try another server. The 'ctimeout' will report 4217 * back an error after it expires irrespective of 4218 * whether the vdisk is trying to connect to just 4219 * one or multiple servers. 4220 */ 4221 ctimeout = (vdc_timeout != 0)? 4222 vdc_timeout : vdcp->curr_server->ctimeout; 4223 4224 if (ctimeout != 0 && tmid == 0) { 4225 tmid = timeout(vdc_connection_timeout, vdcp, 4226 ctimeout * drv_usectohz(MICROSEC)); 4227 } 4228 4229 /* Check if we are re-initializing repeatedly */ 4230 if (vdcp->hshake_cnt > vdc_hshake_retries && 4231 vdcp->lifecycle != VDC_LC_ONLINE) { 4232 4233 DMSG(vdcp, 0, "[%d] too many handshakes, cnt=%d", 4234 vdcp->instance, vdcp->hshake_cnt); 4235 cmn_err(CE_NOTE, "[%d] disk access failed.\n", 4236 vdcp->instance); 4237 vdcp->state = VDC_STATE_DETACH; 4238 break; 4239 } 4240 4241 /* Switch to STATE_DETACH if drv is detaching */ 4242 if (vdcp->lifecycle == VDC_LC_DETACHING) { 4243 vdcp->state = VDC_STATE_DETACH; 4244 break; 4245 } 4246 4247 /* Switch server */ 4248 if (vdcp->hshake_cnt > 0) 4249 vdc_switch_server(vdcp); 4250 vdcp->hshake_cnt++; 4251 4252 /* Bring up connection with vds via LDC */ 4253 status = vdc_start_ldc_connection(vdcp); 4254 if (status != EINVAL) { 4255 vdcp->state = VDC_STATE_INIT_WAITING; 4256 } 4257 break; 4258 4259 case VDC_STATE_INIT_WAITING: 4260 4261 /* if channel is UP, start negotiation */ 4262 if (vdcp->curr_server->ldc_state == LDC_UP) { 4263 vdcp->state = VDC_STATE_NEGOTIATE; 4264 break; 4265 } 4266 4267 /* check if only one server exists */ 4268 if (vdcp->num_servers == 1) { 4269 cv_wait(&vdcp->initwait_cv, &vdcp->lock); 4270 } else { 4271 /* 4272 * wait for LDC_UP; if it times out, switch 4273 * to another server.
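 *
 * (cv_timedwait(9F) returns -1 when the deadline passes without
 * a wakeup; any other return value means we were signalled and
 * must re-check the channel state, as the code below does.)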
4274 */ 4275 ldcup_timeout = ddi_get_lbolt() + 4276 (vdc_ldcup_timeout * 4277 drv_usectohz(MICROSEC)); 4278 status = cv_timedwait(&vdcp->initwait_cv, 4279 &vdcp->lock, ldcup_timeout); 4280 if (status == -1 && 4281 vdcp->state == VDC_STATE_INIT_WAITING && 4282 vdcp->curr_server->ldc_state != LDC_UP) { 4283 /* timed out & still waiting */ 4284 vdcp->state = VDC_STATE_INIT; 4285 break; 4286 } 4287 } 4288 4289 if (vdcp->state != VDC_STATE_INIT_WAITING) { 4290 DMSG(vdcp, 0, 4291 "state moved to %d out from under us...\n", 4292 vdcp->state); 4293 } 4294 break; 4295 4296 case VDC_STATE_NEGOTIATE: 4297 switch (status = vdc_ver_negotiation(vdcp)) { 4298 case 0: 4299 break; 4300 default: 4301 DMSG(vdcp, 0, "ver negotiate failed (%d)..\n", 4302 status); 4303 goto reset; 4304 } 4305 4306 switch (status = vdc_attr_negotiation(vdcp)) { 4307 case 0: 4308 break; 4309 default: 4310 DMSG(vdcp, 0, "attr negotiate failed (%d)..\n", 4311 status); 4312 goto reset; 4313 } 4314 4315 switch (status = vdc_dring_negotiation(vdcp)) { 4316 case 0: 4317 break; 4318 default: 4319 DMSG(vdcp, 0, "dring negotiate failed (%d)..\n", 4320 status); 4321 goto reset; 4322 } 4323 4324 switch (status = vdc_rdx_exchange(vdcp)) { 4325 case 0: 4326 vdcp->state = VDC_STATE_HANDLE_PENDING; 4327 goto done; 4328 default: 4329 DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n", 4330 status); 4331 goto reset; 4332 } 4333 reset: 4334 DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n", 4335 status); 4336 vdcp->state = VDC_STATE_RESETTING; 4337 vdcp->self_reset = B_TRUE; 4338 done: 4339 DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", 4340 vdcp->state); 4341 break; 4342 4343 case VDC_STATE_HANDLE_PENDING: 4344 4345 if (vdcp->ctimeout_reached) { 4346 /* 4347 * The connection timeout had been reached so 4348 * pending requests have been cancelled. Now 4349 * that the connection is back we can reset 4350 * the timeout. 4351 */ 4352 ASSERT(vdcp->local_dring_backup == NULL); 4353 ASSERT(tmid != 0); 4354 tmid = 0; 4355 vdcp->ctimeout_reached = B_FALSE; 4356 vdcp->state = VDC_STATE_RUNNING; 4357 DMSG(vdcp, 0, "[%d] connection to service " 4358 "domain is up", vdcp->instance); 4359 break; 4360 } 4361 4362 mutex_exit(&vdcp->lock); 4363 if (tmid != 0) { 4364 (void) untimeout(tmid); 4365 tmid = 0; 4366 } 4367 status = vdc_resubmit_backup_dring(vdcp); 4368 mutex_enter(&vdcp->lock); 4369 4370 if (status) 4371 vdcp->state = VDC_STATE_RESETTING; 4372 else 4373 vdcp->state = VDC_STATE_RUNNING; 4374 4375 break; 4376 4377 /* enter running state */ 4378 case VDC_STATE_RUNNING: 4379 /* 4380 * Signal anyone waiting for the connection 4381 * to come on line. 
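 *
 * A typical waiter blocks with the following pattern (sketch of
 * the shape used by the I/O paths, not a copy of driver code):
 *
 *	while (vdcp->state != VDC_STATE_RUNNING)
 *		cv_wait(&vdcp->running_cv, &vdcp->lock);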
4382 */ 4383 vdcp->hshake_cnt = 0; 4384 cv_broadcast(&vdcp->running_cv); 4385 4386 /* failfast has to be checked after reset */ 4387 cv_signal(&vdcp->failfast_cv); 4388 4389 /* ownership is lost during reset */ 4390 if (vdcp->ownership & VDC_OWNERSHIP_WANTED) 4391 vdcp->ownership |= VDC_OWNERSHIP_RESET; 4392 cv_signal(&vdcp->ownership_cv); 4393 4394 mutex_exit(&vdcp->lock); 4395 4396 for (;;) { 4397 vio_msg_t msg; 4398 status = vdc_wait_for_response(vdcp, &msg); 4399 if (status) break; 4400 4401 DMSG(vdcp, 1, "[%d] new pkt(s) available\n", 4402 vdcp->instance); 4403 status = vdc_process_data_msg(vdcp, &msg); 4404 if (status) { 4405 DMSG(vdcp, 1, "[%d] process_data_msg " 4406 "returned err=%d\n", vdcp->instance, 4407 status); 4408 break; 4409 } 4410 4411 } 4412 4413 mutex_enter(&vdcp->lock); 4414 4415 vdcp->state = VDC_STATE_RESETTING; 4416 vdcp->self_reset = B_TRUE; 4417 break; 4418 4419 case VDC_STATE_RESETTING: 4420 /* 4421 * When we reach this state, we either come from the 4422 * VDC_STATE_RUNNING state and we can have pending 4423 * requests but no timeout is armed; or we come from 4424 * the VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE or 4425 * VDC_STATE_HANDLE_PENDING state and there is no pending 4426 * request or pending requests have already been copied 4427 * into the backup dring. So we can safely keep the 4428 * connection timeout armed while we are in this state. 4429 */ 4430 4431 DMSG(vdcp, 0, "Initiating channel reset " 4432 "(pending = %d)\n", (int)vdcp->threads_pending); 4433 4434 if (vdcp->self_reset) { 4435 DMSG(vdcp, 0, 4436 "[%d] calling stop_ldc_connection.\n", 4437 vdcp->instance); 4438 status = vdc_stop_ldc_connection(vdcp); 4439 vdcp->self_reset = B_FALSE; 4440 } 4441 4442 /* 4443 * Wait for all threads currently waiting 4444 * for a free dring entry to use.
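 *
 * The broadcasts below are issued in a delay loop rather than
 * once: a waiter may be between testing its condition and calling
 * cv_wait(), so we keep signalling until threads_pending drains
 * to zero, i.e. until every such thread has observed the reset.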
4445 */ 4446 while (vdcp->threads_pending) { 4447 cv_broadcast(&vdcp->membind_cv); 4448 cv_broadcast(&vdcp->dring_free_cv); 4449 mutex_exit(&vdcp->lock); 4450 /* give the waiters enough time to wake up */ 4451 delay(vdc_hz_min_ldc_delay); 4452 mutex_enter(&vdcp->lock); 4453 } 4454 4455 ASSERT(vdcp->threads_pending == 0); 4456 4457 /* Sanity check that no thread is receiving */ 4458 ASSERT(vdcp->read_state != VDC_READ_WAITING); 4459 4460 vdcp->read_state = VDC_READ_IDLE; 4461 4462 vdc_backup_local_dring(vdcp); 4463 4464 /* cleanup the old d-ring */ 4465 vdc_destroy_descriptor_ring(vdcp); 4466 4467 /* go and start again */ 4468 vdcp->state = VDC_STATE_INIT; 4469 4470 break; 4471 4472 case VDC_STATE_DETACH: 4473 DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n", 4474 vdcp->instance); 4475 4476 /* cancel any pending timeout */ 4477 mutex_exit(&vdcp->lock); 4478 if (tmid != 0) { 4479 (void) untimeout(tmid); 4480 tmid = 0; 4481 } 4482 mutex_enter(&vdcp->lock); 4483 4484 /* 4485 * Signal anyone waiting for connection 4486 * to come online 4487 */ 4488 cv_broadcast(&vdcp->running_cv); 4489 4490 while (vdcp->sync_op_pending) { 4491 cv_signal(&vdcp->sync_pending_cv); 4492 cv_signal(&vdcp->sync_blocked_cv); 4493 mutex_exit(&vdcp->lock); 4494 /* give the waiters enough time to wake up */ 4495 delay(vdc_hz_min_ldc_delay); 4496 mutex_enter(&vdcp->lock); 4497 } 4498 4499 mutex_exit(&vdcp->lock); 4500 4501 DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n", 4502 vdcp->instance); 4503 thread_exit(); 4504 break; 4505 } 4506 } 4507 } 4508 4509 4510 /* 4511 * Function: 4512 * vdc_process_data_msg() 4513 * 4514 * Description: 4515 * This function is called by the message processing thread each time 4516 * a message with a msgtype of VIO_TYPE_DATA is received. It will either 4517 * be an ACK or NACK from vds[1] which vdc handles as follows: 4518 * ACK - wake up the waiting thread 4519 * NACK - resend any messages necessary 4520 * 4521 * [1] Although the message format allows it, vds should not send a 4522 * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for 4523 * some bizarre reason it does, vdc will reset the connection. 4524 * 4525 * Arguments: 4526 * vdc - soft state pointer for this instance of the device driver. 4527 * msg - the LDC message sent by vds 4528 * 4529 * Return Code: 4530 * 0 - Success. 4531 * > 0 - error value returned by LDC 4532 */ 4533 static int 4534 vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg) 4535 { 4536 int status = 0; 4537 vio_dring_msg_t *dring_msg; 4538 vdc_local_desc_t *ldep = NULL; 4539 int start, end; 4540 int idx; 4541 int op; 4542 4543 dring_msg = (vio_dring_msg_t *)msg; 4544 4545 ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA); 4546 ASSERT(vdcp != NULL); 4547 4548 mutex_enter(&vdcp->lock); 4549 4550 /* 4551 * Check to see if the message has bogus data 4552 */ 4553 idx = start = dring_msg->start_idx; 4554 end = dring_msg->end_idx; 4555 if ((start >= vdcp->dring_len) || 4556 (end >= vdcp->dring_len) || (end < -1)) { 4557 /* 4558 * Update the I/O statistics to indicate that an error occurred. 4559 * No need to update the wait/run queues as no specific read or 4560 * write request is being completed in response to this 'msg'. 4561 */ 4562 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4563 DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n", 4564 vdcp->instance, start, end); 4565 mutex_exit(&vdcp->lock); 4566 return (EINVAL); 4567 } 4568 4569 /* 4570 * Verify that the sequence number is what vdc expects.
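 *
 * (Per vdc_verify_seq_num() below, the only acceptable window is:
 * last acked reply < msg seq_num <= last seq_num generated by vdc.)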
4571 */ 4572 switch (vdc_verify_seq_num(vdcp, dring_msg)) { 4573 case VDC_SEQ_NUM_TODO: 4574 break; /* keep processing this message */ 4575 case VDC_SEQ_NUM_SKIP: 4576 mutex_exit(&vdcp->lock); 4577 return (0); 4578 case VDC_SEQ_NUM_INVALID: 4579 /* 4580 * Update the I/O statistics to indicate that an error occurred. 4581 * No need to update the wait/run queues as no specific read or 4582 * write request is being completed in response to this 'msg'. 4583 */ 4584 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4585 DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance); 4586 mutex_exit(&vdcp->lock); 4587 return (ENXIO); 4588 } 4589 4590 if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) { 4591 /* 4592 * Update the I/O statistics to indicate that an error occurred. 4593 * 4594 * We need to update the run queue if a read or write request 4595 * is being NACKed - otherwise there will appear to be an 4596 * indefinite outstanding request and statistics reported by 4597 * iostat(1M) will be incorrect. The transaction will be 4598 * resubmitted from the backup DRing following the reset 4599 * and the wait/run queues will be entered again. 4600 */ 4601 ldep = &vdcp->local_dring[idx]; 4602 op = ldep->operation; 4603 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 4604 DTRACE_IO1(done, buf_t *, ldep->cb_arg); 4605 VD_KSTAT_RUNQ_EXIT(vdcp); 4606 } 4607 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4608 VDC_DUMP_DRING_MSG(dring_msg); 4609 DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance); 4610 mutex_exit(&vdcp->lock); 4611 return (EIO); 4612 4613 } else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) { 4614 /* 4615 * Update the I/O statistics to indicate that an error occurred. 4616 * No need to update the wait/run queues as no specific read or 4617 * write request is being completed in response to this 'msg'. 4618 */ 4619 VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs); 4620 mutex_exit(&vdcp->lock); 4621 return (EPROTO); 4622 } 4623 4624 DMSG(vdcp, 1, ": start %d end %d\n", start, end); 4625 ASSERT(start == end); 4626 4627 ldep = &vdcp->local_dring[idx]; 4628 4629 DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n", 4630 ldep->dep->hdr.dstate, ldep->cb_type); 4631 4632 if (ldep->dep->hdr.dstate == VIO_DESC_DONE) { 4633 struct buf *bufp; 4634 4635 switch (ldep->cb_type) { 4636 case CB_SYNC: 4637 ASSERT(vdcp->sync_op_pending); 4638 4639 status = vdc_depopulate_descriptor(vdcp, idx); 4640 vdcp->sync_op_status = status; 4641 vdcp->sync_op_pending = B_FALSE; 4642 cv_signal(&vdcp->sync_pending_cv); 4643 break; 4644 4645 case CB_STRATEGY: 4646 bufp = ldep->cb_arg; 4647 ASSERT(bufp != NULL); 4648 bufp->b_resid = 4649 bufp->b_bcount - ldep->dep->payload.nbytes; 4650 status = ldep->dep->payload.status; /* Future:ntoh */ 4651 if (status != 0) { 4652 DMSG(vdcp, 1, "strategy status=%d\n", status); 4653 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4654 bioerror(bufp, status); 4655 } 4656 4657 (void) vdc_depopulate_descriptor(vdcp, idx); 4658 4659 DMSG(vdcp, 1, 4660 "strategy complete req=%ld bytes resp=%ld bytes\n", 4661 bufp->b_bcount, ldep->dep->payload.nbytes); 4662 4663 if (status != 0 && vdcp->failfast_interval != 0) { 4664 /* 4665 * The I/O has failed and failfast is enabled. 4666 * We need the failfast thread to check if the 4667 * failure is due to a reservation conflict. 4668 */ 4669 (void) vdc_failfast_io_queue(vdcp, bufp); 4670 } else { 4671 if (status == 0) { 4672 op = (bufp->b_flags & B_READ) ?
VD_OP_BREAD : VD_OP_BWRITE; 4674 VD_UPDATE_IO_STATS(vdcp, op, 4675 ldep->dep->payload.nbytes); 4676 } 4677 VD_KSTAT_RUNQ_EXIT(vdcp); 4678 DTRACE_IO1(done, buf_t *, bufp); 4679 biodone(bufp); 4680 } 4681 break; 4682 4683 default: 4684 ASSERT(0); 4685 } 4686 } 4687 4688 /* let the arrival signal propagate */ 4689 mutex_exit(&vdcp->lock); 4690 4691 /* probe gives the count of how many entries were processed */ 4692 DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp); 4693 4694 return (0); 4695 } 4696 4697 4698 /* 4699 * Function: 4700 * vdc_handle_ver_msg() 4701 * 4702 * Description: 4703 * Handle a version negotiation (VIO_VER_INFO) message from the vDisk server. 4704 * Arguments: 4705 * vdc - soft state pointer for this instance of the device driver. 4706 * ver_msg - LDC message sent by vDisk server 4707 * 4708 * Return Code: 4709 * 0 - Success 4710 */ 4711 static int 4712 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg) 4713 { 4714 int status = 0; 4715 4716 ASSERT(vdc != NULL); 4717 ASSERT(mutex_owned(&vdc->lock)); 4718 4719 if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) { 4720 return (EPROTO); 4721 } 4722 4723 if (ver_msg->dev_class != VDEV_DISK_SERVER) { 4724 return (EINVAL); 4725 } 4726 4727 switch (ver_msg->tag.vio_subtype) { 4728 case VIO_SUBTYPE_ACK: 4729 /* 4730 * We check to see if the version returned is indeed supported 4731 * (the server may have also adjusted the minor number downwards 4732 * and, if so, 'ver_msg' will contain the actual version agreed). 4733 */ 4734 if (vdc_is_supported_version(ver_msg)) { 4735 vdc->ver.major = ver_msg->ver_major; 4736 vdc->ver.minor = ver_msg->ver_minor; 4737 ASSERT(vdc->ver.major > 0); 4738 } else { 4739 status = EPROTO; 4740 } 4741 break; 4742 4743 case VIO_SUBTYPE_NACK: 4744 /* 4745 * call vdc_is_supported_version() which will return the next 4746 * supported version (if any) in 'ver_msg'. 4747 */ 4748 (void) vdc_is_supported_version(ver_msg); 4749 if (ver_msg->ver_major > 0) { 4750 size_t len = sizeof (*ver_msg); 4751 4752 ASSERT(vdc->ver.major > 0); 4753 4754 /* reset the necessary fields and resend */ 4755 ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO; 4756 ver_msg->dev_class = VDEV_DISK; 4757 4758 status = vdc_send(vdc, (caddr_t)ver_msg, &len); 4759 DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n", 4760 vdc->instance, status); 4761 if (len != sizeof (*ver_msg)) 4762 status = EBADMSG; 4763 } else { 4764 DMSG(vdc, 0, "[%d] No common version with vDisk server", 4765 vdc->instance); 4766 status = ENOTSUP; 4767 } 4768 4769 break; 4770 case VIO_SUBTYPE_INFO: 4771 /* 4772 * Handle the case where vds starts the handshake 4773 * (for now only vdc is the instigator) 4774 */ 4775 status = ENOTSUP; 4776 break; 4777 4778 default: 4779 status = EINVAL; 4780 break; 4781 } 4782 4783 return (status); 4784 } 4785 4786 /* 4787 * Function: 4788 * vdc_handle_attr_msg() 4789 * 4790 * Description: 4791 * Handle an attribute exchange (VIO_ATTR_INFO) message from the vDisk server. 4792 * Arguments: 4793 * vdc - soft state pointer for this instance of the device driver. 4794 * attr_msg - LDC message sent by vDisk server 4795 * 4796 * Return Code: 4797 * 0 - Success 4798 */ 4799 static int 4800 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg) 4801 { 4802 int status = 0; 4803 4804 ASSERT(vdc != NULL); 4805 ASSERT(mutex_owned(&vdc->lock)); 4806 4807 if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) { 4808 return (EPROTO); 4809 } 4810 4811 switch (attr_msg->tag.vio_subtype) { 4812 case VIO_SUBTYPE_ACK: 4813 /* 4814 * We now verify the attributes sent by vds.
4815 */ 4816 if (attr_msg->vdisk_size == 0) { 4817 DMSG(vdc, 0, "[%d] Invalid disk size from vds", 4818 vdc->instance); 4819 status = EINVAL; 4820 break; 4821 } 4822 4823 if (attr_msg->max_xfer_sz == 0) { 4824 DMSG(vdc, 0, "[%d] Invalid transfer size from vds", 4825 vdc->instance); 4826 status = EINVAL; 4827 break; 4828 } 4829 4830 if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) { 4831 DMSG(vdc, 0, "[%d] Unknown disk size from vds", 4832 vdc->instance); 4833 attr_msg->vdisk_size = 0; 4834 } 4835 4836 /* 4837 * If the disk size is already set check that it hasn't changed. 4838 */ 4839 if ((vdc->vdisk_size != 0) && (attr_msg->vdisk_size != 0) && 4840 (vdc->vdisk_size != attr_msg->vdisk_size)) { 4841 DMSG(vdc, 0, "[%d] Different disk size from vds " 4842 "(old=0x%lx - new=0x%lx)", vdc->instance, 4843 vdc->vdisk_size, attr_msg->vdisk_size); 4844 status = EINVAL; 4845 break; 4846 } 4847 4848 vdc->vdisk_size = attr_msg->vdisk_size; 4849 vdc->vdisk_type = attr_msg->vdisk_type; 4850 vdc->operations = attr_msg->operations; 4851 if (vio_ver_is_supported(vdc->ver, 1, 1)) 4852 vdc->vdisk_media = attr_msg->vdisk_media; 4853 else 4854 vdc->vdisk_media = 0; 4855 4856 DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n", 4857 vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz); 4858 DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n", 4859 vdc->instance, vdc->block_size, 4860 attr_msg->vdisk_block_size); 4861 4862 /* 4863 * We don't know at compile time what the vDisk server will 4864 * think are good values but we apply a large (arbitrary) 4865 * upper bound to prevent memory exhaustion in vdc if it were 4866 * allocating a DRing based on huge values sent by the server. 4867 * We should never exceed this unless the message 4868 * is garbage. 4869 */ 4870 if ((attr_msg->max_xfer_sz * attr_msg->vdisk_block_size) <= 4871 (PAGESIZE * DEV_BSIZE)) { 4872 vdc->max_xfer_sz = attr_msg->max_xfer_sz; 4873 vdc->block_size = attr_msg->vdisk_block_size; 4874 } else { 4875 DMSG(vdc, 0, "[%d] vds block transfer size too big;" 4876 " using max supported by vdc", vdc->instance); 4877 } 4878 4879 if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) || 4880 (attr_msg->vdisk_size > INT64_MAX) || 4881 (attr_msg->operations == 0) || 4882 (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) { 4883 DMSG(vdc, 0, "[%d] Invalid attributes from vds", 4884 vdc->instance); 4885 status = EINVAL; 4886 break; 4887 } 4888 4889 /* 4890 * Now that we have received all attributes we can create a 4891 * fake geometry for the disk. 4892 */ 4893 vdc_create_fake_geometry(vdc); 4894 break; 4895 4896 case VIO_SUBTYPE_NACK: 4897 /* 4898 * vds could not handle the attributes we sent so we 4899 * stop negotiating. 4900 */ 4901 status = EPROTO; 4902 break; 4903 4904 case VIO_SUBTYPE_INFO: 4905 /* 4906 * Handle the case where vds starts the handshake 4907 * (for now, vdc is the only supported instigator) 4908 */ 4909 status = ENOTSUP; 4910 break; 4911 4912 default: 4913 status = ENOTSUP; 4914 break; 4915 } 4916 4917 return (status); 4918 } 4919 4920 /* 4921 * Function: 4922 * vdc_handle_dring_reg_msg() 4923 * 4924 * Description: 4925 * Handle a descriptor ring registration (VIO_DRING_REG) message from the vDisk server. 4926 * Arguments: 4927 * vdc - soft state pointer for this instance of the driver.
4928 * dring_msg - LDC message sent by vDisk server 4929 * 4930 * Return Code: 4931 * 0 - Success 4932 */ 4933 static int 4934 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg) 4935 { 4936 int status = 0; 4937 4938 ASSERT(vdc != NULL); 4939 ASSERT(mutex_owned(&vdc->lock)); 4940 4941 if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) { 4942 return (EPROTO); 4943 } 4944 4945 switch (dring_msg->tag.vio_subtype) { 4946 case VIO_SUBTYPE_ACK: 4947 /* save the received dring_ident */ 4948 vdc->dring_ident = dring_msg->dring_ident; 4949 DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n", 4950 vdc->instance, vdc->dring_ident); 4951 break; 4952 4953 case VIO_SUBTYPE_NACK: 4954 /* 4955 * vds could not handle the DRing info we sent so we 4956 * stop negotiating. 4957 */ 4958 DMSG(vdc, 0, "[%d] server could not register DRing\n", 4959 vdc->instance); 4960 status = EPROTO; 4961 break; 4962 4963 case VIO_SUBTYPE_INFO: 4964 /* 4965 * Handle the case where vds starts the handshake 4966 * (for now only vdc is the instigator) 4967 */ 4968 status = ENOTSUP; 4969 break; 4970 default: 4971 status = ENOTSUP; 4972 } 4973 4974 return (status); 4975 } 4976 4977 /* 4978 * Function: 4979 * vdc_verify_seq_num() 4980 * 4981 * Description: 4982 * This function verifies that the sequence number sent back by the vDisk 4983 * server with the latest message is what is expected (i.e. it is greater 4984 * than the last seq num sent by the vDisk server and less than or equal 4985 * to the last seq num generated by vdc). 4986 * 4987 * It then checks the request ID to see if any requests need processing 4988 * in the DRing. 4989 * 4990 * Arguments: 4991 * vdc - soft state pointer for this instance of the driver. 4992 * dring_msg - pointer to the LDC message sent by vds 4993 * 4994 * Return Code: 4995 * VDC_SEQ_NUM_TODO - Message needs to be processed 4996 * VDC_SEQ_NUM_SKIP - Message has already been processed 4997 * VDC_SEQ_NUM_INVALID - The seq numbers are so out of sync, 4998 * vdc cannot deal with them 4999 */ 5000 static int 5001 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg) 5002 { 5003 ASSERT(vdc != NULL); 5004 ASSERT(dring_msg != NULL); 5005 ASSERT(mutex_owned(&vdc->lock)); 5006 5007 /* 5008 * Check to see if the messages were responded to in the correct 5009 * order by vds. 5010 */ 5011 if ((dring_msg->seq_num <= vdc->seq_num_reply) || 5012 (dring_msg->seq_num > vdc->seq_num)) { 5013 DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: " 5014 "%lu > expected <= %lu (last proc req %lu sent %lu)\n", 5015 vdc->instance, dring_msg->seq_num, 5016 vdc->seq_num_reply, vdc->seq_num, 5017 vdc->req_id_proc, vdc->req_id); 5018 return (VDC_SEQ_NUM_INVALID); 5019 } 5020 vdc->seq_num_reply = dring_msg->seq_num; 5021 5022 if (vdc->req_id_proc < vdc->req_id) 5023 return (VDC_SEQ_NUM_TODO); 5024 else 5025 return (VDC_SEQ_NUM_SKIP); 5026 } 5027 5028 5029 /* 5030 * Function: 5031 * vdc_is_supported_version() 5032 * 5033 * Description: 5034 * This routine checks if the major/minor version numbers specified in 5035 * 'ver_msg' are supported.
If not, it finds the next version that is 5036 * in the supported version list 'vdc_version[]' and sets the fields in 5037 * 'ver_msg' to those values. 5038 * 5039 * Arguments: 5040 * ver_msg - LDC message sent by vDisk server 5041 * 5042 * Return Code: 5043 * B_TRUE - Success 5044 * B_FALSE - Version not supported 5045 */ 5046 static boolean_t 5047 vdc_is_supported_version(vio_ver_msg_t *ver_msg) 5048 { 5049 int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]); 5050 5051 for (int i = 0; i < vdc_num_versions; i++) { 5052 ASSERT(vdc_version[i].major > 0); 5053 ASSERT((i == 0) || 5054 (vdc_version[i].major < vdc_version[i-1].major)); 5055 5056 /* 5057 * If the major versions match, adjust the minor version, if 5058 * necessary, down to the highest value supported by this 5059 * client. The server should support all minor versions lower 5060 * than the value it sent. 5061 */ 5062 if (ver_msg->ver_major == vdc_version[i].major) { 5063 if (ver_msg->ver_minor > vdc_version[i].minor) { 5064 DMSGX(0, 5065 "Adjusting minor version from %u to %u", 5066 ver_msg->ver_minor, vdc_version[i].minor); 5067 ver_msg->ver_minor = vdc_version[i].minor; 5068 } 5069 return (B_TRUE); 5070 } 5071 5072 /* 5073 * If the message contains a higher major version number, set 5074 * the message's major/minor versions to the current values 5075 * and return false, so this message will get resent with 5076 * these values, and the server will potentially try again 5077 * with the same or a lower version. 5078 */ 5079 if (ver_msg->ver_major > vdc_version[i].major) { 5080 ver_msg->ver_major = vdc_version[i].major; 5081 ver_msg->ver_minor = vdc_version[i].minor; 5082 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n", 5083 ver_msg->ver_major, ver_msg->ver_minor); 5084 5085 return (B_FALSE); 5086 } 5087 5088 /* 5089 * Otherwise, the message's major version is less than the 5090 * current major version, so continue the loop to the next 5091 * (lower) supported version. 5092 */ 5093 } 5094 5095 /* 5096 * No common version was found; "ground" the version pair in the 5097 * message to terminate negotiation. 5098 */ 5099 ver_msg->ver_major = 0; 5100 ver_msg->ver_minor = 0; 5101 5102 return (B_FALSE); 5103 } 5104 /* -------------------------------------------------------------------------- */ 5105 5106 /* 5107 * DKIO(7I) support 5108 */ 5109 5110 typedef struct vdc_dk_arg { 5111 struct dk_callback dkc; 5112 int mode; 5113 dev_t dev; 5114 vdc_t *vdc; 5115 } vdc_dk_arg_t; 5116 5117 /* 5118 * Function: 5119 * vdc_dkio_flush_cb() 5120 * 5121 * Description: 5122 * This routine is a callback for DKIOCFLUSHWRITECACHE which can be called 5123 * by kernel code. 5124 * 5125 * Arguments: 5126 * arg - a pointer to a vdc_dk_arg_t structure. 5127 */ 5128 void 5129 vdc_dkio_flush_cb(void *arg) 5130 { 5131 struct vdc_dk_arg *dk_arg = (struct vdc_dk_arg *)arg; 5132 struct dk_callback *dkc = NULL; 5133 vdc_t *vdc = NULL; 5134 int rv; 5135 5136 if (dk_arg == NULL) { 5137 cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n"); 5138 return; 5139 } 5140 dkc = &dk_arg->dkc; 5141 vdc = dk_arg->vdc; 5142 ASSERT(vdc != NULL); 5143 5144 rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, 5145 VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 5146 if (rv != 0) { 5147 DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n", 5148 vdc->instance, rv, 5149 ddi_model_convert_from(dk_arg->mode & FMODELS)); 5150 } 5151 5152 /* 5153 * Trigger the call back to notify the caller that the ioctl call has 5154 * been completed.
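 *
 * For reference, a kernel caller would typically have issued the
 * flush along these lines (illustrative sketch; 'lh', 'my_done',
 * 'my_state' and 'rval' are hypothetical, not from this driver):
 *
 *	struct dk_callback dkc;
 *	dkc.dkc_callback = my_done;
 *	dkc.dkc_cookie = my_state;
 *	(void) ldi_ioctl(lh, DKIOCFLUSHWRITECACHE, (intptr_t)&dkc,
 *	    FKIOCTL, kcred, &rval);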
5155 */ 5156 if ((dk_arg->mode & FKIOCTL) && 5157 (dkc != NULL) && 5158 (dkc->dkc_callback != NULL)) { 5159 ASSERT(dkc->dkc_cookie != NULL); 5160 (*dkc->dkc_callback)(dkc->dkc_cookie, rv); 5161 } 5162 5163 /* Indicate that one less DKIO write flush is outstanding */ 5164 mutex_enter(&vdc->lock); 5165 vdc->dkio_flush_pending--; 5166 ASSERT(vdc->dkio_flush_pending >= 0); 5167 mutex_exit(&vdc->lock); 5168 5169 /* free the mem that was allocated when the callback was dispatched */ 5170 kmem_free(arg, sizeof (vdc_dk_arg_t)); 5171 } 5172 5173 /* 5174 * Function: 5175 * vdc_dkio_gapart() 5176 * 5177 * Description: 5178 * This function implements the DKIOCGAPART ioctl. 5179 * 5180 * Arguments: 5181 * vdc - soft state pointer 5182 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 5183 * flag - ioctl flags 5184 */ 5185 static int 5186 vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag) 5187 { 5188 struct dk_geom *geom; 5189 struct vtoc *vtoc; 5190 union { 5191 struct dk_map map[NDKMAP]; 5192 struct dk_map32 map32[NDKMAP]; 5193 } data; 5194 int i, rv, size; 5195 5196 mutex_enter(&vdc->lock); 5197 5198 if ((rv = vdc_validate_geometry(vdc)) != 0) { 5199 mutex_exit(&vdc->lock); 5200 return (rv); 5201 } 5202 5203 vtoc = vdc->vtoc; 5204 geom = vdc->geom; 5205 5206 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5207 5208 for (i = 0; i < vtoc->v_nparts; i++) { 5209 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 5210 (geom->dkg_nhead * geom->dkg_nsect); 5211 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 5212 } 5213 size = NDKMAP * sizeof (struct dk_map32); 5214 5215 } else { 5216 5217 for (i = 0; i < vtoc->v_nparts; i++) { 5218 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 5219 (geom->dkg_nhead * geom->dkg_nsect); 5220 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 5221 } 5222 size = NDKMAP * sizeof (struct dk_map); 5223 5224 } 5225 5226 mutex_exit(&vdc->lock); 5227 5228 if (ddi_copyout(&data, arg, size, flag) != 0) 5229 return (EFAULT); 5230 5231 return (0); 5232 } 5233 5234 /* 5235 * Function: 5236 * vdc_dkio_partition() 5237 * 5238 * Description: 5239 * This function implements the DKIOCPARTITION ioctl. 5240 * 5241 * Arguments: 5242 * vdc - soft state pointer 5243 * arg - a pointer to a struct partition64 structure 5244 * flag - ioctl flags 5245 */ 5246 static int 5247 vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag) 5248 { 5249 struct partition64 p64; 5250 efi_gpt_t *gpt; 5251 efi_gpe_t *gpe; 5252 vd_efi_dev_t edev; 5253 uint_t partno; 5254 int rv; 5255 5256 if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) { 5257 return (EFAULT); 5258 } 5259 5260 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 5261 5262 if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) { 5263 return (rv); 5264 } 5265 5266 partno = p64.p_partno; 5267 5268 if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) { 5269 vd_efi_free(&edev, gpt, gpe); 5270 return (ESRCH); 5271 } 5272 5273 bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type, 5274 sizeof (struct uuid)); 5275 p64.p_start = gpe[partno].efi_gpe_StartingLBA; 5276 p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1; 5277 5278 if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) { 5279 vd_efi_free(&edev, gpt, gpe); 5280 return (EFAULT); 5281 } 5282 5283 vd_efi_free(&edev, gpt, gpe); 5284 return (0); 5285 } 5286 5287 /* 5288 * Function: 5289 * vdc_dioctl_rwcmd() 5290 * 5291 * Description: 5292 * This function implements the DIOCTL_RWCMD ioctl. 
This ioctl is used 5293 * for DKC_DIRECT disks to read or write at an absolute disk offset. 5294 * 5295 * Arguments: 5296 * dev - device 5297 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 5298 * flag - ioctl flags 5299 */ 5300 static int 5301 vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag) 5302 { 5303 struct dadkio_rwcmd32 rwcmd32; 5304 struct dadkio_rwcmd rwcmd; 5305 struct iovec aiov; 5306 struct uio auio; 5307 int rw, status; 5308 struct buf *buf; 5309 5310 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5311 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 5312 sizeof (struct dadkio_rwcmd32), flag)) { 5313 return (EFAULT); 5314 } 5315 rwcmd.cmd = rwcmd32.cmd; 5316 rwcmd.flags = rwcmd32.flags; 5317 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 5318 rwcmd.buflen = rwcmd32.buflen; 5319 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 5320 } else { 5321 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 5322 sizeof (struct dadkio_rwcmd), flag)) { 5323 return (EFAULT); 5324 } 5325 } 5326 5327 switch (rwcmd.cmd) { 5328 case DADKIO_RWCMD_READ: 5329 rw = B_READ; 5330 break; 5331 case DADKIO_RWCMD_WRITE: 5332 rw = B_WRITE; 5333 break; 5334 default: 5335 return (EINVAL); 5336 } 5337 5338 bzero((caddr_t)&aiov, sizeof (struct iovec)); 5339 aiov.iov_base = rwcmd.bufaddr; 5340 aiov.iov_len = rwcmd.buflen; 5341 5342 bzero((caddr_t)&auio, sizeof (struct uio)); 5343 auio.uio_iov = &aiov; 5344 auio.uio_iovcnt = 1; 5345 auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE; 5346 auio.uio_resid = rwcmd.buflen; 5347 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 5348 5349 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 5350 bioinit(buf); 5351 /* 5352 * We use the private field of buf to specify that this is an 5353 * I/O using an absolute offset. 5354 */ 5355 buf->b_private = (void *)VD_SLICE_NONE; 5356 5357 status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio); 5358 5359 biofini(buf); 5360 kmem_free(buf, sizeof (buf_t)); 5361 5362 return (status); 5363 } 5364 5365 /* 5366 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated 5367 * buffer is returned in alloc_len. 5368 */ 5369 static vd_scsi_t * 5370 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, 5371 int *alloc_len) 5372 { 5373 vd_scsi_t *vd_scsi; 5374 int vd_scsi_len = VD_SCSI_SIZE; 5375 5376 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); 5377 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); 5378 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); 5379 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); 5380 5381 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 5382 5383 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); 5384 5385 vd_scsi->cdb_len = cdb_len; 5386 vd_scsi->sense_len = sense_len; 5387 vd_scsi->datain_len = datain_len; 5388 vd_scsi->dataout_len = dataout_len; 5389 5390 *alloc_len = vd_scsi_len; 5391 5392 return (vd_scsi); 5393 } 5394 5395 /* 5396 * Convert the status of a SCSI command to a Solaris return code. 5397 * 5398 * Arguments: 5399 * vd_scsi - The SCSI operation buffer. 5400 * log_error - indicate if an error message should be logged. 5401 * 5402 * Note that our SCSI error messages are rather primitive for the moment 5403 * and could be improved by decoding some data like the SCSI command and 5404 * the sense key. 5405 * 5406 * Return value: 5407 * 0 - Status is good. 5408 * EACCES - Status reports a reservation conflict. 5409 * ENOTSUP - Status reports a check condition and sense key 5410 * reports an illegal request. 
5411 * EIO - Any other status. 5412 */ 5413 static int 5414 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error) 5415 { 5416 int rv; 5417 char path_str[MAXPATHLEN]; 5418 char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN]; 5419 union scsi_cdb *cdb; 5420 struct scsi_extended_sense *sense; 5421 5422 if (vd_scsi->cmd_status == STATUS_GOOD) 5423 /* no error */ 5424 return (0); 5425 5426 /* when the tunable vdc_scsi_log_error is true we log all errors */ 5427 if (vdc_scsi_log_error) 5428 log_error = B_TRUE; 5429 5430 if (log_error) { 5431 cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x\n", 5432 ddi_pathname(vdc->dip, path_str), vdc->instance, 5433 GETCMD(VD_SCSI_DATA_CDB(vd_scsi))); 5434 } 5435 5436 /* default returned value */ 5437 rv = EIO; 5438 5439 switch (vd_scsi->cmd_status) { 5440 5441 case STATUS_CHECK: 5442 case STATUS_TERMINATED: 5443 if (log_error) 5444 cmn_err(CE_CONT, "\tCheck Condition Error\n"); 5445 5446 /* check sense buffer */ 5447 if (vd_scsi->sense_len == 0 || 5448 vd_scsi->sense_status != STATUS_GOOD) { 5449 if (log_error) 5450 cmn_err(CE_CONT, "\tNo Sense Data Available\n"); 5451 break; 5452 } 5453 5454 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5455 5456 if (log_error) { 5457 cmn_err(CE_CONT, "\tSense Key: 0x%x\n" 5458 "\tASC: 0x%x, ASCQ: 0x%x\n", 5459 scsi_sense_key((uint8_t *)sense), 5460 scsi_sense_asc((uint8_t *)sense), 5461 scsi_sense_ascq((uint8_t *)sense)); 5462 } 5463 5464 if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST) 5465 rv = ENOTSUP; 5466 break; 5467 5468 case STATUS_BUSY: 5469 if (log_error) 5470 cmn_err(CE_NOTE, "\tDevice Busy\n"); 5471 break; 5472 5473 case STATUS_RESERVATION_CONFLICT: 5474 /* 5475 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then 5476 * a reservation conflict could be due to various reasons 5477 * (incorrect keys, not registered, not reserved, etc.), 5478 * so we should not panic in that case. 5479 */ 5480 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5481 if (vdc->failfast_interval != 0 && 5482 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN && 5483 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) { 5484 /* failfast is enabled so we have to panic */ 5485 (void) snprintf(panic_str, sizeof (panic_str), 5486 VDC_RESV_CONFLICT_FMT_STR "%s", 5487 ddi_pathname(vdc->dip, path_str)); 5488 panic(panic_str); 5489 } 5490 if (log_error) 5491 cmn_err(CE_NOTE, "\tReservation Conflict\n"); 5492 rv = EACCES; 5493 break; 5494 5495 case STATUS_QFULL: 5496 if (log_error) 5497 cmn_err(CE_NOTE, "\tQueue Full\n"); 5498 break; 5499 5500 case STATUS_MET: 5501 case STATUS_INTERMEDIATE: 5502 case STATUS_SCSI2: 5503 case STATUS_INTERMEDIATE_MET: 5504 case STATUS_ACA_ACTIVE: 5505 if (log_error) 5506 cmn_err(CE_CONT, 5507 "\tUnexpected SCSI status received: 0x%x\n", 5508 vd_scsi->cmd_status); 5509 break; 5510 5511 default: 5512 if (log_error) 5513 cmn_err(CE_CONT, 5514 "\tInvalid SCSI status received: 0x%x\n", 5515 vd_scsi->cmd_status); 5516 break; 5517 } 5518 5519 return (rv); 5520 } 5521 5522 /* 5523 * Implement the USCSICMD uscsi(7I) ioctl. This ioctl is converted to 5524 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI 5525 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is 5526 * converted to a VD_OP_RESET operation.
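 *
 * For example, a user-level reset request of the following shape
 * (illustrative sketch, not taken from this file) takes the
 * VD_OP_RESET path:
 *
 *	struct uscsi_cmd ucmd;
 *	bzero(&ucmd, sizeof (ucmd));
 *	ucmd.uscsi_flags = USCSI_RESET_ALL;
 *	(void) ioctl(fd, USCSICMD, &ucmd);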
5527 */ 5528 static int 5529 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) 5530 { 5531 struct uscsi_cmd uscsi; 5532 struct uscsi_cmd32 uscsi32; 5533 vd_scsi_t *vd_scsi; 5534 int vd_scsi_len; 5535 union scsi_cdb *cdb; 5536 struct scsi_extended_sense *sense; 5537 char *datain, *dataout; 5538 size_t cdb_len, datain_len, dataout_len, sense_len; 5539 int rv; 5540 5541 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5542 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), 5543 mode) != 0) 5544 return (EFAULT); 5545 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); 5546 } else { 5547 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), 5548 mode) != 0) 5549 return (EFAULT); 5550 } 5551 5552 /* a uscsi reset is converted to a VD_OP_RESET operation */ 5553 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | 5554 USCSI_RESET_ALL)) { 5555 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, 5556 (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); 5557 return (rv); 5558 } 5559 5560 /* cdb buffer length */ 5561 cdb_len = uscsi.uscsi_cdblen; 5562 5563 /* data in and out buffers length */ 5564 if (uscsi.uscsi_flags & USCSI_READ) { 5565 datain_len = uscsi.uscsi_buflen; 5566 dataout_len = 0; 5567 } else { 5568 datain_len = 0; 5569 dataout_len = uscsi.uscsi_buflen; 5570 } 5571 5572 /* sense buffer length */ 5573 if (uscsi.uscsi_flags & USCSI_RQENABLE) 5574 sense_len = uscsi.uscsi_rqlen; 5575 else 5576 sense_len = 0; 5577 5578 /* allocate buffer for the VD_SCSICMD_OP operation */ 5579 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5580 &vd_scsi_len); 5581 5582 /* 5583 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, 5584 * but basically they prevent a SCSI command from being retried in case 5585 * of an error. 
5586 */ 5587 if ((uscsi.uscsi_flags & USCSI_ISOLATE) || 5588 (uscsi.uscsi_flags & USCSI_DIAGNOSE)) 5589 vd_scsi->options |= VD_SCSI_OPT_NORETRY; 5590 5591 /* set task attribute */ 5592 if (uscsi.uscsi_flags & USCSI_NOTAG) { 5593 vd_scsi->task_attribute = 0; 5594 } else { 5595 if (uscsi.uscsi_flags & USCSI_HEAD) 5596 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5597 else if (uscsi.uscsi_flags & USCSI_HTAG) 5598 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; 5599 else if (uscsi.uscsi_flags & USCSI_OTAG) 5600 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; 5601 else 5602 vd_scsi->task_attribute = 0; 5603 } 5604 5605 /* set timeout */ 5606 vd_scsi->timeout = uscsi.uscsi_timeout; 5607 5608 /* copy-in cdb data */ 5609 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5610 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { 5611 rv = EFAULT; 5612 goto done; 5613 } 5614 5615 /* keep a pointer to the sense buffer */ 5616 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5617 5618 /* keep a pointer to the data-in buffer */ 5619 datain = (char *)VD_SCSI_DATA_IN(vd_scsi); 5620 5621 /* copy-in request data to the data-out buffer */ 5622 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); 5623 if (!(uscsi.uscsi_flags & USCSI_READ)) { 5624 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, 5625 mode)) { 5626 rv = EFAULT; 5627 goto done; 5628 } 5629 } 5630 5631 /* submit the request */ 5632 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5633 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5634 5635 if (rv != 0) 5636 goto done; 5637 5638 /* update scsi status */ 5639 uscsi.uscsi_status = vd_scsi->cmd_status; 5640 5641 /* update sense data */ 5642 if ((uscsi.uscsi_flags & USCSI_RQENABLE) && 5643 (uscsi.uscsi_status == STATUS_CHECK || 5644 uscsi.uscsi_status == STATUS_TERMINATED)) { 5645 5646 uscsi.uscsi_rqstatus = vd_scsi->sense_status; 5647 5648 if (uscsi.uscsi_rqstatus == STATUS_GOOD) { 5649 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - 5650 vd_scsi->sense_len; 5651 if (ddi_copyout(sense, uscsi.uscsi_rqbuf, 5652 vd_scsi->sense_len, mode) != 0) { 5653 rv = EFAULT; 5654 goto done; 5655 } 5656 } 5657 } 5658 5659 /* update request data */ 5660 if (uscsi.uscsi_status == STATUS_GOOD) { 5661 if (uscsi.uscsi_flags & USCSI_READ) { 5662 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5663 vd_scsi->datain_len; 5664 if (ddi_copyout(datain, uscsi.uscsi_bufaddr, 5665 vd_scsi->datain_len, mode) != 0) { 5666 rv = EFAULT; 5667 goto done; 5668 } 5669 } else { 5670 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5671 vd_scsi->dataout_len; 5672 } 5673 } 5674 5675 /* copy-out result */ 5676 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5677 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); 5678 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), 5679 mode) != 0) { 5680 rv = EFAULT; 5681 goto done; 5682 } 5683 } else { 5684 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), 5685 mode) != 0) { 5686 rv = EFAULT; 5687 goto done; 5688 } 5689 } 5690 5691 /* get the return code from the SCSI command status */ 5692 rv = vdc_scsi_status(vdc, vd_scsi, 5693 !(uscsi.uscsi_flags & USCSI_SILENT)); 5694 5695 done: 5696 kmem_free(vd_scsi, vd_scsi_len); 5697 return (rv); 5698 } 5699 5700 /* 5701 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. 5702 * 5703 * Arguments: 5704 * cmd - SCSI PERSISTENT IN command 5705 * len - length of the SCSI input buffer 5706 * vd_scsi_len - return the length of the allocated buffer 5707 * 5708 * Returned Value: 5709 * a pointer to the allocated VD_OP_SCSICMD buffer. 
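 *
 * Usage sketch (this mirrors vdc_mhd_inkeys() below):
 *
 *	vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS,
 *	    datain_len, &vd_scsi_len);
 *	(issue with VD_OP_SCSICMD via vdc_do_sync_op(), then)
 *	kmem_free(vd_scsi, vd_scsi_len);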
5710 */ 5711 static vd_scsi_t * 5712 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len) 5713 { 5714 int cdb_len, sense_len, datain_len, dataout_len; 5715 vd_scsi_t *vd_scsi; 5716 union scsi_cdb *cdb; 5717 5718 cdb_len = CDB_GROUP1; 5719 sense_len = sizeof (struct scsi_extended_sense); 5720 datain_len = len; 5721 dataout_len = 0; 5722 5723 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5724 vd_scsi_len); 5725 5726 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5727 5728 /* set cdb */ 5729 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN; 5730 cdb->cdb_opaque[1] = cmd; 5731 FORMG1COUNT(cdb, datain_len); 5732 5733 vd_scsi->timeout = vdc_scsi_timeout; 5734 5735 return (vd_scsi); 5736 } 5737 5738 /* 5739 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command. 5740 * 5741 * Arguments: 5742 * cmd - SCSI PERSISTENT OUT command 5743 * len - length of the SCSI output buffer 5744 * vd_scsi_len - return the length of the allocated buffer 5745 * 5746 * Returned Value: 5747 * a pointer to the allocated VD_OP_SCSICMD buffer. 5748 */ 5749 static vd_scsi_t * 5750 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len) 5751 { 5752 int cdb_len, sense_len, datain_len, dataout_len; 5753 vd_scsi_t *vd_scsi; 5754 union scsi_cdb *cdb; 5755 5756 cdb_len = CDB_GROUP1; 5757 sense_len = sizeof (struct scsi_extended_sense); 5758 datain_len = 0; 5759 dataout_len = len; 5760 5761 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5762 vd_scsi_len); 5763 5764 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5765 5766 /* set cdb */ 5767 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT; 5768 cdb->cdb_opaque[1] = cmd; 5769 FORMG1COUNT(cdb, dataout_len); 5770 5771 vd_scsi->timeout = vdc_scsi_timeout; 5772 5773 return (vd_scsi); 5774 } 5775 5776 /* 5777 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted 5778 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk 5779 * server with a VD_OP_SCSICMD operation.
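 *
 * The ioctl argument is a two-level structure; both levels are
 * copied in before the request is built (see the code below):
 *
 *	mhioc_inkeys_t { generation, li }
 *	    li ---> mhioc_key_list_t { listsize, listlen, list }
 *	                list ---> array of mhioc_resv_key_t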
5780 */ 5781 static int 5782 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5783 { 5784 vd_scsi_t *vd_scsi; 5785 mhioc_inkeys_t inkeys; 5786 mhioc_key_list_t klist; 5787 struct mhioc_inkeys32 inkeys32; 5788 struct mhioc_key_list32 klist32; 5789 sd_prin_readkeys_t *scsi_keys; 5790 void *user_keys; 5791 int vd_scsi_len; 5792 int listsize, listlen, rv; 5793 5794 /* copyin arguments */ 5795 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5796 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5797 if (rv != 0) 5798 return (EFAULT); 5799 5800 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5801 sizeof (klist32), mode); 5802 if (rv != 0) 5803 return (EFAULT); 5804 5805 listsize = klist32.listsize; 5806 } else { 5807 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5808 if (rv != 0) 5809 return (EFAULT); 5810 5811 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5812 if (rv != 0) 5813 return (EFAULT); 5814 5815 listsize = klist.listsize; 5816 } 5817 5818 /* build SCSI VD_OP request */ 5819 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5820 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5821 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5822 5823 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5824 5825 /* submit the request */ 5826 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5827 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5828 5829 if (rv != 0) 5830 goto done; 5831 5832 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5833 5834 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5835 inkeys32.generation = scsi_keys->generation; 5836 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5837 if (rv != 0) { 5838 rv = EFAULT; 5839 goto done; 5840 } 5841 5842 klist32.listlen = listlen; 5843 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5844 sizeof (klist32), mode); 5845 if (rv != 0) { 5846 rv = EFAULT; 5847 goto done; 5848 } 5849 5850 user_keys = (caddr_t)(uintptr_t)klist32.list; 5851 } else { 5852 inkeys.generation = scsi_keys->generation; 5853 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5854 if (rv != 0) { 5855 rv = EFAULT; 5856 goto done; 5857 } 5858 5859 klist.listlen = listlen; 5860 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5861 if (rv != 0) { 5862 rv = EFAULT; 5863 goto done; 5864 } 5865 5866 user_keys = klist.list; 5867 } 5868 5869 /* copy out keys */ 5870 if (listlen > 0 && listsize > 0) { 5871 if (listsize < listlen) 5872 listlen = listsize; 5873 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5874 listlen * MHIOC_RESV_KEY_SIZE, mode); 5875 if (rv != 0) 5876 rv = EFAULT; 5877 } 5878 5879 if (rv == 0) 5880 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5881 5882 done: 5883 kmem_free(vd_scsi, vd_scsi_len); 5884 5885 return (rv); 5886 } 5887 5888 /* 5889 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5890 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5891 * the vdisk server with a VD_OP_SCSICMD operation. 
5892 */ 5893 static int 5894 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5895 { 5896 vd_scsi_t *vd_scsi; 5897 mhioc_inresvs_t inresv; 5898 mhioc_resv_desc_list_t rlist; 5899 struct mhioc_inresvs32 inresv32; 5900 struct mhioc_resv_desc_list32 rlist32; 5901 mhioc_resv_desc_t mhd_resv; 5902 sd_prin_readresv_t *scsi_resv; 5903 sd_readresv_desc_t *resv; 5904 mhioc_resv_desc_t *user_resv; 5905 int vd_scsi_len; 5906 int listsize, listlen, i, rv; 5907 5908 /* copyin arguments */ 5909 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5910 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5911 if (rv != 0) 5912 return (EFAULT); 5913 5914 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5915 sizeof (rlist32), mode); 5916 if (rv != 0) 5917 return (EFAULT); 5918 5919 listsize = rlist32.listsize; 5920 } else { 5921 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5922 if (rv != 0) 5923 return (EFAULT); 5924 5925 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5926 if (rv != 0) 5927 return (EFAULT); 5928 5929 listsize = rlist.listsize; 5930 } 5931 5932 /* build SCSI VD_OP request */ 5933 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5934 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5935 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5936 5937 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5938 5939 /* submit the request */ 5940 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5941 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5942 5943 if (rv != 0) 5944 goto done; 5945 5946 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5947 5948 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5949 inresv32.generation = scsi_resv->generation; 5950 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5951 if (rv != 0) { 5952 rv = EFAULT; 5953 goto done; 5954 } 5955 5956 rlist32.listlen = listlen; 5957 rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, 5958 sizeof (rlist32), mode); 5959 if (rv != 0) { 5960 rv = EFAULT; 5961 goto done; 5962 } 5963 5964 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 5965 } else { 5966 inresv.generation = scsi_resv->generation; 5967 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 5968 if (rv != 0) { 5969 rv = EFAULT; 5970 goto done; 5971 } 5972 5973 rlist.listlen = listlen; 5974 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 5975 if (rv != 0) { 5976 rv = EFAULT; 5977 goto done; 5978 } 5979 5980 user_resv = rlist.list; 5981 } 5982 5983 /* copy out reservations */ 5984 if (listsize > 0 && listlen > 0) { 5985 if (listsize < listlen) 5986 listlen = listsize; 5987 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 5988 5989 for (i = 0; i < listlen; i++) { 5990 mhd_resv.type = resv->type; 5991 mhd_resv.scope = resv->scope; 5992 mhd_resv.scope_specific_addr = 5993 BE_32(resv->scope_specific_addr); 5994 bcopy(&resv->resvkey, &mhd_resv.key, 5995 MHIOC_RESV_KEY_SIZE); 5996 5997 rv = ddi_copyout(&mhd_resv, user_resv, 5998 sizeof (mhd_resv), mode); 5999 if (rv != 0) { 6000 rv = EFAULT; 6001 goto done; 6002 } 6003 resv++; 6004 user_resv++; 6005 } 6006 } 6007 6008 if (rv == 0) 6009 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6010 6011 done: 6012 kmem_free(vd_scsi, vd_scsi_len); 6013 return (rv); 6014 } 6015 6016 /* 6017 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 6018 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 6019 * server with a VD_OP_SCSICMD operation. 
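 *
 * Illustrative user-level usage (hypothetical sketch; 'fd' and
 * 'mykey' are not from this driver):
 *
 *	mhioc_register_t r;
 *	bzero(&r, sizeof (r));
 *	bcopy(mykey, r.newkey.key, MHIOC_RESV_KEY_SIZE);
 *	(void) ioctl(fd, MHIOCGRP_REGISTER, &r);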
6020 */ 6021 static int 6022 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 6023 { 6024 vd_scsi_t *vd_scsi; 6025 sd_prout_t *scsi_prout; 6026 mhioc_register_t mhd_reg; 6027 int vd_scsi_len, rv; 6028 6029 /* copyin arguments */ 6030 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 6031 if (rv != 0) 6032 return (EFAULT); 6033 6034 /* build SCSI VD_OP request */ 6035 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 6036 sizeof (sd_prout_t), &vd_scsi_len); 6037 6038 /* set parameters */ 6039 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6040 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6041 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 6042 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 6043 6044 /* submit the request */ 6045 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6046 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6047 6048 if (rv == 0) 6049 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6050 6051 kmem_free(vd_scsi, vd_scsi_len); 6052 return (rv); 6053 } 6054 6055 /* 6056 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 6057 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 6058 * server with a VD_OP_SCSICMD operation. 6059 */ 6060 static int 6061 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 6062 { 6063 union scsi_cdb *cdb; 6064 vd_scsi_t *vd_scsi; 6065 sd_prout_t *scsi_prout; 6066 mhioc_resv_desc_t mhd_resv; 6067 int vd_scsi_len, rv; 6068 6069 /* copyin arguments */ 6070 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 6071 if (rv != 0) 6072 return (EFAULT); 6073 6074 /* build SCSI VD_OP request */ 6075 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 6076 sizeof (sd_prout_t), &vd_scsi_len); 6077 6078 /* set parameters */ 6079 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6080 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6081 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6082 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 6083 cdb->cdb_opaque[2] = mhd_resv.type; 6084 6085 /* submit the request */ 6086 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6087 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6088 6089 if (rv == 0) 6090 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6091 6092 kmem_free(vd_scsi, vd_scsi_len); 6093 return (rv); 6094 } 6095 6096 /* 6097 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 6098 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 6099 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
6100  */
6101 static int
6102 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode)
6103 {
6104 	union scsi_cdb *cdb;
6105 	vd_scsi_t *vd_scsi;
6106 	sd_prout_t *scsi_prout;
6107 	mhioc_preemptandabort_t mhd_preempt;
6108 	int vd_scsi_len, rv;
6109
6110 	/* copyin arguments */
6111 	rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode);
6112 	if (rv != 0)
6113 		return (EFAULT);
6114
6115 	/* build SCSI VD_OP request */
6116 	vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT,
6117 	    sizeof (sd_prout_t), &vd_scsi_len);
6118
6119 	/* set parameters */
6120 	vd_scsi->task_attribute = VD_SCSI_TASK_ACA;
6121 	cdb = VD_SCSI_DATA_CDB(vd_scsi);
6122 	scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi);
6123 	bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key,
6124 	    MHIOC_RESV_KEY_SIZE);
6125 	bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key,
6126 	    MHIOC_RESV_KEY_SIZE);
6127 	scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr;
6128 	cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type;
6129
6130 	/* submit the request */
6131 	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
6132 	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
6133
6134 	if (rv == 0)
6135 		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
6136
6137 	kmem_free(vd_scsi, vd_scsi_len);
6138 	return (rv);
6139 }
6140
6141 /*
6142  * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl
6143  * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY
6144  * command which is sent to the vdisk server with a VD_OP_SCSICMD operation.
6145  */
6146 static int
6147 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode)
6148 {
6149 	vd_scsi_t *vd_scsi;
6150 	sd_prout_t *scsi_prout;
6151 	mhioc_registerandignorekey_t mhd_regi;
6152 	int vd_scsi_len, rv;
6153
6154 	/* copyin arguments */
6155 	rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode);
6156 	if (rv != 0)
6157 		return (EFAULT);
6158
6159 	/* build SCSI VD_OP request */
6160 	vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY,
6161 	    sizeof (sd_prout_t), &vd_scsi_len);
6162
6163 	/* set parameters */
6164 	scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi);
6165 	bcopy(mhd_regi.newkey.key, scsi_prout->service_key,
6166 	    MHIOC_RESV_KEY_SIZE);
6167 	scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl;
6168
6169 	/* submit the request */
6170 	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
6171 	    0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
6172
6173 	if (rv == 0)
6174 		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
6175
6176 	kmem_free(vd_scsi, vd_scsi_len);
6177 	return (rv);
6178 }
6179
6180 /*
6181  * This function is used by the failfast mechanism to send a SCSI command
6182  * to check for reservation conflict.
6183  */
6184 static int
6185 vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd)
6186 {
6187 	int cdb_len, sense_len, vd_scsi_len;
6188 	vd_scsi_t *vd_scsi;
6189 	union scsi_cdb *cdb;
6190 	int rv;
6191
6192 	ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1);
6193
6194 	if (scmd == SCMD_WRITE_G1)
6195 		cdb_len = CDB_GROUP1;
6196 	else
6197 		cdb_len = CDB_GROUP0;
6198
6199 	sense_len = sizeof (struct scsi_extended_sense);
6200
6201 	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len);
6202
6203 	/* set cdb */
6204 	cdb = VD_SCSI_DATA_CDB(vd_scsi);
6205 	cdb->scc_cmd = scmd;
6206
6207 	vd_scsi->timeout = vdc_scsi_timeout;
6208
6209 	/*
6210 	 * Submit the request. The last argument has to be B_FALSE so that
6211 	 * vdc_do_sync_op does not loop checking for reservation conflict if
6212 	 * the operation returns an error.
6213 	 */
6214 	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
6215 	    0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE);
6216
6217 	if (rv == 0)
6218 		(void) vdc_scsi_status(vdc, vd_scsi, B_FALSE);
6219
6220 	kmem_free(vd_scsi, vd_scsi_len);
6221 	return (rv);
6222 }
6223
6224 /*
6225  * This function is used by the failfast mechanism to check for reservation
6226  * conflict. It sends SCSI commands which will fail with a reservation
6227  * conflict error if the system does not have access to the disk; such a
6228  * failure will panic the system.
6229  *
6230  * Return Code:
6231  *	0 - disk is accessible without reservation conflict error
6232  *	!= 0 - unable to check if disk is accessible
6233  */
6234 int
6235 vdc_failfast_check_resv(vdc_t *vdc)
6236 {
6237 	int failure = 0;
6238
6239 	/*
6240 	 * Send a TEST UNIT READY command. The command will panic
6241 	 * the system if it fails with a reservation conflict.
6242 	 */
6243 	if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0)
6244 		failure++;
6245
6246 	/*
6247 	 * With SPC-3 compliant devices TEST UNIT READY will succeed on
6248 	 * a reserved device, so we also do a zero-byte WRITE(10) in
6249 	 * order to provoke a Reservation Conflict status on those newer
6250 	 * devices.
6251 	 */
6252 	if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0)
6253 		failure++;
6254
6255 	return (failure);
6256 }
6257
6258 /*
6259  * Add a pending I/O to the failfast I/O queue. An I/O is added to this
6260  * queue when it has failed and failfast is enabled. We then have to check
6261  * whether it failed because of a reservation conflict, in which case we
6262  * have to panic the system.
6263  *
6264  * Async I/Os should be queued with their block I/O data transfer structure
6265  * (buf). Sync I/Os should be queued with buf = NULL.
6266  */
6267 static vdc_io_t *
6268 vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf)
6269 {
6270 	vdc_io_t *vio;
6271
6272 	ASSERT(MUTEX_HELD(&vdc->lock));
6273
6274 	vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP);
6275 	vio->vio_next = vdc->failfast_io_queue;
6276 	vio->vio_buf = buf;
6277 	vio->vio_qtime = ddi_get_lbolt();
6278
6279 	vdc->failfast_io_queue = vio;
6280
6281 	/* notify the failfast thread that a new I/O is queued */
6282 	cv_signal(&vdc->failfast_cv);
6283
6284 	return (vio);
6285 }
6286
6287 /*
6288  * Remove and complete I/Os in the failfast I/O queue which have been
6289  * queued at or before the indicated deadline. A deadline of 0 means that
6290  * all I/Os have to be unqueued and marked as completed.
6291  */
6292 static void
6293 vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline)
6294 {
6295 	vdc_io_t *vio, *vio_tmp;
6296
6297 	ASSERT(MUTEX_HELD(&vdc->lock));
6298
6299 	vio_tmp = NULL;
6300 	vio = vdc->failfast_io_queue;
6301
6302 	if (deadline != 0) {
6303 		/*
6304 		 * Skip any io queued after the deadline. The failfast
6305 		 * I/O queue is ordered starting with the last I/O added
6306 		 * to the queue.
6307 		 */
6308 		while (vio != NULL && vio->vio_qtime > deadline) {
6309 			vio_tmp = vio;
6310 			vio = vio->vio_next;
6311 		}
6312 	}
6313
6314 	if (vio == NULL)
6315 		/* nothing to unqueue */
6316 		return;
6317
6318 	/* update the queue */
6319 	if (vio_tmp == NULL)
6320 		vdc->failfast_io_queue = NULL;
6321 	else
6322 		vio_tmp->vio_next = NULL;
6323
6324 	/*
6325 	 * Complete unqueued I/Os. Async I/Os have a block I/O data transfer
6326 	 * structure (buf) and they are completed by calling biodone(). Sync
6327 	 * I/Os do not have a buf and they are completed by setting the
6328 	 * vio_qtime to zero and signaling failfast_io_cv. In that case, the
6329 	 * thread waiting for the I/O to complete is responsible for freeing
6330 	 * the vio structure.
6331 	 */
6332 	while (vio != NULL) {
6333 		vio_tmp = vio->vio_next;
6334 		if (vio->vio_buf != NULL) {
6335 			VD_KSTAT_RUNQ_EXIT(vdc);
6336 			DTRACE_IO1(done, buf_t *, vio->vio_buf);
6337 			biodone(vio->vio_buf);
6338 			kmem_free(vio, sizeof (vdc_io_t));
6339 		} else {
6340 			vio->vio_qtime = 0;
6341 		}
6342 		vio = vio_tmp;
6343 	}
6344
6345 	cv_broadcast(&vdc->failfast_io_cv);
6346 }
6347
6348 /*
6349  * Failfast Thread.
6350  *
6351  * While failfast is enabled, the failfast thread sends TEST UNIT READY
6352  * and zero-size WRITE(10) SCSI commands on a regular basis to check that
6353  * we still have access to the disk. If a command fails with a RESERVATION
6354  * CONFLICT error then the system will immediately panic.
6355  *
6356  * The failfast thread is also woken up when an I/O has failed. It then
6357  * checks the access to the disk to ensure that the I/O failure was not
6358  * due to a reservation conflict.
6359  *
6360  * There is one failfast thread for each virtual disk for which failfast is
6361  * enabled. We could have only one thread sending requests for all disks
6362  * but this would require vdc to send asynchronous requests and to have
6363  * callbacks to process replies.
6364  */
6365 static void
6366 vdc_failfast_thread(void *arg)
6367 {
6368 	int status;
6369 	vdc_t *vdc = (vdc_t *)arg;
6370 	clock_t timeout, starttime;
6371
6372 	mutex_enter(&vdc->lock);
6373
6374 	while (vdc->failfast_interval != 0) {
6375
6376 		starttime = ddi_get_lbolt();
6377
6378 		mutex_exit(&vdc->lock);
6379
6380 		/* check for reservation conflict */
6381 		status = vdc_failfast_check_resv(vdc);
6382
6383 		mutex_enter(&vdc->lock);
6384 		/*
6385 		 * We have dropped the lock to send the SCSI command so we have
6386 		 * to check that failfast is still enabled.
6387 		 */
6388 		if (vdc->failfast_interval == 0)
6389 			break;
6390
6391 		/*
6392 		 * If we have successfully checked the disk access and there
6393 		 * was no reservation conflict then we can complete any I/O
6394 		 * queued before the last check.
6395 		 */
6396 		if (status == 0)
6397 			vdc_failfast_io_unqueue(vdc, starttime);
6398
6399 		/* proceed again if some I/Os are still in the queue */
6400 		if (vdc->failfast_io_queue != NULL)
6401 			continue;
6402
6403 		timeout = ddi_get_lbolt() +
6404 		    drv_usectohz(vdc->failfast_interval);
6405 		(void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout);
6406 	}
6407
6408 	/*
6409 	 * Failfast is being stopped so we can complete any queued I/O.
6410 	 */
6411 	vdc_failfast_io_unqueue(vdc, 0);
6412 	vdc->failfast_thread = NULL;
6413 	mutex_exit(&vdc->lock);
6414 	thread_exit();
6415 }
6416
6417 /*
6418  * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
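 *
 * The argument is the failfast check interval in milliseconds; a value
 * of zero disables the mechanism again. A minimal, hypothetical
 * userland sketch, assuming fd is an open descriptor for the virtual
 * disk:
 *
 *	unsigned int mh_time = 1000;
 *	if (ioctl(fd, MHIOCENFAILFAST, &mh_time) != 0)
 *		perror("MHIOCENFAILFAST");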
6419  */
6420 static int
6421 vdc_failfast(vdc_t *vdc, caddr_t arg, int mode)
6422 {
6423 	unsigned int mh_time;
6424
6425 	if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode))
6426 		return (EFAULT);
6427
6428 	mutex_enter(&vdc->lock);
6429 	if (mh_time != 0 && vdc->failfast_thread == NULL) {
6430 		vdc->failfast_thread = thread_create(NULL, 0,
6431 		    vdc_failfast_thread, vdc, 0, &p0, TS_RUN,
6432 		    v.v_maxsyspri - 2);
6433 	}
6434
6435 	vdc->failfast_interval = mh_time * 1000;
6436 	cv_signal(&vdc->failfast_cv);
6437 	mutex_exit(&vdc->lock);
6438
6439 	return (0);
6440 }
6441
6442 /*
6443  * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are
6444  * converted to VD_OP_SET_ACCESS operations.
6445  */
6446 static int
6447 vdc_access_set(vdc_t *vdc, uint64_t flags, int mode)
6448 {
6449 	int rv;
6450
6451 	/* submit ownership command request */
6452 	rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags,
6453 	    sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
6454 	    VIO_both_dir, B_TRUE);
6455
6456 	return (rv);
6457 }
6458
6459 /*
6460  * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a
6461  * VD_OP_GET_ACCESS operation.
6462  */
6463 static int
6464 vdc_access_get(vdc_t *vdc, uint64_t *status, int mode)
6465 {
6466 	int rv;
6467
6468 	/* submit ownership command request */
6469 	rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status,
6470 	    sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
6471 	    VIO_both_dir, B_TRUE);
6472
6473 	return (rv);
6474 }
6475
6476 /*
6477  * Disk Ownership Thread.
6478  *
6479  * When we have taken the ownership of a disk, this thread waits to be
6480  * notified when the LDC channel is reset so that it can recover the
6481  * ownership.
6482  *
6483  * Note that the thread handling the LDC reset (vdc_process_msg_thread())
6484  * cannot be used to do the ownership recovery because it has to be
6485  * running to handle the reply message to the ownership operation.
6486  */
6487 static void
6488 vdc_ownership_thread(void *arg)
6489 {
6490 	vdc_t *vdc = (vdc_t *)arg;
6491 	clock_t timeout;
6492 	uint64_t status;
6493
6494 	mutex_enter(&vdc->ownership_lock);
6495 	mutex_enter(&vdc->lock);
6496
6497 	while (vdc->ownership & VDC_OWNERSHIP_WANTED) {
6498
6499 		if ((vdc->ownership & VDC_OWNERSHIP_RESET) ||
6500 		    !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) {
6501 			/*
6502 			 * There was a reset so the ownership has been lost,
6503 			 * try to recover. We do this without using the preempt
6504 			 * option so that we don't steal the ownership from
6505 			 * someone who has preempted us.
6506 			 */
6507 			DMSG(vdc, 0, "[%d] Ownership lost, recovering",
6508 			    vdc->instance);
6509
6510 			vdc->ownership &= ~(VDC_OWNERSHIP_RESET |
6511 			    VDC_OWNERSHIP_GRANTED);
6512
6513 			mutex_exit(&vdc->lock);
6514
6515 			status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
6516 			    VD_ACCESS_SET_PRESERVE, FKIOCTL);
6517
6518 			mutex_enter(&vdc->lock);
6519
6520 			if (status == 0) {
6521 				DMSG(vdc, 0, "[%d] Ownership recovered",
6522 				    vdc->instance);
6523 				vdc->ownership |= VDC_OWNERSHIP_GRANTED;
6524 			} else {
6525 				DMSG(vdc, 0, "[%d] Failed to recover ownership",
6526 				    vdc->instance);
6527 			}
6528
6529 		}
6530
6531 		/*
6532 		 * If we have the ownership then we just wait for an event
6533 		 * to happen (LDC reset), otherwise we will retry to recover
6534 		 * after a delay.
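		 * A timeout of zero is used below to mean "wait with no
		 * deadline" (cv_wait); otherwise the thread retries the
		 * recovery once vdc_ownership_delay has elapsed.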
6535 		 */
6536 		if (vdc->ownership & VDC_OWNERSHIP_GRANTED)
6537 			timeout = 0;
6538 		else
6539 			timeout = ddi_get_lbolt() +
6540 			    drv_usectohz(vdc_ownership_delay);
6541
6542 		/* Release the ownership_lock and wait on the vdc lock */
6543 		mutex_exit(&vdc->ownership_lock);
6544
6545 		if (timeout == 0)
6546 			(void) cv_wait(&vdc->ownership_cv, &vdc->lock);
6547 		else
6548 			(void) cv_timedwait(&vdc->ownership_cv,
6549 			    &vdc->lock, timeout);
6550
6551 		mutex_exit(&vdc->lock);
6552
6553 		mutex_enter(&vdc->ownership_lock);
6554 		mutex_enter(&vdc->lock);
6555 	}
6556
6557 	vdc->ownership_thread = NULL;
6558 	mutex_exit(&vdc->lock);
6559 	mutex_exit(&vdc->ownership_lock);
6560
6561 	thread_exit();
6562 }
6563
6564 static void
6565 vdc_ownership_update(vdc_t *vdc, int ownership_flags)
6566 {
6567 	ASSERT(MUTEX_HELD(&vdc->ownership_lock));
6568
6569 	mutex_enter(&vdc->lock);
6570 	vdc->ownership = ownership_flags;
6571 	if ((vdc->ownership & VDC_OWNERSHIP_WANTED) &&
6572 	    vdc->ownership_thread == NULL) {
6573 		/* start ownership thread */
6574 		vdc->ownership_thread = thread_create(NULL, 0,
6575 		    vdc_ownership_thread, vdc, 0, &p0, TS_RUN,
6576 		    v.v_maxsyspri - 2);
6577 	} else {
6578 		/* notify the ownership thread */
6579 		cv_signal(&vdc->ownership_cv);
6580 	}
6581 	mutex_exit(&vdc->lock);
6582 }
6583
6584 /*
6585  * Get the size and the block size of a virtual disk from the vdisk server.
6586  * We need to use this operation when the vdisk_size attribute was not
6587  * available during the handshake with the vdisk server.
6588  */
6589 static int
6590 vdc_check_capacity(vdc_t *vdc)
6591 {
6592 	int rv = 0;
6593 	size_t alloc_len;
6594 	vd_capacity_t *vd_cap;
6595
6596 	if (vdc->vdisk_size != 0)
6597 		return (0);
6598
6599 	alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t));
6600
6601 	vd_cap = kmem_zalloc(alloc_len, KM_SLEEP);
6602
6603 	rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len,
6604 	    0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE);
6605
6606 	if (rv == 0) {
6607 		if (vd_cap->vdisk_block_size != vdc->block_size ||
6608 		    vd_cap->vdisk_size == VD_SIZE_UNKNOWN ||
6609 		    vd_cap->vdisk_size == 0)
6610 			rv = EINVAL;
6611 		else
6612 			vdc->vdisk_size = vd_cap->vdisk_size;
6613 	}
6614
6615 	kmem_free(vd_cap, alloc_len);
6616 	return (rv);
6617 }
6618
6619 /*
6620  * This structure is used in the DKIO(7I) array below.
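 * Each entry pairs a Solaris ioctl with the vDisk operation it maps to
 * and a conversion function. For example, the DKIOCGVTOC entry below
 * maps to VD_OP_GET_VTOC and uses vdc_get_vtoc_convert() to translate
 * between the on-the-wire vd_vtoc_t and the Solaris struct vtoc.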
6621  */
6622 typedef struct vdc_dk_ioctl {
6623 	uint8_t op;		/* VD_OP_XXX value */
6624 	int cmd;		/* Solaris ioctl operation number */
6625 	size_t nbytes;		/* size of structure to be copied */
6626
6627 	/* function to convert between vDisk and Solaris structure formats */
6628 	int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg,
6629 	    int mode, int dir);
6630 } vdc_dk_ioctl_t;
6631
6632 /*
6633  * Subset of DKIO(7I) operations currently supported
6634  */
6635 static vdc_dk_ioctl_t dk_ioctl[] = {
6636 	{VD_OP_FLUSH, DKIOCFLUSHWRITECACHE, 0,
6637 	    vdc_null_copy_func},
6638 	{VD_OP_GET_WCE, DKIOCGETWCE, sizeof (int),
6639 	    vdc_get_wce_convert},
6640 	{VD_OP_SET_WCE, DKIOCSETWCE, sizeof (int),
6641 	    vdc_set_wce_convert},
6642 	{VD_OP_GET_VTOC, DKIOCGVTOC, sizeof (vd_vtoc_t),
6643 	    vdc_get_vtoc_convert},
6644 	{VD_OP_SET_VTOC, DKIOCSVTOC, sizeof (vd_vtoc_t),
6645 	    vdc_set_vtoc_convert},
6646 	{VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t),
6647 	    vdc_get_geom_convert},
6648 	{VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, sizeof (vd_geom_t),
6649 	    vdc_get_geom_convert},
6650 	{VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t),
6651 	    vdc_get_geom_convert},
6652 	{VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t),
6653 	    vdc_set_geom_convert},
6654 	{VD_OP_GET_EFI, DKIOCGETEFI, 0,
6655 	    vdc_get_efi_convert},
6656 	{VD_OP_SET_EFI, DKIOCSETEFI, 0,
6657 	    vdc_set_efi_convert},
6658
6659 	/* DIOCTL_RWCMD is converted to a read or a write */
6660 	{0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL},
6661
6662 	/* mhd(7I) non-shared multihost disks ioctls */
6663 	{0, MHIOCTKOWN, 0, vdc_null_copy_func},
6664 	{0, MHIOCRELEASE, 0, vdc_null_copy_func},
6665 	{0, MHIOCSTATUS, 0, vdc_null_copy_func},
6666 	{0, MHIOCQRESERVE, 0, vdc_null_copy_func},
6667
6668 	/* mhd(7I) shared multihost disks ioctls */
6669 	{0, MHIOCGRP_INKEYS, 0, vdc_null_copy_func},
6670 	{0, MHIOCGRP_INRESV, 0, vdc_null_copy_func},
6671 	{0, MHIOCGRP_REGISTER, 0, vdc_null_copy_func},
6672 	{0, MHIOCGRP_RESERVE, 0, vdc_null_copy_func},
6673 	{0, MHIOCGRP_PREEMPTANDABORT, 0, vdc_null_copy_func},
6674 	{0, MHIOCGRP_REGISTERANDIGNOREKEY, 0, vdc_null_copy_func},
6675
6676 	/* mhd(7I) failfast ioctl */
6677 	{0, MHIOCENFAILFAST, 0, vdc_null_copy_func},
6678
6679 	/*
6680 	 * These particular ioctls are not sent to the server - vdc fakes up
6681 	 * the necessary info.
6682 	 */
6683 	{0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func},
6684 	{0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func},
6685 	{0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func},
6686 	{0, DKIOCPARTITION, 0, vdc_null_copy_func},
6687 	{0, DKIOCGAPART, 0, vdc_null_copy_func},
6688 	{0, DKIOCREMOVABLE, 0, vdc_null_copy_func},
6689 	{0, CDROMREADOFFSET, 0, vdc_null_copy_func}
6690 };
6691
6692 /*
6693  * This function handles ioctl requests from the vd_efi_alloc_and_read()
6694  * function and forwards them to the vdisk.
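 * It fakes up a dev_t for slice 0 of this instance and issues the
 * request through vd_process_ioctl() with FKIOCTL set since the
 * request originates inside the kernel.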
6695  */
6696 static int
6697 vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg)
6698 {
6699 	vdc_t *vdc = (vdc_t *)vdisk;
6700 	dev_t dev;
6701 	int rval;
6702
6703 	dev = makedevice(ddi_driver_major(vdc->dip),
6704 	    VD_MAKE_DEV(vdc->instance, 0));
6705
6706 	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval));
6707 }
6708
6709 /*
6710  * Function:
6711  *	vd_process_ioctl()
6712  *
6713  * Description:
6714  *	This routine processes disk-specific ioctl calls.
6715  *
6716  * Arguments:
6717  *	dev	- the device number
6718  *	cmd	- the operation [dkio(7I)] to be processed
6719  *	arg	- pointer to user-provided structure
6720  *		  (contains data to be set or reference parameter for get)
6721  *	mode	- bit flag, indicating open settings, 32/64 bit type, etc
6722  *	rvalp	- pointer to return value for calling process.
6723  *
6724  * Return Code:
6725  *	0
6726  *	EFAULT
6727  *	ENXIO
6728  *	EIO
6729  *	ENOTSUP
6730  */
6731 static int
6732 vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp)
6733 {
6734 	int instance = VDCUNIT(dev);
6735 	vdc_t *vdc = NULL;
6736 	int rv = -1;
6737 	int idx = 0;		/* index into dk_ioctl[] */
6738 	size_t len = 0;		/* #bytes to send to vds */
6739 	size_t alloc_len = 0;	/* #bytes to allocate mem for */
6740 	caddr_t mem_p = NULL;
6741 	size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0]));
6742 	vdc_dk_ioctl_t *iop;
6743
6744 	vdc = ddi_get_soft_state(vdc_state, instance);
6745 	if (vdc == NULL) {
6746 		cmn_err(CE_NOTE, "![%d] Could not get soft state structure",
6747 		    instance);
6748 		return (ENXIO);
6749 	}
6750
6751 	DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n",
6752 	    instance, cmd, dev, ddi_model_convert_from(mode & FMODELS));
6753
6754 	if (rvalp != NULL) {
6755 		/* the return value of the ioctl is 0 by default */
6756 		*rvalp = 0;
6757 	}
6758
6759 	/*
6760 	 * Validate the ioctl operation to be performed.
6761 	 *
6762 	 * If we have looped through the array without finding a match then we
6763 	 * don't support this ioctl.
6764 	 */
6765 	for (idx = 0; idx < nioctls; idx++) {
6766 		if (cmd == dk_ioctl[idx].cmd)
6767 			break;
6768 	}
6769
6770 	if (idx >= nioctls) {
6771 		DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n",
6772 		    vdc->instance, cmd);
6773 		return (ENOTSUP);
6774 	}
6775
6776 	iop = &(dk_ioctl[idx]);
6777
6778 	if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) {
6779 		/* size is not fixed for EFI ioctls, it depends on ioctl arg */
6780 		dk_efi_t dk_efi;
6781
6782 		rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode);
6783 		if (rv != 0)
6784 			return (EFAULT);
6785
6786 		len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length;
6787 	} else {
6788 		len = iop->nbytes;
6789 	}
6790
6791 	/* check if the ioctl is applicable */
6792 	switch (cmd) {
6793 	case CDROMREADOFFSET:
6794 	case DKIOCREMOVABLE:
6795 		return (ENOTTY);
6796
6797 	case USCSICMD:
6798 	case MHIOCTKOWN:
6799 	case MHIOCSTATUS:
6800 	case MHIOCQRESERVE:
6801 	case MHIOCRELEASE:
6802 	case MHIOCGRP_INKEYS:
6803 	case MHIOCGRP_INRESV:
6804 	case MHIOCGRP_REGISTER:
6805 	case MHIOCGRP_RESERVE:
6806 	case MHIOCGRP_PREEMPTANDABORT:
6807 	case MHIOCGRP_REGISTERANDIGNOREKEY:
6808 	case MHIOCENFAILFAST:
6809 		if (vdc->cinfo == NULL)
6810 			return (ENXIO);
6811 		if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS)
6812 			return (ENOTTY);
6813 		break;
6814
6815 	case DIOCTL_RWCMD:
6816 		if (vdc->cinfo == NULL)
6817 			return (ENXIO);
6818 		if (vdc->cinfo->dki_ctype != DKC_DIRECT)
6819 			return (ENOTTY);
6820 		break;
6821
6822 	case DKIOCINFO:
6823 		if (vdc->cinfo == NULL)
6824 			return (ENXIO);
6825 		break;
6826
6827 	case DKIOCGMEDIAINFO:
6828 		if (vdc->minfo == NULL)
6829 			return (ENXIO);
6830 		if (vdc_check_capacity(vdc) != 0)
6831 			/* disk capacity is not available */
6832 			return (EIO);
6833 		break;
6834 	}
6835
6836 	/*
6837 	 * Deal with ioctls which require processing different from
6838 	 * converting ioctl arguments and sending a corresponding
6839 	 * VD operation.
6840 	 */
6841 	switch (cmd) {
6842
6843 	case USCSICMD:
6844 	{
6845 		return (vdc_uscsi_cmd(vdc, arg, mode));
6846 	}
6847
6848 	case MHIOCTKOWN:
6849 	{
6850 		mutex_enter(&vdc->ownership_lock);
6851 		/*
6852 		 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership
6853 		 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset
6854 		 * while we are processing the ioctl.
6855 		 */
6856 		vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED);
6857
6858 		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
6859 		    VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode);
6860 		if (rv == 0) {
6861 			vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED |
6862 			    VDC_OWNERSHIP_GRANTED);
6863 		} else {
6864 			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
6865 		}
6866 		mutex_exit(&vdc->ownership_lock);
6867 		return (rv);
6868 	}
6869
6870 	case MHIOCRELEASE:
6871 	{
6872 		mutex_enter(&vdc->ownership_lock);
6873 		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode);
6874 		if (rv == 0) {
6875 			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
6876 		}
6877 		mutex_exit(&vdc->ownership_lock);
6878 		return (rv);
6879 	}
6880
6881 	case MHIOCSTATUS:
6882 	{
6883 		uint64_t status;
6884
6885 		rv = vdc_access_get(vdc, &status, mode);
6886 		if (rv == 0 && rvalp != NULL)
6887 			*rvalp = (status & VD_ACCESS_ALLOWED) ? 0 : 1;
6888 		return (rv);
6889 	}
6890
6891 	case MHIOCQRESERVE:
6892 	{
6893 		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode);
6894 		return (rv);
6895 	}
6896
6897 	case MHIOCGRP_INKEYS:
6898 	{
6899 		return (vdc_mhd_inkeys(vdc, arg, mode));
6900 	}
6901
6902 	case MHIOCGRP_INRESV:
6903 	{
6904 		return (vdc_mhd_inresv(vdc, arg, mode));
6905 	}
6906
6907 	case MHIOCGRP_REGISTER:
6908 	{
6909 		return (vdc_mhd_register(vdc, arg, mode));
6910 	}
6911
6912 	case MHIOCGRP_RESERVE:
6913 	{
6914 		return (vdc_mhd_reserve(vdc, arg, mode));
6915 	}
6916
6917 	case MHIOCGRP_PREEMPTANDABORT:
6918 	{
6919 		return (vdc_mhd_preemptabort(vdc, arg, mode));
6920 	}
6921
6922 	case MHIOCGRP_REGISTERANDIGNOREKEY:
6923 	{
6924 		return (vdc_mhd_registerignore(vdc, arg, mode));
6925 	}
6926
6927 	case MHIOCENFAILFAST:
6928 	{
6929 		rv = vdc_failfast(vdc, arg, mode);
6930 		return (rv);
6931 	}
6932
6933 	case DIOCTL_RWCMD:
6934 	{
6935 		return (vdc_dioctl_rwcmd(dev, arg, mode));
6936 	}
6937
6938 	case DKIOCGAPART:
6939 	{
6940 		return (vdc_dkio_gapart(vdc, arg, mode));
6941 	}
6942
6943 	case DKIOCPARTITION:
6944 	{
6945 		return (vdc_dkio_partition(vdc, arg, mode));
6946 	}
6947
6948 	case DKIOCINFO:
6949 	{
6950 		struct dk_cinfo cinfo;
6951
6952 		bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo));
6953 		cinfo.dki_partition = VDCPART(dev);
6954
6955 		rv = ddi_copyout(&cinfo, (void *)arg,
6956 		    sizeof (struct dk_cinfo), mode);
6957 		if (rv != 0)
6958 			return (EFAULT);
6959
6960 		return (0);
6961 	}
6962
6963 	case DKIOCGMEDIAINFO:
6964 	{
6965 		ASSERT(vdc->vdisk_size != 0);
6966 		if (vdc->minfo->dki_capacity == 0)
6967 			vdc->minfo->dki_capacity = vdc->vdisk_size;
6968 		rv = ddi_copyout(vdc->minfo, (void *)arg,
6969 		    sizeof (struct dk_minfo), mode);
6970 		if (rv != 0)
6971 			return (EFAULT);
6972
6973 		return (0);
6974 	}
6975
6976 	case DKIOCFLUSHWRITECACHE:
6977 	{
6978 		struct dk_callback *dkc =
6979 		    (struct dk_callback *)(uintptr_t)arg;
6980 		vdc_dk_arg_t *dkarg = NULL;
6981
6982 		DMSG(vdc, 1, "[%d] Flush W$: mode %x\n",
6983 		    instance, mode);
6984
6985 		/*
6986 		 * If arg is NULL, then there is no callback function
6987 		 * registered and the call operates synchronously; we
6988 		 * break and continue with the rest of the function and
6989 		 * wait for vds to return (i.e. after the request to
6990 		 * vds returns successfully, all writes completed prior
6991 		 * to the ioctl will have been flushed from the disk
6992 		 * write cache to persistent media).
6993 		 *
6994 		 * If a callback function is registered, we dispatch
6995 		 * the request on a task queue and return immediately.
6996 		 * The callback will deal with informing the calling
6997 		 * thread that the flush request is completed.
6998 		 */
6999 		if (dkc == NULL)
7000 			break;
7001
7002 		/*
7003 		 * the asynchronous callback is only supported if
7004 		 * invoked from within the kernel
7005 		 */
7006 		if ((mode & FKIOCTL) == 0)
7007 			return (ENOTSUP);
7008
7009 		dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP);
7010
7011 		dkarg->mode = mode;
7012 		dkarg->dev = dev;
7013 		bcopy(dkc, &dkarg->dkc, sizeof (*dkc));
7014
7015 		mutex_enter(&vdc->lock);
7016 		vdc->dkio_flush_pending++;
7017 		dkarg->vdc = vdc;
7018 		mutex_exit(&vdc->lock);
7019
7020 		/* put the request on a task queue */
7021 		rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb,
7022 		    (void *)dkarg, DDI_SLEEP);
7023 		if (rv == NULL) {
7024 			/* clean up if dispatch fails */
7025 			mutex_enter(&vdc->lock);
7026 			vdc->dkio_flush_pending--;
7027 			mutex_exit(&vdc->lock);
7028 			kmem_free(dkarg, sizeof (vdc_dk_arg_t));
7029 		}
7030
7031 		return (rv == NULL ? ENOMEM : 0);
7032 	}
7033 	}
7034
7035 	/* catch programming error in vdc - should be a VD_OP_XXX ioctl */
7036 	ASSERT(iop->op != 0);
7037
7038 	/* check if the vDisk server handles the operation for this vDisk */
7039 	if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) {
7040 		DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n",
7041 		    vdc->instance, iop->op);
7042 		return (ENOTSUP);
7043 	}
7044
7045 	/* LDC requires that the memory being mapped is 8-byte aligned */
7046 	alloc_len = P2ROUNDUP(len, sizeof (uint64_t));
7047 	DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n",
7048 	    instance, len, alloc_len);
7049
7050 	if (alloc_len > 0)
7051 		mem_p = kmem_zalloc(alloc_len, KM_SLEEP);
7052
7053 	/*
7054 	 * Call the conversion function for this ioctl which, if necessary,
7055 	 * converts from the Solaris format to the format ARC'ed
7056 	 * as part of the vDisk protocol (FWARC 2006/195).
7057 	 */
7058 	ASSERT(iop->convert != NULL);
7059 	rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN);
7060 	if (rv != 0) {
7061 		DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
7062 		    instance, rv, cmd);
7063 		if (mem_p != NULL)
7064 			kmem_free(mem_p, alloc_len);
7065 		return (rv);
7066 	}
7067
7068 	/*
7069 	 * Send the request to vds to service the ioctl.
7070 	 */
7071 	rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len,
7072 	    VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode,
7073 	    VIO_both_dir, B_TRUE);
7074
7075 	if (rv != 0) {
7076 		/*
7077 		 * This is not necessarily an error. The ioctl could
7078 		 * be returning a value such as ENOTTY to indicate
7079 		 * that the ioctl is not applicable.
7080 		 */
7081 		DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n",
7082 		    instance, rv, cmd);
7083 		if (mem_p != NULL)
7084 			kmem_free(mem_p, alloc_len);
7085
7086 		return (rv);
7087 	}
7088
7089 	/*
7090 	 * Call the conversion function (if it exists) for this ioctl
7091 	 * which converts from the format ARC'ed as part of the vDisk
7092 	 * protocol (FWARC 2006/195) back to a format understood by
7093 	 * the rest of Solaris.
7094 	 */
7095 	rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT);
7096 	if (rv != 0) {
7097 		DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
7098 		    instance, rv, cmd);
7099 		if (mem_p != NULL)
7100 			kmem_free(mem_p, alloc_len);
7101 		return (rv);
7102 	}
7103
7104 	if (mem_p != NULL)
7105 		kmem_free(mem_p, alloc_len);
7106
7107 	return (rv);
7108 }
7109
7110 /*
7111  * Function:
7112  *	vdc_null_copy_func()
7113  *
7114  * Description:
7115  *	This is an empty conversion function used by ioctl calls which
7116  *	do not need to convert the data being passed in/out to userland.
7117  */
7118 static int
7119 vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir)
7120 {
7121 	_NOTE(ARGUNUSED(vdc))
7122 	_NOTE(ARGUNUSED(from))
7123 	_NOTE(ARGUNUSED(to))
7124 	_NOTE(ARGUNUSED(mode))
7125 	_NOTE(ARGUNUSED(dir))
7126
7127 	return (0);
7128 }
7129
7130 static int
7131 vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
7132     int mode, int dir)
7133 {
7134 	_NOTE(ARGUNUSED(vdc))
7135
7136 	if (dir == VD_COPYIN)
7137 		return (0);		/* nothing to do */
7138
7139 	if (ddi_copyout(from, to, sizeof (int), mode) != 0)
7140 		return (EFAULT);
7141
7142 	return (0);
7143 }
7144
7145 static int
7146 vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
7147     int mode, int dir)
7148 {
7149 	_NOTE(ARGUNUSED(vdc))
7150
7151 	if (dir == VD_COPYOUT)
7152 		return (0);		/* nothing to do */
7153
7154 	if (ddi_copyin(from, to, sizeof (int), mode) != 0)
7155 		return (EFAULT);
7156
7157 	return (0);
7158 }
7159
7160 /*
7161  * Function:
7162  *	vdc_get_vtoc_convert()
7163  *
7164  * Description:
7165  *	This routine performs the necessary conversions from the DKIOCGVTOC
7166  *	Solaris structure to the format defined in FWARC 2006/195.
7167  *
7168  *	In the struct vtoc definition, the timestamp field is marked as not
7169  *	supported so it is not part of the vDisk protocol (FWARC 2006/195).
7170  *	However SVM uses that field to check that it can write into the VTOC,
7171  *	so we fake up the info of that field.
7172  *
7173  * Arguments:
7174  *	vdc	- the vDisk client
7175  *	from	- the buffer containing the data to be copied from
7176  *	to	- the buffer to be copied to
7177  *	mode	- flags passed to ioctl() call
7178  *	dir	- the "direction" of the copy - VD_COPYIN or VD_COPYOUT
7179  *
7180  * Return Code:
7181  *	0	- Success
7182  *	ENXIO	- incorrect buffer passed in.
 *	EFAULT	- ddi_copyout routine encountered an error.
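 *
 * Note that only the VD_COPYOUT direction does any work here since
 * DKIOCGVTOC only returns data to the caller; for VD_COPYIN this
 * routine returns immediately.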
7183  */
7184 static int
7185 vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7186 {
7187 	int i;
7189 	void *tmp_memp;
7190 	struct vtoc vt;
7191 	struct vtoc32 vt32;
7192 	int copy_len = 0;
7193 	int rv = 0;
7194
7195 	if (dir != VD_COPYOUT)
7196 		return (0);	/* nothing to do */
7197
7198 	if ((from == NULL) || (to == NULL))
7199 		return (ENXIO);
7200
7201 	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32)
7202 		copy_len = sizeof (struct vtoc32);
7203 	else
7204 		copy_len = sizeof (struct vtoc);
7205
7208 	VD_VTOC2VTOC((vd_vtoc_t *)from, &vt);
7209
7210 	/* fake the VTOC timestamp field */
7211 	for (i = 0; i < V_NUMPAR; i++) {
7212 		vt.timestamp[i] = vdc->vtoc->timestamp[i];
7213 	}
7214
7215 	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
7216 		/* LINTED E_ASSIGN_NARROW_CONV */
7217 		vtoctovtoc32(vt, vt32);
7218 		tmp_memp = &vt32;
7219 	} else {
7220 		tmp_memp = &vt;
7221 	}
7222 	rv = ddi_copyout(tmp_memp, to, copy_len, mode);
7223 	if (rv != 0)
7224 		rv = EFAULT;
7225
7227 	return (rv);
7228 }
7229
7230 /*
7231  * Function:
7232  *	vdc_set_vtoc_convert()
7233  *
7234  * Description:
7235  *	This routine performs the necessary conversions from the DKIOCSVTOC
7236  *	Solaris structure to the format defined in FWARC 2006/195.
7237  *
7238  * Arguments:
7239  *	vdc	- the vDisk client
7240  *	from	- Buffer with data
7241  *	to	- Buffer where data is to be copied to
7242  *	mode	- flags passed to ioctl
7243  *	dir	- direction of copy (in or out)
7244  *
7245  * Return Code:
7246  *	0	- Success
7247  *	ENXIO	- Invalid buffer passed in
7248  *	EFAULT	- ddi_copyin of data failed
7249  */
7250 static int
7251 vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7252 {
7255 	void *tmp_mem = NULL, *uvtoc;
7256 	struct vtoc vt;
7257 	struct vtoc *vtp = &vt;
7258 	vd_vtoc_t vtvd;
7259 	int copy_len = 0;
7260 	int i, rv = 0;
7261
7262 	if ((from == NULL) || (to == NULL))
7263 		return (ENXIO);
7264
7265 	if (dir == VD_COPYIN)
7266 		uvtoc = from;
7267 	else
7268 		uvtoc = to;
7269
7270 	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32)
7271 		copy_len = sizeof (struct vtoc32);
7272 	else
7273 		copy_len = sizeof (struct vtoc);
7274
7275 	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);
7276
7277 	rv = ddi_copyin(uvtoc, tmp_mem, copy_len, mode);
7278 	if (rv != 0) {
7279 		kmem_free(tmp_mem, copy_len);
7280 		return (EFAULT);
7281 	}
7282
7283 	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
7284 		vtoc32tovtoc((*(struct vtoc32 *)tmp_mem), vt);
7285 	} else {
7286 		vtp = tmp_mem;
7287 	}
7288
7289 	if (dir == VD_COPYOUT) {
7290 		/*
7291 		 * The disk label may have changed. Revalidate the disk
7292 		 * geometry. This will also update the device nodes and
7293 		 * properties.
7294 		 */
7295 		vdc_validate(vdc);
7296
7297 		/*
7298 		 * We also need to keep track of the timestamp fields.
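		 * (SVM checks the timestamp to verify that it can write
		 * to the VTOC; see vdc_get_vtoc_convert() above.)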
7299 		 */
7300 		for (i = 0; i < V_NUMPAR; i++) {
7301 			vdc->vtoc->timestamp[i] = vtp->timestamp[i];
7302 		}
7303
		/* free the temporary copy before returning */
		kmem_free(tmp_mem, copy_len);
7304 		return (0);
7305 	}
7306
7307 	VTOC2VD_VTOC(vtp, &vtvd);
7308 	bcopy(&vtvd, to, sizeof (vd_vtoc_t));
7309 	kmem_free(tmp_mem, copy_len);
7310
7311 	return (0);
7312 }
7313
7314 /*
7315  * Function:
7316  *	vdc_get_geom_convert()
7317  *
7318  * Description:
7319  *	This routine performs the necessary conversions from the DKIOCGGEOM,
7320  *	DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format
7321  *	defined in FWARC 2006/195.
7322  *
7323  * Arguments:
7324  *	vdc	- the vDisk client
7325  *	from	- Buffer with data
7326  *	to	- Buffer where data is to be copied to
7327  *	mode	- flags passed to ioctl
7328  *	dir	- direction of copy (in or out)
7329  *
7330  * Return Code:
7331  *	0	- Success
7332  *	ENXIO	- Invalid buffer passed in
7333  *	EFAULT	- ddi_copyout of data failed
7334  */
7335 static int
7336 vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7337 {
7338 	_NOTE(ARGUNUSED(vdc))
7339
7340 	struct dk_geom geom;
7341 	int copy_len = sizeof (struct dk_geom);
7342 	int rv = 0;
7343
7344 	if (dir != VD_COPYOUT)
7345 		return (0);	/* nothing to do */
7346
7347 	if ((from == NULL) || (to == NULL))
7348 		return (ENXIO);
7349
7350 	VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom);
7351 	rv = ddi_copyout(&geom, to, copy_len, mode);
7352 	if (rv != 0)
7353 		rv = EFAULT;
7354
7355 	return (rv);
7356 }
7357
7358 /*
7359  * Function:
7360  *	vdc_set_geom_convert()
7361  *
7362  * Description:
7363  *	This routine performs the necessary conversions from the DKIOCSGEOM
7364  *	Solaris structure to the format defined in FWARC 2006/195.
7365  *
7366  * Arguments:
7367  *	vdc	- the vDisk client
7368  *	from	- Buffer with data
7369  *	to	- Buffer where data is to be copied to
7370  *	mode	- flags passed to ioctl
7371  *	dir	- direction of copy (in or out)
7372  *
7373  * Return Code:
7374  *	0	- Success
7375  *	ENXIO	- Invalid buffer passed in
7376  *	EFAULT	- ddi_copyin of data failed
7377  */
7378 static int
7379 vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7380 {
7381 	_NOTE(ARGUNUSED(vdc))
7382
7383 	vd_geom_t vdgeom;
7384 	void *tmp_mem = NULL;
7385 	int copy_len = sizeof (struct dk_geom);
7386 	int rv = 0;
7387
7388 	if (dir != VD_COPYIN)
7389 		return (0);	/* nothing to do */
7390
7391 	if ((from == NULL) || (to == NULL))
7392 		return (ENXIO);
7393
7394 	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);
7395
7396 	rv = ddi_copyin(from, tmp_mem, copy_len, mode);
7397 	if (rv != 0) {
7398 		kmem_free(tmp_mem, copy_len);
7399 		return (EFAULT);
7400 	}
7401 	DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom);
7402 	bcopy(&vdgeom, to, sizeof (vdgeom));
7403 	kmem_free(tmp_mem, copy_len);
7404
7405 	return (0);
7406 }
7407
7408 static int
7409 vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7410 {
7411 	_NOTE(ARGUNUSED(vdc))
7412
7413 	vd_efi_t *vd_efi;
7414 	dk_efi_t dk_efi;
7415 	int rv = 0;
7416 	void *uaddr;
7417
7418 	if ((from == NULL) || (to == NULL))
7419 		return (ENXIO);
7420
7421 	if (dir == VD_COPYIN) {
7422
7423 		vd_efi = (vd_efi_t *)to;
7424
7425 		rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode);
7426 		if (rv != 0)
7427 			return (EFAULT);
7428
7429 		vd_efi->lba = dk_efi.dki_lba;
7430 		vd_efi->length = dk_efi.dki_length;
7431 		bzero(vd_efi->data, vd_efi->length);
7432
7433 	} else {
7434
7435 		rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode);
7436 		if (rv != 0)
7437 			return (EFAULT);
7438
7439 		uaddr = dk_efi.dki_data;
7440
7441 		dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);
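		/*
		 * Convert the vd_efi_t returned by the server into the
		 * dk_efi_t format before copying it out to the caller.
		 */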
7443 		VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi);
7444
7445 		rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length,
7446 		    mode);
7447 		if (rv != 0) {
			/* don't leak the data buffer on the error path */
			kmem_free(dk_efi.dki_data, dk_efi.dki_length);
7448 			return (EFAULT);
		}
7449
7450 		kmem_free(dk_efi.dki_data, dk_efi.dki_length);
7451 	}
7452
7453 	return (0);
7454 }
7455
7456 static int
7457 vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7458 {
7461 	dk_efi_t dk_efi;
7462 	void *uaddr;
7463
7464 	if (dir == VD_COPYOUT) {
7465 		/*
7466 		 * The disk label may have changed. Revalidate the disk
7467 		 * geometry. This will also update the device nodes and
7468 		 * properties.
7469 		 */
7470 		vdc_validate(vdc);
7471 		return (0);
7472 	}
7473
7474 	if ((from == NULL) || (to == NULL))
7475 		return (ENXIO);
7476
7477 	if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0)
7478 		return (EFAULT);
7479
7480 	uaddr = dk_efi.dki_data;
7481
7482 	dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);
7483
7484 	if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) {
		/* don't leak the data buffer on the error path */
		kmem_free(dk_efi.dki_data, dk_efi.dki_length);
7485 		return (EFAULT);
	}
7486
7487 	DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to);
7488
7489 	kmem_free(dk_efi.dki_data, dk_efi.dki_length);
7490
7491 	return (0);
7492 }
7493
7494
7495 /* -------------------------------------------------------------------------- */
7496
7497 /*
7498  * Function:
7499  *	vdc_create_fake_geometry()
7500  *
7501  * Description:
7502  *	This routine fakes up the disk info needed for some DKIO ioctls such
7503  *	as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do]
7504  *
7505  *	Note: This function must not be called until the vDisk attributes have
7506  *	been exchanged as part of the handshake with the vDisk server.
7507  *
7508  * Arguments:
7509  *	vdc	- soft state pointer for this instance of the device driver.
7510  *
7511  * Return Code:
7512  *	none.
7513  */
7514 static void
7515 vdc_create_fake_geometry(vdc_t *vdc)
7516 {
7517 	ASSERT(vdc != NULL);
7518 	ASSERT(vdc->max_xfer_sz != 0);
7519
7520 	/*
7521 	 * DKIOCINFO support
7522 	 */
7523 	if (vdc->cinfo == NULL)
7524 		vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP);
7525
7526 	(void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME);
7527 	(void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME);
7528 	/* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */
7529 	vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz;
7530
7531 	/*
7532 	 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD
7533 	 * operation is supported, otherwise the controller type is DKC_DIRECT.
7534 	 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the
7535 	 * controller type is always DKC_DIRECT in that case.
7536 	 *
7537 	 * If the virtual disk is backed by a physical CD/DVD device or
7538 	 * an ISO image, modify the controller type to indicate this.
7539 	 */
7540 	switch (vdc->vdisk_media) {
7541 	case VD_MEDIA_CD:
7542 	case VD_MEDIA_DVD:
7543 		vdc->cinfo->dki_ctype = DKC_CDROM;
7544 		break;
7545 	case VD_MEDIA_FIXED:
7546 		if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD))
7547 			vdc->cinfo->dki_ctype = DKC_SCSI_CCS;
7548 		else
7549 			vdc->cinfo->dki_ctype = DKC_DIRECT;
7550 		break;
7551 	default:
7552 		/* in the case of v1.0 we default to a fixed disk */
7553 		vdc->cinfo->dki_ctype = DKC_DIRECT;
7554 		break;
7555 	}
7556 	vdc->cinfo->dki_flags = DKI_FMTVOL;
7557 	vdc->cinfo->dki_cnum = 0;
7558 	vdc->cinfo->dki_addr = 0;
7559 	vdc->cinfo->dki_space = 0;
7560 	vdc->cinfo->dki_prio = 0;
7561 	vdc->cinfo->dki_vec = 0;
7562 	vdc->cinfo->dki_unit = vdc->instance;
7563 	vdc->cinfo->dki_slave = 0;
7564 	/*
7565 	 * The partition number will be created on the fly depending on the
7566 	 * actual slice (i.e. minor node) that is used to request the data.
7567 	 */
7568 	vdc->cinfo->dki_partition = 0;
7569
7570 	/*
7571 	 * DKIOCGMEDIAINFO support
7572 	 */
7573 	if (vdc->minfo == NULL)
7574 		vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);
7575
7576 	if (vio_ver_is_supported(vdc->ver, 1, 1)) {
7577 		vdc->minfo->dki_media_type =
7578 		    VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media);
7579 	} else {
7580 		vdc->minfo->dki_media_type = DK_FIXED_DISK;
7581 	}
7582
7583 	vdc->minfo->dki_capacity = vdc->vdisk_size;
7584 	vdc->minfo->dki_lbsize = vdc->block_size;
7585 }
7586
7587 static ushort_t
7588 vdc_lbl2cksum(struct dk_label *label)
7589 {
7590 	int count;
7591 	ushort_t sum, *sp;
7592
7593 	count = (sizeof (struct dk_label)) / (sizeof (short)) - 1;
7594 	sp = (ushort_t *)label;
7595 	sum = 0;
7596 	while (count--) {
7597 		sum ^= *sp++;
7598 	}
7599
7600 	return (sum);
7601 }
7602
7603 /*
7604  * Function:
7605  *	vdc_validate_geometry
7606  *
7607  * Description:
7608  *	This routine discovers the label and geometry of the disk. It stores
7609  *	the disk label and related information in the vdc structure. If it
7610  *	fails to validate the geometry or to discover the disk label then
7611  *	the label is marked as unknown (VD_DISK_LABEL_UNK).
7612  *
7613  * Arguments:
7614  *	vdc	- soft state pointer for this instance of the device driver.
7615  *
7616  * Return Code:
7617  *	0	- success.
7618  *	EINVAL	- unknown disk label.
7619  *	ENOTSUP	- geometry not applicable (EFI label).
7620  *	EIO	- error accessing the disk.
7621  */
7622 static int
7623 vdc_validate_geometry(vdc_t *vdc)
7624 {
7625 	buf_t	*buf;	/* BREAD requests need to be in a buf_t structure */
7626 	dev_t	dev;
7627 	int	rv, rval;
7628 	struct dk_label label;
7629 	struct dk_geom geom;
7630 	struct vtoc vtoc;
7631 	efi_gpt_t *gpt;
7632 	efi_gpe_t *gpe;
7633 	vd_efi_dev_t edev;
7634
7635 	ASSERT(vdc != NULL);
7636 	ASSERT(vdc->vtoc != NULL && vdc->geom != NULL);
7637 	ASSERT(MUTEX_HELD(&vdc->lock));
7638
7639 	mutex_exit(&vdc->lock);
7640
7641 	dev = makedevice(ddi_driver_major(vdc->dip),
7642 	    VD_MAKE_DEV(vdc->instance, 0));
7643
7644 	rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval);
7645 	if (rv == 0)
7646 		rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc,
7647 		    FKIOCTL, &rval);
7648
7649 	if (rv == ENOTSUP) {
7650 		/*
7651 		 * If the device does not support VTOC then we try
7652 		 * to read an EFI label.
7653 		 *
7654 		 * We need to know the block size and the disk size to
7655 		 * be able to read an EFI label.
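		 * If the size is not already known, vdc_check_capacity()
		 * below retrieves it from the vdisk server with a
		 * VD_OP_GET_CAPACITY operation.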
7656 		 */
7657 		if (vdc->vdisk_size == 0) {
7658 			if ((rv = vdc_check_capacity(vdc)) != 0) {
7659 				mutex_enter(&vdc->lock);
7660 				vdc_store_label_unk(vdc);
7661 				return (rv);
7662 			}
7663 		}
7664
7665 		VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl);
7666
7667 		rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe);
7668
7669 		if (rv) {
7670 			DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)",
7671 			    vdc->instance, rv);
7672 			mutex_enter(&vdc->lock);
7673 			vdc_store_label_unk(vdc);
7674 			return (EIO);
7675 		}
7676
7677 		mutex_enter(&vdc->lock);
7678 		vdc_store_label_efi(vdc, gpt, gpe);
7679 		vd_efi_free(&edev, gpt, gpe);
7680 		return (ENOTSUP);
7681 	}
7682
7683 	if (rv != 0) {
7684 		DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)",
7685 		    vdc->instance, rv);
7686 		mutex_enter(&vdc->lock);
7687 		vdc_store_label_unk(vdc);
7688 		if (rv != EINVAL)
7689 			rv = EIO;
7690 		return (rv);
7691 	}
7692
7693 	/* check that geometry and vtoc are valid */
7694 	if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 ||
7695 	    vtoc.v_sanity != VTOC_SANE) {
7696 		mutex_enter(&vdc->lock);
7697 		vdc_store_label_unk(vdc);
7698 		return (EINVAL);
7699 	}
7700
7701 	/*
7702 	 * We have a disk and a valid VTOC. However this does not mean
7703 	 * that the disk currently has a VTOC label. The returned VTOC may
7704 	 * be a default VTOC to be used for configuring the disk (this is
7705 	 * what is done for disk images). So we read the label from the
7706 	 * beginning of the disk to ensure we really have a VTOC label.
7707 	 *
7708 	 * FUTURE: This could be the default way for reading the VTOC
7709 	 * from the disk as opposed to sending the VD_OP_GET_VTOC
7710 	 * to the server. This will be the default if vdc is implemented
7711 	 * on top of cmlb.
7712 	 */
7713
7714 	/*
7715 	 * A single-slice disk does not support reads using an absolute disk
7716 	 * offset so we just rely on the DKIOCGVTOC ioctl in that case.
7717 	 */
7718 	if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) {
7719 		mutex_enter(&vdc->lock);
7720 		if (vtoc.v_nparts != 1) {
7721 			vdc_store_label_unk(vdc);
7722 			return (EINVAL);
7723 		}
7724 		vdc_store_label_vtoc(vdc, &geom, &vtoc);
7725 		return (0);
7726 	}
7727
7728 	if (vtoc.v_nparts != V_NUMPAR) {
7729 		mutex_enter(&vdc->lock);
7730 		vdc_store_label_unk(vdc);
7731 		return (EINVAL);
7732 	}
7733
7734 	/*
7735 	 * Read disk label from start of disk
7736 	 */
7737 	buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
7738 	bioinit(buf);
7739 	buf->b_un.b_addr = (caddr_t)&label;
7740 	buf->b_bcount = DK_LABEL_SIZE;
7741 	buf->b_flags = B_BUSY | B_READ;
7742 	buf->b_dev = cmpdev(dev);
7743 	rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label,
7744 	    DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir);
7745 	if (rv) {
7746 		DMSG(vdc, 1, "[%d] Failed to read disk block 0\n",
7747 		    vdc->instance);
7748 	} else {
7749 		rv = biowait(buf);
7750 		biofini(buf);
7751 	}
7752 	kmem_free(buf, sizeof (buf_t));
7753
7754 	if (rv != 0 || label.dkl_magic != DKL_MAGIC ||
7755 	    label.dkl_cksum != vdc_lbl2cksum(&label)) {
7756 		DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n",
7757 		    vdc->instance);
7758 		mutex_enter(&vdc->lock);
7759 		vdc_store_label_unk(vdc);
7760 		return (EINVAL);
7761 	}
7762
7763 	mutex_enter(&vdc->lock);
7764 	vdc_store_label_vtoc(vdc, &geom, &vtoc);
7765 	return (0);
7766 }
7767
7768 /*
7769  * Function:
7770  *	vdc_validate
7771  *
7772  * Description:
7773  *	This routine discovers the label of the disk and creates the
7774  *	appropriate device nodes if the label has changed.
7775  *
7776  * Arguments:
7777  *	vdc	- soft state pointer for this instance of the device driver.
7778  *
7779  * Return Code:
7780  *	none.
7781  */
7782 static void
7783 vdc_validate(vdc_t *vdc)
7784 {
7785 	vd_disk_label_t old_label;
7786 	vd_slice_t old_slice[V_NUMPAR];
7787 	int rv;
7788
7789 	ASSERT(!MUTEX_HELD(&vdc->lock));
7790
7791 	mutex_enter(&vdc->lock);
7792
7793 	/* save the current label and vtoc */
7794 	old_label = vdc->vdisk_label;
7795 	bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR);
7796
7797 	/* check the geometry */
7798 	(void) vdc_validate_geometry(vdc);
7799
7800 	/* if the disk label has changed, update device nodes */
7801 	if (vdc->vdisk_label != old_label) {
7802
7803 		if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
7804 			rv = vdc_create_device_nodes_efi(vdc);
7805 		else
7806 			rv = vdc_create_device_nodes_vtoc(vdc);
7807
7808 		if (rv != 0) {
7809 			DMSG(vdc, 0, "![%d] Failed to update device nodes",
7810 			    vdc->instance);
7811 		}
7812 	}
7813
7814 	/* if the vtoc has changed, update device nodes properties */
7815 	if (bcmp(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR) != 0) {
7816
7817 		if (vdc_create_device_nodes_props(vdc) != 0) {
7818 			DMSG(vdc, 0, "![%d] Failed to update device nodes"
7819 			    " properties", vdc->instance);
7820 		}
7821 	}
7822
7823 	mutex_exit(&vdc->lock);
7824 }
7825
7826 static void
7827 vdc_validate_task(void *arg)
7828 {
7829 	vdc_t *vdc = (vdc_t *)arg;
7830
7831 	vdc_validate(vdc);
7832
7833 	mutex_enter(&vdc->lock);
7834 	ASSERT(vdc->validate_pending > 0);
7835 	vdc->validate_pending--;
7836 	mutex_exit(&vdc->lock);
7837 }
7838
7839 /*
7840  * Function:
7841  *	vdc_setup_devid()
7842  *
7843  * Description:
7844  *	This routine discovers the devid of a vDisk. It requests the devid of
7845  *	the underlying device from the vDisk server, builds an encapsulated
7846  *	devid based on the retrieved devid and registers that new devid to
7847  *	the vDisk.
7848  *
7849  * Arguments:
7850  *	vdc	- soft state pointer for this instance of the device driver.
7851  *
7852  * Return Code:
7853  *	0	- A devid was successfully registered for the vDisk
7854  */
7855 static int
7856 vdc_setup_devid(vdc_t *vdc)
7857 {
7858 	int rv;
7859 	vd_devid_t *vd_devid;
7860 	size_t bufsize, bufid_len;
7861
7862 	/*
7863 	 * We don't know in advance the size of the devid that the
7864 	 * server will return, but this size will be encoded into the
7865 	 * reply. So we do a first request using a default size, then we
7866 	 * check whether this size was large enough. If not, we do a second
7867 	 * request with the correct size returned by the server. Note that
7868 	 * ldc requires the size to be 8-byte aligned.
7869 	 */
7870 	bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN),
7871 	    sizeof (uint64_t));
7872 	vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
7873 	bufid_len = bufsize - sizeof (vd_efi_t) - 1;
7874
7875 	rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
7876 	    bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);
7877
7878 	DMSG(vdc, 2, "sync_op returned %d\n", rv);
7879
7880 	if (rv) {
7881 		kmem_free(vd_devid, bufsize);
7882 		return (rv);
7883 	}
7884
7885 	if (vd_devid->length > bufid_len) {
7886 		/*
7887 		 * The returned devid is larger than the buffer used. Try again
7888 		 * with a buffer of the right size.
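		 * The required length is encoded in vd_devid->length in the
		 * first reply, so the second buffer is sized as
		 * VD_DEVID_SIZE(vd_devid->length), rounded up to the 8-byte
		 * alignment that ldc requires.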
7889 		 */
		/* grab the required length before freeing the old buffer */
		bufid_len = vd_devid->length;
7890 		kmem_free(vd_devid, bufsize);
7891 		bufsize = P2ROUNDUP(VD_DEVID_SIZE(bufid_len),
7892 		    sizeof (uint64_t));
7893 		vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
7894 		bufid_len = bufsize - sizeof (vd_efi_t) - 1;
7895
7896 		rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID,
7897 		    (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0,
7898 		    VIO_both_dir, B_TRUE);
7899
7900 		if (rv) {
7901 			kmem_free(vd_devid, bufsize);
7902 			return (rv);
7903 		}
7904 	}
7905
7906 	/*
7907 	 * The virtual disk should have the same device id as the one associated
7908 	 * with the physical disk it is mapped on, otherwise sharing a disk
7909 	 * between an LDom and a non-LDom may not work (for example for a shared
7910 	 * SVM disk set).
7911 	 *
7912 	 * The DDI framework does not allow creating a device id with any
7913 	 * type so we first create a device id of type DEVID_ENCAP and then
7914 	 * we restore the original type of the physical device.
7915 	 */
7916
7917 	DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length);
7918
7919 	/* build an encapsulated devid based on the returned devid */
7920 	if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length,
7921 	    vd_devid->id, &vdc->devid) != DDI_SUCCESS) {
7922 		DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance);
7923 		kmem_free(vd_devid, bufsize);
7924 		return (1);
7925 	}
7926
7927 	DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type);
7928
7929 	ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS);
7930
7931 	kmem_free(vd_devid, bufsize);
7932
7933 	if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) {
7934 		DMSG(vdc, 1, "[%d] Failed to register devid\n", vdc->instance);
7935 		return (1);
7936 	}
7937
7938 	return (0);
7939 }
7940
7941 static void
7942 vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe)
7943 {
7944 	int i, nparts;
7945
7946 	ASSERT(MUTEX_HELD(&vdc->lock));
7947
7948 	vdc->vdisk_label = VD_DISK_LABEL_EFI;
7949 	bzero(vdc->vtoc, sizeof (struct vtoc));
7950 	bzero(vdc->geom, sizeof (struct dk_geom));
7951 	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);
7952
7953 	nparts = gpt->efi_gpt_NumberOfPartitionEntries;
7954
7955 	for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) {
7956
7957 		if (gpe[i].efi_gpe_StartingLBA == 0 ||
7958 		    gpe[i].efi_gpe_EndingLBA == 0) {
7959 			continue;
7960 		}
7961
7962 		vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA;
7963 		vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA -
7964 		    gpe[i].efi_gpe_StartingLBA + 1;
7965 	}
7966
7967 	ASSERT(vdc->vdisk_size != 0);
7968 	vdc->slice[VD_EFI_WD_SLICE].start = 0;
7969 	vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size;
7970
7971 }
7972
7973 static void
7974 vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct vtoc *vtoc)
7975 {
7976 	int i;
7977
7978 	ASSERT(MUTEX_HELD(&vdc->lock));
7979 	ASSERT(vdc->block_size == vtoc->v_sectorsz);
7980
7981 	vdc->vdisk_label = VD_DISK_LABEL_VTOC;
7982 	bcopy(vtoc, vdc->vtoc, sizeof (struct vtoc));
7983 	bcopy(geom, vdc->geom, sizeof (struct dk_geom));
7984 	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);
7985
7986 	for (i = 0; i < vtoc->v_nparts; i++) {
7987 		vdc->slice[i].start = vtoc->v_part[i].p_start;
7988 		vdc->slice[i].nblocks = vtoc->v_part[i].p_size;
7989 	}
7990 }
7991
7992 static void
7993 vdc_store_label_unk(vdc_t *vdc)
7994 {
7995 	ASSERT(MUTEX_HELD(&vdc->lock));
7996
7997 	vdc->vdisk_label = VD_DISK_LABEL_UNK;
7998 	bzero(vdc->vtoc, sizeof (struct vtoc));
7999 	bzero(vdc->geom, sizeof (struct dk_geom));
8000 	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);
8001 }
8002