/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * LDoms virtual disk client (vdc) device driver
 *
 * This driver runs on a guest logical domain and communicates with the
 * virtual disk server (vds) driver running on the service domain, which is
 * exporting virtualized "disks" to the guest logical domain.
 *
 * The driver can be divided into four sections:
 *
 * 1) generic device driver housekeeping
 *	_init, _fini, attach, detach, ops structures, etc.
 *
 * 2) communication channel setup
 *	Set up the communications link over the LDC channel that vdc uses to
 *	talk to the vDisk server. Initialise the descriptor ring which
 *	allows the LDC clients to transfer data via memory mappings.
 *
 * 3) Support exported to upper layers (filesystems, etc)
 *	The upper layers call into vdc via strategy(9E) and DKIO(7I)
 *	ioctl calls. vdc will copy the data to be written to the descriptor
 *	ring or map the buffer to store the data read by the vDisk
 *	server into the descriptor ring. It then sends a message to the
 *	vDisk server requesting it to complete the operation.
 *
 * 4) Handling responses from the vDisk server.
 *	The vDisk server will ACK some or all of the messages vdc sends to it
 *	(this is configured during the handshake). Upon receipt of an ACK
 *	vdc will check the descriptor ring and signal to the upper layer
 *	code waiting on the IO.
 */
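/*
 * Added summary (an editor's sketch, not part of the original comments):
 * assuming the handshake described above has completed, a typical
 * strategy(9E) request flows through the driver roughly as follows:
 *
 *	strategy(9E) -> vdc_strategy() -> vdc_send_request()
 *		-> descriptor ring entry populated, server notified
 *		   via vdc_send() over the LDC channel
 *	LDC callback -> vdc_handle_cb() -> vdc_process_data_msg()
 *		-> matching descriptor ACKed -> bioerror(9F)/biodone(9F)
 *		   wake the waiting upper layer
 */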
#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kstat.h>
#include <sys/mach_descrip.h>
#include <sys/modctl.h>
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/vtoc.h>
#include <sys/archsystm.h>
#include <sys/sysmacros.h>

#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/scsi/impl/services.h>
#include <sys/scsi/targets/sddef.h>

#include <sys/ldoms.h>
#include <sys/ldc.h>
#include <sys/vio_common.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vdsk_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdc.h>

/*
 * function prototypes
 */

/* standard driver functions */
static int	vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
static int	vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int	vdc_strategy(struct buf *buf);
static int	vdc_print(dev_t dev, char *str);
static int	vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
static int	vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
		    cred_t *credp, int *rvalp);
static int	vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
static int	vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);

static int	vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
		    void *arg, void **resultp);
static int	vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int	vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);

/* setup */
static void	vdc_min(struct buf *bufp);
static int	vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
static int	vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_start_ldc_connection(vdc_t *vdc);
static int	vdc_create_device_nodes(vdc_t *vdc);
static int	vdc_create_device_nodes_efi(vdc_t *vdc);
static int	vdc_create_device_nodes_vtoc(vdc_t *vdc);
static int	vdc_create_device_nodes_props(vdc_t *vdc);
static void	vdc_create_io_kstats(vdc_t *vdc);
static void	vdc_create_err_kstats(vdc_t *vdc);
static void	vdc_set_err_kstats(vdc_t *vdc);
static int	vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
		    mde_cookie_t *vd_nodep);
static int	vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep);
static void	vdc_fini_ports(vdc_t *vdc);
static void	vdc_switch_server(vdc_t *vdcp);
static int	vdc_do_ldc_up(vdc_t *vdc);
static void	vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_init_descriptor_ring(vdc_t *vdc);
static void	vdc_destroy_descriptor_ring(vdc_t *vdc);
static int	vdc_setup_devid(vdc_t *vdc);
static void	vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *);
static void	vdc_store_label_vtoc(vdc_t *, struct dk_geom *, struct vtoc *);
static void	vdc_store_label_unk(vdc_t *vdc);
static boolean_t vdc_is_opened(vdc_t *vdc);

/* handshake with vds */
static int	vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
static int	vdc_ver_negotiation(vdc_t *vdcp);
static int	vdc_init_attr_negotiation(vdc_t *vdc);
static int	vdc_attr_negotiation(vdc_t *vdcp);
static int	vdc_init_dring_negotiate(vdc_t *vdc);
static int	vdc_dring_negotiation(vdc_t *vdcp);
static int	vdc_send_rdx(vdc_t *vdcp);
static int	vdc_rdx_exchange(vdc_t *vdcp);
static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg);

/* processing incoming messages from vDisk server */
static void	vdc_process_msg_thread(vdc_t *vdc);
static int	vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp);

static uint_t	vdc_handle_cb(uint64_t event, caddr_t arg);
static int	vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg);
static int	vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg);
static int	vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
static int	vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
static int	vdc_send_request(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
static int	vdc_populate_descriptor(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int	vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
		    size_t nbytes, int slice, diskaddr_t offset, int cb_type,
		    void *cb_arg, vio_desc_direction_t dir, boolean_t);

static int	vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
static int	vdc_drain_response(vdc_t *vdcp);
static int	vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
static int	vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
static int	vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);

/* dkio */
static int	vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode,
		    int *rvalp);
static int	vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg);
static void	vdc_create_fake_geometry(vdc_t *vdc);
static int	vdc_validate_geometry(vdc_t *vdc);
static void	vdc_validate(vdc_t *vdc);
static void	vdc_validate_task(void *arg);
static int	vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);

static void	vdc_ownership_update(vdc_t *vdc, int ownership_flags);
static int	vdc_access_set(vdc_t *vdc, uint64_t flags, int mode);
static vdc_io_t	*vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf);
static int	vdc_failfast_check_resv(vdc_t *vdc);
/*
 * Module variables
 */

/*
 * Tunable variables to control how long vdc waits before timing out on
 * various operations
 */
static int	vdc_hshake_retries = 3;

static int	vdc_timeout = 0;	/* units: seconds */
static int	vdc_ldcup_timeout = 1;	/* units: seconds */

static uint64_t vdc_hz_min_ldc_delay;
static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC;
static uint64_t vdc_hz_max_ldc_delay;
static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC;

static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC;
static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC;

/* values for dumping - need to run in a tighter loop */
static uint64_t	vdc_usec_timeout_dump = 100 * MILLISEC;	/* 0.1s units: usec */
static int	vdc_dump_retries = 100;

static uint16_t	vdc_scsi_timeout = 60;	/* 60s units: seconds */

static uint64_t vdc_ownership_delay = 6 * MICROSEC;	/* 6s units: usec */

/* Count of the number of vdc instances attached */
static volatile uint32_t	vdc_instance_count = 0;

/* Tunable to log all SCSI errors */
static boolean_t vdc_scsi_log_error = B_FALSE;

/* Soft state pointer */
static void	*vdc_state;

/*
 * Controlling the verbosity of the error/debug messages
 *
 * vdc_msglevel - controls level of messages
 * vdc_matchinst - 64-bit variable where each bit corresponds
 *                 to the vdc instance to which vdc_msglevel applies.
 */
int		vdc_msglevel = 0x0;
uint64_t	vdc_matchinst = 0ull;

/*
 * Supported vDisk protocol version pairs.
 *
 * The first array entry is the latest and preferred version.
 */
static const vio_ver_t	vdc_version[] = {{1, 1}};

static struct cb_ops vdc_cb_ops = {
	vdc_open,	/* cb_open */
	vdc_close,	/* cb_close */
	vdc_strategy,	/* cb_strategy */
	vdc_print,	/* cb_print */
	vdc_dump,	/* cb_dump */
	vdc_read,	/* cb_read */
	vdc_write,	/* cb_write */
	vdc_ioctl,	/* cb_ioctl */
	nodev,		/* cb_devmap */
	nodev,		/* cb_mmap */
	nodev,		/* cb_segmap */
	nochpoll,	/* cb_chpoll */
	ddi_prop_op,	/* cb_prop_op */
	NULL,		/* cb_str */
	D_MP | D_64BIT,	/* cb_flag */
	CB_REV,		/* cb_rev */
	vdc_aread,	/* cb_aread */
	vdc_awrite	/* cb_awrite */
};

static struct dev_ops vdc_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	vdc_getinfo,	/* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	vdc_attach,	/* devo_attach */
	vdc_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	&vdc_cb_ops,	/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	nulldev		/* devo_power */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"virtual disk client",
	&vdc_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

/* -------------------------------------------------------------------------- */

/*
 * Device Driver housekeeping and setup
 */

int
_init(void)
{
	int	status;

	if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
		return (status);
	if ((status = mod_install(&modlinkage)) != 0)
		ddi_soft_state_fini(&vdc_state);
	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int	status;

	if ((status = mod_remove(&modlinkage)) != 0)
		return (status);
	ddi_soft_state_fini(&vdc_state);
	return (0);
}
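/*
 * Added note: the ordering in _init() and _fini() above matters -- the soft
 * state anchor must exist before mod_install() publishes the driver's entry
 * points (which may be called immediately), and it is only torn down after
 * mod_remove() succeeds, i.e. once no new entry point calls can arrive.
 */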
static int
vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
	_NOTE(ARGUNUSED(dip))

	int	instance = VDCUNIT((dev_t)arg);
	vdc_t	*vdc = NULL;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
			*resultp = NULL;
			return (DDI_FAILURE);
		}
		*resultp = vdc->dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);
	default:
		*resultp = NULL;
		return (DDI_FAILURE);
	}
}

static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	kt_did_t	failfast_tid, ownership_tid;
	int		instance;
	int		rv;
	vdc_t		*vdc = NULL;

	switch (cmd) {
	case DDI_DETACH:
		/* the real work happens below */
		break;
	case DDI_SUSPEND:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	ASSERT(cmd == DDI_DETACH);
	instance = ddi_get_instance(dip);
	DMSGX(1, "[%d] Entered\n", instance);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure",
		    instance);
		return (DDI_FAILURE);
	}

	/*
	 * This function is called when vdc is detached or if it has failed
	 * to attach. In the latter case, the attach may have failed before
	 * the vdisk type has been set, so we can't call vdc_is_opened().
	 * However, as the attach has failed, we know that the vdisk is not
	 * opened and we can safely detach.
	 */
	if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
		DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
		return (DDI_FAILURE);
	}

	if (vdc->dkio_flush_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
		    instance, vdc->dkio_flush_pending);
		return (DDI_FAILURE);
	}

	if (vdc->validate_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding validate request\n",
		    instance, vdc->validate_pending);
		return (DDI_FAILURE);
	}

	DMSG(vdc, 0, "[%d] proceeding...\n", instance);

	/* If we took ownership, release ownership */
	mutex_enter(&vdc->ownership_lock);
	if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
	}
	mutex_exit(&vdc->ownership_lock);

	/* mark instance as detaching */
	vdc->lifecycle = VDC_LC_DETACHING;

	/*
	 * try and disable callbacks to prevent another handshake
	 */
	if (vdc->curr_server != NULL) {
		rv = ldc_set_cb_mode(vdc->curr_server->ldc_handle,
		    LDC_CB_DISABLE);
		DMSG(vdc, 0, "callback disabled (rv=%d)\n", rv);
	}

	if (vdc->initialized & VDC_THREAD) {
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET)) {
			vdc->read_state = VDC_READ_RESET;
			cv_signal(&vdc->read_cv);
		}

		mutex_exit(&vdc->read_lock);

		/* wake up any thread waiting for connection to come online */
		mutex_enter(&vdc->lock);
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			DMSG(vdc, 0,
			    "[%d] write reset - move to resetting state...\n",
			    instance);
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		}
		mutex_exit(&vdc->lock);

		/* now wait until state transitions to VDC_STATE_DETACH */
		thread_join(vdc->msg_proc_thr->t_did);
		ASSERT(vdc->state == VDC_STATE_DETACH);
		DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
		    vdc->instance);
	}

	mutex_enter(&vdc->lock);

	if (vdc->initialized & VDC_DRING)
		vdc_destroy_descriptor_ring(vdc);

	vdc_fini_ports(vdc);

	if (vdc->failfast_thread) {
		failfast_tid = vdc->failfast_thread->t_did;
		vdc->failfast_interval = 0;
		cv_signal(&vdc->failfast_cv);
	} else {
		failfast_tid = 0;
	}

	if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
		ownership_tid = vdc->ownership_thread->t_did;
		vdc->ownership = VDC_OWNERSHIP_NONE;
		cv_signal(&vdc->ownership_cv);
	} else {
		ownership_tid = 0;
	}

	mutex_exit(&vdc->lock);

	if (failfast_tid != 0)
		thread_join(failfast_tid);

	if (ownership_tid != 0)
		thread_join(ownership_tid);

	if (vdc->initialized & VDC_MINOR) {
		ddi_prop_remove_all(dip);
		ddi_remove_minor_node(dip, NULL);
	}

	if (vdc->io_stats) {
		kstat_delete(vdc->io_stats);
		vdc->io_stats = NULL;
	}

	if (vdc->err_stats) {
		kstat_delete(vdc->err_stats);
		vdc->err_stats = NULL;
	}

	if (vdc->initialized & VDC_LOCKS) {
		mutex_destroy(&vdc->lock);
		mutex_destroy(&vdc->read_lock);
		mutex_destroy(&vdc->ownership_lock);
		cv_destroy(&vdc->initwait_cv);
		cv_destroy(&vdc->dring_free_cv);
		cv_destroy(&vdc->membind_cv);
		cv_destroy(&vdc->sync_pending_cv);
		cv_destroy(&vdc->sync_blocked_cv);
		cv_destroy(&vdc->read_cv);
		cv_destroy(&vdc->running_cv);
		cv_destroy(&vdc->ownership_cv);
		cv_destroy(&vdc->failfast_cv);
		cv_destroy(&vdc->failfast_io_cv);
	}

	if (vdc->minfo)
		kmem_free(vdc->minfo, sizeof (struct dk_minfo));

	if (vdc->cinfo)
		kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));

	if (vdc->vtoc)
		kmem_free(vdc->vtoc, sizeof (struct vtoc));

	if (vdc->geom)
		kmem_free(vdc->geom, sizeof (struct dk_geom));

	if (vdc->devid) {
		ddi_devid_unregister(dip);
		ddi_devid_free(vdc->devid);
	}

	if (vdc->initialized & VDC_SOFT_STATE)
		ddi_soft_state_free(vdc_state, instance);

	DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);

	return (DDI_SUCCESS);
}
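/*
 * Added note: vdc_attach() below calls vdc_detach() on any attach failure,
 * so vdc_detach() is written to cope with a partially initialised instance;
 * each teardown step above is guarded by the vdc->initialized bitmask or by
 * a NULL check on the resource it releases.
 */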
static int
vdc_do_attach(dev_info_t *dip)
{
	int		instance;
	vdc_t		*vdc = NULL;
	int		status;
	md_t		*mdp;
	mde_cookie_t	vd_node;

	ASSERT(dip != NULL);

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
		    instance);
		return (DDI_FAILURE);
	}

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure",
		    instance);
		return (DDI_FAILURE);
	}

	/*
	 * We assign the value to initialized in this case to zero out the
	 * variable and then set bits in it to indicate what has been done
	 */
	vdc->initialized = VDC_SOFT_STATE;

	vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
	vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);

	vdc->dip	= dip;
	vdc->instance	= instance;
	vdc->vdisk_type	= VD_DISK_TYPE_UNK;
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->state	= VDC_STATE_INIT;
	vdc->lifecycle	= VDC_LC_ATTACHING;
	vdc->session_id = 0;
	vdc->block_size = DEV_BSIZE;
	vdc->max_xfer_sz = maxphys / DEV_BSIZE;

	/*
	 * We assume, for now, that the vDisk server will export 'read'
	 * operations to us at a minimum (this is needed because of checks
	 * in vdc for supported operations early in the handshake process).
	 * The vDisk server will return ENOTSUP if this is not the case.
	 * The value will be overwritten during the attribute exchange with
	 * the bitmask of operations exported by the server.
	 */
	vdc->operations = VD_OP_MASK_READ;

	vdc->vtoc	= NULL;
	vdc->geom	= NULL;
	vdc->cinfo	= NULL;
	vdc->minfo	= NULL;

	mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);

	vdc->threads_pending = 0;
	vdc->sync_op_pending = B_FALSE;
	vdc->sync_op_blocked = B_FALSE;
	cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);

	/* init blocking msg read functionality */
	mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
	vdc->read_state = VDC_READ_IDLE;

	vdc->initialized |= VDC_LOCKS;

	/* get device and port MD node for this disk instance */
	if (vdc_get_md_node(dip, &mdp, &vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Could not get machine description node",
		    instance);
		return (DDI_FAILURE);
	}

	if (vdc_init_ports(vdc, mdp, vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Error initialising ports", instance);
		return (DDI_FAILURE);
	}

	(void) md_fini_handle(mdp);

	/* initialize the thread responsible for managing state with server */
	vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
	    vdc, 0, &p0, TS_RUN, minclsyspri);
	if (vdc->msg_proc_thr == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
		    instance);
		return (DDI_FAILURE);
	}

	vdc->initialized |= VDC_THREAD;

	/* Create the kstats for saving the I/O statistics used by iostat(1M) */
	vdc_create_io_kstats(vdc);
	vdc_create_err_kstats(vdc);

	atomic_inc_32(&vdc_instance_count);

	/*
	 * Check the disk label. This will send requests and do the handshake.
	 * We don't really care about the disk label now. What we really need
	 * is the handshake to be done so that we know the type of the disk
	 * (slice or full disk) and the appropriate device nodes can be
	 * created.
	 */
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP);
	vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
	vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	mutex_exit(&vdc->lock);

	/*
	 * Now that we have the device info we can create the
	 * device nodes and properties
	 */
	status = vdc_create_device_nodes(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device nodes",
		    instance);
		goto return_status;
	}
	status = vdc_create_device_nodes_props(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device nodes"
		    " properties (%d)", instance, status);
		goto return_status;
	}

	/*
	 * Setup devid
	 */
	if (vdc_setup_devid(vdc)) {
		DMSG(vdc, 0, "[%d] No device id available\n", instance);
	}

	/*
	 * Fill in the fields of the error statistics kstat that were not
	 * available when creating the kstat
	 */
	vdc_set_err_kstats(vdc);

	ddi_report_dev(dip);
	vdc->lifecycle = VDC_LC_ONLINE;
	DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);

return_status:
	DMSG(vdc, 0, "[%d] Attach completed\n", instance);
	return (status);
}

static int
vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	status;

	switch (cmd) {
	case DDI_ATTACH:
		if ((status = vdc_do_attach(dip)) != 0)
			(void) vdc_detach(dip, DDI_DETACH);
		return (status);
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}

static int
vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr)
{
	int		status = 0;
	ldc_status_t	ldc_state;
	ldc_attr_t	ldc_attr;

	ASSERT(vdc != NULL);
	ASSERT(srvr != NULL);

	ldc_attr.devclass = LDC_DEV_BLK;
	ldc_attr.instance = vdc->instance;
	ldc_attr.mode = LDC_MODE_UNRELIABLE;	/* unreliable transport */
	ldc_attr.mtu = VD_LDC_MTU;

	if ((srvr->state & VDC_LDC_INIT) == 0) {
		status = ldc_init(srvr->ldc_id, &ldc_attr,
		    &srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			return (status);
		}
		srvr->state |= VDC_LDC_INIT;
	}
	status = ldc_status(srvr->ldc_handle, &ldc_state);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
		    vdc->instance, status);
		goto init_exit;
	}
	srvr->ldc_state = ldc_state;

	if ((srvr->state & VDC_LDC_CB) == 0) {
		status = ldc_reg_callback(srvr->ldc_handle, vdc_handle_cb,
		    (caddr_t)srvr);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
			    vdc->instance, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_CB;
	}

	/*
	 * At this stage we have initialised LDC, we will now try and open
	 * the connection.
	 */
	if (srvr->ldc_state == LDC_INIT) {
		status = ldc_open(srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_OPEN;
	}

init_exit:
	if (status) {
		vdc_terminate_ldc(vdc, srvr);
	}

	return (status);
}

static int
vdc_start_ldc_connection(vdc_t *vdc)
{
	int	status = 0;

	ASSERT(vdc != NULL);

	ASSERT(MUTEX_HELD(&vdc->lock));

	status = vdc_do_ldc_up(vdc);

	DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);

	return (status);
}

static int
vdc_stop_ldc_connection(vdc_t *vdcp)
{
	int	status;

	ASSERT(vdcp != NULL);

	ASSERT(MUTEX_HELD(&vdcp->lock));

	DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
	    vdcp->state);

	status = ldc_down(vdcp->curr_server->ldc_handle);
	DMSG(vdcp, 0, "ldc_down() = %d\n", status);

	vdcp->initialized &= ~VDC_HANDSHAKE;
	DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);

	return (status);
}

static void
vdc_create_io_kstats(vdc_t *vdc)
{
	if (vdc->io_stats != NULL) {
		DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance);
		return;
	}

	vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL,
	    "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (vdc->io_stats != NULL) {
		vdc->io_stats->ks_lock = &vdc->lock;
		kstat_install(vdc->io_stats);
	} else {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics"
		    " will not be gathered", vdc->instance);
	}
}

static void
vdc_create_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;
	char	kstatmodule_err[KSTAT_STRLEN];
	char	kstatname[KSTAT_STRLEN];
	int	ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t));
	int	instance = vdc->instance;

	if (vdc->err_stats != NULL) {
		DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance);
		return;
	}

	(void) snprintf(kstatmodule_err, sizeof (kstatmodule_err),
	    "%serr", VDC_DRIVER_NAME);
	(void) snprintf(kstatname, sizeof (kstatname),
	    "%s%d,err", VDC_DRIVER_NAME, instance);

	vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname,
	    "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

	if (vdc->err_stats == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics"
		    " will not be gathered", instance);
		return;
	}

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	kstat_named_init(&stp->vd_softerrs, "Soft Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_transerrs, "Transport Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_protoerrs, "Protocol Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_vid, "Vendor",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_pid, "Product",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_capacity, "Size",
	    KSTAT_DATA_ULONGLONG);

	vdc->err_stats->ks_update = nulldev;

	kstat_install(vdc->err_stats);
}

static void
vdc_set_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;

	if (vdc->err_stats == NULL)
		return;

	mutex_enter(&vdc->lock);

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	ASSERT(stp != NULL);

	stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->block_size;
	(void) strcpy(stp->vd_vid.value.c, "SUN");
	(void) strcpy(stp->vd_pid.value.c, "VDSK");

	mutex_exit(&vdc->lock);
}
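/*
 * Added usage note: the statistics set up above can be inspected from the
 * guest with kstat(1M). Assuming VDC_DRIVER_NAME expands to "vdc", the I/O
 * kstats appear under module "vdc" (and feed iostat(1M)), while the error
 * kstats appear under module "vdcerr" with names of the form
 * "vdc<instance>,err".
 */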
static int
vdc_create_device_nodes_efi(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "h");
	ddi_remove_minor_node(vdc->dip, "h,raw");

	if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

static int
vdc_create_device_nodes_vtoc(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "wd");
	ddi_remove_minor_node(vdc->dip, "wd,raw");

	if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}
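/*
 * Added note: the two helpers above are alternatives selected by the disk
 * label -- a VTOC-labelled disk exposes slice 7 as the conventional 'h' /
 * 'h,raw' minors, while an EFI-labelled disk replaces them with 'wd' /
 * 'wd,raw' minors naming the whole disk; each helper first removes the
 * other's nodes so that relabelling a disk swaps the minors cleanly.
 */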
/*
 * Function:
 *	vdc_create_device_nodes
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices along with the node properties. It is called as part of
 *	the attach(9E) of the instance during the handshake with vds after
 *	vds has sent the attributes to vdc.
 *
 *	If the device is of type VD_DISK_TYPE_SLICE then minor node 2 is
 *	used, in keeping with the Solaris convention that slice 2 refers
 *	to a whole disk. Slices start at 'a'.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	0		- Success
 *	EIO		- Failed to create node
 *	EINVAL		- Unknown type of disk exported
 */
static int
vdc_create_device_nodes(vdc_t *vdc)
{
	char		name[sizeof ("s,raw")];
	dev_info_t	*dip = NULL;
	int		instance, status;
	int		num_slices = 1;
	int		i;

	ASSERT(vdc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		return (EINVAL);
	}

	/*
	 * Minor nodes are different for EFI disks: EFI disks do not have
	 * a minor node 'h' for the minor number corresponding to slice
	 * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
	 * representing the whole disk.
	 */
	for (i = 0; i < num_slices; i++) {

		if (i == VD_EFI_WD_SLICE) {
			if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
				status = vdc_create_device_nodes_efi(vdc);
			else
				status = vdc_create_device_nodes_vtoc(vdc);
			if (status != 0)
				return (status);
			continue;
		}

		(void) snprintf(name, sizeof (name), "%c", 'a' + i);
		if (ddi_create_minor_node(dip, name, S_IFBLK,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) !=
		    DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
			    instance, name);
			return (EIO);
		}

		/* if any device node is created we set this flag */
		vdc->initialized |= VDC_MINOR;

		(void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");

		if (ddi_create_minor_node(dip, name, S_IFCHR,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) !=
		    DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
			    instance, name);
			return (EIO);
		}
	}

	return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes_props
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices along with the node properties. It is called as part of
 *	the attach(9E) of the instance during the handshake with vds after
 *	vds has sent the attributes to vdc.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	0		- Success
 *	EIO		- Failed to create device node property
 *	EINVAL		- Unknown type of disk exported
 */
static int
vdc_create_device_nodes_props(vdc_t *vdc)
{
	dev_info_t	*dip = NULL;
	int		instance;
	int		num_slices = 1;
	int64_t		size = 0;
	dev_t		dev;
	int		rv;
	int		i;

	ASSERT(vdc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		return (EINVAL);
	}

	if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
		/* remove all properties */
		for (i = 0; i < num_slices; i++) {
			dev = makedevice(ddi_driver_major(dip),
			    VD_MAKE_DEV(instance, i));
			(void) ddi_prop_remove(dev, dip, VDC_SIZE_PROP_NAME);
			(void) ddi_prop_remove(dev, dip,
			    VDC_NBLOCKS_PROP_NAME);
		}
		return (0);
	}

	for (i = 0; i < num_slices; i++) {
		dev = makedevice(ddi_driver_major(dip),
		    VD_MAKE_DEV(instance, i));

		size = vdc->slice[i].nblocks * vdc->block_size;
		DMSG(vdc, 0, "[%d] sz %ld (%ld Mb) p_size %lx\n",
		    instance, size, size / (1024 * 1024),
		    vdc->slice[i].nblocks);

		rv = ddi_prop_update_int64(dev, dip, VDC_SIZE_PROP_NAME, size);
		if (rv != DDI_PROP_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop of [%ld]",
			    instance, VDC_SIZE_PROP_NAME, size);
			return (EIO);
		}

		rv = ddi_prop_update_int64(dev, dip, VDC_NBLOCKS_PROP_NAME,
		    lbtodb(size));
		if (rv != DDI_PROP_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add '%s' prop [%llu]",
			    instance, VDC_NBLOCKS_PROP_NAME, lbtodb(size));
			return (EIO);
		}
	}

	return (0);
}
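/*
 * Added note: the two properties set above describe each slice both ways --
 * VDC_SIZE_PROP_NAME carries the slice size in bytes, and
 * VDC_NBLOCKS_PROP_NAME carries the same quantity converted to 512-byte
 * blocks via lbtodb().
 */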
/*
 * Function:
 *	vdc_is_opened
 *
 * Description:
 *	This function checks if any slice of a given virtual disk is
 *	currently opened.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	B_TRUE		- at least one slice is opened.
 *	B_FALSE		- no slice is opened.
 */
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
	int i, nslices;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
		nslices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		nslices = 1;
		break;
	case VD_DISK_TYPE_UNK:
	default:
		ASSERT(0);
	}

	/* check if there's any layered open */
	for (i = 0; i < nslices; i++) {
		if (vdc->open_lyr[i] > 0)
			return (B_TRUE);
	}

	/* check if there is any other kind of open */
	for (i = 0; i < OTYPCNT; i++) {
		if (vdc->open[i] != 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}

static int
vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;
	int i;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	/* check if slice is already exclusively opened */
	if (vdc->open_excl & slicemask)
		return (EBUSY);

	/* if open exclusive, check if slice is already opened */
	if (flag & FEXCL) {
		if (vdc->open_lyr[slice] > 0)
			return (EBUSY);
		for (i = 0; i < OTYPCNT; i++) {
			if (vdc->open[i] & slicemask)
				return (EBUSY);
		}
		vdc->open_excl |= slicemask;
	}

	/* mark slice as opened */
	if (otyp == OTYP_LYR) {
		vdc->open_lyr[slice]++;
	} else {
		vdc->open[otyp] |= slicemask;
	}

	return (0);
}

static void
vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	if (otyp == OTYP_LYR) {
		ASSERT(vdc->open_lyr[slice] > 0);
		vdc->open_lyr[slice]--;
	} else {
		vdc->open[otyp] &= ~slicemask;
	}

	if (flag & FEXCL)
		vdc->open_excl &= ~slicemask;
}

static int
vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance, nodelay;
	int	slice, status = 0;
	vdc_t	*vdc;

	ASSERT(dev != NULL);
	instance = VDCUNIT(*dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure",
		    instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
	    getminor(*dev), flag, otyp);

	slice = VDCPART(*dev);

	nodelay = flag & (FNDELAY | FNONBLOCK);

	if ((flag & FWRITE) && (!nodelay) &&
	    !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE))) {
		return (EROFS);
	}

	mutex_enter(&vdc->lock);

	status = vdc_mark_opened(vdc, slice, flag, otyp);

	if (status != 0) {
		mutex_exit(&vdc->lock);
		return (status);
	}

	if (nodelay) {

		/* don't resubmit a validate request if there's already one */
		if (vdc->validate_pending > 0) {
			mutex_exit(&vdc->lock);
			return (0);
		}

		/* call vdc_validate() asynchronously to avoid blocking */
		if (taskq_dispatch(system_taskq, vdc_validate_task,
		    (void *)vdc, TQ_NOSLEEP) == NULL) {
			vdc_mark_closed(vdc, slice, flag, otyp);
			mutex_exit(&vdc->lock);
			return (ENXIO);
		}

		vdc->validate_pending++;
		mutex_exit(&vdc->lock);
		return (0);
	}

	mutex_exit(&vdc->lock);

	vdc_validate(vdc);

	mutex_enter(&vdc->lock);

	if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
	    vdc->slice[slice].nblocks == 0) {
		vdc_mark_closed(vdc, slice, flag, otyp);
		status = EIO;
	}

	mutex_exit(&vdc->lock);

	return (status);
}

static int
vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance;
	int	slice;
	int	rv, rval;
	vdc_t	*vdc;

	instance = VDCUNIT(dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure",
		    instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);

	slice = VDCPART(dev);

	/*
	 * Attempt to flush the W$ on a close operation. If this is
	 * not a supported IOCTL command or the backing device is read-only
	 * do not fail the close operation.
	 */
	rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);

	if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
		DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
		    instance, rv);
		return (EIO);
	}

	mutex_enter(&vdc->lock);
	vdc_mark_closed(vdc, slice, flag, otyp);
	mutex_exit(&vdc->lock);

	return (0);
}

static int
vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	_NOTE(ARGUNUSED(credp))

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
}

static int
vdc_print(dev_t dev, char *str)
{
	cmn_err(CE_NOTE, "vdc%d: %s", VDCUNIT(dev), str);
	return (0);
}

static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	int	rv;
	size_t	nbytes = nblk * DEV_BSIZE;
	int	instance = VDCUNIT(dev);
	vdc_t	*vdc = NULL;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure",
		    instance);
		return (ENXIO);
	}

	DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
	    instance, nbytes, blkno, (void *)addr);
	rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
	    VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir);
	if (rv) {
		DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
		return (rv);
	}

	if (ddi_in_panic())
		(void) vdc_drain_response(vdc);

	DMSG(vdc, 0, "[%d] End\n", instance);

	return (0);
}
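/*
 * Added note: vdc_dump() runs on the crash dump path, where interrupts and
 * the message processing thread may no longer be running, so after queueing
 * the write it polls for completions itself via vdc_drain_response() when
 * panicking, instead of waiting for the LDC callback to signal completion.
 */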
/* -------------------------------------------------------------------------- */

/*
 * Disk access routines
 *
 */

/*
 * vdc_strategy()
 *
 * Return Value:
 *	0:	As per strategy(9E), the strategy() function must return 0
 *		[ bioerror(9f) sets b_flags to the proper error code ]
 */
static int
vdc_strategy(struct buf *buf)
{
	int	rv = -1;
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(buf->b_edev);
	int	op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE;
	int	slice;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure",
		    instance);
		bioerror(buf, ENXIO);
		biodone(buf);
		return (0);
	}

	DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
	    instance, (buf->b_flags & B_READ) ? "Read" : "Write",
	    buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr);

	bp_mapin(buf);

	if ((long)buf->b_private == VD_SLICE_NONE) {
		/* I/O using an absolute disk offset */
		slice = VD_SLICE_NONE;
	} else {
		slice = VDCPART(buf->b_edev);
	}

	rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr,
	    buf->b_bcount, slice, buf->b_lblkno,
	    CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir :
	    VIO_write_dir);

	/*
	 * If the request was successfully sent, the strategy call returns
	 * and the ACK handler calls the bioxxx functions when the vDisk
	 * server is done; otherwise we handle the error here.
	 */
	if (rv) {
		DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv);
		bioerror(buf, rv);
		biodone(buf);
	}

	return (0);
}

/*
 * Function:
 *	vdc_min
 *
 * Description:
 *	Routine to limit the size of a data transfer. Used in
 *	conjunction with physio(9F).
 *
 * Arguments:
 *	bp - pointer to the indicated buf(9S) struct.
 *
 */
static void
vdc_min(struct buf *bufp)
{
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(bufp->b_edev);

	vdc = ddi_get_soft_state(vdc_state, instance);
	VERIFY(vdc != NULL);

	if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) {
		bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size;
	}
}

static int
vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio));
}

static int
vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio));
}

static int
vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio));
}

static int
vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio));
}
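/*
 * Added note: in the read/write entry points above, vdc_min() plays the
 * minphys(9F) role that physio(9F)/aphysio(9F) expect -- it clamps b_bcount
 * to max_xfer_sz blocks, and physio transparently splits any larger request
 * into several vdc_strategy() calls of at most that size.
 */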
/* -------------------------------------------------------------------------- */

/*
 * Handshake support
 */


/*
 * Function:
 *	vdc_init_ver_negotiation()
 *
 * Description:
 *	Set the session ID for this connection and send a version
 *	negotiation (VER_INFO) message to the vDisk server proposing
 *	the given protocol version.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	ver	- the protocol version to propose.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
{
	vio_ver_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status = -1;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);

	/*
	 * set the Session ID to a unique value
	 * (the lower 32 bits of the clock tick)
	 */
	vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
	DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance,
	    vdc->session_id);

	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_VER_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	pkt.dev_class = VDEV_DISK;
	pkt.ver_major = ver.major;
	pkt.ver_minor = ver.minor;

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
	    vdc->instance, status);
	if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vio_ver_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_ver_negotiation()
 *
 * Description:
 *	Start version negotiation with the vDisk server, wait for its
 *	response and hand the reply off to vdc_handle_ver_msg().
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_ver_negotiation(vdc_t *vdcp)
{
	vio_msg_t vio_msg;
	int status;

	if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0]))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Ver negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
}

/*
 * Function:
 *	vdc_init_attr_negotiation()
 *
 * Description:
 *	Send an attribute (ATTR_INFO) message to the vDisk server
 *	advertising this client's transfer parameters.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_attr_negotiation(vdc_t *vdc)
{
	vd_attr_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] entered\n", vdc->instance);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.max_xfer_sz = vdc->max_xfer_sz;
	pkt.vdisk_block_size = vdc->block_size;
	pkt.xfer_mode = VIO_DRING_MODE_V1_0;
	pkt.operations = 0;	/* server will set bits of valid operations */
	pkt.vdisk_type = 0;	/* server will set to valid device type */
	pkt.vdisk_media = 0;	/* server will set to valid media type */
	pkt.vdisk_size = 0;	/* server will set to valid size */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);

	if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vd_attr_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_attr_negotiation()
 *
 * Description:
 *	Start attribute negotiation with the vDisk server, wait for its
 *	response and hand the reply off to vdc_handle_attr_msg().
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_attr_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_attr_negotiation(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Attr negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_init_dring_negotiate()
 *
 * Description:
 *	Register this client's descriptor ring with the vDisk server by
 *	sending a DRING_REG message after (re)initialising the local ring.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_dring_negotiate(vdc_t *vdc)
{
	vio_dring_reg_msg_t	pkt;
	size_t			msglen = sizeof (pkt);
	int			status = -1;
	int			retry;
	int			nretries = 10;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	for (retry = 0; retry < nretries; retry++) {
		status = vdc_init_descriptor_ring(vdc);
		if (status != EAGAIN)
			break;
		drv_usecwait(vdc_min_timeout_ldc);
	}

	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
		    vdc->instance, status);
		return (status);
	}

	DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
	    vdc->instance, status);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_DRING_REG;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.dring_ident = 0;
	pkt.num_descriptors = vdc->dring_len;
	pkt.descriptor_size = vdc->dring_entry_size;
	pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
	pkt.ncookies = vdc->dring_cookie_count;
	pkt.cookie[0] = vdc->dring_cookie[0];	/* for now just one cookie */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
		    vdc->instance, status);
	}

	return (status);
}


/*
 * Function:
 *	vdc_dring_negotiation()
 *
 * Description:
 *	Start descriptor ring registration with the vDisk server, wait
 *	for its response and hand the reply off to
 *	vdc_handle_dring_reg_msg().
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_dring_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_init_dring_negotiate(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Dring negotiation response,"
		    " rv(%d)", vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_dring_reg_msg(vdcp,
	    (vio_dring_reg_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_send_rdx()
 *
 * Description:
 *	Send an RDX message to the vDisk server to indicate that this
 *	client is ready to exchange data.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_send_rdx(vdc_t *vdcp)
{
	vio_msg_t	msg;
	size_t		msglen = sizeof (vio_msg_t);
	int		status;

	/*
	 * Send an RDX message to vds to indicate we are ready
	 * to send data
	 */
	msg.tag.vio_msgtype = VIO_TYPE_CTRL;
	msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
	msg.tag.vio_subtype_env = VIO_RDX;
	msg.tag.vio_sid = vdcp->session_id;
	status = vdc_send(vdcp, (caddr_t)&msg, &msglen);
	if (status != 0) {
		DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)",
		    vdcp->instance, status);
	}

	return (status);
}

/*
 * Function:
 *	vdc_handle_rdx()
 *
 * Description:
 *	Process an RDX ACK from the vDisk server; nothing needs to be
 *	done beyond sanity-checking the message tag.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *	msgp	- received msg
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp)
{
	_NOTE(ARGUNUSED(vdcp))
	_NOTE(ARGUNUSED(msgp))

	ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL);
	ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK);
	ASSERT(msgp->tag.vio_subtype_env == VIO_RDX);

	DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance);

	return (0);
}

/*
 * Function:
 *	vdc_rdx_exchange()
 *
 * Description:
 *	Complete the handshake by sending an RDX message to the vDisk
 *	server and waiting for its RDX ACK.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_rdx_exchange(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if (status = vdc_send_rdx(vdcp))
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) {
		DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg));
}
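/*
 * Added summary: a complete handshake runs the exchanges above in order --
 * vdc_ver_negotiation(), vdc_attr_negotiation(), vdc_dring_negotiation()
 * and finally vdc_rdx_exchange(). Each step sends a VIO_SUBTYPE_INFO
 * message, drops vdc->lock while blocking in vdc_wait_for_response(), and
 * only proceeds once the server's response has been validated.
 */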
2027  */
2028     delay_time = vdc_ldc_read_init_delay;
2029 loop:
2030     len = *nbytesp;
2031     status = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)msgp, &len);
2032     switch (status) {
2033     case EAGAIN:
2034         delay_time *= 2;
2035         if (delay_time >= vdc_ldc_read_max_delay)
2036             delay_time = vdc_ldc_read_max_delay;
2037         delay(delay_time);
2038         goto loop;
2039
2040     case 0:
2041         if (len == 0) {
2042             DMSG(vdc, 1, "[%d] ldc_read returned 0 bytes with "
2043                 "no error!\n", vdc->instance);
2044             goto loop;
2045         }
2046
2047         *nbytesp = len;
2048
2049         /*
2050          * If there are pending messages, leave the
2051          * read state as pending. Otherwise, set the state
2052          * back to idle.
2053          */
2054         status = ldc_chkq(vdc->curr_server->ldc_handle, &q_has_pkts);
2055         if (status == 0 && !q_has_pkts)
2056             vdc->read_state = VDC_READ_IDLE;
2057
2058         break;
2059     default:
2060         DMSG(vdc, 0, "ldc_read returned %d\n", status);
2061         break;
2062     }
2063
2064 done:
2065     mutex_exit(&vdc->read_lock);
2066
2067     return (status);
2068 }
2069
2070
2071
2072 #ifdef DEBUG
2073 void
2074 vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg)
2075 {
2076     char *ms, *ss, *ses;
2077     switch (msg->tag.vio_msgtype) {
2078 #define Q(_s) case _s : ms = #_s; break;
2079     Q(VIO_TYPE_CTRL)
2080     Q(VIO_TYPE_DATA)
2081     Q(VIO_TYPE_ERR)
2082 #undef Q
2083     default: ms = "unknown"; break;
2084     }
2085
2086     switch (msg->tag.vio_subtype) {
2087 #define Q(_s) case _s : ss = #_s; break;
2088     Q(VIO_SUBTYPE_INFO)
2089     Q(VIO_SUBTYPE_ACK)
2090     Q(VIO_SUBTYPE_NACK)
2091 #undef Q
2092     default: ss = "unknown"; break;
2093     }
2094
2095     switch (msg->tag.vio_subtype_env) {
2096 #define Q(_s) case _s : ses = #_s; break;
2097     Q(VIO_VER_INFO)
2098     Q(VIO_ATTR_INFO)
2099     Q(VIO_DRING_REG)
2100     Q(VIO_DRING_UNREG)
2101     Q(VIO_RDX)
2102     Q(VIO_PKT_DATA)
2103     Q(VIO_DESC_DATA)
2104     Q(VIO_DRING_DATA)
2105 #undef Q
2106     default: ses = "unknown"; break;
2107     }
2108
2109     DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n",
2110         msg->tag.vio_msgtype, msg->tag.vio_subtype,
2111         msg->tag.vio_subtype_env, ms, ss, ses);
2112 }
2113 #endif
2114
2115 /*
2116  * Function:
2117  *    vdc_send()
2118  *
2119  * Description:
2120  *    The function encapsulates the call to write a message using LDC.
2121  *    If LDC indicates that the call failed due to the queue being full,
2122  *    we retry the ldc_write(), otherwise we return the error returned by LDC.
2123  *
2124  * Arguments:
2125  *    vdc - soft state pointer for this instance of the device driver.
2126  *    pkt - address of LDC message to be sent
2127  *    msglen - the size of the message being sent. When the function
2128  *             returns, this contains the number of bytes written.
2129  *
2130  * Return Code:
2131  *    0 - Success.
2132  *    EINVAL - pkt or msglen were NULL
2133  *    ECONNRESET - The connection was not up.
2134  *    EWOULDBLOCK - LDC queue is full
2135  *    xxx - other error codes returned by ldc_write
2136  */
2137 static int
2138 vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen)
2139 {
2140     size_t size = 0;
2141     int status = 0;
2142     clock_t delay_ticks;
2143
2144     ASSERT(vdc != NULL);
2145     ASSERT(mutex_owned(&vdc->lock));
2146     ASSERT(msglen != NULL);
2147     ASSERT(*msglen != 0);
2148
2149 #ifdef DEBUG
2150     vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt);
2151 #endif
2152     /*
2153      * Wait indefinitely to send if channel
2154      * is busy, but bail out if we succeed or
2155      * if the channel closes or is reset.
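 *
 *    The same geometric backoff is used as on the read side: each
 *    blocked attempt sleeps for delay_ticks and then doubles it,
 *    capped at vdc_hz_max_ldc_delay.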
2156  */
2157     delay_ticks = vdc_hz_min_ldc_delay;
2158     do {
2159         size = *msglen;
2160         status = ldc_write(vdc->curr_server->ldc_handle, pkt, &size);
2161         if (status == EWOULDBLOCK) {
2162             delay(delay_ticks);
2163             /* geometric backoff */
2164             delay_ticks *= 2;
2165             if (delay_ticks > vdc_hz_max_ldc_delay)
2166                 delay_ticks = vdc_hz_max_ldc_delay;
2167         }
2168     } while (status == EWOULDBLOCK);
2169
2170     /* if LDC had serious issues --- reset vdc state */
2171     if (status == EIO || status == ECONNRESET) {
2172         /* wake up any readers blocked on this channel */
2173         mutex_enter(&vdc->read_lock);
2174         if ((vdc->read_state == VDC_READ_WAITING) ||
2175             (vdc->read_state == VDC_READ_RESET))
2176             cv_signal(&vdc->read_cv);
2177         vdc->read_state = VDC_READ_RESET;
2178         mutex_exit(&vdc->read_lock);
2179
2180         /* wake up any waiters in the reset thread */
2181         if (vdc->state == VDC_STATE_INIT_WAITING) {
2182             DMSG(vdc, 0, "[%d] write reset - "
2183                 "vdc is resetting ..\n", vdc->instance);
2184             vdc->state = VDC_STATE_RESETTING;
2185             cv_signal(&vdc->initwait_cv);
2186         }
2187
2188         return (ECONNRESET);
2189     }
2190
2191     /* return the last size written */
2192     *msglen = size;
2193
2194     return (status);
2195 }
2196
2197 /*
2198  * Function:
2199  *    vdc_get_md_node
2200  *
2201  * Description:
2202  *    Get the MD and the device node for the given disk instance. The
2203  *    caller is responsible for cleaning up the reference to the
2204  *    returned MD (mdpp) by calling md_fini_handle().
2205  *
2206  * Arguments:
2207  *    dip - dev info pointer for this instance of the device driver.
2208  *    mdpp - the returned MD.
2209  *    vd_nodep - the returned device node.
2210  *
2211  * Return Code:
2212  *    0 - Success.
2213  *    ENOENT - Expected node or property did not exist.
2214  *    ENXIO - Unexpected error communicating with MD framework
2215  */
2216 static int
2217 vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep)
2218 {
2219     int status = ENOENT;
2220     char *node_name = NULL;
2221     md_t *mdp = NULL;
2222     int num_nodes;
2223     int num_vdevs;
2224     mde_cookie_t rootnode;
2225     mde_cookie_t *listp = NULL;
2226     boolean_t found_inst = B_FALSE;
2227     int listsz;
2228     int idx;
2229     uint64_t md_inst;
2230     int obp_inst;
2231     int instance = ddi_get_instance(dip);
2232
2233     /*
2234      * Get the OBP instance number for comparison with the MD instance
2235      *
2236      * The "cfg-handle" property of a vdc node in an MD contains the MD's
2237      * notion of "instance", or unique identifier, for that node; OBP
2238      * stores the value of the "cfg-handle" MD property as the value of
2239      * the "reg" property on the node in the device tree it builds from
2240      * the MD and passes to Solaris. Thus, we look up the devinfo node's
2241      * "reg" property value to uniquely identify this device instance.
2242      * If the "reg" property cannot be found, the device tree state is
2243      * presumably so broken that there is no point in continuing.
2244      */
2245     if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) {
2246         cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG);
2247         return (ENOENT);
2248     }
2249     obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2250         OBP_REG, -1);
2251     DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst);
2252
2253     /*
2254      * We now walk the MD nodes to find the node for this vdisk.
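 *
 *    The walk follows the usual MD DAG-scan pattern, roughly (error
 *    handling omitted; names as used in the code below):
 *
 *        mdp = md_get_handle();
 *        listp = kmem_zalloc(md_node_count(mdp) *
 *            sizeof (mde_cookie_t), KM_SLEEP);
 *        num_vdevs = md_scan_dag(mdp, md_root_node(mdp),
 *            md_find_name(mdp, VDC_MD_VDEV_NAME),
 *            md_find_name(mdp, "fwd"), listp);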
2255 */ 2256 if ((mdp = md_get_handle()) == NULL) { 2257 cmn_err(CE_WARN, "unable to init machine description"); 2258 return (ENXIO); 2259 } 2260 2261 num_nodes = md_node_count(mdp); 2262 ASSERT(num_nodes > 0); 2263 2264 listsz = num_nodes * sizeof (mde_cookie_t); 2265 2266 /* allocate memory for nodes */ 2267 listp = kmem_zalloc(listsz, KM_SLEEP); 2268 2269 rootnode = md_root_node(mdp); 2270 ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); 2271 2272 /* 2273 * Search for all the virtual devices, we will then check to see which 2274 * ones are disk nodes. 2275 */ 2276 num_vdevs = md_scan_dag(mdp, rootnode, 2277 md_find_name(mdp, VDC_MD_VDEV_NAME), 2278 md_find_name(mdp, "fwd"), listp); 2279 2280 if (num_vdevs <= 0) { 2281 cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); 2282 status = ENOENT; 2283 goto done; 2284 } 2285 2286 DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs); 2287 for (idx = 0; idx < num_vdevs; idx++) { 2288 status = md_get_prop_str(mdp, listp[idx], "name", &node_name); 2289 if ((status != 0) || (node_name == NULL)) { 2290 cmn_err(CE_NOTE, "Unable to get name of node type '%s'" 2291 ": err %d", VDC_MD_VDEV_NAME, status); 2292 continue; 2293 } 2294 2295 DMSGX(1, "[%d] Found node '%s'\n", instance, node_name); 2296 if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { 2297 status = md_get_prop_val(mdp, listp[idx], 2298 VDC_MD_CFG_HDL, &md_inst); 2299 DMSGX(1, "[%d] vdc inst in MD=%lx\n", 2300 instance, md_inst); 2301 if ((status == 0) && (md_inst == obp_inst)) { 2302 found_inst = B_TRUE; 2303 break; 2304 } 2305 } 2306 } 2307 2308 if (!found_inst) { 2309 DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME); 2310 status = ENOENT; 2311 goto done; 2312 } 2313 DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst); 2314 2315 *vd_nodep = listp[idx]; 2316 *mdpp = mdp; 2317 done: 2318 kmem_free(listp, listsz); 2319 return (status); 2320 } 2321 2322 /* 2323 * Function: 2324 * vdc_init_ports 2325 * 2326 * Description: 2327 * Initialize all the ports for this vdisk instance. 2328 * 2329 * Arguments: 2330 * vdc - soft state pointer for this instance of the device driver. 2331 * mdp - md pointer 2332 * vd_nodep - device md node. 2333 * 2334 * Return Code: 2335 * 0 - Success. 2336 * ENOENT - Expected node or property did not exist. 2337 */ 2338 static int 2339 vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep) 2340 { 2341 int status = 0; 2342 int idx; 2343 int num_nodes; 2344 int num_vports; 2345 int num_chans; 2346 int listsz; 2347 mde_cookie_t vd_port; 2348 mde_cookie_t *chanp = NULL; 2349 mde_cookie_t *portp = NULL; 2350 vdc_server_t *srvr; 2351 vdc_server_t *prev_srvr = NULL; 2352 2353 /* 2354 * We now walk the MD nodes to find the port nodes for this vdisk. 
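 *
 *    Each port node names one vdisk server: the first one that
 *    initialises successfully becomes curr_server and any others are
 *    kept on server_list as failover targets for vdc_switch_server().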
2355  */
2356     num_nodes = md_node_count(mdp);
2357     ASSERT(num_nodes > 0);
2358
2359     listsz = num_nodes * sizeof (mde_cookie_t);
2360
2361     /* allocate memory for nodes */
2362     portp = kmem_zalloc(listsz, KM_SLEEP);
2363     chanp = kmem_zalloc(listsz, KM_SLEEP);
2364
2365     num_vports = md_scan_dag(mdp, vd_nodep,
2366         md_find_name(mdp, VDC_MD_PORT_NAME),
2367         md_find_name(mdp, "fwd"), portp);
2368     if (num_vports == 0) {
2369         DMSGX(0, "Found no '%s' node for '%s' port\n",
2370             VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME);
2371         status = ENOENT;
2372         goto done;
2373     }
2374
2375     DMSGX(1, "Found %d '%s' node(s) for '%s' port\n",
2376         num_vports, VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME);
2377
2378     vdc->num_servers = 0;
2379     for (idx = 0; idx < num_vports; idx++) {
2380
2381         /* initialize this port */
2382         vd_port = portp[idx];
2383         srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP);
2384         srvr->vdcp = vdc;
2385
2386         /* get port id */
2387         if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) {
2388             cmn_err(CE_NOTE, "vDisk port '%s' property not found",
2389                 VDC_MD_ID);
2390             kmem_free(srvr, sizeof (vdc_server_t));
2391             continue;
2392         }
2393
2394         /* set the connection timeout */
2395         if (md_get_prop_val(mdp, vd_port, VDC_MD_TIMEOUT,
2396             &srvr->ctimeout) != 0) {
2397             srvr->ctimeout = 0;
2398         }
2399
2400         /* get the ldc id */
2401         num_chans = md_scan_dag(mdp, vd_port,
2402             md_find_name(mdp, VDC_MD_CHAN_NAME),
2403             md_find_name(mdp, "fwd"), chanp);
2404
2405         /* expecting at least one channel */
2406         if (num_chans <= 0) {
2407             cmn_err(CE_NOTE, "No '%s' node for '%s' port",
2408                 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME);
2409             kmem_free(srvr, sizeof (vdc_server_t));
2410             continue;
2411         } else if (num_chans != 1) {
2412             DMSGX(0, "Expected 1 '%s' node for '%s' port, "
2413                 "found %d\n", VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME,
2414                 num_chans);
2415         }
2416
2417         /*
2418          * We use the first channel found (index 0), irrespective of how
2419          * many there are in total.
2420          */
2421         if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID,
2422             &srvr->ldc_id) != 0) {
2423             cmn_err(CE_NOTE, "Channel '%s' property not found",
2424                 VDC_MD_ID);
2425             kmem_free(srvr, sizeof (vdc_server_t));
2426             continue;
2427         }
2428
2429         /*
2430          * now initialise LDC channel which will be used to
2431          * communicate with this server
2432          */
2433         if (vdc_do_ldc_init(vdc, srvr) != 0) {
2434             kmem_free(srvr, sizeof (vdc_server_t));
2435             continue;
2436         }
2437
2438         /* add server to list */
2439         if (prev_srvr) {
2440             prev_srvr->next = srvr;
2441         } else {
2442             vdc->server_list = srvr;
2443         }
2444         prev_srvr = srvr;
2445
2446         /* increment number of servers */
2447         vdc->num_servers++;
2448     }
2449
2450     /*
2451      * Adjust the max number of handshake retries to match
2452      * the number of vdisk servers.
2453      */
2454     if (vdc_hshake_retries < vdc->num_servers)
2455         vdc_hshake_retries = vdc->num_servers;
2456
2457     /* pick first server as current server */
2458     if (vdc->server_list != NULL) {
2459         vdc->curr_server = vdc->server_list;
2460         status = 0;
2461     } else {
2462         status = ENOENT;
2463     }
2464
2465 done:
2466     kmem_free(chanp, listsz);
2467     kmem_free(portp, listsz);
2468     return (status);
2469 }
2470
2471
2472 /*
2473  * Function:
2474  *    vdc_do_ldc_up
2475  *
2476  * Description:
2477  *    Bring the channel for the current server up.
2478  *
2479  * Arguments:
2480  *    vdc - soft state pointer for this instance of the device driver.
2481  *
2482  * Return Code:
2483  *    0 - Success.
2484 * EINVAL - Driver is detaching / LDC error 2485 * ECONNREFUSED - Other end is not listening 2486 */ 2487 static int 2488 vdc_do_ldc_up(vdc_t *vdc) 2489 { 2490 int status; 2491 ldc_status_t ldc_state; 2492 2493 ASSERT(MUTEX_HELD(&vdc->lock)); 2494 2495 DMSG(vdc, 0, "[%d] Bringing up channel %lx\n", 2496 vdc->instance, vdc->curr_server->ldc_id); 2497 2498 if (vdc->lifecycle == VDC_LC_DETACHING) 2499 return (EINVAL); 2500 2501 if ((status = ldc_up(vdc->curr_server->ldc_handle)) != 0) { 2502 switch (status) { 2503 case ECONNREFUSED: /* listener not ready at other end */ 2504 DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n", 2505 vdc->instance, vdc->curr_server->ldc_id, status); 2506 status = 0; 2507 break; 2508 default: 2509 DMSG(vdc, 0, "[%d] Failed to bring up LDC: " 2510 "channel=%ld, err=%d", vdc->instance, 2511 vdc->curr_server->ldc_id, status); 2512 break; 2513 } 2514 } 2515 2516 if (ldc_status(vdc->curr_server->ldc_handle, &ldc_state) == 0) { 2517 vdc->curr_server->ldc_state = ldc_state; 2518 if (ldc_state == LDC_UP) { 2519 DMSG(vdc, 0, "[%d] LDC channel already up\n", 2520 vdc->instance); 2521 vdc->seq_num = 1; 2522 vdc->seq_num_reply = 0; 2523 } 2524 } 2525 2526 return (status); 2527 } 2528 2529 /* 2530 * Function: 2531 * vdc_terminate_ldc() 2532 * 2533 * Description: 2534 * 2535 * Arguments: 2536 * vdc - soft state pointer for this instance of the device driver. 2537 * srvr - vdc per-server info structure 2538 * 2539 * Return Code: 2540 * None 2541 */ 2542 static void 2543 vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr) 2544 { 2545 int instance = ddi_get_instance(vdc->dip); 2546 2547 if (srvr->state & VDC_LDC_OPEN) { 2548 DMSG(vdc, 0, "[%d] ldc_close()\n", instance); 2549 (void) ldc_close(srvr->ldc_handle); 2550 } 2551 if (srvr->state & VDC_LDC_CB) { 2552 DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); 2553 (void) ldc_unreg_callback(srvr->ldc_handle); 2554 } 2555 if (srvr->state & VDC_LDC_INIT) { 2556 DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); 2557 (void) ldc_fini(srvr->ldc_handle); 2558 srvr->ldc_handle = NULL; 2559 } 2560 2561 srvr->state &= ~(VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN); 2562 } 2563 2564 /* 2565 * Function: 2566 * vdc_fini_ports() 2567 * 2568 * Description: 2569 * Finalize all ports by closing the channel associated with each 2570 * port and also freeing the server structure. 2571 * 2572 * Arguments: 2573 * vdc - soft state pointer for this instance of the device driver. 2574 * 2575 * Return Code: 2576 * None 2577 */ 2578 static void 2579 vdc_fini_ports(vdc_t *vdc) 2580 { 2581 int instance = ddi_get_instance(vdc->dip); 2582 vdc_server_t *srvr, *prev_srvr; 2583 2584 ASSERT(vdc != NULL); 2585 ASSERT(mutex_owned(&vdc->lock)); 2586 2587 DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized); 2588 2589 srvr = vdc->server_list; 2590 2591 while (srvr) { 2592 2593 vdc_terminate_ldc(vdc, srvr); 2594 2595 /* next server */ 2596 prev_srvr = srvr; 2597 srvr = srvr->next; 2598 2599 /* free server */ 2600 kmem_free(prev_srvr, sizeof (vdc_server_t)); 2601 } 2602 2603 vdc->server_list = NULL; 2604 } 2605 2606 /* -------------------------------------------------------------------------- */ 2607 2608 /* 2609 * Descriptor Ring helper routines 2610 */ 2611 2612 /* 2613 * Function: 2614 * vdc_init_descriptor_ring() 2615 * 2616 * Description: 2617 * 2618 * Arguments: 2619 * vdc - soft state pointer for this instance of the device driver. 
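 *
 *    Sizing note (restating the computation in the body): each DRing
 *    entry must be able to describe a maximal transfer, so effectively
 *
 *        dring_max_cookies = MAX(maxphys,
 *            max_xfer_sz * block_size) / PAGESIZE;
 *        dring_entry_size = sizeof (vd_dring_entry_t) +
 *            (dring_max_cookies - 1) * sizeof (ldc_mem_cookie_t);
 *
 *    i.e. the cookie array is laid out in-line at the end of each
 *    fixed-size descriptor.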
2620 * 2621 * Return Code: 2622 * 0 - Success 2623 */ 2624 static int 2625 vdc_init_descriptor_ring(vdc_t *vdc) 2626 { 2627 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2628 int status = 0; 2629 int i; 2630 2631 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2632 2633 ASSERT(vdc != NULL); 2634 ASSERT(mutex_owned(&vdc->lock)); 2635 2636 /* ensure we have enough room to store max sized block */ 2637 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2638 2639 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2640 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2641 /* 2642 * Calculate the maximum block size we can transmit using one 2643 * Descriptor Ring entry from the attributes returned by the 2644 * vDisk server. This is subject to a minimum of 'maxphys' 2645 * as we do not have the capability to split requests over 2646 * multiple DRing entries. 2647 */ 2648 if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) { 2649 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2650 vdc->instance); 2651 vdc->dring_max_cookies = maxphys / PAGESIZE; 2652 } else { 2653 vdc->dring_max_cookies = 2654 (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE; 2655 } 2656 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2657 (sizeof (ldc_mem_cookie_t) * 2658 (vdc->dring_max_cookies - 1))); 2659 vdc->dring_len = VD_DRING_LEN; 2660 2661 status = ldc_mem_dring_create(vdc->dring_len, 2662 vdc->dring_entry_size, &vdc->dring_hdl); 2663 if ((vdc->dring_hdl == NULL) || (status != 0)) { 2664 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2665 vdc->instance); 2666 return (status); 2667 } 2668 vdc->initialized |= VDC_DRING_INIT; 2669 } 2670 2671 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2672 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2673 vdc->dring_cookie = 2674 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2675 2676 status = ldc_mem_dring_bind(vdc->curr_server->ldc_handle, 2677 vdc->dring_hdl, 2678 LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2679 &vdc->dring_cookie[0], 2680 &vdc->dring_cookie_count); 2681 if (status != 0) { 2682 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2683 "(%lx) to channel (%lx) status=%d\n", 2684 vdc->instance, vdc->dring_hdl, 2685 vdc->curr_server->ldc_handle, status); 2686 return (status); 2687 } 2688 ASSERT(vdc->dring_cookie_count == 1); 2689 vdc->initialized |= VDC_DRING_BOUND; 2690 } 2691 2692 status = ldc_mem_dring_info(vdc->dring_hdl, &vdc->dring_mem_info); 2693 if (status != 0) { 2694 DMSG(vdc, 0, 2695 "[%d] Failed to get info for descriptor ring (%lx)\n", 2696 vdc->instance, vdc->dring_hdl); 2697 return (status); 2698 } 2699 2700 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2701 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2702 2703 /* Allocate the local copy of this dring */ 2704 vdc->local_dring = 2705 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 2706 KM_SLEEP); 2707 vdc->initialized |= VDC_DRING_LOCAL; 2708 } 2709 2710 /* 2711 * Mark all DRing entries as free and initialize the private 2712 * descriptor's memory handles. If any entry is initialized, 2713 * we need to free it later so we set the bit in 'initialized' 2714 * at the start. 
2715 */ 2716 vdc->initialized |= VDC_DRING_ENTRY; 2717 for (i = 0; i < vdc->dring_len; i++) { 2718 dep = VDC_GET_DRING_ENTRY_PTR(vdc, i); 2719 dep->hdr.dstate = VIO_DESC_FREE; 2720 2721 status = ldc_mem_alloc_handle(vdc->curr_server->ldc_handle, 2722 &vdc->local_dring[i].desc_mhdl); 2723 if (status != 0) { 2724 DMSG(vdc, 0, "![%d] Failed to alloc mem handle for" 2725 " descriptor %d", vdc->instance, i); 2726 return (status); 2727 } 2728 vdc->local_dring[i].is_free = B_TRUE; 2729 vdc->local_dring[i].dep = dep; 2730 } 2731 2732 /* Initialize the starting index */ 2733 vdc->dring_curr_idx = 0; 2734 2735 return (status); 2736 } 2737 2738 /* 2739 * Function: 2740 * vdc_destroy_descriptor_ring() 2741 * 2742 * Description: 2743 * 2744 * Arguments: 2745 * vdc - soft state pointer for this instance of the device driver. 2746 * 2747 * Return Code: 2748 * None 2749 */ 2750 static void 2751 vdc_destroy_descriptor_ring(vdc_t *vdc) 2752 { 2753 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 2754 ldc_mem_handle_t mhdl = NULL; 2755 ldc_mem_info_t minfo; 2756 int status = -1; 2757 int i; /* loop */ 2758 2759 ASSERT(vdc != NULL); 2760 ASSERT(mutex_owned(&vdc->lock)); 2761 2762 DMSG(vdc, 0, "[%d] Entered\n", vdc->instance); 2763 2764 if (vdc->initialized & VDC_DRING_ENTRY) { 2765 DMSG(vdc, 0, 2766 "[%d] Removing Local DRing entries\n", vdc->instance); 2767 for (i = 0; i < vdc->dring_len; i++) { 2768 ldep = &vdc->local_dring[i]; 2769 mhdl = ldep->desc_mhdl; 2770 2771 if (mhdl == NULL) 2772 continue; 2773 2774 if ((status = ldc_mem_info(mhdl, &minfo)) != 0) { 2775 DMSG(vdc, 0, 2776 "ldc_mem_info returned an error: %d\n", 2777 status); 2778 2779 /* 2780 * This must mean that the mem handle 2781 * is not valid. Clear it out so that 2782 * no one tries to use it. 2783 */ 2784 ldep->desc_mhdl = NULL; 2785 continue; 2786 } 2787 2788 if (minfo.status == LDC_BOUND) { 2789 (void) ldc_mem_unbind_handle(mhdl); 2790 } 2791 2792 (void) ldc_mem_free_handle(mhdl); 2793 2794 ldep->desc_mhdl = NULL; 2795 } 2796 vdc->initialized &= ~VDC_DRING_ENTRY; 2797 } 2798 2799 if (vdc->initialized & VDC_DRING_LOCAL) { 2800 DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance); 2801 kmem_free(vdc->local_dring, 2802 vdc->dring_len * sizeof (vdc_local_desc_t)); 2803 vdc->initialized &= ~VDC_DRING_LOCAL; 2804 } 2805 2806 if (vdc->initialized & VDC_DRING_BOUND) { 2807 DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance); 2808 status = ldc_mem_dring_unbind(vdc->dring_hdl); 2809 if (status == 0) { 2810 vdc->initialized &= ~VDC_DRING_BOUND; 2811 } else { 2812 DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx", 2813 vdc->instance, status, vdc->dring_hdl); 2814 } 2815 kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t)); 2816 } 2817 2818 if (vdc->initialized & VDC_DRING_INIT) { 2819 DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance); 2820 status = ldc_mem_dring_destroy(vdc->dring_hdl); 2821 if (status == 0) { 2822 vdc->dring_hdl = NULL; 2823 bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t)); 2824 vdc->initialized &= ~VDC_DRING_INIT; 2825 } else { 2826 DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)", 2827 vdc->instance, status, vdc->dring_hdl); 2828 } 2829 } 2830 } 2831 2832 /* 2833 * Function: 2834 * vdc_map_to_shared_dring() 2835 * 2836 * Description: 2837 * Copy contents of the local descriptor to the shared 2838 * memory descriptor. 2839 * 2840 * Arguments: 2841 * vdcp - soft state pointer for this instance of the device driver. 
2842  *    idx - descriptor ring index
2843  *
2844  * Return Code:
2845  *    0 - Success, or the error returned by vdc_populate_mem_hdl().
2846  */
2847 static int
2848 vdc_map_to_shared_dring(vdc_t *vdcp, int idx)
2849 {
2850     vdc_local_desc_t *ldep;
2851     vd_dring_entry_t *dep;
2852     int rv;
2853
2854     ldep = &(vdcp->local_dring[idx]);
2855
2856     /* for now leave in the old pop_mem_hdl stuff */
2857     if (ldep->nbytes > 0) {
2858         rv = vdc_populate_mem_hdl(vdcp, ldep);
2859         if (rv) {
2860             DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n",
2861                 vdcp->instance);
2862             return (rv);
2863         }
2864     }
2865
2866     /*
2867      * fill in the data details into the DRing
2868      */
2869     dep = ldep->dep;
2870     ASSERT(dep != NULL);
2871
2872     dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp);
2873     dep->payload.operation = ldep->operation;
2874     dep->payload.addr = ldep->offset;
2875     dep->payload.nbytes = ldep->nbytes;
2876     dep->payload.status = (uint32_t)-1; /* vds will set valid value */
2877     dep->payload.slice = ldep->slice;
2878     dep->hdr.dstate = VIO_DESC_READY;
2879     dep->hdr.ack = 1; /* request an ACK for every message */
2880
2881     return (0);
2882 }
2883
2884 /*
2885  * Function:
2886  *    vdc_send_request
2887  *
2888  * Description:
2889  *    This routine writes the data to be transmitted to vds into the
2890  *    descriptor, notifies vds that the ring has been updated and
2891  *    then waits for the request to be processed.
2892  *
2893  * Arguments:
2894  *    vdcp - the soft state pointer
2895  *    operation - operation we want vds to perform (VD_OP_XXX)
2896  *    addr - address of data buf to be read/written.
2897  *    nbytes - number of bytes to read/write
2898  *    slice - the disk slice this request is for
2899  *    offset - relative disk offset
2900  *    cb_type - type of call - STRATEGY or SYNC
2901  *    cb_arg - parameter to be sent to server (depends on VD_OP_XXX type)
2902  *             . mode for ioctl(9e)
2903  *             . LP64 diskaddr_t (block I/O)
2904  *    dir - direction of operation (READ/WRITE/BOTH)
2905  *
2906  * Return Codes:
2907  *    0 - Success
2908  *    EIO / ENXIO - connection timed out, panic, or detach in progress
2909  */
2910 static int
2911 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
2912     size_t nbytes, int slice, diskaddr_t offset, int cb_type,
2913     void *cb_arg, vio_desc_direction_t dir)
2914 {
2915     int rv = 0;
2916
2917     ASSERT(vdcp != NULL);
2918     ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR);
2919
2920     mutex_enter(&vdcp->lock);
2921
2922     /*
2923      * If this is a block read/write operation we update the I/O statistics
2924      * to indicate that the request is being put on the waitq to be
2925      * serviced.
2926      *
2927      * We do it here (a common routine for both synchronous and strategy
2928      * calls) for performance reasons - we are already holding vdc->lock
2929      * so there is no extra locking overhead. We would have to explicitly
2930      * grab the 'lock' mutex to update the stats if we were to do this
2931      * higher up the stack in vdc_strategy() et al.
2932      */
2933     if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
2934         DTRACE_IO1(start, buf_t *, cb_arg);
2935         VD_KSTAT_WAITQ_ENTER(vdcp);
2936     }
2937
2938     do {
2939         while (vdcp->state != VDC_STATE_RUNNING) {
2940
2941             /* return error if detaching */
2942             if (vdcp->state == VDC_STATE_DETACH) {
2943                 rv = ENXIO;
2944                 goto done;
2945             }
2946
2947             /* fail request if connection timeout is reached */
2948             if (vdcp->ctimeout_reached) {
2949                 rv = EIO;
2950                 goto done;
2951             }
2952
2953             /*
2954              * If we are panicking and the disk is not ready then
2955              * we can't send any request because we can't complete
2956              * the handshake now.
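             *
             *    To summarize the gate here: a request waits on
             *    running_cv until the state machine reaches
             *    VDC_STATE_RUNNING, bailing out with ENXIO if the
             *    driver is detaching, or EIO if the connection
             *    timeout was reached or the system is panicking
             *    mid-handshake.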
2957 */ 2958 if (ddi_in_panic()) { 2959 rv = EIO; 2960 goto done; 2961 } 2962 2963 cv_wait(&vdcp->running_cv, &vdcp->lock); 2964 } 2965 2966 } while (vdc_populate_descriptor(vdcp, operation, addr, 2967 nbytes, slice, offset, cb_type, cb_arg, dir)); 2968 2969 done: 2970 /* 2971 * If this is a block read/write we update the I/O statistics kstat 2972 * to indicate that this request has been placed on the queue for 2973 * processing (i.e sent to the vDisk server) - iostat(1M) will 2974 * report the time waiting for the vDisk server under the %b column 2975 * In the case of an error we simply take it off the wait queue. 2976 */ 2977 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2978 if (rv == 0) { 2979 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 2980 DTRACE_PROBE1(send, buf_t *, cb_arg); 2981 } else { 2982 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 2983 VD_KSTAT_WAITQ_EXIT(vdcp); 2984 DTRACE_IO1(done, buf_t *, cb_arg); 2985 } 2986 } 2987 2988 mutex_exit(&vdcp->lock); 2989 2990 return (rv); 2991 } 2992 2993 2994 /* 2995 * Function: 2996 * vdc_populate_descriptor 2997 * 2998 * Description: 2999 * This routine writes the data to be transmitted to vds into the 3000 * descriptor, notifies vds that the ring has been updated and 3001 * then waits for the request to be processed. 3002 * 3003 * Arguments: 3004 * vdcp - the soft state pointer 3005 * operation - operation we want vds to perform (VD_OP_XXX) 3006 * addr - address of data buf to be read/written. 3007 * nbytes - number of bytes to read/write 3008 * slice - the disk slice this request is for 3009 * offset - relative disk offset 3010 * cb_type - type of call - STRATEGY or SYNC 3011 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3012 * . mode for ioctl(9e) 3013 * . LP64 diskaddr_t (block I/O) 3014 * dir - direction of operation (READ/WRITE/BOTH) 3015 * 3016 * Return Codes: 3017 * 0 3018 * EAGAIN 3019 * ECONNRESET 3020 * ENXIO 3021 */ 3022 static int 3023 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 3024 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 3025 void *cb_arg, vio_desc_direction_t dir) 3026 { 3027 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 3028 int idx; /* Index of DRing entry used */ 3029 int next_idx; 3030 vio_dring_msg_t dmsg; 3031 size_t msglen; 3032 int rv; 3033 3034 ASSERT(MUTEX_HELD(&vdcp->lock)); 3035 vdcp->threads_pending++; 3036 loop: 3037 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 3038 3039 /* Get next available D-Ring entry */ 3040 idx = vdcp->dring_curr_idx; 3041 local_dep = &(vdcp->local_dring[idx]); 3042 3043 if (!local_dep->is_free) { 3044 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 3045 vdcp->instance); 3046 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 3047 if (vdcp->state == VDC_STATE_RUNNING || 3048 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3049 goto loop; 3050 } 3051 vdcp->threads_pending--; 3052 return (ECONNRESET); 3053 } 3054 3055 next_idx = idx + 1; 3056 if (next_idx >= vdcp->dring_len) 3057 next_idx = 0; 3058 vdcp->dring_curr_idx = next_idx; 3059 3060 ASSERT(local_dep->is_free); 3061 3062 local_dep->operation = operation; 3063 local_dep->addr = addr; 3064 local_dep->nbytes = nbytes; 3065 local_dep->slice = slice; 3066 local_dep->offset = offset; 3067 local_dep->cb_type = cb_type; 3068 local_dep->cb_arg = cb_arg; 3069 local_dep->dir = dir; 3070 3071 local_dep->is_free = B_FALSE; 3072 3073 rv = vdc_map_to_shared_dring(vdcp, idx); 3074 if (rv) { 3075 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 
3076 vdcp->instance); 3077 /* free the descriptor */ 3078 local_dep->is_free = B_TRUE; 3079 vdcp->dring_curr_idx = idx; 3080 cv_wait(&vdcp->membind_cv, &vdcp->lock); 3081 if (vdcp->state == VDC_STATE_RUNNING || 3082 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3083 goto loop; 3084 } 3085 vdcp->threads_pending--; 3086 return (ECONNRESET); 3087 } 3088 3089 /* 3090 * Send a msg with the DRing details to vds 3091 */ 3092 VIO_INIT_DRING_DATA_TAG(dmsg); 3093 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 3094 dmsg.dring_ident = vdcp->dring_ident; 3095 dmsg.start_idx = idx; 3096 dmsg.end_idx = idx; 3097 vdcp->seq_num++; 3098 3099 DTRACE_PROBE2(populate, int, vdcp->instance, 3100 vdc_local_desc_t *, local_dep); 3101 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 3102 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 3103 3104 /* 3105 * note we're still holding the lock here to 3106 * make sure the message goes out in order !!!... 3107 */ 3108 msglen = sizeof (dmsg); 3109 rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen); 3110 switch (rv) { 3111 case ECONNRESET: 3112 /* 3113 * vdc_send initiates the reset on failure. 3114 * Since the transaction has already been put 3115 * on the local dring, it will automatically get 3116 * retried when the channel is reset. Given that, 3117 * it is ok to just return success even though the 3118 * send failed. 3119 */ 3120 rv = 0; 3121 break; 3122 3123 case 0: /* EOK */ 3124 DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv); 3125 break; 3126 3127 default: 3128 goto cleanup_and_exit; 3129 } 3130 3131 vdcp->threads_pending--; 3132 return (rv); 3133 3134 cleanup_and_exit: 3135 DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv); 3136 return (ENXIO); 3137 } 3138 3139 /* 3140 * Function: 3141 * vdc_do_sync_op 3142 * 3143 * Description: 3144 * Wrapper around vdc_populate_descriptor that blocks until the 3145 * response to the message is available. 3146 * 3147 * Arguments: 3148 * vdcp - the soft state pointer 3149 * operation - operation we want vds to perform (VD_OP_XXX) 3150 * addr - address of data buf to be read/written. 3151 * nbytes - number of bytes to read/write 3152 * slice - the disk slice this request is for 3153 * offset - relative disk offset 3154 * cb_type - type of call - STRATEGY or SYNC 3155 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3156 * . mode for ioctl(9e) 3157 * . LP64 diskaddr_t (block I/O) 3158 * dir - direction of operation (READ/WRITE/BOTH) 3159 * rconflict - check for reservation conflict in case of failure 3160 * 3161 * rconflict should be set to B_TRUE by most callers. Callers invoking the 3162 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 3163 * result of a successful operation with vd_scsi_status(). 3164 * 3165 * Return Codes: 3166 * 0 3167 * EAGAIN 3168 * EFAULT 3169 * ENXIO 3170 * EIO 3171 */ 3172 static int 3173 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 3174 int slice, diskaddr_t offset, int cb_type, void *cb_arg, 3175 vio_desc_direction_t dir, boolean_t rconflict) 3176 { 3177 int status; 3178 vdc_io_t *vio; 3179 boolean_t check_resv_conflict = B_FALSE; 3180 3181 ASSERT(cb_type == CB_SYNC); 3182 3183 /* 3184 * Grab the lock, if blocked wait until the server 3185 * response causes us to wake up again. 
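 *
 *    The serialization protocol implemented below is, in outline:
 *
 *        mutex_enter(&vdcp->lock);
 *        while (vdcp->sync_op_blocked)      ... wait on sync_blocked_cv
 *        vdcp->sync_op_blocked = B_TRUE;    ... claim the gate
 *        ... send the request, wait on sync_pending_cv ...
 *        vdcp->sync_op_blocked = B_FALSE;
 *        cv_signal(&vdcp->sync_blocked_cv); ... next waiter goes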
3186 */ 3187 mutex_enter(&vdcp->lock); 3188 vdcp->sync_op_cnt++; 3189 while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH) 3190 cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock); 3191 3192 if (vdcp->state == VDC_STATE_DETACH) { 3193 cv_broadcast(&vdcp->sync_blocked_cv); 3194 vdcp->sync_op_cnt--; 3195 mutex_exit(&vdcp->lock); 3196 return (ENXIO); 3197 } 3198 3199 /* now block anyone other thread entering after us */ 3200 vdcp->sync_op_blocked = B_TRUE; 3201 vdcp->sync_op_pending = B_TRUE; 3202 mutex_exit(&vdcp->lock); 3203 3204 status = vdc_send_request(vdcp, operation, addr, 3205 nbytes, slice, offset, cb_type, cb_arg, dir); 3206 3207 mutex_enter(&vdcp->lock); 3208 3209 if (status != 0) { 3210 vdcp->sync_op_pending = B_FALSE; 3211 } else { 3212 /* 3213 * block until our transaction completes. 3214 * Also anyone else waiting also gets to go next. 3215 */ 3216 while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH) 3217 cv_wait(&vdcp->sync_pending_cv, &vdcp->lock); 3218 3219 DMSG(vdcp, 2, ": operation returned %d\n", 3220 vdcp->sync_op_status); 3221 if (vdcp->state == VDC_STATE_DETACH) { 3222 vdcp->sync_op_pending = B_FALSE; 3223 status = ENXIO; 3224 } else { 3225 status = vdcp->sync_op_status; 3226 if (status != 0 && vdcp->failfast_interval != 0) { 3227 /* 3228 * Operation has failed and failfast is enabled. 3229 * We need to check if the failure is due to a 3230 * reservation conflict if this was requested. 3231 */ 3232 check_resv_conflict = rconflict; 3233 } 3234 3235 } 3236 } 3237 3238 vdcp->sync_op_status = 0; 3239 vdcp->sync_op_blocked = B_FALSE; 3240 vdcp->sync_op_cnt--; 3241 3242 /* signal the next waiting thread */ 3243 cv_signal(&vdcp->sync_blocked_cv); 3244 3245 /* 3246 * We have to check for reservation conflict after unblocking sync 3247 * operations because some sync operations will be used to do this 3248 * check. 3249 */ 3250 if (check_resv_conflict) { 3251 vio = vdc_failfast_io_queue(vdcp, NULL); 3252 while (vio->vio_qtime != 0) 3253 cv_wait(&vdcp->failfast_io_cv, &vdcp->lock); 3254 kmem_free(vio, sizeof (vdc_io_t)); 3255 } 3256 3257 mutex_exit(&vdcp->lock); 3258 3259 return (status); 3260 } 3261 3262 3263 /* 3264 * Function: 3265 * vdc_drain_response() 3266 * 3267 * Description: 3268 * When a guest is panicking, the completion of requests needs to be 3269 * handled differently because interrupts are disabled and vdc 3270 * will not get messages. We have to poll for the messages instead. 3271 * 3272 * Note: since we don't have a buf_t available we cannot implement 3273 * the io:::done DTrace probe in this specific case. 3274 * 3275 * Arguments: 3276 * vdc - soft state pointer for this instance of the device driver. 3277 * 3278 * Return Code: 3279 * 0 - Success 3280 */ 3281 static int 3282 vdc_drain_response(vdc_t *vdc) 3283 { 3284 int rv, idx, retries; 3285 size_t msglen; 3286 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3287 vio_dring_msg_t dmsg; 3288 3289 mutex_enter(&vdc->lock); 3290 3291 retries = 0; 3292 for (;;) { 3293 msglen = sizeof (dmsg); 3294 rv = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)&dmsg, 3295 &msglen); 3296 if (rv) { 3297 rv = EINVAL; 3298 break; 3299 } 3300 3301 /* 3302 * if there are no packets wait and check again 3303 */ 3304 if ((rv == 0) && (msglen == 0)) { 3305 if (retries++ > vdc_dump_retries) { 3306 rv = EAGAIN; 3307 break; 3308 } 3309 3310 drv_usecwait(vdc_usec_timeout_dump); 3311 continue; 3312 } 3313 3314 /* 3315 * Ignore all messages that are not ACKs/NACKs to 3316 * DRing requests. 
3317 */ 3318 if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) || 3319 (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) { 3320 DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n", 3321 dmsg.tag.vio_msgtype, 3322 dmsg.tag.vio_subtype, 3323 dmsg.tag.vio_subtype_env); 3324 continue; 3325 } 3326 3327 /* 3328 * set the appropriate return value for the current request. 3329 */ 3330 switch (dmsg.tag.vio_subtype) { 3331 case VIO_SUBTYPE_ACK: 3332 rv = 0; 3333 break; 3334 case VIO_SUBTYPE_NACK: 3335 rv = EAGAIN; 3336 break; 3337 default: 3338 continue; 3339 } 3340 3341 idx = dmsg.start_idx; 3342 if (idx >= vdc->dring_len) { 3343 DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n", 3344 vdc->instance, idx); 3345 continue; 3346 } 3347 ldep = &vdc->local_dring[idx]; 3348 if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { 3349 DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n", 3350 vdc->instance, idx, ldep->dep->hdr.dstate); 3351 continue; 3352 } 3353 3354 DMSG(vdc, 1, "[%d] Depopulating idx=%d state=%d\n", 3355 vdc->instance, idx, ldep->dep->hdr.dstate); 3356 3357 rv = vdc_depopulate_descriptor(vdc, idx); 3358 if (rv) { 3359 DMSG(vdc, 0, 3360 "[%d] Entry @ %d - depopulate failed ..\n", 3361 vdc->instance, idx); 3362 } 3363 3364 /* if this is the last descriptor - break out of loop */ 3365 if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) 3366 break; 3367 } 3368 3369 mutex_exit(&vdc->lock); 3370 DMSG(vdc, 0, "End idx=%d\n", idx); 3371 3372 return (rv); 3373 } 3374 3375 3376 /* 3377 * Function: 3378 * vdc_depopulate_descriptor() 3379 * 3380 * Description: 3381 * 3382 * Arguments: 3383 * vdc - soft state pointer for this instance of the device driver. 3384 * idx - Index of the Descriptor Ring entry being modified 3385 * 3386 * Return Code: 3387 * 0 - Success 3388 */ 3389 static int 3390 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) 3391 { 3392 vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ 3393 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3394 int status = ENXIO; 3395 int rv = 0; 3396 3397 ASSERT(vdc != NULL); 3398 ASSERT(idx < vdc->dring_len); 3399 ldep = &vdc->local_dring[idx]; 3400 ASSERT(ldep != NULL); 3401 ASSERT(MUTEX_HELD(&vdc->lock)); 3402 3403 DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep); 3404 DMSG(vdc, 2, ": idx = %d\n", idx); 3405 3406 dep = ldep->dep; 3407 ASSERT(dep != NULL); 3408 ASSERT((dep->hdr.dstate == VIO_DESC_DONE) || 3409 (dep->payload.status == ECANCELED)); 3410 3411 VDC_MARK_DRING_ENTRY_FREE(vdc, idx); 3412 3413 ldep->is_free = B_TRUE; 3414 status = dep->payload.status; 3415 DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status); 3416 3417 /* 3418 * If no buffers were used to transfer information to the server when 3419 * populating the descriptor then no memory handles need to be unbound 3420 * and we can return now. 3421 */ 3422 if (ldep->nbytes == 0) { 3423 cv_signal(&vdc->dring_free_cv); 3424 return (status); 3425 } 3426 3427 /* 3428 * If the upper layer passed in a misaligned address we copied the 3429 * data into an aligned buffer before sending it to LDC - we now 3430 * copy it back to the original buffer. 
3431  */
3432     if (ldep->align_addr) {
3433         ASSERT(ldep->addr != NULL);
3434
3435         if (dep->payload.nbytes > 0)
3436             bcopy(ldep->align_addr, ldep->addr,
3437                 dep->payload.nbytes);
3438         kmem_free(ldep->align_addr,
3439             sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8));
3440         ldep->align_addr = NULL;
3441     }
3442
3443     rv = ldc_mem_unbind_handle(ldep->desc_mhdl);
3444     if (rv != 0) {
3445         DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)",
3446             vdc->instance, ldep->desc_mhdl, idx, rv);
3447         /*
3448          * The error returned by the vDisk server is more informative
3449          * and thus has a higher priority but if it isn't set we ensure
3450          * that this function returns an error.
3451          */
3452         if (status == 0)
3453             status = EINVAL;
3454     }
3455
3456     cv_signal(&vdc->membind_cv);
3457     cv_signal(&vdc->dring_free_cv);
3458
3459     return (status);
3460 }
3461
3462 /*
3463  * Function:
3464  *    vdc_populate_mem_hdl()
3465  *
3466  * Description:
3467  *    Bind the buffer of the given local descriptor ring entry to an
3468  *    LDC memory handle so that the vDisk server can map the data in
3469  *    or out (allocating an aligned bounce buffer if necessary).
3470  *
3471  * Arguments:
3472  *    vdcp - soft state pointer for this instance of the device driver.
3473  *    ldep - local descriptor ring entry whose buffer is to be bound.
3474  *
3475  * Return Code:
3476  *    0 - Success; EAGAIN - the buffer could not be bound.
3477  */
3478 static int
3479 vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep)
3480 {
3481     vd_dring_entry_t *dep = NULL;
3482     ldc_mem_handle_t mhdl;
3483     caddr_t vaddr;
3484     size_t nbytes;
3485     uint8_t perm = LDC_MEM_RW;
3486     uint8_t maptype;
3487     int rv = 0;
3488     int i;
3489
3490     ASSERT(vdcp != NULL);
3491
3492     dep = ldep->dep;
3493     mhdl = ldep->desc_mhdl;
3494
3495     switch (ldep->dir) {
3496     case VIO_read_dir:
3497         perm = LDC_MEM_W;
3498         break;
3499
3500     case VIO_write_dir:
3501         perm = LDC_MEM_R;
3502         break;
3503
3504     case VIO_both_dir:
3505         perm = LDC_MEM_RW;
3506         break;
3507
3508     default:
3509         ASSERT(0); /* catch bad programming in vdc */
3510     }
3511
3512     /*
3513      * LDC expects any addresses passed in to be 8-byte aligned. We need
3514      * to copy the contents of any misaligned buffers to a newly allocated
3515      * buffer and bind it instead (and copy the contents back to the
3516      * original buffer passed in when depopulating the descriptor)
3517      */
3518     vaddr = ldep->addr;
3519     nbytes = ldep->nbytes;
3520     if (((uint64_t)vaddr & 0x7) != 0) {
3521         ASSERT(ldep->align_addr == NULL);
3522         ldep->align_addr =
3523             kmem_alloc(sizeof (caddr_t) *
3524             P2ROUNDUP(nbytes, 8), KM_SLEEP);
3525         DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating "
3526             "(buf=%p nb=%ld op=%d)\n",
3527             vdcp->instance, (void *)vaddr, (void *)ldep->align_addr,
3528             nbytes, ldep->operation);
3529         if (perm != LDC_MEM_W)
3530             bcopy(vaddr, ldep->align_addr, nbytes);
3531         vaddr = ldep->align_addr;
3532     }
3533
3534     maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP;
3535     rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
3536         maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies);
3537     DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n",
3538         vdcp->instance, dep->payload.ncookies);
3539     if (rv != 0) {
3540         DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle "
3541             "(mhdl=%p, buf=%p, err=%d)\n",
3542             vdcp->instance, (void *)mhdl, (void *)vaddr, rv);
3543         if (ldep->align_addr) {
3544             kmem_free(ldep->align_addr,
3545                 sizeof (caddr_t) * P2ROUNDUP(nbytes, 8));
3546             ldep->align_addr = NULL;
3547         }
3548         return (EAGAIN);
3549     }
3550
3551     /*
3552      * Get the other cookies (if any).
3553 */ 3554 for (i = 1; i < dep->payload.ncookies; i++) { 3555 rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]); 3556 if (rv != 0) { 3557 (void) ldc_mem_unbind_handle(mhdl); 3558 DMSG(vdcp, 0, "?[%d] Failed to get next cookie " 3559 "(mhdl=%lx cnum=%d), err=%d", 3560 vdcp->instance, mhdl, i, rv); 3561 if (ldep->align_addr) { 3562 kmem_free(ldep->align_addr, 3563 sizeof (caddr_t) * ldep->nbytes); 3564 ldep->align_addr = NULL; 3565 } 3566 return (EAGAIN); 3567 } 3568 } 3569 3570 return (rv); 3571 } 3572 3573 /* 3574 * Interrupt handlers for messages from LDC 3575 */ 3576 3577 /* 3578 * Function: 3579 * vdc_handle_cb() 3580 * 3581 * Description: 3582 * 3583 * Arguments: 3584 * event - Type of event (LDC_EVT_xxx) that triggered the callback 3585 * arg - soft state pointer for this instance of the device driver. 3586 * 3587 * Return Code: 3588 * 0 - Success 3589 */ 3590 static uint_t 3591 vdc_handle_cb(uint64_t event, caddr_t arg) 3592 { 3593 ldc_status_t ldc_state; 3594 int rv = 0; 3595 vdc_server_t *srvr = (vdc_server_t *)(void *)arg; 3596 vdc_t *vdc = srvr->vdcp; 3597 3598 ASSERT(vdc != NULL); 3599 3600 DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num); 3601 3602 /* If callback is not for the current server, ignore it */ 3603 mutex_enter(&vdc->lock); 3604 3605 if (vdc->curr_server != srvr) { 3606 DMSG(vdc, 0, "[%d] Ignoring event 0x%lx for port@%ld\n", 3607 vdc->instance, event, srvr->id); 3608 mutex_exit(&vdc->lock); 3609 return (LDC_SUCCESS); 3610 } 3611 3612 /* 3613 * Depending on the type of event that triggered this callback, 3614 * we modify the handshake state or read the data. 3615 * 3616 * NOTE: not done as a switch() as event could be triggered by 3617 * a state change and a read request. Also the ordering of the 3618 * check for the event types is deliberate. 3619 */ 3620 if (event & LDC_EVT_UP) { 3621 DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance); 3622 3623 /* get LDC state */ 3624 rv = ldc_status(srvr->ldc_handle, &ldc_state); 3625 if (rv != 0) { 3626 DMSG(vdc, 0, "[%d] Couldn't get LDC status %d", 3627 vdc->instance, rv); 3628 mutex_exit(&vdc->lock); 3629 return (LDC_SUCCESS); 3630 } 3631 if (srvr->ldc_state != LDC_UP && 3632 ldc_state == LDC_UP) { 3633 /* 3634 * Reset the transaction sequence numbers when 3635 * LDC comes up. We then kick off the handshake 3636 * negotiation with the vDisk server. 3637 */ 3638 vdc->seq_num = 1; 3639 vdc->seq_num_reply = 0; 3640 srvr->ldc_state = ldc_state; 3641 cv_signal(&vdc->initwait_cv); 3642 } 3643 } 3644 3645 if (event & LDC_EVT_READ) { 3646 DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance); 3647 mutex_enter(&vdc->read_lock); 3648 cv_signal(&vdc->read_cv); 3649 vdc->read_state = VDC_READ_PENDING; 3650 mutex_exit(&vdc->read_lock); 3651 mutex_exit(&vdc->lock); 3652 3653 /* that's all we have to do - no need to handle DOWN/RESET */ 3654 return (LDC_SUCCESS); 3655 } 3656 3657 if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) { 3658 3659 DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance); 3660 3661 /* 3662 * Need to wake up any readers so they will 3663 * detect that a reset has occurred. 
3664  */
3665     mutex_enter(&vdc->read_lock);
3666     if ((vdc->read_state == VDC_READ_WAITING) ||
3667         (vdc->read_state == VDC_READ_RESET))
3668         cv_signal(&vdc->read_cv);
3669     vdc->read_state = VDC_READ_RESET;
3670     mutex_exit(&vdc->read_lock);
3671
3672     /* wake up any threads waiting for connection to come up */
3673     if (vdc->state == VDC_STATE_INIT_WAITING) {
3674         vdc->state = VDC_STATE_RESETTING;
3675         cv_signal(&vdc->initwait_cv);
3676     }
3677
3678     }
3679
3680     mutex_exit(&vdc->lock);
3681
3682     if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ))
3683         DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received",
3684             vdc->instance, event);
3685
3686     return (LDC_SUCCESS);
3687 }
3688
3689 /*
3690  * Function:
3691  *    vdc_wait_for_response()
3692  *
3693  * Description:
3694  *    Block waiting for a response from the server. If there is no data,
3695  *    the thread blocks on read_cv, which the LDC callback signals when
3696  *    an LDC_EVT_READ event occurs.
3697  *
3698  * Arguments:
3699  *    vdcp - soft state pointer for this instance of the device driver.
3700  *    msgp - buffer in which the received message is returned.
3701  *
3702  * Return Code:
3703  *    0 - Success
3704  */
3705 static int
3706 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp)
3707 {
3708     size_t nbytes = sizeof (*msgp);
3709     int status;
3710
3711     ASSERT(vdcp != NULL);
3712
3713     DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance);
3714
3715     status = vdc_recv(vdcp, msgp, &nbytes);
3716     DMSG(vdcp, 3, "vdc_recv() done.. status=0x%x size=0x%x\n",
3717         status, (int)nbytes);
3718     if (status) {
3719         DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n",
3720             vdcp->instance, status);
3721         return (status);
3722     }
3723
3724     if (nbytes < sizeof (vio_msg_tag_t)) {
3725         DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n",
3726             vdcp->instance, sizeof (vio_msg_tag_t), nbytes);
3727         return (ENOMSG);
3728     }
3729
3730     DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance,
3731         msgp->tag.vio_msgtype,
3732         msgp->tag.vio_subtype,
3733         msgp->tag.vio_subtype_env);
3734
3735     /*
3736      * Verify the Session ID of the message
3737      *
3738      * Every message after the Version has been negotiated should
3739      * have the correct session ID set.
3740      */
3741     if ((msgp->tag.vio_sid != vdcp->session_id) &&
3742         (msgp->tag.vio_subtype_env != VIO_VER_INFO)) {
3743         DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, "
3744             "expected 0x%lx [seq num %lx @ %d]",
3745             vdcp->instance, msgp->tag.vio_sid,
3746             vdcp->session_id,
3747             ((vio_dring_msg_t *)msgp)->seq_num,
3748             ((vio_dring_msg_t *)msgp)->start_idx);
3749         return (ENOMSG);
3750     }
3751     return (0);
3752 }
3753
3754
3755 /*
3756  * Function:
3757  *    vdc_resubmit_backup_dring()
3758  *
3759  * Description:
3760  *    Resubmit each outstanding descriptor in the backed-up dring to
3761  *    the vDisk server. The dring was backed up during connection
3761  *    reset.
3762  *
3763  * Arguments:
3764  *    vdcp - soft state pointer for this instance of the device driver.
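 *
 *    The backup ring is walked from local_dring_backup_tail, wrapping
 *    modulo the backed-up length ('backup' abbreviates
 *    vdcp->local_dring_backup) - equivalent to this sketch:
 *
 *        b_idx = vdcp->local_dring_backup_tail;
 *        for (count = 0; count < vdcp->local_dring_backup_len; count++) {
 *            if (!backup[b_idx].is_free)
 *                ... resubmit entry b_idx, wait for its response ...
 *            b_idx = (b_idx + 1) % vdcp->local_dring_backup_len;
 *        }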
3765 * 3766 * Return Code: 3767 * 0 - Success 3768 */ 3769 static int 3770 vdc_resubmit_backup_dring(vdc_t *vdcp) 3771 { 3772 int processed = 0; 3773 int count; 3774 int b_idx; 3775 int rv = 0; 3776 int dring_size; 3777 int op; 3778 vio_msg_t vio_msg; 3779 vdc_local_desc_t *curr_ldep; 3780 3781 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3782 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3783 3784 if (vdcp->local_dring_backup == NULL) { 3785 /* the pending requests have already been processed */ 3786 return (0); 3787 } 3788 3789 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3790 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3791 3792 /* 3793 * Walk the backup copy of the local descriptor ring and 3794 * resubmit all the outstanding transactions. 3795 */ 3796 b_idx = vdcp->local_dring_backup_tail; 3797 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3798 3799 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3800 3801 /* only resubmit outstanding transactions */ 3802 if (!curr_ldep->is_free) { 3803 /* 3804 * If we are retrying a block read/write operation we 3805 * need to update the I/O statistics to indicate that 3806 * the request is being put back on the waitq to be 3807 * serviced (it will have been taken off after the 3808 * error was reported). 3809 */ 3810 mutex_enter(&vdcp->lock); 3811 op = curr_ldep->operation; 3812 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3813 DTRACE_IO1(start, buf_t *, curr_ldep->cb_arg); 3814 VD_KSTAT_WAITQ_ENTER(vdcp); 3815 } 3816 3817 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3818 rv = vdc_populate_descriptor(vdcp, op, 3819 curr_ldep->addr, curr_ldep->nbytes, 3820 curr_ldep->slice, curr_ldep->offset, 3821 curr_ldep->cb_type, curr_ldep->cb_arg, 3822 curr_ldep->dir); 3823 3824 if (rv) { 3825 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3826 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3827 VD_KSTAT_WAITQ_EXIT(vdcp); 3828 DTRACE_IO1(done, buf_t *, 3829 curr_ldep->cb_arg); 3830 } 3831 DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", 3832 vdcp->instance, b_idx); 3833 mutex_exit(&vdcp->lock); 3834 goto done; 3835 } 3836 3837 /* 3838 * If this is a block read/write we update the I/O 3839 * statistics kstat to indicate that the request 3840 * has been sent back to the vDisk server and should 3841 * now be put on the run queue. 3842 */ 3843 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3844 DTRACE_PROBE1(send, buf_t *, curr_ldep->cb_arg); 3845 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 3846 } 3847 mutex_exit(&vdcp->lock); 3848 3849 /* Wait for the response message. */ 3850 DMSG(vdcp, 1, "waiting for response to idx=%x\n", 3851 b_idx); 3852 rv = vdc_wait_for_response(vdcp, &vio_msg); 3853 if (rv) { 3854 /* 3855 * If this is a block read/write we update 3856 * the I/O statistics kstat to take it 3857 * off the run queue. 
3858 */ 3859 mutex_enter(&vdcp->lock); 3860 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3861 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3862 VD_KSTAT_RUNQ_EXIT(vdcp); 3863 DTRACE_IO1(done, buf_t *, 3864 curr_ldep->cb_arg); 3865 } 3866 DMSG(vdcp, 1, "[%d] wait_for_response " 3867 "returned err=%d\n", vdcp->instance, 3868 rv); 3869 mutex_exit(&vdcp->lock); 3870 goto done; 3871 } 3872 3873 DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx); 3874 rv = vdc_process_data_msg(vdcp, &vio_msg); 3875 if (rv) { 3876 DMSG(vdcp, 1, "[%d] process_data_msg " 3877 "returned err=%d\n", vdcp->instance, 3878 rv); 3879 goto done; 3880 } 3881 /* 3882 * Mark this entry as free so that we will not resubmit 3883 * this "done" request again, if we were to use the same 3884 * backup_dring again in future. This could happen when 3885 * a reset happens while processing the backup_dring. 3886 */ 3887 curr_ldep->is_free = B_TRUE; 3888 processed++; 3889 } 3890 3891 /* get the next element to submit */ 3892 if (++b_idx >= vdcp->local_dring_backup_len) 3893 b_idx = 0; 3894 } 3895 3896 /* all done - now clear up pending dring copy */ 3897 dring_size = vdcp->local_dring_backup_len * 3898 sizeof (vdcp->local_dring_backup[0]); 3899 3900 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3901 3902 vdcp->local_dring_backup = NULL; 3903 3904 done: 3905 DTRACE_PROBE2(processed, int, processed, vdc_t *, vdcp); 3906 3907 return (rv); 3908 } 3909 3910 /* 3911 * Function: 3912 * vdc_cancel_backup_dring 3913 * 3914 * Description: 3915 * Cancel each descriptor in the backed up dring to vDisk server. 3916 * The Dring was backed up during connection reset. 3917 * 3918 * Arguments: 3919 * vdcp - soft state pointer for this instance of the device driver. 3920 * 3921 * Return Code: 3922 * None 3923 */ 3924 void 3925 vdc_cancel_backup_dring(vdc_t *vdcp) 3926 { 3927 vdc_local_desc_t *ldep; 3928 struct buf *bufp; 3929 int count; 3930 int b_idx; 3931 int dring_size; 3932 int cancelled = 0; 3933 3934 ASSERT(MUTEX_HELD(&vdcp->lock)); 3935 ASSERT(vdcp->state == VDC_STATE_INIT || 3936 vdcp->state == VDC_STATE_INIT_WAITING || 3937 vdcp->state == VDC_STATE_NEGOTIATE || 3938 vdcp->state == VDC_STATE_RESETTING); 3939 3940 if (vdcp->local_dring_backup == NULL) { 3941 /* the pending requests have already been processed */ 3942 return; 3943 } 3944 3945 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n", 3946 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3947 3948 /* 3949 * Walk the backup copy of the local descriptor ring and 3950 * cancel all the outstanding transactions. 3951 */ 3952 b_idx = vdcp->local_dring_backup_tail; 3953 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3954 3955 ldep = &(vdcp->local_dring_backup[b_idx]); 3956 3957 /* only cancel outstanding transactions */ 3958 if (!ldep->is_free) { 3959 3960 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx); 3961 cancelled++; 3962 3963 /* 3964 * All requests have already been cleared from the 3965 * local descriptor ring and the LDC channel has been 3966 * reset so we will never get any reply for these 3967 * requests. Now we just have to notify threads waiting 3968 * for replies that the request has failed. 
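 *
 *    Two completion paths are used below: CB_SYNC waiters are woken
 *    through sync_pending_cv with sync_op_status set to EIO, while
 *    CB_STRATEGY buffers are failed directly, roughly:
 *
 *        bioerror(bufp, EIO);
 *        biodone(bufp);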
3969 */ 3970 switch (ldep->cb_type) { 3971 case CB_SYNC: 3972 ASSERT(vdcp->sync_op_pending); 3973 vdcp->sync_op_status = EIO; 3974 vdcp->sync_op_pending = B_FALSE; 3975 cv_signal(&vdcp->sync_pending_cv); 3976 break; 3977 3978 case CB_STRATEGY: 3979 bufp = ldep->cb_arg; 3980 ASSERT(bufp != NULL); 3981 bufp->b_resid = bufp->b_bcount; 3982 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 3983 VD_KSTAT_RUNQ_EXIT(vdcp); 3984 DTRACE_IO1(done, buf_t *, bufp); 3985 bioerror(bufp, EIO); 3986 biodone(bufp); 3987 break; 3988 3989 default: 3990 ASSERT(0); 3991 } 3992 3993 } 3994 3995 /* get the next element to cancel */ 3996 if (++b_idx >= vdcp->local_dring_backup_len) 3997 b_idx = 0; 3998 } 3999 4000 /* all done - now clear up pending dring copy */ 4001 dring_size = vdcp->local_dring_backup_len * 4002 sizeof (vdcp->local_dring_backup[0]); 4003 4004 (void) kmem_free(vdcp->local_dring_backup, dring_size); 4005 4006 vdcp->local_dring_backup = NULL; 4007 4008 DTRACE_PROBE2(cancelled, int, cancelled, vdc_t *, vdcp); 4009 } 4010 4011 /* 4012 * Function: 4013 * vdc_connection_timeout 4014 * 4015 * Description: 4016 * This function is invoked if the timeout set to establish the connection 4017 * with vds expires. This will happen if we spend too much time in the 4018 * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. Then we will 4019 * cancel any pending request and mark them as failed. 4020 * 4021 * If the timeout does not expire, it will be cancelled when we reach the 4022 * VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can 4023 * be invoked while we are in the VDC_STATE_HANDLE_PENDING or 4024 * VDC_STATE_RESETTING state in which case we do nothing because the 4025 * timeout is being cancelled. 4026 * 4027 * Arguments: 4028 * arg - argument of the timeout function actually a soft state 4029 * pointer for the instance of the device driver. 4030 * 4031 * Return Code: 4032 * None 4033 */ 4034 void 4035 vdc_connection_timeout(void *arg) 4036 { 4037 vdc_t *vdcp = (vdc_t *)arg; 4038 4039 mutex_enter(&vdcp->lock); 4040 4041 if (vdcp->state == VDC_STATE_HANDLE_PENDING || 4042 vdcp->state == VDC_STATE_DETACH) { 4043 /* 4044 * The connection has just been re-established or 4045 * we are detaching. 4046 */ 4047 vdcp->ctimeout_reached = B_FALSE; 4048 mutex_exit(&vdcp->lock); 4049 return; 4050 } 4051 4052 vdcp->ctimeout_reached = B_TRUE; 4053 4054 /* notify requests waiting for sending */ 4055 cv_broadcast(&vdcp->running_cv); 4056 4057 /* cancel requests waiting for a result */ 4058 vdc_cancel_backup_dring(vdcp); 4059 4060 mutex_exit(&vdcp->lock); 4061 4062 cmn_err(CE_NOTE, "[%d] connection to service domain timeout", 4063 vdcp->instance); 4064 } 4065 4066 /* 4067 * Function: 4068 * vdc_backup_local_dring() 4069 * 4070 * Description: 4071 * Backup the current dring in the event of a reset. The Dring 4072 * transactions will be resubmitted to the server when the 4073 * connection is restored. 4074 * 4075 * Arguments: 4076 * vdcp - soft state pointer for this instance of the device driver. 4077 * 4078 * Return Code: 4079 * NONE 4080 */ 4081 static void 4082 vdc_backup_local_dring(vdc_t *vdcp) 4083 { 4084 int dring_size; 4085 4086 ASSERT(MUTEX_HELD(&vdcp->lock)); 4087 ASSERT(vdcp->state == VDC_STATE_RESETTING); 4088 4089 /* 4090 * If the backup dring is stil around, it means 4091 * that the last restore did not complete. However, 4092 * since we never got back into the running state, 4093 * the backup copy we have is still valid. 
4094 */ 4095 if (vdcp->local_dring_backup != NULL) { 4096 DMSG(vdcp, 1, "reusing local descriptor ring backup " 4097 "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len, 4098 vdcp->local_dring_backup_tail); 4099 return; 4100 } 4101 4102 /* 4103 * The backup dring can be NULL and the local dring may not be 4104 * initialized. This can happen if we had a reset while establishing 4105 * a new connection but after the connection has timed out. In that 4106 * case the backup dring is NULL because the requests have been 4107 * cancelled and the reset occurred before the local dring was 4108 * initialized. 4109 */ 4110 if (!(vdcp->initialized & VDC_DRING_LOCAL)) 4111 return; 4112 4113 DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, " 4114 "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx); 4115 4116 dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]); 4117 4118 vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP); 4119 bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size); 4120 4121 vdcp->local_dring_backup_tail = vdcp->dring_curr_idx; 4122 vdcp->local_dring_backup_len = vdcp->dring_len; 4123 } 4124 4125 static void 4126 vdc_switch_server(vdc_t *vdcp) 4127 { 4128 int rv; 4129 vdc_server_t *curr_server, *new_server; 4130 4131 ASSERT(MUTEX_HELD(&vdcp->lock)); 4132 4133 /* if there is only one server, return */ 4134 if (vdcp->num_servers == 1) { 4135 return; 4136 } 4137 4138 /* Get current and next server */ 4139 curr_server = vdcp->curr_server; 4140 new_server = 4141 (curr_server->next) ? curr_server->next : vdcp->server_list; 4142 ASSERT(curr_server != new_server); 4143 4144 /* bring current server's channel down */ 4145 rv = ldc_down(curr_server->ldc_handle); 4146 if (rv) { 4147 DMSG(vdcp, 0, "[%d] Cannot bring channel down, port %ld\n", 4148 vdcp->instance, curr_server->id); 4149 return; 4150 } 4151 4152 /* switch the server */ 4153 vdcp->curr_server = new_server; 4154 4155 cmn_err(CE_NOTE, "Successfully failed over from VDS on port@%ld to " 4156 "VDS on port@%ld.\n", curr_server->id, new_server->id); 4157 DMSG(vdcp, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n", 4158 vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id); 4159 } 4160 4161 /* -------------------------------------------------------------------------- */ 4162 4163 /* 4164 * The following functions process the incoming messages from vds 4165 */ 4166 4167 /* 4168 * Function: 4169 * vdc_process_msg_thread() 4170 * 4171 * Description: 4172 * 4173 * Main VDC message processing thread. Each vDisk instance 4174 * has its own copy of this thread. This thread triggers 4175 * all the handshakes and data exchange with the server. It 4176 * also handles all channel resets. 4177 * 4178 * Arguments: 4179 * vdc - soft state pointer for this instance of the device driver. 4180 * 4181 * Return Code: 4182 * None 4183 */ 4184 static void 4185 vdc_process_msg_thread(vdc_t *vdcp) 4186 { 4187 int status; 4188 int ctimeout; 4189 timeout_id_t tmid = 0; 4190 clock_t ldcup_timeout = 0; 4191 4192 mutex_enter(&vdcp->lock); 4193 4194 for (;;) { 4195 4196 #define Q(_s) (vdcp->state == _s) ?
#_s : 4197 DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state, 4198 Q(VDC_STATE_INIT) 4199 Q(VDC_STATE_INIT_WAITING) 4200 Q(VDC_STATE_NEGOTIATE) 4201 Q(VDC_STATE_HANDLE_PENDING) 4202 Q(VDC_STATE_RUNNING) 4203 Q(VDC_STATE_RESETTING) 4204 Q(VDC_STATE_DETACH) 4205 "UNKNOWN"); 4206 4207 switch (vdcp->state) { 4208 case VDC_STATE_INIT: 4209 4210 /* 4211 * If requested, start a timeout to check if the 4212 * connection with vds is established in the 4213 * specified delay. If the timeout expires, we 4214 * will cancel any pending requests. 4215 * 4216 * If some resets have occurred while establishing 4217 * the connection, we already have a timeout armed 4218 * and in that case we don't need to arm a new one. 4219 * 4220 * The same rule applies when there are multiple vds instances. 4221 * If either a connection cannot be established or 4222 * the handshake times out, the connection thread will 4223 * try another server. The 'ctimeout' will report 4224 * back an error after it expires irrespective of 4225 * whether the vdisk is trying to connect to just 4226 * one or multiple servers. 4227 */ 4228 ctimeout = (vdc_timeout != 0)? 4229 vdc_timeout : vdcp->curr_server->ctimeout; 4230 4231 if (ctimeout != 0 && tmid == 0) { 4232 tmid = timeout(vdc_connection_timeout, vdcp, 4233 ctimeout * drv_usectohz(MICROSEC)); 4234 } 4235 4236 /* Check if we are re-initializing repeatedly */ 4237 if (vdcp->hshake_cnt > vdc_hshake_retries && 4238 vdcp->lifecycle != VDC_LC_ONLINE) { 4239 4240 DMSG(vdcp, 0, "[%d] too many handshakes, cnt=%d", 4241 vdcp->instance, vdcp->hshake_cnt); 4242 cmn_err(CE_NOTE, "[%d] disk access failed.\n", 4243 vdcp->instance); 4244 vdcp->state = VDC_STATE_DETACH; 4245 break; 4246 } 4247 4248 /* Switch to STATE_DETACH if drv is detaching */ 4249 if (vdcp->lifecycle == VDC_LC_DETACHING) { 4250 vdcp->state = VDC_STATE_DETACH; 4251 break; 4252 } 4253 4254 /* Switch server */ 4255 if (vdcp->hshake_cnt > 0) 4256 vdc_switch_server(vdcp); 4257 vdcp->hshake_cnt++; 4258 4259 /* Bring up connection with vds via LDC */ 4260 status = vdc_start_ldc_connection(vdcp); 4261 if (status != EINVAL) { 4262 vdcp->state = VDC_STATE_INIT_WAITING; 4263 } 4264 break; 4265 4266 case VDC_STATE_INIT_WAITING: 4267 4268 /* if channel is UP, start negotiation */ 4269 if (vdcp->curr_server->ldc_state == LDC_UP) { 4270 vdcp->state = VDC_STATE_NEGOTIATE; 4271 break; 4272 } 4273 4274 /* check if only one server exists */ 4275 if (vdcp->num_servers == 1) { 4276 cv_wait(&vdcp->initwait_cv, &vdcp->lock); 4277 } else { 4278 /* 4279 * wait for LDC_UP; if it times out, switch 4280 * to another server.
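 *
 * The wait below is bounded: vdc_ldcup_timeout seconds are
 * converted to ticks with drv_usectohz(MICROSEC) and added to
 * ddi_get_lbolt() to form the absolute wakeup time that
 * cv_timedwait() expects; a return value of -1 indicates the
 * timeout expired without the channel coming up.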
4281 */ 4282 ldcup_timeout = ddi_get_lbolt() + 4283 (vdc_ldcup_timeout * 4284 drv_usectohz(MICROSEC)); 4285 status = cv_timedwait(&vdcp->initwait_cv, 4286 &vdcp->lock, ldcup_timeout); 4287 if (status == -1 && 4288 vdcp->state == VDC_STATE_INIT_WAITING && 4289 vdcp->curr_server->ldc_state != LDC_UP) { 4290 /* timed out & still waiting */ 4291 vdcp->state = VDC_STATE_INIT; 4292 break; 4293 } 4294 } 4295 4296 if (vdcp->state != VDC_STATE_INIT_WAITING) { 4297 DMSG(vdcp, 0, 4298 "state moved to %d out from under us...\n", 4299 vdcp->state); 4300 } 4301 break; 4302 4303 case VDC_STATE_NEGOTIATE: 4304 switch (status = vdc_ver_negotiation(vdcp)) { 4305 case 0: 4306 break; 4307 default: 4308 DMSG(vdcp, 0, "ver negotiate failed (%d)..\n", 4309 status); 4310 goto reset; 4311 } 4312 4313 switch (status = vdc_attr_negotiation(vdcp)) { 4314 case 0: 4315 break; 4316 default: 4317 DMSG(vdcp, 0, "attr negotiate failed (%d)..\n", 4318 status); 4319 goto reset; 4320 } 4321 4322 switch (status = vdc_dring_negotiation(vdcp)) { 4323 case 0: 4324 break; 4325 default: 4326 DMSG(vdcp, 0, "dring negotiate failed (%d)..\n", 4327 status); 4328 goto reset; 4329 } 4330 4331 switch (status = vdc_rdx_exchange(vdcp)) { 4332 case 0: 4333 vdcp->state = VDC_STATE_HANDLE_PENDING; 4334 goto done; 4335 default: 4336 DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n", 4337 status); 4338 goto reset; 4339 } 4340 reset: 4341 DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n", 4342 status); 4343 vdcp->state = VDC_STATE_RESETTING; 4344 vdcp->self_reset = B_TRUE; 4345 done: 4346 DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", 4347 vdcp->state); 4348 break; 4349 4350 case VDC_STATE_HANDLE_PENDING: 4351 4352 if (vdcp->ctimeout_reached) { 4353 /* 4354 * The connection timeout had been reached so 4355 * pending requests have been cancelled. Now 4356 * that the connection is back we can reset 4357 * the timeout. 4358 */ 4359 ASSERT(vdcp->local_dring_backup == NULL); 4360 ASSERT(tmid != 0); 4361 tmid = 0; 4362 vdcp->ctimeout_reached = B_FALSE; 4363 vdcp->state = VDC_STATE_RUNNING; 4364 DMSG(vdcp, 0, "[%d] connection to service " 4365 "domain is up", vdcp->instance); 4366 break; 4367 } 4368 4369 mutex_exit(&vdcp->lock); 4370 if (tmid != 0) { 4371 (void) untimeout(tmid); 4372 tmid = 0; 4373 } 4374 status = vdc_resubmit_backup_dring(vdcp); 4375 mutex_enter(&vdcp->lock); 4376 4377 if (status) 4378 vdcp->state = VDC_STATE_RESETTING; 4379 else 4380 vdcp->state = VDC_STATE_RUNNING; 4381 4382 break; 4383 4384 /* enter running state */ 4385 case VDC_STATE_RUNNING: 4386 /* 4387 * Signal anyone waiting for the connection 4388 * to come on line. 
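 *
 * (I/O threads block on running_cv until the state machine
 * reaches VDC_STATE_RUNNING, so the broadcast below is what
 * releases requests that arrived while the connection was down.)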
4389 */ 4390 vdcp->hshake_cnt = 0; 4391 cv_broadcast(&vdcp->running_cv); 4392 4393 /* failfast has to be checked after reset */ 4394 cv_signal(&vdcp->failfast_cv); 4395 4396 /* ownership is lost during reset */ 4397 if (vdcp->ownership & VDC_OWNERSHIP_WANTED) 4398 vdcp->ownership |= VDC_OWNERSHIP_RESET; 4399 cv_signal(&vdcp->ownership_cv); 4400 4401 mutex_exit(&vdcp->lock); 4402 4403 for (;;) { 4404 vio_msg_t msg; 4405 status = vdc_wait_for_response(vdcp, &msg); 4406 if (status) break; 4407 4408 DMSG(vdcp, 1, "[%d] new pkt(s) available\n", 4409 vdcp->instance); 4410 status = vdc_process_data_msg(vdcp, &msg); 4411 if (status) { 4412 DMSG(vdcp, 1, "[%d] process_data_msg " 4413 "returned err=%d\n", vdcp->instance, 4414 status); 4415 break; 4416 } 4417 4418 } 4419 4420 mutex_enter(&vdcp->lock); 4421 4422 vdcp->state = VDC_STATE_RESETTING; 4423 vdcp->self_reset = B_TRUE; 4424 break; 4425 4426 case VDC_STATE_RESETTING: 4427 /* 4428 * When we reach this state, we either come from the 4429 * VDC_STATE_RUNNING state and we can have pending 4430 * requests but no timeout is armed; or we come from 4431 * the VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE or 4432 * VDC_STATE_HANDLE_PENDING state and there is no pending 4433 * request or pending requests have already been copied 4434 * into the backup dring. So we can safely keep the 4435 * connection timeout armed while we are in this state. 4436 */ 4437 4438 DMSG(vdcp, 0, "Initiating channel reset " 4439 "(pending = %d)\n", (int)vdcp->threads_pending); 4440 4441 if (vdcp->self_reset) { 4442 DMSG(vdcp, 0, 4443 "[%d] calling stop_ldc_connection.\n", 4444 vdcp->instance); 4445 status = vdc_stop_ldc_connection(vdcp); 4446 vdcp->self_reset = B_FALSE; 4447 } 4448 4449 /* 4450 * Wait for all threads currently waiting 4451 * for a free dring entry to use.
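 *
 * The loop below broadcasts on membind_cv and dring_free_cv,
 * then drops the lock for vdc_hz_min_ldc_delay ticks so the
 * waiters have a chance to run and notice the reset before the
 * descriptor ring is destroyed.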
4452 */ 4453 while (vdcp->threads_pending) { 4454 cv_broadcast(&vdcp->membind_cv); 4455 cv_broadcast(&vdcp->dring_free_cv); 4456 mutex_exit(&vdcp->lock); 4457 /* give the waiters enough time to wake up */ 4458 delay(vdc_hz_min_ldc_delay); 4459 mutex_enter(&vdcp->lock); 4460 } 4461 4462 ASSERT(vdcp->threads_pending == 0); 4463 4464 /* Sanity check that no thread is receiving */ 4465 ASSERT(vdcp->read_state != VDC_READ_WAITING); 4466 4467 vdcp->read_state = VDC_READ_IDLE; 4468 4469 vdc_backup_local_dring(vdcp); 4470 4471 /* cleanup the old d-ring */ 4472 vdc_destroy_descriptor_ring(vdcp); 4473 4474 /* go and start again */ 4475 vdcp->state = VDC_STATE_INIT; 4476 4477 break; 4478 4479 case VDC_STATE_DETACH: 4480 DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n", 4481 vdcp->instance); 4482 4483 /* cancel any pending timeout */ 4484 mutex_exit(&vdcp->lock); 4485 if (tmid != 0) { 4486 (void) untimeout(tmid); 4487 tmid = 0; 4488 } 4489 mutex_enter(&vdcp->lock); 4490 4491 /* 4492 * Signal anyone waiting for connection 4493 * to come online 4494 */ 4495 cv_broadcast(&vdcp->running_cv); 4496 4497 while (vdcp->sync_op_pending) { 4498 cv_signal(&vdcp->sync_pending_cv); 4499 cv_signal(&vdcp->sync_blocked_cv); 4500 mutex_exit(&vdcp->lock); 4501 /* give the waiters enough time to wake up */ 4502 delay(vdc_hz_min_ldc_delay); 4503 mutex_enter(&vdcp->lock); 4504 } 4505 4506 mutex_exit(&vdcp->lock); 4507 4508 DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n", 4509 vdcp->instance); 4510 thread_exit(); 4511 break; 4512 } 4513 } 4514 } 4515 4516 4517 /* 4518 * Function: 4519 * vdc_process_data_msg() 4520 * 4521 * Description: 4522 * This function is called by the message processing thread each time 4523 * a message with a msgtype of VIO_TYPE_DATA is received. It will either 4524 * be an ACK or NACK from vds[1] which vdc handles as follows. 4525 * ACK - wake up the waiting thread 4526 * NACK - resend any messages necessary 4527 * 4528 * [1] Although the message format allows it, vds should not send a 4529 * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for 4530 * some bizarre reason it does, vdc will reset the connection. 4531 * 4532 * Arguments: 4533 * vdc - soft state pointer for this instance of the device driver. 4534 * msg - the LDC message sent by vds 4535 * 4536 * Return Code: 4537 * 0 - Success. 4538 * > 0 - error value returned by LDC 4539 */ 4540 static int 4541 vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg) 4542 { 4543 int status = 0; 4544 vio_dring_msg_t *dring_msg; 4545 vdc_local_desc_t *ldep = NULL; 4546 int start, end; 4547 int idx; 4548 int op; 4549 4550 dring_msg = (vio_dring_msg_t *)msg; 4551 4552 ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA); 4553 ASSERT(vdcp != NULL); 4554 4555 mutex_enter(&vdcp->lock); 4556 4557 /* 4558 * Check to see if the message has bogus data 4559 */ 4560 idx = start = dring_msg->start_idx; 4561 end = dring_msg->end_idx; 4562 if ((start >= vdcp->dring_len) || 4563 (end >= vdcp->dring_len) || (end < -1)) { 4564 /* 4565 * Update the I/O statistics to indicate that an error occurred. 4566 * No need to update the wait/run queues as no specific read or 4567 * write request is being completed in response to this 'msg'. 4568 */ 4569 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4570 DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n", 4571 vdcp->instance, start, end); 4572 mutex_exit(&vdcp->lock); 4573 return (EINVAL); 4574 } 4575 4576 /* 4577 * Verify that the sequence number is what vdc expects.
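 *
 * A worked example with made-up values: if vdc has generated
 * requests up to vdc->seq_num == 10 and the last reply processed
 * had vdc->seq_num_reply == 7, then an incoming message with
 * seq_num 8, 9 or 10 is accepted, while 7 or below (already
 * processed) and 11 or above (never generated) make
 * vdc_verify_seq_num() return VDC_SEQ_NUM_INVALID.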
4578 */ 4579 switch (vdc_verify_seq_num(vdcp, dring_msg)) { 4580 case VDC_SEQ_NUM_TODO: 4581 break; /* keep processing this message */ 4582 case VDC_SEQ_NUM_SKIP: 4583 mutex_exit(&vdcp->lock); 4584 return (0); 4585 case VDC_SEQ_NUM_INVALID: 4586 /* 4587 * Update the I/O statistics to indicate that an error occurred. 4588 * No need to update the wait/run queues as no specific read or 4589 * write request is being completed in response to this 'msg'. 4590 */ 4591 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4592 DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance); 4593 mutex_exit(&vdcp->lock); 4594 return (ENXIO); 4595 } 4596 4597 if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) { 4598 /* 4599 * Update the I/O statistics to indicate that an error occurred. 4600 * 4601 * We need to update the run queue if a read or write request 4602 * is being NACKed - otherwise there will appear to be an 4603 * indefinite outstanding request and statistics reported by 4604 * iostat(1M) will be incorrect. The transaction will be 4605 * resubmitted from the backup DRing following the reset 4606 * and the wait/run queues will be entered again. 4607 */ 4608 ldep = &vdcp->local_dring[idx]; 4609 op = ldep->operation; 4610 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 4611 DTRACE_IO1(done, buf_t *, ldep->cb_arg); 4612 VD_KSTAT_RUNQ_EXIT(vdcp); 4613 } 4614 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4615 VDC_DUMP_DRING_MSG(dring_msg); 4616 DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance); 4617 mutex_exit(&vdcp->lock); 4618 return (EIO); 4619 4620 } else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) { 4621 /* 4622 * Update the I/O statistics to indicate that an error occurred. 4623 * No need to update the wait/run queues as no specific read or 4624 * write request is being completed in response to this 'msg'. 4625 */ 4626 VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs); 4627 mutex_exit(&vdcp->lock); 4628 return (EPROTO); 4629 } 4630 4631 DMSG(vdcp, 1, ": start %d end %d\n", start, end); 4632 ASSERT(start == end); 4633 4634 ldep = &vdcp->local_dring[idx]; 4635 4636 DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n", 4637 ldep->dep->hdr.dstate, ldep->cb_type); 4638 4639 if (ldep->dep->hdr.dstate == VIO_DESC_DONE) { 4640 struct buf *bufp; 4641 4642 switch (ldep->cb_type) { 4643 case CB_SYNC: 4644 ASSERT(vdcp->sync_op_pending); 4645 4646 status = vdc_depopulate_descriptor(vdcp, idx); 4647 vdcp->sync_op_status = status; 4648 vdcp->sync_op_pending = B_FALSE; 4649 cv_signal(&vdcp->sync_pending_cv); 4650 break; 4651 4652 case CB_STRATEGY: 4653 bufp = ldep->cb_arg; 4654 ASSERT(bufp != NULL); 4655 bufp->b_resid = 4656 bufp->b_bcount - ldep->dep->payload.nbytes; 4657 status = ldep->dep->payload.status; /* Future:ntoh */ 4658 if (status != 0) { 4659 DMSG(vdcp, 1, "strategy status=%d\n", status); 4660 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4661 bioerror(bufp, status); 4662 } 4663 4664 (void) vdc_depopulate_descriptor(vdcp, idx); 4665 4666 DMSG(vdcp, 1, 4667 "strategy complete req=%ld bytes resp=%ld bytes\n", 4668 bufp->b_bcount, ldep->dep->payload.nbytes); 4669 4670 if (status != 0 && vdcp->failfast_interval != 0) { 4671 /* 4672 * The I/O has failed and failfast is enabled. 4673 * We need the failfast thread to check if the 4674 * failure is due to a reservation conflict. 4675 */ 4676 (void) vdc_failfast_io_queue(vdcp, bufp); 4677 } else { 4678 if (status == 0) { 4679 op = (bufp->b_flags & B_READ) ?
4680 VD_OP_BREAD : VD_OP_BWRITE; 4681 VD_UPDATE_IO_STATS(vdcp, op, 4682 ldep->dep->payload.nbytes); 4683 } 4684 VD_KSTAT_RUNQ_EXIT(vdcp); 4685 DTRACE_IO1(done, buf_t *, bufp); 4686 biodone(bufp); 4687 } 4688 break; 4689 4690 default: 4691 ASSERT(0); 4692 } 4693 } 4694 4695 /* let the arrival signal propagate */ 4696 mutex_exit(&vdcp->lock); 4697 4698 /* probe gives the count of how many entries were processed */ 4699 DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp); 4700 4701 return (0); 4702 } 4703 4704 4705 /* 4706 * Function: 4707 * vdc_handle_ver_msg() 4708 * 4709 * Description: 4710 * Handle a VIO_VER_INFO message from vds during version negotiation. 4711 * Arguments: 4712 * vdc - soft state pointer for this instance of the device driver. 4713 * ver_msg - LDC message sent by vDisk server 4714 * 4715 * Return Code: 4716 * 0 - Success 4717 */ 4718 static int 4719 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg) 4720 { 4721 int status = 0; 4722 4723 ASSERT(vdc != NULL); 4724 ASSERT(mutex_owned(&vdc->lock)); 4725 4726 if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) { 4727 return (EPROTO); 4728 } 4729 4730 if (ver_msg->dev_class != VDEV_DISK_SERVER) { 4731 return (EINVAL); 4732 } 4733 4734 switch (ver_msg->tag.vio_subtype) { 4735 case VIO_SUBTYPE_ACK: 4736 /* 4737 * We check to see if the version returned is indeed supported 4738 * (The server may have also adjusted the minor number downwards 4739 * and if so 'ver_msg' will contain the actual version agreed) 4740 */ 4741 if (vdc_is_supported_version(ver_msg)) { 4742 vdc->ver.major = ver_msg->ver_major; 4743 vdc->ver.minor = ver_msg->ver_minor; 4744 ASSERT(vdc->ver.major > 0); 4745 } else { 4746 status = EPROTO; 4747 } 4748 break; 4749 4750 case VIO_SUBTYPE_NACK: 4751 /* 4752 * call vdc_is_supported_version() which will return the next 4753 * supported version (if any) in 'ver_msg' 4754 */ 4755 (void) vdc_is_supported_version(ver_msg); 4756 if (ver_msg->ver_major > 0) { 4757 size_t len = sizeof (*ver_msg); 4758 4759 ASSERT(vdc->ver.major > 0); 4760 4761 /* reset the necessary fields and resend */ 4762 ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO; 4763 ver_msg->dev_class = VDEV_DISK; 4764 4765 status = vdc_send(vdc, (caddr_t)ver_msg, &len); 4766 DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n", 4767 vdc->instance, status); 4768 if (len != sizeof (*ver_msg)) 4769 status = EBADMSG; 4770 } else { 4771 DMSG(vdc, 0, "[%d] No common version with vDisk server", 4772 vdc->instance); 4773 status = ENOTSUP; 4774 } 4775 4776 break; 4777 case VIO_SUBTYPE_INFO: 4778 /* 4779 * Handle the case where vds starts the handshake 4780 * (for now only vdc is the instigator) 4781 */ 4782 status = ENOTSUP; 4783 break; 4784 4785 default: 4786 status = EINVAL; 4787 break; 4788 } 4789 4790 return (status); 4791 } 4792 4793 /* 4794 * Function: 4795 * vdc_handle_attr_msg() 4796 * 4797 * Description: 4798 * Handle a VIO_ATTR_INFO message from vds during attribute negotiation. 4799 * Arguments: 4800 * vdc - soft state pointer for this instance of the device driver. 4801 * attr_msg - LDC message sent by vDisk server 4802 * 4803 * Return Code: 4804 * 0 - Success 4805 */ 4806 static int 4807 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg) 4808 { 4809 int status = 0; 4810 4811 ASSERT(vdc != NULL); 4812 ASSERT(mutex_owned(&vdc->lock)); 4813 4814 if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) { 4815 return (EPROTO); 4816 } 4817 4818 switch (attr_msg->tag.vio_subtype) { 4819 case VIO_SUBTYPE_ACK: 4820 /* 4821 * We now verify the attributes sent by vds.
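 * In order, the checks below: reject a zero vdisk_size or
 * max_xfer_sz, normalize VD_SIZE_UNKNOWN to 0, reject a disk
 * size that changed across a reconnect, cap the negotiated
 * (max_xfer_sz * vdisk_block_size) product at
 * PAGESIZE * DEV_BSIZE, and finally sanity-check the transfer
 * mode, size, operations and disk type.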
4822 */ 4823 if (attr_msg->vdisk_size == 0) { 4824 DMSG(vdc, 0, "[%d] Invalid disk size from vds", 4825 vdc->instance); 4826 status = EINVAL; 4827 break; 4828 } 4829 4830 if (attr_msg->max_xfer_sz == 0) { 4831 DMSG(vdc, 0, "[%d] Invalid transfer size from vds", 4832 vdc->instance); 4833 status = EINVAL; 4834 break; 4835 } 4836 4837 if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) { 4838 DMSG(vdc, 0, "[%d] Unknown disk size from vds", 4839 vdc->instance); 4840 attr_msg->vdisk_size = 0; 4841 } 4842 4843 /* 4844 * If the disk size is already set check that it hasn't changed. 4845 */ 4846 if ((vdc->vdisk_size != 0) && (attr_msg->vdisk_size != 0) && 4847 (vdc->vdisk_size != attr_msg->vdisk_size)) { 4848 DMSG(vdc, 0, "[%d] Different disk size from vds " 4849 "(old=0x%lx - new=0x%lx)", vdc->instance, 4850 vdc->vdisk_size, attr_msg->vdisk_size); 4851 status = EINVAL; 4852 break; 4853 } 4854 4855 vdc->vdisk_size = attr_msg->vdisk_size; 4856 vdc->vdisk_type = attr_msg->vdisk_type; 4857 vdc->operations = attr_msg->operations; 4858 if (vio_ver_is_supported(vdc->ver, 1, 1)) 4859 vdc->vdisk_media = attr_msg->vdisk_media; 4860 else 4861 vdc->vdisk_media = 0; 4862 4863 DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n", 4864 vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz); 4865 DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n", 4866 vdc->instance, vdc->block_size, 4867 attr_msg->vdisk_block_size); 4868 4869 /* 4870 * We don't know at compile time what the vDisk server will 4871 * think are good values but we apply a large (arbitrary) 4872 * upper bound to prevent memory exhaustion in vdc if it was 4873 * allocating a DRing based on huge values sent by the server. 4874 * We will probably never exceed this unless the message 4875 * was garbage. 4876 */ 4877 if ((attr_msg->max_xfer_sz * attr_msg->vdisk_block_size) <= 4878 (PAGESIZE * DEV_BSIZE)) { 4879 vdc->max_xfer_sz = attr_msg->max_xfer_sz; 4880 vdc->block_size = attr_msg->vdisk_block_size; 4881 } else { 4882 DMSG(vdc, 0, "[%d] vds block transfer size too big;" 4883 " using max supported by vdc", vdc->instance); 4884 } 4885 4886 if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) || 4887 (attr_msg->vdisk_size > INT64_MAX) || 4888 (attr_msg->operations == 0) || 4889 (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) { 4890 DMSG(vdc, 0, "[%d] Invalid attributes from vds", 4891 vdc->instance); 4892 status = EINVAL; 4893 break; 4894 } 4895 4896 /* 4897 * Now that we have received all attributes we can create a 4898 * fake geometry for the disk. 4899 */ 4900 vdc_create_fake_geometry(vdc); 4901 break; 4902 4903 case VIO_SUBTYPE_NACK: 4904 /* 4905 * vds could not handle the attributes we sent so we 4906 * stop negotiating. 4907 */ 4908 status = EPROTO; 4909 break; 4910 4911 case VIO_SUBTYPE_INFO: 4912 /* 4913 * Handle the case where vds starts the handshake 4914 * (for now, vdc is the only supported instigator) 4915 */ 4916 status = ENOTSUP; 4917 break; 4918 4919 default: 4920 status = ENOTSUP; 4921 break; 4922 } 4923 4924 return (status); 4925 } 4926 4927 /* 4928 * Function: 4929 * vdc_handle_dring_reg_msg() 4930 * 4931 * Description: 4932 * Handle a VIO_DRING_REG message from vds during dring registration. 4933 * Arguments: 4934 * vdc - soft state pointer for this instance of the driver.
4935 * dring_msg - LDC message sent by vDisk server 4936 * 4937 * Return Code: 4938 * 0 - Success 4939 */ 4940 static int 4941 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg) 4942 { 4943 int status = 0; 4944 4945 ASSERT(vdc != NULL); 4946 ASSERT(mutex_owned(&vdc->lock)); 4947 4948 if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) { 4949 return (EPROTO); 4950 } 4951 4952 switch (dring_msg->tag.vio_subtype) { 4953 case VIO_SUBTYPE_ACK: 4954 /* save the received dring_ident */ 4955 vdc->dring_ident = dring_msg->dring_ident; 4956 DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n", 4957 vdc->instance, vdc->dring_ident); 4958 break; 4959 4960 case VIO_SUBTYPE_NACK: 4961 /* 4962 * vds could not handle the DRing info we sent so we 4963 * stop negotiating. 4964 */ 4965 DMSG(vdc, 0, "[%d] server could not register DRing\n", 4966 vdc->instance); 4967 status = EPROTO; 4968 break; 4969 4970 case VIO_SUBTYPE_INFO: 4971 /* 4972 * Handle the case where vds starts the handshake 4973 * (for now only vdc is the instigator) 4974 */ 4975 status = ENOTSUP; 4976 break; 4977 default: 4978 status = ENOTSUP; 4979 } 4980 4981 return (status); 4982 } 4983 4984 /* 4985 * Function: 4986 * vdc_verify_seq_num() 4987 * 4988 * Description: 4989 * This function verifies that the sequence number sent back by the vDisk 4990 * server with the latest message is what is expected (i.e. it is greater 4991 * than the last seq num sent by the vDisk server and less than or equal 4992 * to the last seq num generated by vdc). 4993 * 4994 * It then checks the request ID to see if any requests need processing 4995 * in the DRing. 4996 * 4997 * Arguments: 4998 * vdc - soft state pointer for this instance of the driver. 4999 * dring_msg - pointer to the LDC message sent by vds 5000 * 5001 * Return Code: 5002 * VDC_SEQ_NUM_TODO - Message needs to be processed 5003 * VDC_SEQ_NUM_SKIP - Message has already been processed 5004 * VDC_SEQ_NUM_INVALID - The seq numbers are so out of sync, 5005 * vdc cannot deal with them 5006 */ 5007 static int 5008 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg) 5009 { 5010 ASSERT(vdc != NULL); 5011 ASSERT(dring_msg != NULL); 5012 ASSERT(mutex_owned(&vdc->lock)); 5013 5014 /* 5015 * Check to see if the messages were responded to in the correct 5016 * order by vds. 5017 */ 5018 if ((dring_msg->seq_num <= vdc->seq_num_reply) || 5019 (dring_msg->seq_num > vdc->seq_num)) { 5020 DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: " 5021 "%lu > expected <= %lu (last proc req %lu sent %lu)\n", 5022 vdc->instance, dring_msg->seq_num, 5023 vdc->seq_num_reply, vdc->seq_num, 5024 vdc->req_id_proc, vdc->req_id); 5025 return (VDC_SEQ_NUM_INVALID); 5026 } 5027 vdc->seq_num_reply = dring_msg->seq_num; 5028 5029 if (vdc->req_id_proc < vdc->req_id) 5030 return (VDC_SEQ_NUM_TODO); 5031 else 5032 return (VDC_SEQ_NUM_SKIP); 5033 } 5034 5035 5036 /* 5037 * Function: 5038 * vdc_is_supported_version() 5039 * 5040 * Description: 5041 * This routine checks if the major/minor version numbers specified in 5042 * 'ver_msg' are supported.
If not, it finds the next version that is 5043 * in the supported version list 'vdc_version[]' and sets the fields in 5044 * 'ver_msg' to those values. 5045 * 5046 * Arguments: 5047 * ver_msg - LDC message sent by vDisk server 5048 * 5049 * Return Code: 5050 * B_TRUE - Success 5051 * B_FALSE - Version not supported 5052 */ 5053 static boolean_t 5054 vdc_is_supported_version(vio_ver_msg_t *ver_msg) 5055 { 5056 int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]); 5057 5058 for (int i = 0; i < vdc_num_versions; i++) { 5059 ASSERT(vdc_version[i].major > 0); 5060 ASSERT((i == 0) || 5061 (vdc_version[i].major < vdc_version[i-1].major)); 5062 5063 /* 5064 * If the major versions match, adjust the minor version, if 5065 * necessary, down to the highest value supported by this 5066 * client. The server should support all minor versions lower 5067 * than the value it sent. 5068 */ 5069 if (ver_msg->ver_major == vdc_version[i].major) { 5070 if (ver_msg->ver_minor > vdc_version[i].minor) { 5071 DMSGX(0, 5072 "Adjusting minor version from %u to %u", 5073 ver_msg->ver_minor, vdc_version[i].minor); 5074 ver_msg->ver_minor = vdc_version[i].minor; 5075 } 5076 return (B_TRUE); 5077 } 5078 5079 /* 5080 * If the message contains a higher major version number, set 5081 * the message's major/minor versions to the current values 5082 * and return false, so this message will get resent with 5083 * these values, and the server will potentially try again 5084 * with the same or a lower version. 5085 */ 5086 if (ver_msg->ver_major > vdc_version[i].major) { 5087 ver_msg->ver_major = vdc_version[i].major; 5088 ver_msg->ver_minor = vdc_version[i].minor; 5089 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n", 5090 ver_msg->ver_major, ver_msg->ver_minor); 5091 5092 return (B_FALSE); 5093 } 5094 5095 /* 5096 * Otherwise, the message's major version is less than the 5097 * current major version, so continue the loop to the next 5098 * (lower) supported version. 5099 */ 5100 } 5101 5102 /* 5103 * No common version was found; "ground" the version pair in the 5104 * message to terminate negotiation. 5105 */ 5106 ver_msg->ver_major = 0; 5107 ver_msg->ver_minor = 0; 5108 5109 return (B_FALSE); 5110 } 5111 /* -------------------------------------------------------------------------- */ 5112 5113 /* 5114 * DKIO(7I) support 5115 */ 5116 5117 typedef struct vdc_dk_arg { 5118 struct dk_callback dkc; 5119 int mode; 5120 dev_t dev; 5121 vdc_t *vdc; 5122 } vdc_dk_arg_t; 5123 5124 /* 5125 * Function: 5126 * vdc_dkio_flush_cb() 5127 * 5128 * Description: 5129 * This routine is a callback for DKIOCFLUSHWRITECACHE which can be called 5130 * by kernel code. 5131 * 5132 * Arguments: 5133 * arg - a pointer to a vdc_dk_arg_t structure. 5134 */ 5135 void 5136 vdc_dkio_flush_cb(void *arg) 5137 { 5138 struct vdc_dk_arg *dk_arg = (struct vdc_dk_arg *)arg; 5139 struct dk_callback *dkc = NULL; 5140 vdc_t *vdc = NULL; 5141 int rv; 5142 5143 if (dk_arg == NULL) { 5144 cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n"); 5145 return; 5146 } 5147 dkc = &dk_arg->dkc; 5148 vdc = dk_arg->vdc; 5149 ASSERT(vdc != NULL); 5150 5151 rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, 5152 VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 5153 if (rv != 0) { 5154 DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n", 5155 vdc->instance, rv, 5156 ddi_model_convert_from(dk_arg->mode & FMODELS)); 5157 } 5158 5159 /* 5160 * Trigger the callback to notify the caller that the ioctl call has 5161 * been completed.
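 *
 * A sketch of a hypothetical in-kernel caller, for illustration
 * only (my_flush_done and my_cookie are made-up names; the layered
 * driver would reach this path through its ioctl entry point):
 *
 *	struct dk_callback dkc;
 *	int rval;
 *
 *	dkc.dkc_callback = my_flush_done;
 *	dkc.dkc_cookie = my_cookie;
 *	(void) ldi_ioctl(lh, DKIOCFLUSHWRITECACHE, (intptr_t)&dkc,
 *	    FKIOCTL, kcred, &rval);
 *
 * When the flush completes, the code below calls
 * my_flush_done(my_cookie, rv) with the operation's result.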
5162 */ 5163 if ((dk_arg->mode & FKIOCTL) && 5164 (dkc != NULL) && 5165 (dkc->dkc_callback != NULL)) { 5166 ASSERT(dkc->dkc_cookie != NULL); 5167 (*dkc->dkc_callback)(dkc->dkc_cookie, rv); 5168 } 5169 5170 /* Indicate that one less DKIO write flush is outstanding */ 5171 mutex_enter(&vdc->lock); 5172 vdc->dkio_flush_pending--; 5173 ASSERT(vdc->dkio_flush_pending >= 0); 5174 mutex_exit(&vdc->lock); 5175 5176 /* free the mem that was allocated when the callback was dispatched */ 5177 kmem_free(arg, sizeof (vdc_dk_arg_t)); 5178 } 5179 5180 /* 5181 * Function: 5182 * vdc_dkio_gapart() 5183 * 5184 * Description: 5185 * This function implements the DKIOCGAPART ioctl. 5186 * 5187 * Arguments: 5188 * vdc - soft state pointer 5189 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 5190 * flag - ioctl flags 5191 */ 5192 static int 5193 vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag) 5194 { 5195 struct dk_geom *geom; 5196 struct vtoc *vtoc; 5197 union { 5198 struct dk_map map[NDKMAP]; 5199 struct dk_map32 map32[NDKMAP]; 5200 } data; 5201 int i, rv, size; 5202 5203 mutex_enter(&vdc->lock); 5204 5205 if ((rv = vdc_validate_geometry(vdc)) != 0) { 5206 mutex_exit(&vdc->lock); 5207 return (rv); 5208 } 5209 5210 vtoc = vdc->vtoc; 5211 geom = vdc->geom; 5212 5213 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5214 5215 for (i = 0; i < vtoc->v_nparts; i++) { 5216 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 5217 (geom->dkg_nhead * geom->dkg_nsect); 5218 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 5219 } 5220 size = NDKMAP * sizeof (struct dk_map32); 5221 5222 } else { 5223 5224 for (i = 0; i < vtoc->v_nparts; i++) { 5225 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 5226 (geom->dkg_nhead * geom->dkg_nsect); 5227 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 5228 } 5229 size = NDKMAP * sizeof (struct dk_map); 5230 5231 } 5232 5233 mutex_exit(&vdc->lock); 5234 5235 if (ddi_copyout(&data, arg, size, flag) != 0) 5236 return (EFAULT); 5237 5238 return (0); 5239 } 5240 5241 /* 5242 * Function: 5243 * vdc_dkio_partition() 5244 * 5245 * Description: 5246 * This function implements the DKIOCPARTITION ioctl. 5247 * 5248 * Arguments: 5249 * vdc - soft state pointer 5250 * arg - a pointer to a struct partition64 structure 5251 * flag - ioctl flags 5252 */ 5253 static int 5254 vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag) 5255 { 5256 struct partition64 p64; 5257 efi_gpt_t *gpt; 5258 efi_gpe_t *gpe; 5259 vd_efi_dev_t edev; 5260 uint_t partno; 5261 int rv; 5262 5263 if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) { 5264 return (EFAULT); 5265 } 5266 5267 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 5268 5269 if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) { 5270 return (rv); 5271 } 5272 5273 partno = p64.p_partno; 5274 5275 if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) { 5276 vd_efi_free(&edev, gpt, gpe); 5277 return (ESRCH); 5278 } 5279 5280 bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type, 5281 sizeof (struct uuid)); 5282 p64.p_start = gpe[partno].efi_gpe_StartingLBA; 5283 p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1; 5284 5285 if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) { 5286 vd_efi_free(&edev, gpt, gpe); 5287 return (EFAULT); 5288 } 5289 5290 vd_efi_free(&edev, gpt, gpe); 5291 return (0); 5292 } 5293 5294 /* 5295 * Function: 5296 * vdc_dioctl_rwcmd() 5297 * 5298 * Description: 5299 * This function implements the DIOCTL_RWCMD ioctl. 
This ioctl is used 5300 * for DKC_DIRECT disks to read or write at an absolute disk offset. 5301 * 5302 * Arguments: 5303 * dev - device 5304 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 5305 * flag - ioctl flags 5306 */ 5307 static int 5308 vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag) 5309 { 5310 struct dadkio_rwcmd32 rwcmd32; 5311 struct dadkio_rwcmd rwcmd; 5312 struct iovec aiov; 5313 struct uio auio; 5314 int rw, status; 5315 struct buf *buf; 5316 5317 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5318 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 5319 sizeof (struct dadkio_rwcmd32), flag)) { 5320 return (EFAULT); 5321 } 5322 rwcmd.cmd = rwcmd32.cmd; 5323 rwcmd.flags = rwcmd32.flags; 5324 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 5325 rwcmd.buflen = rwcmd32.buflen; 5326 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 5327 } else { 5328 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 5329 sizeof (struct dadkio_rwcmd), flag)) { 5330 return (EFAULT); 5331 } 5332 } 5333 5334 switch (rwcmd.cmd) { 5335 case DADKIO_RWCMD_READ: 5336 rw = B_READ; 5337 break; 5338 case DADKIO_RWCMD_WRITE: 5339 rw = B_WRITE; 5340 break; 5341 default: 5342 return (EINVAL); 5343 } 5344 5345 bzero((caddr_t)&aiov, sizeof (struct iovec)); 5346 aiov.iov_base = rwcmd.bufaddr; 5347 aiov.iov_len = rwcmd.buflen; 5348 5349 bzero((caddr_t)&auio, sizeof (struct uio)); 5350 auio.uio_iov = &aiov; 5351 auio.uio_iovcnt = 1; 5352 auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE; 5353 auio.uio_resid = rwcmd.buflen; 5354 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 5355 5356 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 5357 bioinit(buf); 5358 /* 5359 * We use the private field of buf to specify that this is an 5360 * I/O using an absolute offset. 5361 */ 5362 buf->b_private = (void *)VD_SLICE_NONE; 5363 5364 status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio); 5365 5366 biofini(buf); 5367 kmem_free(buf, sizeof (buf_t)); 5368 5369 return (status); 5370 } 5371 5372 /* 5373 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated 5374 * buffer is returned in alloc_len. 5375 */ 5376 static vd_scsi_t * 5377 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, 5378 int *alloc_len) 5379 { 5380 vd_scsi_t *vd_scsi; 5381 int vd_scsi_len = VD_SCSI_SIZE; 5382 5383 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); 5384 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); 5385 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); 5386 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); 5387 5388 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 5389 5390 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); 5391 5392 vd_scsi->cdb_len = cdb_len; 5393 vd_scsi->sense_len = sense_len; 5394 vd_scsi->datain_len = datain_len; 5395 vd_scsi->dataout_len = dataout_len; 5396 5397 *alloc_len = vd_scsi_len; 5398 5399 return (vd_scsi); 5400 } 5401 5402 /* 5403 * Convert the status of a SCSI command to a Solaris return code. 5404 * 5405 * Arguments: 5406 * vd_scsi - The SCSI operation buffer. 5407 * log_error - indicate if an error message should be logged. 5408 * 5409 * Note that our SCSI error messages are rather primitive for the moment 5410 * and could be improved by decoding some data like the SCSI command and 5411 * the sense key. 5412 * 5413 * Return value: 5414 * 0 - Status is good. 5415 * EACCES - Status reports a reservation conflict. 5416 * ENOTSUP - Status reports a check condition and sense key 5417 * reports an illegal request. 
5418 * EIO - Any other status. 5419 */ 5420 static int 5421 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error) 5422 { 5423 int rv; 5424 char path_str[MAXPATHLEN]; 5425 char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN]; 5426 union scsi_cdb *cdb; 5427 struct scsi_extended_sense *sense; 5428 5429 if (vd_scsi->cmd_status == STATUS_GOOD) 5430 /* no error */ 5431 return (0); 5432 5433 /* when the tunable vdc_scsi_log_error is true we log all errors */ 5434 if (vdc_scsi_log_error) 5435 log_error = B_TRUE; 5436 5437 if (log_error) { 5438 cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x\n", 5439 ddi_pathname(vdc->dip, path_str), vdc->instance, 5440 GETCMD(VD_SCSI_DATA_CDB(vd_scsi))); 5441 } 5442 5443 /* default returned value */ 5444 rv = EIO; 5445 5446 switch (vd_scsi->cmd_status) { 5447 5448 case STATUS_CHECK: 5449 case STATUS_TERMINATED: 5450 if (log_error) 5451 cmn_err(CE_CONT, "\tCheck Condition Error\n"); 5452 5453 /* check sense buffer */ 5454 if (vd_scsi->sense_len == 0 || 5455 vd_scsi->sense_status != STATUS_GOOD) { 5456 if (log_error) 5457 cmn_err(CE_CONT, "\tNo Sense Data Available\n"); 5458 break; 5459 } 5460 5461 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5462 5463 if (log_error) { 5464 cmn_err(CE_CONT, "\tSense Key: 0x%x\n" 5465 "\tASC: 0x%x, ASCQ: 0x%x\n", 5466 scsi_sense_key((uint8_t *)sense), 5467 scsi_sense_asc((uint8_t *)sense), 5468 scsi_sense_ascq((uint8_t *)sense)); 5469 } 5470 5471 if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST) 5472 rv = ENOTSUP; 5473 break; 5474 5475 case STATUS_BUSY: 5476 if (log_error) 5477 cmn_err(CE_NOTE, "\tDevice Busy\n"); 5478 break; 5479 5480 case STATUS_RESERVATION_CONFLICT: 5481 /* 5482 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then 5483 * the reservation conflict could be due to various reasons 5484 * like incorrect keys, or being unregistered or unreserved, 5485 * so we should not panic in that case. 5486 */ 5487 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5488 if (vdc->failfast_interval != 0 && 5489 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN && 5490 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) { 5491 /* failfast is enabled so we have to panic */ 5492 (void) snprintf(panic_str, sizeof (panic_str), 5493 VDC_RESV_CONFLICT_FMT_STR "%s", 5494 ddi_pathname(vdc->dip, path_str)); 5495 panic(panic_str); 5496 } 5497 if (log_error) 5498 cmn_err(CE_NOTE, "\tReservation Conflict\n"); 5499 rv = EACCES; 5500 break; 5501 5502 case STATUS_QFULL: 5503 if (log_error) 5504 cmn_err(CE_NOTE, "\tQueue Full\n"); 5505 break; 5506 5507 case STATUS_MET: 5508 case STATUS_INTERMEDIATE: 5509 case STATUS_SCSI2: 5510 case STATUS_INTERMEDIATE_MET: 5511 case STATUS_ACA_ACTIVE: 5512 if (log_error) 5513 cmn_err(CE_CONT, 5514 "\tUnexpected SCSI status received: 0x%x\n", 5515 vd_scsi->cmd_status); 5516 break; 5517 5518 default: 5519 if (log_error) 5520 cmn_err(CE_CONT, 5521 "\tInvalid SCSI status received: 0x%x\n", 5522 vd_scsi->cmd_status); 5523 break; 5524 } 5525 5526 return (rv); 5527 } 5528 5529 /* 5530 * Implement the USCSICMD uscsi(7I) ioctl. This ioctl is converted to 5531 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI 5532 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is 5533 * converted to a VD_OP_RESET operation.
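 *
 * Roughly, the translation performed below maps uscsi_flags onto
 * the vd_scsi request: USCSI_READ selects a data-in buffer
 * (otherwise the buffer is data-out), USCSI_RQENABLE sizes the
 * sense buffer from uscsi_rqlen, USCSI_ISOLATE and USCSI_DIAGNOSE
 * set VD_SCSI_OPT_NORETRY, and USCSI_HEAD/USCSI_HTAG/USCSI_OTAG
 * select the task attribute.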
5534 */ 5535 static int 5536 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) 5537 { 5538 struct uscsi_cmd uscsi; 5539 struct uscsi_cmd32 uscsi32; 5540 vd_scsi_t *vd_scsi; 5541 int vd_scsi_len; 5542 union scsi_cdb *cdb; 5543 struct scsi_extended_sense *sense; 5544 char *datain, *dataout; 5545 size_t cdb_len, datain_len, dataout_len, sense_len; 5546 int rv; 5547 5548 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5549 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), 5550 mode) != 0) 5551 return (EFAULT); 5552 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); 5553 } else { 5554 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), 5555 mode) != 0) 5556 return (EFAULT); 5557 } 5558 5559 /* a uscsi reset is converted to a VD_OP_RESET operation */ 5560 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | 5561 USCSI_RESET_ALL)) { 5562 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, 5563 (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); 5564 return (rv); 5565 } 5566 5567 /* cdb buffer length */ 5568 cdb_len = uscsi.uscsi_cdblen; 5569 5570 /* data in and out buffers length */ 5571 if (uscsi.uscsi_flags & USCSI_READ) { 5572 datain_len = uscsi.uscsi_buflen; 5573 dataout_len = 0; 5574 } else { 5575 datain_len = 0; 5576 dataout_len = uscsi.uscsi_buflen; 5577 } 5578 5579 /* sense buffer length */ 5580 if (uscsi.uscsi_flags & USCSI_RQENABLE) 5581 sense_len = uscsi.uscsi_rqlen; 5582 else 5583 sense_len = 0; 5584 5585 /* allocate buffer for the VD_OP_SCSICMD operation */ 5586 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5587 &vd_scsi_len); 5588 5589 /* 5590 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, 5591 * but basically they prevent a SCSI command from being retried in case 5592 * of an error.
5593 */ 5594 if ((uscsi.uscsi_flags & USCSI_ISOLATE) || 5595 (uscsi.uscsi_flags & USCSI_DIAGNOSE)) 5596 vd_scsi->options |= VD_SCSI_OPT_NORETRY; 5597 5598 /* set task attribute */ 5599 if (uscsi.uscsi_flags & USCSI_NOTAG) { 5600 vd_scsi->task_attribute = 0; 5601 } else { 5602 if (uscsi.uscsi_flags & USCSI_HEAD) 5603 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5604 else if (uscsi.uscsi_flags & USCSI_HTAG) 5605 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; 5606 else if (uscsi.uscsi_flags & USCSI_OTAG) 5607 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; 5608 else 5609 vd_scsi->task_attribute = 0; 5610 } 5611 5612 /* set timeout */ 5613 vd_scsi->timeout = uscsi.uscsi_timeout; 5614 5615 /* copy-in cdb data */ 5616 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5617 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { 5618 rv = EFAULT; 5619 goto done; 5620 } 5621 5622 /* keep a pointer to the sense buffer */ 5623 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5624 5625 /* keep a pointer to the data-in buffer */ 5626 datain = (char *)VD_SCSI_DATA_IN(vd_scsi); 5627 5628 /* copy-in request data to the data-out buffer */ 5629 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); 5630 if (!(uscsi.uscsi_flags & USCSI_READ)) { 5631 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, 5632 mode)) { 5633 rv = EFAULT; 5634 goto done; 5635 } 5636 } 5637 5638 /* submit the request */ 5639 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5640 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5641 5642 if (rv != 0) 5643 goto done; 5644 5645 /* update scsi status */ 5646 uscsi.uscsi_status = vd_scsi->cmd_status; 5647 5648 /* update sense data */ 5649 if ((uscsi.uscsi_flags & USCSI_RQENABLE) && 5650 (uscsi.uscsi_status == STATUS_CHECK || 5651 uscsi.uscsi_status == STATUS_TERMINATED)) { 5652 5653 uscsi.uscsi_rqstatus = vd_scsi->sense_status; 5654 5655 if (uscsi.uscsi_rqstatus == STATUS_GOOD) { 5656 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - 5657 vd_scsi->sense_len; 5658 if (ddi_copyout(sense, uscsi.uscsi_rqbuf, 5659 vd_scsi->sense_len, mode) != 0) { 5660 rv = EFAULT; 5661 goto done; 5662 } 5663 } 5664 } 5665 5666 /* update request data */ 5667 if (uscsi.uscsi_status == STATUS_GOOD) { 5668 if (uscsi.uscsi_flags & USCSI_READ) { 5669 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5670 vd_scsi->datain_len; 5671 if (ddi_copyout(datain, uscsi.uscsi_bufaddr, 5672 vd_scsi->datain_len, mode) != 0) { 5673 rv = EFAULT; 5674 goto done; 5675 } 5676 } else { 5677 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5678 vd_scsi->dataout_len; 5679 } 5680 } 5681 5682 /* copy-out result */ 5683 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5684 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); 5685 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), 5686 mode) != 0) { 5687 rv = EFAULT; 5688 goto done; 5689 } 5690 } else { 5691 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), 5692 mode) != 0) { 5693 rv = EFAULT; 5694 goto done; 5695 } 5696 } 5697 5698 /* get the return code from the SCSI command status */ 5699 rv = vdc_scsi_status(vdc, vd_scsi, 5700 !(uscsi.uscsi_flags & USCSI_SILENT)); 5701 5702 done: 5703 kmem_free(vd_scsi, vd_scsi_len); 5704 return (rv); 5705 } 5706 5707 /* 5708 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. 5709 * 5710 * Arguments: 5711 * cmd - SCSI PERSISTENT IN command 5712 * len - length of the SCSI input buffer 5713 * vd_scsi_len - return the length of the allocated buffer 5714 * 5715 * Returned Value: 5716 * a pointer to the allocated VD_OP_SCSICMD buffer. 
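 *
 * A worked sizing example (a sketch, assuming the usual 10-byte
 * CDB_GROUP1 CDB and 20-byte struct scsi_extended_sense):
 * vdc_scsi_alloc() returns VD_SCSI_SIZE + P2ROUNDUP(10, 8) +
 * P2ROUNDUP(20, 8) + P2ROUNDUP(len, 8), i.e. VD_SCSI_SIZE + 16 +
 * 24 + the rounded-up data-in length, so each area starts on a
 * 64-bit boundary.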
5717 */ 5718 static vd_scsi_t * 5719 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len) 5720 { 5721 int cdb_len, sense_len, datain_len, dataout_len; 5722 vd_scsi_t *vd_scsi; 5723 union scsi_cdb *cdb; 5724 5725 cdb_len = CDB_GROUP1; 5726 sense_len = sizeof (struct scsi_extended_sense); 5727 datain_len = len; 5728 dataout_len = 0; 5729 5730 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5731 vd_scsi_len); 5732 5733 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5734 5735 /* set cdb */ 5736 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN; 5737 cdb->cdb_opaque[1] = cmd; 5738 FORMG1COUNT(cdb, datain_len); 5739 5740 vd_scsi->timeout = vdc_scsi_timeout; 5741 5742 return (vd_scsi); 5743 } 5744 5745 /* 5746 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command. 5747 * 5748 * Arguments: 5749 * cmd - SCSI PERSISTENT OUT command 5750 * len - length of the SCSI output buffer 5751 * vd_scsi_len - return the length of the allocated buffer 5752 * 5753 * Returned Code: 5754 * a pointer to the allocated VD_OP_SCSICMD buffer. 5755 */ 5756 static vd_scsi_t * 5757 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len) 5758 { 5759 int cdb_len, sense_len, datain_len, dataout_len; 5760 vd_scsi_t *vd_scsi; 5761 union scsi_cdb *cdb; 5762 5763 cdb_len = CDB_GROUP1; 5764 sense_len = sizeof (struct scsi_extended_sense); 5765 datain_len = 0; 5766 dataout_len = len; 5767 5768 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5769 vd_scsi_len); 5770 5771 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5772 5773 /* set cdb */ 5774 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT; 5775 cdb->cdb_opaque[1] = cmd; 5776 FORMG1COUNT(cdb, dataout_len); 5777 5778 vd_scsi->timeout = vdc_scsi_timeout; 5779 5780 return (vd_scsi); 5781 } 5782 5783 /* 5784 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted 5785 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk 5786 * server with a VD_OP_SCSICMD operation. 
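 *
 * The handler below does a two-level copyin (the mhioc_inkeys_t,
 * then the mhioc_key_list_t it points to), issues a READ KEYS
 * request sized for listsize keys, and copies back the
 * generation, the listlen reported by the device and at most
 * listsize keys, much as sd(7D) does for the same ioctl.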
5787 */ 5788 static int 5789 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5790 { 5791 vd_scsi_t *vd_scsi; 5792 mhioc_inkeys_t inkeys; 5793 mhioc_key_list_t klist; 5794 struct mhioc_inkeys32 inkeys32; 5795 struct mhioc_key_list32 klist32; 5796 sd_prin_readkeys_t *scsi_keys; 5797 void *user_keys; 5798 int vd_scsi_len; 5799 int listsize, listlen, rv; 5800 5801 /* copyin arguments */ 5802 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5803 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5804 if (rv != 0) 5805 return (EFAULT); 5806 5807 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5808 sizeof (klist32), mode); 5809 if (rv != 0) 5810 return (EFAULT); 5811 5812 listsize = klist32.listsize; 5813 } else { 5814 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5815 if (rv != 0) 5816 return (EFAULT); 5817 5818 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5819 if (rv != 0) 5820 return (EFAULT); 5821 5822 listsize = klist.listsize; 5823 } 5824 5825 /* build SCSI VD_OP request */ 5826 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5827 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5828 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5829 5830 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5831 5832 /* submit the request */ 5833 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5834 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5835 5836 if (rv != 0) 5837 goto done; 5838 5839 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5840 5841 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5842 inkeys32.generation = scsi_keys->generation; 5843 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5844 if (rv != 0) { 5845 rv = EFAULT; 5846 goto done; 5847 } 5848 5849 klist32.listlen = listlen; 5850 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5851 sizeof (klist32), mode); 5852 if (rv != 0) { 5853 rv = EFAULT; 5854 goto done; 5855 } 5856 5857 user_keys = (caddr_t)(uintptr_t)klist32.list; 5858 } else { 5859 inkeys.generation = scsi_keys->generation; 5860 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5861 if (rv != 0) { 5862 rv = EFAULT; 5863 goto done; 5864 } 5865 5866 klist.listlen = listlen; 5867 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5868 if (rv != 0) { 5869 rv = EFAULT; 5870 goto done; 5871 } 5872 5873 user_keys = klist.list; 5874 } 5875 5876 /* copy out keys */ 5877 if (listlen > 0 && listsize > 0) { 5878 if (listsize < listlen) 5879 listlen = listsize; 5880 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5881 listlen * MHIOC_RESV_KEY_SIZE, mode); 5882 if (rv != 0) 5883 rv = EFAULT; 5884 } 5885 5886 if (rv == 0) 5887 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5888 5889 done: 5890 kmem_free(vd_scsi, vd_scsi_len); 5891 5892 return (rv); 5893 } 5894 5895 /* 5896 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5897 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5898 * the vdisk server with a VD_OP_SCSICMD operation. 
5899 */ 5900 static int 5901 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5902 { 5903 vd_scsi_t *vd_scsi; 5904 mhioc_inresvs_t inresv; 5905 mhioc_resv_desc_list_t rlist; 5906 struct mhioc_inresvs32 inresv32; 5907 struct mhioc_resv_desc_list32 rlist32; 5908 mhioc_resv_desc_t mhd_resv; 5909 sd_prin_readresv_t *scsi_resv; 5910 sd_readresv_desc_t *resv; 5911 mhioc_resv_desc_t *user_resv; 5912 int vd_scsi_len; 5913 int listsize, listlen, i, rv; 5914 5915 /* copyin arguments */ 5916 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5917 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5918 if (rv != 0) 5919 return (EFAULT); 5920 5921 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5922 sizeof (rlist32), mode); 5923 if (rv != 0) 5924 return (EFAULT); 5925 5926 listsize = rlist32.listsize; 5927 } else { 5928 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5929 if (rv != 0) 5930 return (EFAULT); 5931 5932 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5933 if (rv != 0) 5934 return (EFAULT); 5935 5936 listsize = rlist.listsize; 5937 } 5938 5939 /* build SCSI VD_OP request */ 5940 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5941 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5942 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5943 5944 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5945 5946 /* submit the request */ 5947 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5948 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5949 5950 if (rv != 0) 5951 goto done; 5952 5953 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5954 5955 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5956 inresv32.generation = scsi_resv->generation; 5957 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5958 if (rv != 0) { 5959 rv = EFAULT; 5960 goto done; 5961 } 5962 5963 rlist32.listlen = listlen; 5964 rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, 5965 sizeof (rlist32), mode); 5966 if (rv != 0) { 5967 rv = EFAULT; 5968 goto done; 5969 } 5970 5971 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 5972 } else { 5973 inresv.generation = scsi_resv->generation; 5974 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 5975 if (rv != 0) { 5976 rv = EFAULT; 5977 goto done; 5978 } 5979 5980 rlist.listlen = listlen; 5981 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 5982 if (rv != 0) { 5983 rv = EFAULT; 5984 goto done; 5985 } 5986 5987 user_resv = rlist.list; 5988 } 5989 5990 /* copy out reservations */ 5991 if (listsize > 0 && listlen > 0) { 5992 if (listsize < listlen) 5993 listlen = listsize; 5994 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 5995 5996 for (i = 0; i < listlen; i++) { 5997 mhd_resv.type = resv->type; 5998 mhd_resv.scope = resv->scope; 5999 mhd_resv.scope_specific_addr = 6000 BE_32(resv->scope_specific_addr); 6001 bcopy(&resv->resvkey, &mhd_resv.key, 6002 MHIOC_RESV_KEY_SIZE); 6003 6004 rv = ddi_copyout(&mhd_resv, user_resv, 6005 sizeof (mhd_resv), mode); 6006 if (rv != 0) { 6007 rv = EFAULT; 6008 goto done; 6009 } 6010 resv++; 6011 user_resv++; 6012 } 6013 } 6014 6015 if (rv == 0) 6016 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6017 6018 done: 6019 kmem_free(vd_scsi, vd_scsi_len); 6020 return (rv); 6021 } 6022 6023 /* 6024 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 6025 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 6026 * server with a VD_OP_SCSICMD operation. 
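 *
 * The mhioc_register_t fields map onto the PERSISTENT RESERVE OUT
 * parameter data built below as follows: oldkey becomes res_key
 * (the reservation key), newkey becomes service_key (the service
 * action reservation key) and aptpl becomes the
 * persist-through-power-loss bit.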
6027 */ 6028 static int 6029 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 6030 { 6031 vd_scsi_t *vd_scsi; 6032 sd_prout_t *scsi_prout; 6033 mhioc_register_t mhd_reg; 6034 int vd_scsi_len, rv; 6035 6036 /* copyin arguments */ 6037 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 6038 if (rv != 0) 6039 return (EFAULT); 6040 6041 /* build SCSI VD_OP request */ 6042 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 6043 sizeof (sd_prout_t), &vd_scsi_len); 6044 6045 /* set parameters */ 6046 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6047 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6048 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 6049 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 6050 6051 /* submit the request */ 6052 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6053 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6054 6055 if (rv == 0) 6056 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6057 6058 kmem_free(vd_scsi, vd_scsi_len); 6059 return (rv); 6060 } 6061 6062 /* 6063 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 6064 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 6065 * server with a VD_OP_SCSICMD operation. 6066 */ 6067 static int 6068 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 6069 { 6070 union scsi_cdb *cdb; 6071 vd_scsi_t *vd_scsi; 6072 sd_prout_t *scsi_prout; 6073 mhioc_resv_desc_t mhd_resv; 6074 int vd_scsi_len, rv; 6075 6076 /* copyin arguments */ 6077 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 6078 if (rv != 0) 6079 return (EFAULT); 6080 6081 /* build SCSI VD_OP request */ 6082 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 6083 sizeof (sd_prout_t), &vd_scsi_len); 6084 6085 /* set parameters */ 6086 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6087 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6088 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6089 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 6090 cdb->cdb_opaque[2] = mhd_resv.type; 6091 6092 /* submit the request */ 6093 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6094 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6095 6096 if (rv == 0) 6097 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6098 6099 kmem_free(vd_scsi, vd_scsi_len); 6100 return (rv); 6101 } 6102 6103 /* 6104 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 6105 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 6106 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
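 *
 * In the request built below, the preempting reservation's key
 * (resvdesc.key) goes in res_key, the victim's key in
 * service_key, the reservation type in CDB byte 2, and the
 * command is tagged with VD_SCSI_TASK_ACA.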
6107 */ 6108 static int 6109 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode) 6110 { 6111 union scsi_cdb *cdb; 6112 vd_scsi_t *vd_scsi; 6113 sd_prout_t *scsi_prout; 6114 mhioc_preemptandabort_t mhd_preempt; 6115 int vd_scsi_len, rv; 6116 6117 /* copyin arguments */ 6118 rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode); 6119 if (rv != 0) 6120 return (EFAULT); 6121 6122 /* build SCSI VD_OP request */ 6123 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT, 6124 sizeof (sd_prout_t), &vd_scsi_len); 6125 6126 /* set parameters */ 6127 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 6128 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6129 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6130 bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key, 6131 MHIOC_RESV_KEY_SIZE); 6132 bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key, 6133 MHIOC_RESV_KEY_SIZE); 6134 scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr; 6135 cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type; 6136 6137 /* submit the request */ 6138 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6139 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6140 6141 if (rv == 0) 6142 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6143 6144 kmem_free(vd_scsi, vd_scsi_len); 6145 return (rv); 6146 } 6147 6148 /* 6149 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl 6150 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY 6151 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation. 6152 */ 6153 static int 6154 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode) 6155 { 6156 vd_scsi_t *vd_scsi; 6157 sd_prout_t *scsi_prout; 6158 mhioc_registerandignorekey_t mhd_regi; 6159 int vd_scsi_len, rv; 6160 6161 /* copyin arguments */ 6162 rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode); 6163 if (rv != 0) 6164 return (EFAULT); 6165 6166 /* build SCSI VD_OP request */ 6167 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY, 6168 sizeof (sd_prout_t), &vd_scsi_len); 6169 6170 /* set parameters */ 6171 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6172 bcopy(mhd_regi.newkey.key, scsi_prout->service_key, 6173 MHIOC_RESV_KEY_SIZE); 6174 scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl; 6175 6176 /* submit the request */ 6177 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6178 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6179 6180 if (rv == 0) 6181 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6182 6183 kmem_free(vd_scsi, vd_scsi_len); 6184 return (rv); 6185 } 6186 6187 /* 6188 * This function is used by the failfast mechanism to send a SCSI command 6189 * to check for reservation conflict. 6190 */ 6191 static int 6192 vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) 6193 { 6194 int cdb_len, sense_len, vd_scsi_len; 6195 vd_scsi_t *vd_scsi; 6196 union scsi_cdb *cdb; 6197 int rv; 6198 6199 ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1); 6200 6201 if (scmd == SCMD_WRITE_G1) 6202 cdb_len = CDB_GROUP1; 6203 else 6204 cdb_len = CDB_GROUP0; 6205 6206 sense_len = sizeof (struct scsi_extended_sense); 6207 6208 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len); 6209 6210 /* set cdb */ 6211 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6212 cdb->scc_cmd = scmd; 6213 6214 vd_scsi->timeout = vdc_scsi_timeout; 6215 6216 /* 6217 * Submit the request. 
The last argument has to be B_FALSE so that 6218 * vdc_do_sync_op does not loop checking for a reservation conflict if 6219 * the operation returns an error. 6220 */ 6221 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6222 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE); 6223 6224 if (rv == 0) 6225 (void) vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6226 6227 kmem_free(vd_scsi, vd_scsi_len); 6228 return (rv); 6229 } 6230 6231 /* 6232 * This function is used by the failfast mechanism to check for a reservation 6233 * conflict. It sends SCSI commands which will fail with a reservation 6234 * conflict error if the system does not have access to the disk, in which 6235 * case the system will panic. 6236 * 6237 * Return Code: 6238 * 0 - disk is accessible without reservation conflict error 6239 * != 0 - unable to check if disk is accessible 6240 */ 6241 int 6242 vdc_failfast_check_resv(vdc_t *vdc) 6243 { 6244 int failure = 0; 6245 6246 /* 6247 * Send a TEST UNIT READY command. The command will panic 6248 * the system if it fails with a reservation conflict. 6249 */ 6250 if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0) 6251 failure++; 6252 6253 /* 6254 * With SPC-3 compliant devices TEST UNIT READY will succeed on 6255 * a reserved device, so we also do a zero-length WRITE(10) in 6256 * order to provoke a Reservation Conflict status on those newer 6257 * devices. 6258 */ 6259 if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0) 6260 failure++; 6261 6262 return (failure); 6263 } 6264 6265 /* 6266 * Add a pending I/O to the failfast I/O queue. An I/O is added to this 6267 * queue when it has failed and failfast is enabled. We then have to check 6268 * whether it failed because of a reservation conflict, in which case we 6269 * have to panic the system. 6270 * 6271 * Async I/O should be queued with their block I/O data transfer structure 6272 * (buf). Sync I/O should be queued with buf = NULL. 6273 */ 6274 static vdc_io_t * 6275 vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf) 6276 { 6277 vdc_io_t *vio; 6278 6279 ASSERT(MUTEX_HELD(&vdc->lock)); 6280 6281 vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP); 6282 vio->vio_next = vdc->failfast_io_queue; 6283 vio->vio_buf = buf; 6284 vio->vio_qtime = ddi_get_lbolt(); 6285 6286 vdc->failfast_io_queue = vio; 6287 6288 /* notify the failfast thread that a new I/O is queued */ 6289 cv_signal(&vdc->failfast_cv); 6290 6291 return (vio); 6292 } 6293
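#if 0
	/*
	 * Sketch (an assumption, not actual driver code): the synchronous
	 * I/O side of this protocol. The waiter queues its I/O with
	 * buf == NULL, sleeps until vdc_failfast_io_unqueue() below clears
	 * vio_qtime, and then frees the vio structure itself, as described
	 * in the comments of vdc_failfast_io_unqueue().
	 */
	vdc_io_t *vio;

	mutex_enter(&vdc->lock);
	vio = vdc_failfast_io_queue(vdc, NULL);		/* sync I/O */
	while (vio->vio_qtime != 0)
		cv_wait(&vdc->failfast_io_cv, &vdc->lock);
	mutex_exit(&vdc->lock);
	kmem_free(vio, sizeof (vdc_io_t));		/* waiter frees vio */
#endif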
6294 /* 6295 * Remove and complete I/O in the failfast I/O queue which were added 6296 * after the indicated deadline. A deadline of 0 means that all 6297 * I/O have to be unqueued and marked as completed. 6298 */ 6299 static void 6300 vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline) 6301 { 6302 vdc_io_t *vio, *vio_tmp; 6303 6304 ASSERT(MUTEX_HELD(&vdc->lock)); 6305 6306 vio_tmp = NULL; 6307 vio = vdc->failfast_io_queue; 6308 6309 if (deadline != 0) { 6310 /* 6311 * Skip any I/O queued after the deadline. The failfast 6312 * I/O queue is ordered starting with the last I/O added 6313 * to the queue. 6314 */ 6315 while (vio != NULL && vio->vio_qtime > deadline) { 6316 vio_tmp = vio; 6317 vio = vio->vio_next; 6318 } 6319 } 6320 6321 if (vio == NULL) 6322 /* nothing to unqueue */ 6323 return; 6324 6325 /* update the queue */ 6326 if (vio_tmp == NULL) 6327 vdc->failfast_io_queue = NULL; 6328 else 6329 vio_tmp->vio_next = NULL; 6330 6331 /* 6332 * Complete unqueued I/O. Async I/O have a block I/O data transfer 6333 * structure (buf) and they are completed by calling biodone(). Sync 6334 * I/O do not have a buf and they are completed by setting the 6335 * vio_qtime to zero and signaling failfast_io_cv. In that case, the 6336 * thread waiting for the I/O to complete is responsible for freeing 6337 * the vio structure. 6338 */ 6339 while (vio != NULL) { 6340 vio_tmp = vio->vio_next; 6341 if (vio->vio_buf != NULL) { 6342 VD_KSTAT_RUNQ_EXIT(vdc); 6343 DTRACE_IO1(done, buf_t *, vio->vio_buf); 6344 biodone(vio->vio_buf); 6345 kmem_free(vio, sizeof (vdc_io_t)); 6346 } else { 6347 vio->vio_qtime = 0; 6348 } 6349 vio = vio_tmp; 6350 } 6351 6352 cv_broadcast(&vdc->failfast_io_cv); 6353 } 6354 6355 /* 6356 * Failfast Thread. 6357 * 6358 * While failfast is enabled, the failfast thread sends TEST UNIT READY 6359 * and zero-length WRITE(10) SCSI commands on a regular basis to check that 6360 * we still have access to the disk. If a command fails with a RESERVATION 6361 * CONFLICT error then the system will immediately panic. 6362 * 6363 * The failfast thread is also woken up when an I/O has failed. It then checks 6364 * the access to the disk to ensure that the I/O failure was not due to a 6365 * reservation conflict. 6366 * 6367 * There is one failfast thread for each virtual disk for which failfast is 6368 * enabled. We could have only one thread sending requests for all disks but 6369 * this would need vdc to send asynchronous requests and to have callbacks to 6370 * process replies. 6371 */ 6372 static void 6373 vdc_failfast_thread(void *arg) 6374 { 6375 int status; 6376 vdc_t *vdc = (vdc_t *)arg; 6377 clock_t timeout, starttime; 6378 6379 mutex_enter(&vdc->lock); 6380 6381 while (vdc->failfast_interval != 0) { 6382 6383 starttime = ddi_get_lbolt(); 6384 6385 mutex_exit(&vdc->lock); 6386 6387 /* check for reservation conflict */ 6388 status = vdc_failfast_check_resv(vdc); 6389 6390 mutex_enter(&vdc->lock); 6391 /* 6392 * We have dropped the lock to send the SCSI command so we have 6393 * to check that failfast is still enabled. 6394 */ 6395 if (vdc->failfast_interval == 0) 6396 break; 6397 6398 /* 6399 * If we have successfully checked the disk access and there was 6400 * no reservation conflict then we can complete any I/O queued 6401 * before the last check. 6402 */ 6403 if (status == 0) 6404 vdc_failfast_io_unqueue(vdc, starttime); 6405 6406 /* proceed again if some I/O are still in the queue */ 6407 if (vdc->failfast_io_queue != NULL) 6408 continue; 6409 6410 timeout = ddi_get_lbolt() + 6411 drv_usectohz(vdc->failfast_interval); 6412 (void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout); 6413 } 6414 6415 /* 6416 * Failfast is being stopped so we can complete any queued I/O. 6417 */ 6418 vdc_failfast_io_unqueue(vdc, 0); 6419 vdc->failfast_thread = NULL; 6420 mutex_exit(&vdc->lock); 6421 thread_exit(); 6422 } 6423
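#if 0
/*
 * Usage sketch (userland, not part of the driver): enabling and disabling
 * failfast through the MHIOCENFAILFAST ioctl implemented below. The
 * argument is a timeout in milliseconds (vdc_failfast() converts it to
 * microseconds); a value of 0 stops the failfast thread. The device path
 * is an assumption.
 */
	unsigned int mh_time = 1000;	/* check disk access every second */
	int fd = open("/dev/rdsk/c0d0s2", O_RDWR);

	(void) ioctl(fd, MHIOCENFAILFAST, &mh_time);	/* enable */

	mh_time = 0;
	(void) ioctl(fd, MHIOCENFAILFAST, &mh_time);	/* disable */
#endif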
6424 /* 6425 * Implement the MHIOCENFAILFAST mhd(7i) ioctl. 6426 */ 6427 static int 6428 vdc_failfast(vdc_t *vdc, caddr_t arg, int mode) 6429 { 6430 unsigned int mh_time; 6431 6432 if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode)) 6433 return (EFAULT); 6434 6435 mutex_enter(&vdc->lock); 6436 if (mh_time != 0 && vdc->failfast_thread == NULL) { 6437 vdc->failfast_thread = thread_create(NULL, 0, 6438 vdc_failfast_thread, vdc, 0, &p0, TS_RUN, 6439 v.v_maxsyspri - 2); 6440 } 6441 6442 vdc->failfast_interval = mh_time * 1000; 6443 cv_signal(&vdc->failfast_cv); 6444 mutex_exit(&vdc->lock); 6445 6446 return (0); 6447 } 6448 6449 /* 6450 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are 6451 * converted to VD_OP_SET_ACCESS operations. 6452 */ 6453 static int 6454 vdc_access_set(vdc_t *vdc, uint64_t flags, int mode) 6455 { 6456 int rv; 6457 6458 /* submit ownership command request */ 6459 rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags, 6460 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6461 VIO_both_dir, B_TRUE); 6462 6463 return (rv); 6464 } 6465 6466 /* 6467 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a 6468 * VD_OP_GET_ACCESS operation. 6469 */ 6470 static int 6471 vdc_access_get(vdc_t *vdc, uint64_t *status, int mode) 6472 { 6473 int rv; 6474 6475 /* submit ownership command request */ 6476 rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status, 6477 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6478 VIO_both_dir, B_TRUE); 6479 6480 return (rv); 6481 } 6482 6483 /* 6484 * Disk Ownership Thread. 6485 * 6486 * When we have taken the ownership of a disk, this thread waits to be 6487 * notified when the LDC channel is reset so that it can recover the 6488 * ownership. 6489 * 6490 * Note that the thread handling the LDC reset (vdc_process_msg_thread()) 6491 * cannot be used to do the ownership recovery because it has to be 6492 * running to handle the reply message to the ownership operation. 6493 */ 6494 static void 6495 vdc_ownership_thread(void *arg) 6496 { 6497 vdc_t *vdc = (vdc_t *)arg; 6498 clock_t timeout; 6499 uint64_t status; 6500 6501 mutex_enter(&vdc->ownership_lock); 6502 mutex_enter(&vdc->lock); 6503 6504 while (vdc->ownership & VDC_OWNERSHIP_WANTED) { 6505 6506 if ((vdc->ownership & VDC_OWNERSHIP_RESET) || 6507 !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) { 6508 /* 6509 * There was a reset so the ownership has been lost, 6510 * try to recover. We do this without using the preempt 6511 * option so that we don't steal the ownership from 6512 * someone who has preempted us. 6513 */ 6514 DMSG(vdc, 0, "[%d] Ownership lost, recovering", 6515 vdc->instance); 6516 6517 vdc->ownership &= ~(VDC_OWNERSHIP_RESET | 6518 VDC_OWNERSHIP_GRANTED); 6519 6520 mutex_exit(&vdc->lock); 6521 6522 status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6523 VD_ACCESS_SET_PRESERVE, FKIOCTL); 6524 6525 mutex_enter(&vdc->lock); 6526 6527 if (status == 0) { 6528 DMSG(vdc, 0, "[%d] Ownership recovered", 6529 vdc->instance); 6530 vdc->ownership |= VDC_OWNERSHIP_GRANTED; 6531 } else { 6532 DMSG(vdc, 0, "[%d] Failed to recover ownership", 6533 vdc->instance); 6534 } 6535 6536 } 6537 6538 /* 6539 * If we have the ownership then we just wait for an event 6540 * to happen (LDC reset), otherwise we retry the recovery 6541 * after a delay.
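 * (vdc_ownership_delay is expressed in microseconds, as its use with
 * drv_usectohz() below implies; with a value of 1000000, for example, the
 * recovery is retried roughly once per second.)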
6542 */ 6543 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) 6544 timeout = 0; 6545 else 6546 timeout = ddi_get_lbolt() + 6547 drv_usectohz(vdc_ownership_delay); 6548 6549 /* Release the ownership_lock and wait on the vdc lock */ 6550 mutex_exit(&vdc->ownership_lock); 6551 6552 if (timeout == 0) 6553 (void) cv_wait(&vdc->ownership_cv, &vdc->lock); 6554 else 6555 (void) cv_timedwait(&vdc->ownership_cv, 6556 &vdc->lock, timeout); 6557 6558 mutex_exit(&vdc->lock); 6559 6560 mutex_enter(&vdc->ownership_lock); 6561 mutex_enter(&vdc->lock); 6562 } 6563 6564 vdc->ownership_thread = NULL; 6565 mutex_exit(&vdc->lock); 6566 mutex_exit(&vdc->ownership_lock); 6567 6568 thread_exit(); 6569 } 6570 6571 static void 6572 vdc_ownership_update(vdc_t *vdc, int ownership_flags) 6573 { 6574 ASSERT(MUTEX_HELD(&vdc->ownership_lock)); 6575 6576 mutex_enter(&vdc->lock); 6577 vdc->ownership = ownership_flags; 6578 if ((vdc->ownership & VDC_OWNERSHIP_WANTED) && 6579 vdc->ownership_thread == NULL) { 6580 /* start ownership thread */ 6581 vdc->ownership_thread = thread_create(NULL, 0, 6582 vdc_ownership_thread, vdc, 0, &p0, TS_RUN, 6583 v.v_maxsyspri - 2); 6584 } else { 6585 /* notify the ownership thread */ 6586 cv_signal(&vdc->ownership_cv); 6587 } 6588 mutex_exit(&vdc->lock); 6589 } 6590 6591 /* 6592 * Get the size and the block size of a virtual disk from the vdisk server. 6593 * We need to use this operation when the vdisk_size attribute was not 6594 * available during the handshake with the vdisk server. 6595 */ 6596 static int 6597 vdc_check_capacity(vdc_t *vdc) 6598 { 6599 int rv = 0; 6600 size_t alloc_len; 6601 vd_capacity_t *vd_cap; 6602 6603 if (vdc->vdisk_size != 0) 6604 return (0); 6605 6606 alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t)); 6607 6608 vd_cap = kmem_zalloc(alloc_len, KM_SLEEP); 6609 6610 rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len, 6611 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE); 6612 6613 if (rv == 0) { 6614 if (vd_cap->vdisk_block_size != vdc->block_size || 6615 vd_cap->vdisk_size == VD_SIZE_UNKNOWN || 6616 vd_cap->vdisk_size == 0) 6617 rv = EINVAL; 6618 else 6619 vdc->vdisk_size = vd_cap->vdisk_size; 6620 } 6621 6622 kmem_free(vd_cap, alloc_len); 6623 return (rv); 6624 } 6625 6626 /* 6627 * This structure is used in the DKIO(7I) array below. 
6628 */ 6629 typedef struct vdc_dk_ioctl { 6630 uint8_t op; /* VD_OP_XXX value */ 6631 int cmd; /* Solaris ioctl operation number */ 6632 size_t nbytes; /* size of structure to be copied */ 6633 6634 /* function to convert between vDisk and Solaris structure formats */ 6635 int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg, 6636 int mode, int dir); 6637 } vdc_dk_ioctl_t; 6638 6639 /* 6640 * Subset of DKIO(7I) operations currently supported 6641 */ 6642 static vdc_dk_ioctl_t dk_ioctl[] = { 6643 {VD_OP_FLUSH, DKIOCFLUSHWRITECACHE, 0, 6644 vdc_null_copy_func}, 6645 {VD_OP_GET_WCE, DKIOCGETWCE, sizeof (int), 6646 vdc_get_wce_convert}, 6647 {VD_OP_SET_WCE, DKIOCSETWCE, sizeof (int), 6648 vdc_set_wce_convert}, 6649 {VD_OP_GET_VTOC, DKIOCGVTOC, sizeof (vd_vtoc_t), 6650 vdc_get_vtoc_convert}, 6651 {VD_OP_SET_VTOC, DKIOCSVTOC, sizeof (vd_vtoc_t), 6652 vdc_set_vtoc_convert}, 6653 {VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t), 6654 vdc_get_geom_convert}, 6655 {VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, sizeof (vd_geom_t), 6656 vdc_get_geom_convert}, 6657 {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t), 6658 vdc_get_geom_convert}, 6659 {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t), 6660 vdc_set_geom_convert}, 6661 {VD_OP_GET_EFI, DKIOCGETEFI, 0, 6662 vdc_get_efi_convert}, 6663 {VD_OP_SET_EFI, DKIOCSETEFI, 0, 6664 vdc_set_efi_convert}, 6665 6666 /* DIOCTL_RWCMD is converted to a read or a write */ 6667 {0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL}, 6668 6669 /* mhd(7I) non-shared multihost disk ioctls */ 6670 {0, MHIOCTKOWN, 0, vdc_null_copy_func}, 6671 {0, MHIOCRELEASE, 0, vdc_null_copy_func}, 6672 {0, MHIOCSTATUS, 0, vdc_null_copy_func}, 6673 {0, MHIOCQRESERVE, 0, vdc_null_copy_func}, 6674 6675 /* mhd(7I) shared multihost disk ioctls */ 6676 {0, MHIOCGRP_INKEYS, 0, vdc_null_copy_func}, 6677 {0, MHIOCGRP_INRESV, 0, vdc_null_copy_func}, 6678 {0, MHIOCGRP_REGISTER, 0, vdc_null_copy_func}, 6679 {0, MHIOCGRP_RESERVE, 0, vdc_null_copy_func}, 6680 {0, MHIOCGRP_PREEMPTANDABORT, 0, vdc_null_copy_func}, 6681 {0, MHIOCGRP_REGISTERANDIGNOREKEY, 0, vdc_null_copy_func}, 6682 6683 /* mhd(7I) failfast ioctl */ 6684 {0, MHIOCENFAILFAST, 0, vdc_null_copy_func}, 6685 6686 /* 6687 * These particular ioctls are not sent to the server - vdc fakes up 6688 * the necessary info. 6689 */ 6690 {0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func}, 6691 {0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func}, 6692 {0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func}, 6693 {0, DKIOCPARTITION, 0, vdc_null_copy_func }, 6694 {0, DKIOCGAPART, 0, vdc_null_copy_func }, 6695 {0, DKIOCREMOVABLE, 0, vdc_null_copy_func}, 6696 {0, CDROMREADOFFSET, 0, vdc_null_copy_func} 6697 }; 6698
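/*
 * Sketch: supporting an additional ioctl is normally just a matter of
 * adding an entry to the dk_ioctl[] table above, for example
 *
 *	{VD_OP_EXAMPLE, DKIOCEXAMPLE, sizeof (struct dk_example),
 *	    vdc_example_convert},
 *
 * where VD_OP_EXAMPLE, DKIOCEXAMPLE, struct dk_example and
 * vdc_example_convert are hypothetical names used for illustration only.
 * vd_process_ioctl() below looks an entry up by its 'cmd' value, allocates
 * 'nbytes' of memory (rounded up to an 8-byte multiple for LDC) and calls
 * 'convert' with VD_COPYIN before, and VD_COPYOUT after, the VD_OP request.
 */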
6702 */ 6703 static int 6704 vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg) 6705 { 6706 vdc_t *vdc = (vdc_t *)vdisk; 6707 dev_t dev; 6708 int rval; 6709 6710 dev = makedevice(ddi_driver_major(vdc->dip), 6711 VD_MAKE_DEV(vdc->instance, 0)); 6712 6713 return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval)); 6714 } 6715 6716 /* 6717 * Function: 6718 * vd_process_ioctl() 6719 * 6720 * Description: 6721 * This routine processes disk specific ioctl calls 6722 * 6723 * Arguments: 6724 * dev - the device number 6725 * cmd - the operation [dkio(7I)] to be processed 6726 * arg - pointer to user provided structure 6727 * (contains data to be set or reference parameter for get) 6728 * mode - bit flag, indicating open settings, 32/64 bit type, etc 6729 * rvalp - pointer to return value for calling process. 6730 * 6731 * Return Code: 6732 * 0 6733 * EFAULT 6734 * ENXIO 6735 * EIO 6736 * ENOTSUP 6737 */ 6738 static int 6739 vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp) 6740 { 6741 int instance = VDCUNIT(dev); 6742 vdc_t *vdc = NULL; 6743 int rv = -1; 6744 int idx = 0; /* index into dk_ioctl[] */ 6745 size_t len = 0; /* #bytes to send to vds */ 6746 size_t alloc_len = 0; /* #bytes to allocate mem for */ 6747 caddr_t mem_p = NULL; 6748 size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0])); 6749 vdc_dk_ioctl_t *iop; 6750 6751 vdc = ddi_get_soft_state(vdc_state, instance); 6752 if (vdc == NULL) { 6753 cmn_err(CE_NOTE, "![%d] Could not get soft state structure", 6754 instance); 6755 return (ENXIO); 6756 } 6757 6758 DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n", 6759 instance, cmd, dev, ddi_model_convert_from(mode & FMODELS)); 6760 6761 if (rvalp != NULL) { 6762 /* the return value of the ioctl is 0 by default */ 6763 *rvalp = 0; 6764 } 6765 6766 /* 6767 * Validate the ioctl operation to be performed. 6768 * 6769 * If we have looped through the array without finding a match then we 6770 * don't support this ioctl. 
6771 */ 6772 for (idx = 0; idx < nioctls; idx++) { 6773 if (cmd == dk_ioctl[idx].cmd) 6774 break; 6775 } 6776 6777 if (idx >= nioctls) { 6778 DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n", 6779 vdc->instance, cmd); 6780 return (ENOTSUP); 6781 } 6782 6783 iop = &(dk_ioctl[idx]); 6784 6785 if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) { 6786 /* size is not fixed for EFI ioctls, it depends on ioctl arg */ 6787 dk_efi_t dk_efi; 6788 6789 rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode); 6790 if (rv != 0) 6791 return (EFAULT); 6792 6793 len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length; 6794 } else { 6795 len = iop->nbytes; 6796 } 6797 6798 /* check if the ioctl is applicable */ 6799 switch (cmd) { 6800 case CDROMREADOFFSET: 6801 case DKIOCREMOVABLE: 6802 return (ENOTTY); 6803 6804 case USCSICMD: 6805 case MHIOCTKOWN: 6806 case MHIOCSTATUS: 6807 case MHIOCQRESERVE: 6808 case MHIOCRELEASE: 6809 case MHIOCGRP_INKEYS: 6810 case MHIOCGRP_INRESV: 6811 case MHIOCGRP_REGISTER: 6812 case MHIOCGRP_RESERVE: 6813 case MHIOCGRP_PREEMPTANDABORT: 6814 case MHIOCGRP_REGISTERANDIGNOREKEY: 6815 case MHIOCENFAILFAST: 6816 if (vdc->cinfo == NULL) 6817 return (ENXIO); 6818 if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS) 6819 return (ENOTTY); 6820 break; 6821 6822 case DIOCTL_RWCMD: 6823 if (vdc->cinfo == NULL) 6824 return (ENXIO); 6825 if (vdc->cinfo->dki_ctype != DKC_DIRECT) 6826 return (ENOTTY); 6827 break; 6828 6829 case DKIOCINFO: 6830 if (vdc->cinfo == NULL) 6831 return (ENXIO); 6832 break; 6833 6834 case DKIOCGMEDIAINFO: 6835 if (vdc->minfo == NULL) 6836 return (ENXIO); 6837 if (vdc_check_capacity(vdc) != 0) 6838 /* disk capacity is not available */ 6839 return (EIO); 6840 break; 6841 } 6842 6843 /* 6844 * Deal with ioctls which require processing other than 6845 * converting ioctl arguments and sending a corresponding 6846 * VD operation. 6847 */ 6848 switch (cmd) { 6849 6850 case USCSICMD: 6851 { 6852 return (vdc_uscsi_cmd(vdc, arg, mode)); 6853 } 6854 6855 case MHIOCTKOWN: 6856 { 6857 mutex_enter(&vdc->ownership_lock); 6858 /* 6859 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership 6860 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset 6861 * while we are processing the ioctl. 6862 */ 6863 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED); 6864 6865 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6866 VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode); 6867 if (rv == 0) { 6868 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED | 6869 VDC_OWNERSHIP_GRANTED); 6870 } else { 6871 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6872 } 6873 mutex_exit(&vdc->ownership_lock); 6874 return (rv); 6875 } 6876 6877 case MHIOCRELEASE: 6878 { 6879 mutex_enter(&vdc->ownership_lock); 6880 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode); 6881 if (rv == 0) { 6882 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6883 } 6884 mutex_exit(&vdc->ownership_lock); 6885 return (rv); 6886 } 6887 6888 case MHIOCSTATUS: 6889 { 6890 uint64_t status; 6891 6892 rv = vdc_access_get(vdc, &status, mode); 6893 if (rv == 0 && rvalp != NULL) 6894 *rvalp = (status & VD_ACCESS_ALLOWED)? 
0 : 1; 6895 return (rv); 6896 } 6897 6898 case MHIOCQRESERVE: 6899 { 6900 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode); 6901 return (rv); 6902 } 6903 6904 case MHIOCGRP_INKEYS: 6905 { 6906 return (vdc_mhd_inkeys(vdc, arg, mode)); 6907 } 6908 6909 case MHIOCGRP_INRESV: 6910 { 6911 return (vdc_mhd_inresv(vdc, arg, mode)); 6912 } 6913 6914 case MHIOCGRP_REGISTER: 6915 { 6916 return (vdc_mhd_register(vdc, arg, mode)); 6917 } 6918 6919 case MHIOCGRP_RESERVE: 6920 { 6921 return (vdc_mhd_reserve(vdc, arg, mode)); 6922 } 6923 6924 case MHIOCGRP_PREEMPTANDABORT: 6925 { 6926 return (vdc_mhd_preemptabort(vdc, arg, mode)); 6927 } 6928 6929 case MHIOCGRP_REGISTERANDIGNOREKEY: 6930 { 6931 return (vdc_mhd_registerignore(vdc, arg, mode)); 6932 } 6933 6934 case MHIOCENFAILFAST: 6935 { 6936 rv = vdc_failfast(vdc, arg, mode); 6937 return (rv); 6938 } 6939 6940 case DIOCTL_RWCMD: 6941 { 6942 return (vdc_dioctl_rwcmd(dev, arg, mode)); 6943 } 6944 6945 case DKIOCGAPART: 6946 { 6947 return (vdc_dkio_gapart(vdc, arg, mode)); 6948 } 6949 6950 case DKIOCPARTITION: 6951 { 6952 return (vdc_dkio_partition(vdc, arg, mode)); 6953 } 6954 6955 case DKIOCINFO: 6956 { 6957 struct dk_cinfo cinfo; 6958 6959 bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo)); 6960 cinfo.dki_partition = VDCPART(dev); 6961 6962 rv = ddi_copyout(&cinfo, (void *)arg, 6963 sizeof (struct dk_cinfo), mode); 6964 if (rv != 0) 6965 return (EFAULT); 6966 6967 return (0); 6968 } 6969 6970 case DKIOCGMEDIAINFO: 6971 { 6972 ASSERT(vdc->vdisk_size != 0); 6973 if (vdc->minfo->dki_capacity == 0) 6974 vdc->minfo->dki_capacity = vdc->vdisk_size; 6975 rv = ddi_copyout(vdc->minfo, (void *)arg, 6976 sizeof (struct dk_minfo), mode); 6977 if (rv != 0) 6978 return (EFAULT); 6979 6980 return (0); 6981 } 6982 6983 case DKIOCFLUSHWRITECACHE: 6984 { 6985 struct dk_callback *dkc = 6986 (struct dk_callback *)(uintptr_t)arg; 6987 vdc_dk_arg_t *dkarg = NULL; 6988 6989 DMSG(vdc, 1, "[%d] Flush W$: mode %x\n", 6990 instance, mode); 6991 6992 /* 6993 * If arg is NULL, then there is no callback function 6994 * registered and the call operates synchronously; we 6995 * break and continue with the rest of the function and 6996 * wait for vds to return (i.e. after the request to 6997 * vds returns successfully, all writes completed prior 6998 * to the ioctl will have been flushed from the disk 6999 * write cache to persistent media). 7000 * 7001 * If a callback function is registered, we dispatch 7002 * the request on a task queue and return immediately. 7003 * The callback will deal with informing the calling 7004 * thread that the flush request is completed. 7005 */ 7006 if (dkc == NULL) 7007 break; 7008 7009 /* 7010 * the asynchronous callback is only supported if 7011 * invoked from within the kernel 7012 */ 7013 if ((mode & FKIOCTL) == 0) 7014 return (ENOTSUP); 7015 7016 dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP); 7017 7018 dkarg->mode = mode; 7019 dkarg->dev = dev; 7020 bcopy(dkc, &dkarg->dkc, sizeof (*dkc)); 7021 7022 mutex_enter(&vdc->lock); 7023 vdc->dkio_flush_pending++; 7024 dkarg->vdc = vdc; 7025 mutex_exit(&vdc->lock); 7026 7027 /* put the request on a task queue */ 7028 rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb, 7029 (void *)dkarg, DDI_SLEEP); 7030 if (rv == NULL) { 7031 /* clean up if dispatch fails */ 7032 mutex_enter(&vdc->lock); 7033 vdc->dkio_flush_pending--; 7034 mutex_exit(&vdc->lock); 7035 kmem_free(dkarg, sizeof (vdc_dk_arg_t)); 7036 } 7037 7038 return (rv == NULL ? 
ENOMEM : 0); 7039 } 7040 } 7041 7042 /* catch programming error in vdc - should be a VD_OP_XXX ioctl */ 7043 ASSERT(iop->op != 0); 7044 7045 /* check if the vDisk server handles the operation for this vDisk */ 7046 if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) { 7047 DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n", 7048 vdc->instance, iop->op); 7049 return (ENOTSUP); 7050 } 7051 7052 /* LDC requires that the memory being mapped is 8-byte aligned */ 7053 alloc_len = P2ROUNDUP(len, sizeof (uint64_t)); 7054 DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n", 7055 instance, len, alloc_len); 7056 7057 if (alloc_len > 0) 7058 mem_p = kmem_zalloc(alloc_len, KM_SLEEP); 7059 7060 /* 7061 * Call the conversion function for this ioctl which, if necessary, 7062 * converts from the Solaris format to the format ARC'ed 7063 * as part of the vDisk protocol (FWARC 2006/195) 7064 */ 7065 ASSERT(iop->convert != NULL); 7066 rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN); 7067 if (rv != 0) { 7068 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7069 instance, rv, cmd); 7070 if (mem_p != NULL) 7071 kmem_free(mem_p, alloc_len); 7072 return (rv); 7073 } 7074 7075 /* 7076 * send request to vds to service the ioctl. 7077 */ 7078 rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len, 7079 VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode, 7080 VIO_both_dir, B_TRUE); 7081 7082 if (rv != 0) { 7083 /* 7084 * This is not necessarily an error. The ioctl could 7085 * be returning a value such as ENOTTY to indicate 7086 * that the ioctl is not applicable. 7087 */ 7088 DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n", 7089 instance, rv, cmd); 7090 if (mem_p != NULL) 7091 kmem_free(mem_p, alloc_len); 7092 7093 return (rv); 7094 } 7095 7096 /* 7097 * Call the conversion function (if it exists) for this ioctl 7098 * which converts from the format ARC'ed as part of the vDisk 7099 * protocol (FWARC 2006/195) back to a format understood by 7100 * the rest of Solaris. 
7101 */ 7102 rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT); 7103 if (rv != 0) { 7104 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7105 instance, rv, cmd); 7106 if (mem_p != NULL) 7107 kmem_free(mem_p, alloc_len); 7108 return (rv); 7109 } 7110 7111 if (mem_p != NULL) 7112 kmem_free(mem_p, alloc_len); 7113 7114 return (rv); 7115 } 7116 7117 /* 7118 * Function: 7119 * vdc_null_copy_func() 7120 * 7121 * Description: 7122 * This is an empty conversion function used by ioctl calls which 7123 * do not need to convert the data being passed in/out to userland */ 7124 static int 7125 vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir) 7126 { 7127 _NOTE(ARGUNUSED(vdc)) 7128 _NOTE(ARGUNUSED(from)) 7129 _NOTE(ARGUNUSED(to)) 7130 _NOTE(ARGUNUSED(mode)) 7131 _NOTE(ARGUNUSED(dir)) 7132 7133 return (0); 7134 } 7135 7136 static int 7137 vdc_get_wce_convert(vdc_t *vdc, void *from, void *to, 7138 int mode, int dir) 7139 { 7140 _NOTE(ARGUNUSED(vdc)) 7141 7142 if (dir == VD_COPYIN) 7143 return (0); /* nothing to do */ 7144 7145 if (ddi_copyout(from, to, sizeof (int), mode) != 0) 7146 return (EFAULT); 7147 7148 return (0); 7149 } 7150 7151 static int 7152 vdc_set_wce_convert(vdc_t *vdc, void *from, void *to, 7153 int mode, int dir) 7154 { 7155 _NOTE(ARGUNUSED(vdc)) 7156 7157 if (dir == VD_COPYOUT) 7158 return (0); /* nothing to do */ 7159 7160 if (ddi_copyin(from, to, sizeof (int), mode) != 0) 7161 return (EFAULT); 7162 7163 return (0); 7164 } 7165 7166 /* 7167 * Function: 7168 * vdc_get_vtoc_convert() 7169 * 7170 * Description: 7171 * This routine performs the necessary conversions from the DKIOCGVTOC 7172 * Solaris structure to the format defined in FWARC 2006/195. 7173 * 7174 * In the struct vtoc definition, the timestamp field is marked as not 7175 * supported so it is not part of vDisk protocol (FWARC 2006/195). 7176 * However SVM uses that field to check that it can write into the VTOC, 7177 * so we fake up the info of that field. 7178 * 7179 * Arguments: 7180 * vdc - the vDisk client 7181 * from - the buffer containing the data to be copied from 7182 * to - the buffer to be copied to 7183 * mode - flags passed to ioctl() call 7184 * dir - the "direction" of the copy - VD_COPYIN or VD_COPYOUT 7185 * 7186 * Return Code: 7187 * 0 - Success 7188 * ENXIO - incorrect buffer passed in. 7189 * EFAULT - ddi_copyout routine encountered an error.
7190 */ 7191 static int 7192 vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7193 { 7194 int i; 7195 void *tmp_mem = NULL; 7196 void *tmp_memp; 7197 struct vtoc vt; 7198 struct vtoc32 vt32; 7199 int copy_len = 0; 7200 int rv = 0; 7201 7202 if (dir != VD_COPYOUT) 7203 return (0); /* nothing to do */ 7204 7205 if ((from == NULL) || (to == NULL)) 7206 return (ENXIO); 7207 7208 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) 7209 copy_len = sizeof (struct vtoc32); 7210 else 7211 copy_len = sizeof (struct vtoc); 7212 7213 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7214 7215 VD_VTOC2VTOC((vd_vtoc_t *)from, &vt); 7216 7217 /* fake the VTOC timestamp field */ 7218 for (i = 0; i < V_NUMPAR; i++) { 7219 vt.timestamp[i] = vdc->vtoc->timestamp[i]; 7220 } 7221 7222 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7223 /* LINTED E_ASSIGN_NARROW_CONV */ 7224 vtoctovtoc32(vt, vt32); 7225 tmp_memp = &vt32; 7226 } else { 7227 tmp_memp = &vt; 7228 } 7229 rv = ddi_copyout(tmp_memp, to, copy_len, mode); 7230 if (rv != 0) 7231 rv = EFAULT; 7232 7233 kmem_free(tmp_mem, copy_len); 7234 return (rv); 7235 } 7236 7237 /* 7238 * Function: 7239 * vdc_set_vtoc_convert() 7240 * 7241 * Description: 7242 * This routine performs the necessary conversions from the DKIOCSVTOC 7243 * Solaris structure to the format defined in FWARC 2006/195. 7244 * 7245 * Arguments: 7246 * vdc - the vDisk client 7247 * from - Buffer with data 7248 * to - Buffer where data is to be copied to 7249 * mode - flags passed to ioctl 7250 * dir - direction of copy (in or out) 7251 * 7252 * Return Code: 7253 * 0 - Success 7254 * ENXIO - Invalid buffer passed in 7255 * EFAULT - ddi_copyin of data failed 7256 */ 7257 static int 7258 vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7259 { 7260 _NOTE(ARGUNUSED(vdc)) 7261 7262 void *tmp_mem = NULL, *uvtoc; 7263 struct vtoc vt; 7264 struct vtoc *vtp = &vt; 7265 vd_vtoc_t vtvd; 7266 int copy_len = 0; 7267 int i, rv = 0; 7268 7269 if ((from == NULL) || (to == NULL)) 7270 return (ENXIO); 7271 7272 if (dir == VD_COPYIN) 7273 uvtoc = from; 7274 else 7275 uvtoc = to; 7276 7277 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) 7278 copy_len = sizeof (struct vtoc32); 7279 else 7280 copy_len = sizeof (struct vtoc); 7281 7282 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7283 7284 rv = ddi_copyin(uvtoc, tmp_mem, copy_len, mode); 7285 if (rv != 0) { 7286 kmem_free(tmp_mem, copy_len); 7287 return (EFAULT); 7288 } 7289 7290 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7291 vtoc32tovtoc((*(struct vtoc32 *)tmp_mem), vt); 7292 } else { 7293 vtp = tmp_mem; 7294 } 7295 7296 if (dir == VD_COPYOUT) { 7297 /* 7298 * The disk label may have changed. Revalidate the disk 7299 * geometry. This will also update the device nodes and 7300 * properties. 7301 */ 7302 vdc_validate(vdc); 7303 7304 /* 7305 * We also need to keep track of the timestamp fields. 7306 */ 7307 for (i = 0; i < V_NUMPAR; i++) { 7308 vdc->vtoc->timestamp[i] = vtp->timestamp[i]; 7309 } 7310 kmem_free(tmp_mem, copy_len); /* free the temporary vtoc copy */ 7311 return (0); 7312 } 7313 7314 VTOC2VD_VTOC(vtp, &vtvd); 7315 bcopy(&vtvd, to, sizeof (vd_vtoc_t)); 7316 kmem_free(tmp_mem, copy_len); 7317 7318 return (0); 7319 }
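#if 0
/*
 * Sketch (an assumption, for illustration only): the general shape of a
 * conversion callback registered in dk_ioctl[]. vd_process_ioctl() invokes
 * each callback twice: once with dir == VD_COPYIN before sending the
 * request (from = ioctl argument, to = LDC buffer) and once with
 * dir == VD_COPYOUT after the reply (from = LDC buffer, to = ioctl
 * argument). vdc_example_convert is a hypothetical name.
 */
static int
vdc_example_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	_NOTE(ARGUNUSED(vdc))

	if (dir == VD_COPYIN) {
		/* translate the Solaris structure into the vDisk format */
		if (ddi_copyin(from, to, sizeof (int), mode) != 0)
			return (EFAULT);
		return (0);
	}

	/* VD_COPYOUT: translate the vDisk reply back for the caller */
	if (ddi_copyout(from, to, sizeof (int), mode) != 0)
		return (EFAULT);

	return (0);
}
#endif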
7320 7321 /* 7322 * Function: 7323 * vdc_get_geom_convert() 7324 * 7325 * Description: 7326 * This routine performs the necessary conversions from the DKIOCGGEOM, 7327 * DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format 7328 * defined in FWARC 2006/195 7329 * 7330 * Arguments: 7331 * vdc - the vDisk client 7332 * from - Buffer with data 7333 * to - Buffer where data is to be copied to 7334 * mode - flags passed to ioctl 7335 * dir - direction of copy (in or out) 7336 * 7337 * Return Code: 7338 * 0 - Success 7339 * ENXIO - Invalid buffer passed in 7340 * EFAULT - ddi_copyout of data failed 7341 */ 7342 static int 7343 vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7344 { 7345 _NOTE(ARGUNUSED(vdc)) 7346 7347 struct dk_geom geom; 7348 int copy_len = sizeof (struct dk_geom); 7349 int rv = 0; 7350 7351 if (dir != VD_COPYOUT) 7352 return (0); /* nothing to do */ 7353 7354 if ((from == NULL) || (to == NULL)) 7355 return (ENXIO); 7356 7357 VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom); 7358 rv = ddi_copyout(&geom, to, copy_len, mode); 7359 if (rv != 0) 7360 rv = EFAULT; 7361 7362 return (rv); 7363 } 7364 7365 /* 7366 * Function: 7367 * vdc_set_geom_convert() 7368 * 7369 * Description: 7370 * This routine performs the necessary conversions from the DKIOCSGEOM 7371 * Solaris structure to the format defined in FWARC 2006/195. 7372 * 7373 * Arguments: 7374 * vdc - the vDisk client 7375 * from - Buffer with data 7376 * to - Buffer where data is to be copied to 7377 * mode - flags passed to ioctl 7378 * dir - direction of copy (in or out) 7379 * 7380 * Return Code: 7381 * 0 - Success 7382 * ENXIO - Invalid buffer passed in 7383 * EFAULT - ddi_copyin of data failed 7384 */ 7385 static int 7386 vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7387 { 7388 _NOTE(ARGUNUSED(vdc)) 7389 7390 vd_geom_t vdgeom; 7391 void *tmp_mem = NULL; 7392 int copy_len = sizeof (struct dk_geom); 7393 int rv = 0; 7394 7395 if (dir != VD_COPYIN) 7396 return (0); /* nothing to do */ 7397 7398 if ((from == NULL) || (to == NULL)) 7399 return (ENXIO); 7400 7401 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7402 7403 rv = ddi_copyin(from, tmp_mem, copy_len, mode); 7404 if (rv != 0) { 7405 kmem_free(tmp_mem, copy_len); 7406 return (EFAULT); 7407 } 7408 DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom); 7409 bcopy(&vdgeom, to, sizeof (vdgeom)); 7410 kmem_free(tmp_mem, copy_len); 7411 7412 return (0); 7413 } 7414 7415 static int 7416 vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7417 { 7418 _NOTE(ARGUNUSED(vdc)) 7419 7420 vd_efi_t *vd_efi; 7421 dk_efi_t dk_efi; 7422 int rv = 0; 7423 void *uaddr; 7424 7425 if ((from == NULL) || (to == NULL)) 7426 return (ENXIO); 7427 7428 if (dir == VD_COPYIN) { 7429 7430 vd_efi = (vd_efi_t *)to; 7431 7432 rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode); 7433 if (rv != 0) 7434 return (EFAULT); 7435 7436 vd_efi->lba = dk_efi.dki_lba; 7437 vd_efi->length = dk_efi.dki_length; 7438 bzero(vd_efi->data, vd_efi->length); 7439 7440 } else { 7441 7442 rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode); 7443 if (rv != 0) 7444 return (EFAULT); 7445 7446 uaddr = dk_efi.dki_data; 7447 7448 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 7449 7450 VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi); 7451 7452 rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length, 7453 mode); 7454 /* free the temporary buffer even if the copyout failed */ 7455 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7456 if (rv != 0) 7457 return (EFAULT); 7458 } 7459 7460 return (0); 7461 }
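#if 0
/*
 * Usage sketch (userland, not part of the driver): a DKIOCGETEFI request
 * as converted by vdc_get_efi_convert() above. The caller chooses the LBA
 * and length and supplies the buffer; the values below and the file
 * descriptor 'fd' are assumptions.
 */
	dk_efi_t efi;
	char buf[512];

	efi.dki_lba = 1;		/* the GPT header lives in LBA 1 */
	efi.dki_length = sizeof (buf);
	efi.dki_data = (efi_gpt_t *)buf;
	(void) ioctl(fd, DKIOCGETEFI, &efi);
#endif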
7462 7463 static int 7464 vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7465 { 7466 _NOTE(ARGUNUSED(vdc)) 7467 7468 dk_efi_t dk_efi; 7469 void *uaddr; 7470 7471 if (dir == VD_COPYOUT) { 7472 /* 7473 * The disk label may have changed. Revalidate the disk 7474 * geometry. This will also update the device nodes and 7475 * properties. 7476 */ 7477 vdc_validate(vdc); 7478 return (0); 7479 } 7480 7481 if ((from == NULL) || (to == NULL)) 7482 return (ENXIO); 7483 7484 if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0) 7485 return (EFAULT); 7486 7487 uaddr = dk_efi.dki_data; 7488 7489 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 7490 7491 if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) { /* free the buffer to avoid leaking it on error */ 7492 kmem_free(dk_efi.dki_data, dk_efi.dki_length); return (EFAULT); } 7493 7494 DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to); 7495 7496 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7497 7498 return (0); 7499 } 7500 7501 7502 /* -------------------------------------------------------------------------- */ 7503 7504 /* 7505 * Function: 7506 * vdc_create_fake_geometry() 7507 * 7508 * Description: 7509 * This routine fakes up the disk info needed for some DKIO ioctls such 7510 * as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do] 7511 * 7512 * Note: This function must not be called until the vDisk attributes have 7513 * been exchanged as part of the handshake with the vDisk server. 7514 * 7515 * Arguments: 7516 * vdc - soft state pointer for this instance of the device driver. 7517 * 7518 * Return Code: 7519 * none. 7520 */ 7521 static void 7522 vdc_create_fake_geometry(vdc_t *vdc) 7523 { 7524 ASSERT(vdc != NULL); 7525 ASSERT(vdc->max_xfer_sz != 0); 7526 7527 /* 7528 * DKIOCINFO support 7529 */ 7530 if (vdc->cinfo == NULL) 7531 vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); 7532 7533 (void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME); 7534 (void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME); 7535 /* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */ 7536 vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz; 7537 7538 /* 7539 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD 7540 * operation is supported, otherwise the controller type is DKC_DIRECT. 7541 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the 7542 * controller type is always DKC_DIRECT in that case.
7543 * 7544 * If the virtual disk is backed by a physical CD/DVD device or 7545 * an ISO image, modify the controller type to indicate this 7546 */ 7547 switch (vdc->vdisk_media) { 7548 case VD_MEDIA_CD: 7549 case VD_MEDIA_DVD: 7550 vdc->cinfo->dki_ctype = DKC_CDROM; 7551 break; 7552 case VD_MEDIA_FIXED: 7553 if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD)) 7554 vdc->cinfo->dki_ctype = DKC_SCSI_CCS; 7555 else 7556 vdc->cinfo->dki_ctype = DKC_DIRECT; 7557 break; 7558 default: 7559 /* in the case of v1.0 we default to a fixed disk */ 7560 vdc->cinfo->dki_ctype = DKC_DIRECT; 7561 break; 7562 } 7563 vdc->cinfo->dki_flags = DKI_FMTVOL; 7564 vdc->cinfo->dki_cnum = 0; 7565 vdc->cinfo->dki_addr = 0; 7566 vdc->cinfo->dki_space = 0; 7567 vdc->cinfo->dki_prio = 0; 7568 vdc->cinfo->dki_vec = 0; 7569 vdc->cinfo->dki_unit = vdc->instance; 7570 vdc->cinfo->dki_slave = 0; 7571 /* 7572 * The partition number will be created on the fly depending on the 7573 * actual slice (i.e. minor node) that is used to request the data. 7574 */ 7575 vdc->cinfo->dki_partition = 0; 7576 7577 /* 7578 * DKIOCGMEDIAINFO support 7579 */ 7580 if (vdc->minfo == NULL) 7581 vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); 7582 7583 if (vio_ver_is_supported(vdc->ver, 1, 1)) { 7584 vdc->minfo->dki_media_type = 7585 VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media); 7586 } else { 7587 vdc->minfo->dki_media_type = DK_FIXED_DISK; 7588 } 7589 7590 vdc->minfo->dki_capacity = vdc->vdisk_size; 7591 vdc->minfo->dki_lbsize = vdc->block_size; 7592 } 7593 7594 static ushort_t 7595 vdc_lbl2cksum(struct dk_label *label) 7596 { 7597 int count; 7598 ushort_t sum, *sp; 7599 7600 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 7601 sp = (ushort_t *)label; 7602 sum = 0; 7603 while (count--) { 7604 sum ^= *sp++; 7605 } 7606 7607 return (sum); 7608 } 7609 7610 /* 7611 * Function: 7612 * vdc_validate_geometry 7613 * 7614 * Description: 7615 * This routine discovers the label and geometry of the disk. It stores 7616 * the disk label and related information in the vdc structure. If it 7617 * fails to validate the geometry or to discover the disk label then 7618 * the label is marked as unknown (VD_DISK_LABEL_UNK). 7619 * 7620 * Arguments: 7621 * vdc - soft state pointer for this instance of the device driver. 7622 * 7623 * Return Code: 7624 * 0 - success. 7625 * EINVAL - unknown disk label. 7626 * ENOTSUP - geometry not applicable (EFI label). 7627 * EIO - error accessing the disk. 7628 */ 7629 static int 7630 vdc_validate_geometry(vdc_t *vdc) 7631 { 7632 buf_t *buf; /* BREAD requests need to be in a buf_t structure */ 7633 dev_t dev; 7634 int rv, rval; 7635 struct dk_label label; 7636 struct dk_geom geom; 7637 struct vtoc vtoc; 7638 efi_gpt_t *gpt; 7639 efi_gpe_t *gpe; 7640 vd_efi_dev_t edev; 7641 7642 ASSERT(vdc != NULL); 7643 ASSERT(vdc->vtoc != NULL && vdc->geom != NULL); 7644 ASSERT(MUTEX_HELD(&vdc->lock)); 7645 7646 mutex_exit(&vdc->lock); 7647 7648 dev = makedevice(ddi_driver_major(vdc->dip), 7649 VD_MAKE_DEV(vdc->instance, 0)); 7650 7651 rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval); 7652 if (rv == 0) 7653 rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc, 7654 FKIOCTL, &rval); 7655 7656 if (rv == ENOTSUP) { 7657 /* 7658 * If the device does not support VTOC then we try 7659 * to read an EFI label. 7660 * 7661 * We need to know the block size and the disk size to 7662 * be able to read an EFI label. 
7663 */ 7664 if (vdc->vdisk_size == 0) { 7665 if ((rv = vdc_check_capacity(vdc)) != 0) { 7666 mutex_enter(&vdc->lock); 7667 vdc_store_label_unk(vdc); 7668 return (rv); 7669 } 7670 } 7671 7672 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 7673 7674 rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe); 7675 7676 if (rv) { 7677 DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)", 7678 vdc->instance, rv); 7679 mutex_enter(&vdc->lock); 7680 vdc_store_label_unk(vdc); 7681 return (EIO); 7682 } 7683 7684 mutex_enter(&vdc->lock); 7685 vdc_store_label_efi(vdc, gpt, gpe); 7686 vd_efi_free(&edev, gpt, gpe); 7687 return (ENOTSUP); 7688 } 7689 7690 if (rv != 0) { 7691 DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)", 7692 vdc->instance, rv); 7693 mutex_enter(&vdc->lock); 7694 vdc_store_label_unk(vdc); 7695 if (rv != EINVAL) 7696 rv = EIO; 7697 return (rv); 7698 } 7699 7700 /* check that geometry and vtoc are valid */ 7701 if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 || 7702 vtoc.v_sanity != VTOC_SANE) { 7703 mutex_enter(&vdc->lock); 7704 vdc_store_label_unk(vdc); 7705 return (EINVAL); 7706 } 7707 7708 /* 7709 * We have a disk and a valid VTOC. However this does not mean 7710 * that the disk currently has a VTOC label. The returned VTOC may 7711 * be a default VTOC to be used for configuring the disk (this is 7712 * what is done for disk images). So we read the label from the 7713 * beginning of the disk to ensure we really have a VTOC label. 7714 * 7715 * FUTURE: This could be the default way for reading the VTOC 7716 * from the disk as opposed to sending the VD_OP_GET_VTOC 7717 * to the server. This will be the default if vdc is implemented 7718 * on top of cmlb. 7719 */ 7720 7721 /* 7722 * A single-slice disk does not support reads using an absolute disk 7723 * offset, so we just rely on the DKIOCGVTOC ioctl in that case. 7724 */ 7725 if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) { 7726 mutex_enter(&vdc->lock); 7727 if (vtoc.v_nparts != 1) { 7728 vdc_store_label_unk(vdc); 7729 return (EINVAL); 7730 } 7731 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7732 return (0); 7733 } 7734 7735 if (vtoc.v_nparts != V_NUMPAR) { 7736 mutex_enter(&vdc->lock); 7737 vdc_store_label_unk(vdc); 7738 return (EINVAL); 7739 } 7740 7741 /* 7742 * Read disk label from start of disk 7743 */ 7744 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 7745 bioinit(buf); 7746 buf->b_un.b_addr = (caddr_t)&label; 7747 buf->b_bcount = DK_LABEL_SIZE; 7748 buf->b_flags = B_BUSY | B_READ; 7749 buf->b_dev = cmpdev(dev); 7750 rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label, 7751 DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir); 7752 if (rv) { 7753 DMSG(vdc, 1, "[%d] Failed to read disk block 0\n", 7754 vdc->instance); 7755 } else { 7756 rv = biowait(buf); 7757 biofini(buf); 7758 } 7759 kmem_free(buf, sizeof (buf_t)); 7760 7761 if (rv != 0 || label.dkl_magic != DKL_MAGIC || 7762 label.dkl_cksum != vdc_lbl2cksum(&label)) { 7763 DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n", 7764 vdc->instance); 7765 mutex_enter(&vdc->lock); 7766 vdc_store_label_unk(vdc); 7767 return (EINVAL); 7768 } 7769 7770 mutex_enter(&vdc->lock); 7771 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7772 return (0); 7773 } 7774 7775 /* 7776 * Function: 7777 * vdc_validate 7778 * 7779 * Description: 7780 * This routine discovers the label of the disk and creates the 7781 * appropriate device nodes if the label has changed. 7782 * 7783 * Arguments: 7784 * vdc - soft state pointer for this instance of the device driver. 7785 * 7786 * Return Code: 7787 * none.
7788 */ 7789 static void 7790 vdc_validate(vdc_t *vdc) 7791 { 7792 vd_disk_label_t old_label; 7793 vd_slice_t old_slice[V_NUMPAR]; 7794 int rv; 7795 7796 ASSERT(!MUTEX_HELD(&vdc->lock)); 7797 7798 mutex_enter(&vdc->lock); 7799 7800 /* save the current label and vtoc */ 7801 old_label = vdc->vdisk_label; 7802 bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR); 7803 7804 /* check the geometry */ 7805 (void) vdc_validate_geometry(vdc); 7806 7807 /* if the disk label has changed, update device nodes */ 7808 if (vdc->vdisk_label != old_label) { 7809 7810 if (vdc->vdisk_label == VD_DISK_LABEL_EFI) 7811 rv = vdc_create_device_nodes_efi(vdc); 7812 else 7813 rv = vdc_create_device_nodes_vtoc(vdc); 7814 7815 if (rv != 0) { 7816 DMSG(vdc, 0, "![%d] Failed to update device nodes", 7817 vdc->instance); 7818 } 7819 } 7820 7821 /* if the vtoc has changed, update device nodes properties */ 7822 if (bcmp(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR) != 0) { 7823 7824 if (vdc_create_device_nodes_props(vdc) != 0) { 7825 DMSG(vdc, 0, "![%d] Failed to update device nodes" 7826 " properties", vdc->instance); 7827 } 7828 } 7829 7830 mutex_exit(&vdc->lock); 7831 } 7832 7833 static void 7834 vdc_validate_task(void *arg) 7835 { 7836 vdc_t *vdc = (vdc_t *)arg; 7837 7838 vdc_validate(vdc); 7839 7840 mutex_enter(&vdc->lock); 7841 ASSERT(vdc->validate_pending > 0); 7842 vdc->validate_pending--; 7843 mutex_exit(&vdc->lock); 7844 } 7845 7846 /* 7847 * Function: 7848 * vdc_setup_devid() 7849 * 7850 * Description: 7851 * This routine discovers the devid of a vDisk. It requests the devid of 7852 * the underlying device from the vDisk server, builds an encapsulated 7853 * devid based on the retrieved devid and registers that new devid to 7854 * the vDisk. 7855 * 7856 * Arguments: 7857 * vdc - soft state pointer for this instance of the device driver. 7858 * 7859 * Return Code: 7860 * 0 - A devid was successfully registered for the vDisk 7861 */ 7862 static int 7863 vdc_setup_devid(vdc_t *vdc) 7864 { 7865 int rv; 7866 vd_devid_t *vd_devid; 7867 size_t bufsize, bufid_len; 7868 7869 /* 7870 * Initially, we don't know the size of the devid that the 7871 * server will return, but this size will be encoded into the 7872 * reply. So we do a first request using a default size, then we 7873 * check whether this size was large enough. If not, we do a second 7874 * request with the correct size returned by the server. Note that 7875 * ldc requires size to be 8-byte aligned. 7876 */ 7877 bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN), 7878 sizeof (uint64_t)); 7879 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7880 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7881 7882 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, 7883 bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 7884 7885 DMSG(vdc, 2, "sync_op returned %d\n", rv); 7886 7887 if (rv) { 7888 kmem_free(vd_devid, bufsize); 7889 return (rv); 7890 } 7891 7892 if (vd_devid->length > bufid_len) { 7893 /* 7894 * The returned devid is larger than the buffer used. Try again 7895 * with a buffer with the right size.
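 * (P2ROUNDUP(x, 8) rounds x up to the next multiple of 8, e.g.
 * P2ROUNDUP(13, 8) == 16, which provides the 8-byte alignment that ldc
 * requires.)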
7896 */ 7897 bufid_len = vd_devid->length; /* grab the length before freeing */ kmem_free(vd_devid, bufsize); 7898 bufsize = P2ROUNDUP(VD_DEVID_SIZE(bufid_len), 7899 sizeof (uint64_t)); 7900 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7901 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7902 7903 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, 7904 (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0, 7905 VIO_both_dir, B_TRUE); 7906 7907 if (rv) { 7908 kmem_free(vd_devid, bufsize); 7909 return (rv); 7910 } 7911 } 7912 7913 /* 7914 * The virtual disk should have the same device id as the one associated 7915 * with the physical disk it is mapped on, otherwise sharing a disk 7916 * between an LDom and a non-LDom may not work (for example for a shared 7917 * SVM disk set). 7918 * 7919 * The DDI framework does not allow creating a device id with an 7920 * arbitrary type, so we first create a device id of type DEVID_ENCAP 7921 * and then we restore the original type of the physical device. 7922 */ 7923 7924 DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length); 7925 7926 /* build an encapsulated devid based on the returned devid */ 7927 if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length, 7928 vd_devid->id, &vdc->devid) != DDI_SUCCESS) { 7929 DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance); 7930 kmem_free(vd_devid, bufsize); 7931 return (1); 7932 } 7933 7934 DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type); 7935 7936 ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS); 7937 7938 kmem_free(vd_devid, bufsize); 7939 7940 if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) { 7941 DMSG(vdc, 1, "[%d] Failed to register devid\n", vdc->instance); 7942 return (1); 7943 } 7944 7945 return (0); 7946 } 7947 7948 static void 7949 vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe) 7950 { 7951 int i, nparts; 7952 7953 ASSERT(MUTEX_HELD(&vdc->lock)); 7954 7955 vdc->vdisk_label = VD_DISK_LABEL_EFI; 7956 bzero(vdc->vtoc, sizeof (struct vtoc)); 7957 bzero(vdc->geom, sizeof (struct dk_geom)); 7958 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7959 7960 nparts = gpt->efi_gpt_NumberOfPartitionEntries; 7961 7962 for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) { 7963 7964 if (gpe[i].efi_gpe_StartingLBA == 0 || 7965 gpe[i].efi_gpe_EndingLBA == 0) { 7966 continue; 7967 } 7968 7969 vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA; 7970 vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA - 7971 gpe[i].efi_gpe_StartingLBA + 1; 7972 } 7973 7974 ASSERT(vdc->vdisk_size != 0); 7975 vdc->slice[VD_EFI_WD_SLICE].start = 0; 7976 vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size; 7977 7978 } 7979 7980 static void 7981 vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct vtoc *vtoc) 7982 { 7983 int i; 7984 7985 ASSERT(MUTEX_HELD(&vdc->lock)); 7986 ASSERT(vdc->block_size == vtoc->v_sectorsz); 7987 7988 vdc->vdisk_label = VD_DISK_LABEL_VTOC; 7989 bcopy(vtoc, vdc->vtoc, sizeof (struct vtoc)); 7990 bcopy(geom, vdc->geom, sizeof (struct dk_geom)); 7991 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 7992 7993 for (i = 0; i < vtoc->v_nparts; i++) { 7994 vdc->slice[i].start = vtoc->v_part[i].p_start; 7995 vdc->slice[i].nblocks = vtoc->v_part[i].p_size; 7996 } 7997 } 7998 7999 static void 8000 vdc_store_label_unk(vdc_t *vdc) 8001 { 8002 ASSERT(MUTEX_HELD(&vdc->lock)); 8003 8004 vdc->vdisk_label = VD_DISK_LABEL_UNK; 8005 bzero(vdc->vtoc, sizeof (struct vtoc)); 8006 bzero(vdc->geom, sizeof (struct dk_geom)); 8007 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 8008 } 8009
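#if 0
/*
 * Usage sketch (userland, not part of the driver): the common two-pass
 * pattern for the MHIOCGRP_INRESV ioctl implemented by vdc_mhd_inresv()
 * above. The first call, with listsize set to 0, only retrieves the
 * number of reservation descriptors; the second call retrieves the
 * descriptors themselves. The device path given to this function is an
 * assumption.
 */
#include <sys/types.h>
#include <sys/mhd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

static int
read_reservations(const char *path)
{
	mhioc_resv_desc_list_t rlist;
	mhioc_inresvs_t inresv;
	int fd;

	if ((fd = open(path, O_RDWR)) < 0)
		return (-1);

	/* first pass: learn how many descriptors the disk reports */
	rlist.listsize = 0;
	rlist.list = NULL;
	inresv.li = &rlist;
	if (ioctl(fd, MHIOCGRP_INRESV, &inresv) < 0 || rlist.listlen == 0) {
		(void) close(fd);
		return (-1);
	}

	/* second pass: fetch the descriptors */
	rlist.listsize = rlist.listlen;
	rlist.list = calloc(rlist.listsize, sizeof (mhioc_resv_desc_t));
	if (rlist.list == NULL ||
	    ioctl(fd, MHIOCGRP_INRESV, &inresv) < 0) {
		free(rlist.list);
		(void) close(fd);
		return (-1);
	}

	/* rlist.list[0 .. rlist.listlen - 1] now holds the reservations */
	free(rlist.list);
	(void) close(fd);
	return (0);
}
#endif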