/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * LDoms virtual disk client (vdc) device driver
 *
 * This driver runs on a guest logical domain and communicates with the virtual
 * disk server (vds) driver running on the service domain which is exporting
 * virtualized "disks" to the guest logical domain.
 *
 * The driver can be divided into four sections:
 *
 * 1) generic device driver housekeeping
 *	_init, _fini, attach, detach, ops structures, etc.
 *
 * 2) communication channel setup
 *	Setup the communications link over the LDC channel that vdc uses to
 *	talk to the vDisk server. Initialise the descriptor ring which
 *	allows the LDC clients to transfer data via memory mappings.
 *
 * 3) Support exported to upper layers (filesystems, etc)
 *	The upper layers call into vdc via strategy(9E) and DKIO(7I)
 *	ioctl calls. vdc either copies the data to be written into the
 *	descriptor ring, or maps into the descriptor ring the buffer where
 *	the vDisk server should store the data read. It then sends a
 *	message to the vDisk server requesting it to complete the operation.
 *
 * 4) Handling responses from the vDisk server.
 *	The vDisk server will ACK some or all of the messages vdc sends to it
 *	(this is configured during the handshake). Upon receipt of an ACK,
 *	vdc checks the descriptor ring and signals the upper layer
 *	code waiting on the I/O.
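 *
 * As an illustrative summary of a single I/O (not an exhaustive trace):
 * vdc_strategy() validates the request and calls vdc_do_op(), which fills
 * in a free descriptor ring entry, maps or copies the data buffer
 * (vdc_populate_descriptor()/vdc_map_to_shared_dring()), and sends a
 * dring data message to the server via vdc_send(); when the server ACKs,
 * vdc_process_data_msg() releases the descriptor and completes the
 * waiting buf.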
 */

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kstat.h>
#include <sys/mach_descrip.h>
#include <sys/modctl.h>
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
#include <sys/random.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/vtoc.h>
#include <sys/archsystm.h>
#include <sys/sysmacros.h>

#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
#include <sys/fs/dv_node.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/scsi/impl/services.h>
#include <sys/scsi/targets/sddef.h>

#include <sys/ldoms.h>
#include <sys/ldc.h>
#include <sys/vio_common.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_util.h>
#include <sys/vdsk_common.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdc.h>

#define	VD_OLDVTOC_LIMIT	0x7fffffff

/*
 * function prototypes
 */

/* standard driver functions */
static int	vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
static int	vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int	vdc_strategy(struct buf *buf);
static int	vdc_print(dev_t dev, char *str);
static int	vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
static int	vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
static int	vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
		    cred_t *credp, int *rvalp);
static int	vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
static int	vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);

static int	vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
		    void *arg, void **resultp);
static int	vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int	vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int	vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
		    int mod_flags, char *name, caddr_t valuep, int *lengthp);

/* setup */
static void	vdc_min(struct buf *bufp);
static int	vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
static int	vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_start_ldc_connection(vdc_t *vdc);
static int	vdc_create_device_nodes(vdc_t *vdc);
static int	vdc_create_device_nodes_efi(vdc_t *vdc);
static int	vdc_create_device_nodes_vtoc(vdc_t *vdc);
static void	vdc_create_io_kstats(vdc_t *vdc);
static void	vdc_create_err_kstats(vdc_t *vdc);
static void	vdc_set_err_kstats(vdc_t *vdc);
static int	vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
		    mde_cookie_t *vd_nodep);
static int	vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep);
static void	vdc_fini_ports(vdc_t *vdc);
static void	vdc_switch_server(vdc_t *vdcp);
static int	vdc_do_ldc_up(vdc_t *vdc);
static void	vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr);
static int	vdc_init_descriptor_ring(vdc_t *vdc);
static void	vdc_destroy_descriptor_ring(vdc_t *vdc);
static int	vdc_setup_devid(vdc_t *vdc);
static void	vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *);
static void	vdc_store_label_vtoc(vdc_t *, struct dk_geom *,
		    struct extvtoc *);
static void	vdc_store_label_unk(vdc_t *vdc);
static boolean_t vdc_is_opened(vdc_t *vdc);
static void	vdc_update_size(vdc_t *vdc, size_t, size_t, size_t);
static int	vdc_update_vio_bsize(vdc_t *vdc, uint32_t);

/* handshake with vds */
static int	vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
static int	vdc_ver_negotiation(vdc_t *vdcp);
static int	vdc_init_attr_negotiation(vdc_t *vdc);
static int	vdc_attr_negotiation(vdc_t *vdcp);
static int	vdc_init_dring_negotiate(vdc_t *vdc);
static int	vdc_dring_negotiation(vdc_t *vdcp);
static int	vdc_send_rdx(vdc_t *vdcp);
static int	vdc_rdx_exchange(vdc_t *vdcp);
static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg);

/* processing incoming messages from vDisk server */
static void	vdc_process_msg_thread(vdc_t *vdc);
static int	vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp);

static uint_t	vdc_handle_cb(uint64_t event, caddr_t arg);
static int	vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg);
static int	vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg);
static int	vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
static int	vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
static int	vdc_send_request(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    buf_t *bufp, vio_desc_direction_t dir, int flags);
static int	vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
static int	vdc_populate_descriptor(vdc_t *vdcp, int operation,
		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
		    buf_t *bufp, vio_desc_direction_t dir, int flags);
static int	vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
		    size_t nbytes, int slice, diskaddr_t offset,
		    vio_desc_direction_t dir, boolean_t);
static int	vdc_do_op(vdc_t *vdc, int op, caddr_t addr, size_t nbytes,
		    int slice, diskaddr_t offset, struct buf *bufp,
		    vio_desc_direction_t dir, int flags);

static int	vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
static int	vdc_drain_response(vdc_t *vdcp, struct buf *buf);
static int	vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
static int	vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
static int	vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);

/* dkio */
static int	vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode,
		    int *rvalp);
static int	vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg);
static void	vdc_create_fake_geometry(vdc_t *vdc);
static int	vdc_validate_geometry(vdc_t *vdc);
static void	vdc_validate(vdc_t *vdc);
static void	vdc_validate_task(void *arg);
static int	vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_extvtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_extvtoc_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_get_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);
static int	vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
		    int mode, int dir);

static void	vdc_ownership_update(vdc_t *vdc, int ownership_flags);
static int	vdc_access_set(vdc_t *vdc, uint64_t flags);
static vdc_io_t	*vdc_eio_queue(vdc_t *vdc, int index);
static void	vdc_eio_unqueue(vdc_t *vdc, clock_t deadline,
		    boolean_t complete_io);
static int	vdc_eio_check(vdc_t *vdc, int flags);
static void	vdc_eio_thread(void *arg);

/*
 * Module variables
 */

/*
 * Tunable variables to control how long vdc waits before timing out on
 * various operations
 */
static int	vdc_hshake_retries = 3;

static int	vdc_timeout = 0;	/* units: seconds */
static int	vdc_ldcup_timeout = 1;	/* units: seconds */

static uint64_t vdc_hz_min_ldc_delay;
static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC;
static uint64_t vdc_hz_max_ldc_delay;
static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC;

static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC;
static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC;

/* values for dumping - need to run in a tighter loop */
static uint64_t	vdc_usec_timeout_dump = 100 * MILLISEC;	/* 0.1s units: usec */
static int	vdc_dump_retries = 100;

static uint16_t	vdc_scsi_timeout = 60;	/* 60s units: seconds */

static uint64_t vdc_ownership_delay = 6 * MICROSEC;	/* 6s units: usec */

/* Count of the number of vdc instances attached */
static volatile uint32_t	vdc_instance_count = 0;

/* Tunable to log all SCSI errors */
static boolean_t vdc_scsi_log_error = B_FALSE;

/* Soft state pointer */
static void	*vdc_state;

/*
 * Controlling the verbosity of the error/debug messages
 *
 * vdc_msglevel - controls level of messages
 * vdc_matchinst - 64-bit variable where each bit corresponds
 *                 to a vdc instance to which vdc_msglevel applies.
 */
int		vdc_msglevel = 0x0;
uint64_t	vdc_matchinst = 0ull;

/*
 * Supported vDisk protocol version pairs.
 *
 * The first array entry is the latest and preferred version.
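 *
 * For example, vdc_ver_negotiation() below always proposes vdc_version[0]
 * first; with the single entry {1, 1} the driver offers major 1, minor 1
 * and relies on vdc_handle_ver_msg() to settle on a version both ends
 * support (or to fail the handshake).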
 */
static const vio_ver_t	vdc_version[] = {{1, 1}};

static struct cb_ops vdc_cb_ops = {
	vdc_open,	/* cb_open */
	vdc_close,	/* cb_close */
	vdc_strategy,	/* cb_strategy */
	vdc_print,	/* cb_print */
	vdc_dump,	/* cb_dump */
	vdc_read,	/* cb_read */
	vdc_write,	/* cb_write */
	vdc_ioctl,	/* cb_ioctl */
	nodev,		/* cb_devmap */
	nodev,		/* cb_mmap */
	nodev,		/* cb_segmap */
	nochpoll,	/* cb_chpoll */
	vdc_prop_op,	/* cb_prop_op */
	NULL,		/* cb_str */
	D_MP | D_64BIT,	/* cb_flag */
	CB_REV,		/* cb_rev */
	vdc_aread,	/* cb_aread */
	vdc_awrite	/* cb_awrite */
};

static struct dev_ops vdc_ops = {
	DEVO_REV,	/* devo_rev */
	0,		/* devo_refcnt */
	vdc_getinfo,	/* devo_getinfo */
	nulldev,	/* devo_identify */
	nulldev,	/* devo_probe */
	vdc_attach,	/* devo_attach */
	vdc_detach,	/* devo_detach */
	nodev,		/* devo_reset */
	&vdc_cb_ops,	/* devo_cb_ops */
	NULL,		/* devo_bus_ops */
	nulldev,	/* devo_power */
	ddi_quiesce_not_needed,	/* devo_quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"virtual disk client",
	&vdc_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

/* -------------------------------------------------------------------------- */

/*
 * Device Driver housekeeping and setup
 */

int
_init(void)
{
	int	status;

	if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
		return (status);
	if ((status = mod_install(&modlinkage)) != 0)
		ddi_soft_state_fini(&vdc_state);
	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int	status;

	if ((status = mod_remove(&modlinkage)) != 0)
		return (status);
	ddi_soft_state_fini(&vdc_state);
	return (0);
}

static int
vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
{
	_NOTE(ARGUNUSED(dip))

	int	instance = VDCUNIT((dev_t)arg);
	vdc_t	*vdc = NULL;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
			*resultp = NULL;
			return (DDI_FAILURE);
		}
		*resultp = vdc->dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(uintptr_t)instance;
		return (DDI_SUCCESS);
	default:
		*resultp = NULL;
		return (DDI_FAILURE);
	}
}

static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	kt_did_t	eio_tid, ownership_tid;
	int		instance;
	int		rv;
	vdc_server_t	*srvr;
	vdc_t		*vdc = NULL;

	switch (cmd) {
	case DDI_DETACH:
		/* the real work happens below */
		break;
	case DDI_SUSPEND:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	ASSERT(cmd == DDI_DETACH);
	instance = ddi_get_instance(dip);
	DMSGX(1, "[%d] Entered\n", instance);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	if (vdc_is_opened(vdc)) {
		DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
		return (DDI_FAILURE);
	}

	if (vdc->dkio_flush_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
		    instance, vdc->dkio_flush_pending);
		return (DDI_FAILURE);
	}

	if (vdc->validate_pending) {
		DMSG(vdc, 0,
		    "[%d] Cannot detach: %d outstanding validate requests\n",
		    instance, vdc->validate_pending);
		return (DDI_FAILURE);
	}

	DMSG(vdc, 0, "[%d] proceeding...\n", instance);

	/* If we took ownership, release ownership */
	mutex_enter(&vdc->ownership_lock);
	if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR);
		if (rv == 0) {
			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
		}
	}
	mutex_exit(&vdc->ownership_lock);

	/* mark instance as detaching */
	vdc->lifecycle = VDC_LC_DETACHING;

	/*
	 * Try to disable callbacks to prevent another handshake. We have to
	 * disable callbacks for all servers.
	 */
	for (srvr = vdc->server_list; srvr != NULL; srvr = srvr->next) {
		rv = ldc_set_cb_mode(srvr->ldc_handle, LDC_CB_DISABLE);
		DMSG(vdc, 0, "callback disabled (ldc=%lu, rv=%d)\n",
		    srvr->ldc_id, rv);
	}

	if (vdc->initialized & VDC_THREAD) {
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET)) {
			vdc->read_state = VDC_READ_RESET;
			cv_signal(&vdc->read_cv);
		}

		mutex_exit(&vdc->read_lock);

		/* wake up any thread waiting for connection to come online */
		mutex_enter(&vdc->lock);
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			DMSG(vdc, 0,
			    "[%d] write reset - move to resetting state...\n",
			    instance);
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		} else if (vdc->state == VDC_STATE_FAILED) {
			vdc->io_pending = B_TRUE;
			cv_signal(&vdc->io_pending_cv);
		}
		mutex_exit(&vdc->lock);

		/* now wait until state transitions to VDC_STATE_DETACH */
		thread_join(vdc->msg_proc_thr->t_did);
		ASSERT(vdc->state == VDC_STATE_DETACH);
		DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
		    vdc->instance);
	}

	mutex_enter(&vdc->lock);

	if (vdc->initialized & VDC_DRING)
		vdc_destroy_descriptor_ring(vdc);

	vdc_fini_ports(vdc);

	if (vdc->eio_thread) {
		eio_tid = vdc->eio_thread->t_did;
		vdc->failfast_interval = 0;
		ASSERT(vdc->num_servers == 0);
		cv_signal(&vdc->eio_cv);
	} else {
		eio_tid = 0;
	}

	if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
		ownership_tid = vdc->ownership_thread->t_did;
		vdc->ownership = VDC_OWNERSHIP_NONE;
		cv_signal(&vdc->ownership_cv);
	} else {
		ownership_tid = 0;
	}

	mutex_exit(&vdc->lock);

	if (eio_tid != 0)
		thread_join(eio_tid);

	if (ownership_tid != 0)
		thread_join(ownership_tid);

	if (vdc->initialized & VDC_MINOR)
		ddi_remove_minor_node(dip, NULL);

	if (vdc->io_stats) {
		kstat_delete(vdc->io_stats);
		vdc->io_stats = NULL;
	}

	if (vdc->err_stats) {
		kstat_delete(vdc->err_stats);
		vdc->err_stats = NULL;
	}

	if (vdc->initialized & VDC_LOCKS) {
		mutex_destroy(&vdc->lock);
		mutex_destroy(&vdc->read_lock);
		mutex_destroy(&vdc->ownership_lock);
		cv_destroy(&vdc->initwait_cv);
		cv_destroy(&vdc->dring_free_cv);
		cv_destroy(&vdc->membind_cv);
		cv_destroy(&vdc->sync_blocked_cv);
		cv_destroy(&vdc->read_cv);
		cv_destroy(&vdc->running_cv);
		cv_destroy(&vdc->io_pending_cv);
		cv_destroy(&vdc->ownership_cv);
		cv_destroy(&vdc->eio_cv);
	}

	if (vdc->minfo)
		kmem_free(vdc->minfo, sizeof (struct dk_minfo));

	if (vdc->cinfo)
		kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));

	if (vdc->vtoc)
		kmem_free(vdc->vtoc, sizeof (struct extvtoc));

	if (vdc->geom)
		kmem_free(vdc->geom, sizeof (struct dk_geom));

	if (vdc->devid) {
		ddi_devid_unregister(dip);
		ddi_devid_free(vdc->devid);
	}

	if (vdc->initialized & VDC_SOFT_STATE)
		ddi_soft_state_free(vdc_state, instance);

	DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);

	return (DDI_SUCCESS);
}


static int
vdc_do_attach(dev_info_t *dip)
{
	int		instance;
	vdc_t		*vdc = NULL;
	int		status;
	md_t		*mdp;
	mde_cookie_t	vd_node;

	ASSERT(dip != NULL);

	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
		    instance);
		return (DDI_FAILURE);
	}

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (DDI_FAILURE);
	}

	/*
	 * We assign (rather than OR) the value to 'initialized' here to
	 * zero out the variable; bits are then set in it to indicate what
	 * has been done.
	 */
	vdc->initialized = VDC_SOFT_STATE;

	vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
	vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);

	vdc->dip	= dip;
	vdc->instance	= instance;
	vdc->vdisk_type	= VD_DISK_TYPE_UNK;
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->state	= VDC_STATE_INIT;
	vdc->lifecycle	= VDC_LC_ATTACHING;
	vdc->session_id = 0;
	vdc->vdisk_bsize = DEV_BSIZE;
	vdc->vio_bmask = 0;
	vdc->vio_bshift = 0;
	vdc->max_xfer_sz = maxphys / vdc->vdisk_bsize;

	/*
	 * We assume, for now, that the vDisk server will export 'read'
	 * operations to us at a minimum (this is needed because of checks
	 * in vdc for supported operations early in the handshake process).
	 * The vDisk server will return ENOTSUP if this is not the case.
	 * The value will be overwritten during the attribute exchange with
	 * the bitmask of operations exported by the server.
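	 *
	 * For example, vdc_open() below consults this bitmask and refuses a
	 * write-capable open with EROFS when the write bit is absent:
	 *
	 *	if ((flag & FWRITE) && (!nodelay) &&
	 *	    !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE)))
	 *		return (EROFS);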
	 */
	vdc->operations = VD_OP_MASK_READ;

	vdc->vtoc = NULL;
	vdc->geom = NULL;
	vdc->cinfo = NULL;
	vdc->minfo = NULL;

	mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->io_pending_cv, NULL, CV_DRIVER, NULL);

	vdc->io_pending = B_FALSE;
	vdc->threads_pending = 0;
	vdc->sync_op_blocked = B_FALSE;
	cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);

	mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
	cv_init(&vdc->eio_cv, NULL, CV_DRIVER, NULL);

	/* init blocking msg read functionality */
	mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
	vdc->read_state = VDC_READ_IDLE;

	vdc->initialized |= VDC_LOCKS;

	/* get device and port MD node for this disk instance */
	if (vdc_get_md_node(dip, &mdp, &vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Could not get machine description node",
		    instance);
		return (DDI_FAILURE);
	}

	if (vdc_init_ports(vdc, mdp, vd_node) != 0) {
		cmn_err(CE_NOTE, "[%d] Error initialising ports", instance);
		return (DDI_FAILURE);
	}

	(void) md_fini_handle(mdp);

	/* Create the kstats for saving the I/O statistics used by iostat(1M) */
	vdc_create_io_kstats(vdc);
	vdc_create_err_kstats(vdc);

	/* Initialize remaining structures before starting the msg thread */
	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	vdc->vtoc = kmem_zalloc(sizeof (struct extvtoc), KM_SLEEP);
	vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
	vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);

	/* initialize the thread responsible for managing state with server */
	vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
	    vdc, 0, &p0, TS_RUN, minclsyspri);
	if (vdc->msg_proc_thr == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
		    instance);
		return (DDI_FAILURE);
	}

	/*
	 * If there are multiple servers then start the eio thread.
	 */
	if (vdc->num_servers > 1) {
		vdc->eio_thread = thread_create(NULL, 0, vdc_eio_thread, vdc, 0,
		    &p0, TS_RUN, v.v_maxsyspri - 2);
		if (vdc->eio_thread == NULL) {
			cmn_err(CE_NOTE, "[%d] Failed to create error "
			    "I/O thread", instance);
			return (DDI_FAILURE);
		}
	}

	vdc->initialized |= VDC_THREAD;

	atomic_inc_32(&vdc_instance_count);

	/*
	 * Check the disk label. This will send requests and do the handshake.
	 * We don't really care about the disk label now. What we really need
	 * is for the handshake to be done so that we know the type of the
	 * disk (slice or full disk) and can create the appropriate device
	 * nodes.
	 */

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	mutex_exit(&vdc->lock);

	/*
	 * Now that we have the device info we can create the device nodes
	 */
	status = vdc_create_device_nodes(vdc);
	if (status) {
		DMSG(vdc, 0, "[%d] Failed to create device nodes",
		    instance);
		goto return_status;
	}

	/*
	 * Fill in the fields of the error statistics kstat that were not
	 * available when creating the kstat
	 */
	vdc_set_err_kstats(vdc);

	ddi_report_dev(dip);
	vdc->lifecycle = VDC_LC_ONLINE;
	DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);

return_status:
	DMSG(vdc, 0, "[%d] Attach completed\n", instance);
	return (status);
}

static int
vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	status;

	switch (cmd) {
	case DDI_ATTACH:
		if ((status = vdc_do_attach(dip)) != 0)
			(void) vdc_detach(dip, DDI_DETACH);
		return (status);
	case DDI_RESUME:
		/* nothing to do for this non-device */
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}
}

static int
vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr)
{
	int		status = 0;
	ldc_status_t	ldc_state;
	ldc_attr_t	ldc_attr;

	ASSERT(vdc != NULL);
	ASSERT(srvr != NULL);

	ldc_attr.devclass = LDC_DEV_BLK;
	ldc_attr.instance = vdc->instance;
	ldc_attr.mode = LDC_MODE_UNRELIABLE;	/* unreliable transport */
	ldc_attr.mtu = VD_LDC_MTU;

	if ((srvr->state & VDC_LDC_INIT) == 0) {
		status = ldc_init(srvr->ldc_id, &ldc_attr,
		    &srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			return (status);
		}
		srvr->state |= VDC_LDC_INIT;
	}
	status = ldc_status(srvr->ldc_handle, &ldc_state);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
		    vdc->instance, status);
		goto init_exit;
	}
	srvr->ldc_state = ldc_state;

	if ((srvr->state & VDC_LDC_CB) == 0) {
		status = ldc_reg_callback(srvr->ldc_handle, vdc_handle_cb,
		    (caddr_t)srvr);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
			    vdc->instance, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_CB;
	}

	/*
	 * At this stage we have initialised LDC; we will now try to open
	 * the connection.
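	 *
	 * To recap the bring-up sequence implemented above: ldc_init()
	 * allocates the channel, ldc_status() fetches its current state,
	 * and ldc_reg_callback() registers vdc_handle_cb() for channel
	 * events; ldc_open() below completes the local setup, after which
	 * vdc_do_ldc_up() can attempt to bring the link up end to end.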
	 */
	if (srvr->ldc_state == LDC_INIT) {
		status = ldc_open(srvr->ldc_handle);
		if (status != 0) {
			DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
			    vdc->instance, srvr->ldc_id, status);
			goto init_exit;
		}
		srvr->state |= VDC_LDC_OPEN;
	}

init_exit:
	if (status) {
		vdc_terminate_ldc(vdc, srvr);
	}

	return (status);
}

static int
vdc_start_ldc_connection(vdc_t *vdc)
{
	int	status = 0;

	ASSERT(vdc != NULL);

	ASSERT(MUTEX_HELD(&vdc->lock));

	status = vdc_do_ldc_up(vdc);

	DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);

	return (status);
}

static int
vdc_stop_ldc_connection(vdc_t *vdcp)
{
	int	status;

	ASSERT(vdcp != NULL);

	ASSERT(MUTEX_HELD(&vdcp->lock));

	DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
	    vdcp->state);

	status = ldc_down(vdcp->curr_server->ldc_handle);
	DMSG(vdcp, 0, "ldc_down() = %d\n", status);

	vdcp->initialized &= ~VDC_HANDSHAKE;
	DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);

	return (status);
}

static void
vdc_create_io_kstats(vdc_t *vdc)
{
	if (vdc->io_stats != NULL) {
		DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance);
		return;
	}

	vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL,
	    "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (vdc->io_stats != NULL) {
		vdc->io_stats->ks_lock = &vdc->lock;
		kstat_install(vdc->io_stats);
	} else {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics"
		    " will not be gathered", vdc->instance);
	}
}

static void
vdc_create_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;
	char	kstatmodule_err[KSTAT_STRLEN];
	char	kstatname[KSTAT_STRLEN];
	int	ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t));
	int	instance = vdc->instance;

	if (vdc->err_stats != NULL) {
		DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance);
		return;
	}

	(void) snprintf(kstatmodule_err, sizeof (kstatmodule_err),
	    "%serr", VDC_DRIVER_NAME);
	(void) snprintf(kstatname, sizeof (kstatname),
	    "%s%d,err", VDC_DRIVER_NAME, instance);

	vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname,
	    "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);

	if (vdc->err_stats == NULL) {
		cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics"
		    " will not be gathered", instance);
		return;
	}

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	kstat_named_init(&stp->vd_softerrs, "Soft Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_transerrs, "Transport Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_protoerrs, "Protocol Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&stp->vd_vid, "Vendor",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_pid, "Product",
	    KSTAT_DATA_CHAR);
	kstat_named_init(&stp->vd_capacity, "Size",
	    KSTAT_DATA_ULONGLONG);

	vdc->err_stats->ks_update = nulldev;

	kstat_install(vdc->err_stats);
}

static void
vdc_set_err_kstats(vdc_t *vdc)
{
	vd_err_stats_t	*stp;

	if (vdc->err_stats == NULL)
		return;

	mutex_enter(&vdc->lock);

	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
	ASSERT(stp != NULL);

	stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->vdisk_bsize;
	(void) strcpy(stp->vd_vid.value.c, "SUN");
	(void) strcpy(stp->vd_pid.value.c, "VDSK");

	mutex_exit(&vdc->lock);
}

static int
vdc_create_device_nodes_efi(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "h");
	ddi_remove_minor_node(vdc->dip, "h,raw");

	if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

static int
vdc_create_device_nodes_vtoc(vdc_t *vdc)
{
	ddi_remove_minor_node(vdc->dip, "wd");
	ddi_remove_minor_node(vdc->dip, "wd,raw");

	if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
		    vdc->instance);
		return (EIO);
	}

	/* if any device node is created we set this flag */
	vdc->initialized |= VDC_MINOR;

	if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'",
		    vdc->instance);
		return (EIO);
	}

	return (0);
}

/*
 * Function:
 *	vdc_create_device_nodes
 *
 * Description:
 *	This function creates the block and character device nodes under
 *	/devices. It is called as part of the attach(9E) of the instance
 *	during the handshake with vds after vds has sent the attributes
 *	to vdc.
 *
 *	If the device is of type VD_DISK_TYPE_SLICE then the minor node
 *	of 2 is used in keeping with the Solaris convention that slice 2
 *	refers to a whole disk. Slices start at 'a'.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	0		- Success
 *	EIO		- Failed to create node
 */
static int
vdc_create_device_nodes(vdc_t *vdc)
{
	char		name[sizeof ("s,raw")];
	dev_info_t	*dip = NULL;
	int		instance, status;
	int		num_slices = 1;
	int		i;

	ASSERT(vdc != NULL);

	instance = vdc->instance;
	dip = vdc->dip;

	switch (vdc->vdisk_type) {
	case VD_DISK_TYPE_DISK:
	case VD_DISK_TYPE_UNK:
		num_slices = V_NUMPAR;
		break;
	case VD_DISK_TYPE_SLICE:
		num_slices = 1;
		break;
	default:
		ASSERT(0);
	}

	/*
	 * Minor nodes are different for EFI disks: EFI disks do not have
	 * a minor node 'h' for the minor number corresponding to slice
	 * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
	 * representing the whole disk.
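	 *
	 * For example (illustrative), instance 0 of a full VTOC-labelled
	 * disk gets block nodes 'a'..'h' plus raw nodes 'a,raw'..'h,raw';
	 * if the disk turns out to be EFI-labelled, 'h' and 'h,raw' are
	 * replaced by 'wd' and 'wd,raw' with the same minor number
	 * (VD_EFI_WD_SLICE).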
	 */
	for (i = 0; i < num_slices; i++) {

		if (i == VD_EFI_WD_SLICE) {
			if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
				status = vdc_create_device_nodes_efi(vdc);
			else
				status = vdc_create_device_nodes_vtoc(vdc);
			if (status != 0)
				return (status);
			continue;
		}

		(void) snprintf(name, sizeof (name), "%c", 'a' + i);
		if (ddi_create_minor_node(dip, name, S_IFBLK,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
			    instance, name);
			return (EIO);
		}

		/* if any device node is created we set this flag */
		vdc->initialized |= VDC_MINOR;

		(void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");

		if (ddi_create_minor_node(dip, name, S_IFCHR,
		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
			cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
			    instance, name);
			return (EIO);
		}
	}

	return (0);
}

/*
 * Driver prop_op(9e) entry point function. Return the number of blocks for
 * the partition in question or forward the request to the property facilities.
 */
static int
vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
    char *name, caddr_t valuep, int *lengthp)
{
	int		instance = ddi_get_instance(dip);
	vdc_t		*vdc;
	uint64_t	nblocks;
	uint_t		blksize;

	vdc = ddi_get_soft_state(vdc_state, instance);

	if (dev == DDI_DEV_T_ANY || vdc == NULL) {
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}

	mutex_enter(&vdc->lock);
	(void) vdc_validate_geometry(vdc);
	if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
		mutex_exit(&vdc->lock);
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}
	nblocks = vdc->slice[VDCPART(dev)].nblocks;
	blksize = vdc->vdisk_bsize;
	mutex_exit(&vdc->lock);

	return (ddi_prop_op_nblocks_blksize(dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, nblocks, blksize));
}

/*
 * Function:
 *	vdc_is_opened
 *
 * Description:
 *	This function checks if any slice of a given virtual disk is
 *	currently opened.
 *
 * Parameters:
 *	vdc		- soft state pointer
 *
 * Return Values
 *	B_TRUE		- at least one slice is opened.
 *	B_FALSE		- no slice is opened.
 */
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
	int	i;

	/* check if there's any layered open */
	for (i = 0; i < V_NUMPAR; i++) {
		if (vdc->open_lyr[i] > 0)
			return (B_TRUE);
	}

	/* check if there is any other kind of open */
	for (i = 0; i < OTYPCNT; i++) {
		if (vdc->open[i] != 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}

static int
vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;
	int i;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	/*
	 * If we have a single-slice disk which was unavailable during the
	 * attach then a device node was created for each of the 8 slices.
	 * Now that the type is known, we prevent opening any slice other
	 * than 0 even if a device node still exists.
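	 *
	 * The bookkeeping below is bitmask-based: for example (illustrative),
	 * slice 3 corresponds to slicemask 0x08, so an FEXCL open sets that
	 * bit in open_excl, a layered open increments open_lyr[3], and any
	 * other open type sets the bit in open[otyp].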
	 */
	if (vdc->vdisk_type == VD_DISK_TYPE_SLICE && slice != 0)
		return (EIO);

	/* check if slice is already exclusively opened */
	if (vdc->open_excl & slicemask)
		return (EBUSY);

	/* if open exclusive, check if slice is already opened */
	if (flag & FEXCL) {
		if (vdc->open_lyr[slice] > 0)
			return (EBUSY);
		for (i = 0; i < OTYPCNT; i++) {
			if (vdc->open[i] & slicemask)
				return (EBUSY);
		}
		vdc->open_excl |= slicemask;
	}

	/* mark slice as opened */
	if (otyp == OTYP_LYR) {
		vdc->open_lyr[slice]++;
	} else {
		vdc->open[otyp] |= slicemask;
	}

	return (0);
}

static void
vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
{
	uint8_t slicemask;

	ASSERT(otyp < OTYPCNT);
	ASSERT(slice < V_NUMPAR);
	ASSERT(MUTEX_HELD(&vdc->lock));

	slicemask = 1 << slice;

	if (otyp == OTYP_LYR) {
		ASSERT(vdc->open_lyr[slice] > 0);
		vdc->open_lyr[slice]--;
	} else {
		vdc->open[otyp] &= ~slicemask;
	}

	if (flag & FEXCL)
		vdc->open_excl &= ~slicemask;
}

static int
vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance, nodelay;
	int	slice, status = 0;
	vdc_t	*vdc;

	ASSERT(dev != NULL);
	instance = VDCUNIT(*dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
	    getminor(*dev), flag, otyp);

	slice = VDCPART(*dev);

	nodelay = flag & (FNDELAY | FNONBLOCK);

	if ((flag & FWRITE) && (!nodelay) &&
	    !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE))) {
		return (EROFS);
	}

	mutex_enter(&vdc->lock);

	status = vdc_mark_opened(vdc, slice, flag, otyp);

	if (status != 0) {
		mutex_exit(&vdc->lock);
		return (status);
	}

	/*
	 * If the disk type is unknown then we have to wait for the
	 * handshake to complete because we don't know if the slice
	 * device we are opening actually exists.
	 */
	if (vdc->vdisk_type != VD_DISK_TYPE_UNK && nodelay) {

		/* don't resubmit a validate request if there's already one */
		if (vdc->validate_pending > 0) {
			mutex_exit(&vdc->lock);
			return (0);
		}

		/* call vdc_validate() asynchronously to avoid blocking */
		if (taskq_dispatch(system_taskq, vdc_validate_task,
		    (void *)vdc, TQ_NOSLEEP) == NULL) {
			vdc_mark_closed(vdc, slice, flag, otyp);
			mutex_exit(&vdc->lock);
			return (ENXIO);
		}

		vdc->validate_pending++;
		mutex_exit(&vdc->lock);
		return (0);
	}

	mutex_exit(&vdc->lock);

	vdc_validate(vdc);

	mutex_enter(&vdc->lock);

	if (vdc->vdisk_type == VD_DISK_TYPE_UNK ||
	    (vdc->vdisk_type == VD_DISK_TYPE_SLICE && slice != 0) ||
	    (!nodelay && (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
	    vdc->slice[slice].nblocks == 0))) {
		vdc_mark_closed(vdc, slice, flag, otyp);
		status = EIO;
	}

	mutex_exit(&vdc->lock);

	return (status);
}

static int
vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	int	instance;
	int	slice;
	int	rv, rval;
	vdc_t	*vdc;

	instance = VDCUNIT(dev);

	if (otyp >= OTYPCNT)
		return (EINVAL);

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);

	slice = VDCPART(dev);

	/*
	 * Attempt to flush the write cache (W$) on a close operation. If
	 * this is not a supported ioctl command, or the backing device is
	 * read-only, do not fail the close operation.
	 */
	rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);

	if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
		DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
		    instance, rv);
		return (EIO);
	}

	mutex_enter(&vdc->lock);
	vdc_mark_closed(vdc, slice, flag, otyp);
	mutex_exit(&vdc->lock);

	return (0);
}

static int
vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	_NOTE(ARGUNUSED(credp))

	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
}

static int
vdc_print(dev_t dev, char *str)
{
	cmn_err(CE_NOTE, "vdc%d: %s", VDCUNIT(dev), str);
	return (0);
}

static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	int	rv, flags;
	size_t	nbytes = nblk * DEV_BSIZE;
	int	instance = VDCUNIT(dev);
	vdc_t	*vdc = NULL;
	diskaddr_t vio_blkno;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		return (ENXIO);
	}

	DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
	    instance, nbytes, blkno, (void *)addr);

	/* convert logical block to vio block */
	if ((blkno & vdc->vio_bmask) != 0) {
		DMSG(vdc, 0, "Misaligned block number (%lu)\n", blkno);
		return (EINVAL);
	}
	vio_blkno = blkno >> vdc->vio_bshift;
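
	/*
	 * Worked example (illustrative): with a 4096-byte vio block size,
	 * vio_bshift is 3 and vio_bmask is 0x7, so logical block 16 (in
	 * DEV_BSIZE units of 512 bytes) maps to vio block 2, while logical
	 * block 9 is misaligned and rejected with EINVAL above.
	 */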

	/*
	 * If we are panicking, we need the state to be "running" so that we
	 * can submit I/Os, but we don't want to check for any backend error.
	 */
	flags = (ddi_in_panic()) ? VDC_OP_STATE_RUNNING : VDC_OP_NORMAL;

	rv = vdc_do_op(vdc, VD_OP_BWRITE, addr, nbytes, VDCPART(dev),
	    vio_blkno, NULL, VIO_write_dir, flags);

	if (rv) {
		DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
		return (rv);
	}

	DMSG(vdc, 0, "[%d] End\n", instance);

	return (0);
}

/* -------------------------------------------------------------------------- */

/*
 * Disk access routines
 *
 */

/*
 * vdc_strategy()
 *
 * Return Value:
 *	0:	As per strategy(9E), the strategy() function must return 0
 *		[ on error, bioerror(9F) marks the buf(9S) with the proper
 *		error code ]
 */
static int
vdc_strategy(struct buf *buf)
{
	diskaddr_t vio_blkno;
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(buf->b_edev);
	int	op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE;
	int	slice;

	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
		bioerror(buf, ENXIO);
		biodone(buf);
		return (0);
	}

	DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
	    instance, (buf->b_flags & B_READ) ? "Read" : "Write",
	    buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr);

	bp_mapin(buf);

	if ((long)buf->b_private == VD_SLICE_NONE) {
		/* I/O using an absolute disk offset */
		slice = VD_SLICE_NONE;
	} else {
		slice = VDCPART(buf->b_edev);
	}

	/*
	 * In the buf structure, b_lblkno represents a logical block number
	 * using a block size of 512 bytes. For the VIO request, this block
	 * number has to be converted to the block size used by the VIO
	 * protocol.
	 */
	if ((buf->b_lblkno & vdc->vio_bmask) != 0) {
		bioerror(buf, EINVAL);
		biodone(buf);
		return (0);
	}
	vio_blkno = buf->b_lblkno >> vdc->vio_bshift;

	/* submit the I/O, any error will be reported in the buf structure */
	(void) vdc_do_op(vdc, op, (caddr_t)buf->b_un.b_addr,
	    buf->b_bcount, slice, vio_blkno,
	    buf, (op == VD_OP_BREAD) ? VIO_read_dir : VIO_write_dir,
	    VDC_OP_NORMAL);

	return (0);
}

/*
 * Function:
 *	vdc_min
 *
 * Description:
 *	Routine to limit the size of a data transfer. Used in
 *	conjunction with physio(9F).
 *
 * Arguments:
 *	bp - pointer to the indicated buf(9S) struct.
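 *
 *	For example (illustrative): with a vdisk_bsize of 512 bytes and a
 *	max_xfer_sz of 2048 blocks (i.e. a maxphys of 1 MB), a 4 MB transfer
 *	is clamped here to 1 MB and physio(9F) then issues the request as
 *	four consecutive 1 MB transfers.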
 *
 */
static void
vdc_min(struct buf *bufp)
{
	vdc_t	*vdc = NULL;
	int	instance = VDCUNIT(bufp->b_edev);

	vdc = ddi_get_soft_state(vdc_state, instance);
	VERIFY(vdc != NULL);

	if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->vdisk_bsize)) {
		bufp->b_bcount = vdc->max_xfer_sz * vdc->vdisk_bsize;
	}
}

static int
vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio));
}

static int
vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio));
}

static int
vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio));
}

static int
vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
{
	_NOTE(ARGUNUSED(cred))

	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
	return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio));
}


/* -------------------------------------------------------------------------- */

/*
 * Handshake support
 */


/*
 * Function:
 *	vdc_init_ver_negotiation()
 *
 * Description:
 *	Build and send a version negotiation message (VIO_VER_INFO) to the
 *	vDisk server, proposing the supplied protocol version.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	ver	- version to be proposed to the server.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
{
	vio_ver_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status = -1;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);

	/*
	 * set the Session ID to a unique value
	 * (the lower 32 bits of the clock tick)
	 */
	vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
	DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id);

	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_VER_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	pkt.dev_class = VDEV_DISK;
	pkt.ver_major = ver.major;
	pkt.ver_minor = ver.minor;

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
	    vdc->instance, status);
	if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vio_ver_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_ver_negotiation()
 *
 * Description:
 *	Run the version negotiation step of the handshake: send our
 *	proposal and process the server's response.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_ver_negotiation(vdc_t *vdcp)
{
	vio_msg_t vio_msg;
	int status;

	if ((status = vdc_init_ver_negotiation(vdcp, vdc_version[0])) != 0)
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Ver negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
}

/*
 * Function:
 *	vdc_init_attr_negotiation()
 *
 * Description:
 *	Build and send an attribute negotiation message (VIO_ATTR_INFO)
 *	to the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_attr_negotiation(vdc_t *vdc)
{
	vd_attr_msg_t	pkt;
	size_t		msglen = sizeof (pkt);
	int		status;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	DMSG(vdc, 0, "[%d] entered\n", vdc->instance);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.max_xfer_sz = vdc->max_xfer_sz;
	pkt.vdisk_block_size = vdc->vdisk_bsize;
	pkt.xfer_mode = VIO_DRING_MODE_V1_0;
	pkt.operations = 0;	/* server will set bits of valid operations */
	pkt.vdisk_type = 0;	/* server will set to valid device type */
	pkt.vdisk_media = 0;	/* server will set to valid media type */
	pkt.vdisk_size = 0;	/* server will set to valid size */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);

	if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
		DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
		    vdc->curr_server->ldc_handle, status, msglen);
		if (msglen != sizeof (vd_attr_msg_t))
			status = ENOMSG;
	}

	return (status);
}

/*
 * Function:
 *	vdc_attr_negotiation()
 *
 * Description:
 *	Run the attribute exchange step of the handshake: send our
 *	attributes and process the server's response.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_attr_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if ((status = vdc_init_attr_negotiation(vdcp)) != 0)
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Attr negotiation response, rv(%d)",
		    vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_init_dring_negotiate()
 *
 * Description:
 *	Set up the local descriptor ring and send a registration message
 *	(VIO_DRING_REG) for it to the vDisk server.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_init_dring_negotiate(vdc_t *vdc)
{
	vio_dring_reg_msg_t pkt;
	size_t	msglen = sizeof (pkt);
	int	status = -1;
	int	retry;
	int	nretries = 10;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));

	for (retry = 0; retry < nretries; retry++) {
		status = vdc_init_descriptor_ring(vdc);
		if (status != EAGAIN)
			break;
		drv_usecwait(vdc_min_timeout_ldc);
	}

	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
		    vdc->instance, status);
		return (status);
	}

	DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
	    vdc->instance, status);

	/* fill in tag */
	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
	pkt.tag.vio_subtype_env = VIO_DRING_REG;
	pkt.tag.vio_sid = vdc->session_id;
	/* fill in payload */
	pkt.dring_ident = 0;
	pkt.num_descriptors = vdc->dring_len;
	pkt.descriptor_size = vdc->dring_entry_size;
	pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
	pkt.ncookies = vdc->dring_cookie_count;
	pkt.cookie[0] = vdc->dring_cookie[0];	/* for now just one cookie */

	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
	if (status != 0) {
		DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
		    vdc->instance, status);
	}

	return (status);
}


/*
 * Function:
 *	vdc_dring_negotiation()
 *
 * Description:
 *	Run the descriptor ring registration step of the handshake:
 *	register the ring and process the server's response.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- Success
 */
static int
vdc_dring_negotiation(vdc_t *vdcp)
{
	int status;
	vio_msg_t vio_msg;

	if ((status = vdc_init_dring_negotiate(vdcp)) != 0)
		return (status);

	/* release lock and wait for response */
	mutex_exit(&vdcp->lock);
	status = vdc_wait_for_response(vdcp, &vio_msg);
	mutex_enter(&vdcp->lock);
	if (status) {
		DMSG(vdcp, 0,
		    "[%d] Failed waiting for Dring negotiation response,"
		    " rv(%d)", vdcp->instance, status);
		return (status);
	}

	/* check type and sub_type ... */
	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
		DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
		    vdcp->instance);
		return (EPROTO);
	}

	return (vdc_handle_dring_reg_msg(vdcp,
	    (vio_dring_reg_msg_t *)&vio_msg));
}


/*
 * Function:
 *	vdc_send_rdx()
 *
 * Description:
 *	Send an RDX message to the vDisk server to indicate that the
 *	handshake is complete and vdc is ready to exchange data.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
1878 * 1879 * Return Code: 1880 * 0 - Success 1881 */ 1882 static int 1883 vdc_send_rdx(vdc_t *vdcp) 1884 { 1885 vio_msg_t msg; 1886 size_t msglen = sizeof (vio_msg_t); 1887 int status; 1888 1889 /* 1890 * Send an RDX message to vds to indicate we are ready 1891 * to send data 1892 */ 1893 msg.tag.vio_msgtype = VIO_TYPE_CTRL; 1894 msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 1895 msg.tag.vio_subtype_env = VIO_RDX; 1896 msg.tag.vio_sid = vdcp->session_id; 1897 status = vdc_send(vdcp, (caddr_t)&msg, &msglen); 1898 if (status != 0) { 1899 DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)", 1900 vdcp->instance, status); 1901 } 1902 1903 return (status); 1904 } 1905 1906 /* 1907 * Function: 1908 * vdc_handle_rdx() 1909 * 1910 * Description: 1911 * 1912 * Arguments: 1913 * vdc - soft state pointer for this instance of the device driver. 1914 * msgp - received msg 1915 * 1916 * Return Code: 1917 * 0 - Success 1918 */ 1919 static int 1920 vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp) 1921 { 1922 _NOTE(ARGUNUSED(vdcp)) 1923 _NOTE(ARGUNUSED(msgp)) 1924 1925 ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL); 1926 ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK); 1927 ASSERT(msgp->tag.vio_subtype_env == VIO_RDX); 1928 1929 DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance); 1930 1931 return (0); 1932 } 1933 1934 /* 1935 * Function: 1936 * vdc_rdx_exchange() 1937 * 1938 * Description: 1939 * 1940 * Arguments: 1941 * vdc - soft state pointer for this instance of the device driver. 1942 * 1943 * Return Code: 1944 * 0 - Success 1945 */ 1946 static int 1947 vdc_rdx_exchange(vdc_t *vdcp) 1948 { 1949 int status; 1950 vio_msg_t vio_msg; 1951 1952 if (status = vdc_send_rdx(vdcp)) 1953 return (status); 1954 1955 /* release lock and wait for response */ 1956 mutex_exit(&vdcp->lock); 1957 status = vdc_wait_for_response(vdcp, &vio_msg); 1958 mutex_enter(&vdcp->lock); 1959 if (status) { 1960 DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)", 1961 vdcp->instance, status); 1962 return (status); 1963 } 1964 1965 /* check type and sub_type ... */ 1966 if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL || 1967 vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) { 1968 DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance); 1969 return (EPROTO); 1970 } 1971 1972 return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg)); 1973 } 1974 1975 1976 /* -------------------------------------------------------------------------- */ 1977 1978 /* 1979 * LDC helper routines 1980 */ 1981 1982 static int 1983 vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp) 1984 { 1985 int status; 1986 uint64_t delay_time; 1987 size_t len; 1988 1989 /* 1990 * Until we get a blocking ldc read we have to retry until the entire 1991 * LDC message has arrived before ldc_read() will return that message. 1992 * If ldc_read() succeed but returns a zero length message then that 1993 * means that the LDC queue is empty and we have to wait for a 1994 * notification from the LDC callback which will set the read_state to 1995 * VDC_READ_PENDING. Note we also bail out if the channel is reset or 1996 * goes away. 1997 */ 1998 delay_time = vdc_ldc_read_init_delay; 1999 2000 for (;;) { 2001 2002 len = *nbytesp; 2003 /* 2004 * vdc->curr_server is protected by vdc->lock but to avoid 2005 * contentions we don't take the lock here. We can do this 2006 * safely because vdc_recv() is only called from thread 2007 * process_msg_thread() which is also the only thread that 2008 * can change vdc->curr_server. 
		 */
		status = ldc_read(vdc->curr_server->ldc_handle,
		    (caddr_t)msgp, &len);

		if (status == EAGAIN) {
			delay_time *= 2;
			if (delay_time >= vdc_ldc_read_max_delay)
				delay_time = vdc_ldc_read_max_delay;
			delay(delay_time);
			continue;
		}

		if (status != 0) {
			DMSG(vdc, 0, "ldc_read returned %d\n", status);
			break;
		}

		if (len != 0) {
			*nbytesp = len;
			break;
		}

		mutex_enter(&vdc->read_lock);

		while (vdc->read_state != VDC_READ_PENDING) {

			/* detect if the connection has been reset */
			if (vdc->read_state == VDC_READ_RESET) {
				mutex_exit(&vdc->read_lock);
				return (ECONNRESET);
			}

			vdc->read_state = VDC_READ_WAITING;
			cv_wait(&vdc->read_cv, &vdc->read_lock);
		}

		vdc->read_state = VDC_READ_IDLE;
		mutex_exit(&vdc->read_lock);

		delay_time = vdc_ldc_read_init_delay;
	}

	return (status);
}


#ifdef DEBUG
void
vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg)
{
	char *ms, *ss, *ses;
	switch (msg->tag.vio_msgtype) {
#define	Q(_s)	case _s : ms = #_s; break;
	Q(VIO_TYPE_CTRL)
	Q(VIO_TYPE_DATA)
	Q(VIO_TYPE_ERR)
#undef Q
	default: ms = "unknown"; break;
	}

	switch (msg->tag.vio_subtype) {
#define	Q(_s)	case _s : ss = #_s; break;
	Q(VIO_SUBTYPE_INFO)
	Q(VIO_SUBTYPE_ACK)
	Q(VIO_SUBTYPE_NACK)
#undef Q
	default: ss = "unknown"; break;
	}

	switch (msg->tag.vio_subtype_env) {
#define	Q(_s)	case _s : ses = #_s; break;
	Q(VIO_VER_INFO)
	Q(VIO_ATTR_INFO)
	Q(VIO_DRING_REG)
	Q(VIO_DRING_UNREG)
	Q(VIO_RDX)
	Q(VIO_PKT_DATA)
	Q(VIO_DESC_DATA)
	Q(VIO_DRING_DATA)
#undef Q
	default: ses = "unknown"; break;
	}

	DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, ms, ss, ses);
}
#endif

/*
 * Function:
 *	vdc_send()
 *
 * Description:
 *	The function encapsulates the call to write a message using LDC.
 *	If LDC indicates that the call failed due to the queue being full,
 *	we retry the ldc_write(), otherwise we return the error returned by LDC.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver;
 *		  the message is written to the current server's LDC channel.
 *	pkt	- address of LDC message to be sent
 *	msglen	- the size of the message being sent. When the function
 *		  returns, this contains the number of bytes written.
 *
 * Return Code:
 *	0	- Success.
 *	EINVAL	- pkt or msglen were NULL
 *	ECONNRESET	- The connection was not up.
 *	EWOULDBLOCK	- LDC queue is full
 *	xxx	- other error codes returned by ldc_write
 */
static int
vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen)
{
	size_t	size = 0;
	int	status = 0;
	clock_t delay_ticks;

	ASSERT(vdc != NULL);
	ASSERT(mutex_owned(&vdc->lock));
	ASSERT(msglen != NULL);
	ASSERT(*msglen != 0);

#ifdef DEBUG
	vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt);
#endif
	/*
	 * Wait indefinitely to send if channel
	 * is busy, but bail out if we succeed or
	 * if the channel closes or is reset.
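	 *
	 * The retry uses a bounded geometric backoff: the delay starts at
	 * vdc_hz_min_ldc_delay ticks and doubles after each EWOULDBLOCK
	 * until it is clamped at vdc_hz_max_ldc_delay (e.g. with an assumed
	 * minimum of 1 tick and maximum of 1024 ticks, the successive
	 * delays would be 1, 2, 4, ... 1024, 1024, ...).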
	 */
	delay_ticks = vdc_hz_min_ldc_delay;
	do {
		size = *msglen;
		status = ldc_write(vdc->curr_server->ldc_handle, pkt, &size);
		if (status == EWOULDBLOCK) {
			delay(delay_ticks);
			/* geometric backoff */
			delay_ticks *= 2;
			if (delay_ticks > vdc_hz_max_ldc_delay)
				delay_ticks = vdc_hz_max_ldc_delay;
		}
	} while (status == EWOULDBLOCK);

	/* if LDC had serious issues --- reset vdc state */
	if (status == EIO || status == ECONNRESET) {
		/* wake up any readers so they will detect the reset */
		mutex_enter(&vdc->read_lock);
		if ((vdc->read_state == VDC_READ_WAITING) ||
		    (vdc->read_state == VDC_READ_RESET))
			cv_signal(&vdc->read_cv);
		vdc->read_state = VDC_READ_RESET;
		mutex_exit(&vdc->read_lock);

		/* wake up any waiters in the reset thread */
		if (vdc->state == VDC_STATE_INIT_WAITING) {
			DMSG(vdc, 0, "[%d] write reset - "
			    "vdc is resetting ..\n", vdc->instance);
			vdc->state = VDC_STATE_RESETTING;
			cv_signal(&vdc->initwait_cv);
		}

		return (ECONNRESET);
	}

	/* return the last size written */
	*msglen = size;

	return (status);
}

/*
 * Function:
 *	vdc_get_md_node
 *
 * Description:
 *	Get the MD and the device node for the given disk instance. The
 *	caller is responsible for cleaning up the reference to the
 *	returned MD (mdpp) by calling md_fini_handle().
 *
 * Arguments:
 *	dip	- dev info pointer for this instance of the device driver.
 *	mdpp	- the returned MD.
 *	vd_nodep - the returned device node.
 *
 * Return Code:
 *	0	- Success.
 *	ENOENT	- Expected node or property did not exist.
 *	ENXIO	- Unexpected error communicating with MD framework
 */
static int
vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep)
{
	int		status = ENOENT;
	char		*node_name = NULL;
	md_t		*mdp = NULL;
	int		num_nodes;
	int		num_vdevs;
	mde_cookie_t	rootnode;
	mde_cookie_t	*listp = NULL;
	boolean_t	found_inst = B_FALSE;
	int		listsz;
	int		idx;
	uint64_t	md_inst;
	int		obp_inst;
	int		instance = ddi_get_instance(dip);

	/*
	 * Get the OBP instance number for comparison with the MD instance
	 *
	 * The "cfg-handle" property of a vdc node in an MD contains the MD's
	 * notion of "instance", or unique identifier, for that node; OBP
	 * stores the value of the "cfg-handle" MD property as the value of
	 * the "reg" property on the node in the device tree it builds from
	 * the MD and passes to Solaris. Thus, we look up the devinfo node's
	 * "reg" property value to uniquely identify this device instance.
	 * If the "reg" property cannot be found, the device tree state is
	 * presumably so broken that there is no point in continuing.
	 */
	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) {
		cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG);
		return (ENOENT);
	}
	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    OBP_REG, -1);
	DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst);

	/*
	 * We now walk the MD nodes to find the node for this vdisk.
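	 * A vdisk node matches when the value of its "cfg-handle" MD
	 * property equals the OBP "reg" value retrieved above.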
2239 */ 2240 if ((mdp = md_get_handle()) == NULL) { 2241 cmn_err(CE_WARN, "unable to init machine description"); 2242 return (ENXIO); 2243 } 2244 2245 num_nodes = md_node_count(mdp); 2246 ASSERT(num_nodes > 0); 2247 2248 listsz = num_nodes * sizeof (mde_cookie_t); 2249 2250 /* allocate memory for nodes */ 2251 listp = kmem_zalloc(listsz, KM_SLEEP); 2252 2253 rootnode = md_root_node(mdp); 2254 ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); 2255 2256 /* 2257 * Search for all the virtual devices, we will then check to see which 2258 * ones are disk nodes. 2259 */ 2260 num_vdevs = md_scan_dag(mdp, rootnode, 2261 md_find_name(mdp, VDC_MD_VDEV_NAME), 2262 md_find_name(mdp, "fwd"), listp); 2263 2264 if (num_vdevs <= 0) { 2265 cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); 2266 status = ENOENT; 2267 goto done; 2268 } 2269 2270 DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs); 2271 for (idx = 0; idx < num_vdevs; idx++) { 2272 status = md_get_prop_str(mdp, listp[idx], "name", &node_name); 2273 if ((status != 0) || (node_name == NULL)) { 2274 cmn_err(CE_NOTE, "Unable to get name of node type '%s'" 2275 ": err %d", VDC_MD_VDEV_NAME, status); 2276 continue; 2277 } 2278 2279 DMSGX(1, "[%d] Found node '%s'\n", instance, node_name); 2280 if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { 2281 status = md_get_prop_val(mdp, listp[idx], 2282 VDC_MD_CFG_HDL, &md_inst); 2283 DMSGX(1, "[%d] vdc inst in MD=%lx\n", 2284 instance, md_inst); 2285 if ((status == 0) && (md_inst == obp_inst)) { 2286 found_inst = B_TRUE; 2287 break; 2288 } 2289 } 2290 } 2291 2292 if (!found_inst) { 2293 DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME); 2294 status = ENOENT; 2295 goto done; 2296 } 2297 DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst); 2298 2299 *vd_nodep = listp[idx]; 2300 *mdpp = mdp; 2301 done: 2302 kmem_free(listp, listsz); 2303 return (status); 2304 } 2305 2306 /* 2307 * Function: 2308 * vdc_init_ports 2309 * 2310 * Description: 2311 * Initialize all the ports for this vdisk instance. 2312 * 2313 * Arguments: 2314 * vdc - soft state pointer for this instance of the device driver. 2315 * mdp - md pointer 2316 * vd_nodep - device md node. 2317 * 2318 * Return Code: 2319 * 0 - Success. 2320 * ENOENT - Expected node or property did not exist. 2321 */ 2322 static int 2323 vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep) 2324 { 2325 int status = 0; 2326 int idx; 2327 int num_nodes; 2328 int num_vports; 2329 int num_chans; 2330 int listsz; 2331 mde_cookie_t vd_port; 2332 mde_cookie_t *chanp = NULL; 2333 mde_cookie_t *portp = NULL; 2334 vdc_server_t *srvr; 2335 vdc_server_t *prev_srvr = NULL; 2336 2337 /* 2338 * We now walk the MD nodes to find the port nodes for this vdisk. 
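	 *
	 * The scanned MD subtree looks roughly like this (a sketch; the
	 * actual node and property names are given by the VDC_MD_*
	 * definitions):
	 *
	 *	vdev node (VDC_MD_VDEV_NAME)
	 *	 |
	 *	 +-- port node (VDC_MD_PORT_NAME): VDC_MD_ID, VDC_MD_TIMEOUT
	 *	 |	 |
	 *	 |	 +-- channel node (VDC_MD_CHAN_NAME): VDC_MD_ID
	 *	 |
	 *	 +-- port node ...
	 *
	 * Each port with both an id and a channel becomes one vdc_server_t
	 * on the singly-linked server_list; the first port found becomes
	 * curr_server.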
2339 */ 2340 num_nodes = md_node_count(mdp); 2341 ASSERT(num_nodes > 0); 2342 2343 listsz = num_nodes * sizeof (mde_cookie_t); 2344 2345 /* allocate memory for nodes */ 2346 portp = kmem_zalloc(listsz, KM_SLEEP); 2347 chanp = kmem_zalloc(listsz, KM_SLEEP); 2348 2349 num_vports = md_scan_dag(mdp, vd_nodep, 2350 md_find_name(mdp, VDC_MD_PORT_NAME), 2351 md_find_name(mdp, "fwd"), portp); 2352 if (num_vports == 0) { 2353 DMSGX(0, "Found no '%s' node for '%s' port\n", 2354 VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2355 status = ENOENT; 2356 goto done; 2357 } 2358 2359 DMSGX(1, "Found %d '%s' node(s) for '%s' port\n", 2360 num_vports, VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2361 2362 vdc->num_servers = 0; 2363 for (idx = 0; idx < num_vports; idx++) { 2364 2365 /* initialize this port */ 2366 vd_port = portp[idx]; 2367 srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP); 2368 srvr->vdcp = vdc; 2369 srvr->svc_state = VDC_SERVICE_OFFLINE; 2370 srvr->log_state = VDC_SERVICE_NONE; 2371 2372 /* get port id */ 2373 if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) { 2374 cmn_err(CE_NOTE, "vDisk port '%s' property not found", 2375 VDC_MD_ID); 2376 kmem_free(srvr, sizeof (vdc_server_t)); 2377 continue; 2378 } 2379 2380 /* set the connection timeout */ 2381 if (md_get_prop_val(mdp, vd_port, VDC_MD_TIMEOUT, 2382 &srvr->ctimeout) != 0) { 2383 srvr->ctimeout = 0; 2384 } 2385 2386 /* get the ldc id */ 2387 num_chans = md_scan_dag(mdp, vd_port, 2388 md_find_name(mdp, VDC_MD_CHAN_NAME), 2389 md_find_name(mdp, "fwd"), chanp); 2390 2391 /* expecting at least one channel */ 2392 if (num_chans <= 0) { 2393 cmn_err(CE_NOTE, "No '%s' node for '%s' port", 2394 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); 2395 kmem_free(srvr, sizeof (vdc_server_t)); 2396 continue; 2397 } else if (num_chans != 1) { 2398 DMSGX(0, "Expected 1 '%s' node for '%s' port, " 2399 "found %d\n", VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, 2400 num_chans); 2401 } 2402 2403 /* 2404 * We use the first channel found (index 0), irrespective of how 2405 * many are there in total. 2406 */ 2407 if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, 2408 &srvr->ldc_id) != 0) { 2409 cmn_err(CE_NOTE, "Channel '%s' property not found", 2410 VDC_MD_ID); 2411 kmem_free(srvr, sizeof (vdc_server_t)); 2412 continue; 2413 } 2414 2415 /* 2416 * now initialise LDC channel which will be used to 2417 * communicate with this server 2418 */ 2419 if (vdc_do_ldc_init(vdc, srvr) != 0) { 2420 kmem_free(srvr, sizeof (vdc_server_t)); 2421 continue; 2422 } 2423 2424 /* add server to list */ 2425 if (prev_srvr) 2426 prev_srvr->next = srvr; 2427 else 2428 vdc->server_list = srvr; 2429 2430 prev_srvr = srvr; 2431 2432 /* inc numbers of servers */ 2433 vdc->num_servers++; 2434 } 2435 2436 /* 2437 * Adjust the max number of handshake retries to match 2438 * the number of vdisk servers. 2439 */ 2440 if (vdc_hshake_retries < vdc->num_servers) 2441 vdc_hshake_retries = vdc->num_servers; 2442 2443 /* pick first server as current server */ 2444 if (vdc->server_list != NULL) { 2445 vdc->curr_server = vdc->server_list; 2446 status = 0; 2447 } else { 2448 status = ENOENT; 2449 } 2450 2451 done: 2452 kmem_free(chanp, listsz); 2453 kmem_free(portp, listsz); 2454 return (status); 2455 } 2456 2457 2458 /* 2459 * Function: 2460 * vdc_do_ldc_up 2461 * 2462 * Description: 2463 * Bring the channel for the current server up. 2464 * 2465 * Arguments: 2466 * vdc - soft state pointer for this instance of the device driver. 2467 * 2468 * Return Code: 2469 * 0 - Success. 
2470 * EINVAL - Driver is detaching / LDC error 2471 * ECONNREFUSED - Other end is not listening 2472 */ 2473 static int 2474 vdc_do_ldc_up(vdc_t *vdc) 2475 { 2476 int status; 2477 ldc_status_t ldc_state; 2478 2479 ASSERT(MUTEX_HELD(&vdc->lock)); 2480 2481 DMSG(vdc, 0, "[%d] Bringing up channel %lx\n", 2482 vdc->instance, vdc->curr_server->ldc_id); 2483 2484 if (vdc->lifecycle == VDC_LC_DETACHING) 2485 return (EINVAL); 2486 2487 if ((status = ldc_up(vdc->curr_server->ldc_handle)) != 0) { 2488 switch (status) { 2489 case ECONNREFUSED: /* listener not ready at other end */ 2490 DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n", 2491 vdc->instance, vdc->curr_server->ldc_id, status); 2492 status = 0; 2493 break; 2494 default: 2495 DMSG(vdc, 0, "[%d] Failed to bring up LDC: " 2496 "channel=%ld, err=%d", vdc->instance, 2497 vdc->curr_server->ldc_id, status); 2498 break; 2499 } 2500 } 2501 2502 if (ldc_status(vdc->curr_server->ldc_handle, &ldc_state) == 0) { 2503 vdc->curr_server->ldc_state = ldc_state; 2504 if (ldc_state == LDC_UP) { 2505 DMSG(vdc, 0, "[%d] LDC channel already up\n", 2506 vdc->instance); 2507 vdc->seq_num = 1; 2508 vdc->seq_num_reply = 0; 2509 } 2510 } 2511 2512 return (status); 2513 } 2514 2515 /* 2516 * Function: 2517 * vdc_terminate_ldc() 2518 * 2519 * Description: 2520 * 2521 * Arguments: 2522 * vdc - soft state pointer for this instance of the device driver. 2523 * srvr - vdc per-server info structure 2524 * 2525 * Return Code: 2526 * None 2527 */ 2528 static void 2529 vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr) 2530 { 2531 int instance = ddi_get_instance(vdc->dip); 2532 2533 if (srvr->state & VDC_LDC_OPEN) { 2534 DMSG(vdc, 0, "[%d] ldc_close()\n", instance); 2535 (void) ldc_close(srvr->ldc_handle); 2536 } 2537 if (srvr->state & VDC_LDC_CB) { 2538 DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); 2539 (void) ldc_unreg_callback(srvr->ldc_handle); 2540 } 2541 if (srvr->state & VDC_LDC_INIT) { 2542 DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); 2543 (void) ldc_fini(srvr->ldc_handle); 2544 srvr->ldc_handle = NULL; 2545 } 2546 2547 srvr->state &= ~(VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN); 2548 } 2549 2550 /* 2551 * Function: 2552 * vdc_fini_ports() 2553 * 2554 * Description: 2555 * Finalize all ports by closing the channel associated with each 2556 * port and also freeing the server structure. 2557 * 2558 * Arguments: 2559 * vdc - soft state pointer for this instance of the device driver. 2560 * 2561 * Return Code: 2562 * None 2563 */ 2564 static void 2565 vdc_fini_ports(vdc_t *vdc) 2566 { 2567 int instance = ddi_get_instance(vdc->dip); 2568 vdc_server_t *srvr, *prev_srvr; 2569 2570 ASSERT(vdc != NULL); 2571 ASSERT(mutex_owned(&vdc->lock)); 2572 2573 DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized); 2574 2575 srvr = vdc->server_list; 2576 2577 while (srvr) { 2578 2579 vdc_terminate_ldc(vdc, srvr); 2580 2581 /* next server */ 2582 prev_srvr = srvr; 2583 srvr = srvr->next; 2584 2585 /* free server */ 2586 kmem_free(prev_srvr, sizeof (vdc_server_t)); 2587 } 2588 2589 vdc->server_list = NULL; 2590 vdc->num_servers = 0; 2591 } 2592 2593 /* -------------------------------------------------------------------------- */ 2594 2595 /* 2596 * Descriptor Ring helper routines 2597 */ 2598 2599 /* 2600 * Function: 2601 * vdc_init_descriptor_ring() 2602 * 2603 * Description: 2604 * 2605 * Arguments: 2606 * vdc - soft state pointer for this instance of the device driver. 
2607 * 2608 * Return Code: 2609 * 0 - Success 2610 */ 2611 static int 2612 vdc_init_descriptor_ring(vdc_t *vdc) 2613 { 2614 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2615 int status = 0; 2616 int i; 2617 2618 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2619 2620 ASSERT(vdc != NULL); 2621 ASSERT(mutex_owned(&vdc->lock)); 2622 2623 /* ensure we have enough room to store max sized block */ 2624 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2625 2626 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2627 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2628 /* 2629 * Calculate the maximum block size we can transmit using one 2630 * Descriptor Ring entry from the attributes returned by the 2631 * vDisk server. This is subject to a minimum of 'maxphys' 2632 * as we do not have the capability to split requests over 2633 * multiple DRing entries. 2634 */ 2635 if ((vdc->max_xfer_sz * vdc->vdisk_bsize) < maxphys) { 2636 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2637 vdc->instance); 2638 vdc->dring_max_cookies = maxphys / PAGESIZE; 2639 } else { 2640 vdc->dring_max_cookies = 2641 (vdc->max_xfer_sz * vdc->vdisk_bsize) / PAGESIZE; 2642 } 2643 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2644 (sizeof (ldc_mem_cookie_t) * 2645 (vdc->dring_max_cookies - 1))); 2646 vdc->dring_len = VD_DRING_LEN; 2647 2648 status = ldc_mem_dring_create(vdc->dring_len, 2649 vdc->dring_entry_size, &vdc->dring_hdl); 2650 if ((vdc->dring_hdl == NULL) || (status != 0)) { 2651 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2652 vdc->instance); 2653 return (status); 2654 } 2655 vdc->initialized |= VDC_DRING_INIT; 2656 } 2657 2658 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2659 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2660 vdc->dring_cookie = 2661 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2662 2663 status = ldc_mem_dring_bind(vdc->curr_server->ldc_handle, 2664 vdc->dring_hdl, 2665 LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2666 &vdc->dring_cookie[0], 2667 &vdc->dring_cookie_count); 2668 if (status != 0) { 2669 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2670 "(%lx) to channel (%lx) status=%d\n", 2671 vdc->instance, vdc->dring_hdl, 2672 vdc->curr_server->ldc_handle, status); 2673 return (status); 2674 } 2675 ASSERT(vdc->dring_cookie_count == 1); 2676 vdc->initialized |= VDC_DRING_BOUND; 2677 } 2678 2679 status = ldc_mem_dring_info(vdc->dring_hdl, &vdc->dring_mem_info); 2680 if (status != 0) { 2681 DMSG(vdc, 0, 2682 "[%d] Failed to get info for descriptor ring (%lx)\n", 2683 vdc->instance, vdc->dring_hdl); 2684 return (status); 2685 } 2686 2687 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2688 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2689 2690 /* Allocate the local copy of this dring */ 2691 vdc->local_dring = 2692 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 2693 KM_SLEEP); 2694 vdc->initialized |= VDC_DRING_LOCAL; 2695 } 2696 2697 /* 2698 * Mark all DRing entries as free and initialize the private 2699 * descriptor's memory handles. If any entry is initialized, 2700 * we need to free it later so we set the bit in 'initialized' 2701 * at the start. 
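	 * Setting the bit before the loop means that a failure part-way
	 * through still leaves VDC_DRING_ENTRY set, so the cleanup in
	 * vdc_destroy_descriptor_ring() will free whatever handles were
	 * actually allocated.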
2702 */ 2703 vdc->initialized |= VDC_DRING_ENTRY; 2704 for (i = 0; i < vdc->dring_len; i++) { 2705 dep = VDC_GET_DRING_ENTRY_PTR(vdc, i); 2706 dep->hdr.dstate = VIO_DESC_FREE; 2707 2708 status = ldc_mem_alloc_handle(vdc->curr_server->ldc_handle, 2709 &vdc->local_dring[i].desc_mhdl); 2710 if (status != 0) { 2711 DMSG(vdc, 0, "![%d] Failed to alloc mem handle for" 2712 " descriptor %d", vdc->instance, i); 2713 return (status); 2714 } 2715 vdc->local_dring[i].is_free = B_TRUE; 2716 vdc->local_dring[i].dep = dep; 2717 } 2718 2719 /* Initialize the starting index */ 2720 vdc->dring_curr_idx = VDC_DRING_FIRST_ENTRY; 2721 2722 return (status); 2723 } 2724 2725 /* 2726 * Function: 2727 * vdc_destroy_descriptor_ring() 2728 * 2729 * Description: 2730 * 2731 * Arguments: 2732 * vdc - soft state pointer for this instance of the device driver. 2733 * 2734 * Return Code: 2735 * None 2736 */ 2737 static void 2738 vdc_destroy_descriptor_ring(vdc_t *vdc) 2739 { 2740 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 2741 ldc_mem_handle_t mhdl = NULL; 2742 ldc_mem_info_t minfo; 2743 int status = -1; 2744 int i; /* loop */ 2745 2746 ASSERT(vdc != NULL); 2747 ASSERT(mutex_owned(&vdc->lock)); 2748 2749 DMSG(vdc, 0, "[%d] Entered\n", vdc->instance); 2750 2751 if (vdc->initialized & VDC_DRING_ENTRY) { 2752 DMSG(vdc, 0, 2753 "[%d] Removing Local DRing entries\n", vdc->instance); 2754 for (i = 0; i < vdc->dring_len; i++) { 2755 ldep = &vdc->local_dring[i]; 2756 mhdl = ldep->desc_mhdl; 2757 2758 if (mhdl == NULL) 2759 continue; 2760 2761 if ((status = ldc_mem_info(mhdl, &minfo)) != 0) { 2762 DMSG(vdc, 0, 2763 "ldc_mem_info returned an error: %d\n", 2764 status); 2765 2766 /* 2767 * This must mean that the mem handle 2768 * is not valid. Clear it out so that 2769 * no one tries to use it. 2770 */ 2771 ldep->desc_mhdl = NULL; 2772 continue; 2773 } 2774 2775 if (minfo.status == LDC_BOUND) { 2776 (void) ldc_mem_unbind_handle(mhdl); 2777 } 2778 2779 (void) ldc_mem_free_handle(mhdl); 2780 2781 ldep->desc_mhdl = NULL; 2782 } 2783 vdc->initialized &= ~VDC_DRING_ENTRY; 2784 } 2785 2786 if (vdc->initialized & VDC_DRING_LOCAL) { 2787 DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance); 2788 kmem_free(vdc->local_dring, 2789 vdc->dring_len * sizeof (vdc_local_desc_t)); 2790 vdc->initialized &= ~VDC_DRING_LOCAL; 2791 } 2792 2793 if (vdc->initialized & VDC_DRING_BOUND) { 2794 DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance); 2795 status = ldc_mem_dring_unbind(vdc->dring_hdl); 2796 if (status == 0) { 2797 vdc->initialized &= ~VDC_DRING_BOUND; 2798 } else { 2799 DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx", 2800 vdc->instance, status, vdc->dring_hdl); 2801 } 2802 kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t)); 2803 } 2804 2805 if (vdc->initialized & VDC_DRING_INIT) { 2806 DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance); 2807 status = ldc_mem_dring_destroy(vdc->dring_hdl); 2808 if (status == 0) { 2809 vdc->dring_hdl = NULL; 2810 bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t)); 2811 vdc->initialized &= ~VDC_DRING_INIT; 2812 } else { 2813 DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)", 2814 vdc->instance, status, vdc->dring_hdl); 2815 } 2816 } 2817 } 2818 2819 /* 2820 * Function: 2821 * vdc_map_to_shared_dring() 2822 * 2823 * Description: 2824 * Copy contents of the local descriptor to the shared 2825 * memory descriptor. 2826 * 2827 * Arguments: 2828 * vdcp - soft state pointer for this instance of the device driver. 
 *	idx	- descriptor ring index
 *
 * Return Code:
 *	0	- Success
 *	EAGAIN	- Failed to bind the memory handle for the request
 */
static int
vdc_map_to_shared_dring(vdc_t *vdcp, int idx)
{
	vdc_local_desc_t	*ldep;
	vd_dring_entry_t	*dep;
	int			rv;

	ldep = &(vdcp->local_dring[idx]);

	/* for now leave in the old pop_mem_hdl stuff */
	if (ldep->nbytes > 0) {
		rv = vdc_populate_mem_hdl(vdcp, ldep);
		if (rv) {
			DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n",
			    vdcp->instance);
			return (rv);
		}
	}

	/*
	 * fill in the data details into the DRing
	 */
	dep = ldep->dep;
	ASSERT(dep != NULL);

	dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp);
	dep->payload.operation = ldep->operation;
	dep->payload.addr = ldep->offset;
	dep->payload.nbytes = ldep->nbytes;
	dep->payload.status = (uint32_t)-1;	/* vds will set valid value */
	dep->payload.slice = ldep->slice;
	dep->hdr.dstate = VIO_DESC_READY;
	dep->hdr.ack = 1;	/* request an ACK for every message */

	return (0);
}

/*
 * Function:
 *	vdc_send_request
 *
 * Description:
 *	This routine writes the data to be transmitted to vds into the
 *	descriptor, notifies vds that the ring has been updated and
 *	then waits for the request to be processed.
 *
 * Arguments:
 *	vdcp	  - the soft state pointer
 *	operation - operation we want vds to perform (VD_OP_XXX)
 *	addr	  - address of data buf to be read/written.
 *	nbytes	  - number of bytes to read/write
 *	slice	  - the disk slice this request is for
 *	offset	  - relative disk offset
 *	bufp	  - buf of operation
 *	dir	  - direction of operation (READ/WRITE/BOTH)
 *
 * Return Codes:
 *	0
 *	ENXIO
 */
static int
vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
    size_t nbytes, int slice, diskaddr_t offset, buf_t *bufp,
    vio_desc_direction_t dir, int flags)
{
	int	rv = 0;

	ASSERT(vdcp != NULL);
	ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR);

	mutex_enter(&vdcp->lock);

	/*
	 * If this is a block read/write operation we update the I/O statistics
	 * to indicate that the request is being put on the waitq to be
	 * serviced.
	 *
	 * We do it here (a common routine for both synchronous and strategy
	 * calls) for performance reasons - we are already holding vdc->lock
	 * so there is no extra locking overhead. We would have to explicitly
	 * grab the 'lock' mutex to update the stats if we were to do this
	 * higher up the stack in vdc_strategy() et al.
	 */
	if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
		DTRACE_IO1(start, buf_t *, bufp);
		VD_KSTAT_WAITQ_ENTER(vdcp);
	}

	/*
	 * If the request does not expect the state to be VDC_STATE_RUNNING
	 * then we just try to populate the descriptor ring once.
	 */
	if (!(flags & VDC_OP_STATE_RUNNING)) {
		rv = vdc_populate_descriptor(vdcp, operation, addr,
		    nbytes, slice, offset, bufp, dir, flags);
		goto done;
	}

	do {
		while (vdcp->state != VDC_STATE_RUNNING) {

			/* return error if detaching */
			if (vdcp->state == VDC_STATE_DETACH) {
				rv = ENXIO;
				goto done;
			}

			/*
			 * If we are panicking and the disk is not ready then
			 * we can't send any request because we can't complete
			 * the handshake now.
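			 * (At panic time interrupts are disabled, so the
			 * handshake messages needed to reach the
			 * VDC_STATE_RUNNING state could never be processed.)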
2945 */ 2946 if (ddi_in_panic()) { 2947 rv = EIO; 2948 goto done; 2949 } 2950 2951 /* 2952 * If the state is faulted, notify that a new I/O is 2953 * being submitted to force the system to check if any 2954 * server has recovered. 2955 */ 2956 if (vdcp->state == VDC_STATE_FAILED) { 2957 vdcp->io_pending = B_TRUE; 2958 cv_signal(&vdcp->io_pending_cv); 2959 } 2960 2961 cv_wait(&vdcp->running_cv, &vdcp->lock); 2962 2963 /* if service is still faulted then fail the request */ 2964 if (vdcp->state == VDC_STATE_FAILED) { 2965 rv = EIO; 2966 goto done; 2967 } 2968 } 2969 2970 } while (vdc_populate_descriptor(vdcp, operation, addr, 2971 nbytes, slice, offset, bufp, dir, flags)); 2972 2973 done: 2974 /* 2975 * If this is a block read/write we update the I/O statistics kstat 2976 * to indicate that this request has been placed on the queue for 2977 * processing (i.e sent to the vDisk server) - iostat(1M) will 2978 * report the time waiting for the vDisk server under the %b column 2979 * In the case of an error we simply take it off the wait queue. 2980 */ 2981 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2982 if (rv == 0) { 2983 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 2984 DTRACE_PROBE1(send, buf_t *, bufp); 2985 } else { 2986 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 2987 VD_KSTAT_WAITQ_EXIT(vdcp); 2988 DTRACE_IO1(done, buf_t *, bufp); 2989 } 2990 } 2991 2992 mutex_exit(&vdcp->lock); 2993 2994 return (rv); 2995 } 2996 2997 2998 /* 2999 * Function: 3000 * vdc_populate_descriptor 3001 * 3002 * Description: 3003 * This routine writes the data to be transmitted to vds into the 3004 * descriptor, notifies vds that the ring has been updated and 3005 * then waits for the request to be processed. 3006 * 3007 * Arguments: 3008 * vdcp - the soft state pointer 3009 * operation - operation we want vds to perform (VD_OP_XXX) 3010 * addr - address of data buf to be read/written. 
3011 * nbytes - number of bytes to read/write 3012 * slice - the disk slice this request is for 3013 * offset - relative disk offset 3014 * bufp - buf of operation 3015 * dir - direction of operation (READ/WRITE/BOTH) 3016 * 3017 * Return Codes: 3018 * 0 3019 * EAGAIN 3020 * ECONNRESET 3021 * ENXIO 3022 */ 3023 static int 3024 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 3025 size_t nbytes, int slice, diskaddr_t offset, 3026 buf_t *bufp, vio_desc_direction_t dir, int flags) 3027 { 3028 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 3029 int idx; /* Index of DRing entry used */ 3030 int next_idx; 3031 vio_dring_msg_t dmsg; 3032 size_t msglen; 3033 int rv; 3034 3035 ASSERT(MUTEX_HELD(&vdcp->lock)); 3036 vdcp->threads_pending++; 3037 loop: 3038 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 3039 3040 if (flags & VDC_OP_DRING_RESERVED) { 3041 /* use D-Ring reserved entry */ 3042 idx = VDC_DRING_FIRST_RESV; 3043 local_dep = &(vdcp->local_dring[idx]); 3044 } else { 3045 /* Get next available D-Ring entry */ 3046 idx = vdcp->dring_curr_idx; 3047 local_dep = &(vdcp->local_dring[idx]); 3048 3049 if (!local_dep->is_free) { 3050 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 3051 vdcp->instance); 3052 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 3053 if (vdcp->state == VDC_STATE_RUNNING || 3054 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3055 goto loop; 3056 } 3057 vdcp->threads_pending--; 3058 return (ECONNRESET); 3059 } 3060 3061 next_idx = idx + 1; 3062 if (next_idx >= vdcp->dring_len) 3063 next_idx = VDC_DRING_FIRST_ENTRY; 3064 vdcp->dring_curr_idx = next_idx; 3065 } 3066 3067 ASSERT(local_dep->is_free); 3068 3069 local_dep->operation = operation; 3070 local_dep->addr = addr; 3071 local_dep->nbytes = nbytes; 3072 local_dep->slice = slice; 3073 local_dep->offset = offset; 3074 local_dep->buf = bufp; 3075 local_dep->dir = dir; 3076 local_dep->flags = flags; 3077 3078 local_dep->is_free = B_FALSE; 3079 3080 rv = vdc_map_to_shared_dring(vdcp, idx); 3081 if (rv) { 3082 if (flags & VDC_OP_DRING_RESERVED) { 3083 DMSG(vdcp, 0, "[%d]: cannot bind memory - error\n", 3084 vdcp->instance); 3085 /* 3086 * We can't wait if we are using reserved slot. 3087 * Free the descriptor and return. 3088 */ 3089 local_dep->is_free = B_TRUE; 3090 vdcp->threads_pending--; 3091 return (rv); 3092 } 3093 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 3094 vdcp->instance); 3095 /* free the descriptor */ 3096 local_dep->is_free = B_TRUE; 3097 vdcp->dring_curr_idx = idx; 3098 cv_wait(&vdcp->membind_cv, &vdcp->lock); 3099 if (vdcp->state == VDC_STATE_RUNNING || 3100 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3101 goto loop; 3102 } 3103 vdcp->threads_pending--; 3104 return (ECONNRESET); 3105 } 3106 3107 /* 3108 * Send a msg with the DRing details to vds 3109 */ 3110 VIO_INIT_DRING_DATA_TAG(dmsg); 3111 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 3112 dmsg.dring_ident = vdcp->dring_ident; 3113 dmsg.start_idx = idx; 3114 dmsg.end_idx = idx; 3115 vdcp->seq_num++; 3116 3117 DTRACE_PROBE2(populate, int, vdcp->instance, 3118 vdc_local_desc_t *, local_dep); 3119 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 3120 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 3121 3122 /* 3123 * note we're still holding the lock here to 3124 * make sure the message goes out in order !!!... 
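	 * (a concurrent send could otherwise present out-of-order
	 * sequence numbers to the server)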
	 */
	msglen = sizeof (dmsg);
	rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen);
	switch (rv) {
	case ECONNRESET:
		/*
		 * vdc_send initiates the reset on failure.
		 * Since the transaction has already been put
		 * on the local dring, it will automatically get
		 * retried when the channel is reset. Given that,
		 * it is ok to just return success even though the
		 * send failed.
		 */
		rv = 0;
		break;

	case 0: /* EOK */
		DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv);
		break;

	default:
		DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv);
		rv = ENXIO;
		break;
	}

	vdcp->threads_pending--;
	return (rv);
}

/*
 * Function:
 *	vdc_do_op
 *
 * Description:
 *	Wrapper around vdc_send_request(). Each request is associated with a
 *	buf structure. If a buf structure is provided (bufp != NULL) then the
 *	request will be submitted with that buf, and the caller can wait for
 *	completion of the request with biowait(). If a buf structure is not
 *	provided (bufp == NULL) then a buf structure is created and the function
 *	waits for the completion of the request.
 *
 *	If the flag VDC_OP_STATE_RUNNING is set then vdc_send_request() will
 *	submit the request only when the vdisk is in state VDC_STATE_RUNNING.
 *	If the vdisk is not in that state then vdc_send_request() will
 *	wait for that state to be reached. After the request is submitted, the
 *	reply will be processed asynchronously by the vdc_process_msg_thread()
 *	thread.
 *
 *	If the flag VDC_OP_STATE_RUNNING is not set then vdc_send_request()
 *	submits the request whatever the state of the vdisk is. Then vdc_do_op()
 *	will wait for a reply message, process the reply and complete the
 *	request.
 *
 * Arguments:
 *	vdc	- the soft state pointer
 *	op	- operation we want vds to perform (VD_OP_XXX)
 *	addr	- address of data buf to be read/written.
 *	nbytes	- number of bytes to read/write
 *	slice	- the disk slice this request is for
 *	offset	- relative disk offset
 *	bufp	- buf structure associated with the request (can be NULL).
 *	dir	- direction of operation (READ/WRITE/BOTH)
 *	flags	- flags for the request.
 *
 * Return Codes:
 *	0	- the request has been successfully submitted and completed.
 *	!= 0	- the request has failed. In that case, if a buf structure
 *		  was provided (bufp != NULL) then the B_ERROR flag is set
 *		  and the b_error field of the buf structure is set to EIO.
 */
static int
vdc_do_op(vdc_t *vdc, int op, caddr_t addr, size_t nbytes, int slice,
    diskaddr_t offset, struct buf *bufp, vio_desc_direction_t dir, int flags)
{
	vio_msg_t	vio_msg;
	struct buf	buf;
	int		rv;

	if (bufp == NULL) {
		/*
		 * We use buf just as a convenient way to get a notification
		 * that the request is completed, so we initialize buf to the
		 * minimum we need.
		 */
		bioinit(&buf);
		buf.b_bcount = nbytes;
		buf.b_flags = B_BUSY;
		bufp = &buf;
	}

	rv = vdc_send_request(vdc, op, addr, nbytes, slice, offset, bufp,
	    dir, flags);

	if (rv != 0)
		goto done;

	/*
	 * If the request should be done in VDC_STATE_RUNNING state then the
	 * reply will be received and processed by vdc_process_msg_thread()
	 * and we just have to handle the panic case.
Otherwise we have to 3226 * wait for the reply message and process it. 3227 */ 3228 if (flags & VDC_OP_STATE_RUNNING) { 3229 3230 if (ddi_in_panic()) { 3231 rv = vdc_drain_response(vdc, bufp); 3232 goto done; 3233 } 3234 3235 } else { 3236 /* wait for the response message */ 3237 rv = vdc_wait_for_response(vdc, &vio_msg); 3238 if (rv) { 3239 /* 3240 * If this is a block read/write we update the I/O 3241 * statistics kstat to take it off the run queue. 3242 */ 3243 mutex_enter(&vdc->lock); 3244 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3245 VD_UPDATE_ERR_STATS(vdc, vd_transerrs); 3246 VD_KSTAT_RUNQ_EXIT(vdc); 3247 DTRACE_IO1(done, buf_t *, bufp); 3248 } 3249 mutex_exit(&vdc->lock); 3250 goto done; 3251 } 3252 3253 rv = vdc_process_data_msg(vdc, &vio_msg); 3254 if (rv) 3255 goto done; 3256 } 3257 3258 if (bufp == &buf) 3259 rv = biowait(bufp); 3260 3261 done: 3262 if (bufp == &buf) { 3263 biofini(bufp); 3264 } else if (rv != 0) { 3265 bioerror(bufp, EIO); 3266 biodone(bufp); 3267 } 3268 3269 return (rv); 3270 } 3271 3272 /* 3273 * Function: 3274 * vdc_do_sync_op 3275 * 3276 * Description: 3277 * Wrapper around vdc_do_op that serializes requests. 3278 * 3279 * Arguments: 3280 * vdcp - the soft state pointer 3281 * operation - operation we want vds to perform (VD_OP_XXX) 3282 * addr - address of data buf to be read/written. 3283 * nbytes - number of bytes to read/write 3284 * slice - the disk slice this request is for 3285 * offset - relative disk offset 3286 * dir - direction of operation (READ/WRITE/BOTH) 3287 * rconflict - check for reservation conflict in case of failure 3288 * 3289 * rconflict should be set to B_TRUE by most callers. Callers invoking the 3290 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 3291 * result of a successful operation with vdc_scsi_status(). 3292 * 3293 * Return Codes: 3294 * 0 3295 * EAGAIN 3296 * EFAULT 3297 * ENXIO 3298 * EIO 3299 */ 3300 static int 3301 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 3302 int slice, diskaddr_t offset, vio_desc_direction_t dir, boolean_t rconflict) 3303 { 3304 int status; 3305 int flags = VDC_OP_NORMAL; 3306 3307 /* 3308 * Grab the lock, if blocked wait until the server 3309 * response causes us to wake up again. 
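	 *
	 * Only one synchronous operation may be in flight at a time;
	 * sync_op_blocked provides that serialization and sync_blocked_cv
	 * hands the slot to the next waiter. A typical caller looks like
	 * this (a hypothetical sketch, not an actual call site):
	 *
	 *	rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, 0, 0,
	 *	    VIO_both_dir, B_TRUE);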
	 */
	mutex_enter(&vdcp->lock);
	vdcp->sync_op_cnt++;
	while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH) {
		if (ddi_in_panic()) {
			/* don't block if we are panicking */
			vdcp->sync_op_cnt--;
			mutex_exit(&vdcp->lock);
			return (EIO);
		} else {
			cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock);
		}
	}

	if (vdcp->state == VDC_STATE_DETACH) {
		cv_broadcast(&vdcp->sync_blocked_cv);
		vdcp->sync_op_cnt--;
		mutex_exit(&vdcp->lock);
		return (ENXIO);
	}

	/* now block any other thread entering after us */
	vdcp->sync_op_blocked = B_TRUE;

	mutex_exit(&vdcp->lock);

	if (!rconflict)
		flags &= ~VDC_OP_ERRCHK_CONFLICT;

	status = vdc_do_op(vdcp, operation, addr, nbytes, slice, offset,
	    NULL, dir, flags);

	mutex_enter(&vdcp->lock);

	DMSG(vdcp, 2, ": operation returned %d\n", status);

	if (vdcp->state == VDC_STATE_DETACH) {
		status = ENXIO;
	}

	vdcp->sync_op_blocked = B_FALSE;
	vdcp->sync_op_cnt--;

	/* signal the next waiting thread */
	cv_signal(&vdcp->sync_blocked_cv);

	mutex_exit(&vdcp->lock);

	return (status);
}


/*
 * Function:
 *	vdc_drain_response()
 *
 * Description:
 *	When a guest is panicking, the completion of requests needs to be
 *	handled differently because interrupts are disabled and vdc
 *	will not get messages. We have to poll for the messages instead.
 *
 *	Note: since we are panicking we don't implement the io:::done
 *	DTrace probe or update the I/O statistics kstats.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *	buf	- if buf is NULL then we drain all responses, otherwise we
 *		  poll until we receive an ACK/NACK for the specific I/O
 *		  described by buf.
 *
 * Return Code:
 *	0	- Success. If we were expecting a response to a particular
 *		  request then this means that a response has been received.
 */
static int
vdc_drain_response(vdc_t *vdc, struct buf *buf)
{
	int		rv, idx, retries;
	size_t		msglen;
	vdc_local_desc_t *ldep = NULL;	/* Local Dring Entry Pointer */
	vio_dring_msg_t	dmsg;
	struct buf	*mbuf;
	boolean_t	ack;

	mutex_enter(&vdc->lock);

	retries = 0;
	for (;;) {
		msglen = sizeof (dmsg);
		rv = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)&dmsg,
		    &msglen);
		if (rv) {
			rv = EINVAL;
			break;
		}

		/*
		 * if there are no packets wait and check again
		 */
		if ((rv == 0) && (msglen == 0)) {
			if (retries++ > vdc_dump_retries) {
				rv = EAGAIN;
				break;
			}

			drv_usecwait(vdc_usec_timeout_dump);
			continue;
		}

		/*
		 * Ignore all messages that are not ACKs/NACKs to
		 * DRing requests.
		 */
		if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) ||
		    (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) {
			DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n",
			    dmsg.tag.vio_msgtype,
			    dmsg.tag.vio_subtype,
			    dmsg.tag.vio_subtype_env);
			continue;
		}

		/*
		 * Record if the packet was ACK'ed or not. If the packet was not
		 * ACK'ed then we will just mark the request as failed; we don't
		 * want to reset the connection at this point.
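		 * (a reset would require a new handshake, which cannot be
		 * carried out while the guest is panicking)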
3436 */ 3437 switch (dmsg.tag.vio_subtype) { 3438 case VIO_SUBTYPE_ACK: 3439 ack = B_TRUE; 3440 break; 3441 case VIO_SUBTYPE_NACK: 3442 ack = B_FALSE; 3443 break; 3444 default: 3445 continue; 3446 } 3447 3448 idx = dmsg.start_idx; 3449 if (idx >= vdc->dring_len) { 3450 DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n", 3451 vdc->instance, idx); 3452 continue; 3453 } 3454 ldep = &vdc->local_dring[idx]; 3455 if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { 3456 DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n", 3457 vdc->instance, idx, ldep->dep->hdr.dstate); 3458 continue; 3459 } 3460 3461 mbuf = ldep->buf; 3462 ASSERT(mbuf != NULL); 3463 mbuf->b_resid = mbuf->b_bcount - ldep->dep->payload.nbytes; 3464 bioerror(mbuf, ack ? ldep->dep->payload.status : EIO); 3465 biodone(mbuf); 3466 3467 rv = vdc_depopulate_descriptor(vdc, idx); 3468 if (buf != NULL && buf == mbuf) { 3469 rv = 0; 3470 goto done; 3471 } 3472 3473 /* if this is the last descriptor - break out of loop */ 3474 if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) { 3475 /* 3476 * If we were expecting a response for a particular 3477 * request then we return with an error otherwise we 3478 * have successfully completed the drain. 3479 */ 3480 rv = (buf != NULL)? ESRCH: 0; 3481 break; 3482 } 3483 } 3484 3485 done: 3486 mutex_exit(&vdc->lock); 3487 DMSG(vdc, 0, "End idx=%d\n", idx); 3488 3489 return (rv); 3490 } 3491 3492 3493 /* 3494 * Function: 3495 * vdc_depopulate_descriptor() 3496 * 3497 * Description: 3498 * 3499 * Arguments: 3500 * vdc - soft state pointer for this instance of the device driver. 3501 * idx - Index of the Descriptor Ring entry being modified 3502 * 3503 * Return Code: 3504 * 0 - Success 3505 */ 3506 static int 3507 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) 3508 { 3509 vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ 3510 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3511 int status = ENXIO; 3512 int rv = 0; 3513 3514 ASSERT(vdc != NULL); 3515 ASSERT(idx < vdc->dring_len); 3516 ldep = &vdc->local_dring[idx]; 3517 ASSERT(ldep != NULL); 3518 ASSERT(MUTEX_HELD(&vdc->lock)); 3519 3520 DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep); 3521 DMSG(vdc, 2, ": idx = %d\n", idx); 3522 3523 dep = ldep->dep; 3524 ASSERT(dep != NULL); 3525 ASSERT((dep->hdr.dstate == VIO_DESC_DONE) || 3526 (dep->payload.status == ECANCELED)); 3527 3528 VDC_MARK_DRING_ENTRY_FREE(vdc, idx); 3529 3530 ldep->is_free = B_TRUE; 3531 status = dep->payload.status; 3532 DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status); 3533 3534 /* 3535 * If no buffers were used to transfer information to the server when 3536 * populating the descriptor then no memory handles need to be unbound 3537 * and we can return now. 3538 */ 3539 if (ldep->nbytes == 0) { 3540 cv_signal(&vdc->dring_free_cv); 3541 return (status); 3542 } 3543 3544 /* 3545 * If the upper layer passed in a misaligned address we copied the 3546 * data into an aligned buffer before sending it to LDC - we now 3547 * copy it back to the original buffer. 
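	 * Only payload.nbytes bytes (the count reported back by the
	 * server) are copied out of the aligned buffer allocated by
	 * vdc_populate_mem_hdl().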
	 */
	if (ldep->align_addr) {
		ASSERT(ldep->addr != NULL);

		if (dep->payload.nbytes > 0)
			bcopy(ldep->align_addr, ldep->addr,
			    dep->payload.nbytes);
		kmem_free(ldep->align_addr,
		    sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8));
		ldep->align_addr = NULL;
	}

	rv = ldc_mem_unbind_handle(ldep->desc_mhdl);
	if (rv != 0) {
		DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)",
		    vdc->instance, ldep->desc_mhdl, idx, rv);
		/*
		 * The error returned by the vDisk server is more informative
		 * and thus has a higher priority but if it isn't set we ensure
		 * that this function returns an error.
		 */
		if (status == 0)
			status = EINVAL;
	}

	cv_signal(&vdc->membind_cv);
	cv_signal(&vdc->dring_free_cv);

	return (status);
}

/*
 * Function:
 *	vdc_populate_mem_hdl()
 *
 * Description:
 *	Bind the data buffer of a request to the LDC memory handle of its
 *	descriptor ring entry, copying misaligned buffers into an aligned
 *	bounce buffer first.
 *
 * Arguments:
 *	vdcp	- soft state pointer for this instance of the device driver.
 *	ldep	- pointer to the local descriptor ring entry to populate;
 *		  its addr, nbytes, dir and desc_mhdl fields describe the
 *		  buffer being mapped in and the handle to bind it to.
 *
 * Return Code:
 *	0	- Success
 *	EAGAIN	- Failed to bind the buffer to the LDC memory handle
 */
static int
vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep)
{
	vd_dring_entry_t	*dep = NULL;
	ldc_mem_handle_t	mhdl;
	caddr_t			vaddr;
	size_t			nbytes;
	uint8_t			perm = LDC_MEM_RW;
	uint8_t			maptype;
	int			rv = 0;
	int			i;

	ASSERT(vdcp != NULL);

	dep = ldep->dep;
	mhdl = ldep->desc_mhdl;

	switch (ldep->dir) {
	case VIO_read_dir:
		perm = LDC_MEM_W;
		break;

	case VIO_write_dir:
		perm = LDC_MEM_R;
		break;

	case VIO_both_dir:
		perm = LDC_MEM_RW;
		break;

	default:
		ASSERT(0);	/* catch bad programming in vdc */
	}

	/*
	 * LDC expects any addresses passed in to be 8-byte aligned. We need
	 * to copy the contents of any misaligned buffers to a newly allocated
	 * buffer and bind it instead (and copy the contents back to the
	 * original buffer passed in when depopulating the descriptor)
	 */
	vaddr = ldep->addr;
	nbytes = ldep->nbytes;
	if (((uint64_t)vaddr & 0x7) != 0) {
		ASSERT(ldep->align_addr == NULL);
		ldep->align_addr =
		    kmem_alloc(sizeof (caddr_t) *
		    P2ROUNDUP(nbytes, 8), KM_SLEEP);
		DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating "
		    "(buf=%p nb=%ld op=%d)\n",
		    vdcp->instance, (void *)vaddr, (void *)ldep->align_addr,
		    nbytes, ldep->operation);
		if (perm != LDC_MEM_W)
			bcopy(vaddr, ldep->align_addr, nbytes);
		vaddr = ldep->align_addr;
	}

	maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP;
	rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
	    maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies);
	DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n",
	    vdcp->instance, dep->payload.ncookies);
	if (rv != 0) {
		DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle "
		    "(mhdl=%p, buf=%p, err=%d)\n",
		    vdcp->instance, (void *)mhdl, (void *)vaddr, rv);
		if (ldep->align_addr) {
			kmem_free(ldep->align_addr,
			    sizeof (caddr_t) * P2ROUNDUP(nbytes, 8));
			ldep->align_addr = NULL;
		}
		return (EAGAIN);
	}

	/*
	 * Get the other cookies (if any).
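	 * ldc_mem_bind_handle() returned cookie[0]; the remaining cookies,
	 * if the buffer spans multiple pages, are fetched one at a time
	 * with ldc_mem_nextcookie().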
3670 */ 3671 for (i = 1; i < dep->payload.ncookies; i++) { 3672 rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]); 3673 if (rv != 0) { 3674 (void) ldc_mem_unbind_handle(mhdl); 3675 DMSG(vdcp, 0, "?[%d] Failed to get next cookie " 3676 "(mhdl=%lx cnum=%d), err=%d", 3677 vdcp->instance, mhdl, i, rv); 3678 if (ldep->align_addr) { 3679 kmem_free(ldep->align_addr, 3680 sizeof (caddr_t) * ldep->nbytes); 3681 ldep->align_addr = NULL; 3682 } 3683 return (EAGAIN); 3684 } 3685 } 3686 3687 return (rv); 3688 } 3689 3690 /* 3691 * Interrupt handlers for messages from LDC 3692 */ 3693 3694 /* 3695 * Function: 3696 * vdc_handle_cb() 3697 * 3698 * Description: 3699 * 3700 * Arguments: 3701 * event - Type of event (LDC_EVT_xxx) that triggered the callback 3702 * arg - soft state pointer for this instance of the device driver. 3703 * 3704 * Return Code: 3705 * 0 - Success 3706 */ 3707 static uint_t 3708 vdc_handle_cb(uint64_t event, caddr_t arg) 3709 { 3710 ldc_status_t ldc_state; 3711 int rv = 0; 3712 vdc_server_t *srvr = (vdc_server_t *)(void *)arg; 3713 vdc_t *vdc = srvr->vdcp; 3714 3715 ASSERT(vdc != NULL); 3716 3717 DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num); 3718 3719 /* If callback is not for the current server, ignore it */ 3720 mutex_enter(&vdc->lock); 3721 3722 if (vdc->curr_server != srvr) { 3723 DMSG(vdc, 0, "[%d] Ignoring event 0x%lx for port@%ld\n", 3724 vdc->instance, event, srvr->id); 3725 mutex_exit(&vdc->lock); 3726 return (LDC_SUCCESS); 3727 } 3728 3729 /* 3730 * Depending on the type of event that triggered this callback, 3731 * we modify the handshake state or read the data. 3732 * 3733 * NOTE: not done as a switch() as event could be triggered by 3734 * a state change and a read request. Also the ordering of the 3735 * check for the event types is deliberate. 3736 */ 3737 if (event & LDC_EVT_UP) { 3738 DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance); 3739 3740 /* get LDC state */ 3741 rv = ldc_status(srvr->ldc_handle, &ldc_state); 3742 if (rv != 0) { 3743 DMSG(vdc, 0, "[%d] Couldn't get LDC status %d", 3744 vdc->instance, rv); 3745 mutex_exit(&vdc->lock); 3746 return (LDC_SUCCESS); 3747 } 3748 if (srvr->ldc_state != LDC_UP && 3749 ldc_state == LDC_UP) { 3750 /* 3751 * Reset the transaction sequence numbers when 3752 * LDC comes up. We then kick off the handshake 3753 * negotiation with the vDisk server. 3754 */ 3755 vdc->seq_num = 1; 3756 vdc->seq_num_reply = 0; 3757 vdc->io_pending = B_TRUE; 3758 srvr->ldc_state = ldc_state; 3759 cv_signal(&vdc->initwait_cv); 3760 cv_signal(&vdc->io_pending_cv); 3761 } 3762 } 3763 3764 if (event & LDC_EVT_READ) { 3765 DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance); 3766 mutex_enter(&vdc->read_lock); 3767 cv_signal(&vdc->read_cv); 3768 vdc->read_state = VDC_READ_PENDING; 3769 mutex_exit(&vdc->read_lock); 3770 mutex_exit(&vdc->lock); 3771 3772 /* that's all we have to do - no need to handle DOWN/RESET */ 3773 return (LDC_SUCCESS); 3774 } 3775 3776 if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) { 3777 3778 DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance); 3779 3780 /* 3781 * Need to wake up any readers so they will 3782 * detect that a reset has occurred. 
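		 * (vdc_recv() sees VDC_READ_RESET and returns ECONNRESET
		 * to its caller)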
3783 */ 3784 mutex_enter(&vdc->read_lock); 3785 if ((vdc->read_state == VDC_READ_WAITING) || 3786 (vdc->read_state == VDC_READ_RESET)) 3787 cv_signal(&vdc->read_cv); 3788 vdc->read_state = VDC_READ_RESET; 3789 mutex_exit(&vdc->read_lock); 3790 3791 /* wake up any threads waiting for connection to come up */ 3792 if (vdc->state == VDC_STATE_INIT_WAITING) { 3793 vdc->state = VDC_STATE_RESETTING; 3794 cv_signal(&vdc->initwait_cv); 3795 } else if (vdc->state == VDC_STATE_FAILED) { 3796 vdc->io_pending = B_TRUE; 3797 cv_signal(&vdc->io_pending_cv); 3798 } 3799 3800 } 3801 3802 mutex_exit(&vdc->lock); 3803 3804 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) 3805 DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received", 3806 vdc->instance, event); 3807 3808 return (LDC_SUCCESS); 3809 } 3810 3811 /* 3812 * Function: 3813 * vdc_wait_for_response() 3814 * 3815 * Description: 3816 * Block waiting for a response from the server. If there is 3817 * no data the thread block on the read_cv that is signalled 3818 * by the callback when an EVT_READ occurs. 3819 * 3820 * Arguments: 3821 * vdcp - soft state pointer for this instance of the device driver. 3822 * 3823 * Return Code: 3824 * 0 - Success 3825 */ 3826 static int 3827 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp) 3828 { 3829 size_t nbytes = sizeof (*msgp); 3830 int status; 3831 3832 ASSERT(vdcp != NULL); 3833 3834 DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance); 3835 3836 status = vdc_recv(vdcp, msgp, &nbytes); 3837 DMSG(vdcp, 3, "vdc_read() done.. status=0x%x size=0x%x\n", 3838 status, (int)nbytes); 3839 if (status) { 3840 DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n", 3841 vdcp->instance, status); 3842 return (status); 3843 } 3844 3845 if (nbytes < sizeof (vio_msg_tag_t)) { 3846 DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n", 3847 vdcp->instance, sizeof (vio_msg_tag_t), nbytes); 3848 return (ENOMSG); 3849 } 3850 3851 DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance, 3852 msgp->tag.vio_msgtype, 3853 msgp->tag.vio_subtype, 3854 msgp->tag.vio_subtype_env); 3855 3856 /* 3857 * Verify the Session ID of the message 3858 * 3859 * Every message after the Version has been negotiated should 3860 * have the correct session ID set. 3861 */ 3862 if ((msgp->tag.vio_sid != vdcp->session_id) && 3863 (msgp->tag.vio_subtype_env != VIO_VER_INFO)) { 3864 DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, " 3865 "expected 0x%lx [seq num %lx @ %d]", 3866 vdcp->instance, msgp->tag.vio_sid, 3867 vdcp->session_id, 3868 ((vio_dring_msg_t *)msgp)->seq_num, 3869 ((vio_dring_msg_t *)msgp)->start_idx); 3870 return (ENOMSG); 3871 } 3872 return (0); 3873 } 3874 3875 3876 /* 3877 * Function: 3878 * vdc_resubmit_backup_dring() 3879 * 3880 * Description: 3881 * Resubmit each descriptor in the backed up dring to 3882 * vDisk server. The Dring was backed up during connection 3883 * reset. 3884 * 3885 * Arguments: 3886 * vdcp - soft state pointer for this instance of the device driver. 
3887 * 3888 * Return Code: 3889 * 0 - Success 3890 */ 3891 static int 3892 vdc_resubmit_backup_dring(vdc_t *vdcp) 3893 { 3894 int processed = 0; 3895 int count; 3896 int b_idx; 3897 int rv = 0; 3898 int dring_size; 3899 vdc_local_desc_t *curr_ldep; 3900 3901 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3902 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3903 3904 if (vdcp->local_dring_backup == NULL) { 3905 /* the pending requests have already been processed */ 3906 return (0); 3907 } 3908 3909 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3910 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3911 3912 /* 3913 * Walk the backup copy of the local descriptor ring and 3914 * resubmit all the outstanding transactions. 3915 */ 3916 b_idx = vdcp->local_dring_backup_tail; 3917 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3918 3919 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3920 3921 /* only resubmit outstanding transactions */ 3922 if (!curr_ldep->is_free) { 3923 3924 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3925 3926 rv = vdc_do_op(vdcp, curr_ldep->operation, 3927 curr_ldep->addr, curr_ldep->nbytes, 3928 curr_ldep->slice, curr_ldep->offset, 3929 curr_ldep->buf, curr_ldep->dir, 3930 curr_ldep->flags & ~VDC_OP_STATE_RUNNING); 3931 3932 if (rv) { 3933 DMSG(vdcp, 1, "[%d] resubmit entry %d failed\n", 3934 vdcp->instance, b_idx); 3935 goto done; 3936 } 3937 3938 /* 3939 * Mark this entry as free so that we will not resubmit 3940 * this "done" request again, if we were to use the same 3941 * backup_dring again in future. This could happen when 3942 * a reset happens while processing the backup_dring. 3943 */ 3944 curr_ldep->is_free = B_TRUE; 3945 processed++; 3946 } 3947 3948 /* get the next element to submit */ 3949 if (++b_idx >= vdcp->local_dring_backup_len) 3950 b_idx = 0; 3951 } 3952 3953 /* all done - now clear up pending dring copy */ 3954 dring_size = vdcp->local_dring_backup_len * 3955 sizeof (vdcp->local_dring_backup[0]); 3956 3957 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3958 3959 vdcp->local_dring_backup = NULL; 3960 3961 done: 3962 DTRACE_PROBE2(processed, int, processed, vdc_t *, vdcp); 3963 3964 return (rv); 3965 } 3966 3967 /* 3968 * Function: 3969 * vdc_cancel_backup_dring 3970 * 3971 * Description: 3972 * Cancel each descriptor in the backed up dring to vDisk server. 3973 * The Dring was backed up during connection reset. 3974 * 3975 * Arguments: 3976 * vdcp - soft state pointer for this instance of the device driver. 3977 * 3978 * Return Code: 3979 * None 3980 */ 3981 void 3982 vdc_cancel_backup_dring(vdc_t *vdcp) 3983 { 3984 vdc_local_desc_t *ldep; 3985 struct buf *bufp; 3986 int count; 3987 int b_idx; 3988 int dring_size; 3989 int cancelled = 0; 3990 3991 ASSERT(MUTEX_HELD(&vdcp->lock)); 3992 ASSERT(vdcp->state == VDC_STATE_FAILED); 3993 3994 if (vdcp->local_dring_backup == NULL) { 3995 /* the pending requests have already been processed */ 3996 return; 3997 } 3998 3999 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n", 4000 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 4001 4002 /* 4003 * Walk the backup copy of the local descriptor ring and 4004 * cancel all the outstanding transactions. 
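	 * Cancelling here means failing each associated buf with EIO so
	 * that any thread in biowait() is released; nothing is sent to
	 * the server.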
4005 */ 4006 b_idx = vdcp->local_dring_backup_tail; 4007 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 4008 4009 ldep = &(vdcp->local_dring_backup[b_idx]); 4010 4011 /* only cancel outstanding transactions */ 4012 if (!ldep->is_free) { 4013 4014 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx); 4015 cancelled++; 4016 4017 /* 4018 * All requests have already been cleared from the 4019 * local descriptor ring and the LDC channel has been 4020 * reset so we will never get any reply for these 4021 * requests. Now we just have to notify threads waiting 4022 * for replies that the request has failed. 4023 */ 4024 bufp = ldep->buf; 4025 ASSERT(bufp != NULL); 4026 bufp->b_resid = bufp->b_bcount; 4027 if (ldep->operation == VD_OP_BREAD || 4028 ldep->operation == VD_OP_BWRITE) { 4029 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4030 VD_KSTAT_RUNQ_EXIT(vdcp); 4031 DTRACE_IO1(done, buf_t *, bufp); 4032 } 4033 bioerror(bufp, EIO); 4034 biodone(bufp); 4035 } 4036 4037 /* get the next element to cancel */ 4038 if (++b_idx >= vdcp->local_dring_backup_len) 4039 b_idx = 0; 4040 } 4041 4042 /* all done - now clear up pending dring copy */ 4043 dring_size = vdcp->local_dring_backup_len * 4044 sizeof (vdcp->local_dring_backup[0]); 4045 4046 (void) kmem_free(vdcp->local_dring_backup, dring_size); 4047 4048 vdcp->local_dring_backup = NULL; 4049 4050 DTRACE_PROBE2(cancelled, int, cancelled, vdc_t *, vdcp); 4051 } 4052 4053 /* 4054 * Function: 4055 * vdc_connection_timeout 4056 * 4057 * Description: 4058 * This function is invoked if the timeout set to establish the connection 4059 * with vds expires. This will happen if we spend too much time in the 4060 * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. 4061 * 4062 * If the timeout does not expire, it will be cancelled when we reach the 4063 * VDC_STATE_HANDLE_PENDING, VDC_STATE_FAILED or VDC_STATE_DETACH state. 4064 * This function can also be invoked while we are in those states, in 4065 * which case we do nothing because the timeout is being cancelled. 4066 * 4067 * Arguments: 4068 * arg - argument of the timeout function; actually a soft state 4069 * pointer for the instance of the device driver. 4070 * 4071 * Return Code: 4072 * None 4073 */ 4074 void 4075 vdc_connection_timeout(void *arg) 4076 { 4077 vdc_t *vdcp = (vdc_t *)arg; 4078 4079 mutex_enter(&vdcp->lock); 4080 4081 if (vdcp->state == VDC_STATE_HANDLE_PENDING || 4082 vdcp->state == VDC_STATE_DETACH || 4083 vdcp->state == VDC_STATE_FAILED) { 4084 /* 4085 * The connection has just been re-established, has failed or 4086 * we are detaching. 4087 */ 4088 vdcp->ctimeout_reached = B_FALSE; 4089 } else { 4090 vdcp->ctimeout_reached = B_TRUE; 4091 } 4092 4093 mutex_exit(&vdcp->lock); 4094 } 4095 4096 /* 4097 * Function: 4098 * vdc_backup_local_dring() 4099 * 4100 * Description: 4101 * Backup the current dring in the event of a reset. The dring 4102 * transactions will be resubmitted to the server when the 4103 * connection is restored. 4104 * 4105 * Arguments: 4106 * vdcp - soft state pointer for this instance of the device driver. 4107 * 4108 * Return Code: 4109 * None 4110 */ 4111 static void 4112 vdc_backup_local_dring(vdc_t *vdcp) 4113 { 4114 int dring_size; 4115 4116 ASSERT(MUTEX_HELD(&vdcp->lock)); 4117 ASSERT(vdcp->state == VDC_STATE_RESETTING); 4118 4119 /* 4120 * If the backup dring is still around, it means 4121 * that the last restore did not complete. However, 4122 * since we never got back into the running state, 4123 * the backup copy we have is still valid.
4124 */ 4125 if (vdcp->local_dring_backup != NULL) { 4126 DMSG(vdcp, 1, "reusing local descriptor ring backup " 4127 "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len, 4128 vdcp->local_dring_backup_tail); 4129 return; 4130 } 4131 4132 /* 4133 * The backup dring can be NULL and the local dring may not be 4134 * initialized. This can happen if we had a reset while establishing 4135 * a new connection but after the connection has timed out. In that 4136 * case the backup dring is NULL because the outstanding requests 4137 * have been cancelled, and the local dring may be uninitialized 4138 * because the reset occurred before it was set up. 4139 */ 4140 if (!(vdcp->initialized & VDC_DRING_LOCAL)) 4141 return; 4142 4143 DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, " 4144 "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx); 4145 4146 dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]); 4147 4148 vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP); 4149 bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size); 4150 4151 vdcp->local_dring_backup_tail = vdcp->dring_curr_idx; 4152 vdcp->local_dring_backup_len = vdcp->dring_len; 4153 } 4154 4155 static void 4156 vdc_switch_server(vdc_t *vdcp) 4157 { 4158 int rv; 4159 vdc_server_t *curr_server, *new_server; 4160 4161 ASSERT(MUTEX_HELD(&vdcp->lock)); 4162 4163 /* if there is only one server, there is nothing to switch to */ 4164 if (vdcp->num_servers == 1) { 4165 return; 4166 } 4167 4168 /* Get current and next server */ 4169 curr_server = vdcp->curr_server; 4170 new_server = 4171 (curr_server->next) ? curr_server->next : vdcp->server_list; 4172 ASSERT(curr_server != new_server); 4173 4174 /* bring current server's channel down */ 4175 rv = ldc_down(curr_server->ldc_handle); 4176 if (rv) { 4177 DMSG(vdcp, 0, "[%d] Cannot bring channel down, port %ld\n", 4178 vdcp->instance, curr_server->id); 4179 return; 4180 } 4181 4182 /* switch the server */ 4183 vdcp->curr_server = new_server; 4184 4185 DMSG(vdcp, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n", 4186 vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id); 4187 } 4188 4189 static void 4190 vdc_print_svc_status(vdc_t *vdcp) 4191 { 4192 int instance; 4193 uint64_t ldc_id, port_id; 4194 vdc_service_state_t svc_state; 4195 4196 ASSERT(mutex_owned(&vdcp->lock)); 4197 4198 svc_state = vdcp->curr_server->svc_state; 4199 4200 if (vdcp->curr_server->log_state == svc_state) 4201 return; 4202 4203 instance = vdcp->instance; 4204 ldc_id = vdcp->curr_server->ldc_id; 4205 port_id = vdcp->curr_server->id; 4206 4207 switch (svc_state) { 4208 4209 case VDC_SERVICE_OFFLINE: 4210 cmn_err(CE_CONT, "?vdisk@%d is offline\n", instance); 4211 break; 4212 4213 case VDC_SERVICE_CONNECTED: 4214 cmn_err(CE_CONT, "?vdisk@%d is connected using ldc@%ld,%ld\n", 4215 instance, ldc_id, port_id); 4216 break; 4217 4218 case VDC_SERVICE_ONLINE: 4219 cmn_err(CE_CONT, "?vdisk@%d is online using ldc@%ld,%ld\n", 4220 instance, ldc_id, port_id); 4221 break; 4222 4223 case VDC_SERVICE_FAILED: 4224 cmn_err(CE_CONT, "?vdisk@%d access to service failed " 4225 "using ldc@%ld,%ld\n", instance, ldc_id, port_id); 4226 break; 4227 4228 case VDC_SERVICE_FAULTED: 4229 cmn_err(CE_CONT, "?vdisk@%d access to backend failed " 4230 "using ldc@%ld,%ld\n", instance, ldc_id, port_id); 4231 break; 4232 4233 default: 4234 ASSERT(0); 4235 break; 4236 } 4237 4238 vdcp->curr_server->log_state = svc_state; 4239 } 4240 4241 /* -------------------------------------------------------------------------- */ 4242 4243 /* 4244 * The following functions process the
incoming messages from vds 4245 */ 4246 4247 /* 4248 * Function: 4249 * vdc_process_msg_thread() 4250 * 4251 * Description: 4252 * 4253 * Main VDC message processing thread. Each vDisk instance 4254 * has its own copy of this thread. The thread triggers 4255 * all the handshakes and data exchange with the server. It 4256 * also handles all channel resets. 4257 * 4258 * Arguments: 4259 * vdc - soft state pointer for this instance of the device driver. 4260 * 4261 * Return Code: 4262 * None 4263 */ 4264 static void 4265 vdc_process_msg_thread(vdc_t *vdcp) 4266 { 4267 int status; 4268 int ctimeout; 4269 timeout_id_t tmid = 0; 4270 clock_t ldcup_timeout = 0; 4271 vdc_server_t *srvr; 4272 vdc_service_state_t svc_state; 4273 4274 mutex_enter(&vdcp->lock); 4275 4276 for (;;) { 4277 4278 #define Q(_s) (vdcp->state == _s) ? #_s : 4279 DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state, 4280 Q(VDC_STATE_INIT) 4281 Q(VDC_STATE_INIT_WAITING) 4282 Q(VDC_STATE_NEGOTIATE) 4283 Q(VDC_STATE_HANDLE_PENDING) 4284 Q(VDC_STATE_FAULTED) 4285 Q(VDC_STATE_FAILED) 4286 Q(VDC_STATE_RUNNING) 4287 Q(VDC_STATE_RESETTING) 4288 Q(VDC_STATE_DETACH) 4289 "UNKNOWN"); 4290 4291 switch (vdcp->state) { 4292 case VDC_STATE_INIT: 4293 4294 /* 4295 * If requested, start a timeout to check if the 4296 * connection with vds is established in the 4297 * specified delay. If the timeout expires, we 4298 * will cancel any pending request. 4299 * 4300 * If a reset has occurred while establishing 4301 * the connection, we already have a timeout armed 4302 * and in that case we don't need to arm a new one. 4303 * 4304 * The same rule applies when there are multiple vds servers. 4305 * If either a connection cannot be established or 4306 * the handshake times out, the connection thread will 4307 * try another server. The 'ctimeout' will report 4308 * back an error after it expires irrespective of 4309 * whether the vdisk is trying to connect to just 4310 * one or multiple servers. 4311 */ 4312 ctimeout = (vdc_timeout != 0)? 4313 vdc_timeout : vdcp->curr_server->ctimeout; 4314 4315 if (ctimeout != 0 && tmid == 0) { 4316 tmid = timeout(vdc_connection_timeout, vdcp, 4317 ctimeout * drv_usectohz(MICROSEC)); 4318 } 4319 4320 /* Switch to STATE_DETACH if drv is detaching */ 4321 if (vdcp->lifecycle == VDC_LC_DETACHING) { 4322 vdcp->state = VDC_STATE_DETACH; 4323 break; 4324 } 4325 4326 /* Check if the timeout has been reached */ 4327 if (vdcp->ctimeout_reached) { 4328 ASSERT(tmid != 0); 4329 tmid = 0; 4330 vdcp->state = VDC_STATE_FAILED; 4331 break; 4332 } 4333 4334 /* Check if we are re-initializing repeatedly */ 4335 if (vdcp->hshake_cnt > vdc_hshake_retries && 4336 vdcp->lifecycle != VDC_LC_ONLINE) { 4337 4338 DMSG(vdcp, 0, "[%d] too many handshakes, cnt=%d", 4339 vdcp->instance, vdcp->hshake_cnt); 4340 vdcp->state = VDC_STATE_FAILED; 4341 break; 4342 } 4343 4344 /* Switch server */ 4345 if (vdcp->hshake_cnt > 0) 4346 vdc_switch_server(vdcp); 4347 vdcp->hshake_cnt++; 4348 4349 /* Bring up connection with vds via LDC */ 4350 status = vdc_start_ldc_connection(vdcp); 4351 if (status != EINVAL) { 4352 vdcp->state = VDC_STATE_INIT_WAITING; 4353 } else { 4354 vdcp->curr_server->svc_state = 4355 VDC_SERVICE_FAILED; 4356 vdc_print_svc_status(vdcp); 4357 } 4358 break; 4359 4360 case VDC_STATE_INIT_WAITING: 4361 4362 /* if channel is UP, start negotiation */ 4363 if (vdcp->curr_server->ldc_state == LDC_UP) { 4364 vdcp->state = VDC_STATE_NEGOTIATE; 4365 break; 4366 } 4367 4368 /* 4369 * Wait for LDC_UP.
If it times out and we have multiple 4370 * servers then we will retry using a different server. 4371 */ 4372 ldcup_timeout = ddi_get_lbolt() + (vdc_ldcup_timeout * 4373 drv_usectohz(MICROSEC)); 4374 status = cv_timedwait(&vdcp->initwait_cv, &vdcp->lock, 4375 ldcup_timeout); 4376 if (status == -1 && 4377 vdcp->state == VDC_STATE_INIT_WAITING && 4378 vdcp->curr_server->ldc_state != LDC_UP) { 4379 /* timed out & still waiting */ 4380 vdcp->curr_server->svc_state = 4381 VDC_SERVICE_FAILED; 4382 vdc_print_svc_status(vdcp); 4383 vdcp->state = VDC_STATE_INIT; 4384 break; 4385 } 4386 4387 if (vdcp->state != VDC_STATE_INIT_WAITING) { 4388 DMSG(vdcp, 0, 4389 "state moved to %d out from under us...\n", 4390 vdcp->state); 4391 } 4392 break; 4393 4394 case VDC_STATE_NEGOTIATE: 4395 switch (status = vdc_ver_negotiation(vdcp)) { 4396 case 0: 4397 break; 4398 default: 4399 DMSG(vdcp, 0, "ver negotiate failed (%d)..\n", 4400 status); 4401 goto reset; 4402 } 4403 4404 switch (status = vdc_attr_negotiation(vdcp)) { 4405 case 0: 4406 break; 4407 default: 4408 DMSG(vdcp, 0, "attr negotiate failed (%d)..\n", 4409 status); 4410 goto reset; 4411 } 4412 4413 switch (status = vdc_dring_negotiation(vdcp)) { 4414 case 0: 4415 break; 4416 default: 4417 DMSG(vdcp, 0, "dring negotiate failed (%d)..\n", 4418 status); 4419 goto reset; 4420 } 4421 4422 switch (status = vdc_rdx_exchange(vdcp)) { 4423 case 0: 4424 vdcp->state = VDC_STATE_HANDLE_PENDING; 4425 goto done; 4426 default: 4427 DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n", 4428 status); 4429 goto reset; 4430 } 4431 reset: 4432 DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n", 4433 status); 4434 vdcp->state = VDC_STATE_RESETTING; 4435 vdcp->self_reset = B_TRUE; 4436 vdcp->curr_server->svc_state = VDC_SERVICE_FAILED; 4437 vdc_print_svc_status(vdcp); 4438 done: 4439 DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", 4440 vdcp->state); 4441 break; 4442 4443 case VDC_STATE_HANDLE_PENDING: 4444 4445 DMSG(vdcp, 0, "[%d] connection to service domain is up", 4446 vdcp->instance); 4447 vdcp->curr_server->svc_state = VDC_SERVICE_CONNECTED; 4448 4449 mutex_exit(&vdcp->lock); 4450 4451 /* 4452 * If we have multiple servers, check that the backend 4453 * is effectively available before resubmitting any IO. 4454 */ 4455 if (vdcp->num_servers > 1 && 4456 vdc_eio_check(vdcp, 0) != 0) { 4457 mutex_enter(&vdcp->lock); 4458 vdcp->curr_server->svc_state = 4459 VDC_SERVICE_FAULTED; 4460 vdcp->state = VDC_STATE_FAULTED; 4461 break; 4462 } 4463 4464 if (tmid != 0) { 4465 (void) untimeout(tmid); 4466 tmid = 0; 4467 vdcp->ctimeout_reached = B_FALSE; 4468 } 4469 4470 /* 4471 * Setup devid 4472 */ 4473 (void) vdc_setup_devid(vdcp); 4474 4475 status = vdc_resubmit_backup_dring(vdcp); 4476 4477 mutex_enter(&vdcp->lock); 4478 4479 if (status) { 4480 vdcp->state = VDC_STATE_RESETTING; 4481 vdcp->self_reset = B_TRUE; 4482 vdcp->curr_server->svc_state = 4483 VDC_SERVICE_FAILED; 4484 vdc_print_svc_status(vdcp); 4485 } else { 4486 vdcp->state = VDC_STATE_RUNNING; 4487 } 4488 break; 4489 4490 case VDC_STATE_FAULTED: 4491 /* 4492 * Server is faulted because the backend is unavailable. 4493 * If all servers are faulted then we mark the service 4494 * as failed, otherwise we reset to switch to another 4495 * server. 
4496 */ 4497 vdc_print_svc_status(vdcp); 4498 4499 /* check if all servers are faulted */ 4500 for (srvr = vdcp->server_list; srvr != NULL; 4501 srvr = srvr->next) { 4502 svc_state = srvr->svc_state; 4503 if (svc_state != VDC_SERVICE_FAULTED) 4504 break; 4505 } 4506 4507 if (srvr != NULL) { 4508 vdcp->state = VDC_STATE_RESETTING; 4509 vdcp->self_reset = B_TRUE; 4510 } else { 4511 vdcp->state = VDC_STATE_FAILED; 4512 } 4513 break; 4514 4515 case VDC_STATE_FAILED: 4516 /* 4517 * We reach this state when we are unable to access the 4518 * backend from any server, either because the maximum 4519 * number of connection retries or the connection timeout 4520 * was reached, or because the backend is unavailable. 4521 * 4522 * Then we cancel the backup DRing so that errors get 4523 * reported and we wait for a new I/O before attempting 4524 * another connection. 4525 */ 4526 cmn_err(CE_NOTE, "vdisk@%d disk access failed", 4527 vdcp->instance); 4528 4529 /* cancel any timeout */ 4530 if (tmid != 0) { 4531 (void) untimeout(tmid); 4532 tmid = 0; 4533 } 4534 4535 /* cancel pending I/Os */ 4536 cv_broadcast(&vdcp->running_cv); 4537 vdc_cancel_backup_dring(vdcp); 4538 4539 /* wait for new I/O */ 4540 while (!vdcp->io_pending) 4541 cv_wait(&vdcp->io_pending_cv, &vdcp->lock); 4542 4543 /* 4544 * There's a new IO pending. Try to re-establish a 4545 * connection. Mark all services as offline, so that 4546 * we don't stop again before having retried all 4547 * servers. 4548 */ 4549 for (srvr = vdcp->server_list; srvr != NULL; 4550 srvr = srvr->next) { 4551 srvr->svc_state = VDC_SERVICE_OFFLINE; 4552 } 4553 4554 /* reset variables */ 4555 vdcp->hshake_cnt = 0; 4556 vdcp->ctimeout_reached = B_FALSE; 4557 4558 vdcp->state = VDC_STATE_RESETTING; 4559 vdcp->self_reset = B_TRUE; 4560 break; 4561 4562 /* enter running state */ 4563 case VDC_STATE_RUNNING: 4564 /* 4565 * Signal anyone waiting for the connection 4566 * to come online.
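 *
 *	A minimal sketch of the waiter side of running_cv (illustrative
 *	only, assuming the usual condition-variable pattern; not actual
 *	driver code):
 *
 *		mutex_enter(&vdcp->lock);
 *		while (vdcp->state != VDC_STATE_RUNNING &&
 *		    vdcp->state != VDC_STATE_FAILED &&
 *		    vdcp->state != VDC_STATE_DETACH)
 *			cv_wait(&vdcp->running_cv, &vdcp->lock);
 *		mutex_exit(&vdcp->lock);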
4567 */ 4568 vdcp->hshake_cnt = 0; 4569 cv_broadcast(&vdcp->running_cv); 4570 4571 /* backend has to be checked after reset */ 4572 if (vdcp->failfast_interval != 0 || 4573 vdcp->num_servers > 1) 4574 cv_signal(&vdcp->eio_cv); 4575 4576 /* ownership is lost during reset */ 4577 if (vdcp->ownership & VDC_OWNERSHIP_WANTED) 4578 vdcp->ownership |= VDC_OWNERSHIP_RESET; 4579 cv_signal(&vdcp->ownership_cv); 4580 4581 vdcp->curr_server->svc_state = VDC_SERVICE_ONLINE; 4582 vdc_print_svc_status(vdcp); 4583 4584 mutex_exit(&vdcp->lock); 4585 4586 for (;;) { 4587 vio_msg_t msg; 4588 status = vdc_wait_for_response(vdcp, &msg); 4589 if (status) break; 4590 4591 DMSG(vdcp, 1, "[%d] new pkt(s) available\n", 4592 vdcp->instance); 4593 status = vdc_process_data_msg(vdcp, &msg); 4594 if (status) { 4595 DMSG(vdcp, 1, "[%d] process_data_msg " 4596 "returned err=%d\n", vdcp->instance, 4597 status); 4598 break; 4599 } 4600 4601 } 4602 4603 mutex_enter(&vdcp->lock); 4604 4605 /* all servers are now offline */ 4606 for (srvr = vdcp->server_list; srvr != NULL; 4607 srvr = srvr->next) { 4608 srvr->svc_state = VDC_SERVICE_OFFLINE; 4609 srvr->log_state = VDC_SERVICE_NONE; 4610 } 4611 4612 vdc_print_svc_status(vdcp); 4613 4614 vdcp->state = VDC_STATE_RESETTING; 4615 vdcp->self_reset = B_TRUE; 4616 break; 4617 4618 case VDC_STATE_RESETTING: 4619 /* 4620 * When we reach this state, we either come from the 4621 * VDC_STATE_RUNNING state and we can have pending 4622 * requests but no timeout is armed; or we come from 4623 * the VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE or 4624 * VDC_STATE_HANDLE_PENDING state and there is no pending 4625 * request, or pending requests have already been copied 4626 * into the backup dring. So we can safely keep the 4627 * connection timeout armed while we are in this state. 4628 */ 4629 4630 DMSG(vdcp, 0, "Initiating channel reset " 4631 "(pending = %d)\n", (int)vdcp->threads_pending); 4632 4633 if (vdcp->self_reset) { 4634 DMSG(vdcp, 0, 4635 "[%d] calling stop_ldc_connection.\n", 4636 vdcp->instance); 4637 status = vdc_stop_ldc_connection(vdcp); 4638 vdcp->self_reset = B_FALSE; 4639 } 4640 4641 /* 4642 * Wait for all threads currently waiting 4643 * for a free dring entry. 4644 */ 4645 while (vdcp->threads_pending) { 4646 cv_broadcast(&vdcp->membind_cv); 4647 cv_broadcast(&vdcp->dring_free_cv); 4648 mutex_exit(&vdcp->lock); 4649 /* give the waiters enough time to wake up */ 4650 delay(vdc_hz_min_ldc_delay); 4651 mutex_enter(&vdcp->lock); 4652 } 4653 4654 ASSERT(vdcp->threads_pending == 0); 4655 4656 /* Sanity check that no thread is receiving */ 4657 ASSERT(vdcp->read_state != VDC_READ_WAITING); 4658 4659 vdcp->read_state = VDC_READ_IDLE; 4660 vdcp->io_pending = B_FALSE; 4661 4662 /* 4663 * Cleanup any pending eio. These I/Os are going to 4664 * be resubmitted.
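 *	('eio' here refers to I/Os that were queued for error checking
 *	with vdc_eio_queue(); see vdc_process_data_msg() below.)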
4665 */ 4666 vdc_eio_unqueue(vdcp, 0, B_FALSE); 4667 4668 vdc_backup_local_dring(vdcp); 4669 4670 /* cleanup the old d-ring */ 4671 vdc_destroy_descriptor_ring(vdcp); 4672 4673 /* go and start again */ 4674 vdcp->state = VDC_STATE_INIT; 4675 4676 break; 4677 4678 case VDC_STATE_DETACH: 4679 DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n", 4680 vdcp->instance); 4681 4682 /* cancel any pending timeout */ 4683 mutex_exit(&vdcp->lock); 4684 if (tmid != 0) { 4685 (void) untimeout(tmid); 4686 tmid = 0; 4687 } 4688 mutex_enter(&vdcp->lock); 4689 4690 /* 4691 * Signal anyone waiting for connection 4692 * to come online 4693 */ 4694 cv_broadcast(&vdcp->running_cv); 4695 4696 while (vdcp->sync_op_cnt > 0) { 4697 cv_broadcast(&vdcp->sync_blocked_cv); 4698 mutex_exit(&vdcp->lock); 4699 /* give the waiters enough time to wake up */ 4700 delay(vdc_hz_min_ldc_delay); 4701 mutex_enter(&vdcp->lock); 4702 } 4703 4704 mutex_exit(&vdcp->lock); 4705 4706 DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n", 4707 vdcp->instance); 4708 thread_exit(); 4709 break; 4710 } 4711 } 4712 } 4713 4714 4715 /* 4716 * Function: 4717 * vdc_process_data_msg() 4718 * 4719 * Description: 4720 * This function is called by the message processing thread each time 4721 * a message with a msgtype of VIO_TYPE_DATA is received. It will either 4722 * be an ACK or a NACK from vds[1], which vdc handles as follows. 4723 * ACK - wake up the waiting thread 4724 * NACK - resend any messages necessary 4725 * 4726 * [1] Although the message format allows it, vds should not send a 4727 * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for 4728 * some bizarre reason it does, vdc will reset the connection. 4729 * 4730 * Arguments: 4731 * vdc - soft state pointer for this instance of the device driver. 4732 * msg - the LDC message sent by vds 4733 * 4734 * Return Code: 4735 * 0 - Success. 4736 * > 0 - error value returned by LDC 4737 */ 4738 static int 4739 vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg) 4740 { 4741 int status = 0; 4742 vio_dring_msg_t *dring_msg; 4743 vdc_local_desc_t *ldep = NULL; 4744 int start, end; 4745 int idx; 4746 int op; 4747 4748 dring_msg = (vio_dring_msg_t *)msg; 4749 4750 ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA); 4751 ASSERT(vdcp != NULL); 4752 4753 mutex_enter(&vdcp->lock); 4754 4755 /* 4756 * Check to see if the message has bogus data 4757 */ 4758 idx = start = dring_msg->start_idx; 4759 end = dring_msg->end_idx; 4760 if ((start >= vdcp->dring_len) || 4761 (end >= vdcp->dring_len) || (end < -1)) { 4762 /* 4763 * Update the I/O statistics to indicate that an error occurred. 4764 * No need to update the wait/run queues as no specific read or 4765 * write request is being completed in response to this 'msg'. 4766 */ 4767 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4768 DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n", 4769 vdcp->instance, start, end); 4770 mutex_exit(&vdcp->lock); 4771 return (EINVAL); 4772 } 4773 4774 /* 4775 * Verify that the sequence number is what vdc expects. 4776 */ 4777 switch (vdc_verify_seq_num(vdcp, dring_msg)) { 4778 case VDC_SEQ_NUM_TODO: 4779 break; /* keep processing this message */ 4780 case VDC_SEQ_NUM_SKIP: 4781 mutex_exit(&vdcp->lock); 4782 return (0); 4783 case VDC_SEQ_NUM_INVALID: 4784 /* 4785 * Update the I/O statistics to indicate that an error occurred. 4786 * No need to update the wait/run queues as no specific read or 4787 * write request is being completed in response to this 'msg'.
4788 */ 4789 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4790 DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance); 4791 mutex_exit(&vdcp->lock); 4792 return (ENXIO); 4793 } 4794 4795 if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) { 4796 /* 4797 * Update the I/O statistics to indicate that an error occurred. 4798 * 4799 * We need to update the run queue if a read or write request 4800 * is being NACKed - otherwise there will appear to be an 4801 * indefinite outstanding request and statistics reported by 4802 * iostat(1M) will be incorrect. The transaction will be 4803 * resubmitted from the backup DRing following the reset 4804 * and the wait/run queues will be entered again. 4805 */ 4806 ldep = &vdcp->local_dring[idx]; 4807 op = ldep->operation; 4808 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 4809 DTRACE_IO1(done, buf_t *, ldep->buf); 4810 VD_KSTAT_RUNQ_EXIT(vdcp); 4811 } 4812 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4813 VDC_DUMP_DRING_MSG(dring_msg); 4814 DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance); 4815 mutex_exit(&vdcp->lock); 4816 return (EIO); 4817 4818 } else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) { 4819 /* 4820 * Update the I/O statistics to indicate that an error occurred. 4821 * No need to update the wait/run queues as no specific read or 4822 * write request is being completed in response to this 'msg'. 4823 */ 4824 VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs); 4825 mutex_exit(&vdcp->lock); 4826 return (EPROTO); 4827 } 4828 4829 DMSG(vdcp, 1, ": start %d end %d\n", start, end); 4830 ASSERT(start == end); 4831 4832 ldep = &vdcp->local_dring[idx]; 4833 4834 DMSG(vdcp, 1, ": state 0x%x\n", ldep->dep->hdr.dstate); 4835 4836 if (ldep->dep->hdr.dstate == VIO_DESC_DONE) { 4837 struct buf *bufp; 4838 4839 status = ldep->dep->payload.status; 4840 4841 bufp = ldep->buf; 4842 ASSERT(bufp != NULL); 4843 4844 bufp->b_resid = bufp->b_bcount - ldep->dep->payload.nbytes; 4845 bioerror(bufp, status); 4846 4847 if (status != 0) { 4848 DMSG(vdcp, 1, "I/O status=%d\n", status); 4849 } 4850 4851 DMSG(vdcp, 1, 4852 "I/O complete req=%ld bytes resp=%ld bytes\n", 4853 bufp->b_bcount, ldep->dep->payload.nbytes); 4854 4855 /* 4856 * If the request has failed and we have multiple servers or 4857 * failfast is enabled then we will have to defer the completion 4858 * of the request until we have checked that the vdisk backend 4859 * is effectively available (if multiple servers) or that there 4860 * is no reservation conflict (if failfast). 4861 */ 4862 if (status != 0 && 4863 ((vdcp->num_servers > 1 && 4864 (ldep->flags & VDC_OP_ERRCHK_BACKEND)) || 4865 (vdcp->failfast_interval != 0 && 4866 (ldep->flags & VDC_OP_ERRCHK_CONFLICT)))) { 4867 /* 4868 * The I/O has failed and we need to check the error.
4869 */ 4870 (void) vdc_eio_queue(vdcp, idx); 4871 } else { 4872 op = ldep->operation; 4873 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 4874 if (status == 0) { 4875 VD_UPDATE_IO_STATS(vdcp, op, 4876 ldep->dep->payload.nbytes); 4877 } else { 4878 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4879 } 4880 VD_KSTAT_RUNQ_EXIT(vdcp); 4881 DTRACE_IO1(done, buf_t *, bufp); 4882 } 4883 (void) vdc_depopulate_descriptor(vdcp, idx); 4884 biodone(bufp); 4885 } 4886 } 4887 4888 /* let the arrival signal propagate */ 4889 mutex_exit(&vdcp->lock); 4890 4891 /* probe gives the count of how many entries were processed */ 4892 DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp); 4893 4894 return (0); 4895 } 4896 4897 4898 /* 4899 * Function: 4900 * vdc_handle_ver_msg() 4901 * 4902 * Description: 4903 * Handle a version negotiation message (VIO_VER_INFO) sent by the vDisk server. 4904 * Arguments: 4905 * vdc - soft state pointer for this instance of the device driver. 4906 * ver_msg - LDC message sent by vDisk server 4907 * 4908 * Return Code: 4909 * 0 - Success 4910 */ 4911 static int 4912 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg) 4913 { 4914 int status = 0; 4915 4916 ASSERT(vdc != NULL); 4917 ASSERT(mutex_owned(&vdc->lock)); 4918 4919 if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) { 4920 return (EPROTO); 4921 } 4922 4923 if (ver_msg->dev_class != VDEV_DISK_SERVER) { 4924 return (EINVAL); 4925 } 4926 4927 switch (ver_msg->tag.vio_subtype) { 4928 case VIO_SUBTYPE_ACK: 4929 /* 4930 * We check to see if the version returned is indeed supported 4931 * (the server may have also adjusted the minor number downwards, 4932 * in which case 'ver_msg' will contain the actual version agreed). 4933 */ 4934 if (vdc_is_supported_version(ver_msg)) { 4935 vdc->ver.major = ver_msg->ver_major; 4936 vdc->ver.minor = ver_msg->ver_minor; 4937 ASSERT(vdc->ver.major > 0); 4938 } else { 4939 status = EPROTO; 4940 } 4941 break; 4942 4943 case VIO_SUBTYPE_NACK: 4944 /* 4945 * call vdc_is_supported_version() which will return the next 4946 * supported version (if any) in 'ver_msg' 4947 */ 4948 (void) vdc_is_supported_version(ver_msg); 4949 if (ver_msg->ver_major > 0) { 4950 size_t len = sizeof (*ver_msg); 4951 4952 ASSERT(vdc->ver.major > 0); 4953 4954 /* reset the necessary fields and resend */ 4955 ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO; 4956 ver_msg->dev_class = VDEV_DISK; 4957 4958 status = vdc_send(vdc, (caddr_t)ver_msg, &len); 4959 DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n", 4960 vdc->instance, status); 4961 if (len != sizeof (*ver_msg)) 4962 status = EBADMSG; 4963 } else { 4964 DMSG(vdc, 0, "[%d] No common version with vDisk server", 4965 vdc->instance); 4966 status = ENOTSUP; 4967 } 4968 4969 break; 4970 case VIO_SUBTYPE_INFO: 4971 /* 4972 * Handle the case where vds starts the handshake 4973 * (for now only vdc is the instigator) 4974 */ 4975 status = ENOTSUP; 4976 break; 4977 4978 default: 4979 status = EINVAL; 4980 break; 4981 } 4982 4983 return (status); 4984 } 4985 4986 /* 4987 * Function: 4988 * vdc_handle_attr_msg() 4989 * 4990 * Description: 4991 * Handle an attribute negotiation message (VIO_ATTR_INFO) sent by the vDisk server. 4992 * Arguments: 4993 * vdc - soft state pointer for this instance of the device driver.
4994 * attr_msg - LDC message sent by vDisk server 4995 * 4996 * Return Code: 4997 * 0 - Success 4998 */ 4999 static int 5000 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg) 5001 { 5002 int status = 0; 5003 vd_disk_type_t old_type; 5004 5005 ASSERT(vdc != NULL); 5006 ASSERT(mutex_owned(&vdc->lock)); 5007 5008 if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) { 5009 return (EPROTO); 5010 } 5011 5012 switch (attr_msg->tag.vio_subtype) { 5013 case VIO_SUBTYPE_ACK: 5014 /* 5015 * We now verify the attributes sent by vds. 5016 */ 5017 if (attr_msg->vdisk_size == 0) { 5018 DMSG(vdc, 0, "[%d] Invalid disk size from vds", 5019 vdc->instance); 5020 status = EINVAL; 5021 break; 5022 } 5023 5024 if (attr_msg->max_xfer_sz == 0) { 5025 DMSG(vdc, 0, "[%d] Invalid transfer size from vds", 5026 vdc->instance); 5027 status = EINVAL; 5028 break; 5029 } 5030 5031 if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) { 5032 DMSG(vdc, 0, "[%d] Unknown disk size from vds", 5033 vdc->instance); 5034 attr_msg->vdisk_size = 0; 5035 } 5036 5037 /* update the VIO block size */ 5038 if (attr_msg->vdisk_block_size > 0 && 5039 vdc_update_vio_bsize(vdc, 5040 attr_msg->vdisk_block_size) != 0) { 5041 DMSG(vdc, 0, "[%d] Invalid block size (%u) from vds", 5042 vdc->instance, attr_msg->vdisk_block_size); 5043 status = EINVAL; 5044 break; 5045 } 5046 5047 /* update disk, block and transfer sizes */ 5048 old_type = vdc->vdisk_type; 5049 vdc_update_size(vdc, attr_msg->vdisk_size, 5050 attr_msg->vdisk_block_size, attr_msg->max_xfer_sz); 5051 vdc->vdisk_type = attr_msg->vdisk_type; 5052 vdc->operations = attr_msg->operations; 5053 if (vio_ver_is_supported(vdc->ver, 1, 1)) 5054 vdc->vdisk_media = attr_msg->vdisk_media; 5055 else 5056 vdc->vdisk_media = 0; 5057 5058 DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n", 5059 vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz); 5060 DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n", 5061 vdc->instance, vdc->vdisk_bsize, 5062 attr_msg->vdisk_block_size); 5063 5064 if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) || 5065 (attr_msg->vdisk_size > INT64_MAX) || 5066 (attr_msg->operations == 0) || 5067 (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) { 5068 DMSG(vdc, 0, "[%d] Invalid attributes from vds", 5069 vdc->instance); 5070 status = EINVAL; 5071 break; 5072 } 5073 5074 /* 5075 * Now that we have received all attributes we can create a 5076 * fake geometry for the disk. 5077 */ 5078 vdc_create_fake_geometry(vdc); 5079 5080 /* 5081 * If the disk type was previously unknown and device nodes 5082 * were created then the driver would have created 8 device 5083 * nodes. If we now find out that this is a single-slice disk 5084 * then we need to re-create the appropriate device nodes. 5085 */ 5086 if (old_type == VD_DISK_TYPE_UNK && 5087 (vdc->initialized & VDC_MINOR) && 5088 vdc->vdisk_type == VD_DISK_TYPE_SLICE) { 5089 ddi_remove_minor_node(vdc->dip, NULL); 5090 (void) devfs_clean(ddi_get_parent(vdc->dip), 5091 NULL, DV_CLEAN_FORCE); 5092 if (vdc_create_device_nodes(vdc) != 0) { 5093 DMSG(vdc, 0, "![%d] Failed to update " 5094 "device nodes", vdc->instance); 5095 } 5096 } 5097 5098 break; 5099 5100 case VIO_SUBTYPE_NACK: 5101 /* 5102 * vds could not handle the attributes we sent so we 5103 * stop negotiating. 
5104 */ 5105 status = EPROTO; 5106 break; 5107 5108 case VIO_SUBTYPE_INFO: 5109 /* 5110 * Handle the case where vds starts the handshake 5111 * (for now, vdc is the only supported instigator) 5112 */ 5113 status = ENOTSUP; 5114 break; 5115 5116 default: 5117 status = ENOTSUP; 5118 break; 5119 } 5120 5121 return (status); 5122 } 5123 5124 /* 5125 * Function: 5126 * vdc_handle_dring_reg_msg() 5127 * 5128 * Description: 5129 * Handle a descriptor ring registration message (VIO_DRING_REG) sent by the vDisk server. 5130 * Arguments: 5131 * vdc - soft state pointer for this instance of the driver. 5132 * dring_msg - LDC message sent by vDisk server 5133 * 5134 * Return Code: 5135 * 0 - Success 5136 */ 5137 static int 5138 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg) 5139 { 5140 int status = 0; 5141 5142 ASSERT(vdc != NULL); 5143 ASSERT(mutex_owned(&vdc->lock)); 5144 5145 if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) { 5146 return (EPROTO); 5147 } 5148 5149 switch (dring_msg->tag.vio_subtype) { 5150 case VIO_SUBTYPE_ACK: 5151 /* save the received dring_ident */ 5152 vdc->dring_ident = dring_msg->dring_ident; 5153 DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n", 5154 vdc->instance, vdc->dring_ident); 5155 break; 5156 5157 case VIO_SUBTYPE_NACK: 5158 /* 5159 * vds could not handle the DRing info we sent so we 5160 * stop negotiating. 5161 */ 5162 DMSG(vdc, 0, "[%d] server could not register DRing\n", 5163 vdc->instance); 5164 status = EPROTO; 5165 break; 5166 5167 case VIO_SUBTYPE_INFO: 5168 /* 5169 * Handle the case where vds starts the handshake 5170 * (for now only vdc is the instigator) 5171 */ 5172 status = ENOTSUP; 5173 break; 5174 default: 5175 status = ENOTSUP; 5176 } 5177 5178 return (status); 5179 } 5180 5181 /* 5182 * Function: 5183 * vdc_verify_seq_num() 5184 * 5185 * Description: 5186 * This function verifies that the sequence number sent back by the vDisk 5187 * server with the latest message is what is expected (i.e. it is greater 5188 * than the last seq num sent by the vDisk server and less than or equal 5189 * to the last seq num generated by vdc). 5190 * 5191 * It then checks the request ID to see if any requests need processing 5192 * in the DRing. 5193 * 5194 * Arguments: 5195 * vdc - soft state pointer for this instance of the driver. 5196 * dring_msg - pointer to the LDC message sent by vds 5197 * 5198 * Return Code: 5199 * VDC_SEQ_NUM_TODO - Message needs to be processed 5200 * VDC_SEQ_NUM_SKIP - Message has already been processed 5201 * VDC_SEQ_NUM_INVALID - The seq numbers are so out of sync, 5202 * vdc cannot deal with them 5203 */ 5204 static int 5205 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg) 5206 { 5207 ASSERT(vdc != NULL); 5208 ASSERT(dring_msg != NULL); 5209 ASSERT(mutex_owned(&vdc->lock)); 5210 5211 /* 5212 * Check to see if the messages were responded to in the correct 5213 * order by vds.
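 *
 *	Worked example (illustrative values): if vdc has generated requests
 *	numbered up to vdc->seq_num == 27 and the last reply processed was
 *	vdc->seq_num_reply == 24, then replies 25, 26 and 27 are acceptable;
 *	a reply tagged 24 or below is stale and one tagged above 27 was
 *	never sent, so both are flagged as VDC_SEQ_NUM_INVALID.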
5214 */ 5215 if ((dring_msg->seq_num <= vdc->seq_num_reply) || 5216 (dring_msg->seq_num > vdc->seq_num)) { 5217 DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: " 5218 "%lu > expected <= %lu (last proc req %lu sent %lu)\n", 5219 vdc->instance, dring_msg->seq_num, 5220 vdc->seq_num_reply, vdc->seq_num, 5221 vdc->req_id_proc, vdc->req_id); 5222 return (VDC_SEQ_NUM_INVALID); 5223 } 5224 vdc->seq_num_reply = dring_msg->seq_num; 5225 5226 if (vdc->req_id_proc < vdc->req_id) 5227 return (VDC_SEQ_NUM_TODO); 5228 else 5229 return (VDC_SEQ_NUM_SKIP); 5230 } 5231 5232 5233 /* 5234 * Function: 5235 * vdc_is_supported_version() 5236 * 5237 * Description: 5238 * This routine checks if the major/minor version numbers specified in 5239 * 'ver_msg' are supported. If not it finds the next version that is 5240 * in the supported version list 'vdc_version[]' and sets the fields in 5241 * 'ver_msg' to those values 5242 * 5243 * Arguments: 5244 * ver_msg - LDC message sent by vDisk server 5245 * 5246 * Return Code: 5247 * B_TRUE - Success 5248 * B_FALSE - Version not supported 5249 */ 5250 static boolean_t 5251 vdc_is_supported_version(vio_ver_msg_t *ver_msg) 5252 { 5253 int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]); 5254 5255 for (int i = 0; i < vdc_num_versions; i++) { 5256 ASSERT(vdc_version[i].major > 0); 5257 ASSERT((i == 0) || 5258 (vdc_version[i].major < vdc_version[i-1].major)); 5259 5260 /* 5261 * If the major versions match, adjust the minor version, if 5262 * necessary, down to the highest value supported by this 5263 * client. The server should support all minor versions lower 5264 * than the value it sent 5265 */ 5266 if (ver_msg->ver_major == vdc_version[i].major) { 5267 if (ver_msg->ver_minor > vdc_version[i].minor) { 5268 DMSGX(0, 5269 "Adjusting minor version from %u to %u", 5270 ver_msg->ver_minor, vdc_version[i].minor); 5271 ver_msg->ver_minor = vdc_version[i].minor; 5272 } 5273 return (B_TRUE); 5274 } 5275 5276 /* 5277 * If the message contains a higher major version number, set 5278 * the message's major/minor versions to the current values 5279 * and return false, so this message will get resent with 5280 * these values, and the server will potentially try again 5281 * with the same or a lower version 5282 */ 5283 if (ver_msg->ver_major > vdc_version[i].major) { 5284 ver_msg->ver_major = vdc_version[i].major; 5285 ver_msg->ver_minor = vdc_version[i].minor; 5286 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n", 5287 ver_msg->ver_major, ver_msg->ver_minor); 5288 5289 return (B_FALSE); 5290 } 5291 5292 /* 5293 * Otherwise, the message's major version is less than the 5294 * current major version, so continue the loop to the next 5295 * (lower) supported version 5296 */ 5297 } 5298 5299 /* 5300 * No common version was found; "ground" the version pair in the 5301 * message to terminate negotiation 5302 */ 5303 ver_msg->ver_major = 0; 5304 ver_msg->ver_minor = 0; 5305 5306 return (B_FALSE); 5307 } 5308 /* -------------------------------------------------------------------------- */ 5309 5310 /* 5311 * DKIO(7) support 5312 */ 5313 5314 typedef struct vdc_dk_arg { 5315 struct dk_callback dkc; 5316 int mode; 5317 dev_t dev; 5318 vdc_t *vdc; 5319 } vdc_dk_arg_t; 5320 5321 /* 5322 * Function: 5323 * vdc_dkio_flush_cb() 5324 * 5325 * Description: 5326 * This routine is a callback for DKIOCFLUSHWRITECACHE which can be called 5327 * by kernel code. 5328 * 5329 * Arguments: 5330 * arg - a pointer to a vdc_dk_arg_t structure. 
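 *
 *	Sketch of how such a callback is typically dispatched from the
 *	DKIOCFLUSHWRITECACHE ioctl path (illustrative only; the actual
 *	dispatch code lives in the ioctl handler):
 *
 *		dk_arg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP);
 *		... fill in dkc, mode, dev and vdc, and increment
 *		    vdc->dkio_flush_pending under vdc->lock ...
 *		(void) taskq_dispatch(system_taskq, vdc_dkio_flush_cb,
 *		    dk_arg, TQ_SLEEP);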
5331 */ 5332 void 5333 vdc_dkio_flush_cb(void *arg) 5334 { 5335 struct vdc_dk_arg *dk_arg = (struct vdc_dk_arg *)arg; 5336 struct dk_callback *dkc = NULL; 5337 vdc_t *vdc = NULL; 5338 int rv; 5339 5340 if (dk_arg == NULL) { 5341 cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n"); 5342 return; 5343 } 5344 dkc = &dk_arg->dkc; 5345 vdc = dk_arg->vdc; 5346 ASSERT(vdc != NULL); 5347 5348 rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, 5349 VDCPART(dk_arg->dev), 0, VIO_both_dir, B_TRUE); 5350 if (rv != 0) { 5351 DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n", 5352 vdc->instance, rv, 5353 ddi_model_convert_from(dk_arg->mode & FMODELS)); 5354 } 5355 5356 /* 5357 * Trigger the callback to notify the caller that the ioctl call 5358 * has completed. 5359 */ 5360 if ((dk_arg->mode & FKIOCTL) && 5361 (dkc != NULL) && 5362 (dkc->dkc_callback != NULL)) { 5363 ASSERT(dkc->dkc_cookie != NULL); 5364 (*dkc->dkc_callback)(dkc->dkc_cookie, rv); 5365 } 5366 5367 /* Indicate that one less DKIO write flush is outstanding */ 5368 mutex_enter(&vdc->lock); 5369 vdc->dkio_flush_pending--; 5370 ASSERT(vdc->dkio_flush_pending >= 0); 5371 mutex_exit(&vdc->lock); 5372 5373 /* free the memory that was allocated when the callback was dispatched */ 5374 kmem_free(arg, sizeof (vdc_dk_arg_t)); 5375 } 5376 5377 /* 5378 * Function: 5379 * vdc_dkio_gapart() 5380 * 5381 * Description: 5382 * This function implements the DKIOCGAPART ioctl. 5383 * 5384 * Arguments: 5385 * vdc - soft state pointer 5386 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 5387 * flag - ioctl flags 5388 */ 5389 static int 5390 vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag) 5391 { 5392 struct dk_geom *geom; 5393 struct extvtoc *vtoc; 5394 union { 5395 struct dk_map map[NDKMAP]; 5396 struct dk_map32 map32[NDKMAP]; 5397 } data; 5398 int i, rv, size; 5399 5400 mutex_enter(&vdc->lock); 5401 5402 if ((rv = vdc_validate_geometry(vdc)) != 0) { 5403 mutex_exit(&vdc->lock); 5404 return (rv); 5405 } 5406 5407 if (vdc->vdisk_size > VD_OLDVTOC_LIMIT) { 5408 mutex_exit(&vdc->lock); 5409 return (EOVERFLOW); 5410 } 5411 5412 vtoc = vdc->vtoc; 5413 geom = vdc->geom; 5414 5415 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5416 5417 for (i = 0; i < vtoc->v_nparts; i++) { 5418 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 5419 (geom->dkg_nhead * geom->dkg_nsect); 5420 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 5421 } 5422 size = NDKMAP * sizeof (struct dk_map32); 5423 5424 } else { 5425 5426 for (i = 0; i < vtoc->v_nparts; i++) { 5427 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 5428 (geom->dkg_nhead * geom->dkg_nsect); 5429 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 5430 } 5431 size = NDKMAP * sizeof (struct dk_map); 5432 5433 } 5434 5435 mutex_exit(&vdc->lock); 5436 5437 if (ddi_copyout(&data, arg, size, flag) != 0) 5438 return (EFAULT); 5439 5440 return (0); 5441 } 5442 5443 /* 5444 * Function: 5445 * vdc_dkio_partition() 5446 * 5447 * Description: 5448 * This function implements the DKIOCPARTITION ioctl.
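 *
 *	Illustrative user-level usage (assumed example, not part of the
 *	driver):
 *
 *		struct partition64 p;
 *		bzero(&p, sizeof (p));
 *		p.p_partno = 0;
 *		if (ioctl(fd, DKIOCPARTITION, &p) == 0)
 *			(void) printf("start %llu size %llu\n",
 *			    (u_longlong_t)p.p_start, (u_longlong_t)p.p_size);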
5449 * 5450 * Arguments: 5451 * vdc - soft state pointer 5452 * arg - a pointer to a struct partition64 structure 5453 * flag - ioctl flags 5454 */ 5455 static int 5456 vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag) 5457 { 5458 struct partition64 p64; 5459 efi_gpt_t *gpt; 5460 efi_gpe_t *gpe; 5461 vd_efi_dev_t edev; 5462 uint_t partno; 5463 int rv; 5464 5465 if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) { 5466 return (EFAULT); 5467 } 5468 5469 VDC_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 5470 5471 if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) { 5472 return (rv); 5473 } 5474 5475 partno = p64.p_partno; 5476 5477 if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) { 5478 vd_efi_free(&edev, gpt, gpe); 5479 return (ESRCH); 5480 } 5481 5482 bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type, 5483 sizeof (struct uuid)); 5484 p64.p_start = gpe[partno].efi_gpe_StartingLBA; 5485 p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1; 5486 5487 if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) { 5488 vd_efi_free(&edev, gpt, gpe); 5489 return (EFAULT); 5490 } 5491 5492 vd_efi_free(&edev, gpt, gpe); 5493 return (0); 5494 } 5495 5496 /* 5497 * Function: 5498 * vdc_dioctl_rwcmd() 5499 * 5500 * Description: 5501 * This function implements the DIOCTL_RWCMD ioctl. This ioctl is used 5502 * for DKC_DIRECT disks to read or write at an absolute disk offset. 5503 * 5504 * Arguments: 5505 * vdc - soft state pointer 5506 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 5507 * flag - ioctl flags 5508 */ 5509 static int 5510 vdc_dioctl_rwcmd(vdc_t *vdc, caddr_t arg, int flag) 5511 { 5512 struct dadkio_rwcmd32 rwcmd32; 5513 struct dadkio_rwcmd rwcmd; 5514 struct iovec aiov; 5515 struct uio auio; 5516 int rw, status; 5517 struct buf *buf; 5518 5519 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5520 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 5521 sizeof (struct dadkio_rwcmd32), flag)) { 5522 return (EFAULT); 5523 } 5524 rwcmd.cmd = rwcmd32.cmd; 5525 rwcmd.flags = rwcmd32.flags; 5526 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 5527 rwcmd.buflen = rwcmd32.buflen; 5528 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 5529 } else { 5530 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 5531 sizeof (struct dadkio_rwcmd), flag)) { 5532 return (EFAULT); 5533 } 5534 } 5535 5536 switch (rwcmd.cmd) { 5537 case DADKIO_RWCMD_READ: 5538 rw = B_READ; 5539 break; 5540 case DADKIO_RWCMD_WRITE: 5541 rw = B_WRITE; 5542 break; 5543 default: 5544 return (EINVAL); 5545 } 5546 5547 bzero((caddr_t)&aiov, sizeof (struct iovec)); 5548 aiov.iov_base = rwcmd.bufaddr; 5549 aiov.iov_len = rwcmd.buflen; 5550 5551 bzero((caddr_t)&auio, sizeof (struct uio)); 5552 auio.uio_iov = &aiov; 5553 auio.uio_iovcnt = 1; 5554 auio.uio_loffset = rwcmd.blkaddr * vdc->vdisk_bsize; 5555 auio.uio_resid = rwcmd.buflen; 5556 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 5557 5558 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 5559 bioinit(buf); 5560 /* 5561 * We use the private field of buf to specify that this is an 5562 * I/O using an absolute offset. 5563 */ 5564 buf->b_private = (void *)VD_SLICE_NONE; 5565 5566 status = physio(vdc_strategy, buf, VD_MAKE_DEV(vdc->instance, 0), 5567 rw, vdc_min, &auio); 5568 5569 biofini(buf); 5570 kmem_free(buf, sizeof (buf_t)); 5571 5572 return (status); 5573 } 5574 5575 /* 5576 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated 5577 * buffer is returned in alloc_len.
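 *
 * The resulting layout is a vd_scsi_t header followed by four data regions,
 * each rounded up to an 8-byte boundary (region order as assumed by the
 * VD_SCSI_DATA_* macros used elsewhere in this file):
 *
 *	+--------------------+  <- vd_scsi (VD_SCSI_SIZE header)
 *	| vd_scsi_t header   |
 *	+--------------------+  <- VD_SCSI_DATA_CDB(vd_scsi)
 *	| CDB                |
 *	+--------------------+  <- VD_SCSI_DATA_SENSE(vd_scsi)
 *	| sense buffer       |
 *	+--------------------+  <- VD_SCSI_DATA_IN(vd_scsi)
 *	| data-in buffer     |
 *	+--------------------+  <- VD_SCSI_DATA_OUT(vd_scsi)
 *	| data-out buffer    |
 *	+--------------------+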
5578 */ 5579 static vd_scsi_t * 5580 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, 5581 int *alloc_len) 5582 { 5583 vd_scsi_t *vd_scsi; 5584 int vd_scsi_len = VD_SCSI_SIZE; 5585 5586 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); 5587 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); 5588 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); 5589 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); 5590 5591 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 5592 5593 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); 5594 5595 vd_scsi->cdb_len = cdb_len; 5596 vd_scsi->sense_len = sense_len; 5597 vd_scsi->datain_len = datain_len; 5598 vd_scsi->dataout_len = dataout_len; 5599 5600 *alloc_len = vd_scsi_len; 5601 5602 return (vd_scsi); 5603 } 5604 5605 /* 5606 * Convert the status of a SCSI command to a Solaris return code. 5607 * 5608 * Arguments: 5609 * vdc - soft state pointer; vd_scsi - the SCSI operation buffer. 5610 * log_error - indicate if an error message should be logged. 5611 * 5612 * Note that our SCSI error messages are rather primitive for the moment 5613 * and could be improved by decoding some data like the SCSI command and 5614 * the sense key. 5615 * 5616 * Return value: 5617 * 0 - Status is good. 5618 * EACCES - Status reports a reservation conflict. 5619 * ENOTSUP - Status reports a check condition and sense key 5620 * reports an illegal request. 5621 * EIO - Any other status. 5622 */ 5623 static int 5624 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error) 5625 { 5626 int rv; 5627 char path_str[MAXPATHLEN]; 5628 char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN]; 5629 union scsi_cdb *cdb; 5630 struct scsi_extended_sense *sense; 5631 5632 if (vd_scsi->cmd_status == STATUS_GOOD) 5633 /* no error */ 5634 return (0); 5635 5636 /* when the tunable vdc_scsi_log_error is true we log all errors */ 5637 if (vdc_scsi_log_error) 5638 log_error = B_TRUE; 5639 5640 if (log_error) { 5641 cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x\n", 5642 ddi_pathname(vdc->dip, path_str), vdc->instance, 5643 GETCMD(VD_SCSI_DATA_CDB(vd_scsi))); 5644 } 5645 5646 /* default returned value */ 5647 rv = EIO; 5648 5649 switch (vd_scsi->cmd_status) { 5650 5651 case STATUS_CHECK: 5652 case STATUS_TERMINATED: 5653 if (log_error) 5654 cmn_err(CE_CONT, "\tCheck Condition Error\n"); 5655 5656 /* check sense buffer */ 5657 if (vd_scsi->sense_len == 0 || 5658 vd_scsi->sense_status != STATUS_GOOD) { 5659 if (log_error) 5660 cmn_err(CE_CONT, "\tNo Sense Data Available\n"); 5661 break; 5662 } 5663 5664 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5665 5666 if (log_error) { 5667 cmn_err(CE_CONT, "\tSense Key: 0x%x\n" 5668 "\tASC: 0x%x, ASCQ: 0x%x\n", 5669 scsi_sense_key((uint8_t *)sense), 5670 scsi_sense_asc((uint8_t *)sense), 5671 scsi_sense_ascq((uint8_t *)sense)); 5672 } 5673 5674 if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST) 5675 rv = ENOTSUP; 5676 break; 5677 5678 case STATUS_BUSY: 5679 if (log_error) 5680 cmn_err(CE_NOTE, "\tDevice Busy\n"); 5681 break; 5682 5683 case STATUS_RESERVATION_CONFLICT: 5684 /* 5685 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then 5686 * reservation conflict could be due to various reasons like 5687 * incorrect keys, not registered or not reserved etc. So, 5688 * we should not panic in that case.
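 *	(Such conflicts are instead reported to the caller as EACCES,
 *	like any other reservation conflict seen while failfast is
 *	disabled.)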
5689 */ 5690 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5691 if (vdc->failfast_interval != 0 && 5692 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN && 5693 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) { 5694 /* failfast is enabled so we have to panic */ 5695 (void) snprintf(panic_str, sizeof (panic_str), 5696 VDC_RESV_CONFLICT_FMT_STR "%s", 5697 ddi_pathname(vdc->dip, path_str)); 5698 panic(panic_str); 5699 } 5700 if (log_error) 5701 cmn_err(CE_NOTE, "\tReservation Conflict\n"); 5702 rv = EACCES; 5703 break; 5704 5705 case STATUS_QFULL: 5706 if (log_error) 5707 cmn_err(CE_NOTE, "\tQueue Full\n"); 5708 break; 5709 5710 case STATUS_MET: 5711 case STATUS_INTERMEDIATE: 5712 case STATUS_SCSI2: 5713 case STATUS_INTERMEDIATE_MET: 5714 case STATUS_ACA_ACTIVE: 5715 if (log_error) 5716 cmn_err(CE_CONT, 5717 "\tUnexpected SCSI status received: 0x%x\n", 5718 vd_scsi->cmd_status); 5719 break; 5720 5721 default: 5722 if (log_error) 5723 cmn_err(CE_CONT, 5724 "\tInvalid SCSI status received: 0x%x\n", 5725 vd_scsi->cmd_status); 5726 break; 5727 } 5728 5729 return (rv); 5730 } 5731 5732 /* 5733 * Implement the USCSICMD uscsi(7I) ioctl. This ioctl is converted to 5734 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI 5735 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is 5736 * converted to a VD_OP_RESET operation. 5737 */ 5738 static int 5739 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) 5740 { 5741 struct uscsi_cmd uscsi; 5742 struct uscsi_cmd32 uscsi32; 5743 vd_scsi_t *vd_scsi; 5744 int vd_scsi_len; 5745 union scsi_cdb *cdb; 5746 struct scsi_extended_sense *sense; 5747 char *datain, *dataout; 5748 size_t cdb_len, datain_len, dataout_len, sense_len; 5749 int rv; 5750 5751 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5752 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), 5753 mode) != 0) 5754 return (EFAULT); 5755 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); 5756 } else { 5757 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), 5758 mode) != 0) 5759 return (EFAULT); 5760 } 5761 5762 /* a uscsi reset is converted to a VD_OP_RESET operation */ 5763 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | 5764 USCSI_RESET_ALL)) { 5765 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, 5766 VIO_both_dir, B_TRUE); 5767 return (rv); 5768 } 5769 5770 /* cdb buffer length */ 5771 cdb_len = uscsi.uscsi_cdblen; 5772 5773 /* data in and out buffers length */ 5774 if (uscsi.uscsi_flags & USCSI_READ) { 5775 datain_len = uscsi.uscsi_buflen; 5776 dataout_len = 0; 5777 } else { 5778 datain_len = 0; 5779 dataout_len = uscsi.uscsi_buflen; 5780 } 5781 5782 /* sense buffer length */ 5783 if (uscsi.uscsi_flags & USCSI_RQENABLE) 5784 sense_len = uscsi.uscsi_rqlen; 5785 else 5786 sense_len = 0; 5787 5788 /* allocate buffer for the VD_SCSICMD_OP operation */ 5789 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5790 &vd_scsi_len); 5791 5792 /* 5793 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, 5794 * but basically they prevent a SCSI command from being retried in case 5795 * of an error.
5796 */ 5797 if ((uscsi.uscsi_flags & USCSI_ISOLATE) || 5798 (uscsi.uscsi_flags & USCSI_DIAGNOSE)) 5799 vd_scsi->options |= VD_SCSI_OPT_NORETRY; 5800 5801 /* set task attribute */ 5802 if (uscsi.uscsi_flags & USCSI_NOTAG) { 5803 vd_scsi->task_attribute = 0; 5804 } else { 5805 if (uscsi.uscsi_flags & USCSI_HEAD) 5806 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5807 else if (uscsi.uscsi_flags & USCSI_HTAG) 5808 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; 5809 else if (uscsi.uscsi_flags & USCSI_OTAG) 5810 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; 5811 else 5812 vd_scsi->task_attribute = 0; 5813 } 5814 5815 /* set timeout */ 5816 vd_scsi->timeout = uscsi.uscsi_timeout; 5817 5818 /* copy-in cdb data */ 5819 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5820 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { 5821 rv = EFAULT; 5822 goto done; 5823 } 5824 5825 /* keep a pointer to the sense buffer */ 5826 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5827 5828 /* keep a pointer to the data-in buffer */ 5829 datain = (char *)VD_SCSI_DATA_IN(vd_scsi); 5830 5831 /* copy-in request data to the data-out buffer */ 5832 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); 5833 if (!(uscsi.uscsi_flags & USCSI_READ)) { 5834 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, 5835 mode)) { 5836 rv = EFAULT; 5837 goto done; 5838 } 5839 } 5840 5841 /* submit the request */ 5842 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5843 0, 0, VIO_both_dir, B_FALSE); 5844 5845 if (rv != 0) 5846 goto done; 5847 5848 /* update scsi status */ 5849 uscsi.uscsi_status = vd_scsi->cmd_status; 5850 5851 /* update sense data */ 5852 if ((uscsi.uscsi_flags & USCSI_RQENABLE) && 5853 (uscsi.uscsi_status == STATUS_CHECK || 5854 uscsi.uscsi_status == STATUS_TERMINATED)) { 5855 5856 uscsi.uscsi_rqstatus = vd_scsi->sense_status; 5857 5858 if (uscsi.uscsi_rqstatus == STATUS_GOOD) { 5859 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - 5860 vd_scsi->sense_len; 5861 if (ddi_copyout(sense, uscsi.uscsi_rqbuf, 5862 vd_scsi->sense_len, mode) != 0) { 5863 rv = EFAULT; 5864 goto done; 5865 } 5866 } 5867 } 5868 5869 /* update request data */ 5870 if (uscsi.uscsi_status == STATUS_GOOD) { 5871 if (uscsi.uscsi_flags & USCSI_READ) { 5872 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5873 vd_scsi->datain_len; 5874 if (ddi_copyout(datain, uscsi.uscsi_bufaddr, 5875 vd_scsi->datain_len, mode) != 0) { 5876 rv = EFAULT; 5877 goto done; 5878 } 5879 } else { 5880 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5881 vd_scsi->dataout_len; 5882 } 5883 } 5884 5885 /* copy-out result */ 5886 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5887 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); 5888 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), 5889 mode) != 0) { 5890 rv = EFAULT; 5891 goto done; 5892 } 5893 } else { 5894 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), 5895 mode) != 0) { 5896 rv = EFAULT; 5897 goto done; 5898 } 5899 } 5900 5901 /* get the return code from the SCSI command status */ 5902 rv = vdc_scsi_status(vdc, vd_scsi, 5903 !(uscsi.uscsi_flags & USCSI_SILENT)); 5904 5905 done: 5906 kmem_free(vd_scsi, vd_scsi_len); 5907 return (rv); 5908 } 5909 5910 /* 5911 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. 5912 * 5913 * Arguments: 5914 * cmd - SCSI PERSISTENT IN command 5915 * len - length of the SCSI input buffer 5916 * vd_scsi_len - return the length of the allocated buffer 5917 * 5918 * Returned Value: 5919 * a pointer to the allocated VD_OP_SCSICMD buffer. 
5920 */ 5921 static vd_scsi_t * 5922 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len) 5923 { 5924 int cdb_len, sense_len, datain_len, dataout_len; 5925 vd_scsi_t *vd_scsi; 5926 union scsi_cdb *cdb; 5927 5928 cdb_len = CDB_GROUP1; 5929 sense_len = sizeof (struct scsi_extended_sense); 5930 datain_len = len; 5931 dataout_len = 0; 5932 5933 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5934 vd_scsi_len); 5935 5936 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5937 5938 /* set cdb */ 5939 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN; 5940 cdb->cdb_opaque[1] = cmd; 5941 FORMG1COUNT(cdb, datain_len); 5942 5943 vd_scsi->timeout = vdc_scsi_timeout; 5944 5945 return (vd_scsi); 5946 } 5947 5948 /* 5949 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command. 5950 * 5951 * Arguments: 5952 * cmd - SCSI PERSISTENT OUT command 5953 * len - length of the SCSI output buffer 5954 * vd_scsi_len - return the length of the allocated buffer 5955 * 5956 * Returned Value: 5957 * a pointer to the allocated VD_OP_SCSICMD buffer. 5958 */ 5959 static vd_scsi_t * 5960 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len) 5961 { 5962 int cdb_len, sense_len, datain_len, dataout_len; 5963 vd_scsi_t *vd_scsi; 5964 union scsi_cdb *cdb; 5965 5966 cdb_len = CDB_GROUP1; 5967 sense_len = sizeof (struct scsi_extended_sense); 5968 datain_len = 0; 5969 dataout_len = len; 5970 5971 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5972 vd_scsi_len); 5973 5974 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5975 5976 /* set cdb */ 5977 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT; 5978 cdb->cdb_opaque[1] = cmd; 5979 FORMG1COUNT(cdb, dataout_len); 5980 5981 vd_scsi->timeout = vdc_scsi_timeout; 5982 5983 return (vd_scsi); 5984 } 5985 5986 /* 5987 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted 5988 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk 5989 * server with a VD_OP_SCSICMD operation.
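 *
 * Illustrative user-level usage (assumed example, not part of the driver):
 *
 *	mhioc_resv_key_t keys[4];
 *	mhioc_key_list_t kl;
 *	mhioc_inkeys_t ik;
 *
 *	bzero(&kl, sizeof (kl));
 *	kl.listsize = 4;
 *	kl.list = keys;
 *	ik.li = &kl;
 *	if (ioctl(fd, MHIOCGRP_INKEYS, &ik) == 0)
 *		(void) printf("%d keys registered\n", kl.listlen);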
5990 */ 5991 static int 5992 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5993 { 5994 vd_scsi_t *vd_scsi; 5995 mhioc_inkeys_t inkeys; 5996 mhioc_key_list_t klist; 5997 struct mhioc_inkeys32 inkeys32; 5998 struct mhioc_key_list32 klist32; 5999 sd_prin_readkeys_t *scsi_keys; 6000 void *user_keys; 6001 int vd_scsi_len; 6002 int listsize, listlen, rv; 6003 6004 /* copyin arguments */ 6005 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 6006 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 6007 if (rv != 0) 6008 return (EFAULT); 6009 6010 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 6011 sizeof (klist32), mode); 6012 if (rv != 0) 6013 return (EFAULT); 6014 6015 listsize = klist32.listsize; 6016 } else { 6017 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 6018 if (rv != 0) 6019 return (EFAULT); 6020 6021 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 6022 if (rv != 0) 6023 return (EFAULT); 6024 6025 listsize = klist.listsize; 6026 } 6027 6028 /* build SCSI VD_OP request */ 6029 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 6030 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 6031 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 6032 6033 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 6034 6035 /* submit the request */ 6036 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6037 0, 0, VIO_both_dir, B_FALSE); 6038 6039 if (rv != 0) 6040 goto done; 6041 6042 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 6043 6044 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 6045 inkeys32.generation = scsi_keys->generation; 6046 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 6047 if (rv != 0) { 6048 rv = EFAULT; 6049 goto done; 6050 } 6051 6052 klist32.listlen = listlen; 6053 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 6054 sizeof (klist32), mode); 6055 if (rv != 0) { 6056 rv = EFAULT; 6057 goto done; 6058 } 6059 6060 user_keys = (caddr_t)(uintptr_t)klist32.list; 6061 } else { 6062 inkeys.generation = scsi_keys->generation; 6063 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 6064 if (rv != 0) { 6065 rv = EFAULT; 6066 goto done; 6067 } 6068 6069 klist.listlen = listlen; 6070 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 6071 if (rv != 0) { 6072 rv = EFAULT; 6073 goto done; 6074 } 6075 6076 user_keys = klist.list; 6077 } 6078 6079 /* copy out keys */ 6080 if (listlen > 0 && listsize > 0) { 6081 if (listsize < listlen) 6082 listlen = listsize; 6083 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 6084 listlen * MHIOC_RESV_KEY_SIZE, mode); 6085 if (rv != 0) 6086 rv = EFAULT; 6087 } 6088 6089 if (rv == 0) 6090 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6091 6092 done: 6093 kmem_free(vd_scsi, vd_scsi_len); 6094 6095 return (rv); 6096 } 6097 6098 /* 6099 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 6100 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 6101 * the vdisk server with a VD_OP_SCSICMD operation. 
6102 */ 6103 static int 6104 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 6105 { 6106 vd_scsi_t *vd_scsi; 6107 mhioc_inresvs_t inresv; 6108 mhioc_resv_desc_list_t rlist; 6109 struct mhioc_inresvs32 inresv32; 6110 struct mhioc_resv_desc_list32 rlist32; 6111 mhioc_resv_desc_t mhd_resv; 6112 sd_prin_readresv_t *scsi_resv; 6113 sd_readresv_desc_t *resv; 6114 mhioc_resv_desc_t *user_resv; 6115 int vd_scsi_len; 6116 int listsize, listlen, i, rv; 6117 6118 /* copyin arguments */ 6119 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 6120 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 6121 if (rv != 0) 6122 return (EFAULT); 6123 6124 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 6125 sizeof (rlist32), mode); 6126 if (rv != 0) 6127 return (EFAULT); 6128 6129 listsize = rlist32.listsize; 6130 } else { 6131 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 6132 if (rv != 0) 6133 return (EFAULT); 6134 6135 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 6136 if (rv != 0) 6137 return (EFAULT); 6138 6139 listsize = rlist.listsize; 6140 } 6141 6142 /* build SCSI VD_OP request */ 6143 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 6144 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 6145 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 6146 6147 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 6148 6149 /* submit the request */ 6150 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6151 0, 0, VIO_both_dir, B_FALSE); 6152 6153 if (rv != 0) 6154 goto done; 6155 6156 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 6157 6158 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 6159 inresv32.generation = scsi_resv->generation; 6160 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 6161 if (rv != 0) { 6162 rv = EFAULT; 6163 goto done; 6164 } 6165 6166 rlist32.listlen = listlen; 6167 rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, 6168 sizeof (rlist32), mode); 6169 if (rv != 0) { 6170 rv = EFAULT; 6171 goto done; 6172 } 6173 6174 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 6175 } else { 6176 inresv.generation = scsi_resv->generation; 6177 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 6178 if (rv != 0) { 6179 rv = EFAULT; 6180 goto done; 6181 } 6182 6183 rlist.listlen = listlen; 6184 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 6185 if (rv != 0) { 6186 rv = EFAULT; 6187 goto done; 6188 } 6189 6190 user_resv = rlist.list; 6191 } 6192 6193 /* copy out reservations */ 6194 if (listsize > 0 && listlen > 0) { 6195 if (listsize < listlen) 6196 listlen = listsize; 6197 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 6198 6199 for (i = 0; i < listlen; i++) { 6200 mhd_resv.type = resv->type; 6201 mhd_resv.scope = resv->scope; 6202 mhd_resv.scope_specific_addr = 6203 BE_32(resv->scope_specific_addr); 6204 bcopy(&resv->resvkey, &mhd_resv.key, 6205 MHIOC_RESV_KEY_SIZE); 6206 6207 rv = ddi_copyout(&mhd_resv, user_resv, 6208 sizeof (mhd_resv), mode); 6209 if (rv != 0) { 6210 rv = EFAULT; 6211 goto done; 6212 } 6213 resv++; 6214 user_resv++; 6215 } 6216 } 6217 6218 if (rv == 0) 6219 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6220 6221 done: 6222 kmem_free(vd_scsi, vd_scsi_len); 6223 return (rv); 6224 } 6225 6226 /* 6227 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 6228 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 6229 * server with a VD_OP_SCSICMD operation. 
6230 */ 6231 static int 6232 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 6233 { 6234 vd_scsi_t *vd_scsi; 6235 sd_prout_t *scsi_prout; 6236 mhioc_register_t mhd_reg; 6237 int vd_scsi_len, rv; 6238 6239 /* copyin arguments */ 6240 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 6241 if (rv != 0) 6242 return (EFAULT); 6243 6244 /* build SCSI VD_OP request */ 6245 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 6246 sizeof (sd_prout_t), &vd_scsi_len); 6247 6248 /* set parameters */ 6249 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6250 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6251 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 6252 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 6253 6254 /* submit the request */ 6255 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6256 0, 0, VIO_both_dir, B_FALSE); 6257 6258 if (rv == 0) 6259 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6260 6261 kmem_free(vd_scsi, vd_scsi_len); 6262 return (rv); 6263 } 6264 6265 /* 6266 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 6267 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 6268 * server with a VD_OP_SCSICMD operation. 6269 */ 6270 static int 6271 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 6272 { 6273 union scsi_cdb *cdb; 6274 vd_scsi_t *vd_scsi; 6275 sd_prout_t *scsi_prout; 6276 mhioc_resv_desc_t mhd_resv; 6277 int vd_scsi_len, rv; 6278 6279 /* copyin arguments */ 6280 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 6281 if (rv != 0) 6282 return (EFAULT); 6283 6284 /* build SCSI VD_OP request */ 6285 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 6286 sizeof (sd_prout_t), &vd_scsi_len); 6287 6288 /* set parameters */ 6289 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6290 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6291 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6292 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 6293 cdb->cdb_opaque[2] = mhd_resv.type; 6294 6295 /* submit the request */ 6296 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6297 0, 0, VIO_both_dir, B_FALSE); 6298 6299 if (rv == 0) 6300 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6301 6302 kmem_free(vd_scsi, vd_scsi_len); 6303 return (rv); 6304 } 6305 6306 /* 6307 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 6308 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 6309 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
6310 */
6311 static int
6312 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode)
6313 {
6314 	union scsi_cdb *cdb;
6315 	vd_scsi_t *vd_scsi;
6316 	sd_prout_t *scsi_prout;
6317 	mhioc_preemptandabort_t mhd_preempt;
6318 	int vd_scsi_len, rv;
6319 
6320 	/* copyin arguments */
6321 	rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode);
6322 	if (rv != 0)
6323 		return (EFAULT);
6324 
6325 	/* build SCSI VD_OP request */
6326 	vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT,
6327 	    sizeof (sd_prout_t), &vd_scsi_len);
6328 
6329 	/* set parameters */
6330 	vd_scsi->task_attribute = VD_SCSI_TASK_ACA;
6331 	cdb = VD_SCSI_DATA_CDB(vd_scsi);
6332 	scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi);
6333 	bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key,
6334 	    MHIOC_RESV_KEY_SIZE);
6335 	bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key,
6336 	    MHIOC_RESV_KEY_SIZE);
6337 	scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr;
6338 	cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type;
6339 
6340 	/* submit the request */
6341 	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
6342 	    0, 0, VIO_both_dir, B_FALSE);
6343 
6344 	if (rv == 0)
6345 		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
6346 
6347 	kmem_free(vd_scsi, vd_scsi_len);
6348 	return (rv);
6349 }
6350 
6351 /*
6352  * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl
6353  * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY
6354  * command which is sent to the vdisk server with a VD_OP_SCSICMD operation.
6355  */
6356 static int
6357 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode)
6358 {
6359 	vd_scsi_t *vd_scsi;
6360 	sd_prout_t *scsi_prout;
6361 	mhioc_registerandignorekey_t mhd_regi;
6362 	int vd_scsi_len, rv;
6363 
6364 	/* copyin arguments */
6365 	rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode);
6366 	if (rv != 0)
6367 		return (EFAULT);
6368 
6369 	/* build SCSI VD_OP request */
6370 	vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY,
6371 	    sizeof (sd_prout_t), &vd_scsi_len);
6372 
6373 	/* set parameters */
6374 	scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi);
6375 	bcopy(mhd_regi.newkey.key, scsi_prout->service_key,
6376 	    MHIOC_RESV_KEY_SIZE);
6377 	scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl;
6378 
6379 	/* submit the request */
6380 	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
6381 	    0, 0, VIO_both_dir, B_FALSE);
6382 
6383 	if (rv == 0)
6384 		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
6385 
6386 	kmem_free(vd_scsi, vd_scsi_len);
6387 	return (rv);
6388 }
6389 
6390 /*
6391  * This function is used to send a (simple) SCSI command and check errors.
6392  */
6393 static int
6394 vdc_eio_scsi_cmd(vdc_t *vdc, uchar_t scmd, int flags)
6395 {
6396 	int cdb_len, sense_len, vd_scsi_len;
6397 	vd_scsi_t *vd_scsi;
6398 	union scsi_cdb *cdb;
6399 	int rv;
6400 
6401 	ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1);
6402 
6403 	if (scmd == SCMD_WRITE_G1)
6404 		cdb_len = CDB_GROUP1;
6405 	else
6406 		cdb_len = CDB_GROUP0;
6407 
6408 	sense_len = sizeof (struct scsi_extended_sense);
6409 
6410 	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len);
6411 
6412 	/* set cdb */
6413 	cdb = VD_SCSI_DATA_CDB(vd_scsi);
6414 	cdb->scc_cmd = scmd;
6415 
6416 	vd_scsi->timeout = vdc_scsi_timeout;
6417 
6418 	/*
6419 	 * Submit the request. Note that the operation must not request
6420 	 * error checking (VDC_OP_ERRCHK), because this function is itself
6421 	 * invoked to check for errors.
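	 * For example, the eio path below calls this function with flags
	 * such as VDC_OP_STATE_RUNNING | VDC_OP_DRING_RESERVED (see
	 * vdc_eio_check()), never with VDC_OP_ERRCHK.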
6422 	 */
6423 	ASSERT((flags & VDC_OP_ERRCHK) == 0);
6424 
6425 	rv = vdc_do_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
6426 	    0, 0, NULL, VIO_both_dir, flags);
6427 
6428 	if (rv == 0)
6429 		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
6430 
6431 	kmem_free(vd_scsi, vd_scsi_len);
6432 	return (rv);
6433 }
6434 
6435 /*
6436  * This function is used to check if a SCSI backend is accessible. It will
6437  * also detect reservation conflict if failfast is enabled, and panic the
6438  * system in that case.
6439  *
6440  * Returned Code:
6441  *	0 - disk is accessible
6442  *	!= 0 - disk is inaccessible or unable to check if disk is accessible
6443  */
6444 static int
6445 vdc_eio_scsi_check(vdc_t *vdc, int flags)
6446 {
6447 	int failure = 0;
6448 	int rv;
6449 
6450 	/*
6451 	 * Send a TEST UNIT READY command. The command will panic
6452 	 * the system if it fails with a reservation conflict and
6453 	 * failfast is enabled. If there is a reservation conflict
6454 	 * and failfast is not enabled then the function will return
6455 	 * EACCES. In that case, there's no problem with accessing
6456 	 * the backend, it is just reserved.
6457 	 */
6458 	rv = vdc_eio_scsi_cmd(vdc, SCMD_TEST_UNIT_READY, flags);
6459 	if (rv != 0 && rv != EACCES)
6460 		failure++;
6461 
6462 	/* we don't need to do more checking if failfast is not enabled */
6463 	if (vdc->failfast_interval == 0)
6464 		return (failure);
6465 
6466 	/*
6467 	 * With SPC-3 compliant devices TEST UNIT READY will succeed on
6468 	 * a reserved device, so we also issue a zero-length WRITE(10) in
6469 	 * order to provoke a Reservation Conflict status on those newer
6470 	 * devices.
6471 	 */
6472 	if (vdc_eio_scsi_cmd(vdc, SCMD_WRITE_G1, flags) != 0)
6473 		failure++;
6474 
6475 	return (failure);
6476 }
6477 
6478 /*
6479  * This function is used to check if a backend is effectively accessible.
6480  *
6481  * Returned Code:
6482  *	0 - disk is accessible
6483  *	!= 0 - disk is inaccessible or unable to check if disk is accessible
6484  */
6485 static int
6486 vdc_eio_check(vdc_t *vdc, int flags)
6487 {
6488 	char *buffer;
6489 	diskaddr_t blkno;
6490 	int rv;
6491 
6492 	ASSERT((flags & VDC_OP_ERRCHK) == 0);
6493 
6494 	flags |= VDC_OP_DRING_RESERVED;
6495 
6496 	if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD))
6497 		return (vdc_eio_scsi_check(vdc, flags));
6498 
6499 	ASSERT(vdc->failfast_interval == 0);
6500 
6501 	/*
6502 	 * If the backend does not support SCSI operations then we simply
6503 	 * check if the backend is accessible by reading some data blocks.
6504 	 * We first try to read a random block, to try to avoid getting
6505 	 * a block that might have been cached on the service domain. Then
6506 	 * we try the last block, and finally the first block.
6507 	 *
6508 	 * We return success as soon as we are able to read any block.
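	 *
	 * As a sketch of the probing order implemented below (for an
	 * assumed vdisk_size of N blocks):
	 *
	 *	blkno = <random> % N;		(a random block first)
	 *	blkno = N - 1;			(then the last block)
	 *	blkno = 0;			(finally block 0)
	 *
	 * each attempt being a single VD_OP_BREAD of vdisk_bsize bytes.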
6509 */ 6510 buffer = kmem_alloc(vdc->vdisk_bsize, KM_SLEEP); 6511 6512 if (vdc->vdisk_size > 0) { 6513 6514 /* try a random block */ 6515 (void) random_get_pseudo_bytes((uint8_t *)&blkno, 6516 sizeof (diskaddr_t)); 6517 blkno = blkno % vdc->vdisk_size; 6518 rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)buffer, 6519 vdc->vdisk_bsize, VD_SLICE_NONE, blkno, NULL, 6520 VIO_read_dir, flags); 6521 6522 if (rv == 0) 6523 goto done; 6524 6525 /* try the last block */ 6526 blkno = vdc->vdisk_size - 1; 6527 rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)buffer, 6528 vdc->vdisk_bsize, VD_SLICE_NONE, blkno, NULL, 6529 VIO_read_dir, flags); 6530 6531 if (rv == 0) 6532 goto done; 6533 } 6534 6535 /* try block 0 */ 6536 blkno = 0; 6537 rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)buffer, vdc->vdisk_bsize, 6538 VD_SLICE_NONE, blkno, NULL, VIO_read_dir, flags); 6539 6540 done: 6541 kmem_free(buffer, vdc->vdisk_bsize); 6542 return (rv); 6543 } 6544 6545 /* 6546 * Add a pending I/O to the eio queue. An I/O is added to this queue 6547 * when it has failed and failfast is enabled or the vdisk has multiple 6548 * servers. It will then be handled by the eio thread (vdc_eio_thread). 6549 * The eio queue is ordered starting with the most recent I/O added. 6550 */ 6551 static vdc_io_t * 6552 vdc_eio_queue(vdc_t *vdc, int index) 6553 { 6554 vdc_io_t *vio; 6555 6556 ASSERT(MUTEX_HELD(&vdc->lock)); 6557 6558 vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP); 6559 vio->vio_next = vdc->eio_queue; 6560 vio->vio_index = index; 6561 vio->vio_qtime = ddi_get_lbolt(); 6562 6563 vdc->eio_queue = vio; 6564 6565 /* notify the eio thread that a new I/O is queued */ 6566 cv_signal(&vdc->eio_cv); 6567 6568 return (vio); 6569 } 6570 6571 /* 6572 * Remove I/Os added before the indicated deadline from the eio queue. A 6573 * deadline of 0 means that all I/Os have to be unqueued. The complete_io 6574 * boolean specifies if unqueued I/Os should be marked as completed or not. 6575 */ 6576 static void 6577 vdc_eio_unqueue(vdc_t *vdc, clock_t deadline, boolean_t complete_io) 6578 { 6579 struct buf *buf; 6580 vdc_io_t *vio, *vio_tmp; 6581 int index, op; 6582 6583 ASSERT(MUTEX_HELD(&vdc->lock)); 6584 6585 vio_tmp = NULL; 6586 vio = vdc->eio_queue; 6587 6588 if (deadline != 0) { 6589 /* 6590 * Skip any io queued after the deadline. The eio queue is 6591 * ordered starting with the last I/O added to the queue. 6592 */ 6593 while (vio != NULL && vio->vio_qtime > deadline) { 6594 vio_tmp = vio; 6595 vio = vio->vio_next; 6596 } 6597 } 6598 6599 if (vio == NULL) 6600 /* nothing to unqueue */ 6601 return; 6602 6603 /* update the queue */ 6604 if (vio_tmp == NULL) 6605 vdc->eio_queue = NULL; 6606 else 6607 vio_tmp->vio_next = NULL; 6608 6609 /* 6610 * Free and complete unqueued I/Os if this was requested. All I/Os 6611 * have a block I/O data transfer structure (buf) and they are 6612 * completed by calling biodone(). 6613 */ 6614 while (vio != NULL) { 6615 vio_tmp = vio->vio_next; 6616 6617 if (complete_io) { 6618 index = vio->vio_index; 6619 op = vdc->local_dring[index].operation; 6620 buf = vdc->local_dring[index].buf; 6621 (void) vdc_depopulate_descriptor(vdc, index); 6622 ASSERT(buf->b_flags & B_ERROR); 6623 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 6624 VD_UPDATE_ERR_STATS(vdc, vd_softerrs); 6625 VD_KSTAT_RUNQ_EXIT(vdc); 6626 DTRACE_IO1(done, buf_t *, buf); 6627 } 6628 biodone(buf); 6629 } 6630 6631 kmem_free(vio, sizeof (vdc_io_t)); 6632 vio = vio_tmp; 6633 } 6634 } 6635 6636 /* 6637 * Error I/O Thread. 
There is one eio thread for each virtual disk that
6638  * has multiple servers or for which failfast is enabled. Failfast can only
6639  * be enabled for vdisks that support SCSI commands.
6640  *
6641  * While failfast is enabled, the eio thread sends TEST UNIT READY
6642  * and zero-length WRITE(10) SCSI commands on a regular basis to check that
6643  * we still have access to the disk. If a command fails with a RESERVATION
6644  * CONFLICT error then the system will immediately panic.
6645  *
6646  * The eio thread is also woken up when an I/O has failed. It then checks
6647  * the access to the disk to ensure that the I/O failure was not due to a
6648  * reservation conflict or to the backend being inaccessible.
6649  *
6650  */
6651 static void
6652 vdc_eio_thread(void *arg)
6653 {
6654 	int status;
6655 	vdc_t *vdc = (vdc_t *)arg;
6656 	clock_t starttime, timeout = drv_usectohz(vdc->failfast_interval);
6657 
6658 	mutex_enter(&vdc->lock);
6659 
6660 	while (vdc->failfast_interval != 0 || vdc->num_servers > 1) {
6661 		/*
6662 		 * Wait if there is nothing in the eio queue or if the state
6663 		 * is not VDC_STATE_RUNNING.
6664 		 */
6665 		if (vdc->eio_queue == NULL || vdc->state != VDC_STATE_RUNNING) {
6666 			if (vdc->failfast_interval != 0) {
6667 				timeout = ddi_get_lbolt() +
6668 				    drv_usectohz(vdc->failfast_interval);
6669 				(void) cv_timedwait(&vdc->eio_cv, &vdc->lock,
6670 				    timeout);
6671 			} else {
6672 				ASSERT(vdc->num_servers > 1);
6673 				(void) cv_wait(&vdc->eio_cv, &vdc->lock);
6674 			}
6675 
6676 			if (vdc->state != VDC_STATE_RUNNING)
6677 				continue;
6678 		}
6679 
6680 		mutex_exit(&vdc->lock);
6681 
6682 		starttime = ddi_get_lbolt();
6683 
6684 		/* check error */
6685 		status = vdc_eio_check(vdc, VDC_OP_STATE_RUNNING);
6686 
6687 		mutex_enter(&vdc->lock);
6688 		/*
6689 		 * We have dropped the lock to check the backend so we have
6690 		 * to check that the eio thread is still enabled.
6691 		 */
6692 		if (vdc->failfast_interval == 0 && vdc->num_servers <= 1)
6693 			break;
6694 
6695 		/*
6696 		 * If the eio queue is empty or we are not in running state
6697 		 * anymore then there is nothing to do.
6698 		 */
6699 		if (vdc->state != VDC_STATE_RUNNING || vdc->eio_queue == NULL)
6700 			continue;
6701 
6702 		if (status == 0) {
6703 			/*
6704 			 * The backend access has been successfully checked,
6705 			 * we can complete any I/O queued before the last check.
6706 			 */
6707 			vdc_eio_unqueue(vdc, starttime, B_TRUE);
6708 
6709 		} else if (vdc->num_servers > 1) {
6710 			/*
6711 			 * The backend is inaccessible for a disk with multiple
6712 			 * servers. So we force a reset to switch to another
6713 			 * server. The reset will also clear the eio queue and
6714 			 * resubmit all pending I/Os.
6715 			 */
6716 			mutex_enter(&vdc->read_lock);
6717 			vdc->read_state = VDC_READ_RESET;
6718 			cv_signal(&vdc->read_cv);
6719 			mutex_exit(&vdc->read_lock);
6720 		} else {
6721 			/*
6722 			 * There is only one path and the backend is not
6723 			 * accessible, so I/Os are actually failing because
6724 			 * of that. So we can complete I/O queued before the
6725 			 * last check.
6726 			 */
6727 			vdc_eio_unqueue(vdc, starttime, B_TRUE);
6728 		}
6729 	}
6730 
6731 	/*
6732 	 * The thread is being stopped so we can complete any queued I/O.
6733 	 */
6734 	vdc_eio_unqueue(vdc, 0, B_TRUE);
6735 	vdc->eio_thread = NULL;
6736 	mutex_exit(&vdc->lock);
6737 	thread_exit();
6738 }
6739 
6740 /*
6741  * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
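 *
 * For illustration, a hypothetical userland caller (not part of this
 * driver) would enable failfast with a 1 second probing interval as:
 *
 *	unsigned int mh_time = 1000;	(milliseconds)
 *	ioctl(fd, MHIOCENFAILFAST, &mh_time);
 *
 * and disable it again by passing mh_time == 0, which clears
 * failfast_interval below.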
6742  */
6743 static int
6744 vdc_failfast(vdc_t *vdc, caddr_t arg, int mode)
6745 {
6746 	unsigned int mh_time;
6747 
6748 	if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode))
6749 		return (EFAULT);
6750 
6751 	mutex_enter(&vdc->lock);
6752 	if (mh_time != 0 && vdc->eio_thread == NULL) {
6753 		vdc->eio_thread = thread_create(NULL, 0,
6754 		    vdc_eio_thread, vdc, 0, &p0, TS_RUN,
6755 		    v.v_maxsyspri - 2);
6756 	}
6757 
6758 	vdc->failfast_interval = ((long)mh_time) * MILLISEC;
6759 	cv_signal(&vdc->eio_cv);
6760 	mutex_exit(&vdc->lock);
6761 
6762 	return (0);
6763 }
6764 
6765 /*
6766  * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are
6767  * converted to VD_OP_SET_ACCESS operations.
6768  */
6769 static int
6770 vdc_access_set(vdc_t *vdc, uint64_t flags)
6771 {
6772 	int rv;
6773 
6774 	/* submit ownership command request */
6775 	rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags,
6776 	    sizeof (uint64_t), 0, 0, VIO_both_dir, B_TRUE);
6777 
6778 	return (rv);
6779 }
6780 
6781 /*
6782  * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a
6783  * VD_OP_GET_ACCESS operation.
6784  */
6785 static int
6786 vdc_access_get(vdc_t *vdc, uint64_t *status)
6787 {
6788 	int rv;
6789 
6790 	/* submit ownership command request */
6791 	rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status,
6792 	    sizeof (uint64_t), 0, 0, VIO_both_dir, B_TRUE);
6793 
6794 	return (rv);
6795 }
6796 
6797 /*
6798  * Disk Ownership Thread.
6799  *
6800  * When we have taken the ownership of a disk, this thread waits to be
6801  * notified when the LDC channel is reset so that it can recover the
6802  * ownership.
6803  *
6804  * Note that the thread handling the LDC reset (vdc_process_msg_thread())
6805  * cannot be used to do the ownership recovery because it has to be
6806  * running to handle the reply message to the ownership operation.
6807  */
6808 static void
6809 vdc_ownership_thread(void *arg)
6810 {
6811 	vdc_t *vdc = (vdc_t *)arg;
6812 	clock_t timeout;
6813 	uint64_t status;
6814 
6815 	mutex_enter(&vdc->ownership_lock);
6816 	mutex_enter(&vdc->lock);
6817 
6818 	while (vdc->ownership & VDC_OWNERSHIP_WANTED) {
6819 
6820 		if ((vdc->ownership & VDC_OWNERSHIP_RESET) ||
6821 		    !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) {
6822 			/*
6823 			 * There was a reset so the ownership has been lost,
6824 			 * try to recover. We do this without using the preempt
6825 			 * option so that we don't steal the ownership from
6826 			 * someone who has preempted us.
6827 			 */
6828 			DMSG(vdc, 0, "[%d] Ownership lost, recovering",
6829 			    vdc->instance);
6830 
6831 			vdc->ownership &= ~(VDC_OWNERSHIP_RESET |
6832 			    VDC_OWNERSHIP_GRANTED);
6833 
6834 			mutex_exit(&vdc->lock);
6835 
6836 			status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
6837 			    VD_ACCESS_SET_PRESERVE);
6838 
6839 			mutex_enter(&vdc->lock);
6840 
6841 			if (status == 0) {
6842 				DMSG(vdc, 0, "[%d] Ownership recovered",
6843 				    vdc->instance);
6844 				vdc->ownership |= VDC_OWNERSHIP_GRANTED;
6845 			} else {
6846 				DMSG(vdc, 0, "[%d] Failed to recover ownership",
6847 				    vdc->instance);
6848 			}
6849 
6850 		}
6851 
6852 		/*
6853 		 * If we have the ownership then we just wait for an event
6854 		 * to happen (LDC reset), otherwise we will retry to recover
6855 		 * after a delay.
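		 * (vdc_ownership_delay is the retry interval; the relevant
		 * state bits are VDC_OWNERSHIP_WANTED, which stays set while
		 * MHIOCTKOWN is in effect, VDC_OWNERSHIP_GRANTED, which
		 * tracks whether the server has honoured it, and
		 * VDC_OWNERSHIP_RESET, which flags an LDC reset that may
		 * have invalidated the grant.)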
6856 		 */
6857 		if (vdc->ownership & VDC_OWNERSHIP_GRANTED)
6858 			timeout = 0;
6859 		else
6860 			timeout = drv_usectohz(vdc_ownership_delay);
6861 
6862 		/* Release the ownership_lock and wait on the vdc lock */
6863 		mutex_exit(&vdc->ownership_lock);
6864 
6865 		if (timeout == 0)
6866 			(void) cv_wait(&vdc->ownership_cv, &vdc->lock);
6867 		else
6868 			(void) cv_reltimedwait(&vdc->ownership_cv, &vdc->lock,
6869 			    timeout, TR_CLOCK_TICK);
6870 
6871 		mutex_exit(&vdc->lock);
6872 
6873 		mutex_enter(&vdc->ownership_lock);
6874 		mutex_enter(&vdc->lock);
6875 	}
6876 
6877 	vdc->ownership_thread = NULL;
6878 	mutex_exit(&vdc->lock);
6879 	mutex_exit(&vdc->ownership_lock);
6880 
6881 	thread_exit();
6882 }
6883 
6884 static void
6885 vdc_ownership_update(vdc_t *vdc, int ownership_flags)
6886 {
6887 	ASSERT(MUTEX_HELD(&vdc->ownership_lock));
6888 
6889 	mutex_enter(&vdc->lock);
6890 	vdc->ownership = ownership_flags;
6891 	if ((vdc->ownership & VDC_OWNERSHIP_WANTED) &&
6892 	    vdc->ownership_thread == NULL) {
6893 		/* start ownership thread */
6894 		vdc->ownership_thread = thread_create(NULL, 0,
6895 		    vdc_ownership_thread, vdc, 0, &p0, TS_RUN,
6896 		    v.v_maxsyspri - 2);
6897 	} else {
6898 		/* notify the ownership thread */
6899 		cv_signal(&vdc->ownership_cv);
6900 	}
6901 	mutex_exit(&vdc->lock);
6902 }
6903 
6904 /*
6905  * Get the size and the block size of a virtual disk from the vdisk server.
6906  */
6907 static int
6908 vdc_get_capacity(vdc_t *vdc, size_t *dsk_size, size_t *blk_size)
6909 {
6910 	int rv = 0;
6911 	size_t alloc_len;
6912 	vd_capacity_t *vd_cap;
6913 
6914 	ASSERT(MUTEX_NOT_HELD(&vdc->lock));
6915 
6916 	alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t));
6917 
6918 	vd_cap = kmem_zalloc(alloc_len, KM_SLEEP);
6919 
6920 	rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len,
6921 	    0, 0, VIO_both_dir, B_TRUE);
6922 
6923 	*dsk_size = vd_cap->vdisk_size;
6924 	*blk_size = vd_cap->vdisk_block_size;
6925 
6926 	kmem_free(vd_cap, alloc_len);
6927 	return (rv);
6928 }
6929 
6930 /*
6931  * Check the disk capacity. Disk size information is updated if size has
6932  * changed.
6933  *
6934  * Return 0 if the disk capacity is available, or non-zero if it is not.
6935  */
6936 static int
6937 vdc_check_capacity(vdc_t *vdc)
6938 {
6939 	size_t dsk_size, blk_size;
6940 	int rv;
6941 
6942 	/*
6943 	 * If the vdisk does not support the VD_OP_GET_CAPACITY operation
6944 	 * then the disk capacity has been retrieved during the handshake
6945 	 * and there's nothing more to do here.
6946 	 */
6947 	if (!VD_OP_SUPPORTED(vdc->operations, VD_OP_GET_CAPACITY))
6948 		return (0);
6949 
6950 	if ((rv = vdc_get_capacity(vdc, &dsk_size, &blk_size)) != 0)
6951 		return (rv);
6952 
6953 	if (dsk_size == VD_SIZE_UNKNOWN || dsk_size == 0 || blk_size == 0)
6954 		return (EINVAL);
6955 
6956 	mutex_enter(&vdc->lock);
6957 	/*
6958 	 * First try to update the VIO block size (which is the same as the
6959 	 * vdisk block size). If this fails then we cannot use that block
6960 	 * size, which makes the vdisk unusable, so we return an error.
6961 	 */
6962 	rv = vdc_update_vio_bsize(vdc, blk_size);
6963 	if (rv == 0)
6964 		vdc_update_size(vdc, dsk_size, blk_size, vdc->max_xfer_sz);
6965 
6966 	mutex_exit(&vdc->lock);
6967 
6968 	return (rv);
6969 }
6970 
6971 /*
6972  * This structure is used in the DKIO(7I) array below.
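 *
 * For example, the DKIOCGVTOC entry in the table pairs that ioctl with
 * VD_OP_GET_VTOC and vdc_get_vtoc_convert(), so the generic
 * vd_process_ioctl() path can copyin, convert, submit the VD operation,
 * convert back and copyout without any ioctl-specific code.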
6974  */
6975 typedef struct vdc_dk_ioctl {
6976 	uint8_t		op;		/* VD_OP_XXX value */
6977 	int		cmd;		/* Solaris ioctl operation number */
6978 	size_t		nbytes;		/* size of structure to be copied */
6979 
6980 	/* function to convert between vDisk and Solaris structure formats */
6981 	int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg,
6982 	    int mode, int dir);
6983 } vdc_dk_ioctl_t;
6984 
6985 /*
6986  * Subset of DKIO(7I) operations currently supported
6987  */
6988 static vdc_dk_ioctl_t	dk_ioctl[] = {
6989 	{VD_OP_FLUSH,		DKIOCFLUSHWRITECACHE,	0,
6990 		vdc_null_copy_func},
6991 	{VD_OP_GET_WCE,		DKIOCGETWCE,		sizeof (int),
6992 		vdc_get_wce_convert},
6993 	{VD_OP_SET_WCE,		DKIOCSETWCE,		sizeof (int),
6994 		vdc_set_wce_convert},
6995 	{VD_OP_GET_VTOC,	DKIOCGVTOC,		sizeof (vd_vtoc_t),
6996 		vdc_get_vtoc_convert},
6997 	{VD_OP_SET_VTOC,	DKIOCSVTOC,		sizeof (vd_vtoc_t),
6998 		vdc_set_vtoc_convert},
6999 	{VD_OP_GET_VTOC,	DKIOCGEXTVTOC,		sizeof (vd_vtoc_t),
7000 		vdc_get_extvtoc_convert},
7001 	{VD_OP_SET_VTOC,	DKIOCSEXTVTOC,		sizeof (vd_vtoc_t),
7002 		vdc_set_extvtoc_convert},
7003 	{VD_OP_GET_DISKGEOM,	DKIOCGGEOM,		sizeof (vd_geom_t),
7004 		vdc_get_geom_convert},
7005 	{VD_OP_GET_DISKGEOM,	DKIOCG_PHYGEOM,		sizeof (vd_geom_t),
7006 		vdc_get_geom_convert},
7007 	{VD_OP_GET_DISKGEOM,	DKIOCG_VIRTGEOM,	sizeof (vd_geom_t),
7008 		vdc_get_geom_convert},
7009 	{VD_OP_SET_DISKGEOM,	DKIOCSGEOM,		sizeof (vd_geom_t),
7010 		vdc_set_geom_convert},
7011 	{VD_OP_GET_EFI,		DKIOCGETEFI,		0,
7012 		vdc_get_efi_convert},
7013 	{VD_OP_SET_EFI,		DKIOCSETEFI,		0,
7014 		vdc_set_efi_convert},
7015 
7016 	/* DIOCTL_RWCMD is converted to a read or a write */
7017 	{0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL},
7018 
7019 	/* mhd(7I) non-shared multihost disks ioctls */
7020 	{0, MHIOCTKOWN,				0, vdc_null_copy_func},
7021 	{0, MHIOCRELEASE,			0, vdc_null_copy_func},
7022 	{0, MHIOCSTATUS,			0, vdc_null_copy_func},
7023 	{0, MHIOCQRESERVE,			0, vdc_null_copy_func},
7024 
7025 	/* mhd(7I) shared multihost disks ioctls */
7026 	{0, MHIOCGRP_INKEYS,			0, vdc_null_copy_func},
7027 	{0, MHIOCGRP_INRESV,			0, vdc_null_copy_func},
7028 	{0, MHIOCGRP_REGISTER,			0, vdc_null_copy_func},
7029 	{0, MHIOCGRP_RESERVE,			0, vdc_null_copy_func},
7030 	{0, MHIOCGRP_PREEMPTANDABORT,		0, vdc_null_copy_func},
7031 	{0, MHIOCGRP_REGISTERANDIGNOREKEY,	0, vdc_null_copy_func},
7032 
7033 	/* mhd(7I) failfast ioctl */
7034 	{0, MHIOCENFAILFAST,			0, vdc_null_copy_func},
7035 
7036 	/*
7037 	 * These particular ioctls are not sent to the server - vdc fakes up
7038 	 * the necessary info.
7039 	 */
7040 	{0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func},
7041 	{0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func},
7042 	{0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func},
7043 	{0, DKIOCPARTITION, 0, vdc_null_copy_func },
7044 	{0, DKIOCGAPART, 0, vdc_null_copy_func },
7045 	{0, DKIOCREMOVABLE, 0, vdc_null_copy_func},
7046 	{0, CDROMREADOFFSET, 0, vdc_null_copy_func}
7047 };
7048 
7049 /*
7050  * This function handles ioctl requests from the vd_efi_alloc_and_read()
7051  * function and forwards them to the vdisk.
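 *
 * The request is re-issued against our own minor node (slice 0) with
 * FKIOCTL set, so the normal vd_process_ioctl() path handles it as a
 * kernel-internal ioctl.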
7052 */ 7053 static int 7054 vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg) 7055 { 7056 vdc_t *vdc = (vdc_t *)vdisk; 7057 dev_t dev; 7058 int rval; 7059 7060 dev = makedevice(ddi_driver_major(vdc->dip), 7061 VD_MAKE_DEV(vdc->instance, 0)); 7062 7063 return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval)); 7064 } 7065 7066 /* 7067 * Function: 7068 * vd_process_ioctl() 7069 * 7070 * Description: 7071 * This routine processes disk specific ioctl calls 7072 * 7073 * Arguments: 7074 * dev - the device number 7075 * cmd - the operation [dkio(7I)] to be processed 7076 * arg - pointer to user provided structure 7077 * (contains data to be set or reference parameter for get) 7078 * mode - bit flag, indicating open settings, 32/64 bit type, etc 7079 * rvalp - pointer to return value for calling process. 7080 * 7081 * Return Code: 7082 * 0 7083 * EFAULT 7084 * ENXIO 7085 * EIO 7086 * ENOTSUP 7087 */ 7088 static int 7089 vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp) 7090 { 7091 int instance = VDCUNIT(dev); 7092 vdc_t *vdc = NULL; 7093 int rv = -1; 7094 int idx = 0; /* index into dk_ioctl[] */ 7095 size_t len = 0; /* #bytes to send to vds */ 7096 size_t alloc_len = 0; /* #bytes to allocate mem for */ 7097 caddr_t mem_p = NULL; 7098 size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0])); 7099 vdc_dk_ioctl_t *iop; 7100 7101 vdc = ddi_get_soft_state(vdc_state, instance); 7102 if (vdc == NULL) { 7103 cmn_err(CE_NOTE, "![%d] Could not get soft state structure", 7104 instance); 7105 return (ENXIO); 7106 } 7107 7108 DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n", 7109 instance, cmd, dev, ddi_model_convert_from(mode & FMODELS)); 7110 7111 if (rvalp != NULL) { 7112 /* the return value of the ioctl is 0 by default */ 7113 *rvalp = 0; 7114 } 7115 7116 /* 7117 * Validate the ioctl operation to be performed. 7118 * 7119 * If we have looped through the array without finding a match then we 7120 * don't support this ioctl. 
7121 */ 7122 for (idx = 0; idx < nioctls; idx++) { 7123 if (cmd == dk_ioctl[idx].cmd) 7124 break; 7125 } 7126 7127 if (idx >= nioctls) { 7128 DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n", 7129 vdc->instance, cmd); 7130 return (ENOTSUP); 7131 } 7132 7133 iop = &(dk_ioctl[idx]); 7134 7135 if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) { 7136 /* size is not fixed for EFI ioctls, it depends on ioctl arg */ 7137 dk_efi_t dk_efi; 7138 7139 rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode); 7140 if (rv != 0) 7141 return (EFAULT); 7142 7143 len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length; 7144 } else { 7145 len = iop->nbytes; 7146 } 7147 7148 /* check if the ioctl is applicable */ 7149 switch (cmd) { 7150 case CDROMREADOFFSET: 7151 case DKIOCREMOVABLE: 7152 return (ENOTTY); 7153 7154 case USCSICMD: 7155 case MHIOCTKOWN: 7156 case MHIOCSTATUS: 7157 case MHIOCQRESERVE: 7158 case MHIOCRELEASE: 7159 case MHIOCGRP_INKEYS: 7160 case MHIOCGRP_INRESV: 7161 case MHIOCGRP_REGISTER: 7162 case MHIOCGRP_RESERVE: 7163 case MHIOCGRP_PREEMPTANDABORT: 7164 case MHIOCGRP_REGISTERANDIGNOREKEY: 7165 case MHIOCENFAILFAST: 7166 if (vdc->cinfo == NULL) 7167 return (ENXIO); 7168 if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS) 7169 return (ENOTTY); 7170 break; 7171 7172 case DIOCTL_RWCMD: 7173 if (vdc->cinfo == NULL) 7174 return (ENXIO); 7175 if (vdc->cinfo->dki_ctype != DKC_DIRECT) 7176 return (ENOTTY); 7177 break; 7178 7179 case DKIOCINFO: 7180 if (vdc->cinfo == NULL) 7181 return (ENXIO); 7182 break; 7183 7184 case DKIOCGMEDIAINFO: 7185 if (vdc->minfo == NULL) 7186 return (ENXIO); 7187 if (vdc_check_capacity(vdc) != 0) 7188 /* disk capacity is not available */ 7189 return (EIO); 7190 break; 7191 } 7192 7193 /* 7194 * Deal with ioctls which require a processing different than 7195 * converting ioctl arguments and sending a corresponding 7196 * VD operation. 7197 */ 7198 switch (cmd) { 7199 7200 case USCSICMD: 7201 { 7202 return (vdc_uscsi_cmd(vdc, arg, mode)); 7203 } 7204 7205 case MHIOCTKOWN: 7206 { 7207 mutex_enter(&vdc->ownership_lock); 7208 /* 7209 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership 7210 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset 7211 * while we are processing the ioctl. 7212 */ 7213 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED); 7214 7215 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 7216 VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE); 7217 if (rv == 0) { 7218 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED | 7219 VDC_OWNERSHIP_GRANTED); 7220 } else { 7221 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 7222 } 7223 mutex_exit(&vdc->ownership_lock); 7224 return (rv); 7225 } 7226 7227 case MHIOCRELEASE: 7228 { 7229 mutex_enter(&vdc->ownership_lock); 7230 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR); 7231 if (rv == 0) { 7232 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 7233 } 7234 mutex_exit(&vdc->ownership_lock); 7235 return (rv); 7236 } 7237 7238 case MHIOCSTATUS: 7239 { 7240 uint64_t status; 7241 7242 rv = vdc_access_get(vdc, &status); 7243 if (rv == 0 && rvalp != NULL) 7244 *rvalp = (status & VD_ACCESS_ALLOWED)? 
0 : 1;
7245 		return (rv);
7246 	}
7247 
7248 	case MHIOCQRESERVE:
7249 	{
7250 		rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE);
7251 		return (rv);
7252 	}
7253 
7254 	case MHIOCGRP_INKEYS:
7255 	{
7256 		return (vdc_mhd_inkeys(vdc, arg, mode));
7257 	}
7258 
7259 	case MHIOCGRP_INRESV:
7260 	{
7261 		return (vdc_mhd_inresv(vdc, arg, mode));
7262 	}
7263 
7264 	case MHIOCGRP_REGISTER:
7265 	{
7266 		return (vdc_mhd_register(vdc, arg, mode));
7267 	}
7268 
7269 	case MHIOCGRP_RESERVE:
7270 	{
7271 		return (vdc_mhd_reserve(vdc, arg, mode));
7272 	}
7273 
7274 	case MHIOCGRP_PREEMPTANDABORT:
7275 	{
7276 		return (vdc_mhd_preemptabort(vdc, arg, mode));
7277 	}
7278 
7279 	case MHIOCGRP_REGISTERANDIGNOREKEY:
7280 	{
7281 		return (vdc_mhd_registerignore(vdc, arg, mode));
7282 	}
7283 
7284 	case MHIOCENFAILFAST:
7285 	{
7286 		rv = vdc_failfast(vdc, arg, mode);
7287 		return (rv);
7288 	}
7289 
7290 	case DIOCTL_RWCMD:
7291 	{
7292 		return (vdc_dioctl_rwcmd(vdc, arg, mode));
7293 	}
7294 
7295 	case DKIOCGAPART:
7296 	{
7297 		return (vdc_dkio_gapart(vdc, arg, mode));
7298 	}
7299 
7300 	case DKIOCPARTITION:
7301 	{
7302 		return (vdc_dkio_partition(vdc, arg, mode));
7303 	}
7304 
7305 	case DKIOCINFO:
7306 	{
7307 		struct dk_cinfo cinfo;
7308 
7309 		bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo));
7310 		cinfo.dki_partition = VDCPART(dev);
7311 
7312 		rv = ddi_copyout(&cinfo, (void *)arg,
7313 		    sizeof (struct dk_cinfo), mode);
7314 		if (rv != 0)
7315 			return (EFAULT);
7316 
7317 		return (0);
7318 	}
7319 
7320 	case DKIOCGMEDIAINFO:
7321 	{
7322 		ASSERT(vdc->vdisk_size != 0);
7323 		ASSERT(vdc->minfo->dki_capacity != 0);
7324 		rv = ddi_copyout(vdc->minfo, (void *)arg,
7325 		    sizeof (struct dk_minfo), mode);
7326 		if (rv != 0)
7327 			return (EFAULT);
7328 
7329 		return (0);
7330 	}
7331 
7332 	case DKIOCFLUSHWRITECACHE:
7333 	{
7334 		struct dk_callback *dkc =
7335 		    (struct dk_callback *)(uintptr_t)arg;
7336 		vdc_dk_arg_t	*dkarg = NULL;
7337 
7338 		DMSG(vdc, 1, "[%d] Flush W$: mode %x\n",
7339 		    instance, mode);
7340 
7341 		/*
7342 		 * If arg is NULL, then there is no callback function
7343 		 * registered and the call operates synchronously; we
7344 		 * break and continue with the rest of the function and
7345 		 * wait for vds to return (i.e. after the request to
7346 		 * vds returns successfully, all writes completed prior
7347 		 * to the ioctl will have been flushed from the disk
7348 		 * write cache to persistent media).
7349 		 *
7350 		 * If a callback function is registered, we dispatch
7351 		 * the request on a task queue and return immediately.
7352 		 * The callback will deal with informing the calling
7353 		 * thread that the flush request is completed.
7354 		 */
7355 		if (dkc == NULL)
7356 			break;
7357 
7358 		/*
7359 		 * the asynchronous callback is only supported if
7360 		 * invoked from within the kernel
7361 		 */
7362 		if ((mode & FKIOCTL) == 0)
7363 			return (ENOTSUP);
7364 
7365 		dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP);
7366 
7367 		dkarg->mode = mode;
7368 		dkarg->dev = dev;
7369 		bcopy(dkc, &dkarg->dkc, sizeof (*dkc));
7370 
7371 		mutex_enter(&vdc->lock);
7372 		vdc->dkio_flush_pending++;
7373 		dkarg->vdc = vdc;
7374 		mutex_exit(&vdc->lock);
7375 
7376 		/* put the request on a task queue */
7377 		rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb,
7378 		    (void *)dkarg, DDI_SLEEP);
7379 		if (rv == NULL) {
7380 			/* clean up if dispatch fails */
7381 			mutex_enter(&vdc->lock);
7382 			vdc->dkio_flush_pending--;
7383 			mutex_exit(&vdc->lock);
7384 			kmem_free(dkarg, sizeof (vdc_dk_arg_t));
7385 		}
7386 
7387 		return (rv == NULL ?
ENOMEM : 0); 7388 } 7389 } 7390 7391 /* catch programming error in vdc - should be a VD_OP_XXX ioctl */ 7392 ASSERT(iop->op != 0); 7393 7394 /* check if the vDisk server handles the operation for this vDisk */ 7395 if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) { 7396 DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n", 7397 vdc->instance, iop->op); 7398 return (ENOTSUP); 7399 } 7400 7401 /* LDC requires that the memory being mapped is 8-byte aligned */ 7402 alloc_len = P2ROUNDUP(len, sizeof (uint64_t)); 7403 DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n", 7404 instance, len, alloc_len); 7405 7406 if (alloc_len > 0) 7407 mem_p = kmem_zalloc(alloc_len, KM_SLEEP); 7408 7409 /* 7410 * Call the conversion function for this ioctl which, if necessary, 7411 * converts from the Solaris format to the format ARC'ed 7412 * as part of the vDisk protocol (FWARC 2006/195) 7413 */ 7414 ASSERT(iop->convert != NULL); 7415 rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN); 7416 if (rv != 0) { 7417 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7418 instance, rv, cmd); 7419 if (mem_p != NULL) 7420 kmem_free(mem_p, alloc_len); 7421 return (rv); 7422 } 7423 7424 /* 7425 * send request to vds to service the ioctl. 7426 */ 7427 rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len, 7428 VDCPART(dev), 0, VIO_both_dir, B_TRUE); 7429 7430 if (rv != 0) { 7431 /* 7432 * This is not necessarily an error. The ioctl could 7433 * be returning a value such as ENOTTY to indicate 7434 * that the ioctl is not applicable. 7435 */ 7436 DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n", 7437 instance, rv, cmd); 7438 if (mem_p != NULL) 7439 kmem_free(mem_p, alloc_len); 7440 7441 return (rv); 7442 } 7443 7444 /* 7445 * Call the conversion function (if it exists) for this ioctl 7446 * which converts from the format ARC'ed as part of the vDisk 7447 * protocol (FWARC 2006/195) back to a format understood by 7448 * the rest of Solaris. 
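	 * (The same convert callback serves both directions: it was called
	 * with VD_COPYIN before the request was sent, and is called here
	 * with VD_COPYOUT on the way back.)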
7449 	 */
7450 	rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT);
7451 	if (rv != 0) {
7452 		DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
7453 		    instance, rv, cmd);
7454 		if (mem_p != NULL)
7455 			kmem_free(mem_p, alloc_len);
7456 		return (rv);
7457 	}
7458 
7459 	if (mem_p != NULL)
7460 		kmem_free(mem_p, alloc_len);
7461 
7462 	return (rv);
7463 }
7464 
7465 /*
7466  * Function:
7467  *	vdc_null_copy_func()
7468  *
7469  * Description:
7470  *	This is an empty conversion function used by ioctl calls which
7471  *	do not need to convert the data being passed in/out to userland
7472  */
7473 static int
7474 vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir)
7475 {
7476 	_NOTE(ARGUNUSED(vdc))
7477 	_NOTE(ARGUNUSED(from))
7478 	_NOTE(ARGUNUSED(to))
7479 	_NOTE(ARGUNUSED(mode))
7480 	_NOTE(ARGUNUSED(dir))
7481 
7482 	return (0);
7483 }
7484 
7485 static int
7486 vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
7487     int mode, int dir)
7488 {
7489 	_NOTE(ARGUNUSED(vdc))
7490 
7491 	if (dir == VD_COPYIN)
7492 		return (0);		/* nothing to do */
7493 
7494 	if (ddi_copyout(from, to, sizeof (int), mode) != 0)
7495 		return (EFAULT);
7496 
7497 	return (0);
7498 }
7499 
7500 static int
7501 vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
7502     int mode, int dir)
7503 {
7504 	_NOTE(ARGUNUSED(vdc))
7505 
7506 	if (dir == VD_COPYOUT)
7507 		return (0);		/* nothing to do */
7508 
7509 	if (ddi_copyin(from, to, sizeof (int), mode) != 0)
7510 		return (EFAULT);
7511 
7512 	return (0);
7513 }
7514 
7515 /*
7516  * Function:
7517  *	vdc_get_vtoc_convert()
7518  *
7519  * Description:
7520  *	This routine performs the necessary conversions from the DKIOCGVTOC
7521  *	Solaris structure to the format defined in FWARC 2006/195.
7522  *
7523  *	In the struct vtoc definition, the timestamp field is marked as not
7524  *	supported so it is not part of the vDisk protocol (FWARC 2006/195).
7525  *	However SVM uses that field to check that it can write into the VTOC,
7526  *	so we fake up the info of that field.
7527  *
7528  * Arguments:
7529  *	vdc	- the vDisk client
7530  *	from	- the buffer containing the data to be copied from
7531  *	to	- the buffer to be copied to
7532  *	mode	- flags passed to ioctl() call
7533  *	dir	- the "direction" of the copy - VD_COPYIN or VD_COPYOUT
7534  *
7535  * Return Code:
7536  *	0	- Success
7537  *	ENXIO	- incorrect buffer passed in.
7537  *	EFAULT	- ddi_copyout routine encountered an error.
7538  */
7539 static int
7540 vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7541 {
7542 	int i;
7543 	struct vtoc vtoc;
7544 	struct vtoc32 vtoc32;
7545 	struct extvtoc evtoc;
7546 	int rv;
7547 
7548 	if (dir != VD_COPYOUT)
7549 		return (0);		/* nothing to do */
7550 
7551 	if ((from == NULL) || (to == NULL))
7552 		return (ENXIO);
7553 
7554 	if (vdc->vdisk_size > VD_OLDVTOC_LIMIT)
7555 		return (EOVERFLOW);
7556 
7557 	VD_VTOC2VTOC((vd_vtoc_t *)from, &evtoc);
7558 
7559 	/* fake the VTOC timestamp field */
7560 	for (i = 0; i < V_NUMPAR; i++) {
7561 		evtoc.timestamp[i] = vdc->vtoc->timestamp[i];
7562 	}
7563 
7564 	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
7565 		/* LINTED E_ASSIGN_NARROW_CONV */
7566 		extvtoctovtoc32(evtoc, vtoc32);
7567 		rv = ddi_copyout(&vtoc32, to, sizeof (vtoc32), mode);
7568 		if (rv != 0)
7569 			rv = EFAULT;
7570 	} else {
7571 		extvtoctovtoc(evtoc, vtoc);
7572 		rv = ddi_copyout(&vtoc, to, sizeof (vtoc), mode);
7573 		if (rv != 0)
7574 			rv = EFAULT;
7575 	}
7576 
7577 	return (rv);
7578 }
7579 
7580 /*
7581  * Function:
7582  *	vdc_set_vtoc_convert()
7583  *
7584  * Description:
7585  *	This routine performs the necessary conversions from the DKIOCSVTOC
7586  *	Solaris structure to the format defined in FWARC 2006/195.
7587  *
7588  * Arguments:
7589  *	vdc	- the vDisk client
7590  *	from	- Buffer with data
7591  *	to	- Buffer where data is to be copied to
7592  *	mode	- flags passed to ioctl
7593  *	dir	- direction of copy (in or out)
7594  *
7595  * Return Code:
7596  *	0	- Success
7597  *	ENXIO	- Invalid buffer passed in
7598  *	EFAULT	- ddi_copyin of data failed
7599  */
7600 static int
7601 vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7602 {
7603 	void *uvtoc;
7604 	struct vtoc vtoc;
7605 	struct vtoc32 vtoc32;
7606 	struct extvtoc evtoc;
7607 	int i, rv;
7608 
7609 	if ((from == NULL) || (to == NULL))
7610 		return (ENXIO);
7611 
7612 	if (vdc->vdisk_size > VD_OLDVTOC_LIMIT)
7613 		return (EOVERFLOW);
7614 
7615 	uvtoc = (dir == VD_COPYIN)? from : to;
7616 
7617 	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
7618 		rv = ddi_copyin(uvtoc, &vtoc32, sizeof (vtoc32), mode);
7619 		if (rv != 0)
7620 			return (EFAULT);
7621 		vtoc32toextvtoc(vtoc32, evtoc);
7622 	} else {
7623 		rv = ddi_copyin(uvtoc, &vtoc, sizeof (vtoc), mode);
7624 		if (rv != 0)
7625 			return (EFAULT);
7626 		vtoctoextvtoc(vtoc, evtoc);
7627 	}
7628 
7629 	if (dir == VD_COPYOUT) {
7630 		/*
7631 		 * The disk label may have changed. Revalidate the disk
7632 		 * geometry. This will also update the device nodes.
7633 		 */
7634 		vdc_validate(vdc);
7635 
7636 		/*
7637 		 * We also need to keep track of the timestamp fields.
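		 * The server does not transport them (they are not part of
		 * FWARC 2006/195), so we cache whatever the caller wrote in
		 * vdc->vtoc and replay it on later DKIOCGVTOC requests (see
		 * vdc_get_vtoc_convert() above).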
7638 		 */
7639 		for (i = 0; i < V_NUMPAR; i++) {
7640 			vdc->vtoc->timestamp[i] = evtoc.timestamp[i];
7641 		}
7642 
7643 	} else {
7644 		VTOC2VD_VTOC(&evtoc, (vd_vtoc_t *)to);
7645 	}
7646 
7647 	return (0);
7648 }
7649 
7650 static int
7651 vdc_get_extvtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7652 {
7653 	int i, rv;
7654 	struct extvtoc evtoc;
7655 
7656 	if (dir != VD_COPYOUT)
7657 		return (0);		/* nothing to do */
7658 
7659 	if ((from == NULL) || (to == NULL))
7660 		return (ENXIO);
7661 
7662 	VD_VTOC2VTOC((vd_vtoc_t *)from, &evtoc);
7663 
7664 	/* fake the VTOC timestamp field */
7665 	for (i = 0; i < V_NUMPAR; i++) {
7666 		evtoc.timestamp[i] = vdc->vtoc->timestamp[i];
7667 	}
7668 
7669 	rv = ddi_copyout(&evtoc, to, sizeof (struct extvtoc), mode);
7670 	if (rv != 0)
7671 		rv = EFAULT;
7672 
7673 	return (rv);
7674 }
7675 
7676 static int
7677 vdc_set_extvtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7678 {
7679 	void *uvtoc;
7680 	struct extvtoc evtoc;
7681 	int i, rv;
7682 
7683 	if ((from == NULL) || (to == NULL))
7684 		return (ENXIO);
7685 
7686 	uvtoc = (dir == VD_COPYIN)? from : to;
7687 
7688 	rv = ddi_copyin(uvtoc, &evtoc, sizeof (struct extvtoc), mode);
7689 	if (rv != 0)
7690 		return (EFAULT);
7691 
7692 	if (dir == VD_COPYOUT) {
7693 		/*
7694 		 * The disk label may have changed. Revalidate the disk
7695 		 * geometry. This will also update the device nodes.
7696 		 */
7697 		vdc_validate(vdc);
7698 
7699 		/*
7700 		 * We also need to keep track of the timestamp fields.
7701 		 */
7702 		for (i = 0; i < V_NUMPAR; i++) {
7703 			vdc->vtoc->timestamp[i] = evtoc.timestamp[i];
7704 		}
7705 
7706 	} else {
7707 		VTOC2VD_VTOC(&evtoc, (vd_vtoc_t *)to);
7708 	}
7709 
7710 	return (0);
7711 }
7712 
7713 /*
7714  * Function:
7715  *	vdc_get_geom_convert()
7716  *
7717  * Description:
7718  *	This routine performs the necessary conversions from the DKIOCGGEOM,
7719  *	DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format
7720  *	defined in FWARC 2006/195.
7721  *
7722  * Arguments:
7723  *	vdc	- the vDisk client
7724  *	from	- Buffer with data
7725  *	to	- Buffer where data is to be copied to
7726  *	mode	- flags passed to ioctl
7727  *	dir	- direction of copy (in or out)
7728  *
7729  * Return Code:
7730  *	0	- Success
7731  *	ENXIO	- Invalid buffer passed in
7732  *	EFAULT	- ddi_copyout of data failed
7733  */
7734 static int
7735 vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7736 {
7737 	_NOTE(ARGUNUSED(vdc))
7738 
7739 	struct dk_geom geom;
7740 	int copy_len = sizeof (struct dk_geom);
7741 	int rv = 0;
7742 
7743 	if (dir != VD_COPYOUT)
7744 		return (0);		/* nothing to do */
7745 
7746 	if ((from == NULL) || (to == NULL))
7747 		return (ENXIO);
7748 
7749 	VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom);
7750 	rv = ddi_copyout(&geom, to, copy_len, mode);
7751 	if (rv != 0)
7752 		rv = EFAULT;
7753 
7754 	return (rv);
7755 }
7756 
7757 /*
7758  * Function:
7759  *	vdc_set_geom_convert()
7760  *
7761  * Description:
7762  *	This routine performs the necessary conversions from the DKIOCSGEOM
7763  *	Solaris structure to the format defined in FWARC 2006/195.
7764  *
7765  * Arguments:
7766  *	vdc	- the vDisk client
7767  *	from	- Buffer with data
7768  *	to	- Buffer where data is to be copied to
7769  *	mode	- flags passed to ioctl
7770  *	dir	- direction of copy (in or out)
7771  *
7772  * Return Code:
7773  *	0	- Success
7774  *	ENXIO	- Invalid buffer passed in
7775  *	EFAULT	- ddi_copyin of data failed
7776  */
7777 static int
7778 vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7779 {
7780 	_NOTE(ARGUNUSED(vdc))
7781 
7782 	vd_geom_t vdgeom;
7783 	void *tmp_mem = NULL;
7784 	int copy_len = sizeof (struct dk_geom);
7785 	int rv = 0;
7786 
7787 	if (dir != VD_COPYIN)
7788 		return (0);		/* nothing to do */
7789 
7790 	if ((from == NULL) || (to == NULL))
7791 		return (ENXIO);
7792 
7793 	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);
7794 
7795 	rv = ddi_copyin(from, tmp_mem, copy_len, mode);
7796 	if (rv != 0) {
7797 		kmem_free(tmp_mem, copy_len);
7798 		return (EFAULT);
7799 	}
7800 	DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom);
7801 	bcopy(&vdgeom, to, sizeof (vdgeom));
7802 	kmem_free(tmp_mem, copy_len);
7803 
7804 	return (0);
7805 }
7806 
7807 static int
7808 vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7809 {
7810 	_NOTE(ARGUNUSED(vdc))
7811 
7812 	vd_efi_t *vd_efi;
7813 	dk_efi_t dk_efi;
7814 	int rv = 0;
7815 	void *uaddr;
7816 
7817 	if ((from == NULL) || (to == NULL))
7818 		return (ENXIO);
7819 
7820 	if (dir == VD_COPYIN) {
7821 
7822 		vd_efi = (vd_efi_t *)to;
7823 
7824 		rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode);
7825 		if (rv != 0)
7826 			return (EFAULT);
7827 
7828 		vd_efi->lba = dk_efi.dki_lba;
7829 		vd_efi->length = dk_efi.dki_length;
7830 		bzero(vd_efi->data, vd_efi->length);
7831 
7832 	} else {
7833 
7834 		rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode);
7835 		if (rv != 0)
7836 			return (EFAULT);
7837 
7838 		uaddr = dk_efi.dki_data;
7839 
7840 		dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);
7841 
7842 		VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi);
7843 
7844 		rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length,
7845 		    mode);
7846 		kmem_free(dk_efi.dki_data, dk_efi.dki_length);
7847 
7848 		if (rv != 0)
7849 			return (EFAULT);
7850 	}
7851 
7852 	return (0);
7853 }
7854 
7855 static int
7856 vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
7857 {
7858 	_NOTE(ARGUNUSED(vdc))
7859 
7860 	dk_efi_t dk_efi;
7861 	void *uaddr;
7862 
7863 	if (dir == VD_COPYOUT) {
7864 		/*
7865 		 * The disk label may have changed. Revalidate the disk
7866 		 * geometry. This will also update the device nodes.
7867 		 */
7868 		vdc_validate(vdc);
7869 		return (0);
7870 	}
7871 
7872 	if ((from == NULL) || (to == NULL))
7873 		return (ENXIO);
7874 
7875 	if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0)
7876 		return (EFAULT);
7877 
7878 	uaddr = dk_efi.dki_data;
7879 
7880 	dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP);
7881 
7882 	if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) {
7883 		kmem_free(dk_efi.dki_data, dk_efi.dki_length);
7884 		return (EFAULT);
7885 	}
7886 
7887 	DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to);
7888 	kmem_free(dk_efi.dki_data, dk_efi.dki_length);
7889 
7890 	return (0);
7891 }
7892 
7893 /* -------------------------------------------------------------------------- */
7894 
7895 /*
7896  * Function:
7897  *	vdc_create_fake_geometry()
7898  *
7899  * Description:
7900  *	This routine fakes up the disk info needed for some DKIO ioctls such
7901  *	as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do]
7902  *
7903  *	Note: This function must not be called until the vDisk attributes have
7904  *	been exchanged as part of the handshake with the vDisk server.
7905  *
7906  * Arguments:
7907  *	vdc	- soft state pointer for this instance of the device driver.
7908  *
7909  * Return Code:
7910  *	none.
7911  */
7912 static void
7913 vdc_create_fake_geometry(vdc_t *vdc)
7914 {
7915 	ASSERT(vdc != NULL);
7916 	ASSERT(vdc->max_xfer_sz != 0);
7917 
7918 	/*
7919 	 * DKIOCINFO support
7920 	 */
7921 	if (vdc->cinfo == NULL)
7922 		vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP);
7923 
7924 	(void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME);
7925 	(void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME);
7926 	/* max_xfer_sz is #blocks so we don't need to divide by vdisk_bsize */
7927 	vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz;
7928 
7929 	/*
7930 	 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD
7931 	 * operation is supported, otherwise the controller type is DKC_DIRECT.
7932 	 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the
7933 	 * controller type is always DKC_DIRECT in that case.
7934 	 *
7935 	 * If the virtual disk is backed by a physical CD/DVD device or
7936 	 * an ISO image, modify the controller type to indicate this
7937 	 */
7938 	switch (vdc->vdisk_media) {
7939 	case VD_MEDIA_CD:
7940 	case VD_MEDIA_DVD:
7941 		vdc->cinfo->dki_ctype = DKC_CDROM;
7942 		break;
7943 	case VD_MEDIA_FIXED:
7944 		if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD))
7945 			vdc->cinfo->dki_ctype = DKC_SCSI_CCS;
7946 		else
7947 			vdc->cinfo->dki_ctype = DKC_DIRECT;
7948 		break;
7949 	default:
7950 		/* in the case of v1.0 we default to a fixed disk */
7951 		vdc->cinfo->dki_ctype = DKC_DIRECT;
7952 		break;
7953 	}
7954 	vdc->cinfo->dki_flags = DKI_FMTVOL;
7955 	vdc->cinfo->dki_cnum = 0;
7956 	vdc->cinfo->dki_addr = 0;
7957 	vdc->cinfo->dki_space = 0;
7958 	vdc->cinfo->dki_prio = 0;
7959 	vdc->cinfo->dki_vec = 0;
7960 	vdc->cinfo->dki_unit = vdc->instance;
7961 	vdc->cinfo->dki_slave = 0;
7962 	/*
7963 	 * The partition number will be created on the fly depending on the
7964 	 * actual slice (i.e. minor node) that is used to request the data.
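	 * (vd_process_ioctl() fills it in from VDCPART(dev) when it
	 * answers DKIOCINFO.)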
7965 	 */
7966 	vdc->cinfo->dki_partition = 0;
7967 
7968 	/*
7969 	 * DKIOCGMEDIAINFO support
7970 	 */
7971 	if (vdc->minfo == NULL)
7972 		vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);
7973 
7974 	if (vio_ver_is_supported(vdc->ver, 1, 1)) {
7975 		vdc->minfo->dki_media_type =
7976 		    VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media);
7977 	} else {
7978 		vdc->minfo->dki_media_type = DK_FIXED_DISK;
7979 	}
7980 
7981 	vdc->minfo->dki_capacity = vdc->vdisk_size;
7982 	vdc->minfo->dki_lbsize = vdc->vdisk_bsize;
7983 }
7984 
7985 static ushort_t
7986 vdc_lbl2cksum(struct dk_label *label)
7987 {
7988 	int	count;
7989 	ushort_t sum, *sp;
7990 
7991 	count = (sizeof (struct dk_label)) / (sizeof (short)) - 1;
7992 	sp = (ushort_t *)label;
7993 	sum = 0;
7994 	while (count--) {
7995 		sum ^= *sp++;
7996 	}
7997 
7998 	return (sum);
7999 }
8000 
8001 static void
8002 vdc_update_size(vdc_t *vdc, size_t dsk_size, size_t blk_size, size_t xfr_size)
8003 {
8004 	vd_err_stats_t *stp;
8005 
8006 	ASSERT(MUTEX_HELD(&vdc->lock));
8007 	ASSERT(xfr_size != 0);
8008 
8009 	/*
8010 	 * If the disk size is unknown or sizes are unchanged then don't
8011 	 * update anything.
8012 	 */
8013 	if (dsk_size == VD_SIZE_UNKNOWN || dsk_size == 0 ||
8014 	    (blk_size == vdc->vdisk_bsize && dsk_size == vdc->vdisk_size &&
8015 	    xfr_size == vdc->max_xfer_sz))
8016 		return;
8017 
8018 	/*
8019 	 * We don't know at compile time what the vDisk server will think
8020 	 * are good values but we apply a large (arbitrary) upper bound to
8021 	 * prevent memory exhaustion in vdc if it was allocating a DRing
8022 	 * based on huge values sent by the server. We probably will never
8023 	 * exceed this except if the message was garbage.
8024 	 */
8025 	if ((xfr_size * blk_size) > (PAGESIZE * DEV_BSIZE)) {
8026 		DMSG(vdc, 0, "[%d] vds block transfer size too big;"
8027 		    " using max supported by vdc", vdc->instance);
8028 		xfr_size = maxphys / blk_size;
8029 	}
8030 
8031 	vdc->max_xfer_sz = xfr_size;
8032 	vdc->vdisk_bsize = blk_size;
8033 	vdc->vdisk_size = dsk_size;
8034 
8035 	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
8036 	stp->vd_capacity.value.ui64 = dsk_size * blk_size;
8037 
8038 	vdc->minfo->dki_capacity = dsk_size;
8039 	vdc->minfo->dki_lbsize = (uint_t)blk_size;
8040 }
8041 
8042 /*
8043  * Update information about the VIO block size. The VIO block size is the
8044  * same as the vdisk block size which is stored in vdc->vdisk_bsize so we
8045  * do not store that information again.
8046  *
8047  * However, buf structures will always use a logical block size of 512 bytes
8048  * (DEV_BSIZE) and we will need to convert logical block numbers to VIO block
8049  * numbers for each read or write operation using vdc_strategy(). To speed up
8050  * this conversion, we expect the VIO block size to be a power of 2 and a
8051  * multiple of 512 bytes (DEV_BSIZE), and we cache some useful information.
8052  *
8053  * The function returns EINVAL if the new VIO block size (blk_size) is not a
8054  * power of 2 or not a multiple of 512 bytes, otherwise it returns 0.
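 *
 * As a worked example (values assumed for illustration): for an 8K
 * vdisk block size,
 *
 *	ratio      = 8192 / DEV_BSIZE = 16
 *	vio_bshift = 4			(16 == 1 << 4)
 *	vio_bmask  = 0xf
 *
 * so a DEV_BSIZE block number converts to a VIO block number with
 * "blkno >> vio_bshift", and "blkno & vio_bmask" gives its 512-byte
 * offset within that VIO block.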
8055  */
8056 static int
8057 vdc_update_vio_bsize(vdc_t *vdc, uint32_t blk_size)
8058 {
8059 	uint32_t ratio, n;
8060 	int nshift = 0;
8061 
8062 	vdc->vio_bmask = 0;
8063 	vdc->vio_bshift = 0;
8064 
8065 	ASSERT(blk_size > 0);
8066 
8067 	if ((blk_size % DEV_BSIZE) != 0)
8068 		return (EINVAL);
8069 
8070 	ratio = blk_size / DEV_BSIZE;
8071 
8072 	for (n = ratio; n > 1; n >>= 1) {
8073 		if ((n & 0x1) != 0) {
8074 			/* blk_size is not a power of 2 */
8075 			return (EINVAL);
8076 		}
8077 		nshift++;
8078 	}
8079 
8080 	vdc->vio_bshift = nshift;
8081 	vdc->vio_bmask = ratio - 1;
8082 
8083 	return (0);
8084 }
8085 
8086 /*
8087  * Function:
8088  *	vdc_validate_geometry
8089  *
8090  * Description:
8091  *	This routine discovers the label and geometry of the disk. It stores
8092  *	the disk label and related information in the vdc structure. If it
8093  *	fails to validate the geometry or to discover the disk label then
8094  *	the label is marked as unknown (VD_DISK_LABEL_UNK).
8095  *
8096  * Arguments:
8097  *	vdc	- soft state pointer for this instance of the device driver.
8098  *
8099  * Return Code:
8100  *	0	- success.
8101  *	EINVAL	- unknown disk label.
8102  *	ENOTSUP	- geometry not applicable (EFI label).
8103  *	EIO	- error accessing the disk.
8104  */
8105 static int
8106 vdc_validate_geometry(vdc_t *vdc)
8107 {
8108 	dev_t	dev;
8109 	int	rv, rval;
8110 	struct dk_label *label;
8111 	struct dk_geom geom;
8112 	struct extvtoc vtoc;
8113 	efi_gpt_t *gpt;
8114 	efi_gpe_t *gpe;
8115 	vd_efi_dev_t edev;
8116 
8117 	ASSERT(vdc != NULL);
8118 	ASSERT(vdc->vtoc != NULL && vdc->geom != NULL);
8119 	ASSERT(MUTEX_HELD(&vdc->lock));
8120 
8121 	mutex_exit(&vdc->lock);
8122 	/*
8123 	 * Check the disk capacity in case it has changed. If that fails then
8124 	 * we proceed and we will be using the disk size we currently have.
8125 	 */
8126 	(void) vdc_check_capacity(vdc);
8127 	dev = makedevice(ddi_driver_major(vdc->dip),
8128 	    VD_MAKE_DEV(vdc->instance, 0));
8129 
8130 	rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval);
8131 	if (rv == 0)
8132 		rv = vd_process_ioctl(dev, DKIOCGEXTVTOC, (caddr_t)&vtoc,
8133 		    FKIOCTL, &rval);
8134 
8135 	if (rv == ENOTSUP) {
8136 		/*
8137 		 * If the device does not support VTOC then we try
8138 		 * to read an EFI label.
8139 		 *
8140 		 * We need to know the block size and the disk size to
8141 		 * be able to read an EFI label.
8142 		 */
8143 		if (vdc->vdisk_size == 0) {
8144 			mutex_enter(&vdc->lock);
8145 			vdc_store_label_unk(vdc);
8146 			return (EIO);
8147 		}
8148 
8149 		VDC_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl);
8150 
8151 		rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe);
8152 
8153 		if (rv) {
8154 			DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)",
8155 			    vdc->instance, rv);
8156 			mutex_enter(&vdc->lock);
8157 			vdc_store_label_unk(vdc);
8158 			return (EIO);
8159 		}
8160 
8161 		mutex_enter(&vdc->lock);
8162 		vdc_store_label_efi(vdc, gpt, gpe);
8163 		vd_efi_free(&edev, gpt, gpe);
8164 		return (ENOTSUP);
8165 	}
8166 
8167 	if (rv != 0) {
8168 		DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)",
8169 		    vdc->instance, rv);
8170 		mutex_enter(&vdc->lock);
8171 		vdc_store_label_unk(vdc);
8172 		if (rv != EINVAL)
8173 			rv = EIO;
8174 		return (rv);
8175 	}
8176 
8177 	/* check that geometry and vtoc are valid */
8178 	if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 ||
8179 	    vtoc.v_sanity != VTOC_SANE) {
8180 		mutex_enter(&vdc->lock);
8181 		vdc_store_label_unk(vdc);
8182 		return (EINVAL);
8183 	}
8184 
8185 	/*
8186 	 * We have a disk and a valid VTOC. However, this does not mean
8187 	 * that the disk currently has a VTOC label.
/*
 * Function:
 *	vdc_validate_geometry
 *
 * Description:
 *	This routine discovers the label and geometry of the disk. It stores
 *	the disk label and related information in the vdc structure. If it
 *	fails to validate the geometry or to discover the disk label then
 *	the label is marked as unknown (VD_DISK_LABEL_UNK).
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- success.
 *	EINVAL	- unknown disk label.
 *	ENOTSUP	- geometry not applicable (EFI label).
 *	EIO	- error accessing the disk.
 */
static int
vdc_validate_geometry(vdc_t *vdc)
{
	dev_t	dev;
	int	rv, rval;
	struct dk_label *label;
	struct dk_geom geom;
	struct extvtoc vtoc;
	efi_gpt_t *gpt;
	efi_gpe_t *gpe;
	vd_efi_dev_t edev;

	ASSERT(vdc != NULL);
	ASSERT(vdc->vtoc != NULL && vdc->geom != NULL);
	ASSERT(MUTEX_HELD(&vdc->lock));

	mutex_exit(&vdc->lock);
	/*
	 * Check the disk capacity in case it has changed. If that fails
	 * then we proceed with the disk size we currently have.
	 */
	(void) vdc_check_capacity(vdc);
	dev = makedevice(ddi_driver_major(vdc->dip),
	    VD_MAKE_DEV(vdc->instance, 0));

	rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval);
	if (rv == 0)
		rv = vd_process_ioctl(dev, DKIOCGEXTVTOC, (caddr_t)&vtoc,
		    FKIOCTL, &rval);

	if (rv == ENOTSUP) {
		/*
		 * If the device does not support VTOC then we try
		 * to read an EFI label.
		 *
		 * We need to know the block size and the disk size to
		 * be able to read an EFI label.
		 */
		if (vdc->vdisk_size == 0) {
			mutex_enter(&vdc->lock);
			vdc_store_label_unk(vdc);
			return (EIO);
		}

		VDC_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl);

		rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe);

		if (rv) {
			DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)",
			    vdc->instance, rv);
			mutex_enter(&vdc->lock);
			vdc_store_label_unk(vdc);
			return (EIO);
		}

		mutex_enter(&vdc->lock);
		vdc_store_label_efi(vdc, gpt, gpe);
		vd_efi_free(&edev, gpt, gpe);
		return (ENOTSUP);
	}

	if (rv != 0) {
		DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)",
		    vdc->instance, rv);
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		if (rv != EINVAL)
			rv = EIO;
		return (rv);
	}

	/* check that geometry and vtoc are valid */
	if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 ||
	    vtoc.v_sanity != VTOC_SANE) {
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	/*
	 * We have a disk and a valid VTOC. However, this does not mean
	 * that the disk currently has a VTOC label. The returned VTOC may
	 * be a default VTOC to be used for configuring the disk (this is
	 * what is done for disk images). So we read the label from the
	 * beginning of the disk to ensure we really have a VTOC label.
	 *
	 * FUTURE: This could be the default way of reading the VTOC
	 * from the disk, as opposed to sending VD_OP_GET_VTOC to the
	 * server. This will be the default if vdc is implemented on
	 * top of cmlb.
	 */

	/*
	 * A single-slice disk does not support reads using an absolute
	 * disk offset, so we just rely on the DKIOCGEXTVTOC ioctl in
	 * that case.
	 */
	if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) {
		mutex_enter(&vdc->lock);
		if (vtoc.v_nparts != 1) {
			vdc_store_label_unk(vdc);
			return (EINVAL);
		}
		vdc_store_label_vtoc(vdc, &geom, &vtoc);
		return (0);
	}

	if (vtoc.v_nparts != V_NUMPAR) {
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	/*
	 * Most CDs/DVDs do not have a disk label and the label is
	 * generated by the disk driver. So the on-disk label check
	 * below may fail and we return now to avoid this problem.
	 */
	if (vdc->vdisk_media == VD_MEDIA_CD ||
	    vdc->vdisk_media == VD_MEDIA_DVD) {
		mutex_enter(&vdc->lock);
		vdc_store_label_vtoc(vdc, &geom, &vtoc);
		return (0);
	}

	/*
	 * Read disk label from start of disk
	 */
	label = kmem_alloc(vdc->vdisk_bsize, KM_SLEEP);

	rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)label, vdc->vdisk_bsize,
	    VD_SLICE_NONE, 0, NULL, VIO_read_dir, VDC_OP_NORMAL);

	if (rv != 0 || label->dkl_magic != DKL_MAGIC ||
	    label->dkl_cksum != vdc_lbl2cksum(label)) {
		DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n",
		    vdc->instance);
		kmem_free(label, vdc->vdisk_bsize);
		mutex_enter(&vdc->lock);
		vdc_store_label_unk(vdc);
		return (EINVAL);
	}

	kmem_free(label, vdc->vdisk_bsize);
	mutex_enter(&vdc->lock);
	vdc_store_label_vtoc(vdc, &geom, &vtoc);
	return (0);
}
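/*
 * Locking note and illustrative caller pattern (a sketch; vdc_validate()
 * below follows it): vdc_validate_geometry() must be entered with
 * vdc->lock held. It drops the lock while it talks to the server and
 * re-acquires it before every return, so the lock is held again when the
 * function comes back:
 *
 *	mutex_enter(&vdc->lock);
 *	(void) vdc_validate_geometry(vdc);
 *	... vdc->vdisk_label is now up to date ...
 *	mutex_exit(&vdc->lock);
 */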
/*
 * Function:
 *	vdc_validate
 *
 * Description:
 *	This routine discovers the label of the disk and creates the
 *	appropriate device nodes if the label has changed.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	none.
 */
static void
vdc_validate(vdc_t *vdc)
{
	vd_disk_label_t old_label;
	vd_slice_t old_slice[V_NUMPAR];
	int	rv;

	ASSERT(!MUTEX_HELD(&vdc->lock));

	mutex_enter(&vdc->lock);

	/* save the current label and vtoc */
	old_label = vdc->vdisk_label;
	bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR);

	/* check the geometry */
	(void) vdc_validate_geometry(vdc);

	/* if the disk label has changed, update device nodes */
	if (vdc->vdisk_type == VD_DISK_TYPE_DISK &&
	    vdc->vdisk_label != old_label) {

		if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
			rv = vdc_create_device_nodes_efi(vdc);
		else
			rv = vdc_create_device_nodes_vtoc(vdc);

		if (rv != 0) {
			DMSG(vdc, 0, "![%d] Failed to update device nodes",
			    vdc->instance);
		}
	}

	mutex_exit(&vdc->lock);
}

static void
vdc_validate_task(void *arg)
{
	vdc_t	*vdc = (vdc_t *)arg;

	vdc_validate(vdc);

	mutex_enter(&vdc->lock);
	ASSERT(vdc->validate_pending > 0);
	vdc->validate_pending--;
	mutex_exit(&vdc->lock);
}
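/*
 * Illustrative dispatch pattern (a sketch, not code from this driver; the
 * taskq handle is hypothetical): vdc_validate_task() is meant to run from
 * a task queue, and the dispatcher is expected to bump validate_pending
 * before queueing, which is what the ASSERT above relies on:
 *
 *	mutex_enter(&vdc->lock);
 *	vdc->validate_pending++;
 *	mutex_exit(&vdc->lock);
 *	(void) ddi_taskq_dispatch(taskq, vdc_validate_task, (void *)vdc,
 *	    DDI_SLEEP);
 */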
/*
 * Function:
 *	vdc_setup_devid()
 *
 * Description:
 *	This routine discovers the devid of a vDisk. It requests the devid
 *	of the underlying device from the vDisk server, builds an
 *	encapsulated devid based on the retrieved devid and registers that
 *	new devid for the vDisk.
 *
 * Arguments:
 *	vdc	- soft state pointer for this instance of the device driver.
 *
 * Return Code:
 *	0	- A devid was successfully registered for the vDisk
 */
static int
vdc_setup_devid(vdc_t *vdc)
{
	int	rv;
	vd_devid_t *vd_devid;
	size_t	bufsize, bufid_len, new_bufsize;
	ddi_devid_t vdisk_devid;
	char	*devid_str;

	/*
	 * We don't know in advance the size of the devid that the server
	 * will return, but that size is encoded in the reply. So we issue
	 * a first request using a default size, then check whether that
	 * size was large enough. If not, we issue a second request with
	 * the correct size returned by the server. Note that ldc requires
	 * the size to be 8-byte aligned.
	 */
	bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN),
	    sizeof (uint64_t));
	vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
	bufid_len = bufsize - sizeof (vd_efi_t) - 1;

	rv = vdc_do_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
	    bufsize, 0, 0, NULL, VIO_both_dir, 0);

	DMSG(vdc, 2, "do_op returned %d\n", rv);

	if (rv) {
		kmem_free(vd_devid, bufsize);
		return (rv);
	}

	if (vd_devid->length > bufid_len) {
		/*
		 * The returned devid is larger than the buffer used. Try
		 * again with a buffer of the right size. Compute the new
		 * size before freeing the old buffer so that we do not
		 * read vd_devid->length after the free.
		 */
		new_bufsize = P2ROUNDUP(VD_DEVID_SIZE(vd_devid->length),
		    sizeof (uint64_t));
		kmem_free(vd_devid, bufsize);
		bufsize = new_bufsize;
		vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
		bufid_len = bufsize - sizeof (vd_efi_t) - 1;

		rv = vdc_do_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
		    bufsize, 0, 0, NULL, VIO_both_dir, 0);

		if (rv) {
			kmem_free(vd_devid, bufsize);
			return (rv);
		}
	}

	/*
	 * The virtual disk should have the same device id as the one
	 * associated with the physical disk it is mapped on, otherwise
	 * sharing a disk between an LDom and a non-LDom may not work (for
	 * example for a shared SVM disk set).
	 *
	 * The DDI framework does not allow creating a device id with an
	 * arbitrary type, so we first create a device id of type
	 * DEVID_ENCAP and then we restore the original type of the
	 * physical device.
	 */

	DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length);

	/* build an encapsulated devid based on the returned devid */
	if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length,
	    vd_devid->id, &vdisk_devid) != DDI_SUCCESS) {
		DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance);
		kmem_free(vd_devid, bufsize);
		return (1);
	}

	DEVID_FORMTYPE((impl_devid_t *)vdisk_devid, vd_devid->type);

	ASSERT(ddi_devid_valid(vdisk_devid) == DDI_SUCCESS);

	kmem_free(vd_devid, bufsize);

	if (vdc->devid != NULL) {
		/* check that the devid hasn't changed */
		if (ddi_devid_compare(vdisk_devid, vdc->devid) == 0) {
			ddi_devid_free(vdisk_devid);
			return (0);
		}

		cmn_err(CE_WARN, "vdisk@%d backend devid has changed",
		    vdc->instance);

		devid_str = ddi_devid_str_encode(vdc->devid, NULL);

		cmn_err(CE_CONT, "vdisk@%d backend initial devid: %s",
		    vdc->instance,
		    (devid_str)? devid_str : "<encoding error>");

		if (devid_str)
			ddi_devid_str_free(devid_str);

		devid_str = ddi_devid_str_encode(vdisk_devid, NULL);

		cmn_err(CE_CONT, "vdisk@%d backend current devid: %s",
		    vdc->instance,
		    (devid_str)? devid_str : "<encoding error>");

		if (devid_str)
			ddi_devid_str_free(devid_str);

		ddi_devid_free(vdisk_devid);
		return (1);
	}

	if (ddi_devid_register(vdc->dip, vdisk_devid) != DDI_SUCCESS) {
		DMSG(vdc, 1, "[%d] Failed to register devid\n",
		    vdc->instance);
		ddi_devid_free(vdisk_devid);
		return (1);
	}

	vdc->devid = vdisk_devid;

	return (0);
}
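/*
 * Worked example of the buffer sizing above (illustrative; the 64-byte
 * length is an assumption): if the server reports a devid of length 64
 * while the first buffer only allowed for VD_DEVID_DEFAULT_LEN bytes of
 * id data, then vd_devid->length (64) exceeds bufid_len, so the buffer
 * is re-allocated with bufsize = P2ROUNDUP(VD_DEVID_SIZE(64), 8) and
 * VD_OP_GET_DEVID is issued a second time.
 */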
static void
vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe)
{
	int	i, nparts;

	ASSERT(MUTEX_HELD(&vdc->lock));

	vdc->vdisk_label = VD_DISK_LABEL_EFI;
	bzero(vdc->vtoc, sizeof (struct extvtoc));
	bzero(vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);

	nparts = gpt->efi_gpt_NumberOfPartitionEntries;

	for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) {

		if (gpe[i].efi_gpe_StartingLBA == 0 &&
		    gpe[i].efi_gpe_EndingLBA == 0) {
			continue;
		}

		vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA;
		vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA -
		    gpe[i].efi_gpe_StartingLBA + 1;
	}

	ASSERT(vdc->vdisk_size != 0);
	vdc->slice[VD_EFI_WD_SLICE].start = 0;
	vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size;
}

static void
vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct extvtoc *vtoc)
{
	int	i;

	ASSERT(MUTEX_HELD(&vdc->lock));
	ASSERT(vdc->vdisk_bsize == vtoc->v_sectorsz);

	vdc->vdisk_label = VD_DISK_LABEL_VTOC;
	bcopy(vtoc, vdc->vtoc, sizeof (struct extvtoc));
	bcopy(geom, vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);

	for (i = 0; i < vtoc->v_nparts; i++) {
		vdc->slice[i].start = vtoc->v_part[i].p_start;
		vdc->slice[i].nblocks = vtoc->v_part[i].p_size;
	}
}

static void
vdc_store_label_unk(vdc_t *vdc)
{
	ASSERT(MUTEX_HELD(&vdc->lock));

	vdc->vdisk_label = VD_DISK_LABEL_UNK;
	bzero(vdc->vtoc, sizeof (struct extvtoc));
	bzero(vdc->geom, sizeof (struct dk_geom));
	bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR);
}
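/*
 * Note: vdc_validate_geometry() stores exactly one of the three label
 * states above on each probe: VD_DISK_LABEL_EFI when a GPT is found,
 * VD_DISK_LABEL_VTOC when a valid VTOC is found, and VD_DISK_LABEL_UNK
 * otherwise. vdc_validate() then compares the newly stored label against
 * the previous one to decide whether the device nodes must be re-created.
 */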