1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * LDoms virtual disk client (vdc) device driver 29 * 30 * This driver runs on a guest logical domain and communicates with the virtual 31 * disk server (vds) driver running on the service domain which is exporting 32 * virtualized "disks" to the guest logical domain. 33 * 34 * The driver can be divided into four sections: 35 * 36 * 1) generic device driver housekeeping 37 * _init, _fini, attach, detach, ops structures, etc. 38 * 39 * 2) communication channel setup 40 * Setup the communications link over the LDC channel that vdc uses to 41 * talk to the vDisk server. Initialise the descriptor ring which 42 * allows the LDC clients to transfer data via memory mappings. 43 * 44 * 3) Support exported to upper layers (filesystems, etc) 45 * The upper layers call into vdc via strategy(9E) and DKIO(7I) 46 * ioctl calls. vdc will copy the data to be written to the descriptor 47 * ring or maps the buffer to store the data read by the vDisk 48 * server into the descriptor ring. It then sends a message to the 49 * vDisk server requesting it to complete the operation. 50 * 51 * 4) Handling responses from vDisk server. 52 * The vDisk server will ACK some or all of the messages vdc sends to it 53 * (this is configured during the handshake). Upon receipt of an ACK 54 * vdc will check the descriptor ring and signal to the upper layer 55 * code waiting on the IO. 56 */ 57 58 #include <sys/atomic.h> 59 #include <sys/conf.h> 60 #include <sys/disp.h> 61 #include <sys/ddi.h> 62 #include <sys/dkio.h> 63 #include <sys/efi_partition.h> 64 #include <sys/fcntl.h> 65 #include <sys/file.h> 66 #include <sys/kstat.h> 67 #include <sys/mach_descrip.h> 68 #include <sys/modctl.h> 69 #include <sys/mdeg.h> 70 #include <sys/note.h> 71 #include <sys/open.h> 72 #include <sys/sdt.h> 73 #include <sys/stat.h> 74 #include <sys/sunddi.h> 75 #include <sys/types.h> 76 #include <sys/promif.h> 77 #include <sys/var.h> 78 #include <sys/vtoc.h> 79 #include <sys/archsystm.h> 80 #include <sys/sysmacros.h> 81 82 #include <sys/cdio.h> 83 #include <sys/dktp/fdisk.h> 84 #include <sys/dktp/dadkio.h> 85 #include <sys/mhd.h> 86 #include <sys/scsi/generic/sense.h> 87 #include <sys/scsi/impl/uscsi.h> 88 #include <sys/scsi/impl/services.h> 89 #include <sys/scsi/targets/sddef.h> 90 91 #include <sys/ldoms.h> 92 #include <sys/ldc.h> 93 #include <sys/vio_common.h> 94 #include <sys/vio_mailbox.h> 95 #include <sys/vio_util.h> 96 #include <sys/vdsk_common.h> 97 #include <sys/vdsk_mailbox.h> 98 #include <sys/vdc.h> 99 100 #define VD_OLDVTOC_LIMIT 0x7fffffff 101 102 /* 103 * function prototypes 104 */ 105 106 /* standard driver functions */ 107 static int vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred); 108 static int vdc_close(dev_t dev, int flag, int otyp, cred_t *cred); 109 static int vdc_strategy(struct buf *buf); 110 static int vdc_print(dev_t dev, char *str); 111 static int vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk); 112 static int vdc_read(dev_t dev, struct uio *uio, cred_t *cred); 113 static int vdc_write(dev_t dev, struct uio *uio, cred_t *cred); 114 static int vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, 115 cred_t *credp, int *rvalp); 116 static int vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred); 117 static int vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred); 118 119 static int vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, 120 void *arg, void **resultp); 121 static int vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 122 static int vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 123 static int vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, 124 int mod_flags, char *name, caddr_t valuep, int *lengthp); 125 126 /* setup */ 127 static void vdc_min(struct buf *bufp); 128 static int vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen); 129 static int vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr); 130 static int vdc_start_ldc_connection(vdc_t *vdc); 131 static int vdc_create_device_nodes(vdc_t *vdc); 132 static int vdc_create_device_nodes_efi(vdc_t *vdc); 133 static int vdc_create_device_nodes_vtoc(vdc_t *vdc); 134 static void vdc_create_io_kstats(vdc_t *vdc); 135 static void vdc_create_err_kstats(vdc_t *vdc); 136 static void vdc_set_err_kstats(vdc_t *vdc); 137 static int vdc_get_md_node(dev_info_t *dip, md_t **mdpp, 138 mde_cookie_t *vd_nodep); 139 static int vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep); 140 static void vdc_fini_ports(vdc_t *vdc); 141 static void vdc_switch_server(vdc_t *vdcp); 142 static int vdc_do_ldc_up(vdc_t *vdc); 143 static void vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr); 144 static int vdc_init_descriptor_ring(vdc_t *vdc); 145 static void vdc_destroy_descriptor_ring(vdc_t *vdc); 146 static int vdc_setup_devid(vdc_t *vdc); 147 static void vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *); 148 static void vdc_store_label_vtoc(vdc_t *, struct dk_geom *, 149 struct extvtoc *); 150 static void vdc_store_label_unk(vdc_t *vdc); 151 static boolean_t vdc_is_opened(vdc_t *vdc); 152 static void vdc_update_size(vdc_t *vdc, size_t, size_t, size_t); 153 154 /* handshake with vds */ 155 static int vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver); 156 static int vdc_ver_negotiation(vdc_t *vdcp); 157 static int vdc_init_attr_negotiation(vdc_t *vdc); 158 static int vdc_attr_negotiation(vdc_t *vdcp); 159 static int vdc_init_dring_negotiate(vdc_t *vdc); 160 static int vdc_dring_negotiation(vdc_t *vdcp); 161 static int vdc_send_rdx(vdc_t *vdcp); 162 static int vdc_rdx_exchange(vdc_t *vdcp); 163 static boolean_t vdc_is_supported_version(vio_ver_msg_t *ver_msg); 164 165 /* processing incoming messages from vDisk server */ 166 static void vdc_process_msg_thread(vdc_t *vdc); 167 static int vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp); 168 169 static uint_t vdc_handle_cb(uint64_t event, caddr_t arg); 170 static int vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg); 171 static int vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg); 172 static int vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg); 173 static int vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg); 174 static int vdc_send_request(vdc_t *vdcp, int operation, 175 caddr_t addr, size_t nbytes, int slice, diskaddr_t offset, 176 int cb_type, void *cb_arg, vio_desc_direction_t dir); 177 static int vdc_map_to_shared_dring(vdc_t *vdcp, int idx); 178 static int vdc_populate_descriptor(vdc_t *vdcp, int operation, 179 caddr_t addr, size_t nbytes, int slice, diskaddr_t offset, 180 int cb_type, void *cb_arg, vio_desc_direction_t dir); 181 static int vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, 182 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 183 void *cb_arg, vio_desc_direction_t dir, boolean_t); 184 185 static int vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp); 186 static int vdc_drain_response(vdc_t *vdcp, struct buf *buf); 187 static int vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx); 188 static int vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep); 189 static int vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg); 190 191 /* dkio */ 192 static int vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, 193 int *rvalp); 194 static int vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg); 195 static void vdc_create_fake_geometry(vdc_t *vdc); 196 static int vdc_validate_geometry(vdc_t *vdc); 197 static void vdc_validate(vdc_t *vdc); 198 static void vdc_validate_task(void *arg); 199 static int vdc_null_copy_func(vdc_t *vdc, void *from, void *to, 200 int mode, int dir); 201 static int vdc_get_wce_convert(vdc_t *vdc, void *from, void *to, 202 int mode, int dir); 203 static int vdc_set_wce_convert(vdc_t *vdc, void *from, void *to, 204 int mode, int dir); 205 static int vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, 206 int mode, int dir); 207 static int vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, 208 int mode, int dir); 209 static int vdc_get_extvtoc_convert(vdc_t *vdc, void *from, void *to, 210 int mode, int dir); 211 static int vdc_set_extvtoc_convert(vdc_t *vdc, void *from, void *to, 212 int mode, int dir); 213 static int vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, 214 int mode, int dir); 215 static int vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, 216 int mode, int dir); 217 static int vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, 218 int mode, int dir); 219 static int vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, 220 int mode, int dir); 221 222 static void vdc_ownership_update(vdc_t *vdc, int ownership_flags); 223 static int vdc_access_set(vdc_t *vdc, uint64_t flags, int mode); 224 static vdc_io_t *vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf); 225 static int vdc_failfast_check_resv(vdc_t *vdc); 226 227 /* 228 * Module variables 229 */ 230 231 /* 232 * Tunable variables to control how long vdc waits before timing out on 233 * various operations 234 */ 235 static int vdc_hshake_retries = 3; 236 237 static int vdc_timeout = 0; /* units: seconds */ 238 static int vdc_ldcup_timeout = 1; /* units: seconds */ 239 240 static uint64_t vdc_hz_min_ldc_delay; 241 static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC; 242 static uint64_t vdc_hz_max_ldc_delay; 243 static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC; 244 245 static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC; 246 static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC; 247 248 /* values for dumping - need to run in a tighter loop */ 249 static uint64_t vdc_usec_timeout_dump = 100 * MILLISEC; /* 0.1s units: ns */ 250 static int vdc_dump_retries = 100; 251 252 static uint16_t vdc_scsi_timeout = 60; /* 60s units: seconds */ 253 254 static uint64_t vdc_ownership_delay = 6 * MICROSEC; /* 6s units: usec */ 255 256 /* Count of the number of vdc instances attached */ 257 static volatile uint32_t vdc_instance_count = 0; 258 259 /* Tunable to log all SCSI errors */ 260 static boolean_t vdc_scsi_log_error = B_FALSE; 261 262 /* Soft state pointer */ 263 static void *vdc_state; 264 265 /* 266 * Controlling the verbosity of the error/debug messages 267 * 268 * vdc_msglevel - controls level of messages 269 * vdc_matchinst - 64-bit variable where each bit corresponds 270 * to the vdc instance the vdc_msglevel applies. 271 */ 272 int vdc_msglevel = 0x0; 273 uint64_t vdc_matchinst = 0ull; 274 275 /* 276 * Supported vDisk protocol version pairs. 277 * 278 * The first array entry is the latest and preferred version. 279 */ 280 static const vio_ver_t vdc_version[] = {{1, 1}}; 281 282 static struct cb_ops vdc_cb_ops = { 283 vdc_open, /* cb_open */ 284 vdc_close, /* cb_close */ 285 vdc_strategy, /* cb_strategy */ 286 vdc_print, /* cb_print */ 287 vdc_dump, /* cb_dump */ 288 vdc_read, /* cb_read */ 289 vdc_write, /* cb_write */ 290 vdc_ioctl, /* cb_ioctl */ 291 nodev, /* cb_devmap */ 292 nodev, /* cb_mmap */ 293 nodev, /* cb_segmap */ 294 nochpoll, /* cb_chpoll */ 295 vdc_prop_op, /* cb_prop_op */ 296 NULL, /* cb_str */ 297 D_MP | D_64BIT, /* cb_flag */ 298 CB_REV, /* cb_rev */ 299 vdc_aread, /* cb_aread */ 300 vdc_awrite /* cb_awrite */ 301 }; 302 303 static struct dev_ops vdc_ops = { 304 DEVO_REV, /* devo_rev */ 305 0, /* devo_refcnt */ 306 vdc_getinfo, /* devo_getinfo */ 307 nulldev, /* devo_identify */ 308 nulldev, /* devo_probe */ 309 vdc_attach, /* devo_attach */ 310 vdc_detach, /* devo_detach */ 311 nodev, /* devo_reset */ 312 &vdc_cb_ops, /* devo_cb_ops */ 313 NULL, /* devo_bus_ops */ 314 nulldev, /* devo_power */ 315 ddi_quiesce_not_needed, /* devo_quiesce */ 316 }; 317 318 static struct modldrv modldrv = { 319 &mod_driverops, 320 "virtual disk client", 321 &vdc_ops, 322 }; 323 324 static struct modlinkage modlinkage = { 325 MODREV_1, 326 &modldrv, 327 NULL 328 }; 329 330 /* -------------------------------------------------------------------------- */ 331 332 /* 333 * Device Driver housekeeping and setup 334 */ 335 336 int 337 _init(void) 338 { 339 int status; 340 341 if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0) 342 return (status); 343 if ((status = mod_install(&modlinkage)) != 0) 344 ddi_soft_state_fini(&vdc_state); 345 return (status); 346 } 347 348 int 349 _info(struct modinfo *modinfop) 350 { 351 return (mod_info(&modlinkage, modinfop)); 352 } 353 354 int 355 _fini(void) 356 { 357 int status; 358 359 if ((status = mod_remove(&modlinkage)) != 0) 360 return (status); 361 ddi_soft_state_fini(&vdc_state); 362 return (0); 363 } 364 365 static int 366 vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) 367 { 368 _NOTE(ARGUNUSED(dip)) 369 370 int instance = VDCUNIT((dev_t)arg); 371 vdc_t *vdc = NULL; 372 373 switch (cmd) { 374 case DDI_INFO_DEVT2DEVINFO: 375 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 376 *resultp = NULL; 377 return (DDI_FAILURE); 378 } 379 *resultp = vdc->dip; 380 return (DDI_SUCCESS); 381 case DDI_INFO_DEVT2INSTANCE: 382 *resultp = (void *)(uintptr_t)instance; 383 return (DDI_SUCCESS); 384 default: 385 *resultp = NULL; 386 return (DDI_FAILURE); 387 } 388 } 389 390 static int 391 vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 392 { 393 kt_did_t failfast_tid, ownership_tid; 394 int instance; 395 int rv; 396 vdc_server_t *srvr; 397 vdc_t *vdc = NULL; 398 399 switch (cmd) { 400 case DDI_DETACH: 401 /* the real work happens below */ 402 break; 403 case DDI_SUSPEND: 404 /* nothing to do for this non-device */ 405 return (DDI_SUCCESS); 406 default: 407 return (DDI_FAILURE); 408 } 409 410 ASSERT(cmd == DDI_DETACH); 411 instance = ddi_get_instance(dip); 412 DMSGX(1, "[%d] Entered\n", instance); 413 414 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 415 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 416 return (DDI_FAILURE); 417 } 418 419 /* 420 * This function is called when vdc is detached or if it has failed to 421 * attach. In that case, the attach may have fail before the vdisk type 422 * has been set so we can't call vdc_is_opened(). However as the attach 423 * has failed, we know that the vdisk is not opened and we can safely 424 * detach. 425 */ 426 if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) { 427 DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance); 428 return (DDI_FAILURE); 429 } 430 431 if (vdc->dkio_flush_pending) { 432 DMSG(vdc, 0, 433 "[%d] Cannot detach: %d outstanding DKIO flushes\n", 434 instance, vdc->dkio_flush_pending); 435 return (DDI_FAILURE); 436 } 437 438 if (vdc->validate_pending) { 439 DMSG(vdc, 0, 440 "[%d] Cannot detach: %d outstanding validate request\n", 441 instance, vdc->validate_pending); 442 return (DDI_FAILURE); 443 } 444 445 DMSG(vdc, 0, "[%d] proceeding...\n", instance); 446 447 /* If we took ownership, release ownership */ 448 mutex_enter(&vdc->ownership_lock); 449 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) { 450 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL); 451 if (rv == 0) { 452 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 453 } 454 } 455 mutex_exit(&vdc->ownership_lock); 456 457 /* mark instance as detaching */ 458 vdc->lifecycle = VDC_LC_DETACHING; 459 460 /* 461 * Try and disable callbacks to prevent another handshake. We have to 462 * disable callbacks for all servers. 463 */ 464 for (srvr = vdc->server_list; srvr != NULL; srvr = srvr->next) { 465 rv = ldc_set_cb_mode(srvr->ldc_handle, LDC_CB_DISABLE); 466 DMSG(vdc, 0, "callback disabled (ldc=%lu, rv=%d)\n", 467 srvr->ldc_id, rv); 468 } 469 470 if (vdc->initialized & VDC_THREAD) { 471 mutex_enter(&vdc->read_lock); 472 if ((vdc->read_state == VDC_READ_WAITING) || 473 (vdc->read_state == VDC_READ_RESET)) { 474 vdc->read_state = VDC_READ_RESET; 475 cv_signal(&vdc->read_cv); 476 } 477 478 mutex_exit(&vdc->read_lock); 479 480 /* wake up any thread waiting for connection to come online */ 481 mutex_enter(&vdc->lock); 482 if (vdc->state == VDC_STATE_INIT_WAITING) { 483 DMSG(vdc, 0, 484 "[%d] write reset - move to resetting state...\n", 485 instance); 486 vdc->state = VDC_STATE_RESETTING; 487 cv_signal(&vdc->initwait_cv); 488 } 489 mutex_exit(&vdc->lock); 490 491 /* now wait until state transitions to VDC_STATE_DETACH */ 492 thread_join(vdc->msg_proc_thr->t_did); 493 ASSERT(vdc->state == VDC_STATE_DETACH); 494 DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n", 495 vdc->instance); 496 } 497 498 mutex_enter(&vdc->lock); 499 500 if (vdc->initialized & VDC_DRING) 501 vdc_destroy_descriptor_ring(vdc); 502 503 vdc_fini_ports(vdc); 504 505 if (vdc->failfast_thread) { 506 failfast_tid = vdc->failfast_thread->t_did; 507 vdc->failfast_interval = 0; 508 cv_signal(&vdc->failfast_cv); 509 } else { 510 failfast_tid = 0; 511 } 512 513 if (vdc->ownership & VDC_OWNERSHIP_WANTED) { 514 ownership_tid = vdc->ownership_thread->t_did; 515 vdc->ownership = VDC_OWNERSHIP_NONE; 516 cv_signal(&vdc->ownership_cv); 517 } else { 518 ownership_tid = 0; 519 } 520 521 mutex_exit(&vdc->lock); 522 523 if (failfast_tid != 0) 524 thread_join(failfast_tid); 525 526 if (ownership_tid != 0) 527 thread_join(ownership_tid); 528 529 if (vdc->initialized & VDC_MINOR) 530 ddi_remove_minor_node(dip, NULL); 531 532 if (vdc->io_stats) { 533 kstat_delete(vdc->io_stats); 534 vdc->io_stats = NULL; 535 } 536 537 if (vdc->err_stats) { 538 kstat_delete(vdc->err_stats); 539 vdc->err_stats = NULL; 540 } 541 542 if (vdc->initialized & VDC_LOCKS) { 543 mutex_destroy(&vdc->lock); 544 mutex_destroy(&vdc->read_lock); 545 mutex_destroy(&vdc->ownership_lock); 546 cv_destroy(&vdc->initwait_cv); 547 cv_destroy(&vdc->dring_free_cv); 548 cv_destroy(&vdc->membind_cv); 549 cv_destroy(&vdc->sync_pending_cv); 550 cv_destroy(&vdc->sync_blocked_cv); 551 cv_destroy(&vdc->read_cv); 552 cv_destroy(&vdc->running_cv); 553 cv_destroy(&vdc->ownership_cv); 554 cv_destroy(&vdc->failfast_cv); 555 cv_destroy(&vdc->failfast_io_cv); 556 } 557 558 if (vdc->minfo) 559 kmem_free(vdc->minfo, sizeof (struct dk_minfo)); 560 561 if (vdc->cinfo) 562 kmem_free(vdc->cinfo, sizeof (struct dk_cinfo)); 563 564 if (vdc->vtoc) 565 kmem_free(vdc->vtoc, sizeof (struct extvtoc)); 566 567 if (vdc->geom) 568 kmem_free(vdc->geom, sizeof (struct dk_geom)); 569 570 if (vdc->devid) { 571 ddi_devid_unregister(dip); 572 ddi_devid_free(vdc->devid); 573 } 574 575 if (vdc->initialized & VDC_SOFT_STATE) 576 ddi_soft_state_free(vdc_state, instance); 577 578 DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc); 579 580 return (DDI_SUCCESS); 581 } 582 583 584 static int 585 vdc_do_attach(dev_info_t *dip) 586 { 587 int instance; 588 vdc_t *vdc = NULL; 589 int status; 590 md_t *mdp; 591 mde_cookie_t vd_node; 592 593 ASSERT(dip != NULL); 594 595 instance = ddi_get_instance(dip); 596 if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) { 597 cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure", 598 instance); 599 return (DDI_FAILURE); 600 } 601 602 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 603 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 604 return (DDI_FAILURE); 605 } 606 607 /* 608 * We assign the value to initialized in this case to zero out the 609 * variable and then set bits in it to indicate what has been done 610 */ 611 vdc->initialized = VDC_SOFT_STATE; 612 613 vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc); 614 vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc); 615 616 vdc->dip = dip; 617 vdc->instance = instance; 618 vdc->vdisk_type = VD_DISK_TYPE_UNK; 619 vdc->vdisk_label = VD_DISK_LABEL_UNK; 620 vdc->state = VDC_STATE_INIT; 621 vdc->lifecycle = VDC_LC_ATTACHING; 622 vdc->session_id = 0; 623 vdc->block_size = DEV_BSIZE; 624 vdc->max_xfer_sz = maxphys / DEV_BSIZE; 625 626 /* 627 * We assume, for now, that the vDisk server will export 'read' 628 * operations to us at a minimum (this is needed because of checks 629 * in vdc for supported operations early in the handshake process). 630 * The vDisk server will return ENOTSUP if this is not the case. 631 * The value will be overwritten during the attribute exchange with 632 * the bitmask of operations exported by server. 633 */ 634 vdc->operations = VD_OP_MASK_READ; 635 636 vdc->vtoc = NULL; 637 vdc->geom = NULL; 638 vdc->cinfo = NULL; 639 vdc->minfo = NULL; 640 641 mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL); 642 cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL); 643 cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL); 644 cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL); 645 cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL); 646 647 vdc->threads_pending = 0; 648 vdc->sync_op_pending = B_FALSE; 649 vdc->sync_op_blocked = B_FALSE; 650 cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL); 651 cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL); 652 653 mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL); 654 cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL); 655 cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL); 656 cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL); 657 658 /* init blocking msg read functionality */ 659 mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL); 660 cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL); 661 vdc->read_state = VDC_READ_IDLE; 662 663 vdc->initialized |= VDC_LOCKS; 664 665 /* get device and port MD node for this disk instance */ 666 if (vdc_get_md_node(dip, &mdp, &vd_node) != 0) { 667 cmn_err(CE_NOTE, "[%d] Could not get machine description node", 668 instance); 669 return (DDI_FAILURE); 670 } 671 672 if (vdc_init_ports(vdc, mdp, vd_node) != 0) { 673 cmn_err(CE_NOTE, "[%d] Error initialising ports", instance); 674 return (DDI_FAILURE); 675 } 676 677 (void) md_fini_handle(mdp); 678 679 /* Create the kstats for saving the I/O statistics used by iostat(1M) */ 680 vdc_create_io_kstats(vdc); 681 vdc_create_err_kstats(vdc); 682 683 /* Initialize remaining structures before starting the msg thread */ 684 vdc->vdisk_label = VD_DISK_LABEL_UNK; 685 vdc->vtoc = kmem_zalloc(sizeof (struct extvtoc), KM_SLEEP); 686 vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP); 687 vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); 688 689 /* initialize the thread responsible for managing state with server */ 690 vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread, 691 vdc, 0, &p0, TS_RUN, minclsyspri); 692 if (vdc->msg_proc_thr == NULL) { 693 cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread", 694 instance); 695 return (DDI_FAILURE); 696 } 697 698 vdc->initialized |= VDC_THREAD; 699 700 atomic_inc_32(&vdc_instance_count); 701 702 /* 703 * Check the disk label. This will send requests and do the handshake. 704 * We don't really care about the disk label now. What we really need is 705 * the handshake do be done so that we know the type of the disk (slice 706 * or full disk) and the appropriate device nodes can be created. 707 */ 708 709 mutex_enter(&vdc->lock); 710 (void) vdc_validate_geometry(vdc); 711 mutex_exit(&vdc->lock); 712 713 /* 714 * Now that we have the device info we can create the device nodes 715 */ 716 status = vdc_create_device_nodes(vdc); 717 if (status) { 718 DMSG(vdc, 0, "[%d] Failed to create device nodes", 719 instance); 720 goto return_status; 721 } 722 723 /* 724 * Setup devid 725 */ 726 if (vdc_setup_devid(vdc)) { 727 DMSG(vdc, 0, "[%d] No device id available\n", instance); 728 } 729 730 /* 731 * Fill in the fields of the error statistics kstat that were not 732 * available when creating the kstat 733 */ 734 vdc_set_err_kstats(vdc); 735 736 ddi_report_dev(dip); 737 vdc->lifecycle = VDC_LC_ONLINE; 738 DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance); 739 740 return_status: 741 DMSG(vdc, 0, "[%d] Attach completed\n", instance); 742 return (status); 743 } 744 745 static int 746 vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 747 { 748 int status; 749 750 switch (cmd) { 751 case DDI_ATTACH: 752 if ((status = vdc_do_attach(dip)) != 0) 753 (void) vdc_detach(dip, DDI_DETACH); 754 return (status); 755 case DDI_RESUME: 756 /* nothing to do for this non-device */ 757 return (DDI_SUCCESS); 758 default: 759 return (DDI_FAILURE); 760 } 761 } 762 763 static int 764 vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr) 765 { 766 int status = 0; 767 ldc_status_t ldc_state; 768 ldc_attr_t ldc_attr; 769 770 ASSERT(vdc != NULL); 771 ASSERT(srvr != NULL); 772 773 ldc_attr.devclass = LDC_DEV_BLK; 774 ldc_attr.instance = vdc->instance; 775 ldc_attr.mode = LDC_MODE_UNRELIABLE; /* unreliable transport */ 776 ldc_attr.mtu = VD_LDC_MTU; 777 778 if ((srvr->state & VDC_LDC_INIT) == 0) { 779 status = ldc_init(srvr->ldc_id, &ldc_attr, 780 &srvr->ldc_handle); 781 if (status != 0) { 782 DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d", 783 vdc->instance, srvr->ldc_id, status); 784 return (status); 785 } 786 srvr->state |= VDC_LDC_INIT; 787 } 788 status = ldc_status(srvr->ldc_handle, &ldc_state); 789 if (status != 0) { 790 DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]", 791 vdc->instance, status); 792 goto init_exit; 793 } 794 srvr->ldc_state = ldc_state; 795 796 if ((srvr->state & VDC_LDC_CB) == 0) { 797 status = ldc_reg_callback(srvr->ldc_handle, vdc_handle_cb, 798 (caddr_t)srvr); 799 if (status != 0) { 800 DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)", 801 vdc->instance, status); 802 goto init_exit; 803 } 804 srvr->state |= VDC_LDC_CB; 805 } 806 807 /* 808 * At this stage we have initialised LDC, we will now try and open 809 * the connection. 810 */ 811 if (srvr->ldc_state == LDC_INIT) { 812 status = ldc_open(srvr->ldc_handle); 813 if (status != 0) { 814 DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d", 815 vdc->instance, srvr->ldc_id, status); 816 goto init_exit; 817 } 818 srvr->state |= VDC_LDC_OPEN; 819 } 820 821 init_exit: 822 if (status) { 823 vdc_terminate_ldc(vdc, srvr); 824 } 825 826 return (status); 827 } 828 829 static int 830 vdc_start_ldc_connection(vdc_t *vdc) 831 { 832 int status = 0; 833 834 ASSERT(vdc != NULL); 835 836 ASSERT(MUTEX_HELD(&vdc->lock)); 837 838 status = vdc_do_ldc_up(vdc); 839 840 DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance); 841 842 return (status); 843 } 844 845 static int 846 vdc_stop_ldc_connection(vdc_t *vdcp) 847 { 848 int status; 849 850 ASSERT(vdcp != NULL); 851 852 ASSERT(MUTEX_HELD(&vdcp->lock)); 853 854 DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n", 855 vdcp->state); 856 857 status = ldc_down(vdcp->curr_server->ldc_handle); 858 DMSG(vdcp, 0, "ldc_down() = %d\n", status); 859 860 vdcp->initialized &= ~VDC_HANDSHAKE; 861 DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized); 862 863 return (status); 864 } 865 866 static void 867 vdc_create_io_kstats(vdc_t *vdc) 868 { 869 if (vdc->io_stats != NULL) { 870 DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance); 871 return; 872 } 873 874 vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL, 875 "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT); 876 if (vdc->io_stats != NULL) { 877 vdc->io_stats->ks_lock = &vdc->lock; 878 kstat_install(vdc->io_stats); 879 } else { 880 cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics" 881 " will not be gathered", vdc->instance); 882 } 883 } 884 885 static void 886 vdc_create_err_kstats(vdc_t *vdc) 887 { 888 vd_err_stats_t *stp; 889 char kstatmodule_err[KSTAT_STRLEN]; 890 char kstatname[KSTAT_STRLEN]; 891 int ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t)); 892 int instance = vdc->instance; 893 894 if (vdc->err_stats != NULL) { 895 DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance); 896 return; 897 } 898 899 (void) snprintf(kstatmodule_err, sizeof (kstatmodule_err), 900 "%serr", VDC_DRIVER_NAME); 901 (void) snprintf(kstatname, sizeof (kstatname), 902 "%s%d,err", VDC_DRIVER_NAME, instance); 903 904 vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname, 905 "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT); 906 907 if (vdc->err_stats == NULL) { 908 cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics" 909 " will not be gathered", instance); 910 return; 911 } 912 913 stp = (vd_err_stats_t *)vdc->err_stats->ks_data; 914 kstat_named_init(&stp->vd_softerrs, "Soft Errors", 915 KSTAT_DATA_UINT32); 916 kstat_named_init(&stp->vd_transerrs, "Transport Errors", 917 KSTAT_DATA_UINT32); 918 kstat_named_init(&stp->vd_protoerrs, "Protocol Errors", 919 KSTAT_DATA_UINT32); 920 kstat_named_init(&stp->vd_vid, "Vendor", 921 KSTAT_DATA_CHAR); 922 kstat_named_init(&stp->vd_pid, "Product", 923 KSTAT_DATA_CHAR); 924 kstat_named_init(&stp->vd_capacity, "Size", 925 KSTAT_DATA_ULONGLONG); 926 927 vdc->err_stats->ks_update = nulldev; 928 929 kstat_install(vdc->err_stats); 930 } 931 932 static void 933 vdc_set_err_kstats(vdc_t *vdc) 934 { 935 vd_err_stats_t *stp; 936 937 if (vdc->err_stats == NULL) 938 return; 939 940 mutex_enter(&vdc->lock); 941 942 stp = (vd_err_stats_t *)vdc->err_stats->ks_data; 943 ASSERT(stp != NULL); 944 945 stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->block_size; 946 (void) strcpy(stp->vd_vid.value.c, "SUN"); 947 (void) strcpy(stp->vd_pid.value.c, "VDSK"); 948 949 mutex_exit(&vdc->lock); 950 } 951 952 static int 953 vdc_create_device_nodes_efi(vdc_t *vdc) 954 { 955 ddi_remove_minor_node(vdc->dip, "h"); 956 ddi_remove_minor_node(vdc->dip, "h,raw"); 957 958 if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK, 959 VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE), 960 DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 961 cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'", 962 vdc->instance); 963 return (EIO); 964 } 965 966 /* if any device node is created we set this flag */ 967 vdc->initialized |= VDC_MINOR; 968 969 if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR, 970 VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE), 971 DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 972 cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'", 973 vdc->instance); 974 return (EIO); 975 } 976 977 return (0); 978 } 979 980 static int 981 vdc_create_device_nodes_vtoc(vdc_t *vdc) 982 { 983 ddi_remove_minor_node(vdc->dip, "wd"); 984 ddi_remove_minor_node(vdc->dip, "wd,raw"); 985 986 if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK, 987 VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE), 988 DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 989 cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'", 990 vdc->instance); 991 return (EIO); 992 } 993 994 /* if any device node is created we set this flag */ 995 vdc->initialized |= VDC_MINOR; 996 997 if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR, 998 VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE), 999 DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 1000 cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'", 1001 vdc->instance); 1002 return (EIO); 1003 } 1004 1005 return (0); 1006 } 1007 1008 /* 1009 * Function: 1010 * vdc_create_device_nodes 1011 * 1012 * Description: 1013 * This function creates the block and character device nodes under 1014 * /devices. It is called as part of the attach(9E) of the instance 1015 * during the handshake with vds after vds has sent the attributes 1016 * to vdc. 1017 * 1018 * If the device is of type VD_DISK_TYPE_SLICE then the minor node 1019 * of 2 is used in keeping with the Solaris convention that slice 2 1020 * refers to a whole disk. Slices start at 'a' 1021 * 1022 * Parameters: 1023 * vdc - soft state pointer 1024 * 1025 * Return Values 1026 * 0 - Success 1027 * EIO - Failed to create node 1028 * EINVAL - Unknown type of disk exported 1029 */ 1030 static int 1031 vdc_create_device_nodes(vdc_t *vdc) 1032 { 1033 char name[sizeof ("s,raw")]; 1034 dev_info_t *dip = NULL; 1035 int instance, status; 1036 int num_slices = 1; 1037 int i; 1038 1039 ASSERT(vdc != NULL); 1040 1041 instance = vdc->instance; 1042 dip = vdc->dip; 1043 1044 switch (vdc->vdisk_type) { 1045 case VD_DISK_TYPE_DISK: 1046 num_slices = V_NUMPAR; 1047 break; 1048 case VD_DISK_TYPE_SLICE: 1049 num_slices = 1; 1050 break; 1051 case VD_DISK_TYPE_UNK: 1052 default: 1053 return (EINVAL); 1054 } 1055 1056 /* 1057 * Minor nodes are different for EFI disks: EFI disks do not have 1058 * a minor node 'g' for the minor number corresponding to slice 1059 * VD_EFI_WD_SLICE (slice 7) instead they have a minor node 'wd' 1060 * representing the whole disk. 1061 */ 1062 for (i = 0; i < num_slices; i++) { 1063 1064 if (i == VD_EFI_WD_SLICE) { 1065 if (vdc->vdisk_label == VD_DISK_LABEL_EFI) 1066 status = vdc_create_device_nodes_efi(vdc); 1067 else 1068 status = vdc_create_device_nodes_vtoc(vdc); 1069 if (status != 0) 1070 return (status); 1071 continue; 1072 } 1073 1074 (void) snprintf(name, sizeof (name), "%c", 'a' + i); 1075 if (ddi_create_minor_node(dip, name, S_IFBLK, 1076 VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 1077 cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'", 1078 instance, name); 1079 return (EIO); 1080 } 1081 1082 /* if any device node is created we set this flag */ 1083 vdc->initialized |= VDC_MINOR; 1084 1085 (void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw"); 1086 1087 if (ddi_create_minor_node(dip, name, S_IFCHR, 1088 VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) { 1089 cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'", 1090 instance, name); 1091 return (EIO); 1092 } 1093 } 1094 1095 return (0); 1096 } 1097 1098 /* 1099 * Driver prop_op(9e) entry point function. Return the number of blocks for 1100 * the partition in question or forward the request to the property facilities. 1101 */ 1102 static int 1103 vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags, 1104 char *name, caddr_t valuep, int *lengthp) 1105 { 1106 int instance = ddi_get_instance(dip); 1107 vdc_t *vdc; 1108 uint64_t nblocks; 1109 uint_t blksize; 1110 1111 vdc = ddi_get_soft_state(vdc_state, instance); 1112 1113 if (dev == DDI_DEV_T_ANY || vdc == NULL) { 1114 return (ddi_prop_op(dev, dip, prop_op, mod_flags, 1115 name, valuep, lengthp)); 1116 } 1117 1118 mutex_enter(&vdc->lock); 1119 (void) vdc_validate_geometry(vdc); 1120 if (vdc->vdisk_label == VD_DISK_LABEL_UNK) { 1121 mutex_exit(&vdc->lock); 1122 return (ddi_prop_op(dev, dip, prop_op, mod_flags, 1123 name, valuep, lengthp)); 1124 } 1125 nblocks = vdc->slice[VDCPART(dev)].nblocks; 1126 blksize = vdc->block_size; 1127 mutex_exit(&vdc->lock); 1128 1129 return (ddi_prop_op_nblocks_blksize(dev, dip, prop_op, mod_flags, 1130 name, valuep, lengthp, nblocks, blksize)); 1131 } 1132 1133 /* 1134 * Function: 1135 * vdc_is_opened 1136 * 1137 * Description: 1138 * This function checks if any slice of a given virtual disk is 1139 * currently opened. 1140 * 1141 * Parameters: 1142 * vdc - soft state pointer 1143 * 1144 * Return Values 1145 * B_TRUE - at least one slice is opened. 1146 * B_FALSE - no slice is opened. 1147 */ 1148 static boolean_t 1149 vdc_is_opened(vdc_t *vdc) 1150 { 1151 int i, nslices; 1152 1153 switch (vdc->vdisk_type) { 1154 case VD_DISK_TYPE_DISK: 1155 nslices = V_NUMPAR; 1156 break; 1157 case VD_DISK_TYPE_SLICE: 1158 nslices = 1; 1159 break; 1160 case VD_DISK_TYPE_UNK: 1161 default: 1162 ASSERT(0); 1163 } 1164 1165 /* check if there's any layered open */ 1166 for (i = 0; i < nslices; i++) { 1167 if (vdc->open_lyr[i] > 0) 1168 return (B_TRUE); 1169 } 1170 1171 /* check if there is any other kind of open */ 1172 for (i = 0; i < OTYPCNT; i++) { 1173 if (vdc->open[i] != 0) 1174 return (B_TRUE); 1175 } 1176 1177 return (B_FALSE); 1178 } 1179 1180 static int 1181 vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp) 1182 { 1183 uint8_t slicemask; 1184 int i; 1185 1186 ASSERT(otyp < OTYPCNT); 1187 ASSERT(slice < V_NUMPAR); 1188 ASSERT(MUTEX_HELD(&vdc->lock)); 1189 1190 slicemask = 1 << slice; 1191 1192 /* check if slice is already exclusively opened */ 1193 if (vdc->open_excl & slicemask) 1194 return (EBUSY); 1195 1196 /* if open exclusive, check if slice is already opened */ 1197 if (flag & FEXCL) { 1198 if (vdc->open_lyr[slice] > 0) 1199 return (EBUSY); 1200 for (i = 0; i < OTYPCNT; i++) { 1201 if (vdc->open[i] & slicemask) 1202 return (EBUSY); 1203 } 1204 vdc->open_excl |= slicemask; 1205 } 1206 1207 /* mark slice as opened */ 1208 if (otyp == OTYP_LYR) { 1209 vdc->open_lyr[slice]++; 1210 } else { 1211 vdc->open[otyp] |= slicemask; 1212 } 1213 1214 return (0); 1215 } 1216 1217 static void 1218 vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp) 1219 { 1220 uint8_t slicemask; 1221 1222 ASSERT(otyp < OTYPCNT); 1223 ASSERT(slice < V_NUMPAR); 1224 ASSERT(MUTEX_HELD(&vdc->lock)); 1225 1226 slicemask = 1 << slice; 1227 1228 if (otyp == OTYP_LYR) { 1229 ASSERT(vdc->open_lyr[slice] > 0); 1230 vdc->open_lyr[slice]--; 1231 } else { 1232 vdc->open[otyp] &= ~slicemask; 1233 } 1234 1235 if (flag & FEXCL) 1236 vdc->open_excl &= ~slicemask; 1237 } 1238 1239 static int 1240 vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred) 1241 { 1242 _NOTE(ARGUNUSED(cred)) 1243 1244 int instance, nodelay; 1245 int slice, status = 0; 1246 vdc_t *vdc; 1247 1248 ASSERT(dev != NULL); 1249 instance = VDCUNIT(*dev); 1250 1251 if (otyp >= OTYPCNT) 1252 return (EINVAL); 1253 1254 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 1255 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 1256 return (ENXIO); 1257 } 1258 1259 DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n", 1260 getminor(*dev), flag, otyp); 1261 1262 slice = VDCPART(*dev); 1263 1264 nodelay = flag & (FNDELAY | FNONBLOCK); 1265 1266 if ((flag & FWRITE) && (!nodelay) && 1267 !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE))) { 1268 return (EROFS); 1269 } 1270 1271 mutex_enter(&vdc->lock); 1272 1273 status = vdc_mark_opened(vdc, slice, flag, otyp); 1274 1275 if (status != 0) { 1276 mutex_exit(&vdc->lock); 1277 return (status); 1278 } 1279 1280 if (nodelay) { 1281 1282 /* don't resubmit a validate request if there's already one */ 1283 if (vdc->validate_pending > 0) { 1284 mutex_exit(&vdc->lock); 1285 return (0); 1286 } 1287 1288 /* call vdc_validate() asynchronously to avoid blocking */ 1289 if (taskq_dispatch(system_taskq, vdc_validate_task, 1290 (void *)vdc, TQ_NOSLEEP) == NULL) { 1291 vdc_mark_closed(vdc, slice, flag, otyp); 1292 mutex_exit(&vdc->lock); 1293 return (ENXIO); 1294 } 1295 1296 vdc->validate_pending++; 1297 mutex_exit(&vdc->lock); 1298 return (0); 1299 } 1300 1301 mutex_exit(&vdc->lock); 1302 1303 vdc_validate(vdc); 1304 1305 mutex_enter(&vdc->lock); 1306 1307 if (vdc->vdisk_label == VD_DISK_LABEL_UNK || 1308 vdc->slice[slice].nblocks == 0) { 1309 vdc_mark_closed(vdc, slice, flag, otyp); 1310 status = EIO; 1311 } 1312 1313 mutex_exit(&vdc->lock); 1314 1315 return (status); 1316 } 1317 1318 static int 1319 vdc_close(dev_t dev, int flag, int otyp, cred_t *cred) 1320 { 1321 _NOTE(ARGUNUSED(cred)) 1322 1323 int instance; 1324 int slice; 1325 int rv, rval; 1326 vdc_t *vdc; 1327 1328 instance = VDCUNIT(dev); 1329 1330 if (otyp >= OTYPCNT) 1331 return (EINVAL); 1332 1333 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 1334 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 1335 return (ENXIO); 1336 } 1337 1338 DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp); 1339 1340 slice = VDCPART(dev); 1341 1342 /* 1343 * Attempt to flush the W$ on a close operation. If this is 1344 * not a supported IOCTL command or the backing device is read-only 1345 * do not fail the close operation. 1346 */ 1347 rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval); 1348 1349 if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) { 1350 DMSG(vdc, 0, "[%d] flush failed with error %d on close\n", 1351 instance, rv); 1352 return (EIO); 1353 } 1354 1355 mutex_enter(&vdc->lock); 1356 vdc_mark_closed(vdc, slice, flag, otyp); 1357 mutex_exit(&vdc->lock); 1358 1359 return (0); 1360 } 1361 1362 static int 1363 vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) 1364 { 1365 _NOTE(ARGUNUSED(credp)) 1366 1367 return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp)); 1368 } 1369 1370 static int 1371 vdc_print(dev_t dev, char *str) 1372 { 1373 cmn_err(CE_NOTE, "vdc%d: %s", VDCUNIT(dev), str); 1374 return (0); 1375 } 1376 1377 static int 1378 vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 1379 { 1380 int rv; 1381 size_t nbytes = nblk * DEV_BSIZE; 1382 int instance = VDCUNIT(dev); 1383 vdc_t *vdc = NULL; 1384 1385 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 1386 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 1387 return (ENXIO); 1388 } 1389 1390 DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n", 1391 instance, nbytes, blkno, (void *)addr); 1392 rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes, 1393 VDCPART(dev), blkno, CB_STRATEGY, 0, VIO_write_dir); 1394 if (rv) { 1395 DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv); 1396 return (rv); 1397 } 1398 1399 if (ddi_in_panic()) 1400 (void) vdc_drain_response(vdc, NULL); 1401 1402 DMSG(vdc, 0, "[%d] End\n", instance); 1403 1404 return (0); 1405 } 1406 1407 /* -------------------------------------------------------------------------- */ 1408 1409 /* 1410 * Disk access routines 1411 * 1412 */ 1413 1414 /* 1415 * vdc_strategy() 1416 * 1417 * Return Value: 1418 * 0: As per strategy(9E), the strategy() function must return 0 1419 * [ bioerror(9f) sets b_flags to the proper error code ] 1420 */ 1421 static int 1422 vdc_strategy(struct buf *buf) 1423 { 1424 int rv = -1; 1425 vdc_t *vdc = NULL; 1426 int instance = VDCUNIT(buf->b_edev); 1427 int op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE; 1428 int slice; 1429 1430 if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { 1431 cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance); 1432 bioerror(buf, ENXIO); 1433 biodone(buf); 1434 return (0); 1435 } 1436 1437 DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n", 1438 instance, (buf->b_flags & B_READ) ? "Read" : "Write", 1439 buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr); 1440 1441 bp_mapin(buf); 1442 1443 if ((long)buf->b_private == VD_SLICE_NONE) { 1444 /* I/O using an absolute disk offset */ 1445 slice = VD_SLICE_NONE; 1446 } else { 1447 slice = VDCPART(buf->b_edev); 1448 } 1449 1450 rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr, 1451 buf->b_bcount, slice, buf->b_lblkno, 1452 CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir : 1453 VIO_write_dir); 1454 1455 /* 1456 * If the request was successfully sent, the strategy call returns and 1457 * the ACK handler calls the bioxxx functions when the vDisk server is 1458 * done otherwise we handle the error here. 1459 */ 1460 if (rv) { 1461 DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv); 1462 bioerror(buf, rv); 1463 biodone(buf); 1464 } else if (ddi_in_panic()) { 1465 (void) vdc_drain_response(vdc, buf); 1466 } 1467 1468 return (0); 1469 } 1470 1471 /* 1472 * Function: 1473 * vdc_min 1474 * 1475 * Description: 1476 * Routine to limit the size of a data transfer. Used in 1477 * conjunction with physio(9F). 1478 * 1479 * Arguments: 1480 * bp - pointer to the indicated buf(9S) struct. 1481 * 1482 */ 1483 static void 1484 vdc_min(struct buf *bufp) 1485 { 1486 vdc_t *vdc = NULL; 1487 int instance = VDCUNIT(bufp->b_edev); 1488 1489 vdc = ddi_get_soft_state(vdc_state, instance); 1490 VERIFY(vdc != NULL); 1491 1492 if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->block_size)) { 1493 bufp->b_bcount = vdc->max_xfer_sz * vdc->block_size; 1494 } 1495 } 1496 1497 static int 1498 vdc_read(dev_t dev, struct uio *uio, cred_t *cred) 1499 { 1500 _NOTE(ARGUNUSED(cred)) 1501 1502 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1503 return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio)); 1504 } 1505 1506 static int 1507 vdc_write(dev_t dev, struct uio *uio, cred_t *cred) 1508 { 1509 _NOTE(ARGUNUSED(cred)) 1510 1511 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1512 return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio)); 1513 } 1514 1515 static int 1516 vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred) 1517 { 1518 _NOTE(ARGUNUSED(cred)) 1519 1520 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1521 return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio)); 1522 } 1523 1524 static int 1525 vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred) 1526 { 1527 _NOTE(ARGUNUSED(cred)) 1528 1529 DMSGX(1, "[%d] Entered", VDCUNIT(dev)); 1530 return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio)); 1531 } 1532 1533 1534 /* -------------------------------------------------------------------------- */ 1535 1536 /* 1537 * Handshake support 1538 */ 1539 1540 1541 /* 1542 * Function: 1543 * vdc_init_ver_negotiation() 1544 * 1545 * Description: 1546 * 1547 * Arguments: 1548 * vdc - soft state pointer for this instance of the device driver. 1549 * 1550 * Return Code: 1551 * 0 - Success 1552 */ 1553 static int 1554 vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver) 1555 { 1556 vio_ver_msg_t pkt; 1557 size_t msglen = sizeof (pkt); 1558 int status = -1; 1559 1560 ASSERT(vdc != NULL); 1561 ASSERT(mutex_owned(&vdc->lock)); 1562 1563 DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance); 1564 1565 /* 1566 * set the Session ID to a unique value 1567 * (the lower 32 bits of the clock tick) 1568 */ 1569 vdc->session_id = ((uint32_t)gettick() & 0xffffffff); 1570 DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id); 1571 1572 pkt.tag.vio_msgtype = VIO_TYPE_CTRL; 1573 pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 1574 pkt.tag.vio_subtype_env = VIO_VER_INFO; 1575 pkt.tag.vio_sid = vdc->session_id; 1576 pkt.dev_class = VDEV_DISK; 1577 pkt.ver_major = ver.major; 1578 pkt.ver_minor = ver.minor; 1579 1580 status = vdc_send(vdc, (caddr_t)&pkt, &msglen); 1581 DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n", 1582 vdc->instance, status); 1583 if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) { 1584 DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: " 1585 "id(%lx) rv(%d) size(%ld)", vdc->instance, 1586 vdc->curr_server->ldc_handle, status, msglen); 1587 if (msglen != sizeof (vio_ver_msg_t)) 1588 status = ENOMSG; 1589 } 1590 1591 return (status); 1592 } 1593 1594 /* 1595 * Function: 1596 * vdc_ver_negotiation() 1597 * 1598 * Description: 1599 * 1600 * Arguments: 1601 * vdcp - soft state pointer for this instance of the device driver. 1602 * 1603 * Return Code: 1604 * 0 - Success 1605 */ 1606 static int 1607 vdc_ver_negotiation(vdc_t *vdcp) 1608 { 1609 vio_msg_t vio_msg; 1610 int status; 1611 1612 if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0])) 1613 return (status); 1614 1615 /* release lock and wait for response */ 1616 mutex_exit(&vdcp->lock); 1617 status = vdc_wait_for_response(vdcp, &vio_msg); 1618 mutex_enter(&vdcp->lock); 1619 if (status) { 1620 DMSG(vdcp, 0, 1621 "[%d] Failed waiting for Ver negotiation response, rv(%d)", 1622 vdcp->instance, status); 1623 return (status); 1624 } 1625 1626 /* check type and sub_type ... */ 1627 if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL || 1628 vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) { 1629 DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n", 1630 vdcp->instance); 1631 return (EPROTO); 1632 } 1633 1634 return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg)); 1635 } 1636 1637 /* 1638 * Function: 1639 * vdc_init_attr_negotiation() 1640 * 1641 * Description: 1642 * 1643 * Arguments: 1644 * vdc - soft state pointer for this instance of the device driver. 1645 * 1646 * Return Code: 1647 * 0 - Success 1648 */ 1649 static int 1650 vdc_init_attr_negotiation(vdc_t *vdc) 1651 { 1652 vd_attr_msg_t pkt; 1653 size_t msglen = sizeof (pkt); 1654 int status; 1655 1656 ASSERT(vdc != NULL); 1657 ASSERT(mutex_owned(&vdc->lock)); 1658 1659 DMSG(vdc, 0, "[%d] entered\n", vdc->instance); 1660 1661 /* fill in tag */ 1662 pkt.tag.vio_msgtype = VIO_TYPE_CTRL; 1663 pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 1664 pkt.tag.vio_subtype_env = VIO_ATTR_INFO; 1665 pkt.tag.vio_sid = vdc->session_id; 1666 /* fill in payload */ 1667 pkt.max_xfer_sz = vdc->max_xfer_sz; 1668 pkt.vdisk_block_size = vdc->block_size; 1669 pkt.xfer_mode = VIO_DRING_MODE_V1_0; 1670 pkt.operations = 0; /* server will set bits of valid operations */ 1671 pkt.vdisk_type = 0; /* server will set to valid device type */ 1672 pkt.vdisk_media = 0; /* server will set to valid media type */ 1673 pkt.vdisk_size = 0; /* server will set to valid size */ 1674 1675 status = vdc_send(vdc, (caddr_t)&pkt, &msglen); 1676 DMSG(vdc, 0, "Attr info sent (status = %d)\n", status); 1677 1678 if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) { 1679 DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: " 1680 "id(%lx) rv(%d) size(%ld)", vdc->instance, 1681 vdc->curr_server->ldc_handle, status, msglen); 1682 if (msglen != sizeof (vd_attr_msg_t)) 1683 status = ENOMSG; 1684 } 1685 1686 return (status); 1687 } 1688 1689 /* 1690 * Function: 1691 * vdc_attr_negotiation() 1692 * 1693 * Description: 1694 * 1695 * Arguments: 1696 * vdc - soft state pointer for this instance of the device driver. 1697 * 1698 * Return Code: 1699 * 0 - Success 1700 */ 1701 static int 1702 vdc_attr_negotiation(vdc_t *vdcp) 1703 { 1704 int status; 1705 vio_msg_t vio_msg; 1706 1707 if (status = vdc_init_attr_negotiation(vdcp)) 1708 return (status); 1709 1710 /* release lock and wait for response */ 1711 mutex_exit(&vdcp->lock); 1712 status = vdc_wait_for_response(vdcp, &vio_msg); 1713 mutex_enter(&vdcp->lock); 1714 if (status) { 1715 DMSG(vdcp, 0, 1716 "[%d] Failed waiting for Attr negotiation response, rv(%d)", 1717 vdcp->instance, status); 1718 return (status); 1719 } 1720 1721 /* check type and sub_type ... */ 1722 if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL || 1723 vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) { 1724 DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n", 1725 vdcp->instance); 1726 return (EPROTO); 1727 } 1728 1729 return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg)); 1730 } 1731 1732 1733 /* 1734 * Function: 1735 * vdc_init_dring_negotiate() 1736 * 1737 * Description: 1738 * 1739 * Arguments: 1740 * vdc - soft state pointer for this instance of the device driver. 1741 * 1742 * Return Code: 1743 * 0 - Success 1744 */ 1745 static int 1746 vdc_init_dring_negotiate(vdc_t *vdc) 1747 { 1748 vio_dring_reg_msg_t pkt; 1749 size_t msglen = sizeof (pkt); 1750 int status = -1; 1751 int retry; 1752 int nretries = 10; 1753 1754 ASSERT(vdc != NULL); 1755 ASSERT(mutex_owned(&vdc->lock)); 1756 1757 for (retry = 0; retry < nretries; retry++) { 1758 status = vdc_init_descriptor_ring(vdc); 1759 if (status != EAGAIN) 1760 break; 1761 drv_usecwait(vdc_min_timeout_ldc); 1762 } 1763 1764 if (status != 0) { 1765 DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n", 1766 vdc->instance, status); 1767 return (status); 1768 } 1769 1770 DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n", 1771 vdc->instance, status); 1772 1773 /* fill in tag */ 1774 pkt.tag.vio_msgtype = VIO_TYPE_CTRL; 1775 pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 1776 pkt.tag.vio_subtype_env = VIO_DRING_REG; 1777 pkt.tag.vio_sid = vdc->session_id; 1778 /* fill in payload */ 1779 pkt.dring_ident = 0; 1780 pkt.num_descriptors = vdc->dring_len; 1781 pkt.descriptor_size = vdc->dring_entry_size; 1782 pkt.options = (VIO_TX_DRING | VIO_RX_DRING); 1783 pkt.ncookies = vdc->dring_cookie_count; 1784 pkt.cookie[0] = vdc->dring_cookie[0]; /* for now just one cookie */ 1785 1786 status = vdc_send(vdc, (caddr_t)&pkt, &msglen); 1787 if (status != 0) { 1788 DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)", 1789 vdc->instance, status); 1790 } 1791 1792 return (status); 1793 } 1794 1795 1796 /* 1797 * Function: 1798 * vdc_dring_negotiation() 1799 * 1800 * Description: 1801 * 1802 * Arguments: 1803 * vdc - soft state pointer for this instance of the device driver. 1804 * 1805 * Return Code: 1806 * 0 - Success 1807 */ 1808 static int 1809 vdc_dring_negotiation(vdc_t *vdcp) 1810 { 1811 int status; 1812 vio_msg_t vio_msg; 1813 1814 if (status = vdc_init_dring_negotiate(vdcp)) 1815 return (status); 1816 1817 /* release lock and wait for response */ 1818 mutex_exit(&vdcp->lock); 1819 status = vdc_wait_for_response(vdcp, &vio_msg); 1820 mutex_enter(&vdcp->lock); 1821 if (status) { 1822 DMSG(vdcp, 0, 1823 "[%d] Failed waiting for Dring negotiation response," 1824 " rv(%d)", vdcp->instance, status); 1825 return (status); 1826 } 1827 1828 /* check type and sub_type ... */ 1829 if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL || 1830 vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) { 1831 DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n", 1832 vdcp->instance); 1833 return (EPROTO); 1834 } 1835 1836 return (vdc_handle_dring_reg_msg(vdcp, 1837 (vio_dring_reg_msg_t *)&vio_msg)); 1838 } 1839 1840 1841 /* 1842 * Function: 1843 * vdc_send_rdx() 1844 * 1845 * Description: 1846 * 1847 * Arguments: 1848 * vdc - soft state pointer for this instance of the device driver. 1849 * 1850 * Return Code: 1851 * 0 - Success 1852 */ 1853 static int 1854 vdc_send_rdx(vdc_t *vdcp) 1855 { 1856 vio_msg_t msg; 1857 size_t msglen = sizeof (vio_msg_t); 1858 int status; 1859 1860 /* 1861 * Send an RDX message to vds to indicate we are ready 1862 * to send data 1863 */ 1864 msg.tag.vio_msgtype = VIO_TYPE_CTRL; 1865 msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 1866 msg.tag.vio_subtype_env = VIO_RDX; 1867 msg.tag.vio_sid = vdcp->session_id; 1868 status = vdc_send(vdcp, (caddr_t)&msg, &msglen); 1869 if (status != 0) { 1870 DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)", 1871 vdcp->instance, status); 1872 } 1873 1874 return (status); 1875 } 1876 1877 /* 1878 * Function: 1879 * vdc_handle_rdx() 1880 * 1881 * Description: 1882 * 1883 * Arguments: 1884 * vdc - soft state pointer for this instance of the device driver. 1885 * msgp - received msg 1886 * 1887 * Return Code: 1888 * 0 - Success 1889 */ 1890 static int 1891 vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp) 1892 { 1893 _NOTE(ARGUNUSED(vdcp)) 1894 _NOTE(ARGUNUSED(msgp)) 1895 1896 ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL); 1897 ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK); 1898 ASSERT(msgp->tag.vio_subtype_env == VIO_RDX); 1899 1900 DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance); 1901 1902 return (0); 1903 } 1904 1905 /* 1906 * Function: 1907 * vdc_rdx_exchange() 1908 * 1909 * Description: 1910 * 1911 * Arguments: 1912 * vdc - soft state pointer for this instance of the device driver. 1913 * 1914 * Return Code: 1915 * 0 - Success 1916 */ 1917 static int 1918 vdc_rdx_exchange(vdc_t *vdcp) 1919 { 1920 int status; 1921 vio_msg_t vio_msg; 1922 1923 if (status = vdc_send_rdx(vdcp)) 1924 return (status); 1925 1926 /* release lock and wait for response */ 1927 mutex_exit(&vdcp->lock); 1928 status = vdc_wait_for_response(vdcp, &vio_msg); 1929 mutex_enter(&vdcp->lock); 1930 if (status) { 1931 DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)", 1932 vdcp->instance, status); 1933 return (status); 1934 } 1935 1936 /* check type and sub_type ... */ 1937 if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL || 1938 vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) { 1939 DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance); 1940 return (EPROTO); 1941 } 1942 1943 return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg)); 1944 } 1945 1946 1947 /* -------------------------------------------------------------------------- */ 1948 1949 /* 1950 * LDC helper routines 1951 */ 1952 1953 static int 1954 vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp) 1955 { 1956 int status; 1957 boolean_t q_has_pkts = B_FALSE; 1958 uint64_t delay_time; 1959 size_t len; 1960 1961 mutex_enter(&vdc->read_lock); 1962 1963 if (vdc->read_state == VDC_READ_IDLE) 1964 vdc->read_state = VDC_READ_WAITING; 1965 1966 while (vdc->read_state != VDC_READ_PENDING) { 1967 1968 /* detect if the connection has been reset */ 1969 if (vdc->read_state == VDC_READ_RESET) { 1970 status = ECONNRESET; 1971 goto done; 1972 } 1973 1974 cv_wait(&vdc->read_cv, &vdc->read_lock); 1975 } 1976 1977 /* 1978 * Until we get a blocking ldc read we have to retry 1979 * until the entire LDC message has arrived before 1980 * ldc_read() will succeed. Note we also bail out if 1981 * the channel is reset or goes away. 1982 */ 1983 delay_time = vdc_ldc_read_init_delay; 1984 loop: 1985 len = *nbytesp; 1986 status = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)msgp, &len); 1987 switch (status) { 1988 case EAGAIN: 1989 delay_time *= 2; 1990 if (delay_time >= vdc_ldc_read_max_delay) 1991 delay_time = vdc_ldc_read_max_delay; 1992 delay(delay_time); 1993 goto loop; 1994 1995 case 0: 1996 if (len == 0) { 1997 DMSG(vdc, 1, "[%d] ldc_read returned 0 bytes with " 1998 "no error!\n", vdc->instance); 1999 goto loop; 2000 } 2001 2002 *nbytesp = len; 2003 2004 /* 2005 * If there are pending messages, leave the 2006 * read state as pending. Otherwise, set the state 2007 * back to idle. 2008 */ 2009 status = ldc_chkq(vdc->curr_server->ldc_handle, &q_has_pkts); 2010 if (status == 0 && !q_has_pkts) 2011 vdc->read_state = VDC_READ_IDLE; 2012 2013 break; 2014 default: 2015 DMSG(vdc, 0, "ldc_read returned %d\n", status); 2016 break; 2017 } 2018 2019 done: 2020 mutex_exit(&vdc->read_lock); 2021 2022 return (status); 2023 } 2024 2025 2026 2027 #ifdef DEBUG 2028 void 2029 vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg) 2030 { 2031 char *ms, *ss, *ses; 2032 switch (msg->tag.vio_msgtype) { 2033 #define Q(_s) case _s : ms = #_s; break; 2034 Q(VIO_TYPE_CTRL) 2035 Q(VIO_TYPE_DATA) 2036 Q(VIO_TYPE_ERR) 2037 #undef Q 2038 default: ms = "unknown"; break; 2039 } 2040 2041 switch (msg->tag.vio_subtype) { 2042 #define Q(_s) case _s : ss = #_s; break; 2043 Q(VIO_SUBTYPE_INFO) 2044 Q(VIO_SUBTYPE_ACK) 2045 Q(VIO_SUBTYPE_NACK) 2046 #undef Q 2047 default: ss = "unknown"; break; 2048 } 2049 2050 switch (msg->tag.vio_subtype_env) { 2051 #define Q(_s) case _s : ses = #_s; break; 2052 Q(VIO_VER_INFO) 2053 Q(VIO_ATTR_INFO) 2054 Q(VIO_DRING_REG) 2055 Q(VIO_DRING_UNREG) 2056 Q(VIO_RDX) 2057 Q(VIO_PKT_DATA) 2058 Q(VIO_DESC_DATA) 2059 Q(VIO_DRING_DATA) 2060 #undef Q 2061 default: ses = "unknown"; break; 2062 } 2063 2064 DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n", 2065 msg->tag.vio_msgtype, msg->tag.vio_subtype, 2066 msg->tag.vio_subtype_env, ms, ss, ses); 2067 } 2068 #endif 2069 2070 /* 2071 * Function: 2072 * vdc_send() 2073 * 2074 * Description: 2075 * The function encapsulates the call to write a message using LDC. 2076 * If LDC indicates that the call failed due to the queue being full, 2077 * we retry the ldc_write(), otherwise we return the error returned by LDC. 2078 * 2079 * Arguments: 2080 * ldc_handle - LDC handle for the channel this instance of vdc uses 2081 * pkt - address of LDC message to be sent 2082 * msglen - the size of the message being sent. When the function 2083 * returns, this contains the number of bytes written. 2084 * 2085 * Return Code: 2086 * 0 - Success. 2087 * EINVAL - pkt or msglen were NULL 2088 * ECONNRESET - The connection was not up. 2089 * EWOULDBLOCK - LDC queue is full 2090 * xxx - other error codes returned by ldc_write 2091 */ 2092 static int 2093 vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen) 2094 { 2095 size_t size = 0; 2096 int status = 0; 2097 clock_t delay_ticks; 2098 2099 ASSERT(vdc != NULL); 2100 ASSERT(mutex_owned(&vdc->lock)); 2101 ASSERT(msglen != NULL); 2102 ASSERT(*msglen != 0); 2103 2104 #ifdef DEBUG 2105 vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt); 2106 #endif 2107 /* 2108 * Wait indefinitely to send if channel 2109 * is busy, but bail out if we succeed or 2110 * if the channel closes or is reset. 2111 */ 2112 delay_ticks = vdc_hz_min_ldc_delay; 2113 do { 2114 size = *msglen; 2115 status = ldc_write(vdc->curr_server->ldc_handle, pkt, &size); 2116 if (status == EWOULDBLOCK) { 2117 delay(delay_ticks); 2118 /* geometric backoff */ 2119 delay_ticks *= 2; 2120 if (delay_ticks > vdc_hz_max_ldc_delay) 2121 delay_ticks = vdc_hz_max_ldc_delay; 2122 } 2123 } while (status == EWOULDBLOCK); 2124 2125 /* if LDC had serious issues --- reset vdc state */ 2126 if (status == EIO || status == ECONNRESET) { 2127 /* LDC had serious issues --- reset vdc state */ 2128 mutex_enter(&vdc->read_lock); 2129 if ((vdc->read_state == VDC_READ_WAITING) || 2130 (vdc->read_state == VDC_READ_RESET)) 2131 cv_signal(&vdc->read_cv); 2132 vdc->read_state = VDC_READ_RESET; 2133 mutex_exit(&vdc->read_lock); 2134 2135 /* wake up any waiters in the reset thread */ 2136 if (vdc->state == VDC_STATE_INIT_WAITING) { 2137 DMSG(vdc, 0, "[%d] write reset - " 2138 "vdc is resetting ..\n", vdc->instance); 2139 vdc->state = VDC_STATE_RESETTING; 2140 cv_signal(&vdc->initwait_cv); 2141 } 2142 2143 return (ECONNRESET); 2144 } 2145 2146 /* return the last size written */ 2147 *msglen = size; 2148 2149 return (status); 2150 } 2151 2152 /* 2153 * Function: 2154 * vdc_get_md_node 2155 * 2156 * Description: 2157 * Get the MD, the device node for the given disk instance. The 2158 * caller is responsible for cleaning up the reference to the 2159 * returned MD (mdpp) by calling md_fini_handle(). 2160 * 2161 * Arguments: 2162 * dip - dev info pointer for this instance of the device driver. 2163 * mdpp - the returned MD. 2164 * vd_nodep - the returned device node. 2165 * 2166 * Return Code: 2167 * 0 - Success. 2168 * ENOENT - Expected node or property did not exist. 2169 * ENXIO - Unexpected error communicating with MD framework 2170 */ 2171 static int 2172 vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep) 2173 { 2174 int status = ENOENT; 2175 char *node_name = NULL; 2176 md_t *mdp = NULL; 2177 int num_nodes; 2178 int num_vdevs; 2179 mde_cookie_t rootnode; 2180 mde_cookie_t *listp = NULL; 2181 boolean_t found_inst = B_FALSE; 2182 int listsz; 2183 int idx; 2184 uint64_t md_inst; 2185 int obp_inst; 2186 int instance = ddi_get_instance(dip); 2187 2188 /* 2189 * Get the OBP instance number for comparison with the MD instance 2190 * 2191 * The "cfg-handle" property of a vdc node in an MD contains the MD's 2192 * notion of "instance", or unique identifier, for that node; OBP 2193 * stores the value of the "cfg-handle" MD property as the value of 2194 * the "reg" property on the node in the device tree it builds from 2195 * the MD and passes to Solaris. Thus, we look up the devinfo node's 2196 * "reg" property value to uniquely identify this device instance. 2197 * If the "reg" property cannot be found, the device tree state is 2198 * presumably so broken that there is no point in continuing. 2199 */ 2200 if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) { 2201 cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG); 2202 return (ENOENT); 2203 } 2204 obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, 2205 OBP_REG, -1); 2206 DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst); 2207 2208 /* 2209 * We now walk the MD nodes to find the node for this vdisk. 2210 */ 2211 if ((mdp = md_get_handle()) == NULL) { 2212 cmn_err(CE_WARN, "unable to init machine description"); 2213 return (ENXIO); 2214 } 2215 2216 num_nodes = md_node_count(mdp); 2217 ASSERT(num_nodes > 0); 2218 2219 listsz = num_nodes * sizeof (mde_cookie_t); 2220 2221 /* allocate memory for nodes */ 2222 listp = kmem_zalloc(listsz, KM_SLEEP); 2223 2224 rootnode = md_root_node(mdp); 2225 ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); 2226 2227 /* 2228 * Search for all the virtual devices, we will then check to see which 2229 * ones are disk nodes. 2230 */ 2231 num_vdevs = md_scan_dag(mdp, rootnode, 2232 md_find_name(mdp, VDC_MD_VDEV_NAME), 2233 md_find_name(mdp, "fwd"), listp); 2234 2235 if (num_vdevs <= 0) { 2236 cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); 2237 status = ENOENT; 2238 goto done; 2239 } 2240 2241 DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs); 2242 for (idx = 0; idx < num_vdevs; idx++) { 2243 status = md_get_prop_str(mdp, listp[idx], "name", &node_name); 2244 if ((status != 0) || (node_name == NULL)) { 2245 cmn_err(CE_NOTE, "Unable to get name of node type '%s'" 2246 ": err %d", VDC_MD_VDEV_NAME, status); 2247 continue; 2248 } 2249 2250 DMSGX(1, "[%d] Found node '%s'\n", instance, node_name); 2251 if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { 2252 status = md_get_prop_val(mdp, listp[idx], 2253 VDC_MD_CFG_HDL, &md_inst); 2254 DMSGX(1, "[%d] vdc inst in MD=%lx\n", 2255 instance, md_inst); 2256 if ((status == 0) && (md_inst == obp_inst)) { 2257 found_inst = B_TRUE; 2258 break; 2259 } 2260 } 2261 } 2262 2263 if (!found_inst) { 2264 DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME); 2265 status = ENOENT; 2266 goto done; 2267 } 2268 DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst); 2269 2270 *vd_nodep = listp[idx]; 2271 *mdpp = mdp; 2272 done: 2273 kmem_free(listp, listsz); 2274 return (status); 2275 } 2276 2277 /* 2278 * Function: 2279 * vdc_init_ports 2280 * 2281 * Description: 2282 * Initialize all the ports for this vdisk instance. 2283 * 2284 * Arguments: 2285 * vdc - soft state pointer for this instance of the device driver. 2286 * mdp - md pointer 2287 * vd_nodep - device md node. 2288 * 2289 * Return Code: 2290 * 0 - Success. 2291 * ENOENT - Expected node or property did not exist. 2292 */ 2293 static int 2294 vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep) 2295 { 2296 int status = 0; 2297 int idx; 2298 int num_nodes; 2299 int num_vports; 2300 int num_chans; 2301 int listsz; 2302 mde_cookie_t vd_port; 2303 mde_cookie_t *chanp = NULL; 2304 mde_cookie_t *portp = NULL; 2305 vdc_server_t *srvr; 2306 vdc_server_t *prev_srvr = NULL; 2307 2308 /* 2309 * We now walk the MD nodes to find the port nodes for this vdisk. 2310 */ 2311 num_nodes = md_node_count(mdp); 2312 ASSERT(num_nodes > 0); 2313 2314 listsz = num_nodes * sizeof (mde_cookie_t); 2315 2316 /* allocate memory for nodes */ 2317 portp = kmem_zalloc(listsz, KM_SLEEP); 2318 chanp = kmem_zalloc(listsz, KM_SLEEP); 2319 2320 num_vports = md_scan_dag(mdp, vd_nodep, 2321 md_find_name(mdp, VDC_MD_PORT_NAME), 2322 md_find_name(mdp, "fwd"), portp); 2323 if (num_vports == 0) { 2324 DMSGX(0, "Found no '%s' node for '%s' port\n", 2325 VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2326 status = ENOENT; 2327 goto done; 2328 } 2329 2330 DMSGX(1, "Found %d '%s' node(s) for '%s' port\n", 2331 num_vports, VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME); 2332 2333 vdc->num_servers = 0; 2334 for (idx = 0; idx < num_vports; idx++) { 2335 2336 /* initialize this port */ 2337 vd_port = portp[idx]; 2338 srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP); 2339 srvr->vdcp = vdc; 2340 2341 /* get port id */ 2342 if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) { 2343 cmn_err(CE_NOTE, "vDisk port '%s' property not found", 2344 VDC_MD_ID); 2345 kmem_free(srvr, sizeof (vdc_server_t)); 2346 continue; 2347 } 2348 2349 /* set the connection timeout */ 2350 if (md_get_prop_val(mdp, vd_port, VDC_MD_TIMEOUT, 2351 &srvr->ctimeout) != 0) { 2352 srvr->ctimeout = 0; 2353 } 2354 2355 /* get the ldc id */ 2356 num_chans = md_scan_dag(mdp, vd_port, 2357 md_find_name(mdp, VDC_MD_CHAN_NAME), 2358 md_find_name(mdp, "fwd"), chanp); 2359 2360 /* expecting at least one channel */ 2361 if (num_chans <= 0) { 2362 cmn_err(CE_NOTE, "No '%s' node for '%s' port", 2363 VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); 2364 kmem_free(srvr, sizeof (vdc_server_t)); 2365 continue; 2366 } else if (num_chans != 1) { 2367 DMSGX(0, "Expected 1 '%s' node for '%s' port, " 2368 "found %d\n", VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, 2369 num_chans); 2370 } 2371 2372 /* 2373 * We use the first channel found (index 0), irrespective of how 2374 * many are there in total. 2375 */ 2376 if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID, 2377 &srvr->ldc_id) != 0) { 2378 cmn_err(CE_NOTE, "Channel '%s' property not found", 2379 VDC_MD_ID); 2380 kmem_free(srvr, sizeof (vdc_server_t)); 2381 continue; 2382 } 2383 2384 /* 2385 * now initialise LDC channel which will be used to 2386 * communicate with this server 2387 */ 2388 if (vdc_do_ldc_init(vdc, srvr) != 0) { 2389 kmem_free(srvr, sizeof (vdc_server_t)); 2390 continue; 2391 } 2392 2393 /* add server to list */ 2394 if (prev_srvr) 2395 prev_srvr->next = srvr; 2396 else 2397 vdc->server_list = srvr; 2398 2399 prev_srvr = srvr; 2400 2401 /* inc numbers of servers */ 2402 vdc->num_servers++; 2403 } 2404 2405 /* 2406 * Adjust the max number of handshake retries to match 2407 * the number of vdisk servers. 2408 */ 2409 if (vdc_hshake_retries < vdc->num_servers) 2410 vdc_hshake_retries = vdc->num_servers; 2411 2412 /* pick first server as current server */ 2413 if (vdc->server_list != NULL) { 2414 vdc->curr_server = vdc->server_list; 2415 status = 0; 2416 } else { 2417 status = ENOENT; 2418 } 2419 2420 done: 2421 kmem_free(chanp, listsz); 2422 kmem_free(portp, listsz); 2423 return (status); 2424 } 2425 2426 2427 /* 2428 * Function: 2429 * vdc_do_ldc_up 2430 * 2431 * Description: 2432 * Bring the channel for the current server up. 2433 * 2434 * Arguments: 2435 * vdc - soft state pointer for this instance of the device driver. 2436 * 2437 * Return Code: 2438 * 0 - Success. 2439 * EINVAL - Driver is detaching / LDC error 2440 * ECONNREFUSED - Other end is not listening 2441 */ 2442 static int 2443 vdc_do_ldc_up(vdc_t *vdc) 2444 { 2445 int status; 2446 ldc_status_t ldc_state; 2447 2448 ASSERT(MUTEX_HELD(&vdc->lock)); 2449 2450 DMSG(vdc, 0, "[%d] Bringing up channel %lx\n", 2451 vdc->instance, vdc->curr_server->ldc_id); 2452 2453 if (vdc->lifecycle == VDC_LC_DETACHING) 2454 return (EINVAL); 2455 2456 if ((status = ldc_up(vdc->curr_server->ldc_handle)) != 0) { 2457 switch (status) { 2458 case ECONNREFUSED: /* listener not ready at other end */ 2459 DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n", 2460 vdc->instance, vdc->curr_server->ldc_id, status); 2461 status = 0; 2462 break; 2463 default: 2464 DMSG(vdc, 0, "[%d] Failed to bring up LDC: " 2465 "channel=%ld, err=%d", vdc->instance, 2466 vdc->curr_server->ldc_id, status); 2467 break; 2468 } 2469 } 2470 2471 if (ldc_status(vdc->curr_server->ldc_handle, &ldc_state) == 0) { 2472 vdc->curr_server->ldc_state = ldc_state; 2473 if (ldc_state == LDC_UP) { 2474 DMSG(vdc, 0, "[%d] LDC channel already up\n", 2475 vdc->instance); 2476 vdc->seq_num = 1; 2477 vdc->seq_num_reply = 0; 2478 } 2479 } 2480 2481 return (status); 2482 } 2483 2484 /* 2485 * Function: 2486 * vdc_terminate_ldc() 2487 * 2488 * Description: 2489 * 2490 * Arguments: 2491 * vdc - soft state pointer for this instance of the device driver. 2492 * srvr - vdc per-server info structure 2493 * 2494 * Return Code: 2495 * None 2496 */ 2497 static void 2498 vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr) 2499 { 2500 int instance = ddi_get_instance(vdc->dip); 2501 2502 if (srvr->state & VDC_LDC_OPEN) { 2503 DMSG(vdc, 0, "[%d] ldc_close()\n", instance); 2504 (void) ldc_close(srvr->ldc_handle); 2505 } 2506 if (srvr->state & VDC_LDC_CB) { 2507 DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance); 2508 (void) ldc_unreg_callback(srvr->ldc_handle); 2509 } 2510 if (srvr->state & VDC_LDC_INIT) { 2511 DMSG(vdc, 0, "[%d] ldc_fini()\n", instance); 2512 (void) ldc_fini(srvr->ldc_handle); 2513 srvr->ldc_handle = NULL; 2514 } 2515 2516 srvr->state &= ~(VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN); 2517 } 2518 2519 /* 2520 * Function: 2521 * vdc_fini_ports() 2522 * 2523 * Description: 2524 * Finalize all ports by closing the channel associated with each 2525 * port and also freeing the server structure. 2526 * 2527 * Arguments: 2528 * vdc - soft state pointer for this instance of the device driver. 2529 * 2530 * Return Code: 2531 * None 2532 */ 2533 static void 2534 vdc_fini_ports(vdc_t *vdc) 2535 { 2536 int instance = ddi_get_instance(vdc->dip); 2537 vdc_server_t *srvr, *prev_srvr; 2538 2539 ASSERT(vdc != NULL); 2540 ASSERT(mutex_owned(&vdc->lock)); 2541 2542 DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized); 2543 2544 srvr = vdc->server_list; 2545 2546 while (srvr) { 2547 2548 vdc_terminate_ldc(vdc, srvr); 2549 2550 /* next server */ 2551 prev_srvr = srvr; 2552 srvr = srvr->next; 2553 2554 /* free server */ 2555 kmem_free(prev_srvr, sizeof (vdc_server_t)); 2556 } 2557 2558 vdc->server_list = NULL; 2559 } 2560 2561 /* -------------------------------------------------------------------------- */ 2562 2563 /* 2564 * Descriptor Ring helper routines 2565 */ 2566 2567 /* 2568 * Function: 2569 * vdc_init_descriptor_ring() 2570 * 2571 * Description: 2572 * 2573 * Arguments: 2574 * vdc - soft state pointer for this instance of the device driver. 2575 * 2576 * Return Code: 2577 * 0 - Success 2578 */ 2579 static int 2580 vdc_init_descriptor_ring(vdc_t *vdc) 2581 { 2582 vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ 2583 int status = 0; 2584 int i; 2585 2586 DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized); 2587 2588 ASSERT(vdc != NULL); 2589 ASSERT(mutex_owned(&vdc->lock)); 2590 2591 /* ensure we have enough room to store max sized block */ 2592 ASSERT(maxphys <= VD_MAX_BLOCK_SIZE); 2593 2594 if ((vdc->initialized & VDC_DRING_INIT) == 0) { 2595 DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance); 2596 /* 2597 * Calculate the maximum block size we can transmit using one 2598 * Descriptor Ring entry from the attributes returned by the 2599 * vDisk server. This is subject to a minimum of 'maxphys' 2600 * as we do not have the capability to split requests over 2601 * multiple DRing entries. 2602 */ 2603 if ((vdc->max_xfer_sz * vdc->block_size) < maxphys) { 2604 DMSG(vdc, 0, "[%d] using minimum DRing size\n", 2605 vdc->instance); 2606 vdc->dring_max_cookies = maxphys / PAGESIZE; 2607 } else { 2608 vdc->dring_max_cookies = 2609 (vdc->max_xfer_sz * vdc->block_size) / PAGESIZE; 2610 } 2611 vdc->dring_entry_size = (sizeof (vd_dring_entry_t) + 2612 (sizeof (ldc_mem_cookie_t) * 2613 (vdc->dring_max_cookies - 1))); 2614 vdc->dring_len = VD_DRING_LEN; 2615 2616 status = ldc_mem_dring_create(vdc->dring_len, 2617 vdc->dring_entry_size, &vdc->dring_hdl); 2618 if ((vdc->dring_hdl == NULL) || (status != 0)) { 2619 DMSG(vdc, 0, "[%d] Descriptor ring creation failed", 2620 vdc->instance); 2621 return (status); 2622 } 2623 vdc->initialized |= VDC_DRING_INIT; 2624 } 2625 2626 if ((vdc->initialized & VDC_DRING_BOUND) == 0) { 2627 DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance); 2628 vdc->dring_cookie = 2629 kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); 2630 2631 status = ldc_mem_dring_bind(vdc->curr_server->ldc_handle, 2632 vdc->dring_hdl, 2633 LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW, 2634 &vdc->dring_cookie[0], 2635 &vdc->dring_cookie_count); 2636 if (status != 0) { 2637 DMSG(vdc, 0, "[%d] Failed to bind descriptor ring " 2638 "(%lx) to channel (%lx) status=%d\n", 2639 vdc->instance, vdc->dring_hdl, 2640 vdc->curr_server->ldc_handle, status); 2641 return (status); 2642 } 2643 ASSERT(vdc->dring_cookie_count == 1); 2644 vdc->initialized |= VDC_DRING_BOUND; 2645 } 2646 2647 status = ldc_mem_dring_info(vdc->dring_hdl, &vdc->dring_mem_info); 2648 if (status != 0) { 2649 DMSG(vdc, 0, 2650 "[%d] Failed to get info for descriptor ring (%lx)\n", 2651 vdc->instance, vdc->dring_hdl); 2652 return (status); 2653 } 2654 2655 if ((vdc->initialized & VDC_DRING_LOCAL) == 0) { 2656 DMSG(vdc, 0, "[%d] local dring\n", vdc->instance); 2657 2658 /* Allocate the local copy of this dring */ 2659 vdc->local_dring = 2660 kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t), 2661 KM_SLEEP); 2662 vdc->initialized |= VDC_DRING_LOCAL; 2663 } 2664 2665 /* 2666 * Mark all DRing entries as free and initialize the private 2667 * descriptor's memory handles. If any entry is initialized, 2668 * we need to free it later so we set the bit in 'initialized' 2669 * at the start. 2670 */ 2671 vdc->initialized |= VDC_DRING_ENTRY; 2672 for (i = 0; i < vdc->dring_len; i++) { 2673 dep = VDC_GET_DRING_ENTRY_PTR(vdc, i); 2674 dep->hdr.dstate = VIO_DESC_FREE; 2675 2676 status = ldc_mem_alloc_handle(vdc->curr_server->ldc_handle, 2677 &vdc->local_dring[i].desc_mhdl); 2678 if (status != 0) { 2679 DMSG(vdc, 0, "![%d] Failed to alloc mem handle for" 2680 " descriptor %d", vdc->instance, i); 2681 return (status); 2682 } 2683 vdc->local_dring[i].is_free = B_TRUE; 2684 vdc->local_dring[i].dep = dep; 2685 } 2686 2687 /* Initialize the starting index */ 2688 vdc->dring_curr_idx = 0; 2689 2690 return (status); 2691 } 2692 2693 /* 2694 * Function: 2695 * vdc_destroy_descriptor_ring() 2696 * 2697 * Description: 2698 * 2699 * Arguments: 2700 * vdc - soft state pointer for this instance of the device driver. 2701 * 2702 * Return Code: 2703 * None 2704 */ 2705 static void 2706 vdc_destroy_descriptor_ring(vdc_t *vdc) 2707 { 2708 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 2709 ldc_mem_handle_t mhdl = NULL; 2710 ldc_mem_info_t minfo; 2711 int status = -1; 2712 int i; /* loop */ 2713 2714 ASSERT(vdc != NULL); 2715 ASSERT(mutex_owned(&vdc->lock)); 2716 2717 DMSG(vdc, 0, "[%d] Entered\n", vdc->instance); 2718 2719 if (vdc->initialized & VDC_DRING_ENTRY) { 2720 DMSG(vdc, 0, 2721 "[%d] Removing Local DRing entries\n", vdc->instance); 2722 for (i = 0; i < vdc->dring_len; i++) { 2723 ldep = &vdc->local_dring[i]; 2724 mhdl = ldep->desc_mhdl; 2725 2726 if (mhdl == NULL) 2727 continue; 2728 2729 if ((status = ldc_mem_info(mhdl, &minfo)) != 0) { 2730 DMSG(vdc, 0, 2731 "ldc_mem_info returned an error: %d\n", 2732 status); 2733 2734 /* 2735 * This must mean that the mem handle 2736 * is not valid. Clear it out so that 2737 * no one tries to use it. 2738 */ 2739 ldep->desc_mhdl = NULL; 2740 continue; 2741 } 2742 2743 if (minfo.status == LDC_BOUND) { 2744 (void) ldc_mem_unbind_handle(mhdl); 2745 } 2746 2747 (void) ldc_mem_free_handle(mhdl); 2748 2749 ldep->desc_mhdl = NULL; 2750 } 2751 vdc->initialized &= ~VDC_DRING_ENTRY; 2752 } 2753 2754 if (vdc->initialized & VDC_DRING_LOCAL) { 2755 DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance); 2756 kmem_free(vdc->local_dring, 2757 vdc->dring_len * sizeof (vdc_local_desc_t)); 2758 vdc->initialized &= ~VDC_DRING_LOCAL; 2759 } 2760 2761 if (vdc->initialized & VDC_DRING_BOUND) { 2762 DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance); 2763 status = ldc_mem_dring_unbind(vdc->dring_hdl); 2764 if (status == 0) { 2765 vdc->initialized &= ~VDC_DRING_BOUND; 2766 } else { 2767 DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx", 2768 vdc->instance, status, vdc->dring_hdl); 2769 } 2770 kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t)); 2771 } 2772 2773 if (vdc->initialized & VDC_DRING_INIT) { 2774 DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance); 2775 status = ldc_mem_dring_destroy(vdc->dring_hdl); 2776 if (status == 0) { 2777 vdc->dring_hdl = NULL; 2778 bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t)); 2779 vdc->initialized &= ~VDC_DRING_INIT; 2780 } else { 2781 DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)", 2782 vdc->instance, status, vdc->dring_hdl); 2783 } 2784 } 2785 } 2786 2787 /* 2788 * Function: 2789 * vdc_map_to_shared_dring() 2790 * 2791 * Description: 2792 * Copy contents of the local descriptor to the shared 2793 * memory descriptor. 2794 * 2795 * Arguments: 2796 * vdcp - soft state pointer for this instance of the device driver. 2797 * idx - descriptor ring index 2798 * 2799 * Return Code: 2800 * None 2801 */ 2802 static int 2803 vdc_map_to_shared_dring(vdc_t *vdcp, int idx) 2804 { 2805 vdc_local_desc_t *ldep; 2806 vd_dring_entry_t *dep; 2807 int rv; 2808 2809 ldep = &(vdcp->local_dring[idx]); 2810 2811 /* for now leave in the old pop_mem_hdl stuff */ 2812 if (ldep->nbytes > 0) { 2813 rv = vdc_populate_mem_hdl(vdcp, ldep); 2814 if (rv) { 2815 DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n", 2816 vdcp->instance); 2817 return (rv); 2818 } 2819 } 2820 2821 /* 2822 * fill in the data details into the DRing 2823 */ 2824 dep = ldep->dep; 2825 ASSERT(dep != NULL); 2826 2827 dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp); 2828 dep->payload.operation = ldep->operation; 2829 dep->payload.addr = ldep->offset; 2830 dep->payload.nbytes = ldep->nbytes; 2831 dep->payload.status = (uint32_t)-1; /* vds will set valid value */ 2832 dep->payload.slice = ldep->slice; 2833 dep->hdr.dstate = VIO_DESC_READY; 2834 dep->hdr.ack = 1; /* request an ACK for every message */ 2835 2836 return (0); 2837 } 2838 2839 /* 2840 * Function: 2841 * vdc_send_request 2842 * 2843 * Description: 2844 * This routine writes the data to be transmitted to vds into the 2845 * descriptor, notifies vds that the ring has been updated and 2846 * then waits for the request to be processed. 2847 * 2848 * Arguments: 2849 * vdcp - the soft state pointer 2850 * operation - operation we want vds to perform (VD_OP_XXX) 2851 * addr - address of data buf to be read/written. 2852 * nbytes - number of bytes to read/write 2853 * slice - the disk slice this request is for 2854 * offset - relative disk offset 2855 * cb_type - type of call - STRATEGY or SYNC 2856 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2857 * . mode for ioctl(9e) 2858 * . LP64 diskaddr_t (block I/O) 2859 * dir - direction of operation (READ/WRITE/BOTH) 2860 * 2861 * Return Codes: 2862 * 0 2863 * ENXIO 2864 */ 2865 static int 2866 vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr, 2867 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2868 void *cb_arg, vio_desc_direction_t dir) 2869 { 2870 int rv = 0; 2871 2872 ASSERT(vdcp != NULL); 2873 ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR); 2874 2875 mutex_enter(&vdcp->lock); 2876 2877 /* 2878 * If this is a block read/write operation we update the I/O statistics 2879 * to indicate that the request is being put on the waitq to be 2880 * serviced. 2881 * 2882 * We do it here (a common routine for both synchronous and strategy 2883 * calls) for performance reasons - we are already holding vdc->lock 2884 * so there is no extra locking overhead. We would have to explicitly 2885 * grab the 'lock' mutex to update the stats if we were to do this 2886 * higher up the stack in vdc_strategy() et. al. 2887 */ 2888 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2889 DTRACE_IO1(start, buf_t *, cb_arg); 2890 VD_KSTAT_WAITQ_ENTER(vdcp); 2891 } 2892 2893 do { 2894 while (vdcp->state != VDC_STATE_RUNNING) { 2895 2896 /* return error if detaching */ 2897 if (vdcp->state == VDC_STATE_DETACH) { 2898 rv = ENXIO; 2899 goto done; 2900 } 2901 2902 /* fail request if connection timeout is reached */ 2903 if (vdcp->ctimeout_reached) { 2904 rv = EIO; 2905 goto done; 2906 } 2907 2908 /* 2909 * If we are panicking and the disk is not ready then 2910 * we can't send any request because we can't complete 2911 * the handshake now. 2912 */ 2913 if (ddi_in_panic()) { 2914 rv = EIO; 2915 goto done; 2916 } 2917 2918 cv_wait(&vdcp->running_cv, &vdcp->lock); 2919 } 2920 2921 } while (vdc_populate_descriptor(vdcp, operation, addr, 2922 nbytes, slice, offset, cb_type, cb_arg, dir)); 2923 2924 done: 2925 /* 2926 * If this is a block read/write we update the I/O statistics kstat 2927 * to indicate that this request has been placed on the queue for 2928 * processing (i.e sent to the vDisk server) - iostat(1M) will 2929 * report the time waiting for the vDisk server under the %b column 2930 * In the case of an error we simply take it off the wait queue. 2931 */ 2932 if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { 2933 if (rv == 0) { 2934 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 2935 DTRACE_PROBE1(send, buf_t *, cb_arg); 2936 } else { 2937 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 2938 VD_KSTAT_WAITQ_EXIT(vdcp); 2939 DTRACE_IO1(done, buf_t *, cb_arg); 2940 } 2941 } 2942 2943 mutex_exit(&vdcp->lock); 2944 2945 return (rv); 2946 } 2947 2948 2949 /* 2950 * Function: 2951 * vdc_populate_descriptor 2952 * 2953 * Description: 2954 * This routine writes the data to be transmitted to vds into the 2955 * descriptor, notifies vds that the ring has been updated and 2956 * then waits for the request to be processed. 2957 * 2958 * Arguments: 2959 * vdcp - the soft state pointer 2960 * operation - operation we want vds to perform (VD_OP_XXX) 2961 * addr - address of data buf to be read/written. 2962 * nbytes - number of bytes to read/write 2963 * slice - the disk slice this request is for 2964 * offset - relative disk offset 2965 * cb_type - type of call - STRATEGY or SYNC 2966 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 2967 * . mode for ioctl(9e) 2968 * . LP64 diskaddr_t (block I/O) 2969 * dir - direction of operation (READ/WRITE/BOTH) 2970 * 2971 * Return Codes: 2972 * 0 2973 * EAGAIN 2974 * ECONNRESET 2975 * ENXIO 2976 */ 2977 static int 2978 vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, 2979 size_t nbytes, int slice, diskaddr_t offset, int cb_type, 2980 void *cb_arg, vio_desc_direction_t dir) 2981 { 2982 vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */ 2983 int idx; /* Index of DRing entry used */ 2984 int next_idx; 2985 vio_dring_msg_t dmsg; 2986 size_t msglen; 2987 int rv; 2988 2989 ASSERT(MUTEX_HELD(&vdcp->lock)); 2990 vdcp->threads_pending++; 2991 loop: 2992 DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx); 2993 2994 /* Get next available D-Ring entry */ 2995 idx = vdcp->dring_curr_idx; 2996 local_dep = &(vdcp->local_dring[idx]); 2997 2998 if (!local_dep->is_free) { 2999 DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n", 3000 vdcp->instance); 3001 cv_wait(&vdcp->dring_free_cv, &vdcp->lock); 3002 if (vdcp->state == VDC_STATE_RUNNING || 3003 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3004 goto loop; 3005 } 3006 vdcp->threads_pending--; 3007 return (ECONNRESET); 3008 } 3009 3010 next_idx = idx + 1; 3011 if (next_idx >= vdcp->dring_len) 3012 next_idx = 0; 3013 vdcp->dring_curr_idx = next_idx; 3014 3015 ASSERT(local_dep->is_free); 3016 3017 local_dep->operation = operation; 3018 local_dep->addr = addr; 3019 local_dep->nbytes = nbytes; 3020 local_dep->slice = slice; 3021 local_dep->offset = offset; 3022 local_dep->cb_type = cb_type; 3023 local_dep->cb_arg = cb_arg; 3024 local_dep->dir = dir; 3025 3026 local_dep->is_free = B_FALSE; 3027 3028 rv = vdc_map_to_shared_dring(vdcp, idx); 3029 if (rv) { 3030 DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n", 3031 vdcp->instance); 3032 /* free the descriptor */ 3033 local_dep->is_free = B_TRUE; 3034 vdcp->dring_curr_idx = idx; 3035 cv_wait(&vdcp->membind_cv, &vdcp->lock); 3036 if (vdcp->state == VDC_STATE_RUNNING || 3037 vdcp->state == VDC_STATE_HANDLE_PENDING) { 3038 goto loop; 3039 } 3040 vdcp->threads_pending--; 3041 return (ECONNRESET); 3042 } 3043 3044 /* 3045 * Send a msg with the DRing details to vds 3046 */ 3047 VIO_INIT_DRING_DATA_TAG(dmsg); 3048 VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp); 3049 dmsg.dring_ident = vdcp->dring_ident; 3050 dmsg.start_idx = idx; 3051 dmsg.end_idx = idx; 3052 vdcp->seq_num++; 3053 3054 DTRACE_PROBE2(populate, int, vdcp->instance, 3055 vdc_local_desc_t *, local_dep); 3056 DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n", 3057 vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num); 3058 3059 /* 3060 * note we're still holding the lock here to 3061 * make sure the message goes out in order !!!... 3062 */ 3063 msglen = sizeof (dmsg); 3064 rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen); 3065 switch (rv) { 3066 case ECONNRESET: 3067 /* 3068 * vdc_send initiates the reset on failure. 3069 * Since the transaction has already been put 3070 * on the local dring, it will automatically get 3071 * retried when the channel is reset. Given that, 3072 * it is ok to just return success even though the 3073 * send failed. 3074 */ 3075 rv = 0; 3076 break; 3077 3078 case 0: /* EOK */ 3079 DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv); 3080 break; 3081 3082 default: 3083 goto cleanup_and_exit; 3084 } 3085 3086 vdcp->threads_pending--; 3087 return (rv); 3088 3089 cleanup_and_exit: 3090 DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv); 3091 return (ENXIO); 3092 } 3093 3094 /* 3095 * Function: 3096 * vdc_do_sync_op 3097 * 3098 * Description: 3099 * Wrapper around vdc_populate_descriptor that blocks until the 3100 * response to the message is available. 3101 * 3102 * Arguments: 3103 * vdcp - the soft state pointer 3104 * operation - operation we want vds to perform (VD_OP_XXX) 3105 * addr - address of data buf to be read/written. 3106 * nbytes - number of bytes to read/write 3107 * slice - the disk slice this request is for 3108 * offset - relative disk offset 3109 * cb_type - type of call - STRATEGY or SYNC 3110 * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type) 3111 * . mode for ioctl(9e) 3112 * . LP64 diskaddr_t (block I/O) 3113 * dir - direction of operation (READ/WRITE/BOTH) 3114 * rconflict - check for reservation conflict in case of failure 3115 * 3116 * rconflict should be set to B_TRUE by most callers. Callers invoking the 3117 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the 3118 * result of a successful operation with vd_scsi_status(). 3119 * 3120 * Return Codes: 3121 * 0 3122 * EAGAIN 3123 * EFAULT 3124 * ENXIO 3125 * EIO 3126 */ 3127 static int 3128 vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, 3129 int slice, diskaddr_t offset, int cb_type, void *cb_arg, 3130 vio_desc_direction_t dir, boolean_t rconflict) 3131 { 3132 int status; 3133 vdc_io_t *vio; 3134 boolean_t check_resv_conflict = B_FALSE; 3135 3136 ASSERT(cb_type == CB_SYNC); 3137 3138 /* 3139 * Grab the lock, if blocked wait until the server 3140 * response causes us to wake up again. 3141 */ 3142 mutex_enter(&vdcp->lock); 3143 vdcp->sync_op_cnt++; 3144 while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH) 3145 cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock); 3146 3147 if (vdcp->state == VDC_STATE_DETACH) { 3148 cv_broadcast(&vdcp->sync_blocked_cv); 3149 vdcp->sync_op_cnt--; 3150 mutex_exit(&vdcp->lock); 3151 return (ENXIO); 3152 } 3153 3154 /* now block anyone other thread entering after us */ 3155 vdcp->sync_op_blocked = B_TRUE; 3156 vdcp->sync_op_pending = B_TRUE; 3157 mutex_exit(&vdcp->lock); 3158 3159 status = vdc_send_request(vdcp, operation, addr, 3160 nbytes, slice, offset, cb_type, cb_arg, dir); 3161 3162 mutex_enter(&vdcp->lock); 3163 3164 if (status != 0) { 3165 vdcp->sync_op_pending = B_FALSE; 3166 } else { 3167 /* 3168 * block until our transaction completes. 3169 * Also anyone else waiting also gets to go next. 3170 */ 3171 while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH) 3172 cv_wait(&vdcp->sync_pending_cv, &vdcp->lock); 3173 3174 DMSG(vdcp, 2, ": operation returned %d\n", 3175 vdcp->sync_op_status); 3176 if (vdcp->state == VDC_STATE_DETACH) { 3177 vdcp->sync_op_pending = B_FALSE; 3178 status = ENXIO; 3179 } else { 3180 status = vdcp->sync_op_status; 3181 if (status != 0 && vdcp->failfast_interval != 0) { 3182 /* 3183 * Operation has failed and failfast is enabled. 3184 * We need to check if the failure is due to a 3185 * reservation conflict if this was requested. 3186 */ 3187 check_resv_conflict = rconflict; 3188 } 3189 3190 } 3191 } 3192 3193 vdcp->sync_op_status = 0; 3194 vdcp->sync_op_blocked = B_FALSE; 3195 vdcp->sync_op_cnt--; 3196 3197 /* signal the next waiting thread */ 3198 cv_signal(&vdcp->sync_blocked_cv); 3199 3200 /* 3201 * We have to check for reservation conflict after unblocking sync 3202 * operations because some sync operations will be used to do this 3203 * check. 3204 */ 3205 if (check_resv_conflict) { 3206 vio = vdc_failfast_io_queue(vdcp, NULL); 3207 while (vio->vio_qtime != 0) 3208 cv_wait(&vdcp->failfast_io_cv, &vdcp->lock); 3209 kmem_free(vio, sizeof (vdc_io_t)); 3210 } 3211 3212 mutex_exit(&vdcp->lock); 3213 3214 return (status); 3215 } 3216 3217 3218 /* 3219 * Function: 3220 * vdc_drain_response() 3221 * 3222 * Description: 3223 * When a guest is panicking, the completion of requests needs to be 3224 * handled differently because interrupts are disabled and vdc 3225 * will not get messages. We have to poll for the messages instead. 3226 * 3227 * Note: since we are panicking we don't implement the io:::done 3228 * DTrace probe or update the I/O statistics kstats. 3229 * 3230 * Arguments: 3231 * vdc - soft state pointer for this instance of the device driver. 3232 * buf - if buf is NULL then we drain all responses, otherwise we 3233 * poll until we receive a ACK/NACK for the specific I/O 3234 * described by buf. 3235 * 3236 * Return Code: 3237 * 0 - Success 3238 */ 3239 static int 3240 vdc_drain_response(vdc_t *vdc, struct buf *buf) 3241 { 3242 int rv, idx, retries; 3243 size_t msglen; 3244 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3245 vio_dring_msg_t dmsg; 3246 struct buf *mbuf; 3247 3248 mutex_enter(&vdc->lock); 3249 3250 retries = 0; 3251 for (;;) { 3252 msglen = sizeof (dmsg); 3253 rv = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)&dmsg, 3254 &msglen); 3255 if (rv) { 3256 rv = EINVAL; 3257 break; 3258 } 3259 3260 /* 3261 * if there are no packets wait and check again 3262 */ 3263 if ((rv == 0) && (msglen == 0)) { 3264 if (retries++ > vdc_dump_retries) { 3265 rv = EAGAIN; 3266 break; 3267 } 3268 3269 drv_usecwait(vdc_usec_timeout_dump); 3270 continue; 3271 } 3272 3273 /* 3274 * Ignore all messages that are not ACKs/NACKs to 3275 * DRing requests. 3276 */ 3277 if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) || 3278 (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) { 3279 DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n", 3280 dmsg.tag.vio_msgtype, 3281 dmsg.tag.vio_subtype, 3282 dmsg.tag.vio_subtype_env); 3283 continue; 3284 } 3285 3286 /* 3287 * set the appropriate return value for the current request. 3288 */ 3289 switch (dmsg.tag.vio_subtype) { 3290 case VIO_SUBTYPE_ACK: 3291 rv = 0; 3292 break; 3293 case VIO_SUBTYPE_NACK: 3294 rv = EAGAIN; 3295 break; 3296 default: 3297 continue; 3298 } 3299 3300 idx = dmsg.start_idx; 3301 if (idx >= vdc->dring_len) { 3302 DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n", 3303 vdc->instance, idx); 3304 continue; 3305 } 3306 ldep = &vdc->local_dring[idx]; 3307 if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { 3308 DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n", 3309 vdc->instance, idx, ldep->dep->hdr.dstate); 3310 continue; 3311 } 3312 3313 if (buf != NULL && ldep->cb_type == CB_STRATEGY) { 3314 mbuf = ldep->cb_arg; 3315 mbuf->b_resid = mbuf->b_bcount - 3316 ldep->dep->payload.nbytes; 3317 bioerror(mbuf, (rv == EAGAIN)? EIO: 3318 ldep->dep->payload.status); 3319 biodone(mbuf); 3320 } else { 3321 mbuf = NULL; 3322 } 3323 3324 DMSG(vdc, 1, "[%d] Depopulating idx=%d state=%d\n", 3325 vdc->instance, idx, ldep->dep->hdr.dstate); 3326 3327 rv = vdc_depopulate_descriptor(vdc, idx); 3328 if (rv) { 3329 DMSG(vdc, 0, 3330 "[%d] Entry @ %d - depopulate failed ..\n", 3331 vdc->instance, idx); 3332 } 3333 3334 /* we have received an ACK/NACK for the specified buffer */ 3335 if (buf != NULL && buf == mbuf) { 3336 rv = 0; 3337 break; 3338 } 3339 3340 /* if this is the last descriptor - break out of loop */ 3341 if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) { 3342 if (buf != NULL) { 3343 /* 3344 * We never got a response for the specified 3345 * buffer so we fail the I/O. 3346 */ 3347 bioerror(buf, EIO); 3348 biodone(buf); 3349 } 3350 break; 3351 } 3352 } 3353 3354 mutex_exit(&vdc->lock); 3355 DMSG(vdc, 0, "End idx=%d\n", idx); 3356 3357 return (rv); 3358 } 3359 3360 3361 /* 3362 * Function: 3363 * vdc_depopulate_descriptor() 3364 * 3365 * Description: 3366 * 3367 * Arguments: 3368 * vdc - soft state pointer for this instance of the device driver. 3369 * idx - Index of the Descriptor Ring entry being modified 3370 * 3371 * Return Code: 3372 * 0 - Success 3373 */ 3374 static int 3375 vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) 3376 { 3377 vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ 3378 vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ 3379 int status = ENXIO; 3380 int rv = 0; 3381 3382 ASSERT(vdc != NULL); 3383 ASSERT(idx < vdc->dring_len); 3384 ldep = &vdc->local_dring[idx]; 3385 ASSERT(ldep != NULL); 3386 ASSERT(MUTEX_HELD(&vdc->lock)); 3387 3388 DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep); 3389 DMSG(vdc, 2, ": idx = %d\n", idx); 3390 3391 dep = ldep->dep; 3392 ASSERT(dep != NULL); 3393 ASSERT((dep->hdr.dstate == VIO_DESC_DONE) || 3394 (dep->payload.status == ECANCELED)); 3395 3396 VDC_MARK_DRING_ENTRY_FREE(vdc, idx); 3397 3398 ldep->is_free = B_TRUE; 3399 status = dep->payload.status; 3400 DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status); 3401 3402 /* 3403 * If no buffers were used to transfer information to the server when 3404 * populating the descriptor then no memory handles need to be unbound 3405 * and we can return now. 3406 */ 3407 if (ldep->nbytes == 0) { 3408 cv_signal(&vdc->dring_free_cv); 3409 return (status); 3410 } 3411 3412 /* 3413 * If the upper layer passed in a misaligned address we copied the 3414 * data into an aligned buffer before sending it to LDC - we now 3415 * copy it back to the original buffer. 3416 */ 3417 if (ldep->align_addr) { 3418 ASSERT(ldep->addr != NULL); 3419 3420 if (dep->payload.nbytes > 0) 3421 bcopy(ldep->align_addr, ldep->addr, 3422 dep->payload.nbytes); 3423 kmem_free(ldep->align_addr, 3424 sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8)); 3425 ldep->align_addr = NULL; 3426 } 3427 3428 rv = ldc_mem_unbind_handle(ldep->desc_mhdl); 3429 if (rv != 0) { 3430 DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)", 3431 vdc->instance, ldep->desc_mhdl, idx, rv); 3432 /* 3433 * The error returned by the vDisk server is more informative 3434 * and thus has a higher priority but if it isn't set we ensure 3435 * that this function returns an error. 3436 */ 3437 if (status == 0) 3438 status = EINVAL; 3439 } 3440 3441 cv_signal(&vdc->membind_cv); 3442 cv_signal(&vdc->dring_free_cv); 3443 3444 return (status); 3445 } 3446 3447 /* 3448 * Function: 3449 * vdc_populate_mem_hdl() 3450 * 3451 * Description: 3452 * 3453 * Arguments: 3454 * vdc - soft state pointer for this instance of the device driver. 3455 * idx - Index of the Descriptor Ring entry being modified 3456 * addr - virtual address being mapped in 3457 * nybtes - number of bytes in 'addr' 3458 * operation - the vDisk operation being performed (VD_OP_xxx) 3459 * 3460 * Return Code: 3461 * 0 - Success 3462 */ 3463 static int 3464 vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep) 3465 { 3466 vd_dring_entry_t *dep = NULL; 3467 ldc_mem_handle_t mhdl; 3468 caddr_t vaddr; 3469 size_t nbytes; 3470 uint8_t perm = LDC_MEM_RW; 3471 uint8_t maptype; 3472 int rv = 0; 3473 int i; 3474 3475 ASSERT(vdcp != NULL); 3476 3477 dep = ldep->dep; 3478 mhdl = ldep->desc_mhdl; 3479 3480 switch (ldep->dir) { 3481 case VIO_read_dir: 3482 perm = LDC_MEM_W; 3483 break; 3484 3485 case VIO_write_dir: 3486 perm = LDC_MEM_R; 3487 break; 3488 3489 case VIO_both_dir: 3490 perm = LDC_MEM_RW; 3491 break; 3492 3493 default: 3494 ASSERT(0); /* catch bad programming in vdc */ 3495 } 3496 3497 /* 3498 * LDC expects any addresses passed in to be 8-byte aligned. We need 3499 * to copy the contents of any misaligned buffers to a newly allocated 3500 * buffer and bind it instead (and copy the the contents back to the 3501 * original buffer passed in when depopulating the descriptor) 3502 */ 3503 vaddr = ldep->addr; 3504 nbytes = ldep->nbytes; 3505 if (((uint64_t)vaddr & 0x7) != 0) { 3506 ASSERT(ldep->align_addr == NULL); 3507 ldep->align_addr = 3508 kmem_alloc(sizeof (caddr_t) * 3509 P2ROUNDUP(nbytes, 8), KM_SLEEP); 3510 DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating " 3511 "(buf=%p nb=%ld op=%d)\n", 3512 vdcp->instance, (void *)vaddr, (void *)ldep->align_addr, 3513 nbytes, ldep->operation); 3514 if (perm != LDC_MEM_W) 3515 bcopy(vaddr, ldep->align_addr, nbytes); 3516 vaddr = ldep->align_addr; 3517 } 3518 3519 maptype = LDC_IO_MAP|LDC_SHADOW_MAP|LDC_DIRECT_MAP; 3520 rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8), 3521 maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies); 3522 DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n", 3523 vdcp->instance, dep->payload.ncookies); 3524 if (rv != 0) { 3525 DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle " 3526 "(mhdl=%p, buf=%p, err=%d)\n", 3527 vdcp->instance, (void *)mhdl, (void *)vaddr, rv); 3528 if (ldep->align_addr) { 3529 kmem_free(ldep->align_addr, 3530 sizeof (caddr_t) * P2ROUNDUP(nbytes, 8)); 3531 ldep->align_addr = NULL; 3532 } 3533 return (EAGAIN); 3534 } 3535 3536 /* 3537 * Get the other cookies (if any). 3538 */ 3539 for (i = 1; i < dep->payload.ncookies; i++) { 3540 rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]); 3541 if (rv != 0) { 3542 (void) ldc_mem_unbind_handle(mhdl); 3543 DMSG(vdcp, 0, "?[%d] Failed to get next cookie " 3544 "(mhdl=%lx cnum=%d), err=%d", 3545 vdcp->instance, mhdl, i, rv); 3546 if (ldep->align_addr) { 3547 kmem_free(ldep->align_addr, 3548 sizeof (caddr_t) * ldep->nbytes); 3549 ldep->align_addr = NULL; 3550 } 3551 return (EAGAIN); 3552 } 3553 } 3554 3555 return (rv); 3556 } 3557 3558 /* 3559 * Interrupt handlers for messages from LDC 3560 */ 3561 3562 /* 3563 * Function: 3564 * vdc_handle_cb() 3565 * 3566 * Description: 3567 * 3568 * Arguments: 3569 * event - Type of event (LDC_EVT_xxx) that triggered the callback 3570 * arg - soft state pointer for this instance of the device driver. 3571 * 3572 * Return Code: 3573 * 0 - Success 3574 */ 3575 static uint_t 3576 vdc_handle_cb(uint64_t event, caddr_t arg) 3577 { 3578 ldc_status_t ldc_state; 3579 int rv = 0; 3580 vdc_server_t *srvr = (vdc_server_t *)(void *)arg; 3581 vdc_t *vdc = srvr->vdcp; 3582 3583 ASSERT(vdc != NULL); 3584 3585 DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num); 3586 3587 /* If callback is not for the current server, ignore it */ 3588 mutex_enter(&vdc->lock); 3589 3590 if (vdc->curr_server != srvr) { 3591 DMSG(vdc, 0, "[%d] Ignoring event 0x%lx for port@%ld\n", 3592 vdc->instance, event, srvr->id); 3593 mutex_exit(&vdc->lock); 3594 return (LDC_SUCCESS); 3595 } 3596 3597 /* 3598 * Depending on the type of event that triggered this callback, 3599 * we modify the handshake state or read the data. 3600 * 3601 * NOTE: not done as a switch() as event could be triggered by 3602 * a state change and a read request. Also the ordering of the 3603 * check for the event types is deliberate. 3604 */ 3605 if (event & LDC_EVT_UP) { 3606 DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance); 3607 3608 /* get LDC state */ 3609 rv = ldc_status(srvr->ldc_handle, &ldc_state); 3610 if (rv != 0) { 3611 DMSG(vdc, 0, "[%d] Couldn't get LDC status %d", 3612 vdc->instance, rv); 3613 mutex_exit(&vdc->lock); 3614 return (LDC_SUCCESS); 3615 } 3616 if (srvr->ldc_state != LDC_UP && 3617 ldc_state == LDC_UP) { 3618 /* 3619 * Reset the transaction sequence numbers when 3620 * LDC comes up. We then kick off the handshake 3621 * negotiation with the vDisk server. 3622 */ 3623 vdc->seq_num = 1; 3624 vdc->seq_num_reply = 0; 3625 srvr->ldc_state = ldc_state; 3626 cv_signal(&vdc->initwait_cv); 3627 } 3628 } 3629 3630 if (event & LDC_EVT_READ) { 3631 DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance); 3632 mutex_enter(&vdc->read_lock); 3633 cv_signal(&vdc->read_cv); 3634 vdc->read_state = VDC_READ_PENDING; 3635 mutex_exit(&vdc->read_lock); 3636 mutex_exit(&vdc->lock); 3637 3638 /* that's all we have to do - no need to handle DOWN/RESET */ 3639 return (LDC_SUCCESS); 3640 } 3641 3642 if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) { 3643 3644 DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance); 3645 3646 /* 3647 * Need to wake up any readers so they will 3648 * detect that a reset has occurred. 3649 */ 3650 mutex_enter(&vdc->read_lock); 3651 if ((vdc->read_state == VDC_READ_WAITING) || 3652 (vdc->read_state == VDC_READ_RESET)) 3653 cv_signal(&vdc->read_cv); 3654 vdc->read_state = VDC_READ_RESET; 3655 mutex_exit(&vdc->read_lock); 3656 3657 /* wake up any threads waiting for connection to come up */ 3658 if (vdc->state == VDC_STATE_INIT_WAITING) { 3659 vdc->state = VDC_STATE_RESETTING; 3660 cv_signal(&vdc->initwait_cv); 3661 } 3662 3663 } 3664 3665 mutex_exit(&vdc->lock); 3666 3667 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) 3668 DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received", 3669 vdc->instance, event); 3670 3671 return (LDC_SUCCESS); 3672 } 3673 3674 /* 3675 * Function: 3676 * vdc_wait_for_response() 3677 * 3678 * Description: 3679 * Block waiting for a response from the server. If there is 3680 * no data the thread block on the read_cv that is signalled 3681 * by the callback when an EVT_READ occurs. 3682 * 3683 * Arguments: 3684 * vdcp - soft state pointer for this instance of the device driver. 3685 * 3686 * Return Code: 3687 * 0 - Success 3688 */ 3689 static int 3690 vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp) 3691 { 3692 size_t nbytes = sizeof (*msgp); 3693 int status; 3694 3695 ASSERT(vdcp != NULL); 3696 3697 DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance); 3698 3699 status = vdc_recv(vdcp, msgp, &nbytes); 3700 DMSG(vdcp, 3, "vdc_read() done.. status=0x%x size=0x%x\n", 3701 status, (int)nbytes); 3702 if (status) { 3703 DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n", 3704 vdcp->instance, status); 3705 return (status); 3706 } 3707 3708 if (nbytes < sizeof (vio_msg_tag_t)) { 3709 DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n", 3710 vdcp->instance, sizeof (vio_msg_tag_t), nbytes); 3711 return (ENOMSG); 3712 } 3713 3714 DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance, 3715 msgp->tag.vio_msgtype, 3716 msgp->tag.vio_subtype, 3717 msgp->tag.vio_subtype_env); 3718 3719 /* 3720 * Verify the Session ID of the message 3721 * 3722 * Every message after the Version has been negotiated should 3723 * have the correct session ID set. 3724 */ 3725 if ((msgp->tag.vio_sid != vdcp->session_id) && 3726 (msgp->tag.vio_subtype_env != VIO_VER_INFO)) { 3727 DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, " 3728 "expected 0x%lx [seq num %lx @ %d]", 3729 vdcp->instance, msgp->tag.vio_sid, 3730 vdcp->session_id, 3731 ((vio_dring_msg_t *)msgp)->seq_num, 3732 ((vio_dring_msg_t *)msgp)->start_idx); 3733 return (ENOMSG); 3734 } 3735 return (0); 3736 } 3737 3738 3739 /* 3740 * Function: 3741 * vdc_resubmit_backup_dring() 3742 * 3743 * Description: 3744 * Resubmit each descriptor in the backed up dring to 3745 * vDisk server. The Dring was backed up during connection 3746 * reset. 3747 * 3748 * Arguments: 3749 * vdcp - soft state pointer for this instance of the device driver. 3750 * 3751 * Return Code: 3752 * 0 - Success 3753 */ 3754 static int 3755 vdc_resubmit_backup_dring(vdc_t *vdcp) 3756 { 3757 int processed = 0; 3758 int count; 3759 int b_idx; 3760 int rv = 0; 3761 int dring_size; 3762 int op; 3763 vio_msg_t vio_msg; 3764 vdc_local_desc_t *curr_ldep; 3765 3766 ASSERT(MUTEX_NOT_HELD(&vdcp->lock)); 3767 ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING); 3768 3769 if (vdcp->local_dring_backup == NULL) { 3770 /* the pending requests have already been processed */ 3771 return (0); 3772 } 3773 3774 DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n", 3775 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3776 3777 /* 3778 * Walk the backup copy of the local descriptor ring and 3779 * resubmit all the outstanding transactions. 3780 */ 3781 b_idx = vdcp->local_dring_backup_tail; 3782 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3783 3784 curr_ldep = &(vdcp->local_dring_backup[b_idx]); 3785 3786 /* only resubmit outstanding transactions */ 3787 if (!curr_ldep->is_free) { 3788 /* 3789 * If we are retrying a block read/write operation we 3790 * need to update the I/O statistics to indicate that 3791 * the request is being put back on the waitq to be 3792 * serviced (it will have been taken off after the 3793 * error was reported). 3794 */ 3795 mutex_enter(&vdcp->lock); 3796 op = curr_ldep->operation; 3797 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3798 DTRACE_IO1(start, buf_t *, curr_ldep->cb_arg); 3799 VD_KSTAT_WAITQ_ENTER(vdcp); 3800 } 3801 3802 DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx); 3803 rv = vdc_populate_descriptor(vdcp, op, 3804 curr_ldep->addr, curr_ldep->nbytes, 3805 curr_ldep->slice, curr_ldep->offset, 3806 curr_ldep->cb_type, curr_ldep->cb_arg, 3807 curr_ldep->dir); 3808 3809 if (rv) { 3810 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3811 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3812 VD_KSTAT_WAITQ_EXIT(vdcp); 3813 DTRACE_IO1(done, buf_t *, 3814 curr_ldep->cb_arg); 3815 } 3816 DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n", 3817 vdcp->instance, b_idx); 3818 mutex_exit(&vdcp->lock); 3819 goto done; 3820 } 3821 3822 /* 3823 * If this is a block read/write we update the I/O 3824 * statistics kstat to indicate that the request 3825 * has been sent back to the vDisk server and should 3826 * now be put on the run queue. 3827 */ 3828 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 3829 DTRACE_PROBE1(send, buf_t *, curr_ldep->cb_arg); 3830 VD_KSTAT_WAITQ_TO_RUNQ(vdcp); 3831 } 3832 mutex_exit(&vdcp->lock); 3833 3834 /* Wait for the response message. */ 3835 DMSG(vdcp, 1, "waiting for response to idx=%x\n", 3836 b_idx); 3837 rv = vdc_wait_for_response(vdcp, &vio_msg); 3838 if (rv) { 3839 /* 3840 * If this is a block read/write we update 3841 * the I/O statistics kstat to take it 3842 * off the run queue. 3843 */ 3844 mutex_enter(&vdcp->lock); 3845 if (op == VD_OP_BREAD || op == VD_OP_BWRITE) { 3846 VD_UPDATE_ERR_STATS(vdcp, vd_transerrs); 3847 VD_KSTAT_RUNQ_EXIT(vdcp); 3848 DTRACE_IO1(done, buf_t *, 3849 curr_ldep->cb_arg); 3850 } 3851 DMSG(vdcp, 1, "[%d] wait_for_response " 3852 "returned err=%d\n", vdcp->instance, 3853 rv); 3854 mutex_exit(&vdcp->lock); 3855 goto done; 3856 } 3857 3858 DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx); 3859 rv = vdc_process_data_msg(vdcp, &vio_msg); 3860 if (rv) { 3861 DMSG(vdcp, 1, "[%d] process_data_msg " 3862 "returned err=%d\n", vdcp->instance, 3863 rv); 3864 goto done; 3865 } 3866 /* 3867 * Mark this entry as free so that we will not resubmit 3868 * this "done" request again, if we were to use the same 3869 * backup_dring again in future. This could happen when 3870 * a reset happens while processing the backup_dring. 3871 */ 3872 curr_ldep->is_free = B_TRUE; 3873 processed++; 3874 } 3875 3876 /* get the next element to submit */ 3877 if (++b_idx >= vdcp->local_dring_backup_len) 3878 b_idx = 0; 3879 } 3880 3881 /* all done - now clear up pending dring copy */ 3882 dring_size = vdcp->local_dring_backup_len * 3883 sizeof (vdcp->local_dring_backup[0]); 3884 3885 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3886 3887 vdcp->local_dring_backup = NULL; 3888 3889 done: 3890 DTRACE_PROBE2(processed, int, processed, vdc_t *, vdcp); 3891 3892 return (rv); 3893 } 3894 3895 /* 3896 * Function: 3897 * vdc_cancel_backup_dring 3898 * 3899 * Description: 3900 * Cancel each descriptor in the backed up dring to vDisk server. 3901 * The Dring was backed up during connection reset. 3902 * 3903 * Arguments: 3904 * vdcp - soft state pointer for this instance of the device driver. 3905 * 3906 * Return Code: 3907 * None 3908 */ 3909 void 3910 vdc_cancel_backup_dring(vdc_t *vdcp) 3911 { 3912 vdc_local_desc_t *ldep; 3913 struct buf *bufp; 3914 int count; 3915 int b_idx; 3916 int dring_size; 3917 int cancelled = 0; 3918 3919 ASSERT(MUTEX_HELD(&vdcp->lock)); 3920 ASSERT(vdcp->state == VDC_STATE_INIT || 3921 vdcp->state == VDC_STATE_INIT_WAITING || 3922 vdcp->state == VDC_STATE_NEGOTIATE || 3923 vdcp->state == VDC_STATE_RESETTING); 3924 3925 if (vdcp->local_dring_backup == NULL) { 3926 /* the pending requests have already been processed */ 3927 return; 3928 } 3929 3930 DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n", 3931 vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail); 3932 3933 /* 3934 * Walk the backup copy of the local descriptor ring and 3935 * cancel all the outstanding transactions. 3936 */ 3937 b_idx = vdcp->local_dring_backup_tail; 3938 for (count = 0; count < vdcp->local_dring_backup_len; count++) { 3939 3940 ldep = &(vdcp->local_dring_backup[b_idx]); 3941 3942 /* only cancel outstanding transactions */ 3943 if (!ldep->is_free) { 3944 3945 DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx); 3946 cancelled++; 3947 3948 /* 3949 * All requests have already been cleared from the 3950 * local descriptor ring and the LDC channel has been 3951 * reset so we will never get any reply for these 3952 * requests. Now we just have to notify threads waiting 3953 * for replies that the request has failed. 3954 */ 3955 switch (ldep->cb_type) { 3956 case CB_SYNC: 3957 ASSERT(vdcp->sync_op_pending); 3958 vdcp->sync_op_status = EIO; 3959 vdcp->sync_op_pending = B_FALSE; 3960 cv_signal(&vdcp->sync_pending_cv); 3961 break; 3962 3963 case CB_STRATEGY: 3964 bufp = ldep->cb_arg; 3965 ASSERT(bufp != NULL); 3966 bufp->b_resid = bufp->b_bcount; 3967 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 3968 VD_KSTAT_RUNQ_EXIT(vdcp); 3969 DTRACE_IO1(done, buf_t *, bufp); 3970 bioerror(bufp, EIO); 3971 biodone(bufp); 3972 break; 3973 3974 default: 3975 ASSERT(0); 3976 } 3977 3978 } 3979 3980 /* get the next element to cancel */ 3981 if (++b_idx >= vdcp->local_dring_backup_len) 3982 b_idx = 0; 3983 } 3984 3985 /* all done - now clear up pending dring copy */ 3986 dring_size = vdcp->local_dring_backup_len * 3987 sizeof (vdcp->local_dring_backup[0]); 3988 3989 (void) kmem_free(vdcp->local_dring_backup, dring_size); 3990 3991 vdcp->local_dring_backup = NULL; 3992 3993 DTRACE_PROBE2(cancelled, int, cancelled, vdc_t *, vdcp); 3994 } 3995 3996 /* 3997 * Function: 3998 * vdc_connection_timeout 3999 * 4000 * Description: 4001 * This function is invoked if the timeout set to establish the connection 4002 * with vds expires. This will happen if we spend too much time in the 4003 * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. Then we will 4004 * cancel any pending request and mark them as failed. 4005 * 4006 * If the timeout does not expire, it will be cancelled when we reach the 4007 * VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can 4008 * be invoked while we are in the VDC_STATE_HANDLE_PENDING or 4009 * VDC_STATE_RESETTING state in which case we do nothing because the 4010 * timeout is being cancelled. 4011 * 4012 * Arguments: 4013 * arg - argument of the timeout function actually a soft state 4014 * pointer for the instance of the device driver. 4015 * 4016 * Return Code: 4017 * None 4018 */ 4019 void 4020 vdc_connection_timeout(void *arg) 4021 { 4022 vdc_t *vdcp = (vdc_t *)arg; 4023 4024 mutex_enter(&vdcp->lock); 4025 4026 if (vdcp->state == VDC_STATE_HANDLE_PENDING || 4027 vdcp->state == VDC_STATE_DETACH) { 4028 /* 4029 * The connection has just been re-established or 4030 * we are detaching. 4031 */ 4032 vdcp->ctimeout_reached = B_FALSE; 4033 mutex_exit(&vdcp->lock); 4034 return; 4035 } 4036 4037 vdcp->ctimeout_reached = B_TRUE; 4038 4039 /* notify requests waiting for sending */ 4040 cv_broadcast(&vdcp->running_cv); 4041 4042 /* cancel requests waiting for a result */ 4043 vdc_cancel_backup_dring(vdcp); 4044 4045 mutex_exit(&vdcp->lock); 4046 4047 cmn_err(CE_NOTE, "[%d] connection to service domain timeout", 4048 vdcp->instance); 4049 } 4050 4051 /* 4052 * Function: 4053 * vdc_backup_local_dring() 4054 * 4055 * Description: 4056 * Backup the current dring in the event of a reset. The Dring 4057 * transactions will be resubmitted to the server when the 4058 * connection is restored. 4059 * 4060 * Arguments: 4061 * vdcp - soft state pointer for this instance of the device driver. 4062 * 4063 * Return Code: 4064 * NONE 4065 */ 4066 static void 4067 vdc_backup_local_dring(vdc_t *vdcp) 4068 { 4069 int dring_size; 4070 4071 ASSERT(MUTEX_HELD(&vdcp->lock)); 4072 ASSERT(vdcp->state == VDC_STATE_RESETTING); 4073 4074 /* 4075 * If the backup dring is stil around, it means 4076 * that the last restore did not complete. However, 4077 * since we never got back into the running state, 4078 * the backup copy we have is still valid. 4079 */ 4080 if (vdcp->local_dring_backup != NULL) { 4081 DMSG(vdcp, 1, "reusing local descriptor ring backup " 4082 "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len, 4083 vdcp->local_dring_backup_tail); 4084 return; 4085 } 4086 4087 /* 4088 * The backup dring can be NULL and the local dring may not be 4089 * initialized. This can happen if we had a reset while establishing 4090 * a new connection but after the connection has timed out. In that 4091 * case the backup dring is NULL because the requests have been 4092 * cancelled and the request occured before the local dring is 4093 * initialized. 4094 */ 4095 if (!(vdcp->initialized & VDC_DRING_LOCAL)) 4096 return; 4097 4098 DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, " 4099 "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx); 4100 4101 dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]); 4102 4103 vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP); 4104 bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size); 4105 4106 vdcp->local_dring_backup_tail = vdcp->dring_curr_idx; 4107 vdcp->local_dring_backup_len = vdcp->dring_len; 4108 } 4109 4110 static void 4111 vdc_switch_server(vdc_t *vdcp) 4112 { 4113 int rv; 4114 vdc_server_t *curr_server, *new_server; 4115 4116 ASSERT(MUTEX_HELD(&vdcp->lock)); 4117 4118 /* if there is only one server return back */ 4119 if (vdcp->num_servers == 1) { 4120 return; 4121 } 4122 4123 /* Get current and next server */ 4124 curr_server = vdcp->curr_server; 4125 new_server = 4126 (curr_server->next) ? curr_server->next : vdcp->server_list; 4127 ASSERT(curr_server != new_server); 4128 4129 /* bring current server's channel down */ 4130 rv = ldc_down(curr_server->ldc_handle); 4131 if (rv) { 4132 DMSG(vdcp, 0, "[%d] Cannot bring channel down, port %ld\n", 4133 vdcp->instance, curr_server->id); 4134 return; 4135 } 4136 4137 /* switch the server */ 4138 vdcp->curr_server = new_server; 4139 4140 DMSG(vdcp, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n", 4141 vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id); 4142 } 4143 4144 /* -------------------------------------------------------------------------- */ 4145 4146 /* 4147 * The following functions process the incoming messages from vds 4148 */ 4149 4150 /* 4151 * Function: 4152 * vdc_process_msg_thread() 4153 * 4154 * Description: 4155 * 4156 * Main VDC message processing thread. Each vDisk instance 4157 * consists of a copy of this thread. This thread triggers 4158 * all the handshakes and data exchange with the server. It 4159 * also handles all channel resets 4160 * 4161 * Arguments: 4162 * vdc - soft state pointer for this instance of the device driver. 4163 * 4164 * Return Code: 4165 * None 4166 */ 4167 static void 4168 vdc_process_msg_thread(vdc_t *vdcp) 4169 { 4170 int status; 4171 int ctimeout; 4172 timeout_id_t tmid = 0; 4173 clock_t ldcup_timeout = 0; 4174 4175 mutex_enter(&vdcp->lock); 4176 4177 for (;;) { 4178 4179 #define Q(_s) (vdcp->state == _s) ? #_s : 4180 DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state, 4181 Q(VDC_STATE_INIT) 4182 Q(VDC_STATE_INIT_WAITING) 4183 Q(VDC_STATE_NEGOTIATE) 4184 Q(VDC_STATE_HANDLE_PENDING) 4185 Q(VDC_STATE_RUNNING) 4186 Q(VDC_STATE_RESETTING) 4187 Q(VDC_STATE_DETACH) 4188 "UNKNOWN"); 4189 4190 switch (vdcp->state) { 4191 case VDC_STATE_INIT: 4192 4193 /* 4194 * If requested, start a timeout to check if the 4195 * connection with vds is established in the 4196 * specified delay. If the timeout expires, we 4197 * will cancel any pending request. 4198 * 4199 * If some reset have occurred while establishing 4200 * the connection, we already have a timeout armed 4201 * and in that case we don't need to arm a new one. 4202 * 4203 * The same rule applies when there are multiple vds'. 4204 * If either a connection cannot be established or 4205 * the handshake times out, the connection thread will 4206 * try another server. The 'ctimeout' will report 4207 * back an error after it expires irrespective of 4208 * whether the vdisk is trying to connect to just 4209 * one or multiple servers. 4210 */ 4211 ctimeout = (vdc_timeout != 0)? 4212 vdc_timeout : vdcp->curr_server->ctimeout; 4213 4214 if (ctimeout != 0 && tmid == 0) { 4215 tmid = timeout(vdc_connection_timeout, vdcp, 4216 ctimeout * drv_usectohz(MICROSEC)); 4217 } 4218 4219 /* Check if we are re-initializing repeatedly */ 4220 if (vdcp->hshake_cnt > vdc_hshake_retries && 4221 vdcp->lifecycle != VDC_LC_ONLINE) { 4222 4223 DMSG(vdcp, 0, "[%d] too many handshakes,cnt=%d", 4224 vdcp->instance, vdcp->hshake_cnt); 4225 cmn_err(CE_NOTE, "[%d] disk access failed.\n", 4226 vdcp->instance); 4227 vdcp->state = VDC_STATE_DETACH; 4228 break; 4229 } 4230 4231 /* Switch to STATE_DETACH if drv is detaching */ 4232 if (vdcp->lifecycle == VDC_LC_DETACHING) { 4233 vdcp->state = VDC_STATE_DETACH; 4234 break; 4235 } 4236 4237 /* Switch server */ 4238 if (vdcp->hshake_cnt > 0) 4239 vdc_switch_server(vdcp); 4240 vdcp->hshake_cnt++; 4241 4242 /* Bring up connection with vds via LDC */ 4243 status = vdc_start_ldc_connection(vdcp); 4244 if (status != EINVAL) { 4245 vdcp->state = VDC_STATE_INIT_WAITING; 4246 } 4247 break; 4248 4249 case VDC_STATE_INIT_WAITING: 4250 4251 /* if channel is UP, start negotiation */ 4252 if (vdcp->curr_server->ldc_state == LDC_UP) { 4253 vdcp->state = VDC_STATE_NEGOTIATE; 4254 break; 4255 } 4256 4257 /* check if only one server exists */ 4258 if (vdcp->num_servers == 1) { 4259 cv_wait(&vdcp->initwait_cv, &vdcp->lock); 4260 } else { 4261 /* 4262 * wait for LDC_UP, if it times out, switch 4263 * to another server. 4264 */ 4265 ldcup_timeout = ddi_get_lbolt() + 4266 (vdc_ldcup_timeout * 4267 drv_usectohz(MICROSEC)); 4268 status = cv_timedwait(&vdcp->initwait_cv, 4269 &vdcp->lock, ldcup_timeout); 4270 if (status == -1 && 4271 vdcp->state == VDC_STATE_INIT_WAITING && 4272 vdcp->curr_server->ldc_state != LDC_UP) { 4273 /* timed out & still waiting */ 4274 vdcp->state = VDC_STATE_INIT; 4275 break; 4276 } 4277 } 4278 4279 if (vdcp->state != VDC_STATE_INIT_WAITING) { 4280 DMSG(vdcp, 0, 4281 "state moved to %d out from under us...\n", 4282 vdcp->state); 4283 } 4284 break; 4285 4286 case VDC_STATE_NEGOTIATE: 4287 switch (status = vdc_ver_negotiation(vdcp)) { 4288 case 0: 4289 break; 4290 default: 4291 DMSG(vdcp, 0, "ver negotiate failed (%d)..\n", 4292 status); 4293 goto reset; 4294 } 4295 4296 switch (status = vdc_attr_negotiation(vdcp)) { 4297 case 0: 4298 break; 4299 default: 4300 DMSG(vdcp, 0, "attr negotiate failed (%d)..\n", 4301 status); 4302 goto reset; 4303 } 4304 4305 switch (status = vdc_dring_negotiation(vdcp)) { 4306 case 0: 4307 break; 4308 default: 4309 DMSG(vdcp, 0, "dring negotiate failed (%d)..\n", 4310 status); 4311 goto reset; 4312 } 4313 4314 switch (status = vdc_rdx_exchange(vdcp)) { 4315 case 0: 4316 vdcp->state = VDC_STATE_HANDLE_PENDING; 4317 goto done; 4318 default: 4319 DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n", 4320 status); 4321 goto reset; 4322 } 4323 reset: 4324 DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n", 4325 status); 4326 vdcp->state = VDC_STATE_RESETTING; 4327 vdcp->self_reset = B_TRUE; 4328 done: 4329 DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n", 4330 vdcp->state); 4331 break; 4332 4333 case VDC_STATE_HANDLE_PENDING: 4334 4335 if (vdcp->ctimeout_reached) { 4336 /* 4337 * The connection timeout had been reached so 4338 * pending requests have been cancelled. Now 4339 * that the connection is back we can reset 4340 * the timeout. 4341 */ 4342 ASSERT(vdcp->local_dring_backup == NULL); 4343 ASSERT(tmid != 0); 4344 tmid = 0; 4345 vdcp->ctimeout_reached = B_FALSE; 4346 vdcp->state = VDC_STATE_RUNNING; 4347 DMSG(vdcp, 0, "[%d] connection to service " 4348 "domain is up", vdcp->instance); 4349 break; 4350 } 4351 4352 mutex_exit(&vdcp->lock); 4353 if (tmid != 0) { 4354 (void) untimeout(tmid); 4355 tmid = 0; 4356 } 4357 status = vdc_resubmit_backup_dring(vdcp); 4358 mutex_enter(&vdcp->lock); 4359 4360 if (status) 4361 vdcp->state = VDC_STATE_RESETTING; 4362 else 4363 vdcp->state = VDC_STATE_RUNNING; 4364 4365 break; 4366 4367 /* enter running state */ 4368 case VDC_STATE_RUNNING: 4369 /* 4370 * Signal anyone waiting for the connection 4371 * to come on line. 4372 */ 4373 vdcp->hshake_cnt = 0; 4374 cv_broadcast(&vdcp->running_cv); 4375 4376 /* failfast has to been checked after reset */ 4377 cv_signal(&vdcp->failfast_cv); 4378 4379 /* ownership is lost during reset */ 4380 if (vdcp->ownership & VDC_OWNERSHIP_WANTED) 4381 vdcp->ownership |= VDC_OWNERSHIP_RESET; 4382 cv_signal(&vdcp->ownership_cv); 4383 4384 cmn_err(CE_CONT, "?vdisk@%d is online using " 4385 "ldc@%ld,%ld\n", vdcp->instance, 4386 vdcp->curr_server->ldc_id, vdcp->curr_server->id); 4387 4388 mutex_exit(&vdcp->lock); 4389 4390 for (;;) { 4391 vio_msg_t msg; 4392 status = vdc_wait_for_response(vdcp, &msg); 4393 if (status) break; 4394 4395 DMSG(vdcp, 1, "[%d] new pkt(s) available\n", 4396 vdcp->instance); 4397 status = vdc_process_data_msg(vdcp, &msg); 4398 if (status) { 4399 DMSG(vdcp, 1, "[%d] process_data_msg " 4400 "returned err=%d\n", vdcp->instance, 4401 status); 4402 break; 4403 } 4404 4405 } 4406 4407 mutex_enter(&vdcp->lock); 4408 4409 cmn_err(CE_CONT, "?vdisk@%d is offline\n", 4410 vdcp->instance); 4411 4412 vdcp->state = VDC_STATE_RESETTING; 4413 vdcp->self_reset = B_TRUE; 4414 break; 4415 4416 case VDC_STATE_RESETTING: 4417 /* 4418 * When we reach this state, we either come from the 4419 * VDC_STATE_RUNNING state and we can have pending 4420 * request but no timeout is armed; or we come from 4421 * the VDC_STATE_INIT_WAITING, VDC_NEGOTIATE or 4422 * VDC_HANDLE_PENDING state and there is no pending 4423 * request or pending requests have already been copied 4424 * into the backup dring. So we can safely keep the 4425 * connection timeout armed while we are in this state. 4426 */ 4427 4428 DMSG(vdcp, 0, "Initiating channel reset " 4429 "(pending = %d)\n", (int)vdcp->threads_pending); 4430 4431 if (vdcp->self_reset) { 4432 DMSG(vdcp, 0, 4433 "[%d] calling stop_ldc_connection.\n", 4434 vdcp->instance); 4435 status = vdc_stop_ldc_connection(vdcp); 4436 vdcp->self_reset = B_FALSE; 4437 } 4438 4439 /* 4440 * Wait for all threads currently waiting 4441 * for a free dring entry to use. 4442 */ 4443 while (vdcp->threads_pending) { 4444 cv_broadcast(&vdcp->membind_cv); 4445 cv_broadcast(&vdcp->dring_free_cv); 4446 mutex_exit(&vdcp->lock); 4447 /* give the waiters enough time to wake up */ 4448 delay(vdc_hz_min_ldc_delay); 4449 mutex_enter(&vdcp->lock); 4450 } 4451 4452 ASSERT(vdcp->threads_pending == 0); 4453 4454 /* Sanity check that no thread is receiving */ 4455 ASSERT(vdcp->read_state != VDC_READ_WAITING); 4456 4457 vdcp->read_state = VDC_READ_IDLE; 4458 4459 vdc_backup_local_dring(vdcp); 4460 4461 /* cleanup the old d-ring */ 4462 vdc_destroy_descriptor_ring(vdcp); 4463 4464 /* go and start again */ 4465 vdcp->state = VDC_STATE_INIT; 4466 4467 break; 4468 4469 case VDC_STATE_DETACH: 4470 DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n", 4471 vdcp->instance); 4472 4473 /* cancel any pending timeout */ 4474 mutex_exit(&vdcp->lock); 4475 if (tmid != 0) { 4476 (void) untimeout(tmid); 4477 tmid = 0; 4478 } 4479 mutex_enter(&vdcp->lock); 4480 4481 /* 4482 * Signal anyone waiting for connection 4483 * to come online 4484 */ 4485 cv_broadcast(&vdcp->running_cv); 4486 4487 while (vdcp->sync_op_pending) { 4488 cv_signal(&vdcp->sync_pending_cv); 4489 cv_signal(&vdcp->sync_blocked_cv); 4490 mutex_exit(&vdcp->lock); 4491 /* give the waiters enough time to wake up */ 4492 delay(vdc_hz_min_ldc_delay); 4493 mutex_enter(&vdcp->lock); 4494 } 4495 4496 mutex_exit(&vdcp->lock); 4497 4498 DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n", 4499 vdcp->instance); 4500 thread_exit(); 4501 break; 4502 } 4503 } 4504 } 4505 4506 4507 /* 4508 * Function: 4509 * vdc_process_data_msg() 4510 * 4511 * Description: 4512 * This function is called by the message processing thread each time 4513 * a message with a msgtype of VIO_TYPE_DATA is received. It will either 4514 * be an ACK or NACK from vds[1] which vdc handles as follows. 4515 * ACK - wake up the waiting thread 4516 * NACK - resend any messages necessary 4517 * 4518 * [1] Although the message format allows it, vds should not send a 4519 * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for 4520 * some bizarre reason it does, vdc will reset the connection. 4521 * 4522 * Arguments: 4523 * vdc - soft state pointer for this instance of the device driver. 4524 * msg - the LDC message sent by vds 4525 * 4526 * Return Code: 4527 * 0 - Success. 4528 * > 0 - error value returned by LDC 4529 */ 4530 static int 4531 vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg) 4532 { 4533 int status = 0; 4534 vio_dring_msg_t *dring_msg; 4535 vdc_local_desc_t *ldep = NULL; 4536 int start, end; 4537 int idx; 4538 int op; 4539 4540 dring_msg = (vio_dring_msg_t *)msg; 4541 4542 ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA); 4543 ASSERT(vdcp != NULL); 4544 4545 mutex_enter(&vdcp->lock); 4546 4547 /* 4548 * Check to see if the message has bogus data 4549 */ 4550 idx = start = dring_msg->start_idx; 4551 end = dring_msg->end_idx; 4552 if ((start >= vdcp->dring_len) || 4553 (end >= vdcp->dring_len) || (end < -1)) { 4554 /* 4555 * Update the I/O statistics to indicate that an error ocurred. 4556 * No need to update the wait/run queues as no specific read or 4557 * write request is being completed in response to this 'msg'. 4558 */ 4559 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4560 DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n", 4561 vdcp->instance, start, end); 4562 mutex_exit(&vdcp->lock); 4563 return (EINVAL); 4564 } 4565 4566 /* 4567 * Verify that the sequence number is what vdc expects. 4568 */ 4569 switch (vdc_verify_seq_num(vdcp, dring_msg)) { 4570 case VDC_SEQ_NUM_TODO: 4571 break; /* keep processing this message */ 4572 case VDC_SEQ_NUM_SKIP: 4573 mutex_exit(&vdcp->lock); 4574 return (0); 4575 case VDC_SEQ_NUM_INVALID: 4576 /* 4577 * Update the I/O statistics to indicate that an error ocurred. 4578 * No need to update the wait/run queues as no specific read or 4579 * write request is being completed in response to this 'msg'. 4580 */ 4581 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4582 DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance); 4583 mutex_exit(&vdcp->lock); 4584 return (ENXIO); 4585 } 4586 4587 if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) { 4588 /* 4589 * Update the I/O statistics to indicate that an error ocurred. 4590 * 4591 * We need to update the run queue if a read or write request 4592 * is being NACKed - otherwise there will appear to be an 4593 * indefinite outstanding request and statistics reported by 4594 * iostat(1M) will be incorrect. The transaction will be 4595 * resubmitted from the backup DRing following the reset 4596 * and the wait/run queues will be entered again. 4597 */ 4598 ldep = &vdcp->local_dring[idx]; 4599 op = ldep->operation; 4600 if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) { 4601 DTRACE_IO1(done, buf_t *, ldep->cb_arg); 4602 VD_KSTAT_RUNQ_EXIT(vdcp); 4603 } 4604 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4605 VDC_DUMP_DRING_MSG(dring_msg); 4606 DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance); 4607 mutex_exit(&vdcp->lock); 4608 return (EIO); 4609 4610 } else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) { 4611 /* 4612 * Update the I/O statistics to indicate that an error occurred. 4613 * No need to update the wait/run queues as no specific read or 4614 * write request is being completed in response to this 'msg'. 4615 */ 4616 VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs); 4617 mutex_exit(&vdcp->lock); 4618 return (EPROTO); 4619 } 4620 4621 DMSG(vdcp, 1, ": start %d end %d\n", start, end); 4622 ASSERT(start == end); 4623 4624 ldep = &vdcp->local_dring[idx]; 4625 4626 DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n", 4627 ldep->dep->hdr.dstate, ldep->cb_type); 4628 4629 if (ldep->dep->hdr.dstate == VIO_DESC_DONE) { 4630 struct buf *bufp; 4631 4632 switch (ldep->cb_type) { 4633 case CB_SYNC: 4634 ASSERT(vdcp->sync_op_pending); 4635 4636 status = vdc_depopulate_descriptor(vdcp, idx); 4637 vdcp->sync_op_status = status; 4638 vdcp->sync_op_pending = B_FALSE; 4639 cv_signal(&vdcp->sync_pending_cv); 4640 break; 4641 4642 case CB_STRATEGY: 4643 bufp = ldep->cb_arg; 4644 ASSERT(bufp != NULL); 4645 bufp->b_resid = 4646 bufp->b_bcount - ldep->dep->payload.nbytes; 4647 status = ldep->dep->payload.status; /* Future:ntoh */ 4648 if (status != 0) { 4649 DMSG(vdcp, 1, "strategy status=%d\n", status); 4650 VD_UPDATE_ERR_STATS(vdcp, vd_softerrs); 4651 bioerror(bufp, status); 4652 } 4653 4654 (void) vdc_depopulate_descriptor(vdcp, idx); 4655 4656 DMSG(vdcp, 1, 4657 "strategy complete req=%ld bytes resp=%ld bytes\n", 4658 bufp->b_bcount, ldep->dep->payload.nbytes); 4659 4660 if (status != 0 && vdcp->failfast_interval != 0) { 4661 /* 4662 * The I/O has failed and failfast is enabled. 4663 * We need the failfast thread to check if the 4664 * failure is due to a reservation conflict. 4665 */ 4666 (void) vdc_failfast_io_queue(vdcp, bufp); 4667 } else { 4668 if (status == 0) { 4669 op = (bufp->b_flags & B_READ) ? 4670 VD_OP_BREAD : VD_OP_BWRITE; 4671 VD_UPDATE_IO_STATS(vdcp, op, 4672 ldep->dep->payload.nbytes); 4673 } 4674 VD_KSTAT_RUNQ_EXIT(vdcp); 4675 DTRACE_IO1(done, buf_t *, bufp); 4676 biodone(bufp); 4677 } 4678 break; 4679 4680 default: 4681 ASSERT(0); 4682 } 4683 } 4684 4685 /* let the arrival signal propogate */ 4686 mutex_exit(&vdcp->lock); 4687 4688 /* probe gives the count of how many entries were processed */ 4689 DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp); 4690 4691 return (0); 4692 } 4693 4694 4695 /* 4696 * Function: 4697 * vdc_handle_ver_msg() 4698 * 4699 * Description: 4700 * 4701 * Arguments: 4702 * vdc - soft state pointer for this instance of the device driver. 4703 * ver_msg - LDC message sent by vDisk server 4704 * 4705 * Return Code: 4706 * 0 - Success 4707 */ 4708 static int 4709 vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg) 4710 { 4711 int status = 0; 4712 4713 ASSERT(vdc != NULL); 4714 ASSERT(mutex_owned(&vdc->lock)); 4715 4716 if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) { 4717 return (EPROTO); 4718 } 4719 4720 if (ver_msg->dev_class != VDEV_DISK_SERVER) { 4721 return (EINVAL); 4722 } 4723 4724 switch (ver_msg->tag.vio_subtype) { 4725 case VIO_SUBTYPE_ACK: 4726 /* 4727 * We check to see if the version returned is indeed supported 4728 * (The server may have also adjusted the minor number downwards 4729 * and if so 'ver_msg' will contain the actual version agreed) 4730 */ 4731 if (vdc_is_supported_version(ver_msg)) { 4732 vdc->ver.major = ver_msg->ver_major; 4733 vdc->ver.minor = ver_msg->ver_minor; 4734 ASSERT(vdc->ver.major > 0); 4735 } else { 4736 status = EPROTO; 4737 } 4738 break; 4739 4740 case VIO_SUBTYPE_NACK: 4741 /* 4742 * call vdc_is_supported_version() which will return the next 4743 * supported version (if any) in 'ver_msg' 4744 */ 4745 (void) vdc_is_supported_version(ver_msg); 4746 if (ver_msg->ver_major > 0) { 4747 size_t len = sizeof (*ver_msg); 4748 4749 ASSERT(vdc->ver.major > 0); 4750 4751 /* reset the necessary fields and resend */ 4752 ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO; 4753 ver_msg->dev_class = VDEV_DISK; 4754 4755 status = vdc_send(vdc, (caddr_t)ver_msg, &len); 4756 DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n", 4757 vdc->instance, status); 4758 if (len != sizeof (*ver_msg)) 4759 status = EBADMSG; 4760 } else { 4761 DMSG(vdc, 0, "[%d] No common version with vDisk server", 4762 vdc->instance); 4763 status = ENOTSUP; 4764 } 4765 4766 break; 4767 case VIO_SUBTYPE_INFO: 4768 /* 4769 * Handle the case where vds starts handshake 4770 * (for now only vdc is the instigator) 4771 */ 4772 status = ENOTSUP; 4773 break; 4774 4775 default: 4776 status = EINVAL; 4777 break; 4778 } 4779 4780 return (status); 4781 } 4782 4783 /* 4784 * Function: 4785 * vdc_handle_attr_msg() 4786 * 4787 * Description: 4788 * 4789 * Arguments: 4790 * vdc - soft state pointer for this instance of the device driver. 4791 * attr_msg - LDC message sent by vDisk server 4792 * 4793 * Return Code: 4794 * 0 - Success 4795 */ 4796 static int 4797 vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg) 4798 { 4799 int status = 0; 4800 4801 ASSERT(vdc != NULL); 4802 ASSERT(mutex_owned(&vdc->lock)); 4803 4804 if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) { 4805 return (EPROTO); 4806 } 4807 4808 switch (attr_msg->tag.vio_subtype) { 4809 case VIO_SUBTYPE_ACK: 4810 /* 4811 * We now verify the attributes sent by vds. 4812 */ 4813 if (attr_msg->vdisk_size == 0) { 4814 DMSG(vdc, 0, "[%d] Invalid disk size from vds", 4815 vdc->instance); 4816 status = EINVAL; 4817 break; 4818 } 4819 4820 if (attr_msg->max_xfer_sz == 0) { 4821 DMSG(vdc, 0, "[%d] Invalid transfer size from vds", 4822 vdc->instance); 4823 status = EINVAL; 4824 break; 4825 } 4826 4827 if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) { 4828 DMSG(vdc, 0, "[%d] Unknown disk size from vds", 4829 vdc->instance); 4830 attr_msg->vdisk_size = 0; 4831 } 4832 /* update disk, block and transfer sizes */ 4833 vdc_update_size(vdc, attr_msg->vdisk_size, 4834 attr_msg->vdisk_block_size, attr_msg->max_xfer_sz); 4835 vdc->vdisk_type = attr_msg->vdisk_type; 4836 vdc->operations = attr_msg->operations; 4837 if (vio_ver_is_supported(vdc->ver, 1, 1)) 4838 vdc->vdisk_media = attr_msg->vdisk_media; 4839 else 4840 vdc->vdisk_media = 0; 4841 4842 DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n", 4843 vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz); 4844 DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n", 4845 vdc->instance, vdc->block_size, 4846 attr_msg->vdisk_block_size); 4847 4848 if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) || 4849 (attr_msg->vdisk_size > INT64_MAX) || 4850 (attr_msg->operations == 0) || 4851 (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) { 4852 DMSG(vdc, 0, "[%d] Invalid attributes from vds", 4853 vdc->instance); 4854 status = EINVAL; 4855 break; 4856 } 4857 4858 /* 4859 * Now that we have received all attributes we can create a 4860 * fake geometry for the disk. 4861 */ 4862 vdc_create_fake_geometry(vdc); 4863 break; 4864 4865 case VIO_SUBTYPE_NACK: 4866 /* 4867 * vds could not handle the attributes we sent so we 4868 * stop negotiating. 4869 */ 4870 status = EPROTO; 4871 break; 4872 4873 case VIO_SUBTYPE_INFO: 4874 /* 4875 * Handle the case where vds starts the handshake 4876 * (for now; vdc is the only supported instigatior) 4877 */ 4878 status = ENOTSUP; 4879 break; 4880 4881 default: 4882 status = ENOTSUP; 4883 break; 4884 } 4885 4886 return (status); 4887 } 4888 4889 /* 4890 * Function: 4891 * vdc_handle_dring_reg_msg() 4892 * 4893 * Description: 4894 * 4895 * Arguments: 4896 * vdc - soft state pointer for this instance of the driver. 4897 * dring_msg - LDC message sent by vDisk server 4898 * 4899 * Return Code: 4900 * 0 - Success 4901 */ 4902 static int 4903 vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg) 4904 { 4905 int status = 0; 4906 4907 ASSERT(vdc != NULL); 4908 ASSERT(mutex_owned(&vdc->lock)); 4909 4910 if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) { 4911 return (EPROTO); 4912 } 4913 4914 switch (dring_msg->tag.vio_subtype) { 4915 case VIO_SUBTYPE_ACK: 4916 /* save the received dring_ident */ 4917 vdc->dring_ident = dring_msg->dring_ident; 4918 DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n", 4919 vdc->instance, vdc->dring_ident); 4920 break; 4921 4922 case VIO_SUBTYPE_NACK: 4923 /* 4924 * vds could not handle the DRing info we sent so we 4925 * stop negotiating. 4926 */ 4927 DMSG(vdc, 0, "[%d] server could not register DRing\n", 4928 vdc->instance); 4929 status = EPROTO; 4930 break; 4931 4932 case VIO_SUBTYPE_INFO: 4933 /* 4934 * Handle the case where vds starts handshake 4935 * (for now only vdc is the instigatior) 4936 */ 4937 status = ENOTSUP; 4938 break; 4939 default: 4940 status = ENOTSUP; 4941 } 4942 4943 return (status); 4944 } 4945 4946 /* 4947 * Function: 4948 * vdc_verify_seq_num() 4949 * 4950 * Description: 4951 * This functions verifies that the sequence number sent back by the vDisk 4952 * server with the latest message is what is expected (i.e. it is greater 4953 * than the last seq num sent by the vDisk server and less than or equal 4954 * to the last seq num generated by vdc). 4955 * 4956 * It then checks the request ID to see if any requests need processing 4957 * in the DRing. 4958 * 4959 * Arguments: 4960 * vdc - soft state pointer for this instance of the driver. 4961 * dring_msg - pointer to the LDC message sent by vds 4962 * 4963 * Return Code: 4964 * VDC_SEQ_NUM_TODO - Message needs to be processed 4965 * VDC_SEQ_NUM_SKIP - Message has already been processed 4966 * VDC_SEQ_NUM_INVALID - The seq numbers are so out of sync, 4967 * vdc cannot deal with them 4968 */ 4969 static int 4970 vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg) 4971 { 4972 ASSERT(vdc != NULL); 4973 ASSERT(dring_msg != NULL); 4974 ASSERT(mutex_owned(&vdc->lock)); 4975 4976 /* 4977 * Check to see if the messages were responded to in the correct 4978 * order by vds. 4979 */ 4980 if ((dring_msg->seq_num <= vdc->seq_num_reply) || 4981 (dring_msg->seq_num > vdc->seq_num)) { 4982 DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: " 4983 "%lu > expected <= %lu (last proc req %lu sent %lu)\n", 4984 vdc->instance, dring_msg->seq_num, 4985 vdc->seq_num_reply, vdc->seq_num, 4986 vdc->req_id_proc, vdc->req_id); 4987 return (VDC_SEQ_NUM_INVALID); 4988 } 4989 vdc->seq_num_reply = dring_msg->seq_num; 4990 4991 if (vdc->req_id_proc < vdc->req_id) 4992 return (VDC_SEQ_NUM_TODO); 4993 else 4994 return (VDC_SEQ_NUM_SKIP); 4995 } 4996 4997 4998 /* 4999 * Function: 5000 * vdc_is_supported_version() 5001 * 5002 * Description: 5003 * This routine checks if the major/minor version numbers specified in 5004 * 'ver_msg' are supported. If not it finds the next version that is 5005 * in the supported version list 'vdc_version[]' and sets the fields in 5006 * 'ver_msg' to those values 5007 * 5008 * Arguments: 5009 * ver_msg - LDC message sent by vDisk server 5010 * 5011 * Return Code: 5012 * B_TRUE - Success 5013 * B_FALSE - Version not supported 5014 */ 5015 static boolean_t 5016 vdc_is_supported_version(vio_ver_msg_t *ver_msg) 5017 { 5018 int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]); 5019 5020 for (int i = 0; i < vdc_num_versions; i++) { 5021 ASSERT(vdc_version[i].major > 0); 5022 ASSERT((i == 0) || 5023 (vdc_version[i].major < vdc_version[i-1].major)); 5024 5025 /* 5026 * If the major versions match, adjust the minor version, if 5027 * necessary, down to the highest value supported by this 5028 * client. The server should support all minor versions lower 5029 * than the value it sent 5030 */ 5031 if (ver_msg->ver_major == vdc_version[i].major) { 5032 if (ver_msg->ver_minor > vdc_version[i].minor) { 5033 DMSGX(0, 5034 "Adjusting minor version from %u to %u", 5035 ver_msg->ver_minor, vdc_version[i].minor); 5036 ver_msg->ver_minor = vdc_version[i].minor; 5037 } 5038 return (B_TRUE); 5039 } 5040 5041 /* 5042 * If the message contains a higher major version number, set 5043 * the message's major/minor versions to the current values 5044 * and return false, so this message will get resent with 5045 * these values, and the server will potentially try again 5046 * with the same or a lower version 5047 */ 5048 if (ver_msg->ver_major > vdc_version[i].major) { 5049 ver_msg->ver_major = vdc_version[i].major; 5050 ver_msg->ver_minor = vdc_version[i].minor; 5051 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n", 5052 ver_msg->ver_major, ver_msg->ver_minor); 5053 5054 return (B_FALSE); 5055 } 5056 5057 /* 5058 * Otherwise, the message's major version is less than the 5059 * current major version, so continue the loop to the next 5060 * (lower) supported version 5061 */ 5062 } 5063 5064 /* 5065 * No common version was found; "ground" the version pair in the 5066 * message to terminate negotiation 5067 */ 5068 ver_msg->ver_major = 0; 5069 ver_msg->ver_minor = 0; 5070 5071 return (B_FALSE); 5072 } 5073 /* -------------------------------------------------------------------------- */ 5074 5075 /* 5076 * DKIO(7) support 5077 */ 5078 5079 typedef struct vdc_dk_arg { 5080 struct dk_callback dkc; 5081 int mode; 5082 dev_t dev; 5083 vdc_t *vdc; 5084 } vdc_dk_arg_t; 5085 5086 /* 5087 * Function: 5088 * vdc_dkio_flush_cb() 5089 * 5090 * Description: 5091 * This routine is a callback for DKIOCFLUSHWRITECACHE which can be called 5092 * by kernel code. 5093 * 5094 * Arguments: 5095 * arg - a pointer to a vdc_dk_arg_t structure. 5096 */ 5097 void 5098 vdc_dkio_flush_cb(void *arg) 5099 { 5100 struct vdc_dk_arg *dk_arg = (struct vdc_dk_arg *)arg; 5101 struct dk_callback *dkc = NULL; 5102 vdc_t *vdc = NULL; 5103 int rv; 5104 5105 if (dk_arg == NULL) { 5106 cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n"); 5107 return; 5108 } 5109 dkc = &dk_arg->dkc; 5110 vdc = dk_arg->vdc; 5111 ASSERT(vdc != NULL); 5112 5113 rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, 5114 VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 5115 if (rv != 0) { 5116 DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n", 5117 vdc->instance, rv, 5118 ddi_model_convert_from(dk_arg->mode & FMODELS)); 5119 } 5120 5121 /* 5122 * Trigger the call back to notify the caller the the ioctl call has 5123 * been completed. 5124 */ 5125 if ((dk_arg->mode & FKIOCTL) && 5126 (dkc != NULL) && 5127 (dkc->dkc_callback != NULL)) { 5128 ASSERT(dkc->dkc_cookie != NULL); 5129 (*dkc->dkc_callback)(dkc->dkc_cookie, rv); 5130 } 5131 5132 /* Indicate that one less DKIO write flush is outstanding */ 5133 mutex_enter(&vdc->lock); 5134 vdc->dkio_flush_pending--; 5135 ASSERT(vdc->dkio_flush_pending >= 0); 5136 mutex_exit(&vdc->lock); 5137 5138 /* free the mem that was allocated when the callback was dispatched */ 5139 kmem_free(arg, sizeof (vdc_dk_arg_t)); 5140 } 5141 5142 /* 5143 * Function: 5144 * vdc_dkio_gapart() 5145 * 5146 * Description: 5147 * This function implements the DKIOCGAPART ioctl. 5148 * 5149 * Arguments: 5150 * vdc - soft state pointer 5151 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure 5152 * flag - ioctl flags 5153 */ 5154 static int 5155 vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag) 5156 { 5157 struct dk_geom *geom; 5158 struct extvtoc *vtoc; 5159 union { 5160 struct dk_map map[NDKMAP]; 5161 struct dk_map32 map32[NDKMAP]; 5162 } data; 5163 int i, rv, size; 5164 5165 mutex_enter(&vdc->lock); 5166 5167 if ((rv = vdc_validate_geometry(vdc)) != 0) { 5168 mutex_exit(&vdc->lock); 5169 return (rv); 5170 } 5171 5172 if (vdc->vdisk_size > VD_OLDVTOC_LIMIT) { 5173 mutex_exit(&vdc->lock); 5174 return (EOVERFLOW); 5175 } 5176 5177 vtoc = vdc->vtoc; 5178 geom = vdc->geom; 5179 5180 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5181 5182 for (i = 0; i < vtoc->v_nparts; i++) { 5183 data.map32[i].dkl_cylno = vtoc->v_part[i].p_start / 5184 (geom->dkg_nhead * geom->dkg_nsect); 5185 data.map32[i].dkl_nblk = vtoc->v_part[i].p_size; 5186 } 5187 size = NDKMAP * sizeof (struct dk_map32); 5188 5189 } else { 5190 5191 for (i = 0; i < vtoc->v_nparts; i++) { 5192 data.map[i].dkl_cylno = vtoc->v_part[i].p_start / 5193 (geom->dkg_nhead * geom->dkg_nsect); 5194 data.map[i].dkl_nblk = vtoc->v_part[i].p_size; 5195 } 5196 size = NDKMAP * sizeof (struct dk_map); 5197 5198 } 5199 5200 mutex_exit(&vdc->lock); 5201 5202 if (ddi_copyout(&data, arg, size, flag) != 0) 5203 return (EFAULT); 5204 5205 return (0); 5206 } 5207 5208 /* 5209 * Function: 5210 * vdc_dkio_partition() 5211 * 5212 * Description: 5213 * This function implements the DKIOCPARTITION ioctl. 5214 * 5215 * Arguments: 5216 * vdc - soft state pointer 5217 * arg - a pointer to a struct partition64 structure 5218 * flag - ioctl flags 5219 */ 5220 static int 5221 vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag) 5222 { 5223 struct partition64 p64; 5224 efi_gpt_t *gpt; 5225 efi_gpe_t *gpe; 5226 vd_efi_dev_t edev; 5227 uint_t partno; 5228 int rv; 5229 5230 if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) { 5231 return (EFAULT); 5232 } 5233 5234 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 5235 5236 if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) { 5237 return (rv); 5238 } 5239 5240 partno = p64.p_partno; 5241 5242 if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) { 5243 vd_efi_free(&edev, gpt, gpe); 5244 return (ESRCH); 5245 } 5246 5247 bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type, 5248 sizeof (struct uuid)); 5249 p64.p_start = gpe[partno].efi_gpe_StartingLBA; 5250 p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1; 5251 5252 if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) { 5253 vd_efi_free(&edev, gpt, gpe); 5254 return (EFAULT); 5255 } 5256 5257 vd_efi_free(&edev, gpt, gpe); 5258 return (0); 5259 } 5260 5261 /* 5262 * Function: 5263 * vdc_dioctl_rwcmd() 5264 * 5265 * Description: 5266 * This function implements the DIOCTL_RWCMD ioctl. This ioctl is used 5267 * for DKC_DIRECT disks to read or write at an absolute disk offset. 5268 * 5269 * Arguments: 5270 * dev - device 5271 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure 5272 * flag - ioctl flags 5273 */ 5274 static int 5275 vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag) 5276 { 5277 struct dadkio_rwcmd32 rwcmd32; 5278 struct dadkio_rwcmd rwcmd; 5279 struct iovec aiov; 5280 struct uio auio; 5281 int rw, status; 5282 struct buf *buf; 5283 5284 if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) { 5285 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32, 5286 sizeof (struct dadkio_rwcmd32), flag)) { 5287 return (EFAULT); 5288 } 5289 rwcmd.cmd = rwcmd32.cmd; 5290 rwcmd.flags = rwcmd32.flags; 5291 rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr; 5292 rwcmd.buflen = rwcmd32.buflen; 5293 rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr; 5294 } else { 5295 if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd, 5296 sizeof (struct dadkio_rwcmd), flag)) { 5297 return (EFAULT); 5298 } 5299 } 5300 5301 switch (rwcmd.cmd) { 5302 case DADKIO_RWCMD_READ: 5303 rw = B_READ; 5304 break; 5305 case DADKIO_RWCMD_WRITE: 5306 rw = B_WRITE; 5307 break; 5308 default: 5309 return (EINVAL); 5310 } 5311 5312 bzero((caddr_t)&aiov, sizeof (struct iovec)); 5313 aiov.iov_base = rwcmd.bufaddr; 5314 aiov.iov_len = rwcmd.buflen; 5315 5316 bzero((caddr_t)&auio, sizeof (struct uio)); 5317 auio.uio_iov = &aiov; 5318 auio.uio_iovcnt = 1; 5319 auio.uio_loffset = rwcmd.blkaddr * DEV_BSIZE; 5320 auio.uio_resid = rwcmd.buflen; 5321 auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE; 5322 5323 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 5324 bioinit(buf); 5325 /* 5326 * We use the private field of buf to specify that this is an 5327 * I/O using an absolute offset. 5328 */ 5329 buf->b_private = (void *)VD_SLICE_NONE; 5330 5331 status = physio(vdc_strategy, buf, dev, rw, vdc_min, &auio); 5332 5333 biofini(buf); 5334 kmem_free(buf, sizeof (buf_t)); 5335 5336 return (status); 5337 } 5338 5339 /* 5340 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated 5341 * buffer is returned in alloc_len. 5342 */ 5343 static vd_scsi_t * 5344 vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, 5345 int *alloc_len) 5346 { 5347 vd_scsi_t *vd_scsi; 5348 int vd_scsi_len = VD_SCSI_SIZE; 5349 5350 vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); 5351 vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); 5352 vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); 5353 vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); 5354 5355 ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); 5356 5357 vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); 5358 5359 vd_scsi->cdb_len = cdb_len; 5360 vd_scsi->sense_len = sense_len; 5361 vd_scsi->datain_len = datain_len; 5362 vd_scsi->dataout_len = dataout_len; 5363 5364 *alloc_len = vd_scsi_len; 5365 5366 return (vd_scsi); 5367 } 5368 5369 /* 5370 * Convert the status of a SCSI command to a Solaris return code. 5371 * 5372 * Arguments: 5373 * vd_scsi - The SCSI operation buffer. 5374 * log_error - indicate if an error message should be logged. 5375 * 5376 * Note that our SCSI error messages are rather primitive for the moment 5377 * and could be improved by decoding some data like the SCSI command and 5378 * the sense key. 5379 * 5380 * Return value: 5381 * 0 - Status is good. 5382 * EACCES - Status reports a reservation conflict. 5383 * ENOTSUP - Status reports a check condition and sense key 5384 * reports an illegal request. 5385 * EIO - Any other status. 5386 */ 5387 static int 5388 vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error) 5389 { 5390 int rv; 5391 char path_str[MAXPATHLEN]; 5392 char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN]; 5393 union scsi_cdb *cdb; 5394 struct scsi_extended_sense *sense; 5395 5396 if (vd_scsi->cmd_status == STATUS_GOOD) 5397 /* no error */ 5398 return (0); 5399 5400 /* when the tunable vdc_scsi_log_error is true we log all errors */ 5401 if (vdc_scsi_log_error) 5402 log_error = B_TRUE; 5403 5404 if (log_error) { 5405 cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x)\n", 5406 ddi_pathname(vdc->dip, path_str), vdc->instance, 5407 GETCMD(VD_SCSI_DATA_CDB(vd_scsi))); 5408 } 5409 5410 /* default returned value */ 5411 rv = EIO; 5412 5413 switch (vd_scsi->cmd_status) { 5414 5415 case STATUS_CHECK: 5416 case STATUS_TERMINATED: 5417 if (log_error) 5418 cmn_err(CE_CONT, "\tCheck Condition Error\n"); 5419 5420 /* check sense buffer */ 5421 if (vd_scsi->sense_len == 0 || 5422 vd_scsi->sense_status != STATUS_GOOD) { 5423 if (log_error) 5424 cmn_err(CE_CONT, "\tNo Sense Data Available\n"); 5425 break; 5426 } 5427 5428 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5429 5430 if (log_error) { 5431 cmn_err(CE_CONT, "\tSense Key: 0x%x\n" 5432 "\tASC: 0x%x, ASCQ: 0x%x\n", 5433 scsi_sense_key((uint8_t *)sense), 5434 scsi_sense_asc((uint8_t *)sense), 5435 scsi_sense_ascq((uint8_t *)sense)); 5436 } 5437 5438 if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST) 5439 rv = ENOTSUP; 5440 break; 5441 5442 case STATUS_BUSY: 5443 if (log_error) 5444 cmn_err(CE_NOTE, "\tDevice Busy\n"); 5445 break; 5446 5447 case STATUS_RESERVATION_CONFLICT: 5448 /* 5449 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then 5450 * reservation conflict could be due to various reasons like 5451 * incorrect keys, not registered or not reserved etc. So, 5452 * we should not panic in that case. 5453 */ 5454 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5455 if (vdc->failfast_interval != 0 && 5456 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN && 5457 cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) { 5458 /* failfast is enabled so we have to panic */ 5459 (void) snprintf(panic_str, sizeof (panic_str), 5460 VDC_RESV_CONFLICT_FMT_STR "%s", 5461 ddi_pathname(vdc->dip, path_str)); 5462 panic(panic_str); 5463 } 5464 if (log_error) 5465 cmn_err(CE_NOTE, "\tReservation Conflict\n"); 5466 rv = EACCES; 5467 break; 5468 5469 case STATUS_QFULL: 5470 if (log_error) 5471 cmn_err(CE_NOTE, "\tQueue Full\n"); 5472 break; 5473 5474 case STATUS_MET: 5475 case STATUS_INTERMEDIATE: 5476 case STATUS_SCSI2: 5477 case STATUS_INTERMEDIATE_MET: 5478 case STATUS_ACA_ACTIVE: 5479 if (log_error) 5480 cmn_err(CE_CONT, 5481 "\tUnexpected SCSI status received: 0x%x\n", 5482 vd_scsi->cmd_status); 5483 break; 5484 5485 default: 5486 if (log_error) 5487 cmn_err(CE_CONT, 5488 "\tInvalid SCSI status received: 0x%x\n", 5489 vd_scsi->cmd_status); 5490 break; 5491 } 5492 5493 return (rv); 5494 } 5495 5496 /* 5497 * Implemented the USCSICMD uscsi(7I) ioctl. This ioctl is converted to 5498 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI 5499 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is 5500 * converted to a VD_OP_RESET operation. 5501 */ 5502 static int 5503 vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) 5504 { 5505 struct uscsi_cmd uscsi; 5506 struct uscsi_cmd32 uscsi32; 5507 vd_scsi_t *vd_scsi; 5508 int vd_scsi_len; 5509 union scsi_cdb *cdb; 5510 struct scsi_extended_sense *sense; 5511 char *datain, *dataout; 5512 size_t cdb_len, datain_len, dataout_len, sense_len; 5513 int rv; 5514 5515 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5516 if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), 5517 mode) != 0) 5518 return (EFAULT); 5519 uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); 5520 } else { 5521 if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), 5522 mode) != 0) 5523 return (EFAULT); 5524 } 5525 5526 /* a uscsi reset is converted to a VD_OP_RESET operation */ 5527 if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | 5528 USCSI_RESET_ALL)) { 5529 rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, 5530 (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); 5531 return (rv); 5532 } 5533 5534 /* cdb buffer length */ 5535 cdb_len = uscsi.uscsi_cdblen; 5536 5537 /* data in and out buffers length */ 5538 if (uscsi.uscsi_flags & USCSI_READ) { 5539 datain_len = uscsi.uscsi_buflen; 5540 dataout_len = 0; 5541 } else { 5542 datain_len = 0; 5543 dataout_len = uscsi.uscsi_buflen; 5544 } 5545 5546 /* sense buffer length */ 5547 if (uscsi.uscsi_flags & USCSI_RQENABLE) 5548 sense_len = uscsi.uscsi_rqlen; 5549 else 5550 sense_len = 0; 5551 5552 /* allocate buffer for the VD_SCSICMD_OP operation */ 5553 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5554 &vd_scsi_len); 5555 5556 /* 5557 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, 5558 * but basically they prevent a SCSI command from being retried in case 5559 * of an error. 5560 */ 5561 if ((uscsi.uscsi_flags & USCSI_ISOLATE) || 5562 (uscsi.uscsi_flags & USCSI_DIAGNOSE)) 5563 vd_scsi->options |= VD_SCSI_OPT_NORETRY; 5564 5565 /* set task attribute */ 5566 if (uscsi.uscsi_flags & USCSI_NOTAG) { 5567 vd_scsi->task_attribute = 0; 5568 } else { 5569 if (uscsi.uscsi_flags & USCSI_HEAD) 5570 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 5571 else if (uscsi.uscsi_flags & USCSI_HTAG) 5572 vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; 5573 else if (uscsi.uscsi_flags & USCSI_OTAG) 5574 vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; 5575 else 5576 vd_scsi->task_attribute = 0; 5577 } 5578 5579 /* set timeout */ 5580 vd_scsi->timeout = uscsi.uscsi_timeout; 5581 5582 /* copy-in cdb data */ 5583 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5584 if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { 5585 rv = EFAULT; 5586 goto done; 5587 } 5588 5589 /* keep a pointer to the sense buffer */ 5590 sense = VD_SCSI_DATA_SENSE(vd_scsi); 5591 5592 /* keep a pointer to the data-in buffer */ 5593 datain = (char *)VD_SCSI_DATA_IN(vd_scsi); 5594 5595 /* copy-in request data to the data-out buffer */ 5596 dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); 5597 if (!(uscsi.uscsi_flags & USCSI_READ)) { 5598 if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, 5599 mode)) { 5600 rv = EFAULT; 5601 goto done; 5602 } 5603 } 5604 5605 /* submit the request */ 5606 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5607 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5608 5609 if (rv != 0) 5610 goto done; 5611 5612 /* update scsi status */ 5613 uscsi.uscsi_status = vd_scsi->cmd_status; 5614 5615 /* update sense data */ 5616 if ((uscsi.uscsi_flags & USCSI_RQENABLE) && 5617 (uscsi.uscsi_status == STATUS_CHECK || 5618 uscsi.uscsi_status == STATUS_TERMINATED)) { 5619 5620 uscsi.uscsi_rqstatus = vd_scsi->sense_status; 5621 5622 if (uscsi.uscsi_rqstatus == STATUS_GOOD) { 5623 uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - 5624 vd_scsi->sense_len; 5625 if (ddi_copyout(sense, uscsi.uscsi_rqbuf, 5626 vd_scsi->sense_len, mode) != 0) { 5627 rv = EFAULT; 5628 goto done; 5629 } 5630 } 5631 } 5632 5633 /* update request data */ 5634 if (uscsi.uscsi_status == STATUS_GOOD) { 5635 if (uscsi.uscsi_flags & USCSI_READ) { 5636 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5637 vd_scsi->datain_len; 5638 if (ddi_copyout(datain, uscsi.uscsi_bufaddr, 5639 vd_scsi->datain_len, mode) != 0) { 5640 rv = EFAULT; 5641 goto done; 5642 } 5643 } else { 5644 uscsi.uscsi_resid = uscsi.uscsi_buflen - 5645 vd_scsi->dataout_len; 5646 } 5647 } 5648 5649 /* copy-out result */ 5650 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5651 uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); 5652 if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), 5653 mode) != 0) { 5654 rv = EFAULT; 5655 goto done; 5656 } 5657 } else { 5658 if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), 5659 mode) != 0) { 5660 rv = EFAULT; 5661 goto done; 5662 } 5663 } 5664 5665 /* get the return code from the SCSI command status */ 5666 rv = vdc_scsi_status(vdc, vd_scsi, 5667 !(uscsi.uscsi_flags & USCSI_SILENT)); 5668 5669 done: 5670 kmem_free(vd_scsi, vd_scsi_len); 5671 return (rv); 5672 } 5673 5674 /* 5675 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. 5676 * 5677 * Arguments: 5678 * cmd - SCSI PERSISTENT IN command 5679 * len - length of the SCSI input buffer 5680 * vd_scsi_len - return the length of the allocated buffer 5681 * 5682 * Returned Value: 5683 * a pointer to the allocated VD_OP_SCSICMD buffer. 5684 */ 5685 static vd_scsi_t * 5686 vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len) 5687 { 5688 int cdb_len, sense_len, datain_len, dataout_len; 5689 vd_scsi_t *vd_scsi; 5690 union scsi_cdb *cdb; 5691 5692 cdb_len = CDB_GROUP1; 5693 sense_len = sizeof (struct scsi_extended_sense); 5694 datain_len = len; 5695 dataout_len = 0; 5696 5697 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5698 vd_scsi_len); 5699 5700 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5701 5702 /* set cdb */ 5703 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN; 5704 cdb->cdb_opaque[1] = cmd; 5705 FORMG1COUNT(cdb, datain_len); 5706 5707 vd_scsi->timeout = vdc_scsi_timeout; 5708 5709 return (vd_scsi); 5710 } 5711 5712 /* 5713 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command. 5714 * 5715 * Arguments: 5716 * cmd - SCSI PERSISTENT OUT command 5717 * len - length of the SCSI output buffer 5718 * vd_scsi_len - return the length of the allocated buffer 5719 * 5720 * Returned Code: 5721 * a pointer to the allocated VD_OP_SCSICMD buffer. 5722 */ 5723 static vd_scsi_t * 5724 vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len) 5725 { 5726 int cdb_len, sense_len, datain_len, dataout_len; 5727 vd_scsi_t *vd_scsi; 5728 union scsi_cdb *cdb; 5729 5730 cdb_len = CDB_GROUP1; 5731 sense_len = sizeof (struct scsi_extended_sense); 5732 datain_len = 0; 5733 dataout_len = len; 5734 5735 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, 5736 vd_scsi_len); 5737 5738 cdb = VD_SCSI_DATA_CDB(vd_scsi); 5739 5740 /* set cdb */ 5741 cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT; 5742 cdb->cdb_opaque[1] = cmd; 5743 FORMG1COUNT(cdb, dataout_len); 5744 5745 vd_scsi->timeout = vdc_scsi_timeout; 5746 5747 return (vd_scsi); 5748 } 5749 5750 /* 5751 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted 5752 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk 5753 * server with a VD_OP_SCSICMD operation. 5754 */ 5755 static int 5756 vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) 5757 { 5758 vd_scsi_t *vd_scsi; 5759 mhioc_inkeys_t inkeys; 5760 mhioc_key_list_t klist; 5761 struct mhioc_inkeys32 inkeys32; 5762 struct mhioc_key_list32 klist32; 5763 sd_prin_readkeys_t *scsi_keys; 5764 void *user_keys; 5765 int vd_scsi_len; 5766 int listsize, listlen, rv; 5767 5768 /* copyin arguments */ 5769 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5770 rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); 5771 if (rv != 0) 5772 return (EFAULT); 5773 5774 rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, 5775 sizeof (klist32), mode); 5776 if (rv != 0) 5777 return (EFAULT); 5778 5779 listsize = klist32.listsize; 5780 } else { 5781 rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); 5782 if (rv != 0) 5783 return (EFAULT); 5784 5785 rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); 5786 if (rv != 0) 5787 return (EFAULT); 5788 5789 listsize = klist.listsize; 5790 } 5791 5792 /* build SCSI VD_OP request */ 5793 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, 5794 sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + 5795 (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); 5796 5797 scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); 5798 5799 /* submit the request */ 5800 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5801 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5802 5803 if (rv != 0) 5804 goto done; 5805 5806 listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; 5807 5808 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5809 inkeys32.generation = scsi_keys->generation; 5810 rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); 5811 if (rv != 0) { 5812 rv = EFAULT; 5813 goto done; 5814 } 5815 5816 klist32.listlen = listlen; 5817 rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, 5818 sizeof (klist32), mode); 5819 if (rv != 0) { 5820 rv = EFAULT; 5821 goto done; 5822 } 5823 5824 user_keys = (caddr_t)(uintptr_t)klist32.list; 5825 } else { 5826 inkeys.generation = scsi_keys->generation; 5827 rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); 5828 if (rv != 0) { 5829 rv = EFAULT; 5830 goto done; 5831 } 5832 5833 klist.listlen = listlen; 5834 rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); 5835 if (rv != 0) { 5836 rv = EFAULT; 5837 goto done; 5838 } 5839 5840 user_keys = klist.list; 5841 } 5842 5843 /* copy out keys */ 5844 if (listlen > 0 && listsize > 0) { 5845 if (listsize < listlen) 5846 listlen = listsize; 5847 rv = ddi_copyout(&scsi_keys->keylist, user_keys, 5848 listlen * MHIOC_RESV_KEY_SIZE, mode); 5849 if (rv != 0) 5850 rv = EFAULT; 5851 } 5852 5853 if (rv == 0) 5854 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5855 5856 done: 5857 kmem_free(vd_scsi, vd_scsi_len); 5858 5859 return (rv); 5860 } 5861 5862 /* 5863 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted 5864 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to 5865 * the vdisk server with a VD_OP_SCSICMD operation. 5866 */ 5867 static int 5868 vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) 5869 { 5870 vd_scsi_t *vd_scsi; 5871 mhioc_inresvs_t inresv; 5872 mhioc_resv_desc_list_t rlist; 5873 struct mhioc_inresvs32 inresv32; 5874 struct mhioc_resv_desc_list32 rlist32; 5875 mhioc_resv_desc_t mhd_resv; 5876 sd_prin_readresv_t *scsi_resv; 5877 sd_readresv_desc_t *resv; 5878 mhioc_resv_desc_t *user_resv; 5879 int vd_scsi_len; 5880 int listsize, listlen, i, rv; 5881 5882 /* copyin arguments */ 5883 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5884 rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); 5885 if (rv != 0) 5886 return (EFAULT); 5887 5888 rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, 5889 sizeof (rlist32), mode); 5890 if (rv != 0) 5891 return (EFAULT); 5892 5893 listsize = rlist32.listsize; 5894 } else { 5895 rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); 5896 if (rv != 0) 5897 return (EFAULT); 5898 5899 rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); 5900 if (rv != 0) 5901 return (EFAULT); 5902 5903 listsize = rlist.listsize; 5904 } 5905 5906 /* build SCSI VD_OP request */ 5907 vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, 5908 sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + 5909 (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); 5910 5911 scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); 5912 5913 /* submit the request */ 5914 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 5915 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 5916 5917 if (rv != 0) 5918 goto done; 5919 5920 listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; 5921 5922 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 5923 inresv32.generation = scsi_resv->generation; 5924 rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); 5925 if (rv != 0) { 5926 rv = EFAULT; 5927 goto done; 5928 } 5929 5930 rlist32.listlen = listlen; 5931 rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, 5932 sizeof (rlist32), mode); 5933 if (rv != 0) { 5934 rv = EFAULT; 5935 goto done; 5936 } 5937 5938 user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; 5939 } else { 5940 inresv.generation = scsi_resv->generation; 5941 rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); 5942 if (rv != 0) { 5943 rv = EFAULT; 5944 goto done; 5945 } 5946 5947 rlist.listlen = listlen; 5948 rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); 5949 if (rv != 0) { 5950 rv = EFAULT; 5951 goto done; 5952 } 5953 5954 user_resv = rlist.list; 5955 } 5956 5957 /* copy out reservations */ 5958 if (listsize > 0 && listlen > 0) { 5959 if (listsize < listlen) 5960 listlen = listsize; 5961 resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; 5962 5963 for (i = 0; i < listlen; i++) { 5964 mhd_resv.type = resv->type; 5965 mhd_resv.scope = resv->scope; 5966 mhd_resv.scope_specific_addr = 5967 BE_32(resv->scope_specific_addr); 5968 bcopy(&resv->resvkey, &mhd_resv.key, 5969 MHIOC_RESV_KEY_SIZE); 5970 5971 rv = ddi_copyout(&mhd_resv, user_resv, 5972 sizeof (mhd_resv), mode); 5973 if (rv != 0) { 5974 rv = EFAULT; 5975 goto done; 5976 } 5977 resv++; 5978 user_resv++; 5979 } 5980 } 5981 5982 if (rv == 0) 5983 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 5984 5985 done: 5986 kmem_free(vd_scsi, vd_scsi_len); 5987 return (rv); 5988 } 5989 5990 /* 5991 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted 5992 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk 5993 * server with a VD_OP_SCSICMD operation. 5994 */ 5995 static int 5996 vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) 5997 { 5998 vd_scsi_t *vd_scsi; 5999 sd_prout_t *scsi_prout; 6000 mhioc_register_t mhd_reg; 6001 int vd_scsi_len, rv; 6002 6003 /* copyin arguments */ 6004 rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); 6005 if (rv != 0) 6006 return (EFAULT); 6007 6008 /* build SCSI VD_OP request */ 6009 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, 6010 sizeof (sd_prout_t), &vd_scsi_len); 6011 6012 /* set parameters */ 6013 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6014 bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6015 bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); 6016 scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; 6017 6018 /* submit the request */ 6019 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6020 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6021 6022 if (rv == 0) 6023 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6024 6025 kmem_free(vd_scsi, vd_scsi_len); 6026 return (rv); 6027 } 6028 6029 /* 6030 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted 6031 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk 6032 * server with a VD_OP_SCSICMD operation. 6033 */ 6034 static int 6035 vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) 6036 { 6037 union scsi_cdb *cdb; 6038 vd_scsi_t *vd_scsi; 6039 sd_prout_t *scsi_prout; 6040 mhioc_resv_desc_t mhd_resv; 6041 int vd_scsi_len, rv; 6042 6043 /* copyin arguments */ 6044 rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); 6045 if (rv != 0) 6046 return (EFAULT); 6047 6048 /* build SCSI VD_OP request */ 6049 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, 6050 sizeof (sd_prout_t), &vd_scsi_len); 6051 6052 /* set parameters */ 6053 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6054 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6055 bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); 6056 scsi_prout->scope_address = mhd_resv.scope_specific_addr; 6057 cdb->cdb_opaque[2] = mhd_resv.type; 6058 6059 /* submit the request */ 6060 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6061 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6062 6063 if (rv == 0) 6064 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6065 6066 kmem_free(vd_scsi, vd_scsi_len); 6067 return (rv); 6068 } 6069 6070 /* 6071 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is 6072 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which 6073 * is sent to the vdisk server with a VD_OP_SCSICMD operation. 6074 */ 6075 static int 6076 vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode) 6077 { 6078 union scsi_cdb *cdb; 6079 vd_scsi_t *vd_scsi; 6080 sd_prout_t *scsi_prout; 6081 mhioc_preemptandabort_t mhd_preempt; 6082 int vd_scsi_len, rv; 6083 6084 /* copyin arguments */ 6085 rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode); 6086 if (rv != 0) 6087 return (EFAULT); 6088 6089 /* build SCSI VD_OP request */ 6090 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT, 6091 sizeof (sd_prout_t), &vd_scsi_len); 6092 6093 /* set parameters */ 6094 vd_scsi->task_attribute = VD_SCSI_TASK_ACA; 6095 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6096 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6097 bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key, 6098 MHIOC_RESV_KEY_SIZE); 6099 bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key, 6100 MHIOC_RESV_KEY_SIZE); 6101 scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr; 6102 cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type; 6103 6104 /* submit the request */ 6105 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6106 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6107 6108 if (rv == 0) 6109 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6110 6111 kmem_free(vd_scsi, vd_scsi_len); 6112 return (rv); 6113 } 6114 6115 /* 6116 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl 6117 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY 6118 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation. 6119 */ 6120 static int 6121 vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode) 6122 { 6123 vd_scsi_t *vd_scsi; 6124 sd_prout_t *scsi_prout; 6125 mhioc_registerandignorekey_t mhd_regi; 6126 int vd_scsi_len, rv; 6127 6128 /* copyin arguments */ 6129 rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode); 6130 if (rv != 0) 6131 return (EFAULT); 6132 6133 /* build SCSI VD_OP request */ 6134 vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY, 6135 sizeof (sd_prout_t), &vd_scsi_len); 6136 6137 /* set parameters */ 6138 scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); 6139 bcopy(mhd_regi.newkey.key, scsi_prout->service_key, 6140 MHIOC_RESV_KEY_SIZE); 6141 scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl; 6142 6143 /* submit the request */ 6144 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6145 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); 6146 6147 if (rv == 0) 6148 rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6149 6150 kmem_free(vd_scsi, vd_scsi_len); 6151 return (rv); 6152 } 6153 6154 /* 6155 * This function is used by the failfast mechanism to send a SCSI command 6156 * to check for reservation conflict. 6157 */ 6158 static int 6159 vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) 6160 { 6161 int cdb_len, sense_len, vd_scsi_len; 6162 vd_scsi_t *vd_scsi; 6163 union scsi_cdb *cdb; 6164 int rv; 6165 6166 ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1); 6167 6168 if (scmd == SCMD_WRITE_G1) 6169 cdb_len = CDB_GROUP1; 6170 else 6171 cdb_len = CDB_GROUP0; 6172 6173 sense_len = sizeof (struct scsi_extended_sense); 6174 6175 vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len); 6176 6177 /* set cdb */ 6178 cdb = VD_SCSI_DATA_CDB(vd_scsi); 6179 cdb->scc_cmd = scmd; 6180 6181 vd_scsi->timeout = vdc_scsi_timeout; 6182 6183 /* 6184 * Submit the request. The last argument has to be B_FALSE so that 6185 * vdc_do_sync_op does not loop checking for reservation conflict if 6186 * the operation returns an error. 6187 */ 6188 rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, 6189 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE); 6190 6191 if (rv == 0) 6192 (void) vdc_scsi_status(vdc, vd_scsi, B_FALSE); 6193 6194 kmem_free(vd_scsi, vd_scsi_len); 6195 return (rv); 6196 } 6197 6198 /* 6199 * This function is used by the failfast mechanism to check for reservation 6200 * conflict. It sends some SCSI commands which will fail with a reservation 6201 * conflict error if the system does not have access to the disk and this 6202 * will panic the system. 6203 * 6204 * Returned Code: 6205 * 0 - disk is accessible without reservation conflict error 6206 * != 0 - unable to check if disk is accessible 6207 */ 6208 int 6209 vdc_failfast_check_resv(vdc_t *vdc) 6210 { 6211 int failure = 0; 6212 6213 /* 6214 * Send a TEST UNIT READY command. The command will panic 6215 * the system if it fails with a reservation conflict. 6216 */ 6217 if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0) 6218 failure++; 6219 6220 /* 6221 * With SPC-3 compliant devices TEST UNIT READY will succeed on 6222 * a reserved device, so we also do a WRITE(10) of zero byte in 6223 * order to provoke a Reservation Conflict status on those newer 6224 * devices. 6225 */ 6226 if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0) 6227 failure++; 6228 6229 return (failure); 6230 } 6231 6232 /* 6233 * Add a pending I/O to the failfast I/O queue. An I/O is added to this 6234 * queue when it has failed and failfast is enabled. Then we have to check 6235 * if it has failed because of a reservation conflict in which case we have 6236 * to panic the system. 6237 * 6238 * Async I/O should be queued with their block I/O data transfer structure 6239 * (buf). Sync I/O should be queued with buf = NULL. 6240 */ 6241 static vdc_io_t * 6242 vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf) 6243 { 6244 vdc_io_t *vio; 6245 6246 ASSERT(MUTEX_HELD(&vdc->lock)); 6247 6248 vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP); 6249 vio->vio_next = vdc->failfast_io_queue; 6250 vio->vio_buf = buf; 6251 vio->vio_qtime = ddi_get_lbolt(); 6252 6253 vdc->failfast_io_queue = vio; 6254 6255 /* notify the failfast thread that a new I/O is queued */ 6256 cv_signal(&vdc->failfast_cv); 6257 6258 return (vio); 6259 } 6260 6261 /* 6262 * Remove and complete I/O in the failfast I/O queue which have been 6263 * added after the indicated deadline. A deadline of 0 means that all 6264 * I/O have to be unqueued and marked as completed. 6265 */ 6266 static void 6267 vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline) 6268 { 6269 vdc_io_t *vio, *vio_tmp; 6270 6271 ASSERT(MUTEX_HELD(&vdc->lock)); 6272 6273 vio_tmp = NULL; 6274 vio = vdc->failfast_io_queue; 6275 6276 if (deadline != 0) { 6277 /* 6278 * Skip any io queued after the deadline. The failfast 6279 * I/O queue is ordered starting with the last I/O added 6280 * to the queue. 6281 */ 6282 while (vio != NULL && vio->vio_qtime > deadline) { 6283 vio_tmp = vio; 6284 vio = vio->vio_next; 6285 } 6286 } 6287 6288 if (vio == NULL) 6289 /* nothing to unqueue */ 6290 return; 6291 6292 /* update the queue */ 6293 if (vio_tmp == NULL) 6294 vdc->failfast_io_queue = NULL; 6295 else 6296 vio_tmp->vio_next = NULL; 6297 6298 /* 6299 * Complete unqueued I/O. Async I/O have a block I/O data transfer 6300 * structure (buf) and they are completed by calling biodone(). Sync 6301 * I/O do not have a buf and they are completed by setting the 6302 * vio_qtime to zero and signaling failfast_io_cv. In that case, the 6303 * thread waiting for the I/O to complete is responsible for freeing 6304 * the vio structure. 6305 */ 6306 while (vio != NULL) { 6307 vio_tmp = vio->vio_next; 6308 if (vio->vio_buf != NULL) { 6309 VD_KSTAT_RUNQ_EXIT(vdc); 6310 DTRACE_IO1(done, buf_t *, vio->vio_buf); 6311 biodone(vio->vio_buf); 6312 kmem_free(vio, sizeof (vdc_io_t)); 6313 } else { 6314 vio->vio_qtime = 0; 6315 } 6316 vio = vio_tmp; 6317 } 6318 6319 cv_broadcast(&vdc->failfast_io_cv); 6320 } 6321 6322 /* 6323 * Failfast Thread. 6324 * 6325 * While failfast is enabled, the failfast thread sends a TEST UNIT READY 6326 * and a zero size WRITE(10) SCSI commands on a regular basis to check that 6327 * we still have access to the disk. If a command fails with a RESERVATION 6328 * CONFLICT error then the system will immediatly panic. 6329 * 6330 * The failfast thread is also woken up when an I/O has failed. It then check 6331 * the access to the disk to ensure that the I/O failure was not due to a 6332 * reservation conflict. 6333 * 6334 * There is one failfast thread for each virtual disk for which failfast is 6335 * enabled. We could have only one thread sending requests for all disks but 6336 * this would need vdc to send asynchronous requests and to have callbacks to 6337 * process replies. 6338 */ 6339 static void 6340 vdc_failfast_thread(void *arg) 6341 { 6342 int status; 6343 vdc_t *vdc = (vdc_t *)arg; 6344 clock_t timeout, starttime; 6345 6346 mutex_enter(&vdc->lock); 6347 6348 while (vdc->failfast_interval != 0) { 6349 6350 starttime = ddi_get_lbolt(); 6351 6352 mutex_exit(&vdc->lock); 6353 6354 /* check for reservation conflict */ 6355 status = vdc_failfast_check_resv(vdc); 6356 6357 mutex_enter(&vdc->lock); 6358 /* 6359 * We have dropped the lock to send the SCSI command so we have 6360 * to check that failfast is still enabled. 6361 */ 6362 if (vdc->failfast_interval == 0) 6363 break; 6364 6365 /* 6366 * If we have successfully check the disk access and there was 6367 * no reservation conflict then we can complete any I/O queued 6368 * before the last check. 6369 */ 6370 if (status == 0) 6371 vdc_failfast_io_unqueue(vdc, starttime); 6372 6373 /* proceed again if some I/O are still in the queue */ 6374 if (vdc->failfast_io_queue != NULL) 6375 continue; 6376 6377 timeout = ddi_get_lbolt() + 6378 drv_usectohz(vdc->failfast_interval); 6379 (void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout); 6380 } 6381 6382 /* 6383 * Failfast is being stop so we can complete any queued I/O. 6384 */ 6385 vdc_failfast_io_unqueue(vdc, 0); 6386 vdc->failfast_thread = NULL; 6387 mutex_exit(&vdc->lock); 6388 thread_exit(); 6389 } 6390 6391 /* 6392 * Implement the MHIOCENFAILFAST mhd(7i) ioctl. 6393 */ 6394 static int 6395 vdc_failfast(vdc_t *vdc, caddr_t arg, int mode) 6396 { 6397 unsigned int mh_time; 6398 6399 if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode)) 6400 return (EFAULT); 6401 6402 mutex_enter(&vdc->lock); 6403 if (mh_time != 0 && vdc->failfast_thread == NULL) { 6404 vdc->failfast_thread = thread_create(NULL, 0, 6405 vdc_failfast_thread, vdc, 0, &p0, TS_RUN, 6406 v.v_maxsyspri - 2); 6407 } 6408 6409 vdc->failfast_interval = mh_time * 1000; 6410 cv_signal(&vdc->failfast_cv); 6411 mutex_exit(&vdc->lock); 6412 6413 return (0); 6414 } 6415 6416 /* 6417 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are 6418 * converted to VD_OP_SET_ACCESS operations. 6419 */ 6420 static int 6421 vdc_access_set(vdc_t *vdc, uint64_t flags, int mode) 6422 { 6423 int rv; 6424 6425 /* submit owership command request */ 6426 rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags, 6427 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6428 VIO_both_dir, B_TRUE); 6429 6430 return (rv); 6431 } 6432 6433 /* 6434 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a 6435 * VD_OP_GET_ACCESS operation. 6436 */ 6437 static int 6438 vdc_access_get(vdc_t *vdc, uint64_t *status, int mode) 6439 { 6440 int rv; 6441 6442 /* submit owership command request */ 6443 rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status, 6444 sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, 6445 VIO_both_dir, B_TRUE); 6446 6447 return (rv); 6448 } 6449 6450 /* 6451 * Disk Ownership Thread. 6452 * 6453 * When we have taken the ownership of a disk, this thread waits to be 6454 * notified when the LDC channel is reset so that it can recover the 6455 * ownership. 6456 * 6457 * Note that the thread handling the LDC reset (vdc_process_msg_thread()) 6458 * can not be used to do the ownership recovery because it has to be 6459 * running to handle the reply message to the ownership operation. 6460 */ 6461 static void 6462 vdc_ownership_thread(void *arg) 6463 { 6464 vdc_t *vdc = (vdc_t *)arg; 6465 clock_t timeout; 6466 uint64_t status; 6467 6468 mutex_enter(&vdc->ownership_lock); 6469 mutex_enter(&vdc->lock); 6470 6471 while (vdc->ownership & VDC_OWNERSHIP_WANTED) { 6472 6473 if ((vdc->ownership & VDC_OWNERSHIP_RESET) || 6474 !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) { 6475 /* 6476 * There was a reset so the ownership has been lost, 6477 * try to recover. We do this without using the preempt 6478 * option so that we don't steal the ownership from 6479 * someone who has preempted us. 6480 */ 6481 DMSG(vdc, 0, "[%d] Ownership lost, recovering", 6482 vdc->instance); 6483 6484 vdc->ownership &= ~(VDC_OWNERSHIP_RESET | 6485 VDC_OWNERSHIP_GRANTED); 6486 6487 mutex_exit(&vdc->lock); 6488 6489 status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6490 VD_ACCESS_SET_PRESERVE, FKIOCTL); 6491 6492 mutex_enter(&vdc->lock); 6493 6494 if (status == 0) { 6495 DMSG(vdc, 0, "[%d] Ownership recovered", 6496 vdc->instance); 6497 vdc->ownership |= VDC_OWNERSHIP_GRANTED; 6498 } else { 6499 DMSG(vdc, 0, "[%d] Fail to recover ownership", 6500 vdc->instance); 6501 } 6502 6503 } 6504 6505 /* 6506 * If we have the ownership then we just wait for an event 6507 * to happen (LDC reset), otherwise we will retry to recover 6508 * after a delay. 6509 */ 6510 if (vdc->ownership & VDC_OWNERSHIP_GRANTED) 6511 timeout = 0; 6512 else 6513 timeout = ddi_get_lbolt() + 6514 drv_usectohz(vdc_ownership_delay); 6515 6516 /* Release the ownership_lock and wait on the vdc lock */ 6517 mutex_exit(&vdc->ownership_lock); 6518 6519 if (timeout == 0) 6520 (void) cv_wait(&vdc->ownership_cv, &vdc->lock); 6521 else 6522 (void) cv_timedwait(&vdc->ownership_cv, 6523 &vdc->lock, timeout); 6524 6525 mutex_exit(&vdc->lock); 6526 6527 mutex_enter(&vdc->ownership_lock); 6528 mutex_enter(&vdc->lock); 6529 } 6530 6531 vdc->ownership_thread = NULL; 6532 mutex_exit(&vdc->lock); 6533 mutex_exit(&vdc->ownership_lock); 6534 6535 thread_exit(); 6536 } 6537 6538 static void 6539 vdc_ownership_update(vdc_t *vdc, int ownership_flags) 6540 { 6541 ASSERT(MUTEX_HELD(&vdc->ownership_lock)); 6542 6543 mutex_enter(&vdc->lock); 6544 vdc->ownership = ownership_flags; 6545 if ((vdc->ownership & VDC_OWNERSHIP_WANTED) && 6546 vdc->ownership_thread == NULL) { 6547 /* start ownership thread */ 6548 vdc->ownership_thread = thread_create(NULL, 0, 6549 vdc_ownership_thread, vdc, 0, &p0, TS_RUN, 6550 v.v_maxsyspri - 2); 6551 } else { 6552 /* notify the ownership thread */ 6553 cv_signal(&vdc->ownership_cv); 6554 } 6555 mutex_exit(&vdc->lock); 6556 } 6557 6558 /* 6559 * Get the size and the block size of a virtual disk from the vdisk server. 6560 */ 6561 static int 6562 vdc_get_capacity(vdc_t *vdc, size_t *dsk_size, size_t *blk_size) 6563 { 6564 int rv = 0; 6565 size_t alloc_len; 6566 vd_capacity_t *vd_cap; 6567 6568 ASSERT(MUTEX_NOT_HELD(&vdc->lock)); 6569 6570 alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t)); 6571 6572 vd_cap = kmem_zalloc(alloc_len, KM_SLEEP); 6573 6574 rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len, 6575 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE); 6576 6577 *dsk_size = vd_cap->vdisk_size; 6578 *blk_size = vd_cap->vdisk_block_size; 6579 6580 kmem_free(vd_cap, alloc_len); 6581 return (rv); 6582 } 6583 6584 /* 6585 * Check the disk capacity. Disk size information is updated if size has 6586 * changed. 6587 * 6588 * Return 0 if the disk capacity is available, or non-zero if it is not. 6589 */ 6590 static int 6591 vdc_check_capacity(vdc_t *vdc) 6592 { 6593 size_t dsk_size, blk_size; 6594 int rv; 6595 6596 /* 6597 * If the vdisk does not support the VD_OP_GET_CAPACITY operation 6598 * then the disk capacity has been retrieved during the handshake 6599 * and there's nothing more to do here. 6600 */ 6601 if (!VD_OP_SUPPORTED(vdc->operations, VD_OP_GET_CAPACITY)) 6602 return (0); 6603 6604 if ((rv = vdc_get_capacity(vdc, &dsk_size, &blk_size)) != 0) 6605 return (rv); 6606 6607 if (dsk_size == VD_SIZE_UNKNOWN || dsk_size == 0) 6608 return (EINVAL); 6609 6610 mutex_enter(&vdc->lock); 6611 vdc_update_size(vdc, dsk_size, blk_size, vdc->max_xfer_sz); 6612 mutex_exit(&vdc->lock); 6613 6614 return (0); 6615 } 6616 6617 /* 6618 * This structure is used in the DKIO(7I) array below. 6619 */ 6620 typedef struct vdc_dk_ioctl { 6621 uint8_t op; /* VD_OP_XXX value */ 6622 int cmd; /* Solaris ioctl operation number */ 6623 size_t nbytes; /* size of structure to be copied */ 6624 6625 /* function to convert between vDisk and Solaris structure formats */ 6626 int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg, 6627 int mode, int dir); 6628 } vdc_dk_ioctl_t; 6629 6630 /* 6631 * Subset of DKIO(7I) operations currently supported 6632 */ 6633 static vdc_dk_ioctl_t dk_ioctl[] = { 6634 {VD_OP_FLUSH, DKIOCFLUSHWRITECACHE, 0, 6635 vdc_null_copy_func}, 6636 {VD_OP_GET_WCE, DKIOCGETWCE, sizeof (int), 6637 vdc_get_wce_convert}, 6638 {VD_OP_SET_WCE, DKIOCSETWCE, sizeof (int), 6639 vdc_set_wce_convert}, 6640 {VD_OP_GET_VTOC, DKIOCGVTOC, sizeof (vd_vtoc_t), 6641 vdc_get_vtoc_convert}, 6642 {VD_OP_SET_VTOC, DKIOCSVTOC, sizeof (vd_vtoc_t), 6643 vdc_set_vtoc_convert}, 6644 {VD_OP_GET_VTOC, DKIOCGEXTVTOC, sizeof (vd_vtoc_t), 6645 vdc_get_extvtoc_convert}, 6646 {VD_OP_SET_VTOC, DKIOCSEXTVTOC, sizeof (vd_vtoc_t), 6647 vdc_set_extvtoc_convert}, 6648 {VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t), 6649 vdc_get_geom_convert}, 6650 {VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, sizeof (vd_geom_t), 6651 vdc_get_geom_convert}, 6652 {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t), 6653 vdc_get_geom_convert}, 6654 {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t), 6655 vdc_set_geom_convert}, 6656 {VD_OP_GET_EFI, DKIOCGETEFI, 0, 6657 vdc_get_efi_convert}, 6658 {VD_OP_SET_EFI, DKIOCSETEFI, 0, 6659 vdc_set_efi_convert}, 6660 6661 /* DIOCTL_RWCMD is converted to a read or a write */ 6662 {0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL}, 6663 6664 /* mhd(7I) non-shared multihost disks ioctls */ 6665 {0, MHIOCTKOWN, 0, vdc_null_copy_func}, 6666 {0, MHIOCRELEASE, 0, vdc_null_copy_func}, 6667 {0, MHIOCSTATUS, 0, vdc_null_copy_func}, 6668 {0, MHIOCQRESERVE, 0, vdc_null_copy_func}, 6669 6670 /* mhd(7I) shared multihost disks ioctls */ 6671 {0, MHIOCGRP_INKEYS, 0, vdc_null_copy_func}, 6672 {0, MHIOCGRP_INRESV, 0, vdc_null_copy_func}, 6673 {0, MHIOCGRP_REGISTER, 0, vdc_null_copy_func}, 6674 {0, MHIOCGRP_RESERVE, 0, vdc_null_copy_func}, 6675 {0, MHIOCGRP_PREEMPTANDABORT, 0, vdc_null_copy_func}, 6676 {0, MHIOCGRP_REGISTERANDIGNOREKEY, 0, vdc_null_copy_func}, 6677 6678 /* mhd(7I) failfast ioctl */ 6679 {0, MHIOCENFAILFAST, 0, vdc_null_copy_func}, 6680 6681 /* 6682 * These particular ioctls are not sent to the server - vdc fakes up 6683 * the necessary info. 6684 */ 6685 {0, DKIOCINFO, sizeof (struct dk_cinfo), vdc_null_copy_func}, 6686 {0, DKIOCGMEDIAINFO, sizeof (struct dk_minfo), vdc_null_copy_func}, 6687 {0, USCSICMD, sizeof (struct uscsi_cmd), vdc_null_copy_func}, 6688 {0, DKIOCPARTITION, 0, vdc_null_copy_func }, 6689 {0, DKIOCGAPART, 0, vdc_null_copy_func }, 6690 {0, DKIOCREMOVABLE, 0, vdc_null_copy_func}, 6691 {0, CDROMREADOFFSET, 0, vdc_null_copy_func} 6692 }; 6693 6694 /* 6695 * This function handles ioctl requests from the vd_efi_alloc_and_read() 6696 * function and forward them to the vdisk. 6697 */ 6698 static int 6699 vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg) 6700 { 6701 vdc_t *vdc = (vdc_t *)vdisk; 6702 dev_t dev; 6703 int rval; 6704 6705 dev = makedevice(ddi_driver_major(vdc->dip), 6706 VD_MAKE_DEV(vdc->instance, 0)); 6707 6708 return (vd_process_ioctl(dev, cmd, (caddr_t)arg, FKIOCTL, &rval)); 6709 } 6710 6711 /* 6712 * Function: 6713 * vd_process_ioctl() 6714 * 6715 * Description: 6716 * This routine processes disk specific ioctl calls 6717 * 6718 * Arguments: 6719 * dev - the device number 6720 * cmd - the operation [dkio(7I)] to be processed 6721 * arg - pointer to user provided structure 6722 * (contains data to be set or reference parameter for get) 6723 * mode - bit flag, indicating open settings, 32/64 bit type, etc 6724 * rvalp - pointer to return value for calling process. 6725 * 6726 * Return Code: 6727 * 0 6728 * EFAULT 6729 * ENXIO 6730 * EIO 6731 * ENOTSUP 6732 */ 6733 static int 6734 vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp) 6735 { 6736 int instance = VDCUNIT(dev); 6737 vdc_t *vdc = NULL; 6738 int rv = -1; 6739 int idx = 0; /* index into dk_ioctl[] */ 6740 size_t len = 0; /* #bytes to send to vds */ 6741 size_t alloc_len = 0; /* #bytes to allocate mem for */ 6742 caddr_t mem_p = NULL; 6743 size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0])); 6744 vdc_dk_ioctl_t *iop; 6745 6746 vdc = ddi_get_soft_state(vdc_state, instance); 6747 if (vdc == NULL) { 6748 cmn_err(CE_NOTE, "![%d] Could not get soft state structure", 6749 instance); 6750 return (ENXIO); 6751 } 6752 6753 DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n", 6754 instance, cmd, dev, ddi_model_convert_from(mode & FMODELS)); 6755 6756 if (rvalp != NULL) { 6757 /* the return value of the ioctl is 0 by default */ 6758 *rvalp = 0; 6759 } 6760 6761 /* 6762 * Validate the ioctl operation to be performed. 6763 * 6764 * If we have looped through the array without finding a match then we 6765 * don't support this ioctl. 6766 */ 6767 for (idx = 0; idx < nioctls; idx++) { 6768 if (cmd == dk_ioctl[idx].cmd) 6769 break; 6770 } 6771 6772 if (idx >= nioctls) { 6773 DMSG(vdc, 0, "[%d] Unsupported ioctl (0x%x)\n", 6774 vdc->instance, cmd); 6775 return (ENOTSUP); 6776 } 6777 6778 iop = &(dk_ioctl[idx]); 6779 6780 if (cmd == DKIOCGETEFI || cmd == DKIOCSETEFI) { 6781 /* size is not fixed for EFI ioctls, it depends on ioctl arg */ 6782 dk_efi_t dk_efi; 6783 6784 rv = ddi_copyin(arg, &dk_efi, sizeof (dk_efi_t), mode); 6785 if (rv != 0) 6786 return (EFAULT); 6787 6788 len = sizeof (vd_efi_t) - 1 + dk_efi.dki_length; 6789 } else { 6790 len = iop->nbytes; 6791 } 6792 6793 /* check if the ioctl is applicable */ 6794 switch (cmd) { 6795 case CDROMREADOFFSET: 6796 case DKIOCREMOVABLE: 6797 return (ENOTTY); 6798 6799 case USCSICMD: 6800 case MHIOCTKOWN: 6801 case MHIOCSTATUS: 6802 case MHIOCQRESERVE: 6803 case MHIOCRELEASE: 6804 case MHIOCGRP_INKEYS: 6805 case MHIOCGRP_INRESV: 6806 case MHIOCGRP_REGISTER: 6807 case MHIOCGRP_RESERVE: 6808 case MHIOCGRP_PREEMPTANDABORT: 6809 case MHIOCGRP_REGISTERANDIGNOREKEY: 6810 case MHIOCENFAILFAST: 6811 if (vdc->cinfo == NULL) 6812 return (ENXIO); 6813 if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS) 6814 return (ENOTTY); 6815 break; 6816 6817 case DIOCTL_RWCMD: 6818 if (vdc->cinfo == NULL) 6819 return (ENXIO); 6820 if (vdc->cinfo->dki_ctype != DKC_DIRECT) 6821 return (ENOTTY); 6822 break; 6823 6824 case DKIOCINFO: 6825 if (vdc->cinfo == NULL) 6826 return (ENXIO); 6827 break; 6828 6829 case DKIOCGMEDIAINFO: 6830 if (vdc->minfo == NULL) 6831 return (ENXIO); 6832 if (vdc_check_capacity(vdc) != 0) 6833 /* disk capacity is not available */ 6834 return (EIO); 6835 break; 6836 } 6837 6838 /* 6839 * Deal with ioctls which require a processing different than 6840 * converting ioctl arguments and sending a corresponding 6841 * VD operation. 6842 */ 6843 switch (cmd) { 6844 6845 case USCSICMD: 6846 { 6847 return (vdc_uscsi_cmd(vdc, arg, mode)); 6848 } 6849 6850 case MHIOCTKOWN: 6851 { 6852 mutex_enter(&vdc->ownership_lock); 6853 /* 6854 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership 6855 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset 6856 * while we are processing the ioctl. 6857 */ 6858 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED); 6859 6860 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | 6861 VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode); 6862 if (rv == 0) { 6863 vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED | 6864 VDC_OWNERSHIP_GRANTED); 6865 } else { 6866 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6867 } 6868 mutex_exit(&vdc->ownership_lock); 6869 return (rv); 6870 } 6871 6872 case MHIOCRELEASE: 6873 { 6874 mutex_enter(&vdc->ownership_lock); 6875 rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode); 6876 if (rv == 0) { 6877 vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); 6878 } 6879 mutex_exit(&vdc->ownership_lock); 6880 return (rv); 6881 } 6882 6883 case MHIOCSTATUS: 6884 { 6885 uint64_t status; 6886 6887 rv = vdc_access_get(vdc, &status, mode); 6888 if (rv == 0 && rvalp != NULL) 6889 *rvalp = (status & VD_ACCESS_ALLOWED)? 0 : 1; 6890 return (rv); 6891 } 6892 6893 case MHIOCQRESERVE: 6894 { 6895 rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode); 6896 return (rv); 6897 } 6898 6899 case MHIOCGRP_INKEYS: 6900 { 6901 return (vdc_mhd_inkeys(vdc, arg, mode)); 6902 } 6903 6904 case MHIOCGRP_INRESV: 6905 { 6906 return (vdc_mhd_inresv(vdc, arg, mode)); 6907 } 6908 6909 case MHIOCGRP_REGISTER: 6910 { 6911 return (vdc_mhd_register(vdc, arg, mode)); 6912 } 6913 6914 case MHIOCGRP_RESERVE: 6915 { 6916 return (vdc_mhd_reserve(vdc, arg, mode)); 6917 } 6918 6919 case MHIOCGRP_PREEMPTANDABORT: 6920 { 6921 return (vdc_mhd_preemptabort(vdc, arg, mode)); 6922 } 6923 6924 case MHIOCGRP_REGISTERANDIGNOREKEY: 6925 { 6926 return (vdc_mhd_registerignore(vdc, arg, mode)); 6927 } 6928 6929 case MHIOCENFAILFAST: 6930 { 6931 rv = vdc_failfast(vdc, arg, mode); 6932 return (rv); 6933 } 6934 6935 case DIOCTL_RWCMD: 6936 { 6937 return (vdc_dioctl_rwcmd(dev, arg, mode)); 6938 } 6939 6940 case DKIOCGAPART: 6941 { 6942 return (vdc_dkio_gapart(vdc, arg, mode)); 6943 } 6944 6945 case DKIOCPARTITION: 6946 { 6947 return (vdc_dkio_partition(vdc, arg, mode)); 6948 } 6949 6950 case DKIOCINFO: 6951 { 6952 struct dk_cinfo cinfo; 6953 6954 bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo)); 6955 cinfo.dki_partition = VDCPART(dev); 6956 6957 rv = ddi_copyout(&cinfo, (void *)arg, 6958 sizeof (struct dk_cinfo), mode); 6959 if (rv != 0) 6960 return (EFAULT); 6961 6962 return (0); 6963 } 6964 6965 case DKIOCGMEDIAINFO: 6966 { 6967 ASSERT(vdc->vdisk_size != 0); 6968 ASSERT(vdc->minfo->dki_capacity != 0); 6969 rv = ddi_copyout(vdc->minfo, (void *)arg, 6970 sizeof (struct dk_minfo), mode); 6971 if (rv != 0) 6972 return (EFAULT); 6973 6974 return (0); 6975 } 6976 6977 case DKIOCFLUSHWRITECACHE: 6978 { 6979 struct dk_callback *dkc = 6980 (struct dk_callback *)(uintptr_t)arg; 6981 vdc_dk_arg_t *dkarg = NULL; 6982 6983 DMSG(vdc, 1, "[%d] Flush W$: mode %x\n", 6984 instance, mode); 6985 6986 /* 6987 * If arg is NULL, then there is no callback function 6988 * registered and the call operates synchronously; we 6989 * break and continue with the rest of the function and 6990 * wait for vds to return (i.e. after the request to 6991 * vds returns successfully, all writes completed prior 6992 * to the ioctl will have been flushed from the disk 6993 * write cache to persistent media. 6994 * 6995 * If a callback function is registered, we dispatch 6996 * the request on a task queue and return immediately. 6997 * The callback will deal with informing the calling 6998 * thread that the flush request is completed. 6999 */ 7000 if (dkc == NULL) 7001 break; 7002 7003 /* 7004 * the asynchronous callback is only supported if 7005 * invoked from within the kernel 7006 */ 7007 if ((mode & FKIOCTL) == 0) 7008 return (ENOTSUP); 7009 7010 dkarg = kmem_zalloc(sizeof (vdc_dk_arg_t), KM_SLEEP); 7011 7012 dkarg->mode = mode; 7013 dkarg->dev = dev; 7014 bcopy(dkc, &dkarg->dkc, sizeof (*dkc)); 7015 7016 mutex_enter(&vdc->lock); 7017 vdc->dkio_flush_pending++; 7018 dkarg->vdc = vdc; 7019 mutex_exit(&vdc->lock); 7020 7021 /* put the request on a task queue */ 7022 rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb, 7023 (void *)dkarg, DDI_SLEEP); 7024 if (rv == NULL) { 7025 /* clean up if dispatch fails */ 7026 mutex_enter(&vdc->lock); 7027 vdc->dkio_flush_pending--; 7028 mutex_exit(&vdc->lock); 7029 kmem_free(dkarg, sizeof (vdc_dk_arg_t)); 7030 } 7031 7032 return (rv == NULL ? ENOMEM : 0); 7033 } 7034 } 7035 7036 /* catch programming error in vdc - should be a VD_OP_XXX ioctl */ 7037 ASSERT(iop->op != 0); 7038 7039 /* check if the vDisk server handles the operation for this vDisk */ 7040 if (VD_OP_SUPPORTED(vdc->operations, iop->op) == B_FALSE) { 7041 DMSG(vdc, 0, "[%d] Unsupported VD_OP operation (0x%x)\n", 7042 vdc->instance, iop->op); 7043 return (ENOTSUP); 7044 } 7045 7046 /* LDC requires that the memory being mapped is 8-byte aligned */ 7047 alloc_len = P2ROUNDUP(len, sizeof (uint64_t)); 7048 DMSG(vdc, 1, "[%d] struct size %ld alloc %ld\n", 7049 instance, len, alloc_len); 7050 7051 if (alloc_len > 0) 7052 mem_p = kmem_zalloc(alloc_len, KM_SLEEP); 7053 7054 /* 7055 * Call the conversion function for this ioctl which, if necessary, 7056 * converts from the Solaris format to the format ARC'ed 7057 * as part of the vDisk protocol (FWARC 2006/195) 7058 */ 7059 ASSERT(iop->convert != NULL); 7060 rv = (iop->convert)(vdc, arg, mem_p, mode, VD_COPYIN); 7061 if (rv != 0) { 7062 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7063 instance, rv, cmd); 7064 if (mem_p != NULL) 7065 kmem_free(mem_p, alloc_len); 7066 return (rv); 7067 } 7068 7069 /* 7070 * send request to vds to service the ioctl. 7071 */ 7072 rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len, 7073 VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode, 7074 VIO_both_dir, B_TRUE); 7075 7076 if (rv != 0) { 7077 /* 7078 * This is not necessarily an error. The ioctl could 7079 * be returning a value such as ENOTTY to indicate 7080 * that the ioctl is not applicable. 7081 */ 7082 DMSG(vdc, 0, "[%d] vds returned %d for ioctl 0x%x\n", 7083 instance, rv, cmd); 7084 if (mem_p != NULL) 7085 kmem_free(mem_p, alloc_len); 7086 7087 return (rv); 7088 } 7089 7090 /* 7091 * Call the conversion function (if it exists) for this ioctl 7092 * which converts from the format ARC'ed as part of the vDisk 7093 * protocol (FWARC 2006/195) back to a format understood by 7094 * the rest of Solaris. 7095 */ 7096 rv = (iop->convert)(vdc, mem_p, arg, mode, VD_COPYOUT); 7097 if (rv != 0) { 7098 DMSG(vdc, 0, "[%d] convert func returned %d for ioctl 0x%x\n", 7099 instance, rv, cmd); 7100 if (mem_p != NULL) 7101 kmem_free(mem_p, alloc_len); 7102 return (rv); 7103 } 7104 7105 if (mem_p != NULL) 7106 kmem_free(mem_p, alloc_len); 7107 7108 return (rv); 7109 } 7110 7111 /* 7112 * Function: 7113 * 7114 * Description: 7115 * This is an empty conversion function used by ioctl calls which 7116 * do not need to convert the data being passed in/out to userland 7117 */ 7118 static int 7119 vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir) 7120 { 7121 _NOTE(ARGUNUSED(vdc)) 7122 _NOTE(ARGUNUSED(from)) 7123 _NOTE(ARGUNUSED(to)) 7124 _NOTE(ARGUNUSED(mode)) 7125 _NOTE(ARGUNUSED(dir)) 7126 7127 return (0); 7128 } 7129 7130 static int 7131 vdc_get_wce_convert(vdc_t *vdc, void *from, void *to, 7132 int mode, int dir) 7133 { 7134 _NOTE(ARGUNUSED(vdc)) 7135 7136 if (dir == VD_COPYIN) 7137 return (0); /* nothing to do */ 7138 7139 if (ddi_copyout(from, to, sizeof (int), mode) != 0) 7140 return (EFAULT); 7141 7142 return (0); 7143 } 7144 7145 static int 7146 vdc_set_wce_convert(vdc_t *vdc, void *from, void *to, 7147 int mode, int dir) 7148 { 7149 _NOTE(ARGUNUSED(vdc)) 7150 7151 if (dir == VD_COPYOUT) 7152 return (0); /* nothing to do */ 7153 7154 if (ddi_copyin(from, to, sizeof (int), mode) != 0) 7155 return (EFAULT); 7156 7157 return (0); 7158 } 7159 7160 /* 7161 * Function: 7162 * vdc_get_vtoc_convert() 7163 * 7164 * Description: 7165 * This routine performs the necessary convertions from the DKIOCGVTOC 7166 * Solaris structure to the format defined in FWARC 2006/195. 7167 * 7168 * In the struct vtoc definition, the timestamp field is marked as not 7169 * supported so it is not part of vDisk protocol (FWARC 2006/195). 7170 * However SVM uses that field to check it can write into the VTOC, 7171 * so we fake up the info of that field. 7172 * 7173 * Arguments: 7174 * vdc - the vDisk client 7175 * from - the buffer containing the data to be copied from 7176 * to - the buffer to be copied to 7177 * mode - flags passed to ioctl() call 7178 * dir - the "direction" of the copy - VD_COPYIN or VD_COPYOUT 7179 * 7180 * Return Code: 7181 * 0 - Success 7182 * ENXIO - incorrect buffer passed in. 7183 * EFAULT - ddi_copyout routine encountered an error. 7184 */ 7185 static int 7186 vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7187 { 7188 int i; 7189 struct vtoc vtoc; 7190 struct vtoc32 vtoc32; 7191 struct extvtoc evtoc; 7192 int rv; 7193 7194 if (dir != VD_COPYOUT) 7195 return (0); /* nothing to do */ 7196 7197 if ((from == NULL) || (to == NULL)) 7198 return (ENXIO); 7199 7200 if (vdc->vdisk_size > VD_OLDVTOC_LIMIT) 7201 return (EOVERFLOW); 7202 7203 VD_VTOC2VTOC((vd_vtoc_t *)from, &evtoc); 7204 7205 /* fake the VTOC timestamp field */ 7206 for (i = 0; i < V_NUMPAR; i++) { 7207 evtoc.timestamp[i] = vdc->vtoc->timestamp[i]; 7208 } 7209 7210 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7211 /* LINTED E_ASSIGN_NARROW_CONV */ 7212 extvtoctovtoc32(evtoc, vtoc32); 7213 rv = ddi_copyout(&vtoc32, to, sizeof (vtoc32), mode); 7214 if (rv != 0) 7215 rv = EFAULT; 7216 } else { 7217 extvtoctovtoc(evtoc, vtoc); 7218 rv = ddi_copyout(&vtoc, to, sizeof (vtoc), mode); 7219 if (rv != 0) 7220 rv = EFAULT; 7221 } 7222 7223 return (rv); 7224 } 7225 7226 /* 7227 * Function: 7228 * vdc_set_vtoc_convert() 7229 * 7230 * Description: 7231 * This routine performs the necessary convertions from the DKIOCSVTOC 7232 * Solaris structure to the format defined in FWARC 2006/195. 7233 * 7234 * Arguments: 7235 * vdc - the vDisk client 7236 * from - Buffer with data 7237 * to - Buffer where data is to be copied to 7238 * mode - flags passed to ioctl 7239 * dir - direction of copy (in or out) 7240 * 7241 * Return Code: 7242 * 0 - Success 7243 * ENXIO - Invalid buffer passed in 7244 * EFAULT - ddi_copyin of data failed 7245 */ 7246 static int 7247 vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7248 { 7249 void *uvtoc; 7250 struct vtoc vtoc; 7251 struct vtoc32 vtoc32; 7252 struct extvtoc evtoc; 7253 int i, rv; 7254 7255 if ((from == NULL) || (to == NULL)) 7256 return (ENXIO); 7257 7258 if (vdc->vdisk_size > VD_OLDVTOC_LIMIT) 7259 return (EOVERFLOW); 7260 7261 uvtoc = (dir == VD_COPYIN)? from : to; 7262 7263 if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { 7264 rv = ddi_copyin(uvtoc, &vtoc32, sizeof (vtoc32), mode); 7265 if (rv != 0) 7266 return (EFAULT); 7267 vtoc32toextvtoc(vtoc32, evtoc); 7268 } else { 7269 rv = ddi_copyin(uvtoc, &vtoc, sizeof (vtoc), mode); 7270 if (rv != 0) 7271 return (EFAULT); 7272 vtoctoextvtoc(vtoc, evtoc); 7273 } 7274 7275 if (dir == VD_COPYOUT) { 7276 /* 7277 * The disk label may have changed. Revalidate the disk 7278 * geometry. This will also update the device nodes. 7279 */ 7280 vdc_validate(vdc); 7281 7282 /* 7283 * We also need to keep track of the timestamp fields. 7284 */ 7285 for (i = 0; i < V_NUMPAR; i++) { 7286 vdc->vtoc->timestamp[i] = evtoc.timestamp[i]; 7287 } 7288 7289 } else { 7290 VTOC2VD_VTOC(&evtoc, (vd_vtoc_t *)to); 7291 } 7292 7293 return (0); 7294 } 7295 7296 static int 7297 vdc_get_extvtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7298 { 7299 int i, rv; 7300 struct extvtoc evtoc; 7301 7302 if (dir != VD_COPYOUT) 7303 return (0); /* nothing to do */ 7304 7305 if ((from == NULL) || (to == NULL)) 7306 return (ENXIO); 7307 7308 VD_VTOC2VTOC((vd_vtoc_t *)from, &evtoc); 7309 7310 /* fake the VTOC timestamp field */ 7311 for (i = 0; i < V_NUMPAR; i++) { 7312 evtoc.timestamp[i] = vdc->vtoc->timestamp[i]; 7313 } 7314 7315 rv = ddi_copyout(&evtoc, to, sizeof (struct extvtoc), mode); 7316 if (rv != 0) 7317 rv = EFAULT; 7318 7319 return (rv); 7320 } 7321 7322 static int 7323 vdc_set_extvtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7324 { 7325 void *uvtoc; 7326 struct extvtoc evtoc; 7327 int i, rv; 7328 7329 if ((from == NULL) || (to == NULL)) 7330 return (ENXIO); 7331 7332 uvtoc = (dir == VD_COPYIN)? from : to; 7333 7334 rv = ddi_copyin(uvtoc, &evtoc, sizeof (struct extvtoc), mode); 7335 if (rv != 0) 7336 return (EFAULT); 7337 7338 if (dir == VD_COPYOUT) { 7339 /* 7340 * The disk label may have changed. Revalidate the disk 7341 * geometry. This will also update the device nodes. 7342 */ 7343 vdc_validate(vdc); 7344 7345 /* 7346 * We also need to keep track of the timestamp fields. 7347 */ 7348 for (i = 0; i < V_NUMPAR; i++) { 7349 vdc->vtoc->timestamp[i] = evtoc.timestamp[i]; 7350 } 7351 7352 } else { 7353 VTOC2VD_VTOC(&evtoc, (vd_vtoc_t *)to); 7354 } 7355 7356 return (0); 7357 } 7358 7359 /* 7360 * Function: 7361 * vdc_get_geom_convert() 7362 * 7363 * Description: 7364 * This routine performs the necessary convertions from the DKIOCGGEOM, 7365 * DKIOCG_PHYSGEOM and DKIOG_VIRTGEOM Solaris structures to the format 7366 * defined in FWARC 2006/195 7367 * 7368 * Arguments: 7369 * vdc - the vDisk client 7370 * from - Buffer with data 7371 * to - Buffer where data is to be copied to 7372 * mode - flags passed to ioctl 7373 * dir - direction of copy (in or out) 7374 * 7375 * Return Code: 7376 * 0 - Success 7377 * ENXIO - Invalid buffer passed in 7378 * EFAULT - ddi_copyout of data failed 7379 */ 7380 static int 7381 vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7382 { 7383 _NOTE(ARGUNUSED(vdc)) 7384 7385 struct dk_geom geom; 7386 int copy_len = sizeof (struct dk_geom); 7387 int rv = 0; 7388 7389 if (dir != VD_COPYOUT) 7390 return (0); /* nothing to do */ 7391 7392 if ((from == NULL) || (to == NULL)) 7393 return (ENXIO); 7394 7395 VD_GEOM2DK_GEOM((vd_geom_t *)from, &geom); 7396 rv = ddi_copyout(&geom, to, copy_len, mode); 7397 if (rv != 0) 7398 rv = EFAULT; 7399 7400 return (rv); 7401 } 7402 7403 /* 7404 * Function: 7405 * vdc_set_geom_convert() 7406 * 7407 * Description: 7408 * This routine performs the necessary convertions from the DKIOCSGEOM 7409 * Solaris structure to the format defined in FWARC 2006/195. 7410 * 7411 * Arguments: 7412 * vdc - the vDisk client 7413 * from - Buffer with data 7414 * to - Buffer where data is to be copied to 7415 * mode - flags passed to ioctl 7416 * dir - direction of copy (in or out) 7417 * 7418 * Return Code: 7419 * 0 - Success 7420 * ENXIO - Invalid buffer passed in 7421 * EFAULT - ddi_copyin of data failed 7422 */ 7423 static int 7424 vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7425 { 7426 _NOTE(ARGUNUSED(vdc)) 7427 7428 vd_geom_t vdgeom; 7429 void *tmp_mem = NULL; 7430 int copy_len = sizeof (struct dk_geom); 7431 int rv = 0; 7432 7433 if (dir != VD_COPYIN) 7434 return (0); /* nothing to do */ 7435 7436 if ((from == NULL) || (to == NULL)) 7437 return (ENXIO); 7438 7439 tmp_mem = kmem_alloc(copy_len, KM_SLEEP); 7440 7441 rv = ddi_copyin(from, tmp_mem, copy_len, mode); 7442 if (rv != 0) { 7443 kmem_free(tmp_mem, copy_len); 7444 return (EFAULT); 7445 } 7446 DK_GEOM2VD_GEOM((struct dk_geom *)tmp_mem, &vdgeom); 7447 bcopy(&vdgeom, to, sizeof (vdgeom)); 7448 kmem_free(tmp_mem, copy_len); 7449 7450 return (0); 7451 } 7452 7453 static int 7454 vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7455 { 7456 _NOTE(ARGUNUSED(vdc)) 7457 7458 vd_efi_t *vd_efi; 7459 dk_efi_t dk_efi; 7460 int rv = 0; 7461 void *uaddr; 7462 7463 if ((from == NULL) || (to == NULL)) 7464 return (ENXIO); 7465 7466 if (dir == VD_COPYIN) { 7467 7468 vd_efi = (vd_efi_t *)to; 7469 7470 rv = ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode); 7471 if (rv != 0) 7472 return (EFAULT); 7473 7474 vd_efi->lba = dk_efi.dki_lba; 7475 vd_efi->length = dk_efi.dki_length; 7476 bzero(vd_efi->data, vd_efi->length); 7477 7478 } else { 7479 7480 rv = ddi_copyin(to, &dk_efi, sizeof (dk_efi_t), mode); 7481 if (rv != 0) 7482 return (EFAULT); 7483 7484 uaddr = dk_efi.dki_data; 7485 7486 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 7487 7488 VD_EFI2DK_EFI((vd_efi_t *)from, &dk_efi); 7489 7490 rv = ddi_copyout(dk_efi.dki_data, uaddr, dk_efi.dki_length, 7491 mode); 7492 if (rv != 0) 7493 return (EFAULT); 7494 7495 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7496 } 7497 7498 return (0); 7499 } 7500 7501 static int 7502 vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) 7503 { 7504 _NOTE(ARGUNUSED(vdc)) 7505 7506 dk_efi_t dk_efi; 7507 void *uaddr; 7508 7509 if (dir == VD_COPYOUT) { 7510 /* 7511 * The disk label may have changed. Revalidate the disk 7512 * geometry. This will also update the device nodes. 7513 */ 7514 vdc_validate(vdc); 7515 return (0); 7516 } 7517 7518 if ((from == NULL) || (to == NULL)) 7519 return (ENXIO); 7520 7521 if (ddi_copyin(from, &dk_efi, sizeof (dk_efi_t), mode) != 0) 7522 return (EFAULT); 7523 7524 uaddr = dk_efi.dki_data; 7525 7526 dk_efi.dki_data = kmem_alloc(dk_efi.dki_length, KM_SLEEP); 7527 7528 if (ddi_copyin(uaddr, dk_efi.dki_data, dk_efi.dki_length, mode) != 0) 7529 return (EFAULT); 7530 7531 DK_EFI2VD_EFI(&dk_efi, (vd_efi_t *)to); 7532 7533 kmem_free(dk_efi.dki_data, dk_efi.dki_length); 7534 7535 return (0); 7536 } 7537 7538 7539 /* -------------------------------------------------------------------------- */ 7540 7541 /* 7542 * Function: 7543 * vdc_create_fake_geometry() 7544 * 7545 * Description: 7546 * This routine fakes up the disk info needed for some DKIO ioctls such 7547 * as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do] 7548 * 7549 * Note: This function must not be called until the vDisk attributes have 7550 * been exchanged as part of the handshake with the vDisk server. 7551 * 7552 * Arguments: 7553 * vdc - soft state pointer for this instance of the device driver. 7554 * 7555 * Return Code: 7556 * none. 7557 */ 7558 static void 7559 vdc_create_fake_geometry(vdc_t *vdc) 7560 { 7561 ASSERT(vdc != NULL); 7562 ASSERT(vdc->max_xfer_sz != 0); 7563 7564 /* 7565 * DKIOCINFO support 7566 */ 7567 if (vdc->cinfo == NULL) 7568 vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); 7569 7570 (void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME); 7571 (void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME); 7572 /* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */ 7573 vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz; 7574 7575 /* 7576 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD 7577 * operation is supported, otherwise the controller type is DKC_DIRECT. 7578 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the 7579 * controller type is always DKC_DIRECT in that case. 7580 * 7581 * If the virtual disk is backed by a physical CD/DVD device or 7582 * an ISO image, modify the controller type to indicate this 7583 */ 7584 switch (vdc->vdisk_media) { 7585 case VD_MEDIA_CD: 7586 case VD_MEDIA_DVD: 7587 vdc->cinfo->dki_ctype = DKC_CDROM; 7588 break; 7589 case VD_MEDIA_FIXED: 7590 if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD)) 7591 vdc->cinfo->dki_ctype = DKC_SCSI_CCS; 7592 else 7593 vdc->cinfo->dki_ctype = DKC_DIRECT; 7594 break; 7595 default: 7596 /* in the case of v1.0 we default to a fixed disk */ 7597 vdc->cinfo->dki_ctype = DKC_DIRECT; 7598 break; 7599 } 7600 vdc->cinfo->dki_flags = DKI_FMTVOL; 7601 vdc->cinfo->dki_cnum = 0; 7602 vdc->cinfo->dki_addr = 0; 7603 vdc->cinfo->dki_space = 0; 7604 vdc->cinfo->dki_prio = 0; 7605 vdc->cinfo->dki_vec = 0; 7606 vdc->cinfo->dki_unit = vdc->instance; 7607 vdc->cinfo->dki_slave = 0; 7608 /* 7609 * The partition number will be created on the fly depending on the 7610 * actual slice (i.e. minor node) that is used to request the data. 7611 */ 7612 vdc->cinfo->dki_partition = 0; 7613 7614 /* 7615 * DKIOCGMEDIAINFO support 7616 */ 7617 if (vdc->minfo == NULL) 7618 vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); 7619 7620 if (vio_ver_is_supported(vdc->ver, 1, 1)) { 7621 vdc->minfo->dki_media_type = 7622 VD_MEDIATYPE2DK_MEDIATYPE(vdc->vdisk_media); 7623 } else { 7624 vdc->minfo->dki_media_type = DK_FIXED_DISK; 7625 } 7626 7627 vdc->minfo->dki_capacity = vdc->vdisk_size; 7628 vdc->minfo->dki_lbsize = vdc->block_size; 7629 } 7630 7631 static ushort_t 7632 vdc_lbl2cksum(struct dk_label *label) 7633 { 7634 int count; 7635 ushort_t sum, *sp; 7636 7637 count = (sizeof (struct dk_label)) / (sizeof (short)) - 1; 7638 sp = (ushort_t *)label; 7639 sum = 0; 7640 while (count--) { 7641 sum ^= *sp++; 7642 } 7643 7644 return (sum); 7645 } 7646 7647 static void 7648 vdc_update_size(vdc_t *vdc, size_t dsk_size, size_t blk_size, size_t xfr_size) 7649 { 7650 vd_err_stats_t *stp; 7651 7652 ASSERT(MUTEX_HELD(&vdc->lock)); 7653 ASSERT(xfr_size != 0); 7654 7655 /* 7656 * If the disk size is unknown or sizes are unchanged then don't 7657 * update anything. 7658 */ 7659 if (dsk_size == VD_SIZE_UNKNOWN || dsk_size == 0 || 7660 (blk_size == vdc->block_size && dsk_size == vdc->vdisk_size && 7661 xfr_size == vdc->max_xfer_sz)) 7662 return; 7663 7664 /* 7665 * We don't know at compile time what the vDisk server will think 7666 * are good values but we apply a large (arbitrary) upper bound to 7667 * prevent memory exhaustion in vdc if it was allocating a DRing 7668 * based of huge values sent by the server. We probably will never 7669 * exceed this except if the message was garbage. 7670 */ 7671 if ((xfr_size * blk_size) > (PAGESIZE * DEV_BSIZE)) { 7672 DMSG(vdc, 0, "[%d] vds block transfer size too big;" 7673 " using max supported by vdc", vdc->instance); 7674 xfr_size = maxphys / DEV_BSIZE; 7675 dsk_size = (dsk_size * blk_size) / DEV_BSIZE; 7676 blk_size = DEV_BSIZE; 7677 } 7678 7679 vdc->max_xfer_sz = xfr_size; 7680 vdc->block_size = blk_size; 7681 vdc->vdisk_size = dsk_size; 7682 7683 stp = (vd_err_stats_t *)vdc->err_stats->ks_data; 7684 stp->vd_capacity.value.ui64 = dsk_size * blk_size; 7685 7686 vdc->minfo->dki_capacity = dsk_size; 7687 vdc->minfo->dki_lbsize = (uint_t)blk_size; 7688 } 7689 7690 /* 7691 * Function: 7692 * vdc_validate_geometry 7693 * 7694 * Description: 7695 * This routine discovers the label and geometry of the disk. It stores 7696 * the disk label and related information in the vdc structure. If it 7697 * fails to validate the geometry or to discover the disk label then 7698 * the label is marked as unknown (VD_DISK_LABEL_UNK). 7699 * 7700 * Arguments: 7701 * vdc - soft state pointer for this instance of the device driver. 7702 * 7703 * Return Code: 7704 * 0 - success. 7705 * EINVAL - unknown disk label. 7706 * ENOTSUP - geometry not applicable (EFI label). 7707 * EIO - error accessing the disk. 7708 */ 7709 static int 7710 vdc_validate_geometry(vdc_t *vdc) 7711 { 7712 buf_t *buf; /* BREAD requests need to be in a buf_t structure */ 7713 dev_t dev; 7714 int rv, rval; 7715 struct dk_label label; 7716 struct dk_geom geom; 7717 struct extvtoc vtoc; 7718 efi_gpt_t *gpt; 7719 efi_gpe_t *gpe; 7720 vd_efi_dev_t edev; 7721 7722 ASSERT(vdc != NULL); 7723 ASSERT(vdc->vtoc != NULL && vdc->geom != NULL); 7724 ASSERT(MUTEX_HELD(&vdc->lock)); 7725 7726 mutex_exit(&vdc->lock); 7727 /* 7728 * Check the disk capacity in case it has changed. If that fails then 7729 * we proceed and we will be using the disk size we currently have. 7730 */ 7731 (void) vdc_check_capacity(vdc); 7732 dev = makedevice(ddi_driver_major(vdc->dip), 7733 VD_MAKE_DEV(vdc->instance, 0)); 7734 7735 rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval); 7736 if (rv == 0) 7737 rv = vd_process_ioctl(dev, DKIOCGEXTVTOC, (caddr_t)&vtoc, 7738 FKIOCTL, &rval); 7739 7740 if (rv == ENOTSUP) { 7741 /* 7742 * If the device does not support VTOC then we try 7743 * to read an EFI label. 7744 * 7745 * We need to know the block size and the disk size to 7746 * be able to read an EFI label. 7747 */ 7748 if (vdc->vdisk_size == 0) { 7749 mutex_enter(&vdc->lock); 7750 vdc_store_label_unk(vdc); 7751 return (EIO); 7752 } 7753 7754 VD_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl); 7755 7756 rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe); 7757 7758 if (rv) { 7759 DMSG(vdc, 0, "[%d] Failed to get EFI (err=%d)", 7760 vdc->instance, rv); 7761 mutex_enter(&vdc->lock); 7762 vdc_store_label_unk(vdc); 7763 return (EIO); 7764 } 7765 7766 mutex_enter(&vdc->lock); 7767 vdc_store_label_efi(vdc, gpt, gpe); 7768 vd_efi_free(&edev, gpt, gpe); 7769 return (ENOTSUP); 7770 } 7771 7772 if (rv != 0) { 7773 DMSG(vdc, 0, "[%d] Failed to get VTOC (err=%d)", 7774 vdc->instance, rv); 7775 mutex_enter(&vdc->lock); 7776 vdc_store_label_unk(vdc); 7777 if (rv != EINVAL) 7778 rv = EIO; 7779 return (rv); 7780 } 7781 7782 /* check that geometry and vtoc are valid */ 7783 if (geom.dkg_nhead == 0 || geom.dkg_nsect == 0 || 7784 vtoc.v_sanity != VTOC_SANE) { 7785 mutex_enter(&vdc->lock); 7786 vdc_store_label_unk(vdc); 7787 return (EINVAL); 7788 } 7789 7790 /* 7791 * We have a disk and a valid VTOC. However this does not mean 7792 * that the disk currently have a VTOC label. The returned VTOC may 7793 * be a default VTOC to be used for configuring the disk (this is 7794 * what is done for disk image). So we read the label from the 7795 * beginning of the disk to ensure we really have a VTOC label. 7796 * 7797 * FUTURE: This could be the default way for reading the VTOC 7798 * from the disk as opposed to sending the VD_OP_GET_VTOC 7799 * to the server. This will be the default if vdc is implemented 7800 * ontop of cmlb. 7801 */ 7802 7803 /* 7804 * Single slice disk does not support read using an absolute disk 7805 * offset so we just rely on the DKIOCGVTOC ioctl in that case. 7806 */ 7807 if (vdc->vdisk_type == VD_DISK_TYPE_SLICE) { 7808 mutex_enter(&vdc->lock); 7809 if (vtoc.v_nparts != 1) { 7810 vdc_store_label_unk(vdc); 7811 return (EINVAL); 7812 } 7813 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7814 return (0); 7815 } 7816 7817 if (vtoc.v_nparts != V_NUMPAR) { 7818 mutex_enter(&vdc->lock); 7819 vdc_store_label_unk(vdc); 7820 return (EINVAL); 7821 } 7822 7823 /* 7824 * Read disk label from start of disk 7825 */ 7826 buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); 7827 bioinit(buf); 7828 buf->b_un.b_addr = (caddr_t)&label; 7829 buf->b_bcount = DK_LABEL_SIZE; 7830 buf->b_flags = B_BUSY | B_READ; 7831 buf->b_dev = cmpdev(dev); 7832 rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)&label, 7833 DK_LABEL_SIZE, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir); 7834 if (rv) { 7835 DMSG(vdc, 1, "[%d] Failed to read disk block 0\n", 7836 vdc->instance); 7837 } else { 7838 rv = biowait(buf); 7839 biofini(buf); 7840 } 7841 kmem_free(buf, sizeof (buf_t)); 7842 7843 if (rv != 0 || label.dkl_magic != DKL_MAGIC || 7844 label.dkl_cksum != vdc_lbl2cksum(&label)) { 7845 DMSG(vdc, 1, "[%d] Got VTOC with invalid label\n", 7846 vdc->instance); 7847 mutex_enter(&vdc->lock); 7848 vdc_store_label_unk(vdc); 7849 return (EINVAL); 7850 } 7851 7852 mutex_enter(&vdc->lock); 7853 vdc_store_label_vtoc(vdc, &geom, &vtoc); 7854 return (0); 7855 } 7856 7857 /* 7858 * Function: 7859 * vdc_validate 7860 * 7861 * Description: 7862 * This routine discovers the label of the disk and create the 7863 * appropriate device nodes if the label has changed. 7864 * 7865 * Arguments: 7866 * vdc - soft state pointer for this instance of the device driver. 7867 * 7868 * Return Code: 7869 * none. 7870 */ 7871 static void 7872 vdc_validate(vdc_t *vdc) 7873 { 7874 vd_disk_label_t old_label; 7875 vd_slice_t old_slice[V_NUMPAR]; 7876 int rv; 7877 7878 ASSERT(!MUTEX_HELD(&vdc->lock)); 7879 7880 mutex_enter(&vdc->lock); 7881 7882 /* save the current label and vtoc */ 7883 old_label = vdc->vdisk_label; 7884 bcopy(vdc->slice, &old_slice, sizeof (vd_slice_t) * V_NUMPAR); 7885 7886 /* check the geometry */ 7887 (void) vdc_validate_geometry(vdc); 7888 7889 /* if the disk label has changed, update device nodes */ 7890 if (vdc->vdisk_label != old_label) { 7891 7892 if (vdc->vdisk_label == VD_DISK_LABEL_EFI) 7893 rv = vdc_create_device_nodes_efi(vdc); 7894 else 7895 rv = vdc_create_device_nodes_vtoc(vdc); 7896 7897 if (rv != 0) { 7898 DMSG(vdc, 0, "![%d] Failed to update device nodes", 7899 vdc->instance); 7900 } 7901 } 7902 7903 mutex_exit(&vdc->lock); 7904 } 7905 7906 static void 7907 vdc_validate_task(void *arg) 7908 { 7909 vdc_t *vdc = (vdc_t *)arg; 7910 7911 vdc_validate(vdc); 7912 7913 mutex_enter(&vdc->lock); 7914 ASSERT(vdc->validate_pending > 0); 7915 vdc->validate_pending--; 7916 mutex_exit(&vdc->lock); 7917 } 7918 7919 /* 7920 * Function: 7921 * vdc_setup_devid() 7922 * 7923 * Description: 7924 * This routine discovers the devid of a vDisk. It requests the devid of 7925 * the underlying device from the vDisk server, builds an encapsulated 7926 * devid based on the retrieved devid and registers that new devid to 7927 * the vDisk. 7928 * 7929 * Arguments: 7930 * vdc - soft state pointer for this instance of the device driver. 7931 * 7932 * Return Code: 7933 * 0 - A devid was succesfully registered for the vDisk 7934 */ 7935 static int 7936 vdc_setup_devid(vdc_t *vdc) 7937 { 7938 int rv; 7939 vd_devid_t *vd_devid; 7940 size_t bufsize, bufid_len; 7941 7942 /* 7943 * At first sight, we don't know the size of the devid that the 7944 * server will return but this size will be encoded into the 7945 * reply. So we do a first request using a default size then we 7946 * check if this size was large enough. If not then we do a second 7947 * request with the correct size returned by the server. Note that 7948 * ldc requires size to be 8-byte aligned. 7949 */ 7950 bufsize = P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN), 7951 sizeof (uint64_t)); 7952 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7953 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7954 7955 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, 7956 bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); 7957 7958 DMSG(vdc, 2, "sync_op returned %d\n", rv); 7959 7960 if (rv) { 7961 kmem_free(vd_devid, bufsize); 7962 return (rv); 7963 } 7964 7965 if (vd_devid->length > bufid_len) { 7966 /* 7967 * The returned devid is larger than the buffer used. Try again 7968 * with a buffer with the right size. 7969 */ 7970 kmem_free(vd_devid, bufsize); 7971 bufsize = P2ROUNDUP(VD_DEVID_SIZE(vd_devid->length), 7972 sizeof (uint64_t)); 7973 vd_devid = kmem_zalloc(bufsize, KM_SLEEP); 7974 bufid_len = bufsize - sizeof (vd_efi_t) - 1; 7975 7976 rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, 7977 (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0, 7978 VIO_both_dir, B_TRUE); 7979 7980 if (rv) { 7981 kmem_free(vd_devid, bufsize); 7982 return (rv); 7983 } 7984 } 7985 7986 /* 7987 * The virtual disk should have the same device id as the one associated 7988 * with the physical disk it is mapped on, otherwise sharing a disk 7989 * between a LDom and a non-LDom may not work (for example for a shared 7990 * SVM disk set). 7991 * 7992 * The DDI framework does not allow creating a device id with any 7993 * type so we first create a device id of type DEVID_ENCAP and then 7994 * we restore the orignal type of the physical device. 7995 */ 7996 7997 DMSG(vdc, 2, ": devid length = %d\n", vd_devid->length); 7998 7999 /* build an encapsulated devid based on the returned devid */ 8000 if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length, 8001 vd_devid->id, &vdc->devid) != DDI_SUCCESS) { 8002 DMSG(vdc, 1, "[%d] Fail to created devid\n", vdc->instance); 8003 kmem_free(vd_devid, bufsize); 8004 return (1); 8005 } 8006 8007 DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type); 8008 8009 ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS); 8010 8011 kmem_free(vd_devid, bufsize); 8012 8013 if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) { 8014 DMSG(vdc, 1, "[%d] Fail to register devid\n", vdc->instance); 8015 return (1); 8016 } 8017 8018 return (0); 8019 } 8020 8021 static void 8022 vdc_store_label_efi(vdc_t *vdc, efi_gpt_t *gpt, efi_gpe_t *gpe) 8023 { 8024 int i, nparts; 8025 8026 ASSERT(MUTEX_HELD(&vdc->lock)); 8027 8028 vdc->vdisk_label = VD_DISK_LABEL_EFI; 8029 bzero(vdc->vtoc, sizeof (struct extvtoc)); 8030 bzero(vdc->geom, sizeof (struct dk_geom)); 8031 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 8032 8033 nparts = gpt->efi_gpt_NumberOfPartitionEntries; 8034 8035 for (i = 0; i < nparts && i < VD_EFI_WD_SLICE; i++) { 8036 8037 if (gpe[i].efi_gpe_StartingLBA == 0 || 8038 gpe[i].efi_gpe_EndingLBA == 0) { 8039 continue; 8040 } 8041 8042 vdc->slice[i].start = gpe[i].efi_gpe_StartingLBA; 8043 vdc->slice[i].nblocks = gpe[i].efi_gpe_EndingLBA - 8044 gpe[i].efi_gpe_StartingLBA + 1; 8045 } 8046 8047 ASSERT(vdc->vdisk_size != 0); 8048 vdc->slice[VD_EFI_WD_SLICE].start = 0; 8049 vdc->slice[VD_EFI_WD_SLICE].nblocks = vdc->vdisk_size; 8050 8051 } 8052 8053 static void 8054 vdc_store_label_vtoc(vdc_t *vdc, struct dk_geom *geom, struct extvtoc *vtoc) 8055 { 8056 int i; 8057 8058 ASSERT(MUTEX_HELD(&vdc->lock)); 8059 ASSERT(vdc->block_size == vtoc->v_sectorsz); 8060 8061 vdc->vdisk_label = VD_DISK_LABEL_VTOC; 8062 bcopy(vtoc, vdc->vtoc, sizeof (struct extvtoc)); 8063 bcopy(geom, vdc->geom, sizeof (struct dk_geom)); 8064 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 8065 8066 for (i = 0; i < vtoc->v_nparts; i++) { 8067 vdc->slice[i].start = vtoc->v_part[i].p_start; 8068 vdc->slice[i].nblocks = vtoc->v_part[i].p_size; 8069 } 8070 } 8071 8072 static void 8073 vdc_store_label_unk(vdc_t *vdc) 8074 { 8075 ASSERT(MUTEX_HELD(&vdc->lock)); 8076 8077 vdc->vdisk_label = VD_DISK_LABEL_UNK; 8078 bzero(vdc->vtoc, sizeof (struct extvtoc)); 8079 bzero(vdc->geom, sizeof (struct dk_geom)); 8080 bzero(vdc->slice, sizeof (vd_slice_t) * V_NUMPAR); 8081 } 8082